Producer/consumer doesn't generate expected results - c#

I've written such producer/consumer code, which should generate big file filled with random data
class Program
{
static void Main(string[] args)
{
Random random = new Random();
String filename = #"d:\test_out";
long numlines = 1000000;
var buffer = new BlockingCollection<string[]>(10); //limit to not get OOM.
int arrSize = 100; //size of each string chunk in buffer;
String[] block = new string[arrSize];
Task producer = Task.Factory.StartNew(() =>
{
long blockNum = 0;
long lineStopped = 0;
for (long i = 0; i < numlines; i++)
{
if (blockNum == arrSize)
{
buffer.Add(block);
blockNum = 0;
lineStopped = i;
}
block[blockNum] = random.Next().ToString();
//null is sign to stop if last block is not fully filled
if (blockNum < arrSize - 1)
{
block[blockNum + 1] = null;
}
blockNum++;
};
if (lineStopped < numlines)
{
buffer.Add(block);
}
buffer.CompleteAdding();
}, TaskCreationOptions.LongRunning);
Task consumer = Task.Factory.StartNew(() =>
{
using (var outputFile = new StreamWriter(filename))
{
foreach (string[] chunk in buffer.GetConsumingEnumerable())
{
foreach (string value in chunk)
{
if (value == null) break;
outputFile.WriteLine(value);
}
}
}
}, TaskCreationOptions.LongRunning);
Task.WaitAll(producer, consumer);
}
}
And it does what is intended to do. But for some unknown reason it produces only ~550000 strings, not 1000000 and I can not understand why this is happening.
Can someone point on my mistake? I really don't get what's wrong with this code.

The buffer
String[] block = new string[arrSize];
is declared outside the Lambda. That means it is captured and re-used.
That would normally go unnoticed (you would just write out the wrong random data) but because your if (blockNum < arrSize - 1) is placed inside the for loop you regularly write a null into the shared buffer.
Exercise, instead of:
block[blockNum] = random.Next().ToString();
use
block[blockNum] = i.ToString();
and predict and verify the results.

Related

Task process ending before finishing all the work

I've been having trouble running multiple tasks with heavy operations.
It seems as if the task processes is killed before all the operations are complete.
The code here is an example code I used to replicate the issue. If I add something like Debug.Write(), the added wait for writing fixes the issue. The issue is gone if I test on a smaller sample size too. The reason there is a class in the example below is to create complexity for the test.
The real case where I encountered the issue first is too complicated to explain for a post here.
public static class StaticRandom
{
static int seed = Environment.TickCount;
static readonly ThreadLocal<Random> random =
new ThreadLocal<Random>(() => new Random(Interlocked.Increment(ref seed)));
public static int Next()
{
return random.Value.Next();
}
public static int Next(int maxValue)
{
return random.Value.Next(maxValue);
}
public static double NextDouble()
{
return random.Value.NextDouble();
}
}
// this is the test function I run to recreate the problem:
static void tasktest()
{
var testlist = new List<ExampleClass>();
for (var index = 0; index < 10000; ++index)
{
var newClass = new ExampleClass();
newClass.Populate(Enumerable.Range(0, 1000).ToList());
testlist.Add(newClass);
}
var anotherClassList = new List<ExampleClass>();
var threadNumber = 5;
if (threadNumber > testlist.Count)
{
threadNumber = testlist.Count;
}
var taskList = new List<Task>();
var tokenSource = new CancellationTokenSource();
CancellationToken cancellationToken = tokenSource.Token;
int stuffPerThread = testlist.Count / threadNumber;
var stuffCounter = 0;
for (var count = 1; count <= threadNumber; ++count)
{
var toSkip = stuffCounter;
var threadWorkLoad = stuffPerThread;
var currentIndex = count;
// these ifs make sure all the indexes are covered
if (stuffCounter + threadWorkLoad > testlist.Count)
{
threadWorkLoad = testlist.Count - stuffCounter;
}
else if (count == threadNumber && stuffCounter + threadWorkLoad < testlist.Count)
{
threadWorkLoad = testlist.Count - stuffCounter;
}
taskList.Add(Task.Factory.StartNew(() => taskfunc(testlist, anotherClassList, toSkip, threadWorkLoad),
cancellationToken, TaskCreationOptions.None, TaskScheduler.Default));
stuffCounter += stuffPerThread;
}
Task.WaitAll(taskList.ToArray());
}
public class ExampleClass
{
public ExampleClassInner[] Inners { get; set; }
public ExampleClass()
{
Inners = new ExampleClassInner[5];
for (var index = 0; index < Inners.Length; ++index)
{
Inners[index] = new ExampleClassInner();
}
}
public void Populate(List<int> intlist) {/*adds random ints to the inner class*/}
public ExampleClass(ExampleClass copyFrom)
{
Inners = new ExampleClassInner[5];
for (var index = 0; index < Inners.Length; ++index)
{
Inners[index] = new ExampleClassInner(copyFrom.Inners[index]);
}
}
public class ExampleClassInner
{
public bool SomeBool { get; set; } = false;
public int SomeInt { get; set; } = -1;
public ExampleClassInner()
{
}
public ExampleClassInner(ExampleClassInner copyFrom)
{
SomeBool = copyFrom.SomeBool;
SomeInt = copyFrom.SomeInt;
}
}
}
static int expensivefunc(int theint)
{
/*a lot of pointless arithmetic and loops done only on primitives and with primitives,
just to increase the complexity*/
theint *= theint + 1;
var anotherlist = Enumerable.Range(0, 10000).ToList();
for (var index = 0; index < anotherlist.Count; ++index)
{
theint += index;
if (theint % 5 == 0)
{
theint *= index / 2;
}
}
var yetanotherlist = Enumerable.Range(0, 50000).ToList();
for (var index = 0; index < yetanotherlist.Count; ++index)
{
theint += index;
if (theint % 7 == 0)
{
theint -= index / 3;
}
}
while (theint > 8)
{
theint /= 2;
}
return theint;
}
// this function is intentionally creating a lot of objects, to simulate complexity
static void taskfunc(List<ExampleClass> intlist, List<ExampleClass> anotherClassList, int skip, int take)
{
if (take == 0)
{
take = intlist.Count;
}
var partial = intlist.Skip(skip).Take(take).ToList();
for (var index = 0; index < partial.Count; ++index)
{
var testint = expensivefunc(index);
var newClass = new ExampleClass(partial[index]);
newDna.Inners[StaticRandom.Next(5)].SomeInt = testint;
anotherClassList.Add(new ExampleClass(newClass));
}
}
The expected result is that the list anotherClassList will be the same size as testlist and this happens when the lists are smaller or the complexity of the task operations is smaller. However, when I increase the volume of operations, the anotherClassList has a few indexes missing and sometimes some of the indexes in the list are null objects.
Example result:
Why does this happen, I have Task.WaitAll?
Your problem is it's just not thread-safe; you just can't add to a list<T> in a multi-threaded environment and expect it to play nice.
One way is to use lock or a thread safe collection, but I feel this all should be refactored (my OCD is going off all over the place).
private static object _sync = new object();
...
private static void TaskFunc(List<ExampleClass> intlist, List<ExampleClass> anotherClassList, int skip, int take)
{
...
var partial = intlist.Skip(skip).Take(take).ToList();
...
// note that locking here will likely drastically decrease any performance threading gain
lock (_sync)
{
for (var index = 0; index < partial.Count; ++index)
{
// this is your problem, you are adding to a list from multiple threads
anotherClassList.Add(...);
}
}
}
In short, I think you need to better thinking about the threading logic of your method, identify what you are trying to achieve, and how to make it conceptually thread safe (while keeping your performance gains).
After TheGeneral enlightened me that Lists are not thread safe, I changed the List to which I was adding in a thread, to an Array type and this fixed my issue.

custom threads in my code?

I need to create a program scraping a website.
And I did use Thread to solve.
Example:
I have 100 pages and I need divide it, instead of get each page I need custom Thread number to get page:
2 threads - 50 pages/thread
4 threads - 25 pages/thread
I tried my code below, however when to the last page of each thread that very slow.
Before I ask I used to find the way to solve but I can't, therefore I need help.
int so_thread = 10;//thread number
int page_du = 0;
List<NameValueCollection> List_item = new List<NameValueCollection>();
Thread[] threads = new Thread[so_thread];
int dem = 0;
await Task.Run(() =>
{
for (int i = 1; i <= so_thread; i++)
{
if ((Int32.Parse(o_sopage.Text) % so_thread) != 0 && i == so_thread)
{
page_du = Int32.Parse(o_sopage.Text) % so_thread;//Int32.Parse(o_sopage.Text) == page number need get
}
threads[i - 1] = new Thread((object data) =>
{
Array New_Data = new object[2];
New_Data = (Array)data;
int _i = (int)New_Data.GetValue(0);
int _pagedu = (int)New_Data.GetValue(1);
int page_per_thread = Int32.Parse(o_sopage.Text) / so_thread;//Int32.Parse(o_sopage.Text) == page number need get
for (int j = ((page_per_thread * _i) - page_per_thread) + 1; j <= ((page_per_thread * _i) + _pagedu); j++)
{
//MessageBox.Show(j.ToString());
var TG = ebay.GetPage(j);
lock (List_item)
{
List_item.AddRange(TG);
dem++;
progressBar1.Invoke((MethodInvoker)delegate
{
progressBar1.Value = dem;
});
}
}
});
object DATA = new object[2] { i, page_du };
threads[i - 1].Start(DATA);
}
});
Use Parallel.ForEach instead of creating the threads on your own.
Parallel.ForEach(yourCollection, () => { /* your code here */});

Thread safety Parallel.For c#

im frenchi so sorry first sorry for my english .
I have an error on visual studio (index out of range) i have this problem only with a Parallel.For not with classic for.
I think one thread want acces on my array[i] and another thread want too ..
It's a code for calcul Kmeans clustering for building link between document (with cosine similarity).
more information :
IndexOutOfRange is on similarityMeasure[i]=.....
I have a computer with 2 Processor (12logical)
with classic for , cpu usage is 9-14% , time for 1 iteration=9min..
with parallel.for , cpu usage is 70-90% =p, time for 1 iteration =~1min30
Sometimes it works longer before generating an error
My function is :
private static int FindClosestClusterCenter(List<Centroid> clustercenter, DocumentVector obj)
{
float[] similarityMeasure = new float[clustercenter.Count()];
float[] copy = similarityMeasure;
object sync = new Object();
Parallel.For(0, clustercenter.Count(), (i) => //for(int i = 0; i < clustercenter.Count(); i++) Parallel.For(0, clustercenter.Count(), (i) => //
{
similarityMeasure[i] = SimilarityMatrics.FindCosineSimilarity(clustercenter[i].GroupedDocument[0].VectorSpace, obj.VectorSpace);
});
int index = 0;
float maxValue = similarityMeasure[0];
for (int i = 0; i < similarityMeasure.Count(); i++)
{
if (similarityMeasure[i] > maxValue)
{
maxValue = similarityMeasure[i];
index = i;
}
}
return index;
}
My function is call here :
do
{
prevClusterCenter = centroidCollection;
DateTime starttime = DateTime.Now;
foreach (DocumentVector obj in documentCollection)//Parallel.ForEach(documentCollection, parallelOptions, obj =>//foreach (DocumentVector obj in documentCollection)
{
int ind = FindClosestClusterCenter(centroidCollection, obj);
resultSet[ind].GroupedDocument.Add(obj);
}
TimeSpan tempsecoule = DateTime.Now.Subtract(starttime);
Console.WriteLine(tempsecoule);
//Console.ReadKey();
InitializeClusterCentroid(out centroidCollection, centroidCollection.Count());
centroidCollection = CalculMeanPoints(resultSet);
stoppingCriteria = CheckStoppingCriteria(prevClusterCenter, centroidCollection);
if (!stoppingCriteria)
{
//initialisation du resultat pour la prochaine itération
InitializeClusterCentroid(out resultSet, centroidCollection.Count);
}
} while (stoppingCriteria == false);
_counter = counter;
return resultSet;
FindCosSimilarity :
public static float FindCosineSimilarity(float[] vecA, float[] vecB)
{
var dotProduct = DotProduct(vecA, vecB);
var magnitudeOfA = Magnitude(vecA);
var magnitudeOfB = Magnitude(vecB);
float result = dotProduct / (float)Math.Pow((magnitudeOfA * magnitudeOfB),2);
//when 0 is divided by 0 it shows result NaN so return 0 in such case.
if (float.IsNaN(result))
return 0;
else
return (float)result;
}
CalculMeansPoint :
private static List<Centroid> CalculMeanPoints(List<Centroid> _clust)
{
for (int i = 0; i < _clust.Count(); i++)
{
if (_clust[i].GroupedDocument.Count() > 0)
{
for (int j = 0; j < _clust[i].GroupedDocument[0].VectorSpace.Count(); j++)
{
float total = 0;
foreach (DocumentVector vspace in _clust[i].GroupedDocument)
{
total += vspace.VectorSpace[j];
}
_clust[i].GroupedDocument[0].VectorSpace[j] = total / _clust[i].GroupedDocument.Count();
}
}
}
return _clust;
}
You may have some side effects in FindCosineSimilarity, make sure it does not modify any field or input parameter. Example: resultSet[ind].GroupedDocument.Add(obj);. If resultSet is not a reference to locally instantiated array, then that is a side effect.
That may fix it. But FYI you could use AsParallel for this rather than Parallel.For:
similarityMeasure = clustercenter
.AsParallel().AsOrdered()
.Select(c=> SimilarityMatrics.FindCosineSimilarity(c.GroupedDocument[0].VectorSpace, obj.VectorSpace))
.ToArray();
You realize that if you synchronize the whole Content of the Parallel-For, it's just the same as having a normal synchrone for-loop, right? Meaning the code as it is doesnt do anything in parallel, so I dont think you'll have any Problems with concurrency. My guess from what I can tell is clustercenter[i].GroupedDocument is propably an empty Array.

c# How to run a application faster

I am creating a word list of possible uppercase letters to prove how insecure 8 digit passwords are this code will write aaaaaaaa to aaaaaaab to aaaaaaac etc. until zzzzzzzz using this code:
class Program
{
static string path;
static int file = 0;
static void Main(string[] args)
{
new_file();
var alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ123456789+-*_!$£^=<>§°ÖÄÜöäü.;:,?{}[]";
var q = alphabet.Select(x => x.ToString());
int size = 3;
int counter = 0;
for (int i = 0; i < size - 1; i++)
{
q = q.SelectMany(x => alphabet, (x, y) => x + y);
}
foreach (var item in q)
{
if (counter >= 20000000)
{
new_file();
counter = 0;
}
if (File.Exists(path))
{
using (StreamWriter sw = File.AppendText(path))
{
sw.WriteLine(item);
Console.WriteLine(item);
/*if (!(Regex.IsMatch(item, #"(.)\1")))
{
sw.WriteLine(item);
counter++;
}
else
{
Console.WriteLine(item);
}*/
}
}
else
{
new_file();
}
}
}
static void new_file()
{
path = #"C:\" + "list" + file + ".txt";
if (!File.Exists(path))
{
using (StreamWriter sw = File.CreateText(path))
{
}
}
file++;
}
}
The Code is working fine but it takes Weeks to run it. Does anyone know a way to speed it up or do I have to wait? If anyone has a idea please tell me.
Performance:
size 3: 0.02s
size 4: 1.61s
size 5: 144.76s
Hints:
removed LINQ for combination generation
removed Console.WriteLine for each password
removed StreamWriter
large buffer (128k) for file writing
const string alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ123456789+-*_!$£^=<>§°ÖÄÜöäü.;:,?{}[]";
var byteAlphabet = alphabet.Select(ch => (byte)ch).ToArray();
var alphabetLength = alphabet.Length;
var newLine = new[] { (byte)'\r', (byte)'\n' };
const int size = 4;
var number = new byte[size];
var password = Enumerable.Range(0, size).Select(i => byteAlphabet[0]).Concat(newLine).ToArray();
var watcher = new System.Diagnostics.Stopwatch();
watcher.Start();
var isRunning = true;
for (var counter = 0; isRunning; counter++)
{
Console.Write("{0}: ", counter);
Console.Write(password.Select(b => (char)b).ToArray());
using (var file = System.IO.File.Create(string.Format(#"list.{0:D5}.txt", counter), 2 << 16))
{
for (var i = 0; i < 2000000; ++i)
{
file.Write(password, 0, password.Length);
var j = size - 1;
for (; j >= 0; j--)
{
if (number[j] < alphabetLength - 1)
{
password[j] = byteAlphabet[++number[j]];
break;
}
else
{
number[j] = 0;
password[j] = byteAlphabet[0];
}
}
if (j < 0)
{
isRunning = false;
break;
}
}
}
}
watcher.Stop();
Console.WriteLine(watcher.Elapsed);
}
Try the following modified code. In LINQPad it runs in < 1 second. With your original code I gave up after 40 seconds. It removes the overhead of opening and closing the file for every WriteLine operation. You'll need to test and ensure it gives the same results because I'm not willing to run your original code for 24 hours to ensure the output is the same.
class Program
{
static string path;
static int file = 0;
static void Main(string[] args)
{
new_file();
var alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ123456789+-*_!$£^=<>§°ÖÄÜöäü.;:,?{}[]";
var q = alphabet.Select(x => x.ToString());
int size = 3;
int counter = 0;
for (int i = 0; i < size - 1; i++)
{
q = q.SelectMany(x => alphabet, (x, y) => x + y);
}
StreamWriter sw = File.AppendText(path);
try
{
foreach (var item in q)
{
if (counter >= 20000000)
{
sw.Dispose();
new_file();
counter = 0;
}
sw.WriteLine(item);
Console.WriteLine(item);
}
}
finally
{
if(sw != null)
{
sw.Dispose();
}
}
}
static void new_file()
{
path = #"C:\temp\list" + file + ".txt";
if (!File.Exists(path))
{
using (StreamWriter sw = File.CreateText(path))
{
}
}
file++;
}
}
your alphabet is missing 0
With that fixed there would be 89 chars in your set. Let's call it 100 for simplicity. The set you are looking for is all the 8 character length strings drawn from that set. There are 100^8 of these, i.e. 10,000,000,000,000,000.
The disk space they will take up depends on how you encode them, lets be generous - assume you use some 8 bit char set that contains the these characters, and you don't put in carriage returns, so one byte per char, so 10,000,000,000,000,000 bytes =~ 10 peta byes?
Do you have 10 petabytes of disk? (10000 TB)?
[EDIT] In response to 'this is not an answer':
The original motivation is to create the list? The shows how large the list would be. Its hard to see what could be DONE with the list if it was actualised, i.e. it would always be quicker to reproduce it than to load it. Surely whatever point could be made by producing the list can also be made by simply knowing it's size, which the above shows how to work it out.
There are LOTS of inefficiencies in you code, but if your questions is 'how can i quickly produce this list and write it to disk' the answer is 'you literally cannot'.
[/EDIT]

C# using loop to start threads and pass parameters

In below sample code, I use lambda function to make 3 threads doing different things. My goal is make the thread count configurable, so I was thinking using a loop to start threads. But I always got in static function can't call non-static members error. Can the community help me or direct me to a tutorial? Thanks a lot!
My Code:
internal class FeedClient
{
private static void Main(string[] args)
{
int iteration = 10;
int ranSleepTime = 1000;
var obj = new MyClass();
var threads = new Thread[3];
(threads[0] = new Thread(() =>
{
Random random = new System.Random();
for (int i = 0; i < iteration; i++)
{
obj.MyMethod("my string 1");
Thread.Sleep(random.Next(ranSleepTime));
}
})).Start();
(threads[1] = new Thread(() =>
{
Random random = new System.Random();
for (int i = 0; i < iteration; i++)
{
obj.MyMethod("my string 2");
Thread.Sleep(random.Next(ranSleepTime));
}
})).Start();
(threads[2] = new Thread(() =>
{
Random random = new System.Random();
for (int i = 0; i < iteration; i++)
{
obj.MyMethod("my string 3");
Thread.Sleep(random.Next(ranSleepTime));
}
})).Start();
foreach (Thread thread in threads)
{
thread.Join();
}
obj.Close(false);
Console.WriteLine("Press any key to exit.");
Console.ReadKey();
}
}
Desired look:
for(int i=0;i<3;i++){
threads[i] = new Thread(func); // func is the lambda function
threads[i].Start(myData[i]); // myData[] may be a string array
}
The error message seems to indicate that you are attempting to use an instance member from a static method somewhere. Naturally that is not allowed since a static method does not have a this reference. Here is how I would refactor your code.
public static void Main()
{
string[] myData = GetStringArray();
int iteration = 10;
int ranSleepTime = 1000;
var obj = new MyClass();
var threads = new Thread[myData.Length];
for (int i = 0; i < threads.Length; i++)
{
int captured = i; // This is required to avoid capturing the loop variable.
threads[i] = new Thread(
() =>
{
var random = new Random();
for (int i = 0; i < iteration; i++)
{
obj.MyMethod(myData[captured]);
Thread.Sleep(random.Next(ranSleepTime));
}
});
threads[i].Start();
}
foreach (Thread thread in threads)
{
thread.Join();
}
obj.Close(false);
}
I must mention, however, that creating new threads in an unbounded loop is generally undesirable. If the loop has a tight bound then maybe, but I would have to get a better understanding of the problem before making any further comments regarding this point.

Categories

Resources