The following code section worked appropriately:
Parallel.For(
0, numberOfRunsNeeded, j =>
{
var copyOfj = j;
var researchItems = viewModel.ResearchItems[queryNumber].GetRange((int)(copyOfj * itemsAtOnce), Math.Min(itemsAtOnce, viewModel.ResearchItems[queryNumber].Count - (copyOfj * itemsAtOnce)));
var finalQuery = GetCorrectedQuery(query.BaseQuery, query.SQLVariants[copyOfi]);
if (researchItems.Count > 0)
{
finalQuery = GetCorrectedQueryWithResearchItems(finalQuery, researchItems, query.SQLVariants[copyOfi]);
}
PerformSingleRun(query, copyOfi, dataSource, finalQuery, copyOfj, viewModel);
}
);
This updated code shows captured-variable errors — finalQuery behaves as if the Parallel.For loop is repeating the same value across many iterations:
Parallel.For(
0, numberOfRunsNeeded, parallelOptionsWithMaxDegreeOfParallelism, j =>
{
var copyOfj = j;
if (researchItemsPresent)
{
var researchItems = ViewModel.ResearchItems[queryNumber].GetRange(copyOfj * itemsAtOnce, Math.Min(itemsAtOnce, ViewModel.ResearchItems[queryNumber].Count - (copyOfj * itemsAtOnce)));
finalQuery = GetAdaptedBaseQueryWithResearchItemsInserted(finalQuery, researchItems, query.SQLVariants[copyOfi]);
}
PerformSingleRun(query, copyOfi, dataSource, finalQuery, copyOfj, viewModel);
}
);
As stated above, I already have working code - I'm just trying to understand what I did wrong in my revision. Full methods listed below:
Previous, working:
// Runs one configured query end-to-end: snapshots the per-query settings from
// the view model, executes the query once per selected database (splitting the
// research items into parallel runs), then combines the per-run result files.
// NOTE(review): Query, ViewModel and the Get*/Perform* helpers are project
// types declared elsewhere; their contracts are assumed from usage here.
public static void ProcessSingleQuery(int queryNumber, ViewModel viewModel)
{
// Copy all per-query settings into a single object up front.
var query = new Query
{
Name = viewModel.QueryNames[queryNumber],
BaseQuery = viewModel.BaseQueries[queryNumber],
SelectedDatabases = viewModel.SelectedDatabases[queryNumber],
SQLVariants = viewModel.SQLVariants[queryNumber],
Usernames = viewModel.Usernames[queryNumber],
Passwords = viewModel.Passwords[queryNumber],
CSVFiles = viewModel.CSVFiles[queryNumber],
CSVFileAliases = viewModel.CSVFileAliases[queryNumber],
ColumnDelimiters = viewModel.ColumnDelimiters[queryNumber],
HeaderRowsPresent = viewModel.HeaderRowsPresent[queryNumber],
TextDelimiters = viewModel.TextDelimiters[queryNumber],
ResearchItemColumnNumber = viewModel.ResearchItemColumnNumber[queryNumber]
};
// One pass per selected database / SQL variant.
for (var i = 0; i < query.SelectedDatabases.Count; i++)
{
var dataSource = GetDataSource(query.SelectedDatabases[i]);
var itemsAtOnce = ViewModel.ItemsAtOnceBySQLVariant[query.SQLVariants[i]];
// CSV sources read from the directory holding the first CSV file.
if (query.SelectedDatabases[i].Equals("CSV"))
{
RefreshOrCreateSchemaIniFile(query);
dataSource = query.CSVFiles[0].DirectoryName;
}
// Ceiling division over the research items; at least one run even
// when there are no research items at all.
var numberOfRunsNeeded = Math.Max(
(int)Math.Ceiling((double)viewModel.ResearchItems[queryNumber].Count / itemsAtOnce), 1
);
viewModel.QueryRunsCompletedMaximum += numberOfRunsNeeded;
// Snapshot the loop variable so the lambda below captures a stable value.
var copyOfi = i;
Parallel.For(
0, numberOfRunsNeeded, j =>
{
var copyOfj = j;
// Slice of research items belonging to this run (may be empty).
var researchItems = viewModel.ResearchItems[queryNumber].GetRange((int)(copyOfj * itemsAtOnce), Math.Min(itemsAtOnce, viewModel.ResearchItems[queryNumber].Count - (copyOfj * itemsAtOnce)));
// finalQuery is declared inside the lambda, so each iteration has
// its own copy, always derived from the invariant query.BaseQuery —
// iterations cannot observe each other's values.
var finalQuery = GetCorrectedQuery(query.BaseQuery, query.SQLVariants[copyOfi]);
if (researchItems.Count > 0)
{
finalQuery = GetCorrectedQueryWithResearchItems(finalQuery, researchItems, query.SQLVariants[copyOfi]);
}
PerformSingleRun(query, copyOfi, dataSource, finalQuery, copyOfj, viewModel);
}
);
}
// Combine the per-run result files written to the desktop, then delete them.
GeneralTools.CombineAndDeleteQueryResults(Environment.GetFolderPath(Environment.SpecialFolder.Desktop), query.Name);
// Column number 0 appears to mean "no research item column configured".
if (query.ResearchItemColumnNumber != 0)
{
CompileMissingItemsReport(Environment.GetFolderPath(Environment.SpecialFolder.Desktop), query.Name, viewModel, queryNumber);
}
}
Revised, broken:
// Revised version — KEPT BROKEN ON PURPOSE as the illustration for this post.
// BUG: finalQuery is declared in the outer for-loop scope and captured by the
// Parallel.For lambda, so every parallel iteration shares the SAME variable.
// The body both reads and writes it without any synchronization, which is a
// race condition: a run can receive a finalQuery that another run has already
// modified (research items appended on top of research items).
public static void ProcessSingleQuery(int queryNumber, ViewModel viewModel)
{
var query = new Query
{
Name = ViewModel.QueryNames[queryNumber],
BaseQuery = ViewModel.BaseQueries[queryNumber],
SelectedDatabases = ViewModel.SelectedDatabases[queryNumber],
SQLVariants = ViewModel.SQLVariants[queryNumber],
Usernames = ViewModel.Usernames[queryNumber],
Passwords = ViewModel.Passwords[queryNumber],
CSVFiles = ViewModel.CSVFiles[queryNumber],
CSVFileAliases = ViewModel.CSVFileAliases[queryNumber],
ColumnDelimiters = ViewModel.ColumnDelimiters[queryNumber],
HeaderRowsPresent = ViewModel.HeaderRowsPresent[queryNumber],
TextDelimiters = ViewModel.TextDelimiters[queryNumber],
ResearchItemColumnNumber = ViewModel.ResearchItemColumnNumber[queryNumber]
}; 
for (var i = 0; i < query.SelectedDatabases.Count; i++)
{
// BUG: finalQuery lives at loop scope — shared by all iterations of the
// Parallel.For below instead of being rebuilt per iteration.
var finalQuery = GetAdaptedBaseQuery(query, query.SQLVariants[i]);
var dataSource = GetDataSource(query.SelectedDatabases[i]);
var itemsAtOnce = ViewModel.ItemsAtOnceBySQLVariant[query.SQLVariants[i]];
if (query.SelectedDatabases[i].Contains("CSV"))
{
CreateSchemaIniFile(query);
dataSource = query.CSVFiles[0].DirectoryName;
}
var researchItemsPresent = ViewModel.ResearchItems[queryNumber].Count > 0;
var numberOfRunsNeeded = Math.Max(
(int)Math.Ceiling((double)ViewModel.ResearchItems[queryNumber].Count / itemsAtOnce), 1
);
viewModel.QueryRunsCompletedMaximum += numberOfRunsNeeded;
var copyOfi = i;
// Teradata runs are capped at 6 parallel iterations; -1 means unlimited.
var parallelOptionsWithMaxDegreeOfParallelism = new ParallelOptions
{
MaxDegreeOfParallelism =
query.SQLVariants[i] == SQLVariant.Teradata ? 6 : -1
};
Parallel.For(
0, numberOfRunsNeeded, parallelOptionsWithMaxDegreeOfParallelism, j =>
{
var copyOfj = j;
if (researchItemsPresent)
{
var researchItems = ViewModel.ResearchItems[queryNumber].GetRange(copyOfj * itemsAtOnce, Math.Min(itemsAtOnce, ViewModel.ResearchItems[queryNumber].Count - (copyOfj * itemsAtOnce)));
// BUG: unsynchronized read-modify-write of the shared finalQuery —
// the "base" passed in may already contain another run's items.
finalQuery = GetAdaptedBaseQueryWithResearchItemsInserted(finalQuery, researchItems, query.SQLVariants[copyOfi]);
}
PerformSingleRun(query, copyOfi, dataSource, finalQuery, copyOfj, viewModel);
}
);
}
GeneralTools.CombineAndDeleteQueryResults(Environment.GetFolderPath(Environment.SpecialFolder.Desktop), query.Name);
if (query.ResearchItemColumnNumber != 0)
{
CompileMissingItemsReport(Environment.GetFolderPath(Environment.SpecialFolder.Desktop), query.Name, queryNumber);
}
}
Why your broken version is broken
The problem appears to be two-fold:
First, you have a variable called finalQuery in an outer scope which you also use in a closure, specifically the one passed in as the body delegate of your Parallel.For, and is therefore the same variable in all iterations of your Parallel.For.
Second, you both read and write this finalQuery variable in that same Parallel.For body, notably with the code:
finalQuery = GetAdaptedBaseQueryWithResearchItemsInserted(finalQuery, ...)
...where you'll see you pass the current value of finalQuery as your base query.
The order in which the various iterations of that loop reach that line of code can change and depends on system architecture and processor load, causing a race condition. Access to your variable is also not governed by a lock.
Why the other version worked
In your working version, finalQuery is a variable that is declared within, and therefore entirely local to, the Parallel.For body function. This prevents any iteration from seeing values of finalQuery from other iterations. More importantly, each finalQuery is constructed from a common, invariant base query (query.BaseQuery) with this code:
var finalQuery = GetCorrectedQuery(query.BaseQuery, ...)
And although you further adjust the value of finalQuery in the line below:
finalQuery = GetCorrectedQueryWithResearchItems(finalQuery, ...)
...this is fine because this finalQuery variable is local to your lambda function and its value is based solely on the previous line, and fortunately, not from varying values being written by other iterations of the Parallel.For, as was the case in your race condition.
Related
Case 1 Case 2 Case 3 Case 4
Objective:
Using the Injector code i'm trying to inject the stopwatch methods (which are in stopwatch dll) in the desired code location of target dll, inorder to calculate the time taken by each method in the target dll which may or may not be a void method and it may have multiple return statements.
Target dll
public class targetDll
{
void func1(){
//Inject Stopwatch_start(); method here
int a = 3;
int b = 4;
int temp;
temp = a;
a = b;
b =temp;
if (a + b > 2)
{
Console.WriteLine("function____1");
}
#Stopwatch_stop() //Inject stop time here
}
String func2(){
//Inject Stopwatch_start(); method here
int a = 3;
int b = 4;
int c = 5;
int temp;
temp = a;
a = b;
b = c;
c = temp;
if (a + b > 5)
{
Console.WriteLine("function____2");
//inject Stopwatch_stop() method here
return ;
}
a = temp;
//inject Stopwatch_stop(); method here
return;
}
}
Source dll(stopwatch dll)
// Stopwatch helpers whose calls get injected into target assemblies so each
// instrumented method prints its execution time.
public static class stopwatch_class
{
// Single shared stopwatch for the whole process. NOTE(review): nested or
// concurrently running instrumented methods will interleave Start/Stop
// calls on this one instance — timings are only reliable for one
// non-reentrant method at a time.
static System.Diagnostics.Stopwatch stopwatch_obj = new System.Diagnostics.Stopwatch();
// Injected at method entry: starts (or resumes) the shared stopwatch.
public static void stopwatch_start()
{
stopwatch_obj.Start();
}
// Injected before method exit: stops the stopwatch and prints the elapsed
// milliseconds to the console.
public static void stopwatch_stop()
{
stopwatch_obj.Stop();
Console.WriteLine(stopwatch_obj.ElapsedMilliseconds);
}
}
}
Injector code
class Trial_injector
{
static void Main(string[] args)
{
var start_method = (dynamic)null;
var stop_method = (dynamic)null;
AssemblyDefinition target_assembly = AssemblyDefinition.ReadAssembly("targetDll.dll",
new ReaderParameters { ReadWrite = true });
var target_modules = target_assembly.MainModule;
TypeDefinition[] target_module = target_modules.Types.ToArray();
AssemblyDefinition source_assembly = AssemblyDefinition.ReadAssembly("stopwatch.dll", new
ReaderParameters { ReadWrite = true });
var source_modules = source_assembly.MainModule;
TypeDefinition[] source_module = source_modules.Types.ToArray();
foreach (var type in source_module)
{
foreach (var method in type.Methods)
{
if (method.Name == "stopwatch_start")
{
start_method = method;
}
if (method.Name == "stopwatch_stop")
{
stop_method = method;
}
}
}
foreach(var module_ in target_module)
{
foreach(var method_ in module_.Methods)
{
String stg="hello_world";
var processor2 = method_.Body.GetILProcessor();
var first_instruction = method_.Body.Instructions.First();
var last_instruction = method_.Body.Instructions.Last();
var ldstr = processor2.Create(OpCodes.Ldstr, stg);
var call = processor2.Create(OpCodes.Call, method_.Module.Import(start_method));
var call2 = processor2.Create(OpCodes.Call, method_.Module.Import(stop_method));
processor2.InsertBefore(first_instruction, ldstr);
processor2.InsertAfter(first_instruction, call);
processor2.InsertBefore(last_instruction, ldstr);
processor2.InsertBefore(last_instruction, call2);
}
}
target_assembly.Write();
}
You were almost correct with your code. There were few modifications that needed to be done.
Not sure why you need the ldstr opcode as it's not needed anywhere. For the call you want to have that inserted before the first opcode not after. As for the last instruction you could go with InsertBefore. So the final code could be like this:
foreach (var module_ in target_module)
{
foreach (var method_ in module_.Methods)
{
var processor2 = method_.Body.GetILProcessor();
var first_instruction = method_.Body.Instructions.First();
var last_instruction = method_.Body.Instructions.Last();
var call = processor2.Create(OpCodes.Call, method_.Module.Import(start_method));
var call2 = processor2.Create(OpCodes.Call, method_.Module.Import(stop_method));
processor2.InsertBefore(first_instruction, call);
processor2.InsertBefore(last_instruction, call2);
}
}
but that wouldn't work with some early returns. Why? Early returns are coded as a br or br_s opcode jumping to the ret at the end of the procedure, and if we inject our call before the ret, those early returns will skip it. In your example it's not needed, as this code is converted to if-else and we have a branch in both cases correctly matched. But imagine we have code like this:
int a = 3;
if (a == 3)
{
return; // very early return here
}
// the rest as in original one
we won't see the elapsed time printed for this method, as the return will direct execution past our injected call. What we need to do here is update all branch instructions that are responsible for early returns (i.e., that jump to the ret opcode) and point them to our call instead. We can do this in the following way:
foreach (var bodyInstruction in method_.Body.Instructions)
{
if (bodyInstruction.OpCode != OpCodes.Br && bodyInstruction.OpCode != OpCodes.Br_S) continue;
if (((Instruction)bodyInstruction.Operand).OpCode != OpCodes.Ret) continue;
bodyInstruction.Operand = call2;
}
So what we do here is scan through all opcodes, and whenever we find a br or br_s that jumps to a ret, we update it to jump to our call instead. Voilà.
Note: used Elapsed instead of ElapsedMilliseconds as the former was giving all zeros.
Full code:
var start_method = (dynamic) null;
var stop_method = (dynamic) null;
AssemblyDefinition target_assembly = AssemblyDefinition.ReadAssembly("target.exe", new ReaderParameters {ReadWrite = true});
var target_modules = target_assembly.MainModule;
TypeDefinition[] target_module = target_modules.Types.ToArray();
AssemblyDefinition source_assembly = AssemblyDefinition.ReadAssembly("stopwatch.dll", new ReaderParameters {ReadWrite = true});
var source_modules = source_assembly.MainModule;
TypeDefinition[] source_module = source_modules.Types.ToArray();
foreach (var type in source_module)
{
foreach (var method in type.Methods)
{
if (method.Name == "stopwatch_start")
{
start_method = method;
}
if (method.Name == "stopwatch_stop")
{
stop_method = method;
}
}
}
foreach (var module_ in target_module)
{
foreach (var method_ in module_.Methods)
{
var processor2 = method_.Body.GetILProcessor();
var first_instruction = method_.Body.Instructions.First();
var last_instruction = method_.Body.Instructions.Last();
var call = processor2.Create(OpCodes.Call, method_.Module.Import(start_method));
var call2 = processor2.Create(OpCodes.Call, method_.Module.Import(stop_method));
processor2.InsertBefore(first_instruction, call);
processor2.InsertBefore(last_instruction, call2);
foreach (var bodyInstruction in method_.Body.Instructions)
{
if (bodyInstruction.OpCode != OpCodes.Br && bodyInstruction.OpCode != OpCodes.Br_S) continue;
if (((Instruction)bodyInstruction.Operand).OpCode != OpCodes.Ret) continue;
bodyInstruction.Operand = call2;
}
}
}
target_assembly.Write();
self-promotion on
I happened to record two videos about doing this (in a slightly different way) with Mono.Cecil. You can find them at Writing simple .NET execution tracer with Mono.Cecil and Instrumenting .NET assemblies to measure method's execution time with Mono.Cecil.
self-promotion off
I have a nested for loop which takes 30 seconds to run and I'm looking to parallelize it based on the number of cores on my machine.
Original loop:
var currentCap = model.LoanCap;
var currentRlRate = model.RlRate;
var maxRateObj = new Dictionary<string, double>();
var maxRateOuterLoopCount = 0;
var maxRateInnerLoopCount = 0;
for (var i = currentRlRate + rlRateStep; i <= maxRlRate; i += rlRateStep)
{
maxRateOuterLoopCount++;
var tempFyy = currentFyy;
var tempIrr = currentIrr;
var lowestCapSoFar = currentCap;
var startingCap = maxRateObj.ContainsKey(capKey) ? maxRateObj[capKey] : currentCap;
for (var j = startingCap - capStep; j >= minCap; j -= capStep)
{
maxRateInnerLoopCount++;
tempModel = new ApplicationModel(model);
var tempIrrAndFyy = GetIrrAndFyyTuple(tempModel, i, j, precision);
var updatedIrr = tempIrrAndFyy.Item1;
var updatedFyy = tempIrrAndFyy.Item2;
// stop decrementing cap because we got a good-enough IRR to save this pair
if (Math.Abs(currentIrr - updatedIrr) >= irrDiffPrecision || updatedFyy < minFyy)
{
var endingCap = j + capStep; // go back one step since we just stepped out of bounds
maxRateObj = new Dictionary<string, double>
{
{rlRateKey, i },
{capKey, endingCap }
};
// set vars so the outer loop can check if we are still operating within constraints
lowestCapSoFar = endingCap;
tempIrr = updatedIrr;
tempFyy = updatedFyy;
break;
}
}
// Break out of the outerloop if the cap gets too low
if (lowestCapSoFar <= minCap) { break; }
// ... or if Fyy gets too low (when credit policy is enforced)
if (enforceFyyPolicy && tempFyy < minFyy) { break; }
// ... or if Irr gets too low (when credit policy is enforced)
if (enforceIrrPolicy && Math.Abs(tempIrr - targetIrr) > irrDiffPrecision) { break; }
}
Now when I move this loop into the body of Parallel.For(), I lose the context which I previously had for the variable i... How can I get that functionality back since I need it for my maxRateObj?
var degreeOfParallelism = Environment.ProcessorCount;
var result = Parallel.For(0, degreeOfParallelism, x =>
{
var tempFyy = currentFyy;
var tempIrr = currentIrr;
var lowestCapSoFar = currentCap;
var startingCap = maxRateObj.ContainsKey(capKey) ? maxRateObj[capKey] : currentCap;
for (var j = startingCap - capStep; j >= minCap; j -= capStep)
{
tempModel = new ApplicationModel(model);
var tempIrrAndFyy = GetIrrAndFyyTuple(tempModel, i, j, precision); // i IS NOT DEFINED HERE!
var updatedIrr = tempIrrAndFyy.Item1;
var updatedFyy = tempIrrAndFyy.Item2;
// stop decrementing cap because we got a good-enough IRR to save this pair
if (Math.Abs(currentIrr - updatedIrr) >= irrDiffPrecision || updatedFyy < minFyy)
{
var endingCap = j + capStep; // go back one step since we just stepped out of bounds
maxRateObj = new Dictionary<string, double>
{
{rlRateKey, i }, // i IS NOT DEFINED HERE!
{capKey, endingCap }
};
// set vars so the outer loop can check if we are still operating within constraints
lowestCapSoFar = endingCap;
tempIrr = updatedIrr;
tempFyy = updatedFyy;
break;
}
}
// Break out of the outerloop if the cap gets too low
if (lowestCapSoFar <= minCap) { return; }
// ... or if Fyy gets too low (when credit policy is enforced)
if (enforceFyyPolicy && tempFyy < minFyy) { return; }
// ... or if Irr gets too low (when credit policy is enforced)
if (enforceIrrPolicy && Math.Abs(tempIrr - targetIrr) > irrDiffPrecision) { return; }
});
Don't do degreeOfParallelism number of parallel iterations. Perform the same number of iterations in your parallel loop as you were doing previously, but spread them over your processors by using ParallelOptions.MaxDegreeOfParallelism.
It looks to me like it's a matter of performing a parallel loop from 0 to numSteps (calculated below), setting the MaxDegreeOfParallelism of your loop, and reconstituting i from the value of x in the loop body. Something like...
var start = (currentRlRate + rlRateStep);
var end = maxRlRate;
var numSteps = (end - start) / rlRateStep;
Parallel.For(0,
numSteps,
new ParallelOptions {
MaxDegreeOfParallelism = degreeOfParallelism
},
x => {
var i = (x * rlRateStep) + start;
//lean on i
});
I have an implementation where I need to loop through a collection of documents and, based on a certain condition, merge the documents.
The merge condition is very simple, if present document's doctype is same as later document's doctype, then copy all the pages from the later doctype and append it to the pages of present document's and remove the later document from the collection.
Note : Both response.documents and response.documents[].pages are List<> collections.
I was trying this but was getting following exception Once I remove the document.
collection was modified enumeration may not execute
Here is the code:
int docindex = 0;
foreach( var document in response.documents)
{
string presentDoctype = string.Empty;
string laterDoctype = string.Empty;
presentDoctype = response.documents[docindex].doctype;
laterDoctype = response.documents[docindex + 1].doctype;
if (laterDoctype == presentDoctype)
{
response.documents[docindex].pages.AddRange(response.documents[docindex + 1].pages);
response.documents.RemoveAt(docindex + 1);
}
docindex = docindex + 1;
}
Ex:
reponse.documents[0].doctype = "BankStatement" //page count = 1
reponse.documents[1].doctype = "BankStatement" //page count = 2
reponse.documents[2].doctype = "BankStatement" //page count = 2
reponse.documents[3].doctype = "BankStatement" //page count = 1
reponse.documents[4].doctype = "BankStatement" //page count = 4
Expected result:
response.documents[0].doctype = "BankStatement" //page count = 10
Please suggest. I appreciate your help.
I would recommend you to look at LINQ GroupBy and Distinct to process your response.documents
Example (as I cannot use your class, I give example using my own defined class):
Suppose you have DummyClass
/// <summary>
/// Simple value holder used to demonstrate LINQ GroupBy in the answer above.
/// Public fields are intentional — they mirror the original sample.
/// </summary>
public class DummyClass
{
    public int DummyInt;
    public string DummyString;
    public double DummyDouble;

    /// <summary>Creates an instance with default field values.</summary>
    public DummyClass()
    {
    }

    /// <summary>Creates an instance populated with the supplied values.</summary>
    public DummyClass(int dummyInt, string dummyString, double dummyDouble)
    {
        this.DummyDouble = dummyDouble;
        this.DummyString = dummyString;
        this.DummyInt = dummyInt;
    }
}
Then doing GroupBy as shown,
DummyClass dc1 = new DummyClass(1, "This dummy", 2.0);
DummyClass dc2 = new DummyClass(2, "That dummy", 2.0);
DummyClass dc3 = new DummyClass(1, "These dummies", 2.0);
DummyClass dc4 = new DummyClass(2, "Those dummies", 2.0);
DummyClass dc5 = new DummyClass(3, "The dummies", 2.0);
List<DummyClass> dummyList = new List<DummyClass>() { dc1, dc2, dc3, dc4, dc5 };
var groupedDummy = dummyList.GroupBy(x => x.DummyInt).ToList();
Will create three groups, marked by DummyInt
Then to process the group you could do
for (int i = 0; i < groupedDummy.Count; ++i){
foreach (DummyClass dummy in groupedDummy[i]) { //this will process the (i-1)-th group
//do something on this group
//groupedDummy[0] will consists of "this" and "these", [1] "that" and "those", while [2] "the"
//Try it out!
}
}
In your case, you should create group based on doctype.
Once you create groups based on your doctype, everything else would be pretty "natural" for you to continue.
Another LINQ method which you might be interested in would be Distinct. But I think for this case, GroupBy would be the primary method you would like to use.
Use only "for loop" instead of "foreach".
foreach holds an enumerator over the collection, and the collection cannot be modified while you are looping through it.
Here is an example using groupBy, hope this help.
//mock a collection
ICollection<string> collection1 = new List<string>();
for (int i = 0; i < 10; i++)
{
collection1.Add("BankStatement");
}
for (int i = 0; i < 5; i++)
{
collection1.Add("BankStatement2");
}
for (int i = 0; i < 4; i++)
{
collection1.Add("BankStatement3");
}
//merge and get count
var result = collection1.GroupBy(c => c).Select(c => new { name = c.First(), count = c.Count().ToString() }).ToList();
foreach (var item in result)
{
Console.WriteLine(item.name + ": " + item.count);
}
Just use AddRange()
response.documents[0].pages.AddRange(response.documents[1].pages);
it will merge all pages of document[1] with the document[0] into document[0]
I'm a little bit confused about the fastest way to insert large collections into a Cassandra database. I read that I shouldn't use batch insert because it's intended for atomicity. Cassandra even throws a warning telling me to use asynchronous writes for performance.
I've used code for the fastest insert without 'batch' keyword:
// NOTE: the original paste used curly 'smart quotes' around the string
// literals (e.g. AddContactPoint(.127.0.0.1")), which does not compile.
// They are replaced with straight double quotes below; the string contents
// are unchanged.
var cluster = Cluster.Builder()
    .AddContactPoint("127.0.0.1")
    .Build();
var session = cluster.Connect();
// Save off the prepared statement you're going to use, and reuse it for
// every bind — preparing once is the point of PreparedStatement.
var statement = session.Prepare("INSERT INTO tester.users (userID, firstName, lastName) VALUES (?,?,?)");
var tasks = new List<Task>();
for (int i = 0; i < 1000; i++)
{
    // please bind with whatever actually useful data you're importing
    var bind = statement.Bind(i, "John", "Tester");
    var resultSetFuture = session.ExecuteAsync(bind);
    tasks.Add(resultSetFuture);
}
// Block until every async insert has completed before shutting down.
Task.WaitAll(tasks.ToArray());
cluster.Shutdown();
from: https://medium.com/#foundev/cassandra-batch-loading-without-the-batch-keyword-40f00e35e23e
But it's still much slower than batch option i'm using. My current code looks like this:
IList<Movie> moviesList = Movie.CreateMoviesCollectionForCassandra(collectionEntriesNumber);
var preparedStatements = new List<PreparedStatement>();
foreach (var statement in preparedStatements)
{
statement.SetConsistencyLevel(ConsistencyLevel.One);
}
var statementBinding = new BatchStatement();
statementBinding.SetBatchType(BatchType.Unlogged);
for (int i = 0; i < collectionEntriesNumber; i++)
{
preparedStatements.Add(Session.Prepare("INSERT INTO Movies (id, title, description, year, genres, rating, originallanguage, productioncountry, votingsnumber, director) VALUES (?,?,?,?,?,?,?,?,?,?)"));
}
for (int i = 0; i < collectionEntriesNumber; i++)
{
statementBinding.Add(preparedStatements[i].Bind(moviesList[i].Id, moviesList[i].Title,
moviesList[i].Description, moviesList[i].Year, moviesList[i].Genres, moviesList[i].Rating,
moviesList[i].OriginalLanguage, moviesList[i].ProductionCountry, moviesList[i].VotingsNumber,
new Director(moviesList[0].Director.Id, moviesList[i].Director.Firstname,
moviesList[i].Director.Lastname, moviesList[i].Director.Age)));
}
watch.Start();
Session.ExecuteAsync(statementBinding);
watch.Stop();
It really works much much faster but i can only insert ~2500 prepared statements, no more, and I want to measure time of about 100000 objects insertion.
Is my code correct? Maybe I should just increase the insert threshold?
Please, explain my how to do it right way.
Remember that you should prepare your statement once and reuse that same PreparedStatement, binding it to different parameters.
You can use small sized batches if you are targeting the same partition, if not you should use individual requests.
When using individual requests, you can schedule executions in parallel and limit the amount of outstanding requests using a semaphore.
Something like:
// Executes the given statements with bounded concurrency: the array is split
// into chunks, one worker task per chunk, and a semaphore caps the number of
// requests in flight at any moment. Returns the elapsed milliseconds.
// NOTE(review): because chunkSize uses integer division, (parallelism + 1)
// chunks may not cover statements.Length for some inputs; any uncovered tail
// statements would never execute and leave null slots in `tasks` — confirm
// against callers.
// NOTE(review): semaphore.Release() is not in a finally block, so a faulted
// ExecuteAsync/await permanently consumes a permit.
public async Task<long> Execute(
IStatement[] statements, int parallelism, int maxOutstandingRequests)
{
// Caps the number of outstanding (started but not yet awaited) requests.
var semaphore = new SemaphoreSlim(maxOutstandingRequests);
var tasks = new Task<RowSet>[statements.Length];
var chunkSize = statements.Length / parallelism;
if (chunkSize == 0)
{
chunkSize = 1;
}
var statementLength = statements.Length;
var launchTasks = new Task[parallelism + 1];
var watch = new Stopwatch();
watch.Start();
for (var i = 0; i < parallelism + 1; i++)
{
// Each worker owns the slice [startIndex, startIndex + chunkSize).
var startIndex = i * chunkSize;
//start to launch in parallel
launchTasks[i] = Task.Run(async () =>
{
for (var j = 0; j < chunkSize; j++)
{
var index = startIndex + j;
if (index >= statementLength)
{
break;
}
// Wait for a free slot before issuing the next request.
await semaphore.WaitAsync();
var t = _session.ExecuteAsync(statements[index]);
tasks[index] = t;
var rs = await t;
semaphore.Release();
}
});
}
// First wait for all workers to finish launching/awaiting their chunks,
// then for every individual request task.
await Task.WhenAll(launchTasks);
await Task.WhenAll(tasks);
watch.Stop();
return watch.ElapsedMilliseconds;
}
I have run some tests for .Net CF. Basically, I wanted to compare for, foreach, extenstion method ForEach and LINQ query. Here is the whole code (you can skip it, to get to the point which bothers me)
namespace ForEachForLINQPerTest
{
// Reference-type wrapper used to compare the cost of field access versus
// property access in the benchmarks below.
class IntBox
{
// Accessed directly (no accessor call).
public int fieldX;
// Auto-property: access goes through get/set accessor methods.
public int PropertyX { get; set; }
}
public partial class MainPage : PhoneApplicationPage
{
/// <summary>
/// size of tested List
/// </summary>
public const int TEST_SIZE = 1000000;
//
private List<int> m_intList = new List<int>(TEST_SIZE);
//
private List<IntBox> m_intBoxList = new List<IntBox>(TEST_SIZE);
//
private Stopwatch m_stopwatch = null;
// Constructor: pre-fills both benchmark lists with TEST_SIZE elements so all
// tests iterate collections of identical length.
public MainPage()
{
InitializeComponent();
for (int i = 0; i < TEST_SIZE; ++i)
{
m_intBoxList.Add( new IntBox());
m_intList.Add(0);
}
}
// Runs every benchmark twice: the first call warms up the JIT, the second
// call's result is what gets displayed.
private void startButton_Click(object sender, RoutedEventArgs e)
{
var forTest = ForTest(); // Jitter preheat
forTest = ForTest();
forResultTextBlock.Text = forTest;
var foreachTest = ForEachTest();
foreachTest = ForEachTest();
foreachResultTextBlock.Text = foreachTest;
var exTest = Extenstion();
exTest = Extenstion();
ExtensionResultTextBlock.Text = exTest;
var linqTest = LINQTest();
linqTest = LINQTest();
LINQResultTextBlock.Text = linqTest;
}
// Benchmarks LINQ query syntax over the int list, the IntBox field, and the
// IntBox property.
// NOTE(review): each Start/Stop pair brackets only the query *definition*;
// because of deferred execution the enumeration happens in the .ToList()
// calls OUTSIDE the timed regions, so these timings do not measure query
// execution (this is why intListTime comes out as 0 — see discussion below
// in this thread).
private string LINQTest()
{
m_stopwatch = new Stopwatch();
m_stopwatch.Start();
long temp = 0;
var result = from x in m_intList
select temp += x;
m_stopwatch.Stop();
var intListTime = m_stopwatch.ElapsedMilliseconds;
m_stopwatch.Reset();
// Forces the (so far unevaluated) query to actually run — after timing.
result.ToList();
m_stopwatch.Start();
var result2 = from x in m_intBoxList
select temp += x.fieldX;
m_stopwatch.Stop();
var intBoxListFieldTime = m_stopwatch.ElapsedMilliseconds;
m_stopwatch.Reset();
result2.ToList();
m_stopwatch.Start();
var result3 = from x in m_intBoxList
select temp += x.PropertyX;
m_stopwatch.Stop();
var intBoxListPropertyTime = m_stopwatch.ElapsedMilliseconds;
m_stopwatch.Reset();
result3.ToList();
return String.Format("LINQ test List<int> = {0} \n List<IntBox> field = {1} \n List<IntBos> property = {2}", intListTime, intBoxListFieldTime, intBoxListPropertyTime);
}
// Benchmarks List<T>.ForEach over the same three access patterns. Each call
// invokes a delegate per element, which adds call overhead compared with the
// plain for/foreach loops above. (Method name's typo kept — it is referenced
// by startButton_Click.)
private string Extenstion()
{
m_stopwatch = new Stopwatch();
m_stopwatch.Start();
long temp = 0;
m_intList.ForEach(i => temp += i);
m_stopwatch.Stop();
var intListTime = m_stopwatch.ElapsedMilliseconds;
m_stopwatch.Reset();
m_stopwatch.Start();
m_intBoxList.ForEach(i => temp += i.fieldX);
m_stopwatch.Stop();
var intBoxListFieldTime = m_stopwatch.ElapsedMilliseconds;
m_stopwatch.Reset();
m_stopwatch.Start();
m_intBoxList.ForEach(i => temp += i.PropertyX);
m_stopwatch.Stop();
var intBoxListPropertyTime = m_stopwatch.ElapsedMilliseconds;
m_stopwatch.Reset();
return String.Format("Extenstion test List<int> = {0} \n List<IntBox> field = {1} \n List<IntBos> property = {2}", intListTime, intBoxListFieldTime, intBoxListPropertyTime);
}
// Benchmarks plain foreach loops over the int list, the IntBox field, and the
// IntBox property; `temp` accumulates so the loop bodies cannot be optimized
// away.
private string ForEachTest()
{
m_stopwatch = new Stopwatch();
long temp = 0;
m_stopwatch.Start();
foreach(int item in m_intList)
{
temp += item;
}
m_stopwatch.Stop();
var intListTime = m_stopwatch.ElapsedMilliseconds;
m_stopwatch.Reset();
m_stopwatch.Start();
foreach (IntBox item in m_intBoxList)
{
temp += item.fieldX;
}
m_stopwatch.Stop();
var intBoxListFieldTime = m_stopwatch.ElapsedMilliseconds;
m_stopwatch.Reset();
m_stopwatch.Start();
foreach (IntBox item in m_intBoxList)
{
temp += item.PropertyX;
}
m_stopwatch.Stop();
var intBoxListPropertyTime = m_stopwatch.ElapsedMilliseconds;
m_stopwatch.Reset();
return String.Format("ForEach test List<int> = {0} \n List<IntBox> field = {1} \n List<IntBos> property = {2}", intListTime, intBoxListFieldTime, intBoxListPropertyTime);
}
// Benchmarks classic indexed for loops over the same three access patterns.
// NOTE(review): the second and third loops bound on m_intList.Count while
// indexing m_intBoxList — safe only because the constructor fills both lists
// to TEST_SIZE, but the bound should really be m_intBoxList.Count.
private string ForTest()
{
m_stopwatch = new Stopwatch();
m_stopwatch.Start();
long temp = 0;
for (int i = 0; i < TEST_SIZE; ++i)
{
temp += m_intList[i];
}
m_stopwatch.Stop();
var intListTime = m_stopwatch.ElapsedMilliseconds;
m_stopwatch.Reset();
m_stopwatch.Start();
for (int i = 0; i < m_intList.Count; ++i)
{
temp += m_intBoxList[i].fieldX;
}
m_stopwatch.Stop();
var intBoxListFieldTime = m_stopwatch.ElapsedMilliseconds;
m_stopwatch.Reset();
m_stopwatch.Start();
for (int i = 0; i < m_intList.Count; ++i)
{
temp += m_intBoxList[i].PropertyX;
}
m_stopwatch.Stop();
var intBoxListPropertyTime = m_stopwatch.ElapsedMilliseconds;
m_stopwatch.Reset();
return String.Format("For loop test List<int> = {0} \n List<IntBox> field = {1} \n List<IntBos> property = {2}", intListTime, intBoxListFieldTime, intBoxListPropertyTime);
}
}
}
And here I am confused
m_stopwatch = new Stopwatch();
m_stopwatch.Start();
long temp = 0;
var result = from x in m_intList
select temp += x;
m_stopwatch.Stop();
var intListTime = m_stopwatch.ElapsedMilliseconds;
m_stopwatch.Reset();
result.ToList();
The output is:
For loop test List = 93
List field = 119 // ref -> field
List property = 136 // ref -> property -> field properties are just functions for CF
ForEach test List = 88
List field = 140
List property = 152
Extenstions test List = 176 // another function is called.
List field = 220
List property = 239
LINQ test List = 0 Why?
List field = 163
List property = 165
Why intListTime == 0? What am I doing wrong? Also the last two values for field and property are almost the same (run it a few times). Does it mean that PropertyX in LINQ query is evaluated in-line?
The first time is zero because expression tree is built at compile time and it gets evaluated on ToList call that you have not included in timing.
For field and property access timing, I wouldn't worry too much - in reality, in release build, simple property accessor will be get inlined giving same performance as field access. For linq case, you might be seeing the same performance because linq internally might be converting property/field access into a method call and it would result in same timings (as I believe that method call overhead will be probably large compared to the field/prop access.
this is called "deferred execution". the linq statement isn't evaluated until it needs to be. move the ToList to be before you stop the clock and the time will go up