I have a nested for loop which takes 30 seconds to run and I'm looking to parallelize it based on the number of cores on my machine.
Original loop:
var currentCap = model.LoanCap;
var currentRlRate = model.RlRate;
var maxRateObj = new Dictionary<string, double>();
var maxRateOuterLoopCount = 0;
var maxRateInnerLoopCount = 0;
for (var i = currentRlRate + rlRateStep; i <= maxRlRate; i += rlRateStep)
{
maxRateOuterLoopCount++;
var tempFyy = currentFyy;
var tempIrr = currentIrr;
var lowestCapSoFar = currentCap;
var startingCap = maxRateObj.ContainsKey(capKey) ? maxRateObj[capKey] : currentCap;
for (var j = startingCap - capStep; j >= minCap; j -= capStep)
{
maxRateInnerLoopCount++;
tempModel = new ApplicationModel(model);
var tempIrrAndFyy = GetIrrAndFyyTuple(tempModel, i, j, precision);
var updatedIrr = tempIrrAndFyy.Item1;
var updatedFyy = tempIrrAndFyy.Item2;
// stop decrementing cap because we got a good-enough IRR to save this pair
if (Math.Abs(currentIrr - updatedIrr) >= irrDiffPrecision || updatedFyy < minFyy)
{
var endingCap = j + capStep; // go back one step since we just stepped out of bounds
maxRateObj = new Dictionary<string, double>
{
{rlRateKey, i },
{capKey, endingCap }
};
// set vars so the outer loop can check if we are still operating within constraints
lowestCapSoFar = endingCap;
tempIrr = updatedIrr;
tempFyy = updatedFyy;
break;
}
}
// Break out of the outer loop if the cap gets too low
if (lowestCapSoFar <= minCap) { break; }
// ... or if Fyy gets too low (when credit policy is enforced)
if (enforceFyyPolicy && tempFyy < minFyy) { break; }
// ... or if Irr gets too low (when credit policy is enforced)
if (enforceIrrPolicy && Math.Abs(tempIrr - targetIrr) > irrDiffPrecision) { break; }
}
Now, when I move this loop into the body of Parallel.For(), I lose the context I previously had for the variable i... How can I get that back, since I need i for my maxRateObj?
var degreeOfParallelism = Environment.ProcessorCount;
var result = Parallel.For(0, degreeOfParallelism, x =>
{
var tempFyy = currentFyy;
var tempIrr = currentIrr;
var lowestCapSoFar = currentCap;
var startingCap = maxRateObj.ContainsKey(capKey) ? maxRateObj[capKey] : currentCap;
for (var j = startingCap - capStep; j >= minCap; j -= capStep)
{
tempModel = new ApplicationModel(model);
var tempIrrAndFyy = GetIrrAndFyyTuple(tempModel, i, j, precision); // i IS NOT DEFINED HERE!
var updatedIrr = tempIrrAndFyy.Item1;
var updatedFyy = tempIrrAndFyy.Item2;
// stop decrementing cap because we got a good-enough IRR to save this pair
if (Math.Abs(currentIrr - updatedIrr) >= irrDiffPrecision || updatedFyy < minFyy)
{
var endingCap = j + capStep; // go back one step since we just stepped out of bounds
maxRateObj = new Dictionary<string, double>
{
{rlRateKey, i }, // i IS NOT DEFINED HERE!
{capKey, endingCap }
};
// set vars so the outer loop can check if we are still operating within constraints
lowestCapSoFar = endingCap;
tempIrr = updatedIrr;
tempFyy = updatedFyy;
break;
}
}
// Break out of the outer loop if the cap gets too low
if (lowestCapSoFar <= minCap) { return; }
// ... or if Fyy gets too low (when credit policy is enforced)
if (enforceFyyPolicy && tempFyy < minFyy) { return; }
// ... or if Irr gets too low (when credit policy is enforced)
if (enforceIrrPolicy && Math.Abs(tempIrr - targetIrr) > irrDiffPrecision) { return; }
});
Don't run just degreeOfParallelism iterations in your parallel loop. Perform the same number of iterations as you were doing previously, but spread them over your processors using ParallelOptions.MaxDegreeOfParallelism.
It looks to me like it's a matter of performing a parallel loop from 0 to numSteps (calculated below), setting the MaxDegreeOfParallelism of your loop, and reconstituting i from the value of x in the loop body. Something like...
var start = (currentRlRate + rlRateStep);
var end = maxRlRate;
var numSteps = (end - start) / rlRateStep;
Parallel.For(0,
numSteps,
new ParallelOptions {
MaxDegreeOfParallelism = degreeOfParallelism
},
x => {
var i = (x * rlRateStep) + start;
//lean on i
});
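To make that concrete, here is a minimal sketch of what the body could look like with i reconstituted. It reuses the variables from your question (model, precision, capStep, minCap, currentCap, currentIrr, minFyy, irrDiffPrecision, rlRateKey, capKey) plus the start/numSteps values above, and it deliberately drops the parts of the original loop that depend on previous iterations (the startingCap carried over via maxRateObj and the outer break conditions), since those do not translate directly to a parallel loop. Each iteration records its own candidate pair in a thread-safe container instead of writing the shared dictionary:
// Sketch only; requires using System.Collections.Concurrent.
var candidates = new ConcurrentBag<Dictionary<string, double>>();
Parallel.For(0,
    numSteps,
    new ParallelOptions { MaxDegreeOfParallelism = degreeOfParallelism },
    x =>
    {
        var i = (x * rlRateStep) + start; // reconstitute i from x
        for (var j = currentCap - capStep; j >= minCap; j -= capStep)
        {
            var tempModel = new ApplicationModel(model);
            var tempIrrAndFyy = GetIrrAndFyyTuple(tempModel, i, j, precision);
            var updatedIrr = tempIrrAndFyy.Item1;
            var updatedFyy = tempIrrAndFyy.Item2;
            if (Math.Abs(currentIrr - updatedIrr) >= irrDiffPrecision || updatedFyy < minFyy)
            {
                // record this iteration's (rate, cap) candidate instead of touching shared state
                candidates.Add(new Dictionary<string, double>
                {
                    { rlRateKey, i },
                    { capKey, j + capStep }
                });
                break;
            }
        }
    });
// Afterwards, pick the candidate you want from `candidates` on the calling thread.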
The following code section worked appropriately:
Parallel.For(
0, numberOfRunsNeeded, j =>
{
var copyOfj = j;
var researchItems = viewModel.ResearchItems[queryNumber].GetRange((int)(copyOfj * itemsAtOnce), Math.Min(itemsAtOnce, viewModel.ResearchItems[queryNumber].Count - (copyOfj * itemsAtOnce)));
var finalQuery = GetCorrectedQuery(query.BaseQuery, query.SQLVariants[copyOfi]);
if (researchItems.Count > 0)
{
finalQuery = GetCorrectedQueryWithResearchItems(finalQuery, researchItems, query.SQLVariants[copyOfi]);
}
PerformSingleRun(query, copyOfi, dataSource, finalQuery, copyOfj, viewModel);
}
);
This updated code shows what look like captured-variable errors: finalQuery comes back as though the Parallel.For loop were repeating the same value many times:
Parallel.For(
0, numberOfRunsNeeded, parallelOptionsWithMaxDegreeOfParallelism, j =>
{
var copyOfj = j;
if (researchItemsPresent)
{
var researchItems = ViewModel.ResearchItems[queryNumber].GetRange(copyOfj * itemsAtOnce, Math.Min(itemsAtOnce, ViewModel.ResearchItems[queryNumber].Count - (copyOfj * itemsAtOnce)));
finalQuery = GetAdaptedBaseQueryWithResearchItemsInserted(finalQuery, researchItems, query.SQLVariants[copyOfi]);
}
PerformSingleRun(query, copyOfi, dataSource, finalQuery, copyOfj, viewModel);
}
);
As stated above, I already have working code - I'm just trying to understand what I did wrong in my revision. Full methods listed below:
Previous, working:
public static void ProcessSingleQuery(int queryNumber, ViewModel viewModel)
{
var query = new Query
{
Name = viewModel.QueryNames[queryNumber],
BaseQuery = viewModel.BaseQueries[queryNumber],
SelectedDatabases = viewModel.SelectedDatabases[queryNumber],
SQLVariants = viewModel.SQLVariants[queryNumber],
Usernames = viewModel.Usernames[queryNumber],
Passwords = viewModel.Passwords[queryNumber],
CSVFiles = viewModel.CSVFiles[queryNumber],
CSVFileAliases = viewModel.CSVFileAliases[queryNumber],
ColumnDelimiters = viewModel.ColumnDelimiters[queryNumber],
HeaderRowsPresent = viewModel.HeaderRowsPresent[queryNumber],
TextDelimiters = viewModel.TextDelimiters[queryNumber],
ResearchItemColumnNumber = viewModel.ResearchItemColumnNumber[queryNumber]
};
for (var i = 0; i < query.SelectedDatabases.Count; i++)
{
var dataSource = GetDataSource(query.SelectedDatabases[i]);
var itemsAtOnce = ViewModel.ItemsAtOnceBySQLVariant[query.SQLVariants[i]];
if (query.SelectedDatabases[i].Equals("CSV"))
{
RefreshOrCreateSchemaIniFile(query);
dataSource = query.CSVFiles[0].DirectoryName;
}
var numberOfRunsNeeded = Math.Max(
(int)Math.Ceiling((double)viewModel.ResearchItems[queryNumber].Count / itemsAtOnce), 1
);
viewModel.QueryRunsCompletedMaximum += numberOfRunsNeeded;
var copyOfi = i;
Parallel.For(
0, numberOfRunsNeeded, j =>
{
var copyOfj = j;
var researchItems = viewModel.ResearchItems[queryNumber].GetRange((int)(copyOfj * itemsAtOnce), Math.Min(itemsAtOnce, viewModel.ResearchItems[queryNumber].Count - (copyOfj * itemsAtOnce)));
var finalQuery = GetCorrectedQuery(query.BaseQuery, query.SQLVariants[copyOfi]);
if (researchItems.Count > 0)
{
finalQuery = GetCorrectedQueryWithResearchItems(finalQuery, researchItems, query.SQLVariants[copyOfi]);
}
PerformSingleRun(query, copyOfi, dataSource, finalQuery, copyOfj, viewModel);
}
);
}
GeneralTools.CombineAndDeleteQueryResults(Environment.GetFolderPath(Environment.SpecialFolder.Desktop), query.Name);
if (query.ResearchItemColumnNumber != 0)
{
CompileMissingItemsReport(Environment.GetFolderPath(Environment.SpecialFolder.Desktop), query.Name, viewModel, queryNumber);
}
}
Revised, broken:
public static void ProcessSingleQuery(int queryNumber, ViewModel viewModel)
{
var query = new Query
{
Name = ViewModel.QueryNames[queryNumber],
BaseQuery = ViewModel.BaseQueries[queryNumber],
SelectedDatabases = ViewModel.SelectedDatabases[queryNumber],
SQLVariants = ViewModel.SQLVariants[queryNumber],
Usernames = ViewModel.Usernames[queryNumber],
Passwords = ViewModel.Passwords[queryNumber],
CSVFiles = ViewModel.CSVFiles[queryNumber],
CSVFileAliases = ViewModel.CSVFileAliases[queryNumber],
ColumnDelimiters = ViewModel.ColumnDelimiters[queryNumber],
HeaderRowsPresent = ViewModel.HeaderRowsPresent[queryNumber],
TextDelimiters = ViewModel.TextDelimiters[queryNumber],
ResearchItemColumnNumber = ViewModel.ResearchItemColumnNumber[queryNumber]
};
for (var i = 0; i < query.SelectedDatabases.Count; i++)
{
var finalQuery = GetAdaptedBaseQuery(query, query.SQLVariants[i]);
var dataSource = GetDataSource(query.SelectedDatabases[i]);
var itemsAtOnce = ViewModel.ItemsAtOnceBySQLVariant[query.SQLVariants[i]];
if (query.SelectedDatabases[i].Contains("CSV"))
{
CreateSchemaIniFile(query);
dataSource = query.CSVFiles[0].DirectoryName;
}
var researchItemsPresent = ViewModel.ResearchItems[queryNumber].Count > 0;
var numberOfRunsNeeded = Math.Max(
(int)Math.Ceiling((double)ViewModel.ResearchItems[queryNumber].Count / itemsAtOnce), 1
);
viewModel.QueryRunsCompletedMaximum += numberOfRunsNeeded;
var copyOfi = i;
var parallelOptionsWithMaxDegreeOfParallelism = new ParallelOptions
{
MaxDegreeOfParallelism =
query.SQLVariants[i] == SQLVariant.Teradata ? 6 : -1
};
Parallel.For(
0, numberOfRunsNeeded, parallelOptionsWithMaxDegreeOfParallelism, j =>
{
var copyOfj = j;
if (researchItemsPresent)
{
var researchItems = ViewModel.ResearchItems[queryNumber].GetRange(copyOfj * itemsAtOnce, Math.Min(itemsAtOnce, ViewModel.ResearchItems[queryNumber].Count - (copyOfj * itemsAtOnce)));
finalQuery = GetAdaptedBaseQueryWithResearchItemsInserted(finalQuery, researchItems, query.SQLVariants[copyOfi]);
}
PerformSingleRun(query, copyOfi, dataSource, finalQuery, copyOfj, viewModel);
}
);
}
GeneralTools.CombineAndDeleteQueryResults(Environment.GetFolderPath(Environment.SpecialFolder.Desktop), query.Name);
if (query.ResearchItemColumnNumber != 0)
{
CompileMissingItemsReport(Environment.GetFolderPath(Environment.SpecialFolder.Desktop), query.Name, queryNumber);
}
}
Why your broken version is broken
The problem appears to be two-fold:
First, you have a variable called finalQuery in an outer scope that you also use in a closure, specifically the one passed in as the body delegate of your Parallel.For; it is therefore the same variable in all iterations of your Parallel.For.
Second, you both read and write this finalQuery variable in that same Parallel.For body, notably with the code:
finalQuery = GetAdaptedBaseQueryWithResearchItemsInserted(finalQuery, ...)
...where you'll see you pass the current value of finalQuery as your base query.
The order in which the various iterations of that loop reach that line of code depends on system architecture and processor load, causing a race condition; access to your variable is also not governed by a lock.
Why the other version worked
In your working version, finalQuery is a variable that is declared within, and is therefore entirely local to, the Parallel.For body function. This prevents any iteration from seeing values of finalQuery from other iterations. More importantly, each finalQuery is constructed from a common, invariant base query (query.BaseQuery) with this code:
var finalQuery = GetCorrectedQuery(query.BaseQuery, ...)
And although you further adjust the value of finalQuery in the line below:
finalQuery = GetCorrectedQueryWithResearchItems(finalQuery, ...)
...this is fine because this finalQuery variable is local to your lambda function and its value is based solely on the previous line, not on varying values written by other iterations of the Parallel.For, as was the case in your race condition.
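Concretely, a minimal sketch of the fix, using the names from your revised method, is to build the query inside the loop body again from the invariant base on every iteration. (If you keep the outer finalQuery declaration for other purposes, the inner variable needs a different name, hence localQuery below; otherwise simply move the declaration inside.)
Parallel.For(
    0, numberOfRunsNeeded, parallelOptionsWithMaxDegreeOfParallelism, j =>
    {
        var copyOfj = j;
        // local to this iteration: built from the invariant base, never shared between iterations
        var localQuery = GetAdaptedBaseQuery(query, query.SQLVariants[copyOfi]);
        if (researchItemsPresent)
        {
            var researchItems = ViewModel.ResearchItems[queryNumber].GetRange(
                copyOfj * itemsAtOnce,
                Math.Min(itemsAtOnce, ViewModel.ResearchItems[queryNumber].Count - (copyOfj * itemsAtOnce)));
            localQuery = GetAdaptedBaseQueryWithResearchItemsInserted(localQuery, researchItems, query.SQLVariants[copyOfi]);
        }
        PerformSingleRun(query, copyOfi, dataSource, localQuery, copyOfj, viewModel);
    }
);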
I need to create a program that scrapes a website, and I used Thread to do it.
Example:
I have 100 pages and I need to divide them between threads; instead of fetching every page on one thread, I want to choose the number of threads and split the pages across them:
2 threads - 50 pages/thread
4 threads - 25 pages/thread
I tried the code below, but when each thread gets to its last pages it becomes very slow.
I searched for a way to solve this before asking but couldn't find one, so I need help.
int so_thread = 10;//thread number
int page_du = 0;
List<NameValueCollection> List_item = new List<NameValueCollection>();
Thread[] threads = new Thread[so_thread];
int dem = 0;
await Task.Run(() =>
{
for (int i = 1; i <= so_thread; i++)
{
if ((Int32.Parse(o_sopage.Text) % so_thread) != 0 && i == so_thread)
{
page_du = Int32.Parse(o_sopage.Text) % so_thread;//Int32.Parse(o_sopage.Text) == page number need get
}
threads[i - 1] = new Thread((object data) =>
{
Array New_Data = new object[2];
New_Data = (Array)data;
int _i = (int)New_Data.GetValue(0);
int _pagedu = (int)New_Data.GetValue(1);
int page_per_thread = Int32.Parse(o_sopage.Text) / so_thread;//Int32.Parse(o_sopage.Text) == page number need get
for (int j = ((page_per_thread * _i) - page_per_thread) + 1; j <= ((page_per_thread * _i) + _pagedu); j++)
{
//MessageBox.Show(j.ToString());
var TG = ebay.GetPage(j);
lock (List_item)
{
List_item.AddRange(TG);
dem++;
progressBar1.Invoke((MethodInvoker)delegate
{
progressBar1.Value = dem;
});
}
}
});
object DATA = new object[2] { i, page_du };
threads[i - 1].Start(DATA);
}
});
Use Parallel.ForEach instead of creating the threads on your own.
Parallel.ForEach(yourCollection, item => { /* your code here */ });
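In your case the collection is really just the page numbers, so a minimal sketch (reusing ebay.GetPage, o_sopage and so_thread from your code, and reading the page count on the UI thread before starting the loop) could use Parallel.For and let the scheduler balance the pages across threads:
// Sketch only; requires using System.Collections.Concurrent and System.Collections.Specialized.
int totalPages = Int32.Parse(o_sopage.Text); // read on the UI thread, before the parallel loop
var listItems = new ConcurrentBag<NameValueCollection>();
Parallel.For(1, totalPages + 1,
    new ParallelOptions { MaxDegreeOfParallelism = so_thread },
    j =>
    {
        // each iteration fetches a single page; no manual pages-per-thread math needed
        var pageItems = ebay.GetPage(j);
        foreach (var item in pageItems)
        {
            listItems.Add(item);
        }
        // progress reporting still needs to be marshalled back to the UI thread,
        // e.g. via progressBar1.Invoke as in the original code
    });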
This is my very first post here on StackOverflow, so please tell me if I did anything wrong; also, English is not my native language, so forgive me if there are any grammatical errors.
My question is: how can I permute the items of an array of type "Location"? I need to get all possible permutations of the waypoints given by the user and then calculate the best route based on time or distance. (I don't want to use the normal route calculation.)
I've searched for algorithms, but with all of them, when I put my "Location[]" array in the function's parameter I get an error saying the object needs to be IEnumerable, and I don't know how to convert to that, if it is even possible; I have never worked with IEnumerable.
If it is of any help, this is my code for calculating the route:
//Gets the waypoints from a listBox provided by the user, "mode" selects between best time and best distance
//BackgroundWorker so the UI doesn't freeze; returns the optimal waypoint order
public Location[] CalcularRota(Location[] waypoints, int mode, BackgroundWorker work, DoWorkEventArgs e)
{
//Declarations
string origem = "";
string destino = "";
Rota[] prop = new Rota[100]; //this index is the number of times the algorithm will be executed, more equals accuracy but much more time to complete
Rota bestDist = new Rota();
Rota bestTime = new Rota();
DirectionService serv = new DirectionService();
DirectionRequest reqs = new DirectionRequest();
DirectionResponse resp;
Random rnd = new Random();
Location[] rndWays;
int dist = 0;
int ti = 0;
bestDist.Distance = 1000000000; //put higher values for the first comparison to be true (end of code)
bestTime.Time = 1000000000;
if (waypoints != null)
{
reqs.Sensor = false;
reqs.Mode = TravelMode.driving;
for (int i = 0; i < prop.Length; i++) //initializes prop
prop[i] = new Rota();
for (int i = 0; i < prop.Length; i++)
{
rndWays = waypoints.OrderBy(x => rnd.Next()).ToArray(); //randomizes the order; I want to get all permutations and then test them
//but I don't know how, so I've been using randomization
dist = ti = 0;
origem = prop[0].ToString(); //save this particular waypoint's origin and destination
destino = prop[1].ToString();
reqs.Origin = origem;
reqs.Destination = destino;
if (waypoints.Length > 0)
reqs.Waypoints = rndWays;
resp = serv.GetResponse(reqs); //request the route with X order of waypoints to google
if (resp.Status == ServiceResponseStatus.Ok) //wait the response otherwise the program crashes
{
for (int j = 0; j < resp.Routes[0].Legs.Length; j++) //gets the distance and time of this particular order
{
ti += int.Parse(resp.Routes[0].Legs[j].Duration.Value);
dist += int.Parse(resp.Routes[0].Legs[j].Distance.Value);
}
}
prop[i].Origem = origem; //saves this waypoints order details for further comparison
prop[i].Destino = destino;
prop[i].Distance = dist;
prop[i].Time = ti;
prop[i].Order = rndWays;
work.ReportProgress(i); //report the progress
}
for (int i = 0; i < prop.Length; i++) //gets the best distance and time
{
if (bestDist.Distance > prop[i].Distance)
{
bestDist.Distance = prop[i].Distance;
bestDist.Time = prop[i].Time;
bestDist.Order = prop[i].Order;
bestDist.Origem = prop[i].Origem;
bestDist.Destino = prop[i].Destino;
}
if (bestTime.Time > prop[i].Time)
{
bestTime.Distance = prop[i].Distance;
bestTime.Time = prop[i].Time;
bestTime.Order = prop[i].Order;
bestTime.Origem = prop[i].Origem;
bestTime.Destino = prop[i].Destino;
}
}
if (bestDist.Order == bestTime.Order) //if the same waypoint order has the same time and distance
return bestDist.Order; // returns whatever bestDist.Order or bestTime.Order
else if (bestDist.Order != bestTime.Order) //if different returns corresponding to the mode selected
{
if (mode == 1) return bestDist.Order;
if (mode == 2) return bestTime.Order;
}
}
return null;
}
What I want is to permute the given waypoints and test each permutation. I've been struggling with this for a while, so any help you can give would be great.
Thank you.
EDIT.
I found this function here on StackOverflow:
public static bool NextPermutation<T>(T[] elements) where T : IComparable<T>
{
var count = elements.Length;
var done = true;
for (var i = count - 1; i > 0; i--)
{
var curr = elements[i];
// Check if the current element is less than the one before it
if (curr.CompareTo(elements[i - 1]) < 0)
{
continue;
}
// An element bigger than the one before it has been found,
// so this isn't the last lexicographic permutation.
done = false;
// Save the previous (bigger) element in a variable for more efficiency.
var prev = elements[i - 1];
// Have a variable to hold the index of the element to swap
// with the previous element (the to-swap element would be
// the smallest element that comes after the previous element
// and is bigger than the previous element), initializing it
// as the current index of the current item (curr).
var currIndex = i;
// Go through the array from the element after the current one to last
for (var j = i + 1; j < count; j++)
{
// Save into variable for more efficiency
var tmp = elements[j];
// Check if tmp suits the "next swap" conditions:
// Smallest, but bigger than the "prev" element
if (tmp.CompareTo(curr) < 0 && tmp.CompareTo(prev) > 0)
{
curr = tmp;
currIndex = j;
}
}
// Swap the "prev" with the new "curr" (the swap-with element)
elements[currIndex] = prev;
elements[i - 1] = curr;
// Reverse the order of the tail, in order to reset its lexicographic order
for (var j = count - 1; j > i; j--, i++)
{
var tmp = elements[j];
elements[j] = elements[i];
elements[i] = tmp;
}
// Break since we have got the next permutation
// The reason to have all the logic inside the loop is
// to prevent the need of an extra variable indicating "i" when
// the next needed swap is found (moving "i" outside the loop is a
// bad practice, and isn't very readable, so I preferred not doing
// that as well).
break;
}
// Return whether this has been the last lexicographic permutation.
return done;
}
The usage is:
NextPermutation(array);
Doing this and passing my array (rndWays) as the argument, I get the following error:
The type 'Google.Maps.Location' cannot be used as type parameter 'T' in the generic type or method 'Form1.NextPermutation< T >(T[])'. There is no implicit reference conversion from 'Google.Maps.Location' to 'System.IComparable< Google.Maps.Location >'.
The problem is that Location does not implement the IComparable interface.
Change:
public static bool NextPermutation<T>(T[] elements) where T : IComparable<T>
to:
public static bool NextPermutation(Location[] elements)
And replace each CompareTo() with your own comparison function.
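For example, a minimal sketch that keeps the method generic but takes the comparison as a delegate, so the caller supplies whatever ordering makes sense for Location (the ToString-based lambda at the end is only a placeholder, not a recommended ordering):
public static bool NextPermutation<T>(T[] elements, Comparison<T> compare)
{
    var count = elements.Length;
    var done = true;
    for (var i = count - 1; i > 0; i--)
    {
        var curr = elements[i];
        if (compare(curr, elements[i - 1]) < 0) { continue; }
        done = false;
        var prev = elements[i - 1];
        var currIndex = i;
        for (var j = i + 1; j < count; j++)
        {
            var tmp = elements[j];
            if (compare(tmp, curr) < 0 && compare(tmp, prev) > 0)
            {
                curr = tmp;
                currIndex = j;
            }
        }
        elements[currIndex] = prev;
        elements[i - 1] = curr;
        for (var j = count - 1; j > i; j--, i++)
        {
            var tmp = elements[j];
            elements[j] = elements[i];
            elements[i] = tmp;
        }
        break;
    }
    return done; // true once the last permutation has been reached
}
// Usage sketch: keep calling until it returns true, evaluating each ordering of rndWays.
// while (!NextPermutation(rndWays, (a, b) => string.CompareOrdinal(a.ToString(), b.ToString())))
// {
//     // evaluate this permutation of waypoints
// }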
I'm at a loss as to why I can't get this seemingly simple problem solved using Microsoft Solver Foundation.
All I need is to modify the weights (numbers) of certain observations to ensure that no one observation's weight AS A PERCENTAGE exceeds 25%. This is for the purposes of later calculating a constrained weighted average with the results of this algorithm.
For example, given the 5 weights of { 45, 100, 33, 500, 28 }, I would expect the result of this algorithm to be { 45, 53, 33, 53, 28 }, where 2 of the numbers had to be reduced so that they are within the 25% threshold of the new total (212 = 45+53+33+53+28) while the others remained untouched. Note that even though the 2nd weight of 100 was initially only 14% of the total (706), decreasing the 4th weight of 500 pushed up the percentage of the other observations, and therein lies the only challenge with this.
I tried to recreate this using Solver, only for it to tell me that the solution is "Infeasible" and to return all 1s. Update: the solution need not use Solver; any alternative is welcome so long as it is fast when dealing with a decent number of weights.
var solver = SolverContext.GetContext();
var model = solver.CreateModel();
var decisionList = new List<Decision>();
decisionList.Add(new Decision(Domain.IntegerRange(1, 45), "Dec1"));
decisionList.Add(new Decision(Domain.IntegerRange(1, 100), "Dec2"));
decisionList.Add(new Decision(Domain.IntegerRange(1, 33), "Dec3"));
decisionList.Add(new Decision(Domain.IntegerRange(1, 500), "Dec4"));
decisionList.Add(new Decision(Domain.IntegerRange(1, 28), "Dec5"));
model.AddDecisions(decisionList.ToArray());
int weightLimit = 25;
foreach (var decision in model.Decisions)
{
model.AddConstraint(decision.Name + "weightLimit", 100 * (decision / Model.Sum(model.Decisions.ToArray())) <= weightLimit);
}
model.AddGoal("calcGoal", GoalKind.Maximize, Model.Sum(model.Decisions.ToArray()));
var solution = solver.Solve();
foreach (var decision in model.Decisions)
{
Debug.Print(decision.GetDouble().ToString());
}
Debug.Print("Solution Quality: " + solution.Quality.ToString());
Any help with this would be very much appreciated, thanks in advance.
I ditched Solver b/c it didn't live up to its name imo (or I didn't live up to its standards :)). Below is where I landed. Because this function gets used many times and on large lists of input weights, efficiency and performance are key, so it attempts to do the fewest iterations possible (let me know if anyone has suggested improvements, though). The results get used for a weighted average, so I use "AttributeWeightPair" to store the value (attribute) and its weight, and the function below is what modifies the weights to fit within the constraint when given a list of these AWPs. The function assumes weightLimit is passed in as a %, e.g. 25% gets passed in as 25, not 0.25 --- ok, I'll stop stating what'll be obvious from the code - so here it is:
public static List<AttributeWeightPair<decimal>> WeightLimiter(List<AttributeWeightPair<decimal>> source, decimal weightLimit)
{
weightLimit /= 100; //convert to percentage
var zeroWeights = source.Where(w => w.Weight == 0).ToList();
var nonZeroWeights = source.Where(w => w.Weight > 0).ToList();
if (nonZeroWeights.Count == 0)
return source;
//return equal weights if given infeasible constraint
if ((1m / nonZeroWeights.Count()) > weightLimit)
{
nonZeroWeights.ForEach(w => w.Weight = 1);
return nonZeroWeights.Concat(zeroWeights).ToList();
}
//return original list if weight-limiting is unnecessary
if ((nonZeroWeights.Max(w => w.Weight) / nonZeroWeights.Sum(w => w.Weight)) <= weightLimit)
{
return source;
}
//sort (ascending) and store original weights
nonZeroWeights = nonZeroWeights.OrderBy(w => w.Weight).ToList();
var originalWeights = nonZeroWeights.Select(w => w.Weight).ToList();
//set starting point and determine direction from there
var initialSumWeights = nonZeroWeights.Sum(w => w.Weight);
var initialLimit = weightLimit * initialSumWeights;
var initialSuspects = nonZeroWeights.Where(w => w.Weight > initialLimit).ToList();
var initialTarget = weightLimit * (initialSumWeights - (initialSuspects.Sum(w => w.Weight) - initialLimit * initialSuspects.Count()));
var antepenultimateIndex = Math.Max(nonZeroWeights.FindLastIndex(w => w.Weight <= initialTarget), 1); //needs to be at least 1
for (int i = antepenultimateIndex; i < nonZeroWeights.Count(); i++)
{
nonZeroWeights[i].Weight = originalWeights[antepenultimateIndex - 1]; //set cap equal to the preceding weight
}
bool goingUp = (nonZeroWeights[antepenultimateIndex].Weight / nonZeroWeights.Sum(w => w.Weight)) > weightLimit ? false : true;
//Procedure 1 - find the weight # at which a cap would result in a weight % just UNDER the weight limit
int penultimateIndex = antepenultimateIndex;
bool justUnderTarget = false;
while (!justUnderTarget)
{
for (int i = penultimateIndex; i < nonZeroWeights.Count(); i++)
{
nonZeroWeights[i].Weight = originalWeights[penultimateIndex - 1]; //set cap equal to the preceding weight
}
var currentMaxPcntWeight = nonZeroWeights[penultimateIndex].Weight / nonZeroWeights.Sum(w => w.Weight);
if (currentMaxPcntWeight == weightLimit)
{
return nonZeroWeights.Concat(zeroWeights).ToList();
}
else if (goingUp && currentMaxPcntWeight < weightLimit)
{
nonZeroWeights[penultimateIndex].Weight = originalWeights[penultimateIndex]; //reset
if (penultimateIndex < nonZeroWeights.Count() - 1)
penultimateIndex++; //move up
else break;
}
else if (!goingUp && currentMaxPcntWeight > weightLimit)
{
if (penultimateIndex > 1)
penultimateIndex--; //move down
else break;
}
else
{
justUnderTarget = true;
}
}
if (goingUp) //then need to back up a step
{
penultimateIndex = (penultimateIndex > 1 ? penultimateIndex - 1 : 1);
for (int i = penultimateIndex; i < nonZeroWeights.Count(); i++)
{
nonZeroWeights[i].Weight = originalWeights[penultimateIndex - 1];
}
}
//Procedure 2 - increment the modified weights (subject to a cap equal to their original values) until the weight limit is hit (allowing a very slight overage for the last term in some cases)
int ultimateIndex = penultimateIndex;
var sumWeights = nonZeroWeights.Sum(w => w.Weight); //use this counter instead of summing every time for condition check within loop
bool justOverTarget = false;
while (!justOverTarget)
{
for (int i = ultimateIndex; i < nonZeroWeights.Count(); i++)
{
if (nonZeroWeights[i].Weight + 1 > originalWeights[i])
{
if (ultimateIndex < nonZeroWeights.Count() - 1)
ultimateIndex++;
else justOverTarget = true;
}
else
{
nonZeroWeights[i].Weight++;
sumWeights++;
}
}
if ((nonZeroWeights.Last().Weight / sumWeights) >= weightLimit)
{
justOverTarget = true;
}
}
return nonZeroWeights.Concat(zeroWeights).ToList();
}
public class AttributeWeightPair<T>
{
public T Attribute { get; set; }
public decimal? Weight { get; set; }
public AttributeWeightPair(T attribute, decimal? count)
{
this.Attribute = attribute;
this.Weight = count;
}
}
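For reference, a minimal usage sketch with the weights from the example above (the decimal attribute values are just placeholder labels, and the expected output is taken from the question; note the list is re-sorted internally, so the order may differ):
var weights = new List<AttributeWeightPair<decimal>>
{
    new AttributeWeightPair<decimal>(1m, 45),
    new AttributeWeightPair<decimal>(2m, 100),
    new AttributeWeightPair<decimal>(3m, 33),
    new AttributeWeightPair<decimal>(4m, 500),
    new AttributeWeightPair<decimal>(5m, 28)
};
// 25 means "no single weight may exceed 25% of the total"
var capped = WeightLimiter(weights, 25m);
foreach (var pair in capped)
{
    Console.WriteLine($"{pair.Attribute}: {pair.Weight}");
}
// Expected per the question: weights of 45, 53, 33, 53, 28 (in some order)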
I have the following code, which I have used up until now to compare a list of file entries to itself by hash codes:
for (int i = 0; i < fileLists.SourceFileListBefore.Count; i++) // Compare SourceFileList-Files to themselves
{
for (int n = i + 1; n < fileLists.SourceFileListBefore.Count; n++) // Don't need to do the same comparison twice!
{
if (fileLists.SourceFileListBefore[i].targetNode.IsFile && fileLists.SourceFileListBefore[n].targetNode.IsFile)
if (fileLists.SourceFileListBefore[i].hash == fileLists.SourceFileListBefore[n].hash)
{
// do Something
}
}
}
where SourceFileListBefore is a List
I want to change this code so it can execute in parallel on multiple cores. I thought about doing this with PLINQ, but I'm completely new to LINQ.
I tried
var duplicate = from entry in fileLists.SourceFileListBefore.AsParallel()
where fileLists.SourceFileListBefore.Any(x => (x.hash == entry.hash) && (x.targetNode.IsFile) && (entry.targetNode.IsFile))
select entry;
but it won't work like this, because I have to execute code for each pair of entries whose hash codes match. So I would at least have to get a collection of results containing both x and entry from LINQ, not just one entry. Is that possible with PLINQ?
Why don't you look at optimising your code first?
Looking at this statement:
if (fileLists.SourceFileListBefore[i].targetNode.IsFile && fileLists.SourceFileListBefore[n].targetNode.IsFile)
this means you can straight away build a single list of files where IsFile == true (making the loop smaller already).
Secondly,
if (fileLists.SourceFileListBefore[i].hash == fileLists.SourceFileListBefore[n].hash)
Why don't you build a lookup on the hash first?
Then iterate over your filtered list, looking each entry up in the lookup you created; if the group for a hash contains more than one entry, there is a match (the current node's hash plus some other node's hash). That way you only do work on the hashes that actually match, and never compare a node against itself.
I wrote a blog post about it which you can read at # CodePERF[dot]NET -.NET Nested Loops vs Hash Lookups
PLINQ will only slightly improve a bad solution to your problem.
Added some comparisons:
Total File Count: 16900
TargetNode.IsFile == true: 11900
Files with Duplicate Hashes = 10000 (5000 unique hashes)
Files with triplicate Hashes = 900 (300 unique hashes)
Files with Unique hashes = 1000
And the actual setup method:
[SetUp]
public void TestSetup()
{
_sw = new Stopwatch();
_files = new List<File>();
int duplicateHashes = 10000;
int triplicateHashesCount = 900;
int randomCount = 1000;
int nonFileCount = 5000;
for (int i = 0; i < duplicateHashes; i++)
{
var hash = i % (duplicateHashes / 2);
_files.Add(new File {Id = i, Hash = hash.ToString(), TargetNode = new Node {IsFile = true}});
}
for (int i = 0; i < triplicateHashesCount; i++)
{
var hash = int.MaxValue - 100000 - i % (triplicateHashesCount / 3);
_files.Add(new File {Id = i, Hash = hash.ToString(), TargetNode = new Node {IsFile = true}});
}
for (int i = 0; i < randomCount; i++)
{
var hash = int.MaxValue - i;
_files.Add(new File { Id = i, Hash = hash.ToString(), TargetNode = new Node { IsFile = true } });
}
for (int i = 0; i < nonFileCount; i++)
{
var hash = i % (nonFileCount / 2);
_files.Add(new File {Id = i, Hash = hash.ToString(), TargetNode = new Node {IsFile = false}});
}
_matched = 0;
}
Then your current method:
[Test]
public void FindDuplicates()
{
_sw.Start();
for (int i = 0; i < _files.Count; i++) // Compare SourceFileList-Files to themselves
{
for (int n = i + 1; n < _files.Count; n++) // Don't need to do the same comparison twice!
{
if (_files[i].TargetNode.IsFile && _files[n].TargetNode.IsFile)
if (_files[i].Hash == _files[n].Hash)
{
// Do Work
_matched++;
}
}
}
_sw.Stop();
}
Takes around 7.1 seconds on my machine.
Using a lookup to find hashes which appear multiple times takes 21 ms.
[Test]
public void FindDuplicatesHash()
{
_sw.Start();
var lookup = _files.Where(f => f.TargetNode.IsFile).ToLookup(f => f.Hash);
foreach (var duplicateFiles in lookup.Where(files => files.Count() > 1))
{
// Do Work for each unique hash, which appears multiple times in _files.
// If you need to do work on each pair, you will need to create pairs from duplicateFiles
// this can be an exercise for you ;-)
_matched++;
}
_sw.Stop();
}
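If you do need to run code for each matching pair, as in the original nested loop, a minimal sketch (reusing the lookup built above) is to expand each duplicate group into its pairs:
foreach (var duplicateFiles in lookup.Where(files => files.Count() > 1))
{
    var group = duplicateFiles.ToList();
    for (int i = 0; i < group.Count; i++)
    {
        for (int n = i + 1; n < group.Count; n++)
        {
            // group[i] and group[n] are two distinct files sharing the same hash
            // do Something with the pair here
        }
    }
}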
In my test, using PLINQ to count the lookups is actually slower (as there is a large cost of dividing the list between threads and aggregating the results back):
[Test]
public void FindDuplicatesHashParallel()
{
_sw.Start();
var lookup = _files.Where(f => f.TargetNode.IsFile).ToLookup(f => f.Hash);
_matched = lookup.AsParallel().Where(g => g.Count() > 1).Sum(g => 1);
_sw.Stop();
}
This took 120 ms, almost 6 times as long as the plain lookup, with my current source list.