I'm working on a simple benchmark comparing Mono's ParallelFX against Java on several Linux boxes. The test for .NET runs great on Windows and Linux alike, but I've hit a snag with the Java version...
I can see the specified number of threads starting up, but they run in a strange fashion. It acts like they start up, but they finish very slowly. They continue to start, but take forever to finish. It seems like it should be exceeding the limit of the thread pool, and my CPU usage looks to me like it's only using one or two cores (I've got an i7 processor so something like 8 should try to be used).
Yes, I know I am not being "thread safe" with my integers and probably other stuff too. I don't really care right now. Something larger is an issue here.
C# Version
/// <summary>
/// Benchmark driver: runs <c>numberOfCycles</c> iterations through Parallel.For,
/// each of which fills a list with random doubles, sorts it and reports min/max.
/// </summary>
public class Program
{
    /// <summary>
    /// Entry point. The two counters below are captured by the parallel lambda and
    /// updated with plain ++/--, i.e. without synchronisation (the accompanying text
    /// says this is intentional for the experiment).
    /// </summary>
    static void Main(string[] args)
    {
        const int numberOfCycles = 1000;
        const int numbersPerCycle = 1000000;

        var totalTimer = Stopwatch.StartNew();
        int active = 0;
        int finished = 0;

        Parallel.For(0, numberOfCycles, cycle =>
        {
            Console.WriteLine("Starting cycle {0}. Thread count at {1}", cycle, active++);

            var generator = new Random();
            var cycleTimer = Stopwatch.StartNew();

            // Build the workload: numbersPerCycle random values in [0, 1000).
            var values = new List<double>();
            int produced = 0;
            while (produced < numbersPerCycle)
            {
                values.Add(generator.NextDouble() * 1000);
                produced++;
            }

            values.Sort();
            double smallest = values.Min();
            double largest = values.Max();

            finished++;
            Console.WriteLine(
                "{0} cycles complete: {1:#,##0.0} ms. Min: {2:0.###} Max: {3:0.###}",
                finished, cycleTimer.ElapsedMilliseconds, smallest, largest);
            active--;
        });

        Console.WriteLine("All {0} cycles complete. Took {1:#,##0.0} ms.", numberOfCycles, totalTimer.ElapsedMilliseconds);
        Console.WriteLine("Press any key to continue.");
        Console.ReadKey();
    }
}
Java Version
P.S. I am lazy and stole the Stopwatch class from here: Is there a stopwatch in Java?
public class JavaMonoTest {
static int threadCount = 0;
static int completeCount = 0;
static String CLRF = "\r\n";
public static void main(String[] args) throws IOException, InterruptedException {
final int numberOfCycles = 1000;
final int numbersPerCycle = 1000000;
final int NUM_CORES = Runtime.getRuntime().availableProcessors();
//Setup the running array
List<Integer> cyclesList = new LinkedList<Integer>();
for(int i = 0; i < numberOfCycles; i++){
cyclesList.add(i);
}
Stopwatch swG = new Stopwatch();
swG.start();
ExecutorService exec = Executors.newFixedThreadPool(NUM_CORES);
try {
for (final Integer x : cyclesList) {
exec.submit(new Runnable() {
#Override
public void run() {
System.out.printf("Starting cycle %s. Thread count at %s %s", x, threadCount++, CLRF);
Random r = new Random();
Stopwatch sw = new Stopwatch();
sw.start();
List<Double> numbers = new LinkedList<Double>();
for (int i = 0; i < numbersPerCycle; i++)
{
numbers.add(r.nextDouble() * 1000);
}
Collections.sort(numbers);
double min = Collections.min(numbers);
double max = Collections.max(numbers);
completeCount++;
System.out.printf("%s cycles complete: %.2f ms. Min: %.2f Max: %.2f %s", completeCount, sw.getElapsedTime(), min, max, CLRF);
threadCount--;
}
});
}
} finally {
exec.shutdown();
}
exec.awaitTermination(1, TimeUnit.DAYS);
System.out.printf("All %s cycles complete. Took %.2f ms. %s", numberOfCycles, swG.getElapsedTime(), CLRF);
System.out.println("Press any key to continue.");
System.in.read();
}
}
Updated C# Version to Match Java Version In Answer
/// <summary>
/// Array-based variant of the benchmark: each of the <c>numberOfCycles</c> parallel
/// iterations fills a double[] with random values, sorts it and reports min/max.
/// </summary>
public class Program
{
    /// <summary>
    /// Entry point. Both counters are shared by all iterations and are only ever
    /// touched through <see cref="Interlocked"/>; the value that gets printed is the
    /// one returned by the atomic operation, never a separate re-read of the field.
    /// </summary>
    static void Main(string[] args)
    {
        const int numberOfCycles = 1000;
        const int numbersPerCycle = 1000000;
        Stopwatch swG = Stopwatch.StartNew();
        int threadCount = 0;
        int completeCount = 0;
        Parallel.For(0, numberOfCycles, x =>
        {
            // Interlocked.Increment returns the incremented value; subtract one so the
            // message shows the count before this iteration joined, as the Java version
            // does with getAndIncrement().
            int activeBefore = Interlocked.Increment(ref threadCount) - 1;
            Console.WriteLine(string.Format("Starting cycle {0}. Thread count at {1}", x, activeBefore));
            Random r = new Random();
            Stopwatch sw = Stopwatch.StartNew();
            double[] numbers = new double[numbersPerCycle];
            for (int i = 0; i < numbersPerCycle; i++)
            {
                numbers[i] = r.NextDouble() * 1000;
            }
            Array.Sort(numbers);
            // The array is sorted ascending, so the extremes sit at both ends.
            double min = numbers[0];
            double max = numbers[numbers.Length - 1];
            // Capture the result of the atomic increment: reading completeCount again
            // afterwards could observe increments made by other iterations in between.
            int finished = Interlocked.Increment(ref completeCount);
            Console.WriteLine(string.Format("{0} cycles complete: {1:#,##0.0} ms. Min: {2:0.###} Max: {3:0.###}", finished, sw.ElapsedMilliseconds, min, max));
            Interlocked.Decrement(ref threadCount);
        });
        Console.WriteLine(string.Format("All {0} cycles complete. Took {1:#,##0.0} ms.", numberOfCycles, swG.ElapsedMilliseconds));
        Console.WriteLine("Press any key to continue.");
        Console.ReadKey();
    }
}
Running the program I see that its using 97%-98% of eight cpus, but also creating an insane amount of garbage. If I make the program more efficient it runs to completion much faster.
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
/**
 * Array-based version of the benchmark: each job fills a double[] with random
 * values, sorts it in place and prints min/max and the elapsed time.
 * Shared counters are AtomicIntegers, and the values printed are the ones
 * returned by the atomic update itself.
 */
public class JavaMonoTest {
    static final AtomicInteger threadCount = new AtomicInteger();
    static final AtomicInteger completeCount = new AtomicInteger();

    public static void main(String[] args) throws InterruptedException {
        final int numberOfCycles = 1000;
        final int numbersPerCycle = 1000000;
        final int NUM_CORES = Runtime.getRuntime().availableProcessors();

        long swG = System.nanoTime();
        ExecutorService exec = Executors.newFixedThreadPool(NUM_CORES);
        try {
            for (int i = 0; i < numberOfCycles; i++) {
                final int x = i;
                exec.submit(new Runnable() {
                    // Was "#Override", which does not compile; the annotation is @Override.
                    @Override
                    public void run() {
                        try {
                            System.out.printf("Starting cycle %s. Thread count at %s %n", x, threadCount.getAndIncrement());
                            Random r = new Random();
                            long sw = System.nanoTime();
                            double[] numbers = new double[numbersPerCycle];
                            for (int i = 0; i < numbersPerCycle; i++) {
                                numbers[i] = r.nextDouble() * 1000;
                            }
                            Arrays.sort(numbers);
                            // Sorted ascending: extremes are the first and last elements.
                            double min = numbers[0];
                            double max = numbers[numbers.length - 1];
                            // Keep the value produced by this job's own increment; printing the
                            // AtomicInteger afterwards could show a count already bumped by other jobs.
                            int finished = completeCount.incrementAndGet();
                            System.out.printf("%s cycles complete: %.2f ms. Min: %.2f Max: %.2f %n",
                                    finished, (System.nanoTime() - sw) / 1e6, min, max);
                            threadCount.getAndDecrement();
                        } catch (Throwable t) {
                            // submit() would otherwise park the failure inside the discarded Future.
                            t.printStackTrace();
                        }
                    }
                });
            }
        } finally {
            exec.shutdown();
        }
        exec.awaitTermination(1, TimeUnit.DAYS);
        System.out.printf("All %s cycles complete. Took %.2f ms. %n",
                numberOfCycles, (System.nanoTime() - swG) / 1e6);
    }
}
prints
Starting cycle 0. Thread count at 0
Starting cycle 7. Thread count at 7
Starting cycle 6. Thread count at 6
... deleted ...
999 cycles complete: 139.28 ms. Min: 0.00 Max: 1000.00
1000 cycles complete: 139.05 ms. Min: 0.00 Max: 1000.00
All 1000 cycles complete. Took 19431.14 ms.
In place of:
ExecutorService exec = Executors.newFixedThreadPool(NUM_CORES);
try {
for (final Integer x : cyclesList) {
exec.submit(new Runnable() {
try:
ExecutorService exec = Executors.newFixedThreadPool(NUM_CORES);
try {
for (final Integer x : cyclesList) {
exec.execute( new Runnable() { // No Future< T > needed
Related
I know this question has been asked before, and the answers I found were all about the pre-empting & synchronization overhead etc.. But still, I am curious to know the answer of my own situation. So here's the deal.
I am running on Intel Core i7-2670QM CPU (4 cores, 8 threads) and I wrote this code:
using System;
using System.Diagnostics;
using System.Threading;
namespace _T
{
    /// <summary>
    /// Times a single-threaded quicksort against a variant that starts a new Thread
    /// for the left partition at every recursion step.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Single-threaded quicksort. <paramref name="parameter"/> is an object[] holding
        /// { int[] array, int left, int right } with inclusive bounds.
        /// </summary>
        private static void stquicksort(object parameter)
        {
            var packed = (object[])parameter;
            var data = (int[])packed[0];
            int lo = (int)packed[1];
            int hi = (int)packed[2];

            if (lo >= hi)
            {
                return;
            }

            // Take the middle element as pivot and park the last element in its slot.
            int middle = (lo + hi) / 2;
            int pivot = data[middle];
            data[middle] = data[hi];

            // Move every element smaller than the pivot to the front of the range.
            int store = lo;
            for (int scan = lo; scan < hi; scan++)
            {
                if (data[scan] >= pivot)
                {
                    continue;
                }
                if (scan != store)
                {
                    int swap = data[scan];
                    data[scan] = data[store];
                    data[store] = swap;
                }
                store++;
            }

            // Drop the pivot into its final position.
            data[hi] = data[store];
            data[store] = pivot;

            stquicksort(new object[] { data, lo, store - 1 });
            stquicksort(new object[] { data, store + 1, hi });
        }

        /// <summary>
        /// Same partitioning as <see cref="stquicksort"/>, but the left partition is sorted
        /// on a newly created Thread while the current thread sorts the right partition.
        /// </summary>
        private static void mtquicksort(object parameter)
        {
            var packed = (object[])parameter;
            var data = (int[])packed[0];
            int lo = (int)packed[1];
            int hi = (int)packed[2];

            if (lo >= hi)
            {
                return;
            }

            int middle = (lo + hi) / 2;
            int pivot = data[middle];
            data[middle] = data[hi];

            int store = lo;
            for (int scan = lo; scan < hi; scan++)
            {
                if (data[scan] >= pivot)
                {
                    continue;
                }
                if (scan != store)
                {
                    int swap = data[scan];
                    data[scan] = data[store];
                    data[store] = swap;
                }
                store++;
            }

            data[hi] = data[store];
            data[store] = pivot;

            // One new thread per call for the left half; wait for it before returning.
            var leftWorker = new Thread(mtquicksort);
            leftWorker.Start(new object[] { data, lo, store - 1 });
            mtquicksort(new object[] { data, store + 1, hi });
            leftWorker.Join();
        }

        /// <summary>Writes the array on one console line, elements separated by spaces.</summary>
        private static void dump(int[] array)
        {
            Console.Write("Array:");
            for (int index = 0; index < array.Length; index++)
            {
                Console.Write(" " + array[index]);
            }
            Console.WriteLine();
        }

        /// <summary>
        /// Repeatedly asks for an element count, sorts two identical random arrays with
        /// both variants and prints the elapsed milliseconds; a negative count ends the loop.
        /// </summary>
        private static void Main(string[] args)
        {
            for (;;)
            {
                Console.Write("Enter the number of elements: ");
                int requested = Convert.ToInt32(Console.ReadLine());
                if (requested < 0)
                {
                    break;
                }

                var generator = new Random();
                var forSingle = new int[requested];
                for (int slot = 0; slot < forSingle.Length; slot++)
                {
                    forSingle[slot] = generator.Next(1, 100);
                }
                // Both variants must sort the same input.
                var forMulti = (int[])forSingle.Clone();

                var timer = new Stopwatch();

                timer.Reset();
                timer.Start();
                stquicksort(new object[] { forSingle, 0, forSingle.Length - 1 });
                timer.Stop();
                Console.WriteLine("[ST] Time needed: " + timer.ElapsedMilliseconds + "ms");

                timer.Reset();
                timer.Start();
                mtquicksort(new object[] { forMulti, 0, forMulti.Length - 1 });
                timer.Stop();
                Console.WriteLine("[MT] Time needed: " + timer.ElapsedMilliseconds + "ms");
            }

            Console.WriteLine("Press any key to exit . . .");
            Console.ReadKey(true);
        }
    }
}
The stquicksort is the single threaded, mtquicksort is the multi one, and yes, I left the st parameters that way on purpose so the boxing/unboxing overheads are the same on both versions (if any noticable). I've put the solution on release (disabled all debugging), and the output is somewhat sad:
Enter the number of elements: 100
[ST] Time needed: 0ms
[MT] Time needed: 323ms
Enter the number of elements: 1000
[ST] Time needed: 0ms
[MT] Time needed: 7476ms
Enter the number of elements: 1000
[ST] Time needed: 0ms
[MT] Time needed: 7804ms
Enter the number of elements: 1000
[ST] Time needed: 0ms
[MT] Time needed: 7474ms
Enter the number of elements: 10
[ST] Time needed: 0ms
[MT] Time needed: 32ms
Enter the number of elements: 100
[ST] Time needed: 0ms
[MT] Time needed: 339ms
So again, is the problem pre-empting, is it maybe a flaw in code? And more importantly, what would be a proper way to solve this.
Spawning threads is a fairly expensive operation. It's not instantaneous, so the massive time you are seeing is not additional time required to perform the sort but the time required to spawn the threads. For a new thread to be worth spawning, it has to run for a while.
.NET and C# do have a Task system. Tasks are similar to threads, except that they run on a thread pool instead of spawning a new thread every time. This lets you run work on multiple threads without the high cost of creating a new thread for each piece of work.
Try replacing your threading code with this.
// Sort the left partition on a pooled task instead of a dedicated thread...
Task t = Task.Run(() => mtquicksort(new object[] { array, left, j - 1 }));
// ...sort the right partition on the current thread while that task runs...
mtquicksort(new object[] { array, j + 1, right });
// ...and only then wait for the left half. Waiting immediately after Task.Run
// would run the two halves one after the other.
t.Wait();
Note you will have to use the System.Threading.Tasks namespace
I'm trying to write a code to find prime numbers within a given range. Unfortunately I'm running into some problems with too many repetitions that'll give me a stackoverflowexception after prime nr: 30000. I have tried using a 'foreach' and also not using a list, (doing each number as it comes) but nothing seems to handle the problem in hand.
How can I make this program run forever without causing a stackoverflow?
/// <summary>
/// Prints every prime in [Start, End) using trial division and reports how many
/// were found and how long the search took.
/// </summary>
class Program
{
    static void Main(string[] args)
    {
        Stopwatch stopwatch = new Stopwatch();
        stopwatch.Start();

        const double Start = 0;
        const double End = 100000;
        double counter = 0;

        for (int i = (int)Start; i < End; i++)
        {
            // A composite number always has a divisor no larger than its square root,
            // so testing divisors from floor(sqrt(i)) down to 2 is sufficient.
            double result = CheckForPrime(i, Math.Floor(Math.Sqrt(i)));
            if (result != 0)
            {
                Console.Write(", {0}", result);
                counter++;
            }
        }

        stopwatch.Stop();
        Console.WriteLine("\n\nNumber of prime-numbers between {0} and {1} is: {2}, time it took to calc this: {3} (millisecounds).\n\n" +
            " The End\n", Start, End, counter, stopwatch.ElapsedMilliseconds);
    }

    /// <summary>
    /// Tests <paramref name="Prim"/> against every whole divisor from
    /// <paramref name="Devider"/> (rounded down, and capped at Prim - 1) down to 2.
    /// Implemented as a loop, so the stack depth no longer grows with the divisor.
    /// </summary>
    /// <param name="Prim">The candidate number.</param>
    /// <param name="Devider">The largest divisor to try.</param>
    /// <returns>
    /// 0 when <paramref name="Prim"/> is below 2 or one of the tested divisors divides
    /// it evenly; otherwise <paramref name="Prim"/> itself.
    /// </returns>
    public static double CheckForPrime(double Prim, double Devider)
    {
        if (Prim < 2)
        {
            return 0;
        }

        // Never test divisors below 2 (everything is divisible by 1) or the number
        // itself; this is what lets 2 and 3 come out as primes.
        double largest = Math.Min(Math.Floor(Devider), Prim - 1);
        for (double divisor = largest; divisor >= 2; divisor--)
        {
            if (Prim % divisor == 0)
            {
                return 0;
            }
        }
        return Prim;
    }

    /// <summary>Rounds <paramref name="i"/> down to a multiple of ten (via Math.Floor).</summary>
    public static int RoundOff(int i)
    {
        return ((int)Math.Floor(i / 10.0)) * 10;
    }
}
I need to calculate Pi via the Monte Carlo method using the Task Parallel Library, but my parallel program takes much longer to calculate Pi than its non-parallel counterpart. How can I fix it? The parallel class and its non-parallel counterpart are below:
/// <summary>
/// Monte-Carlo estimate of Pi computed by four TPL tasks that share one Random,
/// one lock object and one hit counter.
/// </summary>
class CalcPiTPL
{
    Object randLock = new object();
    int n;
    int N_0;
    double aPi;
    public StringBuilder Msg; // diagnostic message
    double x, y;
    Stopwatch stopWatch = new Stopwatch();

    /// <summary>Starts the stopwatch and stores the total number of iterations.</summary>
    public void Init(int aN)
    {
        stopWatch.Start();
        n = aN;
        aPi = -1; // negative value marks "no result yet"
        Msg = new StringBuilder("No any calculate-iteration has been completed");
    }

    /// <summary>
    /// Launches four tasks running <see cref="PointGenerator"/>, waits for all of them,
    /// derives the estimate from the shared hit counter and prints the elapsed time.
    /// </summary>
    public void Run()
    {
        if (n < 1)
        {
            Msg = new StringBuilder("Inbalid N-value");
            return;
        }

        Random rnd = new Random(); // one generator shared by all four tasks
        Task[] workers = new Task[4];
        for (int slot = 0; slot < workers.Length; slot++)
        {
            workers[slot] = Task.Factory.StartNew(() => PointGenerator(n, rnd));
        }
        Task.WaitAll(workers);

        aPi = 4.0 * ((double)N_0 / (double)n);

        stopWatch.Stop();
        TimeSpan elapsed = stopWatch.Elapsed;
        string elapsedTime = String.Format("{0:00}:{1:00}:{2:00}.{3:00}",
            elapsed.Hours, elapsed.Minutes, elapsed.Seconds,
            elapsed.Milliseconds / 10);
        Console.WriteLine("RunTime " + elapsedTime);
    }

    /// <summary>Returns the estimate when one is available, otherwise 0.</summary>
    public double Done()
    {
        if (!(aPi > 0))
        {
            return 0; // no result
        }
        Msg = new StringBuilder("Calculates has been completed successful");
        return aPi;
    }

    /// <summary>
    /// Draws n / 4 random points and counts those inside the circle of radius 0.5
    /// centred at (0.5, 0.5). Each draw-and-count step runs under the lock.
    /// </summary>
    public void PointGenerator(int n, Random rnd)
    {
        int quota = n / 4;
        for (int drawn = 0; drawn < quota; drawn++)
        {
            lock (randLock)
            {
                x = rnd.NextDouble();
                y = rnd.NextDouble();
                double dx = x - 0.5;
                double dy = y - 0.5;
                bool insideCircle = (dx * dx + dy * dy) < 0.25;
                if (insideCircle)
                {
                    N_0++; // point landed inside the circle
                }
            }
        }
    }
}
Unparallel analog:
/// <summary>Single-threaded Monte-Carlo estimate of Pi.</summary>
class TCalcPi
{
    int N;
    int N_0;
    double aPi;
    public StringBuilder Msg; // diagnostic message
    double x, y;
    Stopwatch stopWatch = new Stopwatch();

    /// <summary>Starts the stopwatch and stores the total number of iterations.</summary>
    public void Init(int aN)
    {
        stopWatch.Start();
        N = aN;
        aPi = -1; // negative value marks "no result yet"
        Msg = new StringBuilder("No any calculate-iteration has been completed");
    }

    /// <summary>
    /// Draws N random points, counts those inside the circle of radius 0.5 centred at
    /// (0.5, 0.5), derives the estimate and prints the elapsed time.
    /// </summary>
    public void Run()
    {
        if (N < 1)
        {
            Msg = new StringBuilder("Invalid N - value");
            return;
        }

        Random generator = new Random();
        for (int sample = 1; sample <= N; sample++)
        {
            x = generator.NextDouble();
            y = generator.NextDouble();
            double dx = x - 0.5;
            double dy = y - 0.5;
            bool insideCircle = (dx * dx + dy * dy) < 0.25;
            if (insideCircle)
            {
                N_0++; // point landed inside the circle
            }
        }

        aPi = 4.0 * ((double)N_0 / (double)N);

        stopWatch.Stop();
        TimeSpan elapsed = stopWatch.Elapsed;
        string elapsedTime = String.Format("{0:00}:{1:00}:{2:00}.{3:00}",
            elapsed.Hours, elapsed.Minutes, elapsed.Seconds,
            elapsed.Milliseconds / 10);
        Console.WriteLine("RunTime " + elapsedTime);
    }

    /// <summary>Returns the estimate when one is available, otherwise 0.</summary>
    public double Done()
    {
        if (!(aPi > 0))
        {
            return 0; // no result
        }
        Msg = new StringBuilder("Calculates has been completed successful");
        return aPi;
    }
}
You have written the PointGenerator in a way in which it can barely benefit from being executed in parallel.
the lock means it will have basically single-threaded performance with additional threading overhead
a global state N_0 means you will have to synchronize access. Granted, since it's just an int you can use the Interlocked class for efficiently incrementing it.
What I would do is let each PointGenerator have its own Random object and its own counter. Then there won't be any shared mutable state that could cause problems. Be careful, though: the default constructor of Random uses the system tick count as the seed, so creating several objects in quick succession might give you random generators with the same seed.
Once all PointGenerator finish you combine the results.
This would be very similar to what some of the TPL overloads of Parallel.For and Parallel.ForEach do.
I know this post is old but it still shows up when searching for how to compute pi in parallel in C#. I have modified this to use the systems thread count for the workers. Also the lock is not needed if we use a return type for the workers, put some of the other variables in the worker function and finally let everything be put together by yet another task. This uses long for a larger count of iterations. The instances of Random are created with the thread id as the seed, which i hope makes them give different sequences of random numbers. Removed the Init-Method and put initialization in the Run-Method instead. There are two ways of using this now, blocking and non-blocking. But first here is the class:
/// <summary>
/// Monte-Carlo estimate of Pi spread over one worker task per logical processor.
/// Workers share no mutable state: each has its own Random and its own hit counter,
/// and the results are combined once all of them have finished.
/// Use <see cref="Run"/> followed by <see cref="Wait"/> (blocking), or poll
/// <see cref="IsDone"/> and then read <see cref="Pi"/> (non-blocking).
/// </summary>
public class CalcPiTPL
{
    private readonly object gate = new object();
    private readonly Stopwatch stopWatch = new Stopwatch();
    private readonly ManualResetEvent rst = new ManualResetEvent(false);
    private long n;
    private double pi;
    private bool running;              // guarded by gate
    private volatile bool isDone;      // read lock-free by IsDone()

    /// <summary>Elapsed time of the last completed run, formatted as hh:mm:ss.ff.</summary>
    public string elapsedTime = string.Empty;

    /// <summary>The last estimate; -1 while a run is in progress.</summary>
    public double Pi { get { return pi; } }

    /// <summary>
    /// Starts an asynchronous run over <paramref name="n"/> samples and returns
    /// immediately. The call is ignored when <paramref name="n"/> is below 1 or a
    /// previous run has not finished yet.
    /// </summary>
    /// <param name="n">Total number of random points to draw.</param>
    public void Run(long n)
    {
        if (n < 1) return;

        lock (gate)
        {
            if (running) return;
            running = true;

            isDone = false;
            rst.Reset();
            stopWatch.Restart();   // Restart, so a reused instance does not accumulate time
            this.n = n;
            pi = -1;               // marks "no result yet"

            int workerCount = Environment.ProcessorCount;
            long perWorker = n / workerCount;
            long remainder = n % workerCount;

            // Draw the seeds from one generator so every worker gets its own sequence;
            // thread ids repeat whenever the pool reuses a thread for several workers.
            Random seeder = new Random();
            Task<long>[] workers = new Task<long>[workerCount];
            for (int i = 0; i < workerCount; i++)
            {
                // Spread the remainder so that exactly n samples are drawn in total,
                // matching the divisor used for the estimate.
                long samples = perWorker + (i < remainder ? 1 : 0);
                int seed = seeder.Next();
                workers[i] = Task.Run(() => PointGenerator(samples, seed));
            }

            // Combine the results in a continuation rather than parking a pool thread
            // in a blocking wait.
            Task.Factory.ContinueWhenAll(workers, finished => Orchestrator(finished));
        }
    }

    /// <summary>Sums the per-worker hit counts, publishes the estimate and signals completion.</summary>
    private void Orchestrator(Task<long>[] workers)
    {
        try
        {
            long hits = 0;
            foreach (var worker in workers)
            {
                hits += worker.GetAwaiter().GetResult(); // already completed, does not block
            }
            pi = 4.0 * ((double)hits / (double)n);
            stopWatch.Stop();
            TimeSpan ts = stopWatch.Elapsed;
            elapsedTime = String.Format("{0:00}:{1:00}:{2:00}.{3:00}", ts.Hours, ts.Minutes, ts.Seconds, ts.Milliseconds / 10);
        }
        finally
        {
            // Always release waiters and allow the next Run, even if combining failed.
            lock (gate)
            {
                running = false;
                isDone = true;
                rst.Set();
            }
        }
    }

    /// <summary>Blocks until the run started by <see cref="Run"/> has finished and returns the estimate.</summary>
    public double Wait()
    {
        rst.WaitOne();
        return pi;
    }

    /// <summary>True once the most recent run has finished.</summary>
    public bool IsDone()
    {
        return isDone;
    }

    /// <summary>
    /// Draws <paramref name="samples"/> points with a private generator and returns how
    /// many fall inside the circle of radius 0.5 centred at (0.5, 0.5). Counter and loop
    /// index are long, so sample counts above int.MaxValue do not overflow.
    /// </summary>
    private static long PointGenerator(long samples, int seed)
    {
        long hits = 0;
        Random rnd = new Random(seed);
        for (long i = 0; i < samples; i++)
        {
            double x = rnd.NextDouble();
            double y = rnd.NextDouble();
            if (((x - 0.5) * (x - 0.5) + (y - 0.5) * (y - 0.5)) < 0.25)
            {
                hits++;
            }
        }
        return hits;
    }
}
Blocking call:
// Blocking usage: start the calculation, then wait for the result and print it.
var calculator = new CalcPiTPL();
calculator.Run(1000000000);
double estimate = calculator.Wait();
Console.WriteLine(estimate);
non-blocking call:
// Non-blocking usage: start the calculation and poll for completion.
var calculator = new CalcPiTPL();
calculator.Run(100000000);
while (!calculator.IsDone())
{
    Thread.Sleep(100);
    // Do something else
}
Console.WriteLine(calculator.Pi);
Adding an event would probably be nice, if someone wants to use this in a GUI application. Maybe i will do that later.
Feel free to correct, if i messed something up.
When your whole parallel part is inside a lock scope nothing is actually parallel. Only a single thread can be inside a lock scope in any given moment.
You can simply use different Random instances instead of a single one.
I am trying to compare performance between parallel streams in Java 8 and PLINQ (C#/.Net 4.5.1).
Here is the result I get on my machine (System Manufacturer: Dell Inc.; System Model: Precision M4700; Processor: Intel(R) Core(TM) i7-3740QM CPU @ 2.70GHz, 2701 MHz, 4 Core(s), 8 Logical Processor(s); Installed Physical Memory (RAM): 16.0 GB; OS Name: Microsoft Windows 7 Enterprise, Version 6.1.7601 Service Pack 1 Build 7601)
C# .Net 4.5.1 (X64-release)
Serial:
470.7784, 491.4226, 502.4643, 481.7507, 464.1156, 463.0088, 546.149, 481.2942, 502.414, 483.1166
Average: 490.6373
Parallel:
158.6935, 133.4113, 217.4304, 182.3404, 184.188, 128.5767, 160.352, 277.2829, 127.6818, 213.6832
Average: 180.5496
Java 8 (X64)
Serial:
471.911822, 333.843924, 324.914299, 325.215631, 325.208402, 324.872828, 324.888046, 325.53066, 325.765791, 325.935861
Average:326.241715
Parallel:
212.09323, 73.969783, 68.015431, 66.246628, 66.15912, 66.185373, 80.120837, 75.813539, 70.085948, 66.360769
Average:70.3286
It looks like PLINQ does not scale across the CPU cores. I am wondering if I miss something.
Here is the code for C#:
/// <summary>
/// Times Math.Sin mapped over ten million random doubles, first with sequential
/// LINQ and then with PLINQ, printing each run and the average of all but the first.
/// </summary>
class Program
{
    static void Main(string[] args)
    {
        var runCount = 10;
        var sampleCount = 10000000;

        var samples = new double[sampleCount];
        var generator = new Random();
        for (int index = 0; index < sampleCount; index++)
        {
            samples[index] = generator.NextDouble();
        }

        var total = 0.0;

        Console.WriteLine("Serial:");
        for (int run = 0; run < runCount; run++)
        {
            var timer = Stopwatch.StartNew();
            var mapped = samples.Select(v => Math.Sin(v)).ToArray();
            var elapsed = timer.Elapsed.TotalMilliseconds;
            Console.Write(elapsed + ", ");
            // The first run is treated as warm-up and left out of the average.
            if (run != 0)
            {
                total += elapsed;
            }
        }
        Console.Write("\nAverage: " + (total / (runCount - 1)));

        total = 0.0;

        Console.WriteLine("\n\nParallel:");
        for (int run = 0; run < runCount; run++)
        {
            var timer = Stopwatch.StartNew();
            var mapped = samples.AsParallel().Select(v => Math.Sin(v)).ToArray();
            var elapsed = timer.Elapsed.TotalMilliseconds;
            Console.Write(elapsed + ", ");
            if (run != 0)
            {
                total += elapsed;
            }
        }
        Console.Write("\nAverage: " + (total / (runCount - 1)));
    }
}
Here is the code for Java:
import java.util.Arrays;
import java.util.Random;
import java.util.stream.DoubleStream;
/**
 * Times Math.sin mapped over POPULATION_SIZE random doubles, first with a
 * sequential stream and then with a parallel stream, printing every run and the
 * average of all runs except the first.
 */
public class Main {
    private static final Random rand = new Random();
    private static final int MIN = 1;
    private static final int MAX = 140;
    private static final int POPULATION_SIZE = 10_000_000;
    public static final int NUMBER_OF_RUNS = 10;

    public static void main(String[] args) throws InterruptedException {
        Random source = new Random();
        double[] input = DoubleStream.generate(source::nextDouble).limit(POPULATION_SIZE).toArray();

        double total = 0.0;

        System.out.println("Serial:");
        int run = 0;
        while (run < NUMBER_OF_RUNS) {
            long begin = System.nanoTime();
            double[] mapped = Arrays.stream(input).map(Math::sin).toArray();
            double millis = (System.nanoTime() - begin) / 1_000_000.0;
            System.out.print(millis + ", " );
            // The first run is treated as warm-up and left out of the average.
            if (run != 0) {
                total += millis;
            }
            run++;
        }
        System.out.println("\nAverage:" + (total / (NUMBER_OF_RUNS - 1)));

        total = 0.0;

        System.out.println("\n\nParallel:");
        run = 0;
        while (run < NUMBER_OF_RUNS) {
            long begin = System.nanoTime();
            double[] mapped = Arrays.stream(input).parallel().map(Math::sin).toArray();
            double millis = (System.nanoTime() - begin) / 1_000_000.0;
            System.out.print(millis + ", " );
            if (run != 0) {
                total += millis;
            }
            run++;
        }
        System.out.println("\nAverage:" + (total / (NUMBER_OF_RUNS - 1)));
    }
}
Both runtimes make a decision about how many threads to use in order to complete the parallel operation. That is a non-trivial task that can take many factors into account, including the degree to which the task is CPU bound, the estimated time to complete the task, etc.
Each runtime makes different decisions about how many threads to use to resolve the request. Neither decision is obviously right or wrong in terms of system-wide scheduling, but the Java strategy performs better on this benchmark (and leaves fewer CPU resources available for other tasks on the system).
I have a method Limit() which measures the bandwidth passed through some channel over a certain time and throttles it with Thread.Sleep() if the bandwidth limit is reached.
The method itself produces proper results (in my opinion), but Thread.Sleep doesn't (due to multithreaded CPU usage): I get the proper "millisecondsToWait", yet the speed measured afterwards is far from the limit I passed in.
Is there a way to make limitation more precise ?
Limiter Class
// Configured ceiling; Limit() treats values of zero or less as "no limit".
private readonly int m_maxSpeedInKbps;

/// <summary>Creates a limiter with the given maximum speed.</summary>
/// <param name="maxSpeedInKbps">Speed ceiling; zero or a negative value disables limiting.</param>
public Limiter(int maxSpeedInKbps)
{
    this.m_maxSpeedInKbps = maxSpeedInKbps;
}
/// <summary>
/// Sleeps long enough that <paramref name="writtenInBytes"/> spread over the time since
/// <paramref name="startOfCycleDateTime"/> does not exceed the configured speed.
/// </summary>
/// <param name="startOfCycleDateTime">When the measured cycle began (same clock as DateTime.Now).</param>
/// <param name="writtenInBytes">Bytes transferred since that moment.</param>
/// <returns>The number of milliseconds slept, or 0 when no wait was needed.</returns>
public int Limit(DateTime startOfCycleDateTime, long writtenInBytes)
{
    if (m_maxSpeedInKbps <= 0 || writtenInBytes <= 0)
    {
        return 0;
    }

    double elapsedMilliseconds = DateTime.Now.Subtract(startOfCycleDateTime).TotalMilliseconds;

    // The limit is compared against bytes per millisecond, as before. The shortest
    // duration the transfer may take at that rate is therefore bytes / limit. Working
    // from this duration in floating point avoids truncating the speed to an int and
    // avoids dividing by an elapsed time of zero.
    double minimumMilliseconds = (double)writtenInBytes / m_maxSpeedInKbps;
    double remainingMilliseconds = minimumMilliseconds - elapsedMilliseconds;
    if (remainingMilliseconds < 1)
    {
        return 0;
    }

    int millisecondsToWait = (int)Math.Min(remainingMilliseconds, int.MaxValue);
    Thread.Sleep(millisecondsToWait);
    return millisecondsToWait;
}
Test Class which always fails in large delta
/// <summary>
/// Runs the limiter over a list of files in parallel and asserts that the overall
/// speed measured afterwards equals the configured maximum.
/// </summary>
[TestMethod]
public void ATest()
{
    var files = new List<File>();
    for (int index = 0; index < 1; index++)
    {
        files.Add(new File(index + 1, 100));
    }

    const int maxSpeedInKbps = 1024; // 1MBps
    var limiter = new Limiter(maxSpeedInKbps);

    DateTime startDateTime = DateTime.Now;
    var options = new ParallelOptions { MaxDegreeOfParallelism = 5 };
    Parallel.ForEach(files, options, file =>
    {
        DateTime currentFileStartTime = DateTime.Now;
        Thread.Sleep(5);
        limiter.Limit(currentFileStartTime, file.Blocks * Block.Size);
    });

    long roundOfWriteInKB = (files.Sum(f => f.Blocks.Count) * Block.Size) / 1024;
    double elapsedMilliseconds = DateTime.Now.Subtract(startDateTime).TotalMilliseconds;
    int currentSpeedInKbps = (int) (roundOfWriteInKB / elapsedMilliseconds * 1000);

    string failureMessage = string.Format("maxSpeedInKbps {0} currentSpeedInKbps {1}", maxSpeedInKbps, currentSpeedInKbps);
    Assert.AreEqual(maxSpeedInKbps, currentSpeedInKbps, failureMessage);
}
I used to use Thread.Sleep a lot until I discovered waithandles. Using waithandles you can suspend threads, which will come alive again when the waithandle is triggered from elsewhere, or when a time threshold is reached. Perhaps it's possible to re-engineer your limit methodology to use waithandles in some way, because in a lot of situations they are indeed much more precise than Thread.Sleep?
You can do it fairly accurately using a busy wait, but I wouldn't recommend it. You should use one of the multimedia timers to wait instead.
However, this method will wait fairly accurately:
/// <summary>
/// Waits for <paramref name="millisecs"/> milliseconds. For waits of 100 ms or more
/// the first part is spent in Thread.Sleep (all but the last 50 ms); the rest is
/// spent spinning on a Stopwatch.
/// </summary>
void accurateWait(int millisecs)
{
    var timer = Stopwatch.StartNew();

    if (millisecs >= 100)
    {
        Thread.Sleep(millisecs - 50);
    }

    while (timer.ElapsedMilliseconds < millisecs)
    {
        // busy wait
    }
}
But it is a busy wait and will consume CPU cycles terribly. Also it could be affected by garbage collections or task rescheduling.
Here's the test program:
using System;
using System.Diagnostics;
using System.Collections.Generic;
using System.Threading;
namespace Demo
{
    /// <summary>Prints requested versus measured duration of accurateWait for a range of waits.</summary>
    class Program
    {
        /// <summary>
        /// Exercises waits of 1-9 ms (step 1), 10-95 ms (step 5), 100-190 ms (step 10)
        /// and 200-480 ms (step 20), in that order.
        /// </summary>
        void run()
        {
            // Each row is { first value, exclusive upper bound, step }.
            int[][] sweeps =
            {
                new[] { 1, 10, 1 },
                new[] { 10, 100, 5 },
                new[] { 100, 200, 10 },
                new[] { 200, 500, 20 },
            };

            foreach (int[] sweep in sweeps)
            {
                for (int wait = sweep[0]; wait < sweep[1]; wait += sweep[2])
                {
                    test(wait);
                }
            }
        }

        /// <summary>Times one call to accurateWait and prints the requested and the measured value.</summary>
        void test(int millisecs)
        {
            var timer = Stopwatch.StartNew();
            accurateWait(millisecs);
            Console.WriteLine("Requested wait = " + millisecs + ", actual wait = " + timer.ElapsedMilliseconds);
        }

        /// <summary>
        /// Waits for <paramref name="millisecs"/> milliseconds: sleeps for all but the
        /// last 50 ms when the wait is 100 ms or longer, then spins on a Stopwatch.
        /// </summary>
        void accurateWait(int millisecs)
        {
            var timer = Stopwatch.StartNew();

            if (millisecs >= 100)
            {
                Thread.Sleep(millisecs - 50);
            }

            while (timer.ElapsedMilliseconds < millisecs)
            {
                // busy wait
            }
        }

        static void Main()
        {
            var program = new Program();
            program.run();
        }
    }
}