I have a 2D field of bits stored in an array of 5 unsigned longs.
I am going for the best performance.
I am working in C#, but I tried to set a benchmark by implementing my class in C++.
The problem here is that the C# implementation takes about 10 seconds to finish, whereas the C++ takes about 1 second, making it 10 times faster. The C++ is an x64 build in VS2015; the C# is x64 in VS2015 on .NET 4.6. Both in Release, of course.
EDIT: After optimizing the C# code a little, it still takes 7 to 8 seconds vs. the C++'s 1.3 seconds.
Note: C++ in x86 takes about 6 seconds to finish. I am running the code on a 64-bit machine.
Question: What makes the C++ THAT much faster? And is there a way to optimize the C# code to be at least similarly fast? (Maybe some unsafe magic?)
What puzzles me is that we are talking just about iterating through arrays and bitwise operations. Shouldn't it be JITed to pretty much the same thing as C++?
Example code:
There are two simple functions in the implementation: Left() and Right(), shifting the whole field one bit to the left or right respectively, with appropriate bit carrying between the longs.
C++
#include <iostream>
#include <chrono>
using namespace std;
using namespace std::chrono;
class BitField
{
private:
unsigned long long LEFTMOST_BIT = 0x8000000000000000;
unsigned long long RIGHTMOST_BIT = 1;
public:
unsigned long long Cells_l[5];
BitField()
{
for (size_t i = 0; i < 5; i++)
{
Cells_l[i] = rand(); // Random initialization
}
}
void Left()
{
unsigned long long carry = 0;
unsigned long long nextCarry = 0;
for (int i = 0; i < 5; i++)
{
nextCarry = (Cells_l[i] & LEFTMOST_BIT) >> 63;
Cells_l[i] = Cells_l[i] << 1 | carry;
carry = nextCarry;
}
}
void Right()
{
unsigned long long carry = 0;
unsigned long long nextCarry = 0;
for (int i = 4; i >= 0; i--)
{
nextCarry = (Cells_l[i] & RIGHTMOST_BIT) << 63;
Cells_l[i] = Cells_l[i] >> 1 | carry;
carry = nextCarry;
}
}
};
int main()
{
BitField bf;
high_resolution_clock::time_point t1 = high_resolution_clock::now();
for (int i = 0; i < 100000000; i++)
{
bf.Left();
bf.Left();
bf.Left();
bf.Right();
bf.Right();
bf.Left();
bf.Right();
bf.Right();
}
high_resolution_clock::time_point t2 = high_resolution_clock::now();
auto duration = duration_cast<milliseconds>(t2 - t1).count();
cout << "Time: " << duration << endl << endl;
// Print to avoid compiler optimizations
for (size_t i = 0; i < 5; i++)
{
cout << bf.Cells_l[i] << endl;
}
return 0;
}
C#
using System;
using System.Diagnostics;
namespace TestCS
{
class BitField
{
const ulong LEFTMOST_BIT = 0x8000000000000000;
const ulong RIGHTMOST_BIT = 1;
static Random rnd = new Random();
ulong[] Cells;
public BitField()
{
Cells = new ulong[5];
for (int i = 0; i < 5; i++)
{
Cells[i] = (ulong)rnd.Next(); // Random initialization
}
}
public void Left()
{
ulong carry = 0;
ulong nextCarry = 0;
for (int i = 0; i < 5; i++)
{
nextCarry = (Cells[i] & LEFTMOST_BIT) >> 63;
Cells[i] = Cells[i] << 1 | carry;
carry = nextCarry;
}
}
public void Right()
{
ulong carry = 0;
ulong nextCarry = 0;
for (int i = 4; i >= 0; i--)
{
nextCarry = (Cells[i] & RIGHTMOST_BIT) << 63;
Cells[i] = Cells[i] >> 1 | carry;
carry = nextCarry;
}
}
}
class Program
{
static void Main(string[] args)
{
BitField bf = new BitField();
Stopwatch sw = new Stopwatch();
// Call to remove the compilation time from measurements
bf.Left();
bf.Right();
sw.Start();
for (int i = 0; i < 100000000; i++)
{
bf.Left();
bf.Left();
bf.Left();
bf.Right();
bf.Right();
bf.Left();
bf.Right();
bf.Right();
}
sw.Stop();
Console.WriteLine($"Done in: {sw.Elapsed.TotalMilliseconds}ms");
}
}
}
EDIT: Fixed "nextCarry" typos in example code.
I have got enough information from the comments and a deleted answer from @AntoninLejsek, so I can answer this myself.
TL;DR: The C++ compiler does a much better job of optimizing, and managed array access in C# costs a lot when done in a loop. However, unsafe code and fixed access are not enough to match C++.
It seems we need to optimize the C# code manually to get performance comparable to C++.
Unroll loops
Use unsafe code for fixed array access
Don't access the array repeatedly - rather, store the items in local variables.
The following C# code runs as fast as the C++ code (about 100 ms faster, in fact). Compiled on .NET 4.6, VS 2015, Release, x64.
unsafe struct BitField
{
static Random rnd = new Random();
public fixed ulong Cells[5];
public BitField(int nothing)
{
fixed (ulong* p = Cells)
{
for (int i = 0; i < 5; i++)
{
p[i] = (ulong)rnd.Next(); // Just some random number
}
}
}
public void StuffUnrolledNonManaged()
{
ulong u0;
ulong u1;
ulong u2;
ulong u3;
ulong u4;
fixed (ulong *p = Cells)
{
u0 = p[0];
u1 = p[1];
u2 = p[2];
u3 = p[3];
u4 = p[4];
}
ulong carry = 0;
ulong nextCarry = 0;
for (int i = 0; i < 100000000; i++)
{
//left
carry = 0;
nextCarry = u0 >> 63;
u0 = u0 << 1 | carry;
carry = nextCarry;
nextCarry = u1 >> 63;
u1 = u1 << 1 | carry;
carry = nextCarry;
nextCarry = u2 >> 63;
u2 = u2 << 1 | carry;
carry = nextCarry;
nextCarry = u3 >> 63;
u3 = u3 << 1 | carry;
carry = nextCarry;
u4 = u4 << 1 | carry;
//left
carry = 0;
nextCarry = u0 >> 63;
u0 = u0 << 1 | carry;
carry = nextCarry;
nextCarry = u1 >> 63;
u1 = u1 << 1 | carry;
carry = nextCarry;
nextCarry = u2 >> 63;
u2 = u2 << 1 | carry;
carry = nextCarry;
nextCarry = u3 >> 63;
u3 = u3 << 1 | carry;
carry = nextCarry;
u4 = u4 << 1 | carry;
//left
carry = 0;
nextCarry = u0 >> 63;
u0 = u0 << 1 | carry;
carry = nextCarry;
nextCarry = u1 >> 63;
u1 = u1 << 1 | carry;
carry = nextCarry;
nextCarry = u2 >> 63;
u2 = u2 << 1 | carry;
carry = nextCarry;
nextCarry = u3 >> 63;
u3 = u3 << 1 | carry;
carry = nextCarry;
u4 = u4 << 1 | carry;
//right
carry = 0;
nextCarry = u4 << 63;
u4 = u4 >> 1 | carry;
carry = nextCarry;
nextCarry = u3 << 63;
u3 = u3 >> 1 | carry;
carry = nextCarry;
nextCarry = u2 << 63;
u2 = u2 >> 1 | carry;
carry = nextCarry;
nextCarry = u1 << 63;
u1 = u1 >> 1 | carry;
carry = nextCarry;
u0 = u0 >> 1 | carry;
//right
carry = 0;
nextCarry = u4 << 63;
u4 = u4 >> 1 | carry;
carry = nextCarry;
nextCarry = u3 << 63;
u3 = u3 >> 1 | carry;
carry = nextCarry;
nextCarry = u2 << 63;
u2 = u2 >> 1 | carry;
carry = nextCarry;
nextCarry = u1 << 63;
u1 = u1 >> 1 | carry;
carry = nextCarry;
u0 = u0 >> 1 | carry;
//left
carry = 0;
nextCarry = u0 >> 63;
u0 = u0 << 1 | carry;
carry = nextCarry;
nextCarry = u1 >> 63;
u1 = u1 << 1 | carry;
carry = nextCarry;
nextCarry = u2 >> 63;
u2 = u2 << 1 | carry;
carry = nextCarry;
nextCarry = u3 >> 63;
u3 = u3 << 1 | carry;
carry = nextCarry;
u4 = u4 << 1 | carry;
//right
carry = 0;
nextCarry = u4 << 63;
u4 = u4 >> 1 | carry;
carry = nextCarry;
nextCarry = u3 << 63;
u3 = u3 >> 1 | carry;
carry = nextCarry;
nextCarry = u2 << 63;
u2 = u2 >> 1 | carry;
carry = nextCarry;
nextCarry = u1 << 63;
u1 = u1 >> 1 | carry;
carry = nextCarry;
u0 = u0 >> 1 | carry;
//right
carry = 0;
nextCarry = u4 << 63;
u4 = u4 >> 1 | carry;
carry = nextCarry;
nextCarry = u3 << 63;
u3 = u3 >> 1 | carry;
carry = nextCarry;
nextCarry = u2 << 63;
u2 = u2 >> 1 | carry;
carry = nextCarry;
nextCarry = u1 << 63;
u1 = u1 >> 1 | carry;
carry = nextCarry;
u0 = u0 >> 1 | carry;
}
fixed (ulong* p = Cells)
{
p[0] = u0;
p[1] = u1;
p[2] = u2;
p[3] = u3;
p[4] = u4;
}
}
}
Testing code
static void Main(string[] args)
{
BitField bf = new BitField(0);
Stopwatch sw = new Stopwatch();
// Call to remove the compilation time from measurements
bf.StuffUnrolledNonManaged();
sw.Start();
bf.StuffUnrolledNonManaged();
sw.Stop();
Console.WriteLine($"Non managed access unrolled in: {sw.Elapsed.TotalMilliseconds}ms");
}
This code finishes in about 1.1 seconds.
Note: Fixed array access alone is not enough to match the C++ performance. If we don't use the local variables (every instance of u0 is replaced by p[0], etc.), the time is about 3.6 seconds.
If we use only fixed access with the code from the question (calling the Left() and Right() functions in a loop), the time is about 5.8 seconds.
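For reference, this is what a single unrolled //left block looks like in the 3.6-second variant, where the locals are dropped and p[...] is read and written directly inside the same fixed (ulong* p = Cells) block (a sketch reconstructed from the description above):

//left
carry = 0;
nextCarry = p[0] >> 63;
p[0] = p[0] << 1 | carry;
carry = nextCarry;
nextCarry = p[1] >> 63;
p[1] = p[1] << 1 | carry;
carry = nextCarry;
nextCarry = p[2] >> 63;
p[2] = p[2] << 1 | carry;
carry = nextCarry;
nextCarry = p[3] >> 63;
p[3] = p[3] << 1 | carry;
carry = nextCarry;
p[4] = p[4] << 1 | carry;

Each p[i] here is a real memory access through the pinned pointer, which is what costs the extra ~2.5 seconds compared to keeping u0..u4 in locals (and, ideally, registers).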
Part of the difference may be due to differences in code between the two versions - you don't assign to nextCarry in the C++ Left nor in the C# Right, but those could be typos in the example.
You'd want to look at the disassembly of both to see the difference, but primarily it is due to the C++ compiler having more time to spend optimizing the code. In this case it unrolls the loops, inlines all the function calls (including the constructor), and shoves all of the stuff in Cells_l into registers. So there's one big loop using registers and no accesses to memory.
I haven't looked at the C# compiled output but I doubt it does anything close to that.
Also, as mentioned in a comment, replace all the Cells.Length calls in your C# code with 5 (just like you have in the C++ code).
Related
I want to separate the bits of a short[] contained in a byte[] so that the most significant bits of each short are arranged in one contiguous block (array? line?), followed by the next bit from each short, and so on.
This is a condensed sample of how the layout of the bits would change:
0101010101010101 0101010101010101
would become
0011001100110011 0011001100110011
or with 3 shorts it would look like
0101010101010101 0101010101010101 0101010101010101
which would become
0001110001110001 1100011100011100 0111000111000111
I put that in a code block to preserve the line breaks.
This would be easy if I could address each bit individually, but I have to use bitwise operators, which makes it extremely difficult.
Ignoring the possibility that the number of elements in the array isn't a multiple of the base type's bit length (16 in this case), I came up with this:
fixed(byte* inptr = sourcearray){ //the shorts in a byte[]
fixed(byte* outptr = destination){//the output byte[]
var insamples = (short*)inptr;
var outsamples = (ushort*)outptr;
var mask = (ushort)0b1000000000000000;
for(int i = 0, j = 0; i < numsamples; ++i, j += 16){
if(j >= numsamples){
j = 0;
mask >>= 1;
}
outsamples[i] = (ushort)((insamples[j] & mask) | ((insamples[j + 1] & mask) >> 1) | ((insamples[j + 2] & mask) >> 2) | ((insamples[j + 3] & mask) >> 3) |
((insamples[j + 4] & mask) >> 4) | ((insamples[j + 5] & mask) >> 5) | ((insamples[j + 6] & mask) >> 6) | ((insamples[j + 7] & mask) >> 7) |
((insamples[j + 8] & mask) >> 8) | ((insamples[j + 9] & mask) >> 9) | ((insamples[j + 10] & mask) >> 10) | ((insamples[j + 11] & mask) >> 11) |
((insamples[j + 12] & mask) >> 12) | ((insamples[j + 13] & mask) >> 13) | ((insamples[j + 14] & mask) >> 14) | ((insamples[j + 15] & mask) >> 15));
}
}
}
The array I'm working with is 480 shorts (960 bytes) long. I'm pretty sure it does what I want, but I'm having trouble writing the function that does the opposite to restore the array to its original state. So far I have nothing that makes sense. I need it to be reasonably optimal to minimize the processing required, but it's hurting my brain.
I would probably be better off doing this in C++, but I want to keep the program entirely managed.
I hate to answer my own question, but I have just discovered the System.Collections.BitArray class, which allows me to address bits individually, and within minutes I replaced the code in the OP with this:
for(int i = 0, j = 0, k = 0; i < inbits.Length; ++i, j += 16){
if(j >= inbits.Length) j = ++k;
_outbitsout[i] = inbits[j];
}
and to reverse that operation:
var stride = inbits.Length/16;
for(int i = 0, j = 0, k = 0; i < inbits.Length; ++i, j += stride){
if(j >= inbits.Length) j = ++k;
_outbitsin[i] = inbits[j];
}
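For completeness, a minimal sketch of how the BitArray plumbing above can be wired to the original byte[] buffers (names follow the earlier snippets; note that BitArray indexes bits within each byte least-significant-bit first, which differs from the MSB-first masking in the original pointer version):

using System.Collections;

byte[] sourcearray = new byte[960];             // the shorts, stored in a byte[] as in the question
var inbits = new BitArray(sourcearray);         // wraps the byte[]; bit 0 is the LSB of byte 0
var _outbitsout = new BitArray(inbits.Length);
for (int i = 0, j = 0, k = 0; i < inbits.Length; ++i, j += 16)
{
    if (j >= inbits.Length) j = ++k;
    _outbitsout[i] = inbits[j];
}
byte[] destination = new byte[sourcearray.Length];
_outbitsout.CopyTo(destination, 0);             // pack the rearranged bits back into bytes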
I'm packing some binary data as a short, but want to have four values of 0-F, and I would like to do this without a bunch of switch() cases reading the String.Split of a hex string.
Does someone have a clever, elegant solution for this, or should I just long-hand it?
e.g. 0x1C4A = (1, 12, 4, 10)
Shift in and out
var a = 1;
var b = 12;
var c = 4;
var d = 10;
// in
var packed = (short) ((a << 12) | (b << 8) | (c << 4) | d);
// out
a = (packed >> 12) & 0xf;
b = (packed >> 8) & 0xf;
c = (packed >> 4) & 0xf;
d = packed & 0xF;
Console.WriteLine(a);
Console.WriteLine(b);
Console.WriteLine(c);
Console.WriteLine(d);
Output
1
12
4
10
You can shift by 4 (or divide and multiply by 16) to move numbers into different place values. Then mask and shift your packed number to get your original numbers back.
Eg if you want to store 1 and 2 you could do:
int packed = (1 << 4) + 2;
int v1 = (packed & 0xF0) >> 4;
int v2 = packed & 0x0F;
Console.WriteLine($"{v1}, {v2}");
>>> 1, 2
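If you need this for more than one packed value, the same shift-and-mask idea generalizes to a small loop; a hypothetical helper (the name and shape are mine, not from the answers above):

static int[] UnpackNibbles(ushort packed)
{
    var result = new int[4];
    for (int i = 0; i < 4; i++)
    {
        // Take 4 bits at a time, most significant nibble first.
        result[i] = (packed >> (12 - 4 * i)) & 0xF;
    }
    return result;
}

// UnpackNibbles(0x1C4A) => { 1, 12, 4, 10 }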
If I want to convert 4 bytes into an int, I can do this:
byte[] b = BitConverter.GetBytes(i1);
int i2 = BitConverter.ToInt32(b,0);
int i3 = b[0] | (b[1]<<8) | (b[2]<<16) | (b[3]<<24);
and then i1, i2, and i3 will all be equal.
but how do I do the same for a uint? This:
uint u1 = uint.MaxValue-1000;
byte[] b = BitConverter.GetBytes(u1);
uint u2 = BitConverter.ToUInt32(b,0);
uint u3 = (uint)(b[0] | (b[1]<<8) | (b[2]<<16) | (b[3]<<24));
results in an overflow for large uints.
It would only throw that exception in a checked context. See: http://msdn.microsoft.com/en-us/library/y3d0kef1(v=vs.80).aspx.
No exception:
uint u1 = uint.MaxValue - 1000;
byte[] b = BitConverter.GetBytes(u1);
uint u2 = BitConverter.ToUInt32(b, 0);
uint u3 = (uint) (b[0] | (b[1] << 8) | (b[2] << 16) | (b[3] << 24));
Exception:
checked
{
uint u1 = uint.MaxValue - 1000;
byte[] b = BitConverter.GetBytes(u1);
uint u2 = BitConverter.ToUInt32(b, 0);
uint u3 = (uint) (b[0] | (b[1] << 8) | (b[2] << 16) | (b[3] << 24));
}
No exception:
checked
{
unchecked
{
uint u1 = uint.MaxValue - 1000;
byte[] b = BitConverter.GetBytes(u1);
uint u2 = BitConverter.ToUInt32(b, 0);
uint u3 = (uint) (b[0] | (b[1] << 8) | (b[2] << 16) | (b[3] << 24));
Console.WriteLine(u1 + " " + u2 + " " + u3);
}
}
Make sure you're not compiling with the /checked option.
The exception is thrown by the cast from int to uint. Using the shift operator on the bytes (the line with uint u3 = ...) implicitly casts them to int. A uint with the MSB set ("1") is a negative int, which is out of range for uint. Using int causes no such exception because there is no explicit cast that might elicit an overflow exception.
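If you want the shift-based line to be safe even under /checked, one option (a sketch, not from the answer above) is to widen each byte to uint before shifting, so the whole expression stays unsigned and no int-to-uint cast is needed:

// Each byte is widened to uint before the shift, so no overflowing
// int-to-uint conversion ever happens, checked context or not.
uint u3 = b[0] | ((uint)b[1] << 8) | ((uint)b[2] << 16) | ((uint)b[3] << 24);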
I ran your code with uint values up to 4,294,967,295, which is the max, and it works fine in all cases.
Is there a trick for creating a faster integer modulus than the standard % operator for particular bases?
For my program, I'd be looking for around 1000-4000 (e.g. n%2048). Is there a quicker way to perform n modulus 2048 than simply: n%2048?
If the denominator is known at compile time to be a power of 2, like your example of 2048, you could subtract 1 and do a bitwise-and.
That is:
n % m == n & (m - 1)
...where m is a power of 2.
For example:
22 % 8 == 22 - 16 == 6
         Dec         Bin
       -----       -----
          22   =   10110
           8   =   01000
       8 - 1   =   00111

22 & (8 - 1)   =   10110
               &   00111
                   -----
           6   =   00110
Bear in mind that a good compiler will have its own optimizations for %, maybe even enough to be as fast as the above technique. Arithmetic operators tend to be pretty heavily optimized.
For powers of two 2^n, all you have to do is zero out all bits except the last n bits.
For example (assuming 32 bit integers):
x%2 is equivalent to x & 0x00000001
x%4 is equivalent to x & 0x00000003
In general x % (2^n) is equal to x & (2^n-1). Written out in C, this would be x & ((1<<n)-1).
This is because 2^n gives you a 1 in the (n+1)th bit (from the right). So 2^n - 1 will give you n ones on the right, and zeros on the left.
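A quick sanity check of the identity for the 2048 case from the question (C#; this holds for unsigned or non-negative values):

int n = 123456;
Console.WriteLine(n % 2048);            // 576
Console.WriteLine(n & ((1 << 11) - 1)); // 576, since 2048 == 1 << 11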
You could zero out the high-order bits, i.e.:
x = 11 = 1011
x % 4 = 3 = 0011
so for x % 4 you could just take the last 2 bits. I'm not sure what would happen if negative numbers were used, though.
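To answer that aside: in C# the sign of % follows the dividend, while the mask only looks at the two's-complement bit pattern, so the two disagree for negative inputs:

int x = -5;
Console.WriteLine(x % 4); // -1 (sign follows the dividend)
Console.WriteLine(x & 3); //  3 (low bits of the two's-complement pattern)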
Here are a few techniques that replicate the modulus operation.
Of those benchmarked, this was the fastest (modified to fit your 2048 scenario). As long as your "max" isn't in the millions and is in the 1000-4000 range you mentioned, it may work faster for you too:
int threshold = 2048; //the number to mod by
int max = 1000; //the number on the left. Ex: 1000 % 2048
int total = 0;
int y = 0;
for (int x = 0; x < max; x++)
{
if (y > (threshold - 1))
{
y = 0;
total += x;
}
y += 1;
}
return total;
Give it a go. It performed faster on the author's machine at various settings, so should perform admirably well for you too.
Branchless non-power-of-two modulus is possible by precomputing magic constants at run-time, to implement division using a multiply-add-shift.
This is roughly 2x faster than the built-in modulo operator % on my Intel Core i5.
I'm surprised it's not more dramatic, as x86 CPU div instructions can have latencies as high as 80-90 cycles for 64-bit division on some CPUs, compared to mul at 3 cycles and bitwise ops at 1 cycle each.
Proof of concept and timings shown below. series_len refers to the number of modulus ops performed in series on a single var. That's to prevent the CPU from hiding latencies through parallelization.
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/time.h>
typedef int32_t s32;
typedef uint32_t u32;
typedef uint64_t u64;
#define NUM_NUMS 1024
#define NUM_RUNS 500
#define MAX_NUM UINT32_MAX
#define MAX_DEN 1024
struct fastdiv {
u32 mul;
u32 add;
s32 shift;
u32 _odiv; /* save original divisor for modulo calc */
};
static u32 num[NUM_NUMS];
static u32 den[NUM_NUMS];
static struct fastdiv fd[NUM_NUMS];
/* hash of results to prevent gcc from optimizing out our ops */
static u32 cookie = 0;
/* required for magic constant generation */
u32 ulog2(u32 v) {
u32 r, shift;
r = (v > 0xFFFF) << 4; v >>= r;
shift = (v > 0xFF ) << 3; v >>= shift; r |= shift;
shift = (v > 0xF ) << 2; v >>= shift; r |= shift;
shift = (v > 0x3 ) << 1; v >>= shift; r |= shift;
r |= (v >> 1);
return r;
}
/* generate constants for implementing a division with multiply-add-shift */
void fastdiv_make(struct fastdiv *d, u32 divisor) {
u32 l, r, e;
u64 m;
d->_odiv = divisor;
l = ulog2(divisor);
if (divisor & (divisor - 1)) {
m = 1ULL << (l + 32);
d->mul = (u32)(m / divisor);
r = (u32)m - d->mul * divisor;
e = divisor - r;
if (e < (1UL << l)) {
++d->mul;
d->add = 0;
} else {
d->add = d->mul;
}
d->shift = l;
} else {
if (divisor == 1) {
d->mul = 0xffffffff;
d->add = 0xffffffff;
d->shift = 0;
} else {
d->mul = 0x80000000;
d->add = 0;
d->shift = l-1;
}
}
}
/* 0: use function that checks for a power-of-2 modulus (speedup for POTs)
* 1: use inline macro */
#define FASTMOD_BRANCHLESS 0
#define fastdiv(v,d) ((u32)(((u64)(v)*(d)->mul + (d)->add) >> 32) >> (d)->shift)
#define _fastmod(v,d) ((v) - fastdiv((v),(d)) * (d)->_odiv)
#if FASTMOD_BRANCHLESS
#define fastmod(v,d) _fastmod((v),(d))
#else
u32 fastmod(u32 v, struct fastdiv *d) {
if (d->mul == 0x80000000) {
return (v & ((1 << d->shift) - 1));
}
return _fastmod(v,d);
}
#endif
u32 random32(u32 upper_bound) {
return arc4random_uniform(upper_bound);
}
u32 random32_range(u32 lower_bound, u32 upper_bound) {
return random32(upper_bound - lower_bound) + lower_bound;
}
void fill_arrays() {
int i;
for (i = 0; i < NUM_NUMS; ++i) {
num[i] = random32_range(MAX_DEN, MAX_NUM);
den[i] = random32_range(1, MAX_DEN);
fastdiv_make(&fd[i], den[i]);
}
}
void fill_arrays_pot() {
u32 log_bound, rand_log;
int i;
log_bound = ulog2(MAX_DEN);
for (i = 0; i < NUM_NUMS; ++i) {
num[i] = random32_range(MAX_DEN, MAX_NUM);
rand_log = random32(log_bound) + 1;
den[i] = 1 << rand_log;
fastdiv_make(&fd[i], den[i]);
}
}
u64 clock_ns() {
struct timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_sec*1000000000 + tv.tv_usec*1000;
}
void use_value(u32 v) {
cookie += v;
}
int main(int argc, char **arg) {
u64 builtin_npot_ns;
u64 builtin_pot_ns;
u64 branching_npot_ns;
u64 branching_pot_ns;
u64 branchless_npot_ns;
u64 branchless_pot_ns;
u64 t0, t1;
u32 v;
int s, r, i, j;
int series_len;
builtin_npot_ns = builtin_pot_ns = 0;
branching_npot_ns = branching_pot_ns = 0;
branchless_npot_ns = branchless_pot_ns = 0;
for (s = 5; s >= 0; --s) {
series_len = 1 << s;
for (r = 0; r < NUM_RUNS; ++r) {
/* built-in NPOT */
fill_arrays();
t0 = clock_ns();
for (i = 0; i < NUM_NUMS; ++i) {
v = num[i];
for (j = 0; j < series_len; ++j) {
v /= den[i];
}
use_value(v);
}
t1 = clock_ns();
builtin_npot_ns += (t1 - t0) / NUM_NUMS;
/* built-in POT */
fill_arrays_pot();
t0 = clock_ns();
for (i = 0; i < NUM_NUMS; ++i) {
v = num[i];
for (j = 0; j < series_len; ++j) {
v /= den[i];
}
use_value(v);
}
t1 = clock_ns();
builtin_pot_ns += (t1 - t0) / NUM_NUMS;
/* branching NPOT */
fill_arrays();
t0 = clock_ns();
for (i = 0; i < NUM_NUMS; ++i) {
v = num[i];
for (j = 0; j < series_len; ++j) {
v = fastmod(v, fd+i);
}
use_value(v);
}
t1 = clock_ns();
branching_npot_ns += (t1 - t0) / NUM_NUMS;
/* branching POT */
fill_arrays_pot();
t0 = clock_ns();
for (i = 0; i < NUM_NUMS; ++i) {
v = num[i];
for (j = 0; j < series_len; ++j) {
v = fastmod(v, fd+i);
}
use_value(v);
}
t1 = clock_ns();
branching_pot_ns += (t1 - t0) / NUM_NUMS;
/* branchless NPOT */
fill_arrays();
t0 = clock_ns();
for (i = 0; i < NUM_NUMS; ++i) {
v = num[i];
for (j = 0; j < series_len; ++j) {
v = _fastmod(v, fd+i);
}
use_value(v);
}
t1 = clock_ns();
branchless_npot_ns += (t1 - t0) / NUM_NUMS;
/* branchless POT */
fill_arrays_pot();
t0 = clock_ns();
for (i = 0; i < NUM_NUMS; ++i) {
v = num[i];
for (j = 0; j < series_len; ++j) {
v = _fastmod(v, fd+i);
}
use_value(v);
}
t1 = clock_ns();
branchless_pot_ns += (t1 - t0) / NUM_NUMS;
}
builtin_npot_ns /= NUM_RUNS;
builtin_pot_ns /= NUM_RUNS;
branching_npot_ns /= NUM_RUNS;
branching_pot_ns /= NUM_RUNS;
branchless_npot_ns /= NUM_RUNS;
branchless_pot_ns /= NUM_RUNS;
printf("series_len = %d\n", series_len);
printf("----------------------------\n");
printf("builtin_npot_ns : %llu ns\n", builtin_npot_ns);
printf("builtin_pot_ns : %llu ns\n", builtin_pot_ns);
printf("branching_npot_ns : %llu ns\n", branching_npot_ns);
printf("branching_pot_ns : %llu ns\n", branching_pot_ns);
printf("branchless_npot_ns : %llu ns\n", branchless_npot_ns);
printf("branchless_pot_ns : %llu ns\n\n", branchless_pot_ns);
}
printf("cookie=%u\n", cookie);
}
Results
Intel Core i5 (MacBookAir7,2), macOS 10.11.6, clang 8.0.0
series_len = 32
----------------------------
builtin_npot_ns : 218 ns
builtin_pot_ns : 225 ns
branching_npot_ns : 115 ns
branching_pot_ns : 42 ns
branchless_npot_ns : 110 ns
branchless_pot_ns : 110 ns
series_len = 16
----------------------------
builtin_npot_ns : 87 ns
builtin_pot_ns : 89 ns
branching_npot_ns : 47 ns
branching_pot_ns : 19 ns
branchless_npot_ns : 45 ns
branchless_pot_ns : 45 ns
series_len = 8
----------------------------
builtin_npot_ns : 32 ns
builtin_pot_ns : 34 ns
branching_npot_ns : 18 ns
branching_pot_ns : 10 ns
branchless_npot_ns : 17 ns
branchless_pot_ns : 17 ns
series_len = 4
----------------------------
builtin_npot_ns : 15 ns
builtin_pot_ns : 16 ns
branching_npot_ns : 8 ns
branching_pot_ns : 3 ns
branchless_npot_ns : 7 ns
branchless_pot_ns : 7 ns
series_len = 2
----------------------------
builtin_npot_ns : 8 ns
builtin_pot_ns : 7 ns
branching_npot_ns : 4 ns
branching_pot_ns : 2 ns
branchless_npot_ns : 2 ns
branchless_pot_ns : 2 ns
The fastest way to multiply or divide unsigned integers is by bit-shifting them left or right. Shift operations map directly to CPU instructions. For example, 3 << 1 = 6, while 4 >> 1 = 2.
You can use the same trick to calculate the modulus: shift an integer far enough to the left so that only the remainder bits are left, then shift it back right so you can read the remainder value.
On the other hand, integer modulo also exists as a CPU instruction. If the integer modulo operator maps to this instruction in optimized builds, you will not see any improvement from the bit-shift trick.
The following code calculates 7 % 4 by shifting far enough that only the last 2 bits are left (since 4 = 2^2). This means we need to shift by 30 bits:
uint i = 7;
var modulo = ((i << 30) >> 30);
The result is 3.
EDIT:
I just read all the solutions proposing simply erasing the higher-order bits. It has the same effect, but is a lot simpler and more direct.
If you are dividing by literals that are powers of two, then the answer is probably no: any decent compiler will automatically turn such expressions into a variation of an AND operation, which is pretty close to optimal.
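A small illustration of that point (a sketch; the exact code generation depends on the compiler): with an unsigned operand, n % 2048 can compile to a single AND, whereas a signed operand needs extra sign fix-up code around the mask.

uint n = 123456u;
uint r = n % 2048u;   // typically compiled down to: n & 2047
Console.WriteLine(r); // 576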
I'm trying to load some decimal values from a file but I can't work out the correct way to take the raw values and convert them into decimals.
I've read the file out into a byte array, and each chunk of four bytes is supposed to represent one decimal value. To help figure it out, I've constructed a table of how the decimal values 1 through to 46 are represented as four byte chunks.
For instance, the number 1 appears as 0,0,128,63; the number 2 as 0,0,0,64; and so on up to 46, which is 0,0,56,66. The full table is available here.
There is also another series of numbers which go to three decimal places and include negatives, which is here.
The only documentation I have states
They are stored least significant byte first: 1's, 256's, 65536's, 16777216's. This makes the hex sequence 01 01 00 00 into the number 257 (decimal). In C/C++, to read e.g. a float, do: float x; fread(&x, sizeof(float), 1, fileptr);
However, I'm using .NET's File.ReadAllBytes method, so this isn't much help. If anyone can spare a few minutes to look at the example files and see if they can spot a way to convert the values to decimals, I'd be most grateful.
You can use BitConverter.ToSingle to read a float value from a byte array, so to get a sequence of floats, you could do something like this:
byte[] data = File.ReadAllBytes(fileName);
int count = data.Length / 4;
Debug.Assert(data.Length % 4 == 0);
IEnumerable<float> values = Enumerable.Range(0, count)
.Select(i => BitConverter.ToSingle(data, i*4));
Have you looked into using the BitConverter class? It converts between byte arrays and various types.
Edit:
MSDN has a helpful comment on the documentation for BitConverter at http://msdn.microsoft.com/en-us/library/system.bitconverter_methods(v=vs.85).aspx:
public static decimal ToDecimal(byte[] bytes)
{
int[] bits = new int[4];
bits[0] = ((bytes[0] | (bytes[1] << 8)) | (bytes[2] << 0x10)) | (bytes[3] << 0x18); //lo
bits[1] = ((bytes[4] | (bytes[5] << 8)) | (bytes[6] << 0x10)) | (bytes[7] << 0x18); //mid
bits[2] = ((bytes[8] | (bytes[9] << 8)) | (bytes[10] << 0x10)) | (bytes[11] << 0x18); //hi
bits[3] = ((bytes[12] | (bytes[13] << 8)) | (bytes[14] << 0x10)) | (bytes[15] << 0x18); //flags
return new decimal(bits);
}
public static byte[] GetBytes(decimal d)
{
byte[] bytes = new byte[16];
int[] bits = decimal.GetBits(d);
int lo = bits[0];
int mid = bits[1];
int hi = bits[2];
int flags = bits[3];
bytes[0] = (byte)lo;
bytes[1] = (byte)(lo >> 8);
bytes[2] = (byte)(lo >> 0x10);
bytes[3] = (byte)(lo >> 0x18);
bytes[4] = (byte)mid;
bytes[5] = (byte)(mid >> 8);
bytes[6] = (byte)(mid >> 0x10);
bytes[7] = (byte)(mid >> 0x18);
bytes[8] = (byte)hi;
bytes[9] = (byte)(hi >> 8);
bytes[10] = (byte)(hi >> 0x10);
bytes[11] = (byte)(hi >> 0x18);
bytes[12] = (byte)flags;
bytes[13] = (byte)(flags >> 8);
bytes[14] = (byte)(flags >> 0x10);
bytes[15] = (byte)(flags >> 0x18);
return bytes;
}
The .NET library implements the Decimal.GetBytes() method internally.
I've used the decompiled .NET library to create simple conversion methods between decimal and byte array - you can find them here:
https://gist.github.com/eranbetzalel/5384006#file-decimalbytesconvertor-cs
EDIT : Here is the full source code from my link.
public decimal BytesToDecimal(byte[] buffer, int offset = 0)
{
var decimalBits = new int[4];
decimalBits[0] = buffer[offset + 0] | (buffer[offset + 1] << 8) | (buffer[offset + 2] << 16) | (buffer[offset + 3] << 24);
decimalBits[1] = buffer[offset + 4] | (buffer[offset + 5] << 8) | (buffer[offset + 6] << 16) | (buffer[offset + 7] << 24);
decimalBits[2] = buffer[offset + 8] | (buffer[offset + 9] << 8) | (buffer[offset + 10] << 16) | (buffer[offset + 11] << 24);
decimalBits[3] = buffer[offset + 12] | (buffer[offset + 13] << 8) | (buffer[offset + 14] << 16) | (buffer[offset + 15] << 24);
return new Decimal(decimalBits);
}
public byte[] DecimalToBytes(decimal number)
{
var decimalBuffer = new byte[16];
var decimalBits = Decimal.GetBits(number);
var lo = decimalBits[0];
var mid = decimalBits[1];
var hi = decimalBits[2];
var flags = decimalBits[3];
decimalBuffer[0] = (byte)lo;
decimalBuffer[1] = (byte)(lo >> 8);
decimalBuffer[2] = (byte)(lo >> 16);
decimalBuffer[3] = (byte)(lo >> 24);
decimalBuffer[4] = (byte)mid;
decimalBuffer[5] = (byte)(mid >> 8);
decimalBuffer[6] = (byte)(mid >> 16);
decimalBuffer[7] = (byte)(mid >> 24);
decimalBuffer[8] = (byte)hi;
decimalBuffer[9] = (byte)(hi >> 8);
decimalBuffer[10] = (byte)(hi >> 16);
decimalBuffer[11] = (byte)(hi >> 24);
decimalBuffer[12] = (byte)flags;
decimalBuffer[13] = (byte)(flags >> 8);
decimalBuffer[14] = (byte)(flags >> 16);
decimalBuffer[15] = (byte)(flags >> 24);
return decimalBuffer;
}
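A quick round-trip check of the two methods (assuming they are in scope, e.g. on the same class):

decimal original = 123.456m;
byte[] raw = DecimalToBytes(original);
decimal restored = BytesToDecimal(raw);
Console.WriteLine(original == restored); // True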
As others have mentioned, use the BitConverter class, see the example below:
byte[] bytez = new byte[] { 0x00, 0x00, 0x80, 0x3F };
float flt = BitConverter.ToSingle(bytez, 0); // 1.0
bytez = new byte[] { 0x00, 0x00, 0x00, 0x40 };
flt = BitConverter.ToSingle(bytez, 0); // 2.0
bytez = new byte[] { 0, 0, 192, 190 };
flt = BitConverter.ToSingle(bytez, 0); // -0.375