In C# or C++ how can I implement a branch-free sort of three (integer) numbers?
Is this possible?
Yes. No conditionals are needed — only shifts and arithmetic:
#include <limits.h>  /* CHAR_BIT */

int abs (int a)
{
    int b = a;
    /* b is 1 if a is negative, 0 otherwise (arithmetic shift of the sign bit). */
    b = (b >> (sizeof(int)*CHAR_BIT-1)) & 1;
    /* negative: 2*1*(-a) + a == -a; non-negative: 2*0*(-a) + a == a */
    return 2 * b * (-a) + a;
}
int max (int a, int b) { return (a + b + abs(a - b)) / 2; }  /* note: a+b can overflow for large magnitudes */
int min (int a, int b) { return (a + b - abs(a - b)) / 2; }
void sort (int & a, int & b, int & c)
{
    int maxnum = max(max(a,b), c);
    int minnum = min(min(a,b), c);
    int middlenum = a + b + c - maxnum - minnum;
    a = maxnum;
    b = middlenum;
    c = minnum;
}
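Since the question also asks about C#, here is a rough sketch of the same idea in C# (an untested translation of mine, not from the original answer; it uses the classic xor-and-subtract form of abs, C# guarantees that >> on int is an arithmetic shift, and a + b can still overflow for inputs of large magnitude):

static int Abs(int a)
{
    int mask = a >> 31;       // 0 for non-negative a, -1 for negative a
    return (a ^ mask) - mask; // flips the bits and adds 1 when a is negative
}

static int Max(int a, int b) { return (a + b + Abs(a - b)) / 2; }
static int Min(int a, int b) { return (a + b - Abs(a - b)) / 2; }

static void Sort(ref int a, ref int b, ref int c)
{
    int max = Max(Max(a, b), c);
    int min = Min(Min(a, b), c);
    int mid = a + b + c - max - min;
    a = max; b = mid; c = min;   // descending, as in the C++ version
}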
You can write branch-free max, min and swap functions. Once you have these, you can use them to write the sort function as:
void sort(int &a, int &b, int &c)
{
    int m1 = max(a,b,c);
    int m2 = min(a,b,c);
    b = a + b + c - m1 - m2;
    swap(m1, a);
    swap(m2, c);
}
And here are the helper functions:
void swap(int &a, int &b)
{
    int tmp = a; a = b; b = tmp;
}
int max( int a, int b, int c ) {
    int l1[] = { a, b };
    int l2[] = { l1[ a<b ], c };   /* l1[a<b] is max(a,b) */
    return l2[ l2[0] < c ];        /* pick c if it is larger */
}

int min( int a, int b, int c ) {
    int l1[] = { a, b };
    int l2[] = { l1[ a>b ], c };   /* l1[a>b] is min(a,b) */
    return l2[ l2[0] > c ];        /* pick c if it is smaller */
}
Test code:
int main() {
    int a,b,c;
    std::cin >> a >> b >> c;
    sort(a,b,c);
    std::cout << a << "," << b << "," << c << std::endl;
    return 0;
}
Input:
21 242 434
Output (descending order):
434, 242, 21
Demo : http://ideone.com/3ZOzc
I took the implementation of max from David's answer here, and implemented min with a small twist.
You can do this in C++ with:
#include <cstdlib>   // std::abs
#include <iostream>

void sort(int *in) {
    const int sum  = in[0]+in[1];
    const int diff = std::abs(in[1]-in[0]);
    in[0] = (sum + diff) / 2;   // larger of the pair
    in[1] = (sum - diff) / 2;   // smaller of the pair
}
int main() {
    int a[] = {3,4,1};
    sort(a);
    sort(a+1);
    sort(a);
    std::cout << a[0] << "," << a[1] << "," << a[2] << std::endl;
    int b[] = {1,2,3};
    sort(b);
    sort(b+1);
    sort(b);
    std::cout << b[0] << "," << b[1] << "," << b[2] << std::endl;
}
The trick is to express the min/max of a pair as arithmetic rather than branches, and then to call the pairwise sort enough times to "bubble sort" the three elements.
I've made a totally generic version, using template meta-programming to call sort the right number of times. It all gets inlined exactly as you'd hope with gcc 4.7.0 on my x86 box (although the call would be unconditional on x86 anyway). I've also implemented an abs function that avoids branches on x86; it makes a few assumptions about integers that make it less portable, but it's based on gcc's __builtin_abs implementation for x86:
#include <iostream>
#include <limits.h>
void myabs(int& in) {
    // tmp is 0 for non-negative in, -1 for negative in (arithmetic shift).
    const int tmp = in >> ((sizeof(int) * CHAR_BIT) - 1);
    in ^= tmp;
    // Note: this actually computes -|in|, not |in|; the sorter below
    // compensates — adding the negated diff yields the min, subtracting it the max.
    in = tmp - in;
}
template <int N, int I=1, bool C=false>
struct sorter {
    static void sort(int *in) {
        const int sum = in[I-0]+in[I-1];
        int diff = in[I-1]-in[I-0];
        myabs(diff);                  // diff becomes -|diff|
        in[I-0] = (sum + diff) / 2;   // min of the pair
        in[I-1] = (sum - diff) / 2;   // max of the pair
        sorter<N, I+1, (I+1>=N)>::sort(in);   // parentheses keep the '>' inside the template argument
    }
};
template <int N,int I>
struct sorter<N,I,true> {
    static void sort(int *in) {
        sorter<N-1>::sort(in);
    }
};

template <int I, bool C>
struct sorter<0,I,C> {
    static void sort(int *) {
    }
};
int main() {
    int a[] = {3,4,1};
    sorter<3>::sort(a);
    std::cout << a[0] << "," << a[1] << "," << a[2] << std::endl;
}
I'm developing code to do arithmetic in the Galois field GF(2^8), and I think I'm getting wrong results on multiplication operations.
private static byte Multiply(byte a, byte b)
{
    byte result = 0;
    while (b != 0)
    {
        if ((b & 1) != 0)
        {
            result ^= a;
        }
        a <<= 1;
        b >>= 1;
    }
    return result;
}
Multiply(1, 2) gives the correct value of 2, but Multiply(240, 249) gives me 112 instead of the expected 148. I'm not sure whether this Russian peasant multiplication approach can give the right value at all. Maybe there's another algorithm that gives correct results?
The problem is that the product is never reduced modulo the field's generator polynomial (and the high bit of a is shifted out of the byte and lost). Example code that reduces by POLY = 0x11D as it goes:
#define POLY 0x11D

static BYTE GFMpy(BYTE b0, BYTE b1)
{
    int i;
    int product;
    product = 0;
    for(i = 0; i < 8; i++){
        product <<= 1;
        if(product & 0x100){
            product ^= POLY;        /* reduce modulo the field polynomial */
        }
        if(b0 & 0x80u){
            product ^= b1;          /* add (xor) the shifted multiplicand */
        }
        b0 <<= 1;
    }
    return((BYTE)product);
}
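For reference, here is a sketch of the same algorithm ported back to the question's C# (my translation, assuming the intended field polynomial is 0x11D as in the C code above). With that polynomial, GFMpy(240, 249) returns the expected 148:

private static byte GFMpy(byte b0, byte b1)
{
    const int POLY = 0x11D;
    int product = 0;
    for (int i = 0; i < 8; i++)
    {
        product <<= 1;
        if ((product & 0x100) != 0)
            product ^= POLY;   // reduce modulo the field polynomial
        if ((b0 & 0x80) != 0)
            product ^= b1;     // add (xor) the shifted multiplicand
        b0 <<= 1;              // high bit was already consumed above
    }
    return (byte)product;
}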
Example using lookup tables:
#define POLY (0x11d)
/* all non-zero elements are powers of 2 for POLY == 0x11d */
typedef unsigned char BYTE;
/* ... */
static BYTE exp2[512];
static BYTE log2[256];
/* ... */
static void Tbli()
{
    int i;
    int b;
    b = 0x01;                       /* init exp2 table */
    for(i = 0; i < 512; i++){
        exp2[i] = (BYTE)b;
        b = (b << 1);               /* powers of 2 */
        if(b & 0x100)
            b ^= POLY;
    }
    log2[0] = 0xff;                 /* init log2 table */
    for(i = 0; i < 255; i++)
        log2[exp2[i]] = (BYTE)i;
}
/* ... */
static BYTE GFMpy(BYTE m0, BYTE m1)      /* multiply */
{
    if(0 == m0 || 0 == m1)
        return(0);
    /* exp2[] has 512 entries, so the sum of two logs (at most 508) needs no mod 255 */
    return(exp2[log2[m0] + log2[m1]]);
}

/* ... */

static BYTE GFDiv(BYTE m0, BYTE m1)      /* divide; assumes m1 != 0 */
{
    if(0 == m0)
        return(0);
    return(exp2[log2[m0] + 255 - log2[m1]]);
}
I found that my application spends 25% of its time doing this in a loop:
private static int Diff (int c0, int c1)
{
    unsafe {
        byte* pc0 = (byte*) &c0;
        byte* pc1 = (byte*) &c1;
        int d0 = pc0[0] - pc1[0];
        int d1 = pc0[1] - pc1[1];
        int d2 = pc0[2] - pc1[2];
        int d3 = pc0[3] - pc1[3];
        d0 *= d0;
        d1 *= d1;
        d2 *= d2;
        d3 *= d3;
        return d0 + d1 + d2 + d3;
    }
}
How can I improve the performance of this method? My ideas so far:
Most obviously, this would benefit from SIMD, but let us suppose I don't want to go there because it is a bit of a hassle.
Same goes for lower level stuff (calling a C library, executing on GPGPU)
Multithreading - I'll use that.
Edit: For your convenience, some test code which reflects the real environment and use case. (In reality even more data are involved, and data are not compared in single large blocks but in many chunks of several kb each.)
using System;
using System.Diagnostics;
using System.Linq;

public static class ByteCompare
{
    private static void Main ()
    {
        const int n = 1024 * 1024 * 20;
        const int repeat = 20;
        var rnd = new Random (0);
        Console.Write ("Generating test data... ");
        var t0 = Enumerable.Range (1, n)
            .Select (x => rnd.Next (int.MinValue, int.MaxValue))
            .ToArray ();
        var t1 = Enumerable.Range (1, n)
            .Select (x => rnd.Next (int.MinValue, int.MaxValue))
            .ToArray ();
        Console.WriteLine ("complete.");
        GC.Collect (2, GCCollectionMode.Forced);
        Console.WriteLine ("GCs: " + GC.CollectionCount (0));
        {
            var sw = Stopwatch.StartNew ();
            long res = 0;
            for (int reps = 0; reps < repeat; reps++) {
                for (int i = 0; i < n; i++) {
                    int c0 = t0[i];
                    int c1 = t1[i];
                    res += ByteDiff_REGULAR (c0, c1);
                }
            }
            sw.Stop ();
            Console.WriteLine ("res=" + res + ", t=" + sw.Elapsed.TotalSeconds.ToString ("0.00") + "s - ByteDiff_REGULAR");
        }
        {
            var sw = Stopwatch.StartNew ();
            long res = 0;
            for (int reps = 0; reps < repeat; reps++) {
                for (int i = 0; i < n; i++) {
                    int c0 = t0[i];
                    int c1 = t1[i];
                    res += ByteDiff_UNSAFE (c0, c1);
                }
            }
            sw.Stop ();
            Console.WriteLine ("res=" + res + ", t=" + sw.Elapsed.TotalSeconds.ToString ("0.00") + "s - ByteDiff_UNSAFE");
        }
        Console.WriteLine ("GCs: " + GC.CollectionCount (0));
        Console.WriteLine ("Test complete.");
        Console.ReadKey (true);
    }

    public static int ByteDiff_REGULAR (int c0, int c1)
    {
        var c00 = (byte) (c0 >> (8 * 0));
        var c01 = (byte) (c0 >> (8 * 1));
        var c02 = (byte) (c0 >> (8 * 2));
        var c03 = (byte) (c0 >> (8 * 3));
        var c10 = (byte) (c1 >> (8 * 0));
        var c11 = (byte) (c1 >> (8 * 1));
        var c12 = (byte) (c1 >> (8 * 2));
        var c13 = (byte) (c1 >> (8 * 3));
        var d0 = (c00 - c10);
        var d1 = (c01 - c11);
        var d2 = (c02 - c12);
        var d3 = (c03 - c13);
        d0 *= d0;
        d1 *= d1;
        d2 *= d2;
        d3 *= d3;
        return d0 + d1 + d2 + d3;
    }

    private static int ByteDiff_UNSAFE (int c0, int c1)
    {
        unsafe {
            byte* pc0 = (byte*) &c0;
            byte* pc1 = (byte*) &c1;
            int d0 = pc0[0] - pc1[0];
            int d1 = pc0[1] - pc1[1];
            int d2 = pc0[2] - pc1[2];
            int d3 = pc0[3] - pc1[3];
            d0 *= d0;
            d1 *= d1;
            d2 *= d2;
            d3 *= d3;
            return d0 + d1 + d2 + d3;
        }
    }
}
which yields for me (running as x64 Release on an i5):
Generating test data... complete.
GCs: 8
res=18324555528140, t=1.46s - ByteDiff_REGULAR
res=18324555528140, t=1.15s - ByteDiff_UNSAFE
res=18324555528140, t=1.73s - Diff_Alex1
res=18324555528140, t=1.63s - Diff_Alex2
res=18324555528140, t=3.59s - Diff_Alex3
res=18325828513740, t=3.90s - Diff_Alex4
GCs: 8
Test complete.
Most obviously, this would benefit from SIMD, but let us suppose I don't want to go there because it is a bit of a hassle.
Well, avoid it if you want, but SIMD is actually fairly well supported directly from C#. Short of offloading to the GPU, I would expect it to be by far the biggest performance win if the larger algorithm lends itself to SIMD processing.
http://www.drdobbs.com/architecture-and-design/simd-enabled-vector-types-with-c/240168888
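For instance, here is a minimal sketch using System.Numerics.Vector<T> (the types the article above describes; it needs a SIMD-enabled JIT such as RyuJIT on x64). The method and array names are mine, and it assumes the packed ints can be exposed as plain byte arrays:

using System;
using System.Numerics;

static class SimdSketch
{
    // Sum of squared per-byte differences; x and y must have equal length.
    public static long SumSquaredByteDiffs(byte[] x, byte[] y)
    {
        long total = 0;
        int lanes = Vector<byte>.Count;   // e.g. 16 with SSE2, 32 with AVX2
        int i = 0;

        for (; i <= x.Length - lanes; i += lanes)
        {
            var vx = new Vector<byte>(x, i);
            var vy = new Vector<byte>(y, i);

            // Widen to 16 bit so the subtraction cannot wrap around.
            Vector.Widen(vx, out Vector<ushort> xLo, out Vector<ushort> xHi);
            Vector.Widen(vy, out Vector<ushort> yLo, out Vector<ushort> yHi);
            Vector<short> dLo = Vector.AsVectorInt16(xLo) - Vector.AsVectorInt16(yLo);
            Vector<short> dHi = Vector.AsVectorInt16(xHi) - Vector.AsVectorInt16(yHi);

            // Widen again to 32 bit before squaring (255^2 does not fit in a short);
            // Dot(v, v) then yields the sum of squares of one vector.
            Vector.Widen(dLo, out Vector<int> d0, out Vector<int> d1);
            Vector.Widen(dHi, out Vector<int> d2, out Vector<int> d3);
            total += Vector.Dot(d0, d0) + Vector.Dot(d1, d1)
                   + Vector.Dot(d2, d2) + Vector.Dot(d3, d3);
        }

        for (; i < x.Length; i++)   // scalar tail for the leftover elements
        {
            int d = x[i] - y[i];
            total += d * d;
        }
        return total;
    }
}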
Multithreading
Sure, use one thread per CPU core. You can also use constructs like Parallel.For and let .NET sort out how many threads to use. It's pretty good at that, but since you know this is certainly CPU bound you might (or might not) get a more optimal result by managing threads yourself.
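For example, a minimal sketch with thread-local partial sums, using Diff and the t0/t1 arrays from the question's test harness:

using System.Threading.Tasks;

static long DiffSumParallel(int[] t0, int[] t1)
{
    long total = 0;
    object gate = new object();

    Parallel.For(0, t0.Length,
        () => 0L,                                          // one running sum per worker
        (i, state, local) => local + Diff(t0[i], t1[i]),   // hot loop touches no shared state
        local => { lock (gate) { total += local; } });     // merge each worker's sum once

    return total;
}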
As for speeding up the actual code block, it may be faster to use bit masking and bit shifting to get the individual values to work on, rather than using pointers. That has the additional benefit that you don't need an unsafe code block, e.g.
var b0_leftmost = (byte) ((c0 >> 24) & 0xff);   // top byte of c0, masked to avoid sign extension
Besides the already mentioned SIMD options and running multiple operations in parallel, have you tried benchmarking some possible implementation variations on the theme? Like some of the options below.
I almost forgot to mention a very important optimization:
Add a using System.Runtime.CompilerServices;
Add the [MethodImpl(MethodImplOptions.AggressiveInlining)] attribute to your method.
Like this:
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int Diff(int c0, int c1)
{
    unsafe
    {
        byte* pc0 = (byte*)&c0;
        byte* pc1 = (byte*)&c1;
        int sum = 0;
        int dif = 0;
        for (var i = 0; i < 4; i++, pc0++, pc1++)
        {
            dif = *pc0 - *pc1;
            sum += (dif * dif);
        }
        return sum;
    }
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int Diff(int c0, int c1)
{
    unchecked
    {
        int sum = 0;
        int dif = 0;
        for (var i = 0; i < 4; i++)
        {
            dif = (c0 & 0xFF) - (c1 & 0xFF);
            c0 >>= 8;
            c1 >>= 8;
            sum += (dif * dif);
        }
        return sum;
    }
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int Diff(int c0, int c1)
{
    unsafe
    {
        int* difs = stackalloc int[4];
        byte* pc0 = (byte*)&c0;
        byte* pc1 = (byte*)&c1;
        difs[0] = pc0[0] - pc1[0];
        difs[1] = pc0[1] - pc1[1];
        difs[2] = pc0[2] - pc1[2];
        difs[3] = pc0[3] - pc1[3];
        return difs[0] * difs[0] + difs[1] * difs[1] + difs[2] * difs[2] + difs[3] * difs[3];
    }
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int Diff(int c0, int c1)
{
    unsafe
    {
        int* difs = stackalloc int[4];
        // Mask the top byte too: a bare (c0 >> 24) sign-extends negative ints,
        // which would explain the different res for Diff_Alex4 in the timings above.
        difs[0] = ((c0 >> 24) & 0xFF) - ((c1 >> 24) & 0xFF);
        difs[1] = ((c0 >> 16) & 0xFF) - ((c1 >> 16) & 0xFF);
        difs[2] = ((c0 >> 8) & 0xFF) - ((c1 >> 8) & 0xFF);
        difs[3] = (c0 & 0xFF) - (c1 & 0xFF);
        return difs[0] * difs[0] + difs[1] * difs[1] + difs[2] * difs[2] + difs[3] * difs[3];
    }
}
I tried to reduce the IL instruction count (it looks like that's the only option left for single-threaded, non-SIMD code). This code runs 35% faster than the version in the question on my machine. You could also try generating the IL yourself via the Emit API, which gives you finer control.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int ByteDiff_UNSAFE_2 (int c0, int c1)
{
    unsafe {
        byte* pc0 = (byte*) &c0;
        byte* pc1 = (byte*) &c1;
        int d0 = pc0[0] - pc1[0];
        d0 *= d0;
        int d1 = pc0[1] - pc1[1];
        d0 += d1 * d1;
        int d2 = pc0[2] - pc1[2];
        d0 += d2 * d2;
        int d3 = pc0[3] - pc1[3];
        return d0 + d3 * d3;
    }
}
Conventional wisdom has it that when you are ORing bytes together to make an int, you should use the | operator rather than the + operator, otherwise you could have problems with the sign bit.
But this doesn't appear to be the case in C#. It looks like you can happily use the + operator, and it still works even for negative results.
My questions:
Is this really true?
If so, why does it work? (And why do a lot of people think it shouldn't - including me! ;)
Here's a test program which I believe tests every possible combination of four bytes using the + operator and the | operator, and verifies that both approaches yield the same result:
using System;
using System.Diagnostics;

namespace Demo
{
    class Program
    {
        int Convert1(byte b1, byte b2, byte b3, byte b4)
        {
            return b1 + (b2 << 8) + (b3 << 16) + (b4 << 24);
        }

        int Convert2(byte b1, byte b2, byte b3, byte b4)
        {
            return b1 | (b2 << 8) | (b3 << 16) | (b4 << 24);
        }

        void Run()
        {
            byte b = 0xff;
            Trace.Assert(Convert1(b, b, b, b) == -1); // Sanity check.
            Trace.Assert(Convert2(b, b, b, b) == -1);

            for (int i = 0; i < 256; ++i)
            {
                Console.WriteLine(i);
                byte b1 = (byte) i;
                for (int j = 0; j < 256; ++j)
                {
                    byte b2 = (byte) j;
                    for (int k = 0; k < 256; ++k)
                    {
                        byte b3 = (byte) k;
                        for (int l = 0; l < 256; ++l)
                        {
                            byte b4 = (byte) l;
                            Trace.Assert(Convert1(b1, b2, b3, b4) == Convert2(b1, b2, b3, b4));
                        }
                    }
                }
            }
            Console.WriteLine("Done.");
        }

        static void Main()
        {
            new Program().Run();
        }
    }
}
[EDIT]
To see how this works, consider this:
byte b = 0xff;
int i1 = b;
int i2 = (b << 8);
int i3 = (b << 16);
int i4 = (b << 24);
Console.WriteLine(i1);
Console.WriteLine(i2);
Console.WriteLine(i3);
Console.WriteLine(i4);
int total = i1 + i2 + i3 + i4;
Console.WriteLine(total);
This prints:
255
65280
16711680
-16777216
-1
Aha! Because the four values occupy disjoint bit positions, adding them can never produce a carry, so + builds exactly the same bit pattern as |; the negative numbers are just the sign bit doing its job.
Differences:
When bits overlap, | and + will produce different results:
2 | 3 = 3
2 + 3 = 5
When actually using signed bytes (sbyte in C#), the results will be different:
-2 | -3 = -1
-2 + (-3) = -5
I am looking to convert the following bit of C# code to Java. I'm having a hard time coming up with an equivalent.
Working C# Code:
private ushort ConvertBytes(byte a, byte b, bool flip)
{
    byte[] buffer = new byte[] { a, b };
    if (!flip)
    {
        return BitConverter.ToUInt16(buffer, 0);
    }
    ushort num = BitConverter.ToUInt16(buffer, 0);
    //this.Weight = num;
    int xy = 0x3720;
    int num2 = 0x3720 - num;
    if (num2 > -1)
    {
        return Convert.ToUInt16(num2);
    }
    return 1;
}
Here is my Java code, which does not work. The big challenge is BitConverter.ToUInt16(buffer, 0). How do I get the Java equivalent of the working C# method?
Java Code that is Wrong:
private short ConvertBytes(byte a, byte b, boolean flip){
    byte[] buffer = new byte[] { a, b };
    if (!flip){
        return (short) ((a << 8) | (b & 0xFF));
    }
    short num = (short) ((a << 8) | (b & 0xFF));
    //this.Weight = num;
    int num2 = 0x3720 - num;
    if (num2 > -1){
        return (short)num2;
    }
    return 1;
}
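Note that BitConverter.ToUInt16 reads buffer[0] as the low-order byte (little-endian), so the shifts in the attempt above are reversed. A ByteBuffer switched to LITTLE_ENDIAN reproduces the C# behaviour: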
// needs java.nio.ByteBuffer and java.nio.ByteOrder
private short ConvertBytes(byte a, byte b, boolean flip){
    ByteBuffer byteBuffer = ByteBuffer.allocate(2);
    byteBuffer.order(ByteOrder.LITTLE_ENDIAN);   // BitConverter.ToUInt16 is little-endian
    byteBuffer.put(a);
    byteBuffer.put(b);
    int num = byteBuffer.getShort(0) & 0xFFFF;   // mask to mimic the unsigned ushort
    if (!flip){
        return (short) num;
    }
    //this.Weight = num;
    int num2 = 0x3720 - num;
    if (num2 > -1){
        return (short)num2;
    }
    return 1;
}
Below is a Win32 console app procedure that demonstrates how various pointers depend on an array. Changing the values in the original array (model), for example by uncommenting the lines marked '// uncomment ...', changes the output. My question is: how do I get or mimic this behaviour in a C# managed-code environment (i.e. without using unsafe and pointers)?
#include "stdafx.h"
#include <iostream>
using namespace std;
int _tmain(int argc, _TCHAR* argv[])
{
    float model[100];
    for(int i = 0; i < 100; i++) { model[i] = i; }

    // uncomment these to alter the results
    //model[5] = 5000;
    //model[20] = 20000;
    //model[38] = 38000;

    static const int componentCount = 5;
    float* coefs = model;                   // coefs points to model[0]
    float* mean = coefs + componentCount;   // mean points to model[0 + componentCount] == model[5]
    float* cov = mean + 3*componentCount;   // cov points to model[0 + componentCount + 3*componentCount] == model[20]
    int ci = 2;
    float* c = cov + 9*ci;                  // c points to model[0 + componentCount + 3*componentCount + 9*ci] == model[38]

    int i = 0;
    cout << "model : " << model[i] << endl; // 0
    cout << "coefs : " << coefs[i] << endl; // 0
    cout << "mean  : " << mean[i] << endl;  // 5 (or 5000)
    cout << "cov   : " << cov[i] << endl;   // 20 (or 20000)
    cout << "ci    : " << ci << endl;       // 2
    cout << "c     : " << c[i] << endl;     // 38 (or 38000)
    cin.get();
}
You can do the same thing in C# without unsafe code:
struct ArrayPointer<T>
{
    private T[] array;
    private int offset;

    public ArrayPointer(T[] array) : this(array, 0)
    {
    }

    private ArrayPointer(T[] array, int offset)
    {
        Debug.Assert(array != null);
        Debug.Assert(offset >= 0);
        Debug.Assert(offset < array.Length);
        this.array = array;
        this.offset = offset;
    }

    public static ArrayPointer<T> operator+(ArrayPointer<T> p1, int p2)
    {
        return new ArrayPointer<T>(p1.array, p1.offset + p2);
    }
And so on. Define operators for addition, subtraction, increment, decrement, comparison, indexing, conversion from arrays, and so on. Then you can say:
int[] arr = whatever;
ArrayPointer<int> pointer = arr;
pointer+=2;
pointer--;
int x = pointer[3];
and so on.
This approach has a lot of nice properties. For example, you can do a debug assert if you ever compare p1 > p2 when p1 and p2 are pointers to the interiors of different arrays. That is almost always a bug in C, but a hard one to catch.
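For completeness, here is a sketch of the members the usage above relies on, plus the cross-array comparison assert just mentioned; the original answer leaves these as an exercise, so the exact shapes are my assumptions. All of them go inside the ArrayPointer<T> struct:

public T this[int index]
{
    get { return this.array[this.offset + index]; }
    set { this.array[this.offset + index] = value; }
}

public static implicit operator ArrayPointer<T>(T[] array)
{
    return new ArrayPointer<T>(array);   // "decay" an array to a pointer to its start
}

public static ArrayPointer<T> operator --(ArrayPointer<T> p)
{
    return new ArrayPointer<T>(p.array, p.offset - 1);
}

public static bool operator >(ArrayPointer<T> p1, ArrayPointer<T> p2)
{
    // Comparing pointers into two different arrays is almost always a bug:
    Debug.Assert(ReferenceEquals(p1.array, p2.array));
    return p1.offset > p2.offset;
}

public static bool operator <(ArrayPointer<T> p1, ArrayPointer<T> p2)
{
    Debug.Assert(ReferenceEquals(p1.array, p2.array));
    return p1.offset < p2.offset;
}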
You could write a class that represents an array with some offset, similar to the one below. Additionally, you might want it to implement ICollection<T> or at least IEnumerable<T>.
class ArrayWithOffset<T>
{
    T[] m_array;
    int m_offset;

    public ArrayWithOffset(T[] array, int offset)
    {
        m_array = array;
        m_offset = offset;
    }

    public T this[int i]
    {
        get { return m_array[m_offset + i]; }
        set { m_array[m_offset + i] = value; }
    }
}
Then, instead of a single pointer-to-array-element parameter, pass a pair of parameters: (array, offset).
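For example, a hypothetical snippet mirroring the C++ question's cov pointer, which viewed model starting at index 20:

var model = new float[100];
for (int i = 0; i < 100; i++) { model[i] = i; }

var cov = new ArrayWithOffset<float>(model, 20);
Console.WriteLine(cov[0]);   // 20

model[20] = 20000;           // changes to the underlying array...
Console.WriteLine(cov[0]);   // ...show through the view: 20000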