Related
I've implemented a method for parsing an unsigned integer string of length <= 8 using SIMD intrinsics available in .NET as follows:
// Parses a base-10 unsigned integer from a string of length <= 8 using SSE intrinsics.
// Assumes every character is an ASCII digit; no validation is performed.
// NOTE(review): always loads 16 bytes from the start of the string data — for strings
// shorter than 8 chars this reads past the characters; confirm this stays within the
// string object's allocation.
public unsafe static uint ParseUint(string text)
{
fixed (char* c = text)
{
// Load 8 UTF-16 code units (16 bytes).
var parsed = Sse3.LoadDquVector128((byte*) c);
// Shift the digit chars toward the high lanes so the number is right-aligned
// under the fixed place-weight pattern below; the zero-filled low lanes act as
// leading zeros. Shift count is in bytes (2 per UTF-16 char). A non-constant
// count prevents the JIT from emitting pslldq directly (see discussion below).
var shift = (8 - text.Length) * 2;
var shifted = Sse2.ShiftLeftLogical128BitLane(parsed,
(byte) (shift));
Vector128<byte> digit0 = Vector128.Create((byte) '0');
// ASCII -> digit values 0..9; saturating subtract clamps the shifted-in zeros to 0.
var reduced = Sse2.SubtractSaturate(shifted, digit0);
// First reduction (pmaddwd): collapse adjacent digit pairs into two-digit values (0..99).
var shortMult = Vector128.Create(10, 1, 10, 1, 10, 1, 10, 1);
var collapsed2 = Sse2.MultiplyAddAdjacent(reduced.As<byte, short>(), shortMult);
// Narrow the 32-bit pair sums back to 16-bit lanes (value duplicated in both halves).
var repack = Sse41.PackUnsignedSaturate(collapsed2, collapsed2);
// Second reduction: combine pairs of two-digit values into four-digit halves;
// after this, int lanes 2 and 3 hold the high and low four-digit halves.
var intMult = Vector128.Create((short)0, 0, 0, 0, 100, 1, 100, 1);
var collapsed3 = Sse2.MultiplyAddAdjacent(repack.As<ushort,short>(), intMult);
// Final combine on the scalar side: high half * 10000 + low half.
var e1 = collapsed3.GetElement(2);
var e2 = collapsed3.GetElement(3);
return (uint) (e1 * 10000 + e2);
}
}
Sadly, a comparison with a baseline uint.Parse() gives the following, rather unimpressive, result:
Method
Mean
Error
StdDev
Baseline
15.157 ns
0.0325 ns
0.0304 ns
ParseSimd
3.269 ns
0.0115 ns
0.0102 ns
What are some of the ways the above code can be improved? My particular areas of concern are:
The way a bit shift of the SIMD register happens with a calculation involving text.Length
~~The unpacking of UTF-16 data using a MultiplyAddAdjacent involving a vector of 0s and 1~~
The way elements are extracted using GetElement() -- maybe there's some ToScalar() call that can happen somewhere?
I've made some optimizations.
// Optimized variant of ParseUint: works on ushort lanes directly (no byte repack)
// and performs the final horizontal reduction with two vphaddd instructions.
// Assumes text.Length <= 8 and ASCII digits only; no validation.
public unsafe uint ParseUint2(string text)
{
fixed (char* c = text)
{
// Load 8 UTF-16 code units.
Vector128<ushort> raw = Sse3.LoadDquVector128((ushort*)c);
// '-' binds tighter than '<<' in C#, so this is (8 - text.Length) << 1 — the byte
// shift that right-aligns the digits under the place weights below. The variable
// count makes the JIT fall back to a helper call instead of pslldq (imm8 issue).
raw = Sse2.ShiftLeftLogical128BitLane(raw, (byte)(8 - text.Length << 1));
// ASCII -> digit values; saturation clamps the zero-filled leading lanes.
Vector128<ushort> digit0 = Vector128.Create('0');
raw = Sse2.SubtractSaturate(raw, digit0);
// pmaddwd: collapse digit pairs into four two-digit values (0..99).
Vector128<short> mul0 = Vector128.Create(10, 1, 10, 1, 10, 1, 10, 1);
Vector128<int> res = Sse2.MultiplyAddAdjacent(raw.AsInt16(), mul0);
// Weight each two-digit group by its decimal place...
Vector128<int> mul1 = Vector128.Create(1000000, 10000, 100, 1);
res = Sse41.MultiplyLow(res, mul1);
// ...then sum all four lanes horizontally (vphaddd twice).
res = Ssse3.HorizontalAdd(res, res);
res = Ssse3.HorizontalAdd(res, res);
return (uint)res.GetElement(0);
}
}
Reduced the number of type conversions, and the final calculations are made with vphaddd. As a result it's ~10% faster.
But...imm8 must be a compile-time constant. It means you can't use a variable where imm8 is argument. Otherwise JIT compiler won't produce the intrinsic instruction for the operation. It will make an external method call at this place (maybe some workaround is there). Thanks #PeterCordes for help.
This monster is slightly — though not significantly — faster than the one above, regardless of text.Length.
// Same algorithm as ParseUint2, but the variable byte-shift is replaced by a switch
// over text.Length so every ShiftLeftLogical128BitLane call has a compile-time
// constant count — the JIT can then emit a real pslldq instead of a fallback call.
public unsafe uint ParseUint3(string text)
{
fixed (char* c = text)
{
Vector128<ushort> raw = Sse3.LoadDquVector128((ushort*)c);
// Shift count in bytes = (8 - length) * 2; digits end up right-aligned under
// the place-weight pattern, with zero-filled lanes acting as leading zeros.
switch (text.Length)
{
case 0: raw = Vector128<ushort>.Zero; break;
case 1: raw = Sse2.ShiftLeftLogical128BitLane(raw, 14); break;
case 2: raw = Sse2.ShiftLeftLogical128BitLane(raw, 12); break;
case 3: raw = Sse2.ShiftLeftLogical128BitLane(raw, 10); break;
case 4: raw = Sse2.ShiftLeftLogical128BitLane(raw, 8); break;
case 5: raw = Sse2.ShiftLeftLogical128BitLane(raw, 6); break;
case 6: raw = Sse2.ShiftLeftLogical128BitLane(raw, 4); break;
case 7: raw = Sse2.ShiftLeftLogical128BitLane(raw, 2); break;
};
// ASCII -> digit values (saturating; zeroed lanes stay 0).
Vector128<ushort> digit0 = Vector128.Create('0');
raw = Sse2.SubtractSaturate(raw, digit0);
// Collapse digit pairs (pmaddwd), weight by decimal place, then reduce.
Vector128<short> mul0 = Vector128.Create(10, 1, 10, 1, 10, 1, 10, 1);
Vector128<int> res = Sse2.MultiplyAddAdjacent(raw.AsInt16(), mul0);
Vector128<int> mul1 = Vector128.Create(1000000, 10000, 100, 1);
res = Sse41.MultiplyLow(res, mul1);
res = Ssse3.HorizontalAdd(res, res);
res = Ssse3.HorizontalAdd(res, res);
return (uint)res.GetElement(0);
}
}
Again, #PeterCordes doesn't allow me to write a slow code. The following version got 2 improvements. Now string loaded already shifted, and then subtracted to the shifted mask by the same offset. This avoids the slow fallback for ShiftLeftLogical128BitLane with a variable count.
The second improvement is replacing vphaddd with pshufd + paddd.
// Note that this loads up to 14 bytes before the data part of the string. (Or 16 for an empty string)
// This might or might not make it possible to read from an unmapped page and fault, beware.
// Avoids any register shift: the load ends exactly at the end of the string, so the
// digits are already right-aligned; leading trash is cleared by subtracting a slice
// of a sliding mask. See the warning above about reading before the string data.
public unsafe uint ParseUint4(string text)
{
// 8 chars of U+FFFF followed by 8 '0' chars: the slice picked below lines up
// 0xFFFF with the trash lanes (saturates them to 0) and '0' with the real digits
// (strips the ASCII bias) in a single SubtractSaturate.
const string mask = "\xffff\xffff\xffff\xffff\xffff\xffff\xffff\xffff00000000";
fixed (char* c = text, m = mask)
{
// Load the 8 code units ending at the string's end (digits in the high lanes).
Vector128<ushort> raw = Sse3.LoadDquVector128((ushort*)c - 8 + text.Length);
// Slide the mask by the same offset so its 0xFFFF lanes cover the trash lanes.
Vector128<ushort> mask0 = Sse3.LoadDquVector128((ushort*)m + text.Length);
raw = Sse2.SubtractSaturate(raw, mask0);
// Collapse digit pairs, weight by decimal place.
Vector128<short> mul0 = Vector128.Create(10, 1, 10, 1, 10, 1, 10, 1);
Vector128<int> res = Sse2.MultiplyAddAdjacent(raw.AsInt16(), mul0);
Vector128<int> mul1 = Vector128.Create(1000000, 10000, 100, 1);
res = Sse41.MultiplyLow(res, mul1);
// Horizontal sum via pshufd + paddd (cheaper than two vphaddd).
Vector128<int> shuf = Sse2.Shuffle(res, 0x1b); // 0 1 2 3 => 3 2 1 0
res = Sse2.Add(shuf, res);
shuf = Sse2.Shuffle(res, 0x41); // 0 1 2 3 => 1 0 3 2
res = Sse2.Add(shuf, res);
return (uint)res.GetElement(0);
}
}
~Twice as fast as the initial solution. (o_O) At least on my Haswell i7.
C# (thanks #aepot)
// aepot's version: packs UTF-16 down to ASCII bytes first, shifts with psllq using a
// count held in an XMM register (no imm8 restriction, so a runtime count is fine),
// and reduces with pmaddubsw + pmaddwd — avoiding the slow pmulld entirely.
public unsafe uint ParseUint(string text)
{
fixed (char* c = text)
{
// Per-byte and per-word place weights for the maddubs/madd reduction
// (same constants as the C version below).
Vector128<byte> mul1 = Vector128.Create(0x14C814C8, 0x010A0A64, 0, 0).AsByte();
Vector128<short> mul2 = Vector128.Create(0x00FA61A8, 0x0001000A, 0, 0).AsInt16();
// Bit shift count = (8 - length) * 8, placed in an XMM register for psllq.
// ('-' binds tighter than '<<', so this is (8 - text.Length) << 3.)
Vector128<long> shift_amount = Sse2.ConvertScalarToVector128Int32(8 - text.Length << 3).AsInt64();
Vector128<short> vs = Sse2.LoadVector128((short*)c);
// packuswb: UTF-16LE digits -> ASCII bytes (duplicated in both qword halves).
Vector128<byte> vb = Sse2.PackUnsignedSaturate(vs, vs);
// ASCII -> digit values (saturating).
vb = Sse2.SubtractSaturate(vb, Vector128.Create((byte)'0'));
// psllq: shift non-digit trash out of each 64-bit lane.
vb = Sse2.ShiftLeftLogical(vb.AsInt64(), shift_amount).AsByte();
// pmaddubsw then pmaddwd: combine digits with the place weights above.
Vector128<int> v = Sse2.MultiplyAddAdjacent(Ssse3.MultiplyAddAdjacent(mul1, vb.AsSByte()), mul2);
// Final combine: v[0]*2 + v[1]; the doubling lets the weight constants stay
// within the signed ranges required by pmaddubsw/pmaddwd.
v = Sse2.Add(Sse2.Add(v, v), Sse2.Shuffle(v, 1));
return (uint)v.GetElement(0);
}
}
C solution using SSSE3:
#include <uchar.h> // char16_t
#include <tmmintrin.h> // pmaddubsw
// C equivalent (SSSE3): parses up to 8 UTF-16LE ASCII digits; the caller must
// guarantee len <= 8 and digits only. Reads a full 16 bytes from ptr.
unsigned ParseUint(char16_t* ptr, size_t len) {
const __m128i mul1 = _mm_set_epi32(0, 0, 0x010A0A64, 0x14C814C8); // per-byte place weights
const __m128i mul2 = _mm_set_epi32(0, 0, 0x0001000A, 0x00FA61A8); // per-word place weights
const __m128i shift_amount = _mm_cvtsi32_si128((8 - len) * 8);    // bit count for psllq
__m128i v = _mm_loadu_si128((__m128i*)ptr); // unsafe chunking
v = _mm_packus_epi16(v,v); // convert digits from UTF16-LE to ASCII
v = _mm_subs_epu8(v, _mm_set1_epi8('0'));
v = _mm_sll_epi64(v, shift_amount); // shift off non-digit trash
// convert
v = _mm_madd_epi16(_mm_maddubs_epi16(mul1, v), mul2);
// final combine: lane0*2 + lane1 (doubling keeps the weight constants signed-safe)
v = _mm_add_epi32(_mm_add_epi32(v,v), _mm_shuffle_epi32(v, 1));
return (unsigned)_mm_cvtsi128_si32(v);
}
Regardless of how one shifts/aligns the string (see aepot's answer), we want to stay away from pmulld. SSE basically has 16-bit integer multiplication and the 32-bit multiply has double the latency and uops. However, care must be taken around the sign-extension behavior of pmaddubsw and pmaddwd.
using scalar x64:
// untested && I don't know C#
// SWAR scalar version: packs the 8 UTF-16 digit chars into one 64-bit register and
// converts with the classic two-multiply "parse 8 digits" trick — no vector
// multiplies at all. Assumes text.Length <= 8 and ASCII digits only.
// untested && I don't know C#
public unsafe static uint ParseUint(string text)
{
fixed (char* c = text)
{
var xmm = Sse2.LoadVector128((ushort*)c); // unsafe chunking
var packed = Sse2.PackSignedSaturate(xmm,xmm); // convert digits from UTF16-LE to ASCII
ulong val = Sse2.X64.ConvertToUInt64(packed); // extract to scalar
val -= 0x3030303030303030; // subtract '0' from each digit
val <<= ((8 - text.Length) * 8); // shift off non-digit trash
// convert
const ulong mask = 0x000000FF000000FF;
const ulong mul1 = 0x000F424000000064; // 100 + (1000000ULL << 32)
const ulong mul2 = 0x0000271000000001; // 1 + (10000ULL << 32)
// combine adjacent digits: each byte pair now holds a two-digit value (0..99)
val = (val * 10) + (val >> 8);
// two multiplies apply all remaining place weights at once; the final
// result accumulates in the top 32 bits, hence the >> 32
val = (((val & mask) * mul1) + (((val >> 16) & mask) * mul2)) >> 32;
return (uint)val;
}
}
What if we don't know the length of the number ahead of time?
// C pseudocode & assumes ascii text
// Length-agnostic front end: detect the run of ASCII digits with SWAR tricks,
// left-justify them, and hand off to a fixed 8-digit converter (parse_8_chars).
uint64_t v, m, len;
v = unaligned_load_little_endian_u64(p);
m = v + 0x4646464646464646; // roll '9' to 0x7F
v -= 0x3030303030303030; // unpacked binary coded decimal
m = (m | v) & 0x8080808080808080; // detect first non-digit
m = _tzcnt_u64(m >> 7); // count run of digits; m = 8 * (number of digit bytes)
if (((uint8_t)v) > 9) return error_not_a_number; // first char must be a digit
v <<= 64 - m; // shift off any "trailing" chars that are not digits
p += m >> 3; // consume bytes
v = parse_8_chars(v);
Or if we have a list of strings to process:
// assumes ascii text
// Batch variant (AVX2): gathers one 64-bit chunk per string and parses four
// strings in parallel. Each input may hold up to 8 ASCII digits; results land
// in the even 32-bit lanes of the returned vector (odd lanes zeroed).
__m256i parse_uint_x4(void* base_addr, __m256i offsets_64x4)
{
const __m256i x00 = _mm256_setzero_si256();
const __m256i x0A = _mm256_set1_epi8(0x0A);
const __m256i x30 = _mm256_set1_epi8(0x30);
// x08 = 0x08 in every byte, derived from x30 to reuse a loaded constant
const __m256i x08 = _mm256_and_si256(_mm256_srli_epi32(x30, 1), x0A);
const __m256i mul1 = _mm256_set1_epi64x(0x010A0A6414C814C8);
const __m256i mul2 = _mm256_set1_epi64x(0x0001000A00FA61A8);
__m256i v, m;
// process 4 strings at once, up to 8 digits in each string...
// (the 64-bit chunks could be manually loaded using 3 shuffles)
v = _mm256_i64gather_epi64((long long*)base_addr, offsets_64x4, 1);
// rebase digits from 0x30..0x39 to 0x00..0x09
v = _mm256_xor_si256(v, x30);
// range check
// (unsigned gte compare)
v = _mm256_min_epu8(v, x0A);
m = _mm256_cmpeq_epi8(x0A, v);
// mask of lowest non-digit and above
m = _mm256_or_si256(m, _mm256_sub_epi64(x00, m));
// align the end of the digit-string to the top of the u64 lane
// (shift off masked bytes and insert leading zeros)
m = _mm256_sad_epu8(_mm256_and_si256(m, x08), x00); // psadbw: 8 * count of masked bytes
v = _mm256_sllv_epi64(v, m);
// convert to binary
// (the `add(v,v)` allow us to keep `mul2` unsigned)
v = _mm256_madd_epi16(_mm256_maddubs_epi16(mul1, v), mul2);
v = _mm256_add_epi32(_mm256_shuffle_epi32(v, 0x31), _mm256_add_epi32(v,v));
// zero the hi-dwords of each qword
v = _mm256_blend_epi32(v, x00, 0xAA);
return v;
}
First of all, 5x improvement is not “rather unimpressive”.
I would not do the last step with scalar code, here’s an alternative:
// SIMD replacement for the scalar tail of the original ParseUint: combine the two
// four-digit halves (int lanes 2 and 3 of collapsed3) with pmuludq + paddd instead
// of two GetElement() calls and scalar arithmetic.
// _mm_shuffle_epi32( x, _MM_SHUFFLE( 3, 3, 2, 2 ) )
collapsed3 = Sse2.Shuffle( collapsed3, 0xFA );
// _mm_mul_epu32 — weight the high half by 10000, the low half by 1
var collapsed4 = Sse2.Multiply( collapsed3.As<int, uint>(), Vector128.Create( 10000u, 0, 1, 0 ) ).As<ulong, uint>();
// _mm_add_epi32( x, _mm_srli_si128( x, 8 ) ) — sum the two qword halves
collapsed4 = Sse2.Add( collapsed4, Sse2.ShiftRightLogical128BitLane( collapsed4, 8 ) );
return collapsed4.GetElement( 0 );
The C++ version gonna be way faster than what happens on my PC (.NET Core 3.1). The generated code is not good. They initialize constants like this:
00007FFAD10B11B6 xor ecx,ecx
00007FFAD10B11B8 mov dword ptr [rsp+20h],ecx
00007FFAD10B11BC mov dword ptr [rsp+28h],64h
00007FFAD10B11C4 mov dword ptr [rsp+30h],1
00007FFAD10B11CC mov dword ptr [rsp+38h],64h
00007FFAD10B11D4 mov dword ptr [rsp+40h],1
They use stack memory instead of another vector register. It looks like JIT developers forgot there’re 16 vector registers there, the complete function only uses xmm0.
00007FFAD10B1230 vmovapd xmmword ptr [rbp-0C0h],xmm0
00007FFAD10B1238 vmovapd xmm0,xmmword ptr [rbp-0C0h]
00007FFAD10B1240 vpsrldq xmm0,xmm0,8
00007FFAD10B1245 vpaddd xmm0,xmm0,xmmword ptr [rbp-0C0h]
I just got confused about how to convert an array of 4 signed bytes to a float number.
I just know for an array of unsigned bytes bts, probably I can use this function
BitConverter.ToSingle(bts, 0);
However, it looks like BitConverter.ToSingle only accepts byte array instead of sbyte array.
Could somebody give me some ideas please?
Thanks!
Maybe this:
// The original loop did not compile ('|' and '<<' are not defined for float) and
// would not have produced an IEEE-754 value in any case. Instead, pack the four
// signed bytes little-endian into an Int32 bit pattern and reinterpret it as float.
int bits = 0;
for (int i = 0; i < sbytesArr.Length; i++)
{
    // '& 0xFF' undoes the sign extension of sbyte -> int before shifting into place.
    bits |= (sbytesArr[i] & 0xFF) << (i * 8);
}
float num = BitConverter.ToSingle(BitConverter.GetBytes(bits), 0);
// Fixed: 'Float' is not a C# type and BitConverter has no 'Tofloat' method — the
// original snippet did not compile. A literal like 5000.1234 is a double, so
// round-trip it with GetBytes / ToDouble.
double value = 5000.1234;
//
// Invoke BitConverter.GetBytes to convert the double to bytes.
//
byte[] array = BitConverter.GetBytes(value);
foreach (byte element in array)
{
    Console.WriteLine(element);
}
//
// You can convert the bytes back to a double.
//
double result = BitConverter.ToDouble(array, 0);
Console.WriteLine(result);
Assuming that your signed bytes are in an array named sbts you can first of all convert to an unsigned byte array, and then use BitConverter.ToSingle().
byte[] bts = new byte[sbts.Length];
Buffer.BlockCopy(sbts, 0, bts, 0, sbts.Length);
float f = BitConverter.ToSingle(bts, 0);
It is a little known fact that byte and sbyte are interchangeable at the CLR level:
sbyte[] a = new sbyte[1];
byte[] b = (byte[])(object)a;
This code actually works at runtime. So can pass in the array that you have.
BitConverter.ToSingle((byte[])(object)bts, 0);
Call the GetFloatValue method, passing an array of four sbytes as the parameter.
// Decodes four signed bytes into a float: bytes 0..2 form a little-endian 24-bit
// mantissa, byte 3 is a base-10 exponent.
public float GetFloatValue(sbyte[] data)
{
    return bytesToFloat(data[0], data[1], data[2], data[3]);
}

// NOTE(review): the mantissa is treated as unsigned here; if the source format uses
// a signed 24-bit mantissa (e.g. IEEE-11073 FLOAT), negative mantissas will decode
// incorrectly — confirm against the data producer.
private static float bytesToFloat(sbyte b0, sbyte b1, sbyte b2, sbyte b3)
{
    // The (byte) casts strip sign extension; the shifted fields are disjoint,
    // so OR-ing them is equivalent to the original addition.
    int mantissa = (byte)b0 | ((byte)b1 << 8) | ((byte)b2 << 16);
    return (float)(mantissa * Math.Pow(10, b3));
}
I want to know how to set a specific bit in a 16 byte array (128 bits).
For example ... if I wanted to set the 9th bit in the the array I would expect:
{00, 80, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00}
If I wanted to set the 125th bit ...
{00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 08}
I have looked into using bit shifting but got confused on how to bit shift with an array consisting of 128 bits. Is there a way to break down an array this size and evaluate in smaller chunks of bytes? Any help would be appreciated.
The process of selecting a specific bit consists of two steps:
Picking the byte, and then
Picking the bit.
Picking the byte is simple: all you need to do is dividing the bit index by the number of bits in a byte - i.e. dividing by eight:
int byteIndex = bitIndex / 8;
Now that you know what byte to use, calculate which bit you want to access. For that you need to compute a remainder of a division by eight, like this:
int bitInByteIndex = bitIndex % 8;
With these two indexes in hand, it is easy to access the bit: use 1 << bitInByteIndex as a mask, like this:
byte mask = (byte)(1 << bitInByteIndex);
bool isSet = (bytes[byteIndex] & mask) != 0;
// set to 1
bytes[byteIndex] |= mask;
// Set to zero
bytes[byteIndex] &= ~mask;
// Toggle
bytes[byteIndex] ^= mask;
You could use a BitArray:
// Wrap the byte[] in a BitArray, set the bit, then copy the bits back out.
// BitArray indexes bits LSB-first within each byte, so bit 8 is the lowest
// bit of bytearray[1].
byte[] bytearray = new byte[16];
var bitArray = new BitArray(bytearray);
bitArray.Set(8, true);
bitArray.CopyTo(bytearray, 0);
Directly from a Byte Array, your quickest solution is probably using a BitArray.
- http://msdn.microsoft.com/en-us/library/system.collections.bitarray.aspx
E.g. This would go something like:
// Demo: mutate individual bits of a two-byte buffer through a BitArray.
// BitArray indexes bits LSB-first within each byte (bit 0 = lowest bit of byte 0).
public void BitManimulation()
{
    // Two demo bytes; substitute your 16-byte buffer here.
    byte[] source = { 250, 250 };

    // View the bytes as addressable bits.
    BitArray bits = new BitArray(source);
    bits[3] = !bits[3]; // flip
    bits[4] = false;    // clear
    bits[5] = true;     // set

    // Materialize the modified bits back into bytes and print them.
    byte[] result = new byte[2];
    bits.CopyTo(result, 0);
    foreach (byte b in result)
        Console.WriteLine(b);
}
Now this comes with a performance penalty.
So as an alternative you have the BitVector32 that will require you to convert blocks of 4 bytes to an integer...
- http://msdn.microsoft.com/en-us/library/system.collections.specialized.bitvector32.aspx
Finally you can do bit shifting, XOR's etc to produce the desired results. Since #dasblinkenlight already left a nice answer for that ill just leave you with that for explaining things.^
Here is some quick helper methods based on his post though:
/// <summary>
/// Extension helpers treating a byte[] as a flat bit array (LSB-first within each byte).
/// </summary>
public static class ByteArrayExt
{
    // Maps a flat bit index to its byte offset and a single-bit mask.
    // Validates the index: with the original unchecked arithmetic a negative
    // index silently touched the wrong bit, because C#'s '%' preserves the
    // operand's sign and '<<' masks its count to the low 5 bits.
    private static byte Locate(byte[] self, int index, out int byteIndex)
    {
        if (self == null)
            throw new ArgumentNullException(nameof(self));
        if (index < 0 || index >= self.Length * 8)
            throw new ArgumentOutOfRangeException(nameof(index));
        byteIndex = index / 8;
        return (byte)(1 << (index % 8));
    }

    /// <summary>Sets (true) or clears (false) the bit at <paramref name="index"/>; returns the array for chaining.</summary>
    public static byte[] SetBit(this byte[] self, int index, bool value)
    {
        byte mask = Locate(self, index, out int byteIndex);
        self[byteIndex] = (byte)(value ? (self[byteIndex] | mask) : (self[byteIndex] & ~mask));
        return self;
    }

    /// <summary>Flips the bit at <paramref name="index"/>; returns the array for chaining.</summary>
    public static byte[] ToggleBit(this byte[] self, int index)
    {
        byte mask = Locate(self, index, out int byteIndex);
        self[byteIndex] ^= mask;
        return self;
    }

    /// <summary>Returns true when the bit at <paramref name="index"/> is set.</summary>
    public static bool GetBit(this byte[] self, int index)
    {
        byte mask = Locate(self, index, out int byteIndex);
        return (self[byteIndex] & mask) != 0;
    }
}
First you need to find out which byte you're working with. Let's say you have:
00000000 00000000 00000000 00000000 ...
And you want to turn on the 10th bit, so it needs to become like this:
00000000 01000000 00000000 00000000 ...
So first do a division by 8 (rounded down) to locate the byte number (in this case byte number one, or the second byte). Once you've done that, you can use bitwise operators to set the bit you want, i.e.
array[1] |= 0x40
We're doing a bitwise OR operation between the old value of that byte and 0x40 (which is 01000000). If the old value was 00101101, then array[1] = (00101101 OR 01000000), which is 01101101.
Naturally, in this case I've been using literals so you'll have to change that depending on which bit is set (e.g. if you're setting the bit before the last, you want to use 0x02 instead, etc.
Dividing the bit number by 8 gives you the byte. Taking it modulo 8 give you the bit number within the byte.
You can also use lookUp, like this:
// Table of single-bit masks indexed by bit position within a byte (LSB-first).
byte[] lookUp = { 1, 2, 4, 8, 16, 32, 64, 128 };
I'm trying to convert 3 bytes to signed integer (Big-endian) in C#.
I've tried to use the BitConverter.ToInt32 method, but my problem is what value the last byte should have.
Can anybody suggest me how can I do it in different way?
I also need to convert 5 (or 6 or 7) bytes to signed long, is there any general rule how to do it?
Thanks in advance for any help.
As a last resort you could always shift+add yourself:
byte b1, b2, b3;
int r = b1 << 16 | b2 << 8 | b3;
Just swap b1/b2/b3 until you have the desired result.
On second thought, this will never produce negative values.
What result do you want when the msb >= 0x80 ?
Part 2, brute force sign extension:
// Assembles a signed 24-bit big-endian integer from three bytes (b1 = most
// significant), sign-extending bit 23 into the full Int32.
private static int Bytes2Int(byte b1, byte b2, byte b3)
{
    // Pack the three bytes, then use an arithmetic right shift to propagate
    // the 24-bit sign bit through the top byte.
    int packed = (b1 << 16) | (b2 << 8) | b3;
    return (packed << 8) >> 8;
}
I've tested this with:
byte[] bytes = BitConverter.GetBytes(p);
int r = Bytes2Int(bytes[2], bytes[1], bytes[0]);
Console.WriteLine("{0} == {1}", p, r);
for several p.
The last value should be 0 if it isn't set for a positive number, 256 for a negative.
To know what you should pass in, you can try converting it the other way:
var bytes = BitConverter.GetBytes(i);
int x = BitConverter.ToInt32(bytes, 0);
To add to the existing answers here, there's a bit of a gotcha in that BitConverter.ToInt32() will throw an ArgumentException if the array is less than sizeof(int) (4) bytes in size;
Destination array is not long enough to copy all the items in the collection. Check array index and length.
Given an array less than sizeof(int) (4) bytes in size, you can compensate for left/right padding like so;
Right-pad
Results in positive Int32 numbers
int intByteSize = sizeof(int);
byte[] padded = new byte[intByteSize];
Array.Copy(sourceBytes, 0, padded, 0, sourceBytes.Length);
sourceBytes = padded;
Left-pad
Results in negative Int32 numbers, assuming non-zero value at byte index sourceBytes.Length - 1.
int intByteSize = sizeof(int);
byte[] padded = new byte[intByteSize];
Array.Copy(sourceBytes, 0, padded, intByteSize - sourceBytes.Length, sourceBytes.Length);
sourceBytes = padded;
Once padded, you can safely call int myValue = BitConverter.ToInt32(sourceBytes, 0);.
I'm getting a hex string that needs to be converted to a signed 8-bit integer. Currently I'm converting using Int16/Int32, which will obviously not give me a negative value for an 8-bit integer. If I get the value 255 in Hex, how do I convert that to -1 in decimal? I assume I want to use an sbyte, but I'm not sure how to get that value in there properly.
You can use Convert.ToSByte
For example:
string x = "aa";
sbyte v = Convert.ToSByte(x, 16);
// result: v = 0xAA or -86
You can also use sbyte.Parse
For example:
string y = "bb";
sbyte w = sbyte.Parse(y, System.Globalization.NumberStyles.HexNumber);
// result: w = 0xBB or -69
To answer your question about the upper or lower byte of an Int16:
string signed_short = "feff";
// Truncate 16 bit value down to 8 bit
sbyte b1 = (sbyte)Convert.ToInt16(signed_short, 16);
sbyte b2 = (sbyte)short.Parse(signed_short, System.Globalization.NumberStyles.HexNumber);
// result: b1 = 0xFF or -1
// result: b2 = 0xFF or -1
// Use upper 8 bit of 16 bit
sbyte b3 = (sbyte)(Convert.ToInt16(signed_short, 16) >> 8);
sbyte b4 = (sbyte)(short.Parse(signed_short, System.Globalization.NumberStyles.HexNumber) >> 8);
// result: b3 = 0xFE or -2
// result: b4 = 0xFE or -2
You need to perform an unchecked cast, like this:
sbyte negativeOne = unchecked((sbyte)255);
My solution was to take the first 8 bits of the 16-bit integer and store them in an sbyte.