Multithreaded file compression - C#

I've just started to work with threads.
I want to write a simple file compressor. It should create two background threads - one for reading and one for writing. The first should read the file in small chunks and put them into a Queue<KeyValuePair<int, byte[]>>, where the int is the chunk id. The second thread should dequeue the chunks and write them in order (using the chunk id) to the output stream (a file which this thread creates at the start).
I did this, but I can't understand why, after my program ends and I open my gzipped file, I see that my chunks are mixed up and the file has lost its original order.
public static class Reader
{
    private static readonly object Locker = new object();
    private const int ChunkSize = 1024 * 1024;
    private static readonly int MaxThreads;
    private static readonly Queue<KeyValuePair<int, byte[]>> ChunksQueue;
    private static int _chunksComplete;

    static Reader()
    {
        MaxThreads = Environment.ProcessorCount;
        ChunksQueue = new Queue<KeyValuePair<int, byte[]>>(MaxThreads);
    }

    public static void Read(string filename)
    {
        _chunksComplete = 0;
        var tRead = new Thread(Reading) { IsBackground = true };
        var tWrite = new Thread(Writing) { IsBackground = true };
        tRead.Start(filename);
        tWrite.Start(filename);
        tRead.Join();
        tWrite.Join();
        Console.WriteLine("Finished");
    }

    private static void Writing(object threadContext)
    {
        var filename = (string) threadContext;
        using (var s = File.Create(filename + ".gz"))
        {
            while (true)
            {
                var dataPair = DequeueSafe();
                if (dataPair.Value == null)
                    return;
                while (dataPair.Key != _chunksComplete)
                {
                    Thread.Sleep(1);
                }
                Console.WriteLine("write chunk {0}", dataPair.Key);
                using (var gz = new GZipStream(s, CompressionMode.Compress, true))
                {
                    gz.Write(dataPair.Value, 0, dataPair.Value.Length);
                }
                _chunksComplete++;
            }
        }
    }

    private static void Reading(object threadContext)
    {
        var filename = (string) threadContext;
        using (var s = File.OpenRead(filename))
        {
            var counter = 0;
            var buffer = new byte[ChunkSize];
            while (s.Read(buffer, 0, buffer.Length) != 0)
            {
                while (ChunksQueue.Count == MaxThreads)
                {
                    Thread.Sleep(1);
                }
                Console.WriteLine("read chunk {0}", counter);
                var dataPair = new KeyValuePair<int, byte[]>(counter, buffer);
                EnqueueSafe(dataPair);
                counter++;
            }
            EnqueueSafe(new KeyValuePair<int, byte[]>(0, null));
        }
    }

    private static void EnqueueSafe(KeyValuePair<int, byte[]> dataPair)
    {
        lock (ChunksQueue)
        {
            ChunksQueue.Enqueue(dataPair);
        }
    }

    private static KeyValuePair<int, byte[]> DequeueSafe()
    {
        while (true)
        {
            lock (ChunksQueue)
            {
                if (ChunksQueue.Count > 0)
                {
                    return ChunksQueue.Dequeue();
                }
            }
            Thread.Sleep(1);
        }
    }
}
UPD:
I can use only .NET 3.5

Stream.Read() returns the actual number of bytes it read. Use it to limit the size of the chunk handed to the writer. And since there is concurrent reading and writing involved, you'll need more than one buffer.
Try 4096 as the chunk size.
Reader:
var buffer = new byte[ChunkSize];
int bytesRead = s.Read(buffer, 0, buffer.Length);
while (bytesRead != 0)
{
    ...
    var dataPair = new KeyValuePair<int, byte[]>(bytesRead, buffer);
    buffer = new byte[ChunkSize];
    bytesRead = s.Read(buffer, 0, buffer.Length);
}
Writer:
gz.Write(dataPair.Value, 0, dataPair.Key);
PS: The performance can be improved by adding a pool of free data buffers instead of allocating a new one each time, and by using events (e.g. ManualResetEvent) to signal "queue is empty" / "queue is full" instead of using Thread.Sleep().
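To illustrate that PS, here is a minimal sketch (my own, not part of the answer) of a bounded queue that signals with two ManualResetEvents instead of polling with Thread.Sleep(); the type name BoundedQueue is invented, and it compiles against .NET 3.5:
// Sketch: a bounded queue coordinated by events rather than sleep-polling.
public class BoundedQueue<T>
{
    private readonly Queue<T> _queue = new Queue<T>();
    private readonly int _capacity;
    private readonly ManualResetEvent _notFull = new ManualResetEvent(true);
    private readonly ManualResetEvent _notEmpty = new ManualResetEvent(false);

    public BoundedQueue(int capacity) { _capacity = capacity; }

    public void Enqueue(T item)
    {
        while (true)
        {
            _notFull.WaitOne(); // block while the queue is full
            lock (_queue)
            {
                if (_queue.Count < _capacity)
                {
                    _queue.Enqueue(item);
                    _notEmpty.Set(); // wake a waiting consumer
                    if (_queue.Count == _capacity) _notFull.Reset();
                    return;
                }
                // stale wakeup: another producer filled the queue first; wait again
            }
        }
    }

    public T Dequeue()
    {
        while (true)
        {
            _notEmpty.WaitOne(); // block while the queue is empty
            lock (_queue)
            {
                if (_queue.Count > 0)
                {
                    T item = _queue.Dequeue();
                    _notFull.Set(); // wake a waiting producer
                    if (_queue.Count == 0) _notEmpty.Reset();
                    return item;
                }
                // stale wakeup: another consumer drained the queue first; wait again
            }
        }
    }
}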

While alexm's answer brings up a very important point - that Stream.Read may fill the buffer with fewer bytes than you requested - the main problem you have is that you only have one byte[] that you keep using over and over again.
When your reading loop goes to read the 2nd chunk, it overwrites the byte[] that is sitting inside the dataPair you already passed to the queue. You must have a buffer = new byte[ChunkSize]; inside the loop to solve this problem. You must also record how many bytes were read and write only that number of bytes.
You don't need to keep the counter in the pair, as a Queue maintains the order; use the int in the pair to store the number of bytes read, as in alexm's example.
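Putting both answers together, a corrected reading loop might look like this (a sketch, not the poster's final code); note the writer must also drop its busy-wait on _chunksComplete, since Key now holds a byte count rather than a chunk id:
var buffer = new byte[ChunkSize];
int bytesRead = s.Read(buffer, 0, buffer.Length);
while (bytesRead != 0)
{
    // Key stores the number of valid bytes; the queue itself preserves chunk order
    EnqueueSafe(new KeyValuePair<int, byte[]>(bytesRead, buffer));
    buffer = new byte[ChunkSize]; // fresh buffer so the queued one is not overwritten
    bytesRead = s.Read(buffer, 0, buffer.Length);
}
EnqueueSafe(new KeyValuePair<int, byte[]>(0, null)); // end-of-stream marker

// Writer side: write only the bytes that were actually read
// gz.Write(dataPair.Value, 0, dataPair.Key);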

Related

Channels & Memory Management Strategies for Large Objects

I'm trying to determine how to best implement .Net Core 3 Channels and whether it's a good idea to pass very large objects between tasks. In my example, one task that is very fast can read in a 1GB chunk from a very large file. A number of consumer tasks can read a chunk from the channel and process them in parallel, as processing is much slower and needs parallel (multi-threaded) execution.
In testing my code, there is a massive amount of GC happening and total RAM used far exceeds the sum of all data waiting in one bounded channel and all executing tasks. I've simplified my code down to the most basic example hoping someone can give me some tips on how to better allocate/manage memory or if this approach is a good idea?
using System;
using System.IO;
using System.Threading.Channels;
using System.Threading.Tasks;

namespace MergeSort
{
    public class Example
    {
        private Channel<byte[]> _channelProcessing;

        public async Task DoSort(int queueDepth, int parallelTaskCount)
        {
            // Hard-code some values so we can talk about details
            queueDepth = 2;
            parallelTaskCount = 8;
            _channelProcessing = Channel.CreateBounded<byte[]>(queueDepth);
            Task[] processingTasks = new Task[parallelTaskCount];
            int outputBufferSize = 1024 * 1024;
            for (int x = 0; x < parallelTaskCount; x++)
            {
                string outputFile = $"C:\\Output.{x:00000000}.txt";
                processingTasks[x] = Task.Run(() => ProcessChunkAsync(outputBufferSize));
            }

            // Task puts unsorted chunks on the channel
            string inputFile = "C:\\Input.txt";
            int chunkSize = 1024 * 1024 * 1024; // 1GiB
            Task inputTask = Task.Run(() => ReadInputAsync(inputFile, chunkSize));

            // Wait for all tasks building chunk files to complete before continuing
            await inputTask;
            await Task.WhenAll(processingTasks);
        }

        private async Task ReadInputAsync(string inputFile, int chunkSize)
        {
            int bytesRead = 0;
            byte[] chunkBuffer = new byte[chunkSize];
            using (FileStream fileStream = File.Open(inputFile, FileMode.Open, FileAccess.Read, FileShare.Read))
            {
                // Read chunks until input EOF
                while (fileStream.Position != fileStream.Length)
                {
                    bytesRead = fileStream.Read(chunkBuffer, 0, chunkBuffer.Length);
                    // Fake code here to simulate the work I need to do, showing outBuffer.Length is calculated at runtime
                    Random rnd = new Random();
                    int runtimeCalculatedAmount = rnd.Next(100, 600);
                    byte[] tempBuffer = new byte[runtimeCalculatedAmount];
                    // Create the buffer with a slightly variable size that needs to be passed to the channel for the next task
                    byte[] outBuffer = new byte[1024 * 1024 * 1024 + runtimeCalculatedAmount];
                    Array.Copy(chunkBuffer, outBuffer, bytesRead);
                    Array.Copy(tempBuffer, 0, outBuffer, bytesRead, tempBuffer.Length);
                    await _channelProcessing.Writer.WriteAsync(outBuffer);
                    outBuffer = null;
                }
            }
            // Not sure if it's safe to .Complete() before consumers have read all data from channel?
            _channelProcessing.Writer.Complete();
        }

        private async Task ProcessChunkAsync(int outputBufferSize)
        {
            while (await _channelProcessing.Reader.WaitToReadAsync())
            {
                if (_channelProcessing.Reader.TryRead(out byte[] inBuffer))
                {
                    // myBigThing is also a very large object (result of processing inBuffer and slightly larger)
                    MyBigThing myBigThing = new MyBigThing(inBuffer);
                    inBuffer = null;

                    // Create file and write all rows
                    using (FileStream fileStream = File.Create("C:\\Output.txt", outputBufferSize, FileOptions.SequentialScan))
                    {
                        // Write myBigThing to output file
                        fileStream.Write(myBigThing.Data);
                    }
                    myBigThing = null;
                }
            }
        }
    }
}
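One way to cut this kind of GC pressure (a suggestion, not from the post) is to rent buffers from ArrayPool<byte>.Shared and return them in the consumer, so large arrays are reused instead of collected. Note the shared pool only pools arrays up to a certain size, so for 1 GiB chunks you would likely need a custom pool via ArrayPool<byte>.Create with a larger maxArrayLength. A sketch; PooledChannelExample and its method names are invented:
using System;
using System.Buffers;
using System.IO;
using System.Threading.Channels;
using System.Threading.Tasks;

// Sketch: pass (buffer, length) pairs through the channel and return buffers to the pool.
public static class PooledChannelExample
{
    public static async Task ProducerAsync(ChannelWriter<(byte[] Buffer, int Length)> writer, Stream input, int chunkSize)
    {
        int bytesRead;
        do
        {
            byte[] buffer = ArrayPool<byte>.Shared.Rent(chunkSize); // reused, not freshly allocated
            bytesRead = await input.ReadAsync(buffer, 0, chunkSize);
            if (bytesRead > 0)
                await writer.WriteAsync((buffer, bytesRead));
            else
                ArrayPool<byte>.Shared.Return(buffer); // nothing read; give it straight back
        } while (bytesRead > 0);
        // Safe: Complete() only prevents further writes; readers still drain buffered items.
        writer.Complete();
    }

    public static async Task ConsumerAsync(ChannelReader<(byte[] Buffer, int Length)> reader)
    {
        while (await reader.WaitToReadAsync())
        {
            while (reader.TryRead(out var item))
            {
                try
                {
                    // process item.Buffer[0 .. item.Length] here
                }
                finally
                {
                    ArrayPool<byte>.Shared.Return(item.Buffer); // hand the buffer back to the pool
                }
            }
        }
    }
}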

Bass.net DOWNLOADPROC only records 5 seconds

I'm trying to record an audio stream using Bass.Net, but using the documentation example I'm only able to record 5 seconds. How can I record for longer?
Following is my code:
class Program
{
    private static FileStream _fs = null;
    private static DOWNLOADPROC _myDownloadProc;
    private static byte[] _data;

    static void Main(string[] args)
    {
        Bass.BASS_Init(-1, 44100, BASSInit.BASS_DEVICE_DEFAULT, IntPtr.Zero);
        _myDownloadProc = new DOWNLOADPROC(MyDownload);
        int stream = Bass.BASS_StreamCreateURL("http://m2.fabricahost.com.br:8704/;stream.mp3", 0,
            BASSFlag.BASS_STREAM_BLOCK | BASSFlag.BASS_SAMPLE_MONO | BASSFlag.BASS_STREAM_STATUS, _myDownloadProc, IntPtr.Zero);
    }

    private static void MyDownload(IntPtr buffer, int length, IntPtr user)
    {
        if (_fs == null)
        {
            // create the file
            _fs = File.OpenWrite("output.mp3");
        }
        if (buffer == IntPtr.Zero)
        {
            // finished downloading
            _fs.Flush();
            _fs.Close();
        }
        else
        {
            // increase the data buffer as needed
            if (_data == null || _data.Length < length)
                _data = new byte[length];
            // copy from unmanaged to managed memory
            Marshal.Copy(buffer, _data, 0, length);
            // write to file
            _fs.Write(_data, 0, length);
        }
    }
}
Thanks
I found out why: Bass.Net's default network buffer size is 5000 ms (5 seconds). I just changed the size using
Bass.BASS_SetConfig(BASSConfig.BASS_CONFIG_NET_BUFFER, 10000); to record as much as I wanted.
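For reference, BASS_CONFIG_NET_BUFFER applies to streams created after the call, so it has to run before BASS_StreamCreateURL; roughly (url stands in for the stream address from the question):
Bass.BASS_Init(-1, 44100, BASSInit.BASS_DEVICE_DEFAULT, IntPtr.Zero);
// Enlarge the network buffer (in milliseconds) before creating the stream;
// 10000 is the value used above - pick whatever length you need.
Bass.BASS_SetConfig(BASSConfig.BASS_CONFIG_NET_BUFFER, 10000);
int stream = Bass.BASS_StreamCreateURL(url, 0,
    BASSFlag.BASS_STREAM_BLOCK | BASSFlag.BASS_SAMPLE_MONO | BASSFlag.BASS_STREAM_STATUS,
    _myDownloadProc, IntPtr.Zero);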

What is different with the writing in FileStream?

When I searched for methods to decompress a file using SharpZipLib, I found a lot of methods like this:
public static void TarWriteCharacters(string tarfile, string targetDir)
{
    using (TarInputStream s = new TarInputStream(File.OpenRead(tarfile)))
    {
        //some codes here
        using (FileStream fileWrite = File.Create(targetDir + directoryName + fileName))
        {
            int size = 2048;
            byte[] data = new byte[2048];
            while (true)
            {
                size = s.Read(data, 0, data.Length);
                if (size > 0)
                {
                    fileWrite.Write(data, 0, size);
                }
                else
                {
                    break;
                }
            }
            fileWrite.Close();
        }
    }
}
The signature of FileStream.Write is:
FileStream.Write(byte[] array, int offset, int count)
Now I'm trying to separate the read and write parts, because I want to use a thread to speed up the decompression rate in the write function; I use a List<byte[]> and a List<int> to store the file's data and chunk sizes, like below.
Read:
public static void TarWriteCharacters(string tarfile, string targetDir)
{
    using (TarInputStream s = new TarInputStream(File.OpenRead(tarfile)))
    {
        //some codes here
        using (FileStream fileWrite = File.Create(targetDir + directoryName + fileName))
        {
            int size = 2048;
            List<int> SizeList = new List<int>();
            List<byte[]> mydatalist = new List<byte[]>();
            while (true)
            {
                byte[] data = new byte[2048];
                size = s.Read(data, 0, data.Length);
                if (size > 0)
                {
                    mydatalist.Add(data);
                    SizeList.Add(size);
                }
                else
                {
                    break;
                }
            }
            test = new Thread(() =>
                FileWriteFun(pathToTar, args, SizeList, mydatalist)
            );
            test.Start();
            streamWriter.Close();
        }
    }
}
Write:
public static void FileWriteFun(string pathToTar, string[] args, List<int> SizeList, List<byte[]> mydataList)
{
    //some codes here
    using (FileStream fileWrite = File.Create(targetDir + directoryName + fileName))
    {
        for (int i = 0; i < mydataList.Count; i++)
        {
            fileWrite.Write(mydataList[i], 0, SizeList[i]);
        }
        fileWrite.Close();
    }
}
Edit
(1) Moved byte[] data = new byte[2048] into the while loop so data is assigned to a new array each iteration.
(2) Changed int[] SizeList = new int[2048] to List<int> SizeList = new List<int>() because a fixed int[2048] limits the number of entries.
As a read on a stream is only guaranteed to return at least one byte (typically it will be more, but you can't rely on the full requested length each time), your original solution could theoretically fail after 2048 reads, as an int[2048] SizeList can only hold 2048 entries.
You could use a List to hold the sizes.
Or use a MemoryStream instead of inventing your own.
But the two main problems are:
1) You keep reading into the same byte array, overwriting previously read data. When you add your data byte array to mydatalist, you must assign data to a new byte array.
2) You close your stream before the second thread is done writing.
In general, threading is difficult and should only be used where you know it will improve performance. Simply reading and writing data is typically IO-bound, not CPU-bound, so introducing a second thread will just give a small performance penalty and no gain in speed. You could use multithreading to ensure concurrent read/write operations, but most likely the disk cache will do this for you if you stick to the first solution - and if not, async is easier than multithreading for achieving this.
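For what it's worth, a minimal sketch of the two fixes applied to the question's two-thread version (fresh buffer per read, and waiting for the writer thread before anything is closed):
List<int> sizeList = new List<int>();
List<byte[]> dataList = new List<byte[]>();
while (true)
{
    byte[] data = new byte[2048]; // fresh array each iteration, so stored chunks aren't overwritten
    int size = s.Read(data, 0, data.Length);
    if (size <= 0)
        break;
    dataList.Add(data);
    sizeList.Add(size);
}
Thread writer = new Thread(() => FileWriteFun(pathToTar, args, sizeList, dataList));
writer.Start();
writer.Join(); // don't close any streams (or exit) until the writer thread is done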

C# Socket ReceiveAsync

I am used to synchronous sockets and had a few headaches getting to the point where I am now, especially with Socket.Receive(..) not always receiving all bytes.
Here is the code I used to use:
public byte[] Receive(int size)
{
    var buffer = new byte[size];
    var r = 0;
    do
    {
        // ReSharper disable once InconsistentlySynchronizedField
        var c = _clientSocket.Receive(buffer, r, size - r, SocketFlags.None);
        if (c == 0)
        {
            throw new SocketExtendedException();
        }
        r += c;
    } while (r != buffer.Length);
    return buffer;
}
Now I have started to use sockets on Windows Phone, BUT .Receive(..) is not available. I managed to get Socket.ReceiveAsync(..) working, but I am concerned (no problems have happened so far). Here is my new code; I have not implemented a check that all bytes have been received, nor do I know if I have to with the following code:
private byte[] ReadBySize(int size = 4)
{
    var readEvent = new AutoResetEvent(false);
    var buffer = new byte[size];
    var recieveArgs = new SocketAsyncEventArgs()
    {
        UserToken = readEvent
    };
    recieveArgs.SetBuffer(buffer, 0, size);
    recieveArgs.Completed += recieveArgs_Completed;
    _connecter.ReceiveAsync(recieveArgs);
    readEvent.WaitOne();
    if (recieveArgs.BytesTransferred == 0)
    {
        if (recieveArgs.SocketError != SocketError.Success)
            throw new SocketException((int)recieveArgs.SocketError);
        throw new CommunicationException();
    }
    return buffer;
}

void recieveArgs_Completed(object sender, SocketAsyncEventArgs e)
{
    var are = (AutoResetEvent)e.UserToken;
    are.Set();
}
This is my first use of ReceiveAsync; can someone point out anything I might have done wrong or need to change?
OK, I went and took a large buffer and sent it in batches with a sleep interval in between, to replicate 'not all bytes received'. So my code above doesn't receive all bytes. For those who also use ReceiveAsync(..), here is my code that works:
private byte[] ReadBySize(int size = 4)
{
    var readEvent = new AutoResetEvent(false);
    var buffer = new byte[size]; // receive buffer
    var totalRecieved = 0;
    do
    {
        var recieveArgs = new SocketAsyncEventArgs()
        {
            UserToken = readEvent
        };
        // receive into the buffer at the offset of bytes already received
        recieveArgs.SetBuffer(buffer, totalRecieved, size - totalRecieved);
        recieveArgs.Completed += recieveArgs_Completed;
        _connecter.ReceiveAsync(recieveArgs);
        readEvent.WaitOne(); // wait for the receive to complete
        if (recieveArgs.BytesTransferred == 0) // if no bytes are received then there is an error
        {
            if (recieveArgs.SocketError != SocketError.Success)
                throw new ReadException(ReadExceptionCode.UnexpectedDisconnect, "Unexpected Disconnect");
            throw new ReadException(ReadExceptionCode.DisconnectGracefully);
        }
        totalRecieved += recieveArgs.BytesTransferred;
    } while (totalRecieved != size); // check whether all bytes have been received
    return buffer;
}

void recieveArgs_Completed(object sender, SocketAsyncEventArgs e)
{
    var are = (AutoResetEvent)e.UserToken;
    are.Set();
}
The way I work with my socket applications is to send a buffer that consists of a few fields:
[0] -> 0, 1, or 2: 0 is keep-alive, 1 means there is data, 2 means some type of error occurred
[1,2,3,4] -> size of the actual buffer I am sending
[x (the size given by bytes 1-4)] -> the actual 'serialized' data buffer
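With that framing, a receive loop built on the ReadBySize method above might look like this (my own sketch of the idea, not the poster's code):
// Read one framed message: [0] status byte, [1..4] payload length, then the payload.
private byte[] ReadMessage()
{
    byte[] header = ReadBySize(5);              // 1 status byte + 4 length bytes
    byte status = header[0];                    // 0 = keep-alive, 1 = data, 2 = error
    int payloadSize = BitConverter.ToInt32(header, 1);
    if (status != 1 || payloadSize <= 0)
        return null;                            // keep-alive or error frame: no payload to read
    return ReadBySize(payloadSize);             // the 'serialized' data buffer
}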
You could create a socket extension like:
public static Task<int> ReceiveAsync(this Socket socket,
    byte[] buffer, int offset, int size, SocketFlags socketFlags)
{
    if (socket == null) throw new ArgumentNullException(nameof(socket));
    var tcs = new TaskCompletionSource<int>();
    socket.BeginReceive(buffer, offset, size, socketFlags, ar =>
    {
        try { tcs.TrySetResult(socket.EndReceive(ar)); }
        catch (Exception e) { tcs.TrySetException(e); }
    }, state: null);
    return tcs.Task;
}
And then a method to read the size you want like this:
public static async Task<byte[]> ReadFixed(Socket socket, int bufferSize)
{
    byte[] ret = new byte[bufferSize];
    for (int read = 0; read < bufferSize; )
    {
        int received = await socket.ReceiveAsync(ret, read, ret.Length - read, SocketFlags.None);
        if (received == 0) throw new SocketException((int)SocketError.ConnectionReset); // peer closed before all bytes arrived
        read += received;
    }
    return ret;
}

Replace sequence of bytes in binary file

What is the best method to replace a sequence of bytes in a binary file with other bytes of the same length? The binary files will be pretty large, about 50 MB, and should not be loaded into memory at once.
Update: I do not know the location of the bytes which need to be replaced; I need to find them first.
Assuming you're trying to replace a known section of the file.
Open a FileStream with read/write access
Seek to the right place
Overwrite existing data
Sample code coming...
public static void ReplaceData(string filename, int position, byte[] data)
{
    using (Stream stream = File.Open(filename, FileMode.Open))
    {
        stream.Position = position;
        stream.Write(data, 0, data.Length);
    }
}
If you're effectively trying to do a binary version of string.Replace (e.g. "always replace bytes { 51, 20, 34 } with { 20, 35, 15 }") then it's rather harder. As a quick description of what you'd do:
Allocate a buffer of at least the size of data you're interested in
Repeatedly read into the buffer, scanning for the data
If you find a match, seek back to the right place (e.g. stream.Position -= buffer.Length - indexWithinBuffer;) and overwrite the data
Sounds simple so far... but the tricky bit is if the data starts near the end of the buffer. You need to remember all potential matches and how far you've matched so far, so that if you get a match when you read the next buffer's-worth, you can detect it.
There are probably ways of avoiding this trickiness, but I wouldn't like to try to come up with them offhand :)
EDIT: Okay, I've got an idea which might help...
Keep a buffer which is at least twice as big as you need
Repeatedly:
Copy the second half of the buffer into the first half
Fill the second half of the buffer from the file
Search throughout the whole buffer for the data you're looking for
That way at some point, if the data is present, it will be completely within the buffer.
You'd need to be careful about where the stream was in order to get back to the right place, but I think this should work. It would be trickier if you were trying to find all matches, but at least the first match should be reasonably simple...
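A rough sketch of that double-buffer scan (my own interpretation of the idea, returning only the first match offset):
using System;
using System.IO;

public static class PatternSearch
{
    // The window is twice the pattern length, and the last pattern.Length bytes
    // are always kept when sliding, so a match can never straddle two reads.
    // Returns the stream offset of the first match, or -1 if not found.
    public static long FindPattern(Stream stream, byte[] pattern)
    {
        int half = pattern.Length;
        byte[] buffer = new byte[half * 2];
        long basePosition = 0; // stream offset corresponding to buffer[0]
        int filled = 0;        // valid bytes currently held in the buffer

        while (true)
        {
            int read = stream.Read(buffer, filled, buffer.Length - filled);
            int valid = filled + read;

            // scan every position where a full pattern fits
            for (int i = 0; i + half <= valid; i++)
            {
                bool match = true;
                for (int j = 0; j < half; j++)
                    if (buffer[i + j] != pattern[j]) { match = false; break; }
                if (match) return basePosition + i;
            }

            if (read == 0) return -1; // EOF, no match found

            if (valid >= half)
            {
                // slide: keep the last 'half' bytes at the front of the buffer
                Array.Copy(buffer, valid - half, buffer, 0, half);
                basePosition += valid - half;
                filled = half;
            }
            else
            {
                filled = valid; // stream shorter than the pattern so far; keep reading
            }
        }
    }
}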
My solution:
/// <summary>
/// Copy data from a file to another, replacing the search term, ignoring case.
/// </summary>
/// <param name="originalFile"></param>
/// <param name="outputFile"></param>
/// <param name="searchTerm"></param>
/// <param name="replaceTerm"></param>
private static void ReplaceTextInBinaryFile(string originalFile, string outputFile, string searchTerm, string replaceTerm)
{
    byte b;
    // UpperCase bytes to search
    byte[] searchBytes = Encoding.UTF8.GetBytes(searchTerm.ToUpper());
    // LowerCase bytes to search
    byte[] searchBytesLower = Encoding.UTF8.GetBytes(searchTerm.ToLower());
    // Temporary bytes saved during the match loop
    byte[] bytesToAdd = new byte[searchBytes.Length];
    // Search length
    int searchBytesLength = searchBytes.Length;
    // First upper char
    byte searchByte0 = searchBytes[0];
    // First lower char
    byte searchByte0Lower = searchBytesLower[0];
    // Replacement bytes
    byte[] replaceBytes = Encoding.UTF8.GetBytes(replaceTerm);
    int counter = 0;
    using (FileStream inputStream = File.OpenRead(originalFile))
    {
        // input length
        long srcLength = inputStream.Length;
        using (BinaryReader inputReader = new BinaryReader(inputStream))
        using (FileStream outputStream = File.OpenWrite(outputFile))
        using (BinaryWriter outputWriter = new BinaryWriter(outputStream))
        {
            for (int nSrc = 0; nSrc < srcLength; ++nSrc)
            {
                // first byte
                if ((b = inputReader.ReadByte()) == searchByte0
                    || b == searchByte0Lower)
                {
                    bytesToAdd[0] = b;
                    int nSearch = 1;
                    // next bytes
                    for (; nSearch < searchBytesLength; ++nSearch)
                    {
                        // get byte, save it and test
                        if ((b = bytesToAdd[nSearch] = inputReader.ReadByte()) != searchBytes[nSearch]
                            && b != searchBytesLower[nSearch])
                        {
                            break; // fail
                        }
                        // Avoid overflow. No need, in my case, because there is no chance to see searchTerm at the end.
                        //else if (nSrc + nSearch >= srcLength)
                        //    break;
                    }
                    if (nSearch == searchBytesLength)
                    {
                        // success
                        ++counter;
                        outputWriter.Write(replaceBytes);
                        nSrc += nSearch - 1;
                    }
                    else
                    {
                        // failed, add saved bytes
                        outputWriter.Write(bytesToAdd, 0, nSearch + 1);
                        nSrc += nSearch;
                    }
                }
                else
                {
                    outputWriter.Write(b);
                }
            }
        }
    }
    Console.WriteLine("ReplaceTextInBinaryFile.counter = " + counter);
}
You can use my BinaryUtility to search and replace one or more bytes without loading the entire file into memory like this:
var searchAndReplace = new List<Tuple<byte[], byte[]>>()
{
    Tuple.Create(
        BitConverter.GetBytes((UInt32)0xDEADBEEF),
        BitConverter.GetBytes((UInt32)0x01234567)),
    Tuple.Create(
        BitConverter.GetBytes((UInt32)0xAABBCCDD),
        BitConverter.GetBytes((UInt16)0xAFFE)),
};
using (var reader =
    new BinaryReader(new FileStream(@"C:\temp\data.bin", FileMode.Open)))
{
    using (var writer =
        new BinaryWriter(new FileStream(@"C:\temp\result.bin", FileMode.Create)))
    {
        BinaryUtility.Replace(reader, writer, searchAndReplace);
    }
}
BinaryUtility code:
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;

public static class BinaryUtility
{
    public static IEnumerable<byte> GetByteStream(BinaryReader reader)
    {
        const int bufferSize = 1024;
        byte[] buffer;
        do
        {
            buffer = reader.ReadBytes(bufferSize);
            foreach (var d in buffer) { yield return d; }
        } while (bufferSize == buffer.Length);
    }

    public static void Replace(BinaryReader reader, BinaryWriter writer, IEnumerable<Tuple<byte[], byte[]>> searchAndReplace)
    {
        foreach (byte d in Replace(GetByteStream(reader), searchAndReplace)) { writer.Write(d); }
    }

    public static IEnumerable<byte> Replace(IEnumerable<byte> source, IEnumerable<Tuple<byte[], byte[]>> searchAndReplace)
    {
        foreach (var s in searchAndReplace)
        {
            source = Replace(source, s.Item1, s.Item2);
        }
        return source;
    }

    public static IEnumerable<byte> Replace(IEnumerable<byte> input, IEnumerable<byte> from, IEnumerable<byte> to)
    {
        var fromEnumerator = from.GetEnumerator();
        fromEnumerator.MoveNext();
        int match = 0;
        foreach (var data in input)
        {
            if (data == fromEnumerator.Current)
            {
                match++;
                if (fromEnumerator.MoveNext()) { continue; }
                foreach (byte d in to) { yield return d; }
                match = 0;
                fromEnumerator.Reset();
                fromEnumerator.MoveNext();
                continue;
            }
            if (0 != match)
            {
                foreach (byte d in from.Take(match)) { yield return d; }
                match = 0;
                fromEnumerator.Reset();
                fromEnumerator.MoveNext();
            }
            yield return data;
        }
        if (0 != match)
        {
            foreach (byte d in from.Take(match)) { yield return d; }
        }
    }
}
public static void BinaryReplace(string sourceFile, byte[] sourceSeq, string targetFile, byte[] targetSeq)
{
    FileStream sourceStream = File.OpenRead(sourceFile);
    FileStream targetStream = File.Create(targetFile);
    try
    {
        int b;
        long foundSeqOffset = -1;
        int searchByteCursor = 0;
        while ((b = sourceStream.ReadByte()) != -1)
        {
            if (sourceSeq[searchByteCursor] == b)
            {
                if (searchByteCursor == sourceSeq.Length - 1)
                {
                    targetStream.Write(targetSeq, 0, targetSeq.Length);
                    searchByteCursor = 0;
                    foundSeqOffset = -1;
                }
                else
                {
                    if (searchByteCursor == 0)
                    {
                        foundSeqOffset = sourceStream.Position - 1;
                    }
                    ++searchByteCursor;
                }
            }
            else
            {
                if (searchByteCursor == 0)
                {
                    targetStream.WriteByte((byte) b);
                }
                else
                {
                    targetStream.WriteByte(sourceSeq[0]);
                    sourceStream.Position = foundSeqOffset + 1;
                    searchByteCursor = 0;
                    foundSeqOffset = -1;
                }
            }
        }
    }
    finally
    {
        sourceStream.Dispose();
        targetStream.Dispose();
    }
}
