I am trying to convert a zip file into a text file (XML) using the following methods. It works fine for smaller files but does not seem to work for files larger than 50 MB.
class Program
{
public static void Main(string[] args)
{
try
{
string importFilePath = @"D:\CorpTax\Tasks\966442\CS Publish error\CSUPD20180604L.zip";
int maxLengthInMb = 20;
byte[] payLoad = File.ReadAllBytes(importFilePath);
int payLoadInMb = (payLoad.Length / 1024) / 1024;
bool splitIntoMultipleFiles = (payLoadInMb / maxLengthInMb) > 1;
int payLoadLength = splitIntoMultipleFiles ? maxLengthInMb * 1024 * 1024 : payLoad.Length;
if (splitIntoMultipleFiles)
{
foreach (byte[] splitPayLoad in payLoad.Slices(payLoadLength))
{
ToXml(splitPayLoad);
}
}
}
catch (Exception ex)
{
throw new Exception(ex.Message);
}
}
public static string ToXml(byte[] payLoad)
{
using (XmlStringWriter xmlStringWriter = new XmlStringWriter())
{
xmlStringWriter.WriteStartDocument();
xmlStringWriter.Writer.WriteStartElement("Payload");
xmlStringWriter.Writer.WriteRaw(Convert.ToBase64String(payLoad));
xmlStringWriter.Writer.WriteEndElement();
xmlStringWriter.WriteEndDocument();
return xmlStringWriter.ToString();
}
}
}
I have a .zip file that is about 120 MB in size, and I get a
System.OutOfMemoryException when calling Convert.ToBase64String().
So I went ahead and split the byte array into 20 MB chunks, hoping that it would not fail. But it only works for the first 3 passes through the loop, i.e. it is able to convert 60 MB of the data, and on the 4th iteration I get the same exception. Sometimes I also get exceptions at the line return xmlStringWriter.ToString().
To split the byte[] I have used the following extension class:
public static class ArrayExtensions
{
public static T[] CopySlice<T>(this T[] source, int index, int length, bool padToLength = false)
{
int n = length;
T[] slice = null;
if (source.Length < index + length)
{
n = source.Length - index;
if (padToLength)
{
slice = new T[length];
}
}
if (slice == null) slice = new T[n];
Array.Copy(source, index, slice, 0, n);
return slice;
}
public static IEnumerable<T[]> Slices<T>(this T[] source, int count, bool padToLength = false)
{
for (var i = 0; i < source.Length; i += count)
{
yield return source.CopySlice(i, count, padToLength);
}
}
}
I got the above code from the following link
Splitting a byte[] into multiple byte[] arrays in C#
The funny part is that the program runs fine when I run it in a console application, but when I put this code into the Windows application it throws the System.OutOfMemoryException.
Preferably you want to be doing something like this:
byte[] Packet = new byte[4096]; // read the file in small chunks instead of all at once
string b64str = "";
using (FileStream fs = new FileStream(file, FileMode.Open))
{
    int i = Packet.Length;
    while (i == Packet.Length) // a short read means the end of the file was reached
    {
        i = fs.Read(Packet, 0, Packet.Length);
        b64str = Convert.ToBase64String(Packet, 0, i);
        // Note: if you concatenate the per-chunk Base64 strings, the chunk size
        // must be a multiple of 3, otherwise the combined string will not decode.
    }
}
With that b64str you should create your XML data.
Also, it is typically unwise to allocate 20 MB in one go; arrays that large land on the large object heap, which is prone to fragmentation and is a common source of OutOfMemoryException in 32-bit processes.
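If the end goal is just a <Payload> element, you can also skip building any large Base64 string at all: XmlWriter.WriteBase64 encodes incrementally from a small buffer straight into the XML output. Below is a minimal sketch of that idea (it writes to a file rather than the poster's XmlStringWriter, and the paths are placeholders):

using System.IO;
using System.Xml;

// Stream the zip into a Base64-encoded XML element chunk by chunk,
// never holding the whole file or its Base64 string in memory.
using (FileStream input = File.OpenRead(@"D:\input\archive.zip"))
using (XmlWriter xml = XmlWriter.Create(@"D:\output\payload.xml"))
{
    xml.WriteStartDocument();
    xml.WriteStartElement("Payload");

    byte[] buffer = new byte[81920]; // stays below the large object heap threshold
    int read;
    while ((read = input.Read(buffer, 0, buffer.Length)) > 0)
    {
        xml.WriteBase64(buffer, 0, read); // XmlWriter carries Base64 state across calls
    }

    xml.WriteEndElement();
    xml.WriteEndDocument();
}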
Related
I am trying to write a massive object to AWS S3 (e.g. 25 GB).
Currently I can get it working in two ways:
Write the content to a file on local disk, then send the file to S3 using multi-part upload
Write the content to a MemoryStream, then send that stream to S3 using multi-part upload
However, I don't like either approach, because I need to reserve a large amount of disk space or memory for the operation. I am generating this content in code, so I was hoping to open a stream to an S3 object, and generate the content directly to that object. But I can't see how to make that work.
Is it possible to build a massive object in S3 without representing the entire object in a local file or memory first?
(Note: My question is very similar to this question, but that question doesn't have a useful answer.)
I was able to get it working by breaking the overall payload into chunks, and sending each individual chunk as a separate MemoryStream.
Technically this solution still uses a MemoryStream, but that's OK, since I can control how much memory is used by adjusting the chunk size. For my test, I created a 25GB file while keeping memory usage well below that (~2 GB IIRC).
Here is my solution:
private const string BucketName = "YOUR-BUCKET-NAME-HERE";
private static readonly RegionEndpoint BucketRegion = RegionEndpoint.USEast1;
private const string Key = "massive-file-test";
// We're going to send 100 chunks of 256 MB each, for a total of 25 GB.
// The content will be the asterisk ("*") repeated for the desired size.
private const int ChunkSizeMb = 256;
private const int TotalSizeGb = 25;
public static void Main(string[] args)
{
Console.WriteLine($"Writing object to {BucketName}, {Key}");
int totalChunks = TotalSizeGb * 1024 / ChunkSizeMb;
int chunkSizeBytes = ChunkSizeMb * 1024 * 1024;
string payload = new String('*', chunkSizeBytes);
// Initiate the request.
InitiateMultipartUploadRequest initiateRequest = new InitiateMultipartUploadRequest
{
BucketName = BucketName,
Key = Key
};
List<UploadPartResponse> uploadResponses = new List<UploadPartResponse>();
IAmazonS3 s3Client = new AmazonS3Client(BucketRegion);
InitiateMultipartUploadResponse initResponse = s3Client.InitiateMultipartUpload(initiateRequest);
// Open a stream to build the input.
for (int i = 0; i < totalChunks; i++)
{
// Write the next chunk to the input stream.
Console.WriteLine($"Writing chunk {i} of {totalChunks}");
using (var stream = ToStream(payload))
{
// Write the next chunk to s3.
UploadPartRequest uploadRequest = new UploadPartRequest
{
BucketName = BucketName,
Key = Key,
UploadId = initResponse.UploadId,
PartNumber = i + 1,
PartSize = chunkSizeBytes,
InputStream = stream,
};
uploadResponses.Add(s3Client.UploadPart(uploadRequest));
}
}
// Complete the request.
CompleteMultipartUploadRequest completeRequest = new CompleteMultipartUploadRequest
{
BucketName = BucketName,
Key = Key,
UploadId = initResponse.UploadId
};
completeRequest.AddPartETags(uploadResponses);
s3Client.CompleteMultipartUpload(completeRequest);
Console.WriteLine("Script is complete. Press any key to exit...");
Console.ReadKey();
}
private static Stream ToStream(string s)
{
var stream = new MemoryStream();
var writer = new StreamWriter(stream);
writer.Write(s);
writer.Flush();
stream.Position = 0;
return stream;
}
Here is what AnonCoward started, finished off by adding seeking. Seeking is a trivial operation for a stream that does nothing except write asterisks to its buffer: if you were generating more complex data it would be harder work, but here all you need to do is set the position and report it, because no matter where you seek to in the stream, the behavior of producing asterisks is always the same.
class AsteriskGeneratingStream : Stream
{
long _pos = 0;
long _length = 0;
public AsteriskGeneratingStream(long length)
{
_length = length;
}
public override long Length => _length;
public override int Read(byte[] buffer, int offset, int count)
{
// Create the data as needed
if (count + _pos > _length)
count = (int)(_length - _pos);
for (int i = 0; i < count; i++)
buffer[offset + i] = (byte)'*';
_pos += count;
return count;
}
public override bool CanRead => true;
public override long Seek(long offset, SeekOrigin origin)
{
if(origin == SeekOrigin.Begin) //lets just trust that the caller will be sensible and not set e.g. negative offset
_pos = offset;
else if(origin == SeekOrigin.Current)
_pos += offset;
else if(origin == SeekOrigin.End)
_pos = _length + offset;
return _pos;
}
public override bool CanSeek => true;
public override bool CanWrite => false;
public override long Position { get => _pos; set => _pos = value; }
public override void Flush() { }
public override void SetLength(long value) { _length = value; }
public override void Write(byte[] buffer, int offset, int count) { throw new NotImplementedException(); }
}
class Program
{
static void Main(string[] args)
{
long objectSize = 25L * 1024 * 1024;
var s3 = new AmazonS3Client(Amazon.RegionEndpoint.USWest1);
var xfer = new TransferUtility(s3,new TransferUtilityConfig
{
MinSizeBeforePartUpload = 5L * 1024 * 1024
});
var helper = new AsteriskGeneratingStream(objectSize);
xfer.Upload(helper, "bucket-name", "object-key");
}
}
Note, I can't guarantee it'll work right off the bat because I'm on a cellphone and can't test this via c# fiddle but let's see how it blows up! 😀
If you can create the object on the fly, or at least cache fairly small segments, you can create a stream that serves the data up to S3. Note that unless you can also create any part of the object out of order, you need to prevent the AWS SDK from using a multi-part upload, which will slow down the transfer speed.
class DataStream : Stream
{
long _pos = 0;
long _length = 0;
public DataStream(long length)
{
_length = length;
}
public override long Length => _length;
public override int Read(byte[] buffer, int offset, int count)
{
// Create the data as needed, on demand
// For this example, just cycle through 0 to 256 in the data over and over again
if (count + _pos > _length)
{
count = (int)(_length - _pos);
}
for (int i = 0; i < count; i++)
{
buffer[i + offset] = (byte)((_pos + i) % 256);
}
_pos += count;
return count;
}
public override bool CanRead => true;
// Stub out all other methods. For a seekable stream,
// Seek() and Position need to be implemented, along with CanSeek changed
public override long Seek(long offset, SeekOrigin origin) { throw new NotImplementedException(); }
public override bool CanSeek => false;
public override bool CanWrite => false;
public override long Position { get => _pos; set => throw new NotImplementedException(); }
public override void Flush() { throw new NotImplementedException(); }
public override void SetLength(long value) { throw new NotImplementedException(); }
public override void Write(byte[] buffer, int offset, int count) { throw new NotImplementedException(); }
}
class Program
{
static void Main(string[] args)
{
long objectSize = 25L * 1024 * 1024;
var s3 = new AmazonS3Client(Amazon.RegionEndpoint.USWest1);
// Prevent a multi-part upload, which requires a seekable stream
var xfer = new TransferUtility(s3, new TransferUtilityConfig
{
MinSizeBeforePartUpload = objectSize + 1
});
var helper = new DataStream(objectSize);
xfer.Upload(helper, "bucket-name", "object-key");
}
}
I'm trying to parse a binary file as fast as possible. So this is what I first tried to do:
using (FileStream filestream = path.OpenRead())
{
    using (var d = new GZipStream(filestream, CompressionMode.Decompress))
    {
        using (MemoryStream m = new MemoryStream())
        {
            d.CopyTo(m);
            m.Position = 0;
            using (BinaryReaderBigEndian b = new BinaryReaderBigEndian(m))
            {
                while (b.BaseStream.Position != b.BaseStream.Length)
                {
                    UInt32 value = b.ReadUInt32();
                }
            }
        }
    }
}
Where the BinaryReaderBigEndian class is implemented as follows:
public class BinaryReaderBigEndian : BinaryReader
{
    public BinaryReaderBigEndian(Stream stream) : base(stream) { }

    public override UInt32 ReadUInt32()
    {
        var x = base.ReadBytes(4);
        Array.Reverse(x);
        return BitConverter.ToUInt32(x, 0);
    }
}
Then, I tried to get a performance improvement using ReadOnlySpan instead of MemoryStream. So, I tried doing:
using (FileStream filestream = path.OpenRead())
{
    using (var d = new GZipStream(filestream, CompressionMode.Decompress))
    {
        using (MemoryStream m = new MemoryStream())
        {
            d.CopyTo(m);
            int position = 0;
            ReadOnlySpan<byte> stream = new ReadOnlySpan<byte>(m.ToArray());
            while (position != stream.Length)
            {
                UInt32 value = stream.ReadUInt32(position);
                position += 4;
            }
        }
    }
}
Where the BinaryReaderBigEndian class changed to:
public static class BinaryReaderBigEndian
{
    public static UInt32 ReadUInt32(this ReadOnlySpan<byte> stream, int start)
    {
        var x = stream.Slice(start, 4).ToArray();
        Array.Reverse(x);
        return BitConverter.ToUInt32(x, 0);
    }
}
But, unfortunately, I didn't notice any improvement. So, where am I going wrong?
I did some measurements of your code on my computer (Intel Q9400, 8 GiB RAM, SSD disk, Win10 x64 Home, .NET Framework 4.7.2, tested with a 15 MB (when unpacked) file) with these results:
No-Span version: 520 ms
Span version: 720 ms
So the Span version is actually slower! Why? Because new ReadOnlySpan<byte>(m.ToArray()) performs an additional copy of the whole file, and ReadUInt32() also performs many slicings of the Span (slicing is cheap, but not free). Since you performed more work, you can't expect performance to be any better just because you used Span.
So can we do better? Yes. It turns out that the slowest part of your code is actually the garbage collection caused by repeatedly allocating the 4-byte arrays created by the .ToArray() calls in the ReadUInt32() method. You can avoid it by implementing ReadUInt32() yourself. It's pretty easy and also eliminates the need for Span slicing. You can also replace new ReadOnlySpan<byte>(m.ToArray()) with new ReadOnlySpan<byte>(m.GetBuffer()).Slice(0, (int)m.Length), which performs a cheap slice instead of copying the whole file. So now the code looks like this:
public static void Read(FileInfo path)
{
using (FileStream filestream = path.OpenRead())
{
using (var d = new GZipStream(filestream, CompressionMode.Decompress))
{
using (MemoryStream m = new MemoryStream())
{
d.CopyTo(m);
int position = 0;
ReadOnlySpan<byte> stream = new ReadOnlySpan<byte>(m.GetBuffer()).Slice(0, (int)m.Length);
while (position != stream.Length)
{
UInt32 value = stream.ReadUInt32(position);
position += 4;
}
}
}
}
}
public static class BinaryReaderBigEndian
{
public static UInt32 ReadUInt32(this ReadOnlySpan<byte> stream, int start)
{
UInt32 res = 0;
for (int i = 0; i < 4; i++)
{
res = (res << 8) | (((UInt32)stream[start + i]) & 0xff);
}
return res;
}
}
With these changes I get from 720 ms down to 165 ms (4x faster). Sounds great, doesn't it? But we can do even better: we can avoid the MemoryStream copy completely, and inline and further optimize ReadUInt32():
public static void Read(FileInfo path)
{
using (FileStream filestream = path.OpenRead())
{
using (var d = new GZipStream(filestream, CompressionMode.Decompress))
{
var buffer = new byte[64 * 1024];
do
{
int bufferDataLength = FillBuffer(d, buffer);
if (bufferDataLength % 4 != 0)
throw new Exception("Stream length not divisible by 4");
if (bufferDataLength == 0)
break;
for (int i = 0; i < bufferDataLength; i += 4)
{
uint value = unchecked(
(((uint)buffer[i]) << 24)
| (((uint)buffer[i + 1]) << 16)
| (((uint)buffer[i + 2]) << 8)
| (((uint)buffer[i + 3]) << 0));
}
} while (true);
}
}
}
private static int FillBuffer(Stream stream, byte[] buffer)
{
int read = 0;
int totalRead = 0;
do
{
read = stream.Read(buffer, totalRead, buffer.Length - totalRead);
totalRead += read;
} while (read > 0 && totalRead < buffer.Length);
return totalRead;
}
And now it takes less than 90 ms (8x faster than the original!). And without Span! Span is great in situations where it lets you slice and avoid array copies, but it won't improve performance just because you use it. After all, Span is designed to have performance characteristics on par with Array, but not better (and only on runtimes that have special support for it, such as .NET Core 2.1).
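As a side note (not part of the measurements above): on runtimes where System.Buffers.Binary is available (.NET Core 2.1+, or via the System.Memory package), the manual bit-shifting can be replaced with BinaryPrimitives.ReadUInt32BigEndian, which reads a big-endian value from a span without allocating. A minimal sketch of such a helper:

using System;
using System.Buffers.Binary;

public static class BigEndianReader
{
    // Interprets 4 bytes at the given offset as a big-endian UInt32,
    // with no temporary arrays and no Array.Reverse.
    public static uint ReadUInt32BigEndian(byte[] buffer, int offset)
    {
        return BinaryPrimitives.ReadUInt32BigEndian(
            new ReadOnlySpan<byte>(buffer, offset, 4));
    }
}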
When I searched for ways to decompress a file using SharpZipLib, I found a lot of examples like this:
public static void TarWriteCharacters(string tarfile, string targetDir)
{
using (TarInputStream s = new TarInputStream(File.OpenRead(tarfile)))
{
//some codes here
using (FileStream fileWrite = File.Create(targetDir + directoryName + fileName))
{
int size = 2048;
byte[] data = new byte[2048];
while (true)
{
size = s.Read(data, 0, data.Length);
if (size > 0)
{
fileWrite.Write(data, 0, size);
}
else
{
break;
}
}
fileWrite.Close();
}
}
}
The signature of FileStream.Write is:
FileStream.Write(byte[] array, int offset, int count)
Now I am trying to separate the read and the write parts, because I want to use a thread to speed up the decompression by doing the writing in its own function, and I use a List<byte[]> and a List<int> to hold the file's data and chunk sizes, like below.
Read:
public static void TarWriteCharacters(string tarfile, string targetDir)
{
using (TarInputStream s = new TarInputStream(File.OpenRead(tarfile)))
{
//some codes here
using (FileStream fileWrite= File.Create(targetDir + directoryName + fileName))
{
int size = 2048;
List<int> SizeList = new List<int>();
List<byte[]> mydatalist = new List<byte[]>();
while (true)
{
byte[] data = new byte[2048];
size = s.Read(data, 0, data.Length);
if (size > 0)
{
mydatalist.Add(data);
SizeList.Add(size);
}
else
{
break;
}
}
test = new Thread(() =>
FileWriteFun(pathToTar, args, SizeList, mydatalist)
);
test.Start();
streamWriter.Close();
}
}
}
Write:
public static void FileWriteFun(string pathToTar , string[] args, List<int> SizeList, List<byte[]> mydataList)
{
//some codes here
using (FileStream fileWrite= File.Create(targetDir + directoryName + fileName))
{
for (int i = 0; i < mydataList.Count; i++)
{
fileWrite.Write(mydataList[i], 0, SizeList[i]);
}
fileWrite.Close();
}
}
Edit
(1) Moved byte[] data = new byte[2048] into the while loop, so each read is assigned to a new array.
(2) Changed int[] SizeList = new int[2048] to List<int> SizeList = new List<int>() because of the fixed array size limit.
As a read on a stream is only guaranteed to return at least one byte (typically it will be more, but you can't rely on getting the full requested length each time), your original solution could theoretically fail after 2048 reads, since an int[2048] SizeList can only hold 2048 entries.
You could use a List to hold the sizes.
Or use a MemoryStream instead of inventing your own.
But the two main problems are:
1) You keep reading into the same byte array, overwriting previously read data. When you add your data byte array to mydatalist, you must assign data to a new byte array.
2) you close your stream before the second thread is done writing.
In general, threading is difficult and should only be used where you know it will improve performance. Simply reading and writing data is typically I/O-bound, not CPU-bound, so introducing a second thread gives a small performance penalty and no gain in speed. You could use multithreading to ensure concurrent read/write operations, but most likely the disk cache will do this for you if you stick to the first solution; and if not, using async is easier than multithreading to achieve this.
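For reference, here is a minimal sketch of the async approach mentioned above (not from the original answer): Stream.CopyToAsync overlaps the read and write work without managing threads by hand. It assumes the current entry's data is exposed as a Stream, e.g. SharpZipLib's TarInputStream positioned on an entry; the method name and buffer size are illustrative.

using System.IO;
using System.Threading.Tasks;

public static class TarExtractHelper
{
    // Copies the current entry's data to a file asynchronously.
    public static async Task WriteEntryAsync(Stream entryStream, string outputPath)
    {
        using (FileStream fileWrite = File.Create(outputPath))
        {
            // 81920 is the default copy buffer size used by Stream.CopyToAsync.
            await entryStream.CopyToAsync(fileWrite, 81920);
        }
    }
}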
I want to be able to compute the hashes of arbitrarily sized file chunks of a file in C#.
E.g. compute the hash of the 3rd gigabyte in a 4 GB file.
The main problem is that I don't want to load the entire file at memory, as there could be several files and the offsets could be quite arbitrary.
AFAIK, HashAlgorithm.ComputeHash allows me to use either a byte buffer or a stream. The stream would allow me to compute the hash efficiently, but for the entire file, not just for a specific chunk.
I was thinking of creating an alternate FileStream object and passing it to ComputeHash, where I would override the FileStream methods so it reads only a certain chunk of the file.
Is there a better solution than this, preferably using the built-in C# libraries?
Thanks.
You should pass in either:
A byte array containing the chunk of data to compute the hash from
A stream that restricts access to the chunk you want to compute the hash from
The second option isn't all that hard, here's a quick LINQPad program I threw together. Note that it lacks quite a bit of error handling, such as checking that the chunk is actually available (ie. that you're passing in a position and length of the stream that actually exists and doesn't fall off the end of the underlying stream).
Needless to say, if this should end up as production code I would add a lot of error handling, and write a bunch of unit-tests to ensure all edge-cases are handled correctly.
You would construct the PartialStream instance for your file like this:
const long gb = 1024 * 1024 * 1024;
using (var fileStream = new FileStream(@"d:\temp\too_long_file.bin", FileMode.Open))
using (var chunk = new PartialStream(fileStream, 2 * gb, 1 * gb))
{
var hash = hashAlgorithm.ComputeHash(chunk);
}
Here's the LINQPad test program:
void Main()
{
var buffer = Enumerable.Range(0, 256).Select(i => (byte)i).ToArray();
using (var underlying = new MemoryStream(buffer))
using (var partialStream = new PartialStream(underlying, 64, 32))
{
var temp = new byte[1024]; // too much, ensure we don't read past window end
partialStream.Read(temp, 0, temp.Length);
temp.Dump();
// should output 64-95 and then 0's for the rest (64-95 = 32 bytes)
}
}
public class PartialStream : Stream
{
private readonly Stream _UnderlyingStream;
private readonly long _Position;
private readonly long _Length;
public PartialStream(Stream underlyingStream, long position, long length)
{
if (!underlyingStream.CanRead || !underlyingStream.CanSeek)
throw new ArgumentException("underlyingStream");
_UnderlyingStream = underlyingStream;
_Position = position;
_Length = length;
_UnderlyingStream.Position = position;
}
public override bool CanRead
{
get
{
return _UnderlyingStream.CanRead;
}
}
public override bool CanWrite
{
get
{
return false;
}
}
public override bool CanSeek
{
get
{
return true;
}
}
public override long Length
{
get
{
return _Length;
}
}
public override long Position
{
get
{
return _UnderlyingStream.Position - _Position;
}
set
{
_UnderlyingStream.Position = value + _Position;
}
}
public override void Flush()
{
throw new NotSupportedException();
}
public override long Seek(long offset, SeekOrigin origin)
{
switch (origin)
{
case SeekOrigin.Begin:
return _UnderlyingStream.Seek(_Position + offset, SeekOrigin.Begin) - _Position;
case SeekOrigin.End:
return _UnderlyingStream.Seek(_Position + _Length + offset, SeekOrigin.Begin) - _Position;
case SeekOrigin.Current:
return _UnderlyingStream.Seek(offset, SeekOrigin.Current) - _Position;
default:
throw new ArgumentException("origin");
}
}
public override void SetLength(long length)
{
throw new NotSupportedException();
}
public override int Read(byte[] buffer, int offset, int count)
{
long left = _Length - Position;
if (left < count)
count = (int)left;
return _UnderlyingStream.Read(buffer, offset, count);
}
public override void Write(byte[] buffer, int offset, int count)
{
throw new NotSupportedException();
}
}
You can use TransformBlock and TransformFinalBlock directly. That's pretty similar to what HashAlgorithm.ComputeHash does internally.
Something like:
using(var hashAlgorithm = new SHA256Managed())
using(var fileStream = File.OpenRead(...))
{
fileStream.Position = ...;
long bytesToHash = ...;
var buf = new byte[4 * 1024];
while(bytesToHash > 0)
{
var bytesRead = fileStream.Read(buf, 0, (int)Math.Min(bytesToHash, buf.Length));
hashAlgorithm.TransformBlock(buf, 0, bytesRead, null, 0);
bytesToHash -= bytesRead;
if(bytesRead == 0)
throw new InvalidOperationException("Unexpected end of stream");
}
hashAlgorithm.TransformFinalBlock(buf, 0, 0);
var hash = hashAlgorithm.Hash;
return hash;
};
Your suggestion - passing in a restricted access wrapper for your FileStream - is the cleanest solution. Your wrapper should defer everything to the wrapped Stream except the Length and Position properties.
How? Simply create a class that inherits from Stream. Make the constructor take:
Your source Stream (in your case, a FileStream)
The chunk start position
The chunk end position
As an extension - this is a list of all the Streams that are available http://msdn.microsoft.com/en-us/library/system.io.stream%28v=vs.100%29.aspx#inheritanceContinued
To easily compute the hash of a chunk of a larger stream, use these two methods:
HashAlgorithm.TransformBlock
HashAlgorithm.TransformFinalBlock
Here's a LINQPad program that demonstrates:
void Main()
{
const long gb = 1024 * 1024 * 1024;
using (var stream = new FileStream(@"d:\temp\largefile.bin", FileMode.Open))
{
stream.Position = 2 * gb; // 3rd gb-chunk
byte[] buffer = new byte[32768];
long amount = 1 * gb;
using (var hashAlgorithm = SHA1.Create())
{
while (amount > 0)
{
int bytesRead = stream.Read(buffer, 0,
(int)Math.Min(buffer.Length, amount));
if (bytesRead > 0)
{
amount -= bytesRead;
if (amount > 0)
hashAlgorithm.TransformBlock(buffer, 0, bytesRead,
buffer, 0);
else
hashAlgorithm.TransformFinalBlock(buffer, 0, bytesRead);
}
else
throw new InvalidOperationException();
}
hashAlgorithm.Hash.Dump();
}
}
}
To answer your original question ("Is there a better solution..."):
Not that I know of.
This seems to be a very special, non-trivial task, so a little extra work might be involved anyway. I think your approach of using a custom Stream-class goes in the right direction, I'd probably do exactly the same.
And Gusdor and xander have already provided very helpful information on how to implement that — good job guys!
What is the best method to replace a sequence of bytes in a binary file with another sequence of the same length? The binary files will be pretty large, about 50 MB, and should not be loaded into memory all at once.
Update: I do not know the location of the bytes that need to be replaced; I need to find them first.
Assuming you're trying to replace a known section of the file.
Open a FileStream with read/write access
Seek to the right place
Overwrite existing data
Sample code coming...
public static void ReplaceData(string filename, int position, byte[] data)
{
using (Stream stream = File.Open(filename, FileMode.Open))
{
stream.Position = position;
stream.Write(data, 0, data.Length);
}
}
If you're effectively trying to do a binary version of string.Replace (e.g. "always replace bytes { 51, 20, 34 } with { 20, 35, 15 }") then it's rather harder. As a quick description of what you'd do:
Allocate a buffer of at least the size of data you're interested in
Repeatedly read into the buffer, scanning for the data
If you find a match, seek back to the right place (e.g. stream.Position -= buffer.Length - indexWithinBuffer;) and overwrite the data
Sounds simple so far... but the tricky bit is if the data starts near the end of the buffer. You need to remember all potential matches and how far you've matched so far, so that if you get a match when you read the next buffer's-worth, you can detect it.
There are probably ways of avoiding this trickiness, but I wouldn't like to try to come up with them offhand :)
EDIT: Okay, I've got an idea which might help...
Keep a buffer which is at least twice as big as you need
Repeatedly:
Copy the second half of the buffer into the first half
Fill the second half of the buffer from the file
Search throughout the whole buffer for the data you're looking for
That way at some point, if the data is present, it will be completely within the buffer.
You'd need to be careful about where the stream was in order to get back to the right place, but I think this should work. It would be trickier if you were trying to find all matches, but at least the first match should be reasonably simple...
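To make the buffer-overlap idea above concrete, here is a minimal sketch (not from the original answer) of a hypothetical FindPattern helper that returns the file offset of the first match, or -1 if there is none. It keeps a window of at least twice the pattern size, slides the second half into the first half before each refill, and rescans the whole window so matches spanning the old boundary are still caught:

using System;
using System.IO;

public static class PatternSearch
{
    public static long FindPattern(Stream stream, byte[] pattern)
    {
        int half = Math.Max(pattern.Length, 4096);
        byte[] buffer = new byte[half * 2];
        long bufferStart = 0;   // file offset corresponding to buffer[0]
        int validBytes = 0;     // how many bytes of the buffer hold file data

        while (true)
        {
            // Slide the second half down so a match spanning the old boundary
            // stays inside the window after the next refill.
            if (validBytes == buffer.Length)
            {
                Buffer.BlockCopy(buffer, half, buffer, 0, half);
                bufferStart += half;
                validBytes = half;
            }

            int read = stream.Read(buffer, validBytes, buffer.Length - validBytes);
            validBytes += read;

            // Scan everything currently in the window (re-scanning a little is
            // harmless; missing a boundary-spanning match is not).
            for (int i = 0; i <= validBytes - pattern.Length; i++)
            {
                bool match = true;
                for (int j = 0; j < pattern.Length; j++)
                {
                    if (buffer[i + j] != pattern[j]) { match = false; break; }
                }
                if (match)
                    return bufferStart + i;
            }

            if (read == 0)
                return -1; // end of file, no match found
        }
    }
}

With the returned offset you could then reopen the file with read/write access and overwrite the bytes, as in the ReplaceData method above.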
My solution:
/// <summary>
/// Copy data from a file to an other, replacing search term, ignoring case.
/// </summary>
/// <param name="originalFile"></param>
/// <param name="outputFile"></param>
/// <param name="searchTerm"></param>
/// <param name="replaceTerm"></param>
private static void ReplaceTextInBinaryFile(string originalFile, string outputFile, string searchTerm, string replaceTerm)
{
byte b;
//UpperCase bytes to search
byte[] searchBytes = Encoding.UTF8.GetBytes(searchTerm.ToUpper());
//LowerCase bytes to search
byte[] searchBytesLower = Encoding.UTF8.GetBytes(searchTerm.ToLower());
//Temporary bytes during found loop
byte[] bytesToAdd = new byte[searchBytes.Length];
//Search length
int searchBytesLength = searchBytes.Length;
//First Upper char
byte searchByte0 = searchBytes[0];
//First Lower char
byte searchByte0Lower = searchBytesLower[0];
//Replace with bytes
byte[] replaceBytes = Encoding.UTF8.GetBytes(replaceTerm);
int counter = 0;
using (FileStream inputStream = File.OpenRead(originalFile)) {
//input length
long srcLength = inputStream.Length;
using (BinaryReader inputReader = new BinaryReader(inputStream)) {
using (FileStream outputStream = File.OpenWrite(outputFile)) {
using (BinaryWriter outputWriter = new BinaryWriter(outputStream)) {
for (int nSrc = 0; nSrc < srcLength; ++nSrc)
//first byte
if ((b = inputReader.ReadByte()) == searchByte0
|| b == searchByte0Lower) {
bytesToAdd[0] = b;
int nSearch = 1;
//next bytes
for (; nSearch < searchBytesLength; ++nSearch)
//get byte, save it and test
if ((b = bytesToAdd[nSearch] = inputReader.ReadByte()) != searchBytes[nSearch]
&& b != searchBytesLower[nSearch]) {
break;//fail
}
//Avoid overflow. No need, in my case, because no chance to see searchTerm at the end.
//else if (nSrc + nSearch >= srcLength)
// break;
if (nSearch == searchBytesLength) {
//success
++counter;
outputWriter.Write(replaceBytes);
nSrc += nSearch - 1;
}
else {
//failed, add saved bytes
outputWriter.Write(bytesToAdd, 0, nSearch + 1);
nSrc += nSearch;
}
}
else
outputWriter.Write(b);
}
}
}
}
Console.WriteLine("ReplaceTextInBinaryFile.counter = " + counter);
}
You can use my BinaryUtility to search and replace one or more bytes without loading the entire file into memory like this:
var searchAndReplace = new List<Tuple<byte[], byte[]>>()
{
Tuple.Create(
BitConverter.GetBytes((UInt32)0xDEADBEEF),
BitConverter.GetBytes((UInt32)0x01234567)),
Tuple.Create(
BitConverter.GetBytes((UInt32)0xAABBCCDD),
BitConverter.GetBytes((UInt16)0xAFFE)),
};
using(var reader =
new BinaryReader(new FileStream(#"C:\temp\data.bin", FileMode.Open)))
{
using(var writer =
new BinaryWriter(new FileStream(#"C:\temp\result.bin", FileMode.Create)))
{
BinaryUtility.Replace(reader, writer, searchAndReplace);
}
}
BinaryUtility code:
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
public static class BinaryUtility
{
public static IEnumerable<byte> GetByteStream(BinaryReader reader)
{
const int bufferSize = 1024;
byte[] buffer;
do
{
buffer = reader.ReadBytes(bufferSize);
foreach (var d in buffer) { yield return d; }
} while (bufferSize == buffer.Length);
}
public static void Replace(BinaryReader reader, BinaryWriter writer, IEnumerable<Tuple<byte[], byte[]>> searchAndReplace)
{
foreach (byte d in Replace(GetByteStream(reader), searchAndReplace)) { writer.Write(d); }
}
public static IEnumerable<byte> Replace(IEnumerable<byte> source, IEnumerable<Tuple<byte[], byte[]>> searchAndReplace)
{
foreach (var s in searchAndReplace)
{
source = Replace(source, s.Item1, s.Item2);
}
return source;
}
public static IEnumerable<byte> Replace(IEnumerable<byte> input, IEnumerable<byte> from, IEnumerable<byte> to)
{
var fromEnumerator = from.GetEnumerator();
fromEnumerator.MoveNext();
int match = 0;
foreach (var data in input)
{
if (data == fromEnumerator.Current)
{
match++;
if (fromEnumerator.MoveNext()) { continue; }
foreach (byte d in to) { yield return d; }
match = 0;
fromEnumerator.Reset();
fromEnumerator.MoveNext();
continue;
}
if (0 != match)
{
foreach (byte d in from.Take(match)) { yield return d; }
match = 0;
fromEnumerator.Reset();
fromEnumerator.MoveNext();
}
yield return data;
}
if (0 != match)
{
foreach (byte d in from.Take(match)) { yield return d; }
}
}
}
public static void BinaryReplace(string sourceFile, byte[] sourceSeq, string targetFile, byte[] targetSeq)
{
FileStream sourceStream = File.OpenRead(sourceFile);
FileStream targetStream = File.Create(targetFile);
try
{
int b;
long foundSeqOffset = -1;
int searchByteCursor = 0;
while ((b=sourceStream.ReadByte()) != -1)
{
if (sourceSeq[searchByteCursor] == b)
{
if (searchByteCursor == sourceSeq.Length - 1)
{
targetStream.Write(targetSeq, 0, targetSeq.Length);
searchByteCursor = 0;
foundSeqOffset = -1;
}
else
{
if (searchByteCursor == 0)
{
foundSeqOffset = sourceStream.Position - 1;
}
++searchByteCursor;
}
}
else
{
if (searchByteCursor == 0)
{
targetStream.WriteByte((byte) b);
}
else
{
targetStream.WriteByte(sourceSeq[0]);
sourceStream.Position = foundSeqOffset + 1;
searchByteCursor = 0;
foundSeqOffset = -1;
}
}
}
}
finally
{
sourceStream.Dispose();
targetStream.Dispose();
}
}
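For completeness, a hypothetical usage of the BinaryReplace method above, assuming same-length search and replacement sequences as the question requires (file names and tags are placeholders):

// Streams source.bin to patched.bin, replacing occurrences of OLDTAG with NEWTAG.
byte[] search = System.Text.Encoding.ASCII.GetBytes("OLDTAG");
byte[] replace = System.Text.Encoding.ASCII.GetBytes("NEWTAG");
BinaryReplace("source.bin", search, "patched.bin", replace);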