I am using this code for uploading a file :
https://gist.github.com/bgrins/1789787
But if I am trying to use this code for uploading a file "2 GB" file I am getting out of memory exception and the reason in this line :
https://gist.github.com/bgrins/1789787#file-gistfile1-cs-L75
so how can I fix this issue?
Read giant file piece by piece, and upload pieces one by one. you could provide a progress bar also.
upload code piece by piece : How to read a big file piece by piece in C#
in server side, append new pieces to a file: C# Append byte array to existing file
you can detail the code with this idea. I did it once last year, but cannot share the code.
There are more than one solution
1- Writing to RequestStream directly instead of writing to MemoryStream :
https://blogs.msdn.microsoft.com/johan/2006/11/15/are-you-getting-outofmemoryexceptions-when-uploading-large-files/
public static string MyUploader(string strFileToUpload, string strUrl)
{
string strFileFormName = "file";
Uri oUri = new Uri(strUrl);
string strBoundary = "----------" + DateTime.Now.Ticks.ToString("x");
// The trailing boundary string
byte[] boundaryBytes = Encoding.ASCII.GetBytes("\r\n--" + strBoundary + "\r\n");
// The post message header
StringBuilder sb = new StringBuilder();
sb.Append("--");
sb.Append(strBoundary);
sb.Append("\r\n");
sb.Append("Content-Disposition: form-data; name=\"");
sb.Append(strFileFormName);
sb.Append("\"; filename=\"");
sb.Append(Path.GetFileName(strFileToUpload));
sb.Append("\"");
sb.Append("\r\n");
sb.Append("Content-Type: ");
sb.Append("application/octet-stream");
sb.Append("\r\n");
sb.Append("\r\n");
string strPostHeader = sb.ToString();
byte[] postHeaderBytes = Encoding.UTF8.GetBytes(strPostHeader);
// The WebRequest
HttpWebRequest oWebrequest = (HttpWebRequest)WebRequest.Create(oUri);
oWebrequest.ContentType = "multipart/form-data; boundary=" + strBoundary;
oWebrequest.Method = "POST";
// This is important, otherwise the whole file will be read to memory anyway...
oWebrequest.AllowWriteStreamBuffering = false;
// Get a FileStream and set the final properties of the WebRequest
FileStream oFileStream = new FileStream(strFileToUpload, FileMode.Open, FileAccess.Read);
long length = postHeaderBytes.Length + oFileStream.Length + boundaryBytes.Length;
oWebrequest.ContentLength = length;
Stream oRequestStream = oWebrequest.GetRequestStream();
// Write the post header
oRequestStream.Write(postHeaderBytes, 0, postHeaderBytes.Length);
// Stream the file contents in small pieces (4096 bytes, max).
byte[] buffer = new Byte[checked((uint)Math.Min(4096, (int)oFileStream.Length))];
int bytesRead = 0;
while ((bytesRead = oFileStream.Read(buffer, 0, buffer.Length)) != 0)
oRequestStream.Write(buffer, 0, bytesRead);
oFileStream.Close();
// Add the trailing boundary
oRequestStream.Write(boundaryBytes, 0, boundaryBytes.Length);
WebResponse oWResponse = oWebrequest.GetResponse();
Stream s = oWResponse.GetResponseStream();
StreamReader sr = new StreamReader(s);
String sReturnString = sr.ReadToEnd();
// Clean up
oFileStream.Close();
oRequestStream.Close();
s.Close();
sr.Close();
return sReturnString;
}
2- Using RecyclableMemoryStream instead of MemoryStream solution
You can read more about RecyclableMemoryStream here :
http://www.philosophicalgeek.com/2015/02/06/announcing-microsoft-io-recycablememorystream/
https://github.com/Microsoft/Microsoft.IO.RecyclableMemoryStream
3- Using MemoryTributary instead of MemoryStream
You can read more about MemoryTributary here :
https://www.codeproject.com/Articles/348590/A-replacement-for-MemoryStream?msg=5257615#xx5257615xx
using System;
using System.Collections.Generic;
using System.IO;
using System.Runtime.InteropServices;
namespace LiquidEngine.Tools
{
/// <summary>
/// MemoryTributary is a re-implementation of MemoryStream that uses a dynamic list of byte arrays as a backing store, instead of a single byte array, the allocation
/// of which will fail for relatively small streams as it requires contiguous memory.
/// </summary>
public class MemoryTributary : Stream /* http://msdn.microsoft.com/en-us/library/system.io.stream.aspx */
{
#region Constructors
public MemoryTributary()
{
Position = 0;
}
public MemoryTributary(byte[] source)
{
this.Write(source, 0, source.Length);
Position = 0;
}
/* length is ignored because capacity has no meaning unless we implement an artifical limit */
public MemoryTributary(int length)
{
SetLength(length);
Position = length;
byte[] d = block; //access block to prompt the allocation of memory
Position = 0;
}
#endregion
#region Status Properties
public override bool CanRead
{
get { return true; }
}
public override bool CanSeek
{
get { return true; }
}
public override bool CanWrite
{
get { return true; }
}
#endregion
#region Public Properties
public override long Length
{
get { return length; }
}
public override long Position { get; set; }
#endregion
#region Members
protected long length = 0;
protected long blockSize = 65536;
protected List<byte[]> blocks = new List<byte[]>();
#endregion
#region Internal Properties
/* Use these properties to gain access to the appropriate block of memory for the current Position */
/// <summary>
/// The block of memory currently addressed by Position
/// </summary>
protected byte[] block
{
get
{
while (blocks.Count <= blockId)
blocks.Add(new byte[blockSize]);
return blocks[(int)blockId];
}
}
/// <summary>
/// The id of the block currently addressed by Position
/// </summary>
protected long blockId
{
get { return Position / blockSize; }
}
/// <summary>
/// The offset of the byte currently addressed by Position, into the block that contains it
/// </summary>
protected long blockOffset
{
get { return Position % blockSize; }
}
#endregion
#region Public Stream Methods
public override void Flush()
{
}
public override int Read(byte[] buffer, int offset, int count)
{
long lcount = (long)count;
if (lcount < 0)
{
throw new ArgumentOutOfRangeException("count", lcount, "Number of bytes to copy cannot be negative.");
}
long remaining = (length - Position);
if (lcount > remaining)
lcount = remaining;
if (buffer == null)
{
throw new ArgumentNullException("buffer", "Buffer cannot be null.");
}
if (offset < 0)
{
throw new ArgumentOutOfRangeException("offset",offset,"Destination offset cannot be negative.");
}
int read = 0;
long copysize = 0;
do
{
copysize = Math.Min(lcount, (blockSize - blockOffset));
Buffer.BlockCopy(block, (int)blockOffset, buffer, offset, (int)copysize);
lcount -= copysize;
offset += (int)copysize;
read += (int)copysize;
Position += copysize;
} while (lcount > 0);
return read;
}
public override long Seek(long offset, SeekOrigin origin)
{
switch (origin)
{
case SeekOrigin.Begin:
Position = offset;
break;
case SeekOrigin.Current:
Position += offset;
break;
case SeekOrigin.End:
Position = Length - offset;
break;
}
return Position;
}
public override void SetLength(long value)
{
length = value;
}
public override void Write(byte[] buffer, int offset, int count)
{
long initialPosition = Position;
int copysize;
try
{
do
{
copysize = Math.Min(count, (int)(blockSize - blockOffset));
EnsureCapacity(Position + copysize);
Buffer.BlockCopy(buffer, (int)offset, block, (int)blockOffset, copysize);
count -= copysize;
offset += copysize;
Position += copysize;
} while (count > 0);
}
catch (Exception e)
{
Position = initialPosition;
throw e;
}
}
public override int ReadByte()
{
if (Position >= length)
return -1;
byte b = block[blockOffset];
Position++;
return b;
}
public override void WriteByte(byte value)
{
EnsureCapacity(Position + 1);
block[blockOffset] = value;
Position++;
}
protected void EnsureCapacity(long intended_length)
{
if (intended_length > length)
length = (intended_length);
}
#endregion
#region IDispose
/* http://msdn.microsoft.com/en-us/library/fs2xkftw.aspx */
protected override void Dispose(bool disposing)
{
/* We do not currently use unmanaged resources */
base.Dispose(disposing);
}
#endregion
#region Public Additional Helper Methods
/// <summary>
/// Returns the entire content of the stream as a byte array. This is not safe because the call to new byte[] may
/// fail if the stream is large enough. Where possible use methods which operate on streams directly instead.
/// </summary>
/// <returns>A byte[] containing the current data in the stream</returns>
public byte[] ToArray()
{
long firstposition = Position;
Position = 0;
byte[] destination = new byte[Length];
Read(destination, 0, (int)Length);
Position = firstposition;
return destination;
}
/// <summary>
/// Reads length bytes from source into the this instance at the current position.
/// </summary>
/// <param name="source">The stream containing the data to copy</param>
/// <param name="length">The number of bytes to copy</param>
public void ReadFrom(Stream source, long length)
{
byte[] buffer = new byte[4096];
int read;
do
{
read = source.Read(buffer, 0, (int)Math.Min(4096, length));
length -= read;
this.Write(buffer, 0, read);
} while (length > 0);
}
/// <summary>
/// Writes the entire stream into destination, regardless of Position, which remains unchanged.
/// </summary>
/// <param name="destination">The stream to write the content of this stream to</param>
public void WriteTo(Stream destination)
{
long initialpos = Position;
Position = 0;
this.CopyTo(destination);
Position = initialpos;
}
#endregion
}
}
Related
I am trying to write a massive object to AWS S3 (e.g. 25 GB).
Currently I can get it working in two ways:
Write the content to a file on local disk, then send the file to S3 using multi-part upload
Write the content to a MemoryStream, then send that stream to S3 using multi-part upload
However, I don't like either approach, because I need to reserve a large amount of disk space or memory for the operation. I am generating this content in code, so I was hoping to open a stream to an S3 object, and generate the content directly to that object. But I can't see how to make that work.
Is it possible to build a massive object in S3 without representing the entire object in a local file or memory first?
(Note: My question is very similar to this question, but that question doesn't have a useful answer.)
I was able to get it working by breaking the overall payload into chunks, and sending each individual chunk as a separate MemoryStream.
Technically this solution still uses a MemoryStream, but that's OK, since I can control how much memory is used by adjusting the chunk size. For my test, I created a 25GB file while keeping memory usage well below that (~2 GB IIRC).
Here is my solution:
private const string BucketName = "YOUR-BUCKET-NAME-HERE";
private static readonly RegionEndpoint BucketRegion = RegionEndpoint.USEast1;
private const string Key = "massive-file-test";
// We're going to send 100 chunks of 256 MB each, for a total of 25 GB.
// The content will be the asterisk ("*") repeated for the desired size.
private const int ChunkSizeMb = 256;
private const int TotalSizeGb = 25;
public static void Main(string[] args)
{
Console.WriteLine($"Writing object to {BucketName}, {Key}");
int totalChunks = TotalSizeGb * 1024 / ChunkSizeMb;
int chunkSizeBytes = ChunkSizeMb * 1024 * 1024;
string payload = new String('*', chunkSizeBytes);
// Initiate the request.
InitiateMultipartUploadRequest initiateRequest = new InitiateMultipartUploadRequest
{
BucketName = BucketName,
Key = Key
};
List<UploadPartResponse> uploadResponses = new List<UploadPartResponse>();
IAmazonS3 s3Client = new AmazonS3Client(BucketRegion);
InitiateMultipartUploadResponse initResponse = s3Client.InitiateMultipartUpload(initiateRequest);
// Open a stream to build the input.
for (int i = 0; i < totalChunks; i++)
{
// Write the next chunk to the input stream.
Console.WriteLine($"Writing chunk {i} of {totalChunks}");
using (var stream = ToStream(payload))
{
// Write the next chunk to s3.
UploadPartRequest uploadRequest = new UploadPartRequest
{
BucketName = BucketName,
Key = Key,
UploadId = initResponse.UploadId,
PartNumber = i + 1,
PartSize = chunkSizeBytes,
InputStream = stream,
};
uploadResponses.Add(s3Client.UploadPart(uploadRequest));
}
}
// Complete the request.
CompleteMultipartUploadRequest completeRequest = new CompleteMultipartUploadRequest
{
BucketName = BucketName,
Key = Key,
UploadId = initResponse.UploadId
};
completeRequest.AddPartETags(uploadResponses);
s3Client.CompleteMultipartUpload(completeRequest);
Console.WriteLine("Script is complete. Press any key to exit...");
Console.ReadKey();
}
private static Stream ToStream(string s)
{
var stream = new MemoryStream();
var writer = new StreamWriter(stream);
writer.Write(s);
writer.Flush();
stream.Position = 0;
return stream;
}
Here is what AnonCoward started, finished off by adding seeking - it's a trivial op for a stream that does nothing except write asterisks to its buffer. If you were generating more complex data it would be hard work but for seeking all you need to do is set the position and say "yep, done that" because no matter where you seek to in the stream the behavior of creating asterisks is always the same
class AsteriskGeneratingStream : Stream
{
long _pos = 0;
long _length = 0;
public AsteriskGeneratingStream(long length)
{
_length = length;
}
public override long Length => _length;
public override int Read(byte[] buffer, int offset, int count)
{
// Create the data as needed
if (count + _pos > _length)
count = (int)(_length - _pos);
for (int i = offset; i < count; i++)
buffer[i] = (byte)'*';
_pos += count;
return count;
}
public override bool CanRead => true;
public override long Seek(long offset, SeekOrigin origin)
{
if(origin == SeekOrigin.Begin) //lets just trust that the caller will be sensible and not set e.g. negative offset
_pos = offset;
else if(origin == SeekOrigin.Current)
_pos += offset;
else if(origin == SeekOrigin.End)
_pos = _length + offset;
return _pos;
}
public override bool CanSeek => true;
public override bool CanWrite => false;
public override long Position { get => _pos; set => _pos = value; }
public override void Flush() { }
public override void SetLength(long value) { _length = value; }
public override void Write(byte[] buffer, int offset, int count) { throw new NotImplementedException(); }
}
class Program
{
static void Main(string[] args)
{
long objectSize = 25L * 1024 * 1024;
var s3 = new AmazonS3Client(Amazon.RegionEndpoint.USWest1);
var xfer = new TransferUtility(s3,new TransferUtilityConfig
{
MinSizeBeforePartUpload = 5L * 1024 * 1024
});
var helper = new AsteriskGeneratingStream(objectSize);
xfer.Upload(helper, "bucket-name", "object-key");
}
}
Note, I can't guarantee it'll work right off the bat because I'm on a cellphone and can't test this via c# fiddle but let's see how it blows up! 😀
If you can create the object on the fly, or at least cache fairly small segments, you can create a stream that serves the data up to S3. Note, that unless you can also create any part of the object out of order, you need to prevent the AWS SDK from using a multi-part upload, which will slow down the transfer speed.
class DataStream : Stream
{
long _pos = 0;
long _length = 0;
public DataStream(long length)
{
_length = length;
}
public override long Length => _length;
public override int Read(byte[] buffer, int offset, int count)
{
// Create the data as needed, on demand
// For this example, just cycle through 0 to 256 in the data over and over again
if (count + _pos > _length)
{
count = (int)(_length - _pos);
}
for (int i = 0; i < count; i++)
{
buffer[i + offset] = (byte)((_pos + i) % 256);
}
_pos += count;
return count;
}
public override bool CanRead => true;
// Stub out all other methods. For a seekable stream
// Seek() and Postion need to be implemented, along with CanSeek changed
public override long Seek(long offset, SeekOrigin origin) { throw new NotImplementedException(); }
public override bool CanSeek => false;
public override bool CanWrite => false;
public override long Position { get => _pos; set => throw new NotImplementedException(); }
public override void Flush() { throw new NotImplementedException(); }
public override void SetLength(long value) { throw new NotImplementedException(); }
public override void Write(byte[] buffer, int offset, int count) { throw new NotImplementedException(); }
}
class Program
{
static void Main(string[] args)
{
long objectSize = 25L * 1024 * 1024;
var s3 = new AmazonS3Client(Amazon.RegionEndpoint.USWest1);
// Prevent a multi-part upload, which requires a seekable stream
var xfer = new TransferUtility(s3, new TransferUtilityConfig
{
MinSizeBeforePartUpload = objectSize + 1
});
var helper = new DataStream(objectSize);
xfer.Upload(helper, "bucket-name", "object-key");
}
}
I would like to do something roughly equivalent to the code example below. I want to generate and serve a stream of data without necessarily having the entire data set in memory at any one time.
It seems like I would need some implementation of Stream that accepts an IEnumerable<string> (or IEnumerable<byte>) in its constructor. Internally this Stream would only walk the IEnumerable as the Stream is being read or as needed. But I don't know of any Stream implementation like this.
Am I on the right track? Do you know of any way to do something like this?
public FileStreamResult GetResult()
{
IEnumerable<string> data = GetDataForStream();
Stream dataStream = ToStringStream(Encoding.UTF8, data);
return File(dataStream, "text/plain", "Result");
}
private IEnumerable<string> GetDataForStream()
{
StringBuilder sb;
for (int i = 0; i < 10000; i++)
{
yield return i.ToString();
yield return "\r\n";
}
}
private Stream ToStringStream(Encoding encoding, IEnumerable<string> data)
{
// I have to write my own implementation of stream?
throw new NotImplementedException();
}
Here's a read-only Stream implementation that uses an IEnumerable<byte> as input:
public class ByteStream : Stream, IDisposable
{
private readonly IEnumerator<byte> _input;
private bool _disposed;
public ByteStream(IEnumerable<byte> input)
{
_input = input.GetEnumerator();
}
public override bool CanRead => true;
public override bool CanSeek => false;
public override bool CanWrite => false;
public override long Length => 0;
public override long Position { get; set; } = 0;
public override int Read(byte[] buffer, int offset, int count)
{
int i = 0;
for (; i < count && _input.MoveNext(); i++)
buffer[i + offset] = _input.Current;
return i;
}
public override long Seek(long offset, SeekOrigin origin) => throw new InvalidOperationException();
public override void SetLength(long value) => throw new InvalidOperationException();
public override void Write(byte[] buffer, int offset, int count) => throw new InvalidOperationException();
public override void Flush() => throw new InvalidOperationException();
void IDisposable.Dispose()
{
if (_disposed)
return;
_input.Dispose();
_disposed= true;
}
}
What you then still need is a function that converts IEnumerable<string> to IEnumerable<byte>:
public static IEnumerable<byte> Encode(IEnumerable<string> input, Encoding encoding)
{
byte[] newLine = encoding.GetBytes(Environment.NewLine);
foreach (string line in input)
{
byte[] bytes = encoding.GetBytes(line);
foreach (byte b in bytes)
yield return b;
foreach (byte b in newLine)
yield return b;
}
}
And finally, here's how to use this in your controller:
public FileResult GetResult()
{
IEnumerable<string> data = GetDataForStream();
var stream = new ByteStream(Encode(data, Encoding.UTF8));
return File(stream, "text/plain", "Result.txt");
}
I created a class called ProducerConsumerStream that does this. The producer writes data to the stream and the consumer reads. There's a buffer in the middle so that the producer can "write ahead" a little bit. You can define the size of the buffer.
Anyway, if it's not exactly what you're looking for, I suspect it will give you a good idea of how it's done. See Building a new type of stream.
Update
The link went stale, so I've copied my code here. The original article is still available on the Wayback machine at https://web.archive.org/web/20151210235510/http://www.informit.com/guides/content.aspx?g=dotnet&seqNum=852
First, the ProducerConsumerStream class:
using System;
using System.IO;
using System.Threading;
using System.Diagnostics;
namespace Mischel.IO
{
// This class is safe for 1 producer and 1 consumer.
public class ProducerConsumerStream : Stream
{
private byte[] CircleBuff;
private int Head;
private int Tail;
public bool IsAddingCompleted { get; private set; }
public bool IsCompleted { get; private set; }
// For debugging
private long TotalBytesRead = 0;
private long TotalBytesWritten = 0;
public ProducerConsumerStream(int size)
{
CircleBuff = new byte[size];
Head = 1;
Tail = 0;
}
[Conditional("JIM_DEBUG")]
private void DebugOut(string msg)
{
Console.WriteLine(msg);
}
[Conditional("JIM_DEBUG")]
private void DebugOut(string fmt, params object[] parms)
{
DebugOut(string.Format(fmt, parms));
}
private int ReadBytesAvailable
{
get
{
if (Head > Tail)
return Head - Tail - 1;
else
return CircleBuff.Length - Tail + Head - 1;
}
}
private int WriteBytesAvailable { get { return CircleBuff.Length - ReadBytesAvailable - 1; } }
private void IncrementTail()
{
Tail = (Tail + 1) % CircleBuff.Length;
}
public override int Read(byte[] buffer, int offset, int count)
{
if (disposed)
{
throw new ObjectDisposedException("The stream has been disposed.");
}
if (IsCompleted)
{
throw new EndOfStreamException("The stream is empty and has been marked complete for adding.");
}
if (count == 0)
{
return 0;
}
lock (CircleBuff)
{
DebugOut("Read: requested {0:N0} bytes. Available = {1:N0}.", count, ReadBytesAvailable);
while (ReadBytesAvailable == 0)
{
if (IsAddingCompleted)
{
IsCompleted = true;
return 0;
}
Monitor.Wait(CircleBuff);
}
// If Head < Tail, then there are bytes available at the end of the buffer
// and also at the front of the buffer.
// If reading from Tail to the end doesn't fulfill the request,
// and there are still bytes available,
// then read from the start of the buffer.
DebugOut("Read: Head={0}, Tail={1}, Avail={2}", Head, Tail, ReadBytesAvailable);
IncrementTail();
int bytesToRead;
if (Tail > Head)
{
// When Tail > Head, we know that there are at least
// (CircleBuff.Length - Tail) bytes available in the buffer.
bytesToRead = CircleBuff.Length - Tail;
}
else
{
bytesToRead = Head - Tail;
}
// Don't read more than count bytes!
bytesToRead = Math.Min(bytesToRead, count);
Buffer.BlockCopy(CircleBuff, Tail, buffer, offset, bytesToRead);
Tail += (bytesToRead - 1);
int bytesRead = bytesToRead;
// At this point, either we've exhausted the buffer,
// or Tail is at the end of the buffer and has to wrap around.
if (bytesRead < count && ReadBytesAvailable > 0)
{
// We haven't fulfilled the read.
IncrementTail();
// Tail is always equal to 0 here.
bytesToRead = Math.Min((count - bytesRead), (Head - Tail));
Buffer.BlockCopy(CircleBuff, Tail, buffer, offset + bytesRead, bytesToRead);
bytesRead += bytesToRead;
Tail += (bytesToRead - 1);
}
TotalBytesRead += bytesRead;
DebugOut("Read: returning {0:N0} bytes. TotalRead={1:N0}", bytesRead, TotalBytesRead);
DebugOut("Read: Head={0}, Tail={1}, Avail={2}", Head, Tail, ReadBytesAvailable);
Monitor.Pulse(CircleBuff);
return bytesRead;
}
}
public override void Write(byte[] buffer, int offset, int count)
{
if (disposed)
{
throw new ObjectDisposedException("The stream has been disposed.");
}
if (IsAddingCompleted)
{
throw new InvalidOperationException("The stream has been marked as complete for adding.");
}
lock (CircleBuff)
{
DebugOut("Write: requested {0:N0} bytes. Available = {1:N0}", count, WriteBytesAvailable);
int bytesWritten = 0;
while (bytesWritten < count)
{
while (WriteBytesAvailable == 0)
{
Monitor.Wait(CircleBuff);
}
DebugOut("Write: Head={0}, Tail={1}, Avail={2}", Head, Tail, WriteBytesAvailable);
int bytesToCopy = Math.Min((count - bytesWritten), WriteBytesAvailable);
CopyBytes(buffer, offset + bytesWritten, bytesToCopy);
TotalBytesWritten += bytesToCopy;
DebugOut("Write: {0} bytes written. TotalWritten={1:N0}", bytesToCopy, TotalBytesWritten);
DebugOut("Write: Head={0}, Tail={1}, Avail={2}", Head, Tail, WriteBytesAvailable);
bytesWritten += bytesToCopy;
Monitor.Pulse(CircleBuff);
}
}
}
private void CopyBytes(byte[] buffer, int srcOffset, int count)
{
// Insert at head
// The copy might require two separate operations.
// copy as much as can fit between Head and end of the circular buffer
int offset = srcOffset;
int bytesCopied = 0;
int bytesToCopy = Math.Min(CircleBuff.Length - Head, count);
if (bytesToCopy > 0)
{
Buffer.BlockCopy(buffer, offset, CircleBuff, Head, bytesToCopy);
bytesCopied = bytesToCopy;
Head = (Head + bytesToCopy) % CircleBuff.Length;
offset += bytesCopied;
}
// Copy the remainder, which will go from the beginning of the buffer.
if (bytesCopied < count)
{
bytesToCopy = count - bytesCopied;
Buffer.BlockCopy(buffer, offset, CircleBuff, Head, bytesToCopy);
Head = (Head + bytesToCopy) % CircleBuff.Length;
}
}
public void CompleteAdding()
{
if (disposed)
{
throw new ObjectDisposedException("The stream has been disposed.");
}
lock (CircleBuff)
{
DebugOut("CompleteAdding: {0:N0} bytes written.", TotalBytesWritten);
IsAddingCompleted = true;
Monitor.Pulse(CircleBuff);
}
}
public override bool CanRead { get { return true; } }
public override bool CanSeek { get { return false; } }
public override bool CanWrite { get { return true; } }
public override void Flush() { /* does nothing */ }
public override long Length { get { throw new NotImplementedException(); } }
public override long Position
{
get { throw new NotImplementedException(); }
set { throw new NotImplementedException(); }
}
public override long Seek(long offset, SeekOrigin origin)
{
throw new NotImplementedException();
}
public override void SetLength(long value)
{
throw new NotImplementedException();
}
private bool disposed = false;
protected override void Dispose(bool disposing)
{
if (!disposed)
{
base.Dispose(disposing);
disposed = true;
}
}
}
}
And an example of how to use it:
class Program
{
static readonly string TestText = "This is a test of the emergency broadcast system.";
static readonly byte[] TextBytes = Encoding.UTF8.GetBytes(TestText);
const int Megabyte = 1024 * 1024;
const int TestBufferSize = 12;
const int ProducerBufferSize = 4;
const int ConsumerBufferSize = 5;
static void Main(string[] args)
{
Console.WriteLine("TextBytes contains {0:N0} bytes.", TextBytes.Length);
using (var pcStream = new ProducerConsumerStream(TestBufferSize))
{
Thread ProducerThread = new Thread(ProducerThreadProc);
Thread ConsumerThread = new Thread(ConsumerThreadProc);
ProducerThread.Start(pcStream);
Thread.Sleep(2000);
ConsumerThread.Start(pcStream);
ProducerThread.Join();
ConsumerThread.Join();
}
Console.Write("Done. Press Enter.");
Console.ReadLine();
}
static void ProducerThreadProc(object state)
{
Console.WriteLine("Producer: started.");
var pcStream = (ProducerConsumerStream)state;
int offset = 0;
while (offset < TestText.Length)
{
int bytesToWrite = Math.Min(ProducerBufferSize, TestText.Length - offset);
pcStream.Write(TextBytes, offset, bytesToWrite);
offset += bytesToWrite;
}
pcStream.CompleteAdding();
Console.WriteLine("Producer: {0:N0} total bytes written.", offset);
Console.WriteLine("Producer: exit.");
}
static void ConsumerThreadProc(object state)
{
Console.WriteLine("Consumer: started.");
var instream = (ProducerConsumerStream)state;
int testOffset = 0;
var inputBuffer = new byte[TextBytes.Length];
int bytesRead;
do
{
int bytesToRead = Math.Min(ConsumerBufferSize, inputBuffer.Length - testOffset);
bytesRead = instream.Read(inputBuffer, testOffset, bytesToRead);
//Console.WriteLine("Consumer: {0:N0} bytes read.", bytesRead);
testOffset += bytesRead;
} while (bytesRead != 0);
Console.WriteLine("Consumer: {0:N0} total bytes read.", testOffset);
// Compare bytes read with TextBytes
for (int i = 0; i < TextBytes.Length; ++i)
{
if (inputBuffer[i] != TextBytes[i])
{
Console.WriteLine("Read error at position {0}", i);
break;
}
}
Console.WriteLine("Consumer: exit.");
}
}
I had the same problem. In my case a third party package only accepts streams but I have an IEnumerable, and couldn't find an answer online so I wrote my own, which I'll share:
public class IEnumerableStringReader : TextReader
{
private readonly IEnumerator<string> _enumerator;
private bool eof = false; // is set to true when .MoveNext tells us there is no more data.
private char[] curLine = null;
private int curLinePos = 0;
private bool disposed = false;
public IEnumerableStringReader(IEnumerable<string> input)
{
_enumerator = input.GetEnumerator();
}
private void GetNextLine()
{
if (eof) return;
eof = !_enumerator.MoveNext();
if (eof) return;
curLine = $"{_enumerator.Current}\r\n" // IEnumerable<string> input implies newlines exist betweent he lines.
.ToCharArray();
curLinePos = 0;
}
public override int Peek()
{
if (disposed) throw new ObjectDisposedException("The stream has been disposed.");
if (curLine == null || curLinePos == curLine.Length) GetNextLine();
if (eof) return -1;
return curLine[curLinePos];
}
public override int Read()
{
if (disposed) throw new ObjectDisposedException("The stream has been disposed.");
if (curLine == null || curLinePos == curLine.Length) GetNextLine();
if (eof) return -1;
return curLine[curLinePos++];
}
public override int Read(char[] buffer, int index, int count)
{
if (disposed) throw new ObjectDisposedException("The stream has been disposed.");
if (count == 0) return 0;
int charsReturned = 0;
int maxChars = Math.Min(count, buffer.Length - index); // Assuming we dont run out of input chars, we return count characters if we can. If the space left in the buffer is not big enough we return as many as will fit in the buffer.
while (charsReturned < maxChars)
{
if (curLine == null || curLinePos == curLine.Length) GetNextLine();
if (eof) return charsReturned;
int maxCurrentCopy = maxChars - charsReturned;
int charsAtTheReady = curLine.Length - curLinePos; // chars available in current line
int copySize = Math.Min(maxCurrentCopy, charsAtTheReady); // stop at end of buffer.
// cant use Buffer.BlockCopy because it's byte based and we're dealing with chars.
Array.ConstrainedCopy(curLine, curLinePos, buffer, index, copySize);
index += copySize;
curLinePos += copySize;
charsReturned += copySize;
}
return charsReturned;
}
public override string ReadLine()
{
if (curLine == null || curLinePos == curLine.Length) GetNextLine();
if (eof) return null;
if (curLinePos > 0) // this is necessary in case the client uses both Read() and ReadLine() calls
{
var tmp = new string(curLine, curLinePos, (curLine.Length - curLinePos) - 2); // create a new string from the remainder of the char array. The -2 is because GetNextLine appends a crlf.
curLinePos = curLine.Length; // so next call will re-read
return tmp;
}
// read full line.
curLinePos = curLine.Length; // so next call will re-read
return _enumerator.Current; // if all the client does is call ReadLine this (faster) code path will be taken.
}
protected override void Dispose(bool disposing)
{
if (!disposed)
{
_enumerator.Dispose();
base.Dispose(disposing);
disposed = true;
}
}
}
In my case, I want to use it as input to Datastreams.Csv:
using (var tr = new IEnumerableStringReader(input))
using (var reader = new CsvReader(tr))
{
while (reader.ReadRecord())
{
// do whatever
}
}
Using the EnumerableToStream Nuget package, you would implement your method like so:
using EnumerableToStream;
private Stream ToStringStream(Encoding encoding, IEnumerable<string> data)
{
return data.ToStream(encoding);
}
I had the same requirement and ended up rolling my own implementation which I have been using for a while now. Getting all the nitty-gritty details just right took some time and effort. For instance, you want your IEnumerable to be disposed after the stream is read to the end and you don't want multibyte characters to be partially written to the buffer.
In this particular implementation, reading the stream does zero allocations, unlike other implementations using encoding.GetBytes(line).
After seeing this question, I decided to release the code as a Nuget package. Hope it saves you a few hours. The source code is on GitHub.
Steve Sadler wrote a perfectly working answer. However, he makes it way more difficult than needed
According to the reference source of TextReader you'll need only override Peek and Read:
A subclass must minimally implement the Peek() and Read() methods.
So first I write a function that converts IEnumerable<string> into IEnumerable<char> where a new line is added at the end of each string:
private static IEnumerable<char> ReadCharacters(IEnumerable<string> lines)
{
foreach (string line in lines)
{
foreach (char c in line + Environment.NewLine)
{
yield return c;
}
}
}
Environment.NewLine is the part that adds the new line at the end of each string.
Now the class is failry straightforward:
class EnumStringReader : TextReader
{
public EnumStringReader(IEnumerable<string> lines)
{
this.enumerator = ReadCharacters(lines).GetEnumerator();
this.dataAvailable = this.enumerator.MoveNext();
}
private bool disposed = false;
private bool dataAvailable;
private readonly IEnumerator<char> enumerator;
The constructor takes a sequence of lines to read. It uses this sequence and the earlier written function to convert the sequence into a sequence of characters with the added Environment.NewLine.
It gets the enumerator of the converted sequence, and moves to the first character. It remembers whether there is a first character in DataAvailable
Now we are ready to Peek: if no data available: return -1, otherwise return the current character as int. Do not move forward:
public override int Peek()
{
this.ThrowIfDisposed();
return this.dataAvailable ? this.enumerator.Current : -1;
}
Read: if no data available, return -1, otherwise return the current character as int. Move forward to the next character and remember whether there is data available:
public override int Read()
{
this.ThrowIfDisposed();
if (this.dataAvailable)
{
char nextChar = this.enumerator.Current;
this.dataAvailable = this.enumerator.MoveNext();
return (int)nextChar;
}
else
{
return -1;
}
}
Don't forget to override Dispose(bool) where you dispose the enumerator.
That is all that is needed. All other functions will use these two.
Now to fill your stream with the lines:
IEnumerable<string> lines = ...
using (TextWriter writer = System.IO.File.CreateText(...))
{
using (TextReader reader = new EnumStringReader(lines);
{
// either write per char:
while (reader.Peek() != -1)
{
char c = (char)reader.Read();
writer.Write(c);
}
// or write per line:
string line = reader.ReadLine();
// line is without newLine!
while (line != null)
{
writer.WriteLine(line);
line = reader.ReadLine();
}
// or write per block
buffer buf = new char[4096];
int nrRead = reader.ReadBlock(buf, 0, buf.Length)
while (nrRead > 0)
{
writer.Write(buf, 0, nrRead);
nrRead = reader.ReadBlock(buf, 0, buf.Length);
}
}
}
What is the best method to replace sequence of bytes in binary file to the same length of other bytes? The binary files will be pretty large, about 50 mb and should not be loaded at once in memory.
Update: I do not know location of bytes which needs to be replaced, I need to find them first.
Assuming you're trying to replace a known section of the file.
Open a FileStream with read/write access
Seek to the right place
Overwrite existing data
Sample code coming...
public static void ReplaceData(string filename, int position, byte[] data)
{
using (Stream stream = File.Open(filename, FileMode.Open))
{
stream.Position = position;
stream.Write(data, 0, data.Length);
}
}
If you're effectively trying to do a binary version of a string.Replace (e.g. "always replace bytes { 51, 20, 34} with { 20, 35, 15 } then it's rather harder. As a quick description of what you'd do:
Allocate a buffer of at least the size of data you're interested in
Repeatedly read into the buffer, scanning for the data
If you find a match, seek back to the right place (e.g. stream.Position -= buffer.Length - indexWithinBuffer; and overwrite the data
Sounds simple so far... but the tricky bit is if the data starts near the end of the buffer. You need to remember all potential matches and how far you've matched so far, so that if you get a match when you read the next buffer's-worth, you can detect it.
There are probably ways of avoiding this trickiness, but I wouldn't like to try to come up with them offhand :)
EDIT: Okay, I've got an idea which might help...
Keep a buffer which is at least twice as big as you need
Repeatedly:
Copy the second half of the buffer into the first half
Fill the second half of the buffer from the file
Search throughout the whole buffer for the data you're looking for
That way at some point, if the data is present, it will be completely within the buffer.
You'd need to be careful about where the stream was in order to get back to the right place, but I think this should work. It would be trickier if you were trying to find all matches, but at least the first match should be reasonably simple...
My solution :
/// <summary>
/// Copy data from a file to an other, replacing search term, ignoring case.
/// </summary>
/// <param name="originalFile"></param>
/// <param name="outputFile"></param>
/// <param name="searchTerm"></param>
/// <param name="replaceTerm"></param>
private static void ReplaceTextInBinaryFile(string originalFile, string outputFile, string searchTerm, string replaceTerm)
{
byte b;
//UpperCase bytes to search
byte[] searchBytes = Encoding.UTF8.GetBytes(searchTerm.ToUpper());
//LowerCase bytes to search
byte[] searchBytesLower = Encoding.UTF8.GetBytes(searchTerm.ToLower());
//Temporary bytes during found loop
byte[] bytesToAdd = new byte[searchBytes.Length];
//Search length
int searchBytesLength = searchBytes.Length;
//First Upper char
byte searchByte0 = searchBytes[0];
//First Lower char
byte searchByte0Lower = searchBytesLower[0];
//Replace with bytes
byte[] replaceBytes = Encoding.UTF8.GetBytes(replaceTerm);
int counter = 0;
using (FileStream inputStream = File.OpenRead(originalFile)) {
//input length
long srcLength = inputStream.Length;
using (BinaryReader inputReader = new BinaryReader(inputStream)) {
using (FileStream outputStream = File.OpenWrite(outputFile)) {
using (BinaryWriter outputWriter = new BinaryWriter(outputStream)) {
for (int nSrc = 0; nSrc < srcLength; ++nSrc)
//first byte
if ((b = inputReader.ReadByte()) == searchByte0
|| b == searchByte0Lower) {
bytesToAdd[0] = b;
int nSearch = 1;
//next bytes
for (; nSearch < searchBytesLength; ++nSearch)
//get byte, save it and test
if ((b = bytesToAdd[nSearch] = inputReader.ReadByte()) != searchBytes[nSearch]
&& b != searchBytesLower[nSearch]) {
break;//fail
}
//Avoid overflow. No need, in my case, because no chance to see searchTerm at the end.
//else if (nSrc + nSearch >= srcLength)
// break;
if (nSearch == searchBytesLength) {
//success
++counter;
outputWriter.Write(replaceBytes);
nSrc += nSearch - 1;
}
else {
//failed, add saved bytes
outputWriter.Write(bytesToAdd, 0, nSearch + 1);
nSrc += nSearch;
}
}
else
outputWriter.Write(b);
}
}
}
}
Console.WriteLine("ReplaceTextInBinaryFile.counter = " + counter);
}
You can use my BinaryUtility to search and replace one or more bytes without loading the entire file into memory like this:
var searchAndReplace = new List<Tuple<byte[], byte[]>>()
{
Tuple.Create(
BitConverter.GetBytes((UInt32)0xDEADBEEF),
BitConverter.GetBytes((UInt32)0x01234567)),
Tuple.Create(
BitConverter.GetBytes((UInt32)0xAABBCCDD),
BitConverter.GetBytes((UInt16)0xAFFE)),
};
using(var reader =
new BinaryReader(new FileStream(#"C:\temp\data.bin", FileMode.Open)))
{
using(var writer =
new BinaryWriter(new FileStream(#"C:\temp\result.bin", FileMode.Create)))
{
BinaryUtility.Replace(reader, writer, searchAndReplace);
}
}
BinaryUtility code:
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
public static class BinaryUtility
{
public static IEnumerable<byte> GetByteStream(BinaryReader reader)
{
const int bufferSize = 1024;
byte[] buffer;
do
{
buffer = reader.ReadBytes(bufferSize);
foreach (var d in buffer) { yield return d; }
} while (bufferSize == buffer.Length);
}
public static void Replace(BinaryReader reader, BinaryWriter writer, IEnumerable<Tuple<byte[], byte[]>> searchAndReplace)
{
foreach (byte d in Replace(GetByteStream(reader), searchAndReplace)) { writer.Write(d); }
}
public static IEnumerable<byte> Replace(IEnumerable<byte> source, IEnumerable<Tuple<byte[], byte[]>> searchAndReplace)
{
foreach (var s in searchAndReplace)
{
source = Replace(source, s.Item1, s.Item2);
}
return source;
}
public static IEnumerable<byte> Replace(IEnumerable<byte> input, IEnumerable<byte> from, IEnumerable<byte> to)
{
var fromEnumerator = from.GetEnumerator();
fromEnumerator.MoveNext();
int match = 0;
foreach (var data in input)
{
if (data == fromEnumerator.Current)
{
match++;
if (fromEnumerator.MoveNext()) { continue; }
foreach (byte d in to) { yield return d; }
match = 0;
fromEnumerator.Reset();
fromEnumerator.MoveNext();
continue;
}
if (0 != match)
{
foreach (byte d in from.Take(match)) { yield return d; }
match = 0;
fromEnumerator.Reset();
fromEnumerator.MoveNext();
}
yield return data;
}
if (0 != match)
{
foreach (byte d in from.Take(match)) { yield return d; }
}
}
}
public static void BinaryReplace(string sourceFile, byte[] sourceSeq, string targetFile, byte[] targetSeq)
{
FileStream sourceStream = File.OpenRead(sourceFile);
FileStream targetStream = File.Create(targetFile);
try
{
int b;
long foundSeqOffset = -1;
int searchByteCursor = 0;
while ((b=sourceStream.ReadByte()) != -1)
{
if (sourceSeq[searchByteCursor] == b)
{
if (searchByteCursor == sourceSeq.Length - 1)
{
targetStream.Write(targetSeq, 0, targetSeq.Length);
searchByteCursor = 0;
foundSeqOffset = -1;
}
else
{
if (searchByteCursor == 0)
{
foundSeqOffset = sourceStream.Position - 1;
}
++searchByteCursor;
}
}
else
{
if (searchByteCursor == 0)
{
targetStream.WriteByte((byte) b);
}
else
{
targetStream.WriteByte(sourceSeq[0]);
sourceStream.Position = foundSeqOffset + 1;
searchByteCursor = 0;
foundSeqOffset = -1;
}
}
}
}
finally
{
sourceStream.Dispose();
targetStream.Dispose();
}
}
I've been playing with Code Contracts and I really like what I've seen so far. They encourage me to evaluate and explicitly declare my assumptions, which has already helped me to identify a few corner cases I hadn't considered in the code to which I'm adding contracts. Right now I'm playing with trying to enforce more sophisticated invariants. I have one case that currently fails proving and I'm curious if there is a way I can fix this besides simply adding Contract.Assume calls. Here is the class in question, stripped down for ease of reading:
public abstract class MemoryEncoder
{
private const int CapacityDelta = 16;
private int _currentByte;
/// <summary>
/// The current byte index in the encoding stream.
/// This should not need to be modified, under typical usage,
/// but can be used to randomly access the encoding region.
/// </summary>
public int CurrentByte
{
get
{
Contract.Ensures(Contract.Result<int>() >= 0);
Contract.Ensures(Contract.Result<int>() <= Length);
return _currentByte;
}
set
{
Contract.Requires(value >= 0);
Contract.Requires(value <= Length);
_currentByte = value;
}
}
/// <summary>
/// Current number of bytes encoded in the buffer.
/// This may be less than the size of the buffer (capacity).
/// </summary>
public int Length { get; private set; }
/// <summary>
/// The raw buffer encapsulated by the encoder.
/// </summary>
protected internal Byte[] Buffer { get; private set; }
/// <summary>
/// Reserve space in the encoder buffer for the specified number of new bytes
/// </summary>
/// <param name="bytesRequired">The number of bytes required</param>
protected void ReserveSpace(int bytesRequired)
{
Contract.Requires(bytesRequired > 0);
Contract.Ensures((Length - CurrentByte) >= bytesRequired);
//Check if these bytes would overflow the current buffer););
if ((CurrentByte + bytesRequired) > Buffer.Length)
{
//Create a new buffer with at least enough space for the additional bytes required
var newBuffer = new Byte[Buffer.Length + Math.Max(bytesRequired, CapacityDelta)];
//Copy the contents of the previous buffer and replace the original buffer reference
Buffer.CopyTo(newBuffer, 0);
Buffer = newBuffer;
}
//Check if the total length of written bytes has increased
if ((CurrentByte + bytesRequired) > Length)
{
Length = CurrentByte + bytesRequired;
}
}
[ContractInvariantMethod]
private void GlobalRules()
{
Contract.Invariant(Buffer != null);
Contract.Invariant(Length <= Buffer.Length);
Contract.Invariant(CurrentByte >= 0);
Contract.Invariant(CurrentByte <= Length);
}
}
I'm interested in how I can structure the Contract calls in ReserveSpace so that the class invariants are provable. In particular, it complains about (Length <= Buffer.Length) and (CurrentByte <= Length). It's reasonable to me that it can't see that (Length <= Buffer.Length) is satisfied, since it's creating a new buffer and reassigning the reference. Is my only option to add an Assume that the invariants are satisfied?
After fighting with this for a while, I came up with this provable solution (constructor is a dummy to allow for isolated testing):
public abstract class MemoryEncoder
{
private const int CapacityDelta = 16;
private byte[] _buffer;
private int _currentByte;
private int _length;
protected MemoryEncoder()
{
Buffer = new byte[500];
Length = 0;
CurrentByte = 0;
}
/// <summary>
/// The current byte index in the encoding stream.
/// This should not need to be modified, under typical usage,
/// but can be used to randomly access the encoding region.
/// </summary>
public int CurrentByte
{
get
{
return _currentByte;
}
set
{
Contract.Requires(value >= 0);
Contract.Requires(value <= Length);
_currentByte = value;
}
}
/// <summary>
/// Current number of bytes encoded in the buffer.
/// This may be less than the size of the buffer (capacity).
/// </summary>
public int Length
{
get { return _length; }
private set
{
Contract.Requires(value >= 0);
Contract.Requires(value <= _buffer.Length);
Contract.Requires(value >= CurrentByte);
Contract.Ensures(_length <= _buffer.Length);
_length = value;
}
}
/// <summary>
/// The raw buffer encapsulated by the encoder.
/// </summary>
protected internal Byte[] Buffer
{
get { return _buffer; }
private set
{
Contract.Requires(value != null);
Contract.Requires(value.Length >= _length);
_buffer = value;
}
}
/// <summary>
/// Reserve space in the encoder buffer for the specified number of new bytes
/// </summary>
/// <param name="bytesRequired">The number of bytes required</param>
protected void ReserveSpace(int bytesRequired)
{
Contract.Requires(bytesRequired > 0);
Contract.Ensures((Length - CurrentByte) >= bytesRequired);
//Check if these bytes would overflow the current buffer););
if ((CurrentByte + bytesRequired) > Buffer.Length)
{
//Create a new buffer with at least enough space for the additional bytes required
var newBuffer = new Byte[Buffer.Length + Math.Max(bytesRequired, CapacityDelta)];
//Copy the contents of the previous buffer and replace the original buffer reference
Buffer.CopyTo(newBuffer, 0);
Buffer = newBuffer;
}
//Check if the total length of written bytes has increased
if ((CurrentByte + bytesRequired) > Length)
{
Contract.Assume(CurrentByte + bytesRequired <= _buffer.Length);
Length = CurrentByte + bytesRequired;
}
}
[ContractInvariantMethod]
private void GlobalRules()
{
Contract.Invariant(_buffer != null);
Contract.Invariant(_length <= _buffer.Length);
Contract.Invariant(_currentByte >= 0);
Contract.Invariant(_currentByte <= _length);
}
}
The main thing I noticed is that placing invariants on properties gets messy, but seems to solve more easily with invariants on fields. It was also important to place appropriate contractual obligations in the property accessors. I'll have to keep experimenting and see what works and what doesn't. It's an interesting system, but I'd definitely like to know more if anybody has a good 'cheat sheet' on how the prover works.
How do you think what is the best way to find position in the System.Stream where given byte sequence starts (first occurence):
public static long FindPosition(Stream stream, byte[] byteSequence)
{
long position = -1;
/// ???
return position;
}
P.S. The simpliest yet fastest solution is preffered. :)
I've reached this solution.
I did some benchmarks with an ASCII file that was 3.050 KB and 38803 lines.
With a search byte array of 22 bytes in the last line of the file I've got the result in about 2.28 seconds (in a slow/old machine).
public static long FindPosition(Stream stream, byte[] byteSequence)
{
if (byteSequence.Length > stream.Length)
return -1;
byte[] buffer = new byte[byteSequence.Length];
using (BufferedStream bufStream = new BufferedStream(stream, byteSequence.Length))
{
int i;
while ((i = bufStream.Read(buffer, 0, byteSequence.Length)) == byteSequence.Length)
{
if (byteSequence.SequenceEqual(buffer))
return bufStream.Position - byteSequence.Length;
else
bufStream.Position -= byteSequence.Length - PadLeftSequence(buffer, byteSequence);
}
}
return -1;
}
private static int PadLeftSequence(byte[] bytes, byte[] seqBytes)
{
int i = 1;
while (i < bytes.Length)
{
int n = bytes.Length - i;
byte[] aux1 = new byte[n];
byte[] aux2 = new byte[n];
Array.Copy(bytes, i, aux1, 0, n);
Array.Copy(seqBytes, aux2, n);
if (aux1.SequenceEqual(aux2))
return i;
i++;
}
return i;
}
If you treat the stream like another sequence of bytes, you can just search it like you were doing a string search. Wikipedia has a great article on that. Boyer-Moore is a good and simple algorithm for this.
Here's a quick hack I put together in Java. It works and it's pretty close if not Boyer-Moore. Hope it helps ;)
public static final int BUFFER_SIZE = 32;
public static int [] buildShiftArray(byte [] byteSequence){
int [] shifts = new int[byteSequence.length];
int [] ret;
int shiftCount = 0;
byte end = byteSequence[byteSequence.length-1];
int index = byteSequence.length-1;
int shift = 1;
while(--index >= 0){
if(byteSequence[index] == end){
shifts[shiftCount++] = shift;
shift = 1;
} else {
shift++;
}
}
ret = new int[shiftCount];
for(int i = 0;i < shiftCount;i++){
ret[i] = shifts[i];
}
return ret;
}
public static byte [] flushBuffer(byte [] buffer, int keepSize){
byte [] newBuffer = new byte[buffer.length];
for(int i = 0;i < keepSize;i++){
newBuffer[i] = buffer[buffer.length - keepSize + i];
}
return newBuffer;
}
public static int findBytes(byte [] haystack, int haystackSize, byte [] needle, int [] shiftArray){
int index = needle.length;
int searchIndex, needleIndex, currentShiftIndex = 0, shift;
boolean shiftFlag = false;
index = needle.length;
while(true){
needleIndex = needle.length-1;
while(true){
if(index >= haystackSize)
return -1;
if(haystack[index] == needle[needleIndex])
break;
index++;
}
searchIndex = index;
needleIndex = needle.length-1;
while(needleIndex >= 0 && haystack[searchIndex] == needle[needleIndex]){
searchIndex--;
needleIndex--;
}
if(needleIndex < 0)
return index-needle.length+1;
if(shiftFlag){
shiftFlag = false;
index += shiftArray[0];
currentShiftIndex = 1;
} else if(currentShiftIndex >= shiftArray.length){
shiftFlag = true;
index++;
} else{
index += shiftArray[currentShiftIndex++];
}
}
}
public static int findBytes(InputStream stream, byte [] needle){
byte [] buffer = new byte[BUFFER_SIZE];
int [] shiftArray = buildShiftArray(needle);
int bufferSize, initBufferSize;
int offset = 0, init = needle.length;
int val;
try{
while(true){
bufferSize = stream.read(buffer, needle.length-init, buffer.length-needle.length+init);
if(bufferSize == -1)
return -1;
if((val = findBytes(buffer, bufferSize+needle.length-init, needle, shiftArray)) != -1)
return val+offset;
buffer = flushBuffer(buffer, needle.length);
offset += bufferSize-init;
init = 0;
}
} catch (IOException e){
e.printStackTrace();
}
return -1;
}
You'll basically need to keep a buffer the same size as byteSequence so that once you've found that the "next byte" in the stream matches, you can check the rest but then still go back to the "next but one" byte if it's not an actual match.
It's likely to be a bit fiddly whatever you do, to be honest :(
I needed to do this myself, had already started, and didn't like the solutions above. I specifically needed to find where the search-byte-sequence ends. In my situation, I need to fast-forward the stream until after that byte sequence. But you can use my solution for this question too:
var afterSequence = stream.ScanUntilFound(byteSequence);
var beforeSequence = afterSequence - byteSequence.Length;
Here is StreamExtensions.cs
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace System
{
static class StreamExtensions
{
/// <summary>
/// Advances the supplied stream until the given searchBytes are found, without advancing too far (consuming any bytes from the stream after the searchBytes are found).
/// Regarding efficiency, if the stream is network or file, then MEMORY/CPU optimisations will be of little consequence here.
/// </summary>
/// <param name="stream">The stream to search in</param>
/// <param name="searchBytes">The byte sequence to search for</param>
/// <returns></returns>
public static int ScanUntilFound(this Stream stream, byte[] searchBytes)
{
// For this class code comments, a common example is assumed:
// searchBytes are {1,2,3,4} or 1234 for short
// # means value that is outside of search byte sequence
byte[] streamBuffer = new byte[searchBytes.Length];
int nextRead = searchBytes.Length;
int totalScannedBytes = 0;
while (true)
{
FillBuffer(stream, streamBuffer, nextRead);
totalScannedBytes += nextRead; //this is only used for final reporting of where it was found in the stream
if (ArraysMatch(searchBytes, streamBuffer, 0))
return totalScannedBytes; //found it
nextRead = FindPartialMatch(searchBytes, streamBuffer);
}
}
/// <summary>
/// Check all offsets, for partial match.
/// </summary>
/// <param name="searchBytes"></param>
/// <param name="streamBuffer"></param>
/// <returns>The amount of bytes which need to be read in, next round</returns>
static int FindPartialMatch(byte[] searchBytes, byte[] streamBuffer)
{
// 1234 = 0 - found it. this special case is already catered directly in ScanUntilFound
// #123 = 1 - partially matched, only missing 1 value
// ##12 = 2 - partially matched, only missing 2 values
// ###1 = 3 - partially matched, only missing 3 values
// #### = 4 - not matched at all
for (int i = 1; i < searchBytes.Length; i++)
{
if (ArraysMatch(searchBytes, streamBuffer, i))
{
// EG. Searching for 1234, have #123 in the streamBuffer, and [i] is 1
// Output: 123#, where # will be read using FillBuffer next.
Array.Copy(streamBuffer, i, streamBuffer, 0, searchBytes.Length - i);
return i; //if an offset of [i], makes a match then only [i] bytes need to be read from the stream to check if there's a match
}
}
return 4;
}
/// <summary>
/// Reads bytes from the stream, making sure the requested amount of bytes are read (streams don't always fulfill the full request first time)
/// </summary>
/// <param name="stream">The stream to read from</param>
/// <param name="streamBuffer">The buffer to read into</param>
/// <param name="bytesNeeded">How many bytes are needed. If less than the full size of the buffer, it fills the tail end of the streamBuffer</param>
static void FillBuffer(Stream stream, byte[] streamBuffer, int bytesNeeded)
{
// EG1. [123#] - bytesNeeded is 1, when the streamBuffer contains first three matching values, but now we need to read in the next value at the end
// EG2. [####] - bytesNeeded is 4
var bytesAlreadyRead = streamBuffer.Length - bytesNeeded; //invert
while (bytesAlreadyRead < streamBuffer.Length)
{
bytesAlreadyRead += stream.Read(streamBuffer, bytesAlreadyRead, streamBuffer.Length - bytesAlreadyRead);
}
}
/// <summary>
/// Checks if arrays match exactly, or with offset.
/// </summary>
/// <param name="searchBytes">Bytes to search for. Eg. [1234]</param>
/// <param name="streamBuffer">Buffer to match in. Eg. [#123] </param>
/// <param name="startAt">When this is zero, all bytes are checked. Eg. If this value 1, and it matches, this means the next byte in the stream to read may mean a match</param>
/// <returns></returns>
static bool ArraysMatch(byte[] searchBytes, byte[] streamBuffer, int startAt)
{
for (int i = 0; i < searchBytes.Length - startAt; i++)
{
if (searchBytes[i] != streamBuffer[i + startAt])
return false;
}
return true;
}
}
}
Bit old question, but here's my answer. I've found that reading blocks and then searching in that is extremely inefficient compared to just reading one at a time and going from there.
Also, IIRC, the accepted answer would fail if part of the sequence was in one block read and half in another - ex, given 12345, searching for 23, it would read 12, not match, then read 34, not match, etc... haven't tried it, though, seeing as it requires net 4.0. At any rate, this is way simpler, and likely much faster.
static long ReadOneSrch(Stream haystack, byte[] needle)
{
int b;
long i = 0;
while ((b = haystack.ReadByte()) != -1)
{
if (b == needle[i++])
{
if (i == needle.Length)
return haystack.Position - needle.Length;
}
else
i = b == needle[0] ? 1 : 0;
}
return -1;
}
static long Search(Stream stream, byte[] pattern)
{
long start = -1;
stream.Seek(0, SeekOrigin.Begin);
while(stream.Position < stream.Length)
{
if (stream.ReadByte() != pattern[0])
continue;
start = stream.Position - 1;
for (int idx = 1; idx < pattern.Length; idx++)
{
if (stream.ReadByte() != pattern[idx])
{
start = -1;
break;
}
}
if (start > -1)
{
return start;
}
}
return start;
}