I have a byte array that I want to persist in a file. But, I don't want to write to file each time it is updated because it can be updated very frequently. Currently I am planning to use an approach similar to following;
class ThrottleTest
{
private byte[] _internal_data = new byte[256];
CancellationTokenSource _cancel_saving = new CancellationTokenSource();
public void write_to_file()
{
Task.Delay(1000).ContinueWith((task) =>
{
File.WriteAllBytes("path/to/file.data", _internal_data);
}, _cancel_saving.Token);
}
public void operation_that_update_internal_data()
{
// cancel writing previous update
_cancel_saving.Cancel();
/*
* operate on _internal_data
*/
write_to_file();
}
public void another_operation_that_update_internal_data()
{
// cancel writing previous update
_cancel_saving.Cancel();
/*
* operate on _internal_data
*/
write_to_file();
}
}
I don't think this approach would work, because, when I cancel the token once, it will be canceled forever, so it will never write to the file.
First of all, I was wondering if I am on the right track here, and above code can be made to work. If not, what would be the best approach to achieve this behaviour. Moreover, is there a practical way to generalize it to Dictionary<string,byte[]>, where any byte[] can be modified independently?
I would start writing to file by cancelling first the previous operation.
I would also include the cancellation token in the Delay task.
CancellationTokenSource _cancel_saving;
public void write_to_file()
{
_cancel_saving?.Cancel();
_cancel_saving = new CancellationTokenSource();
Task.Delay(1000, _cancel_saving.Token).ContinueWith((task) =>
{
File.WriteAllBytes("path/to/file.data", _internal_data);
}, _cancel_saving.Token);
}
You should use Microsoft's Reactive Framework (aka Rx) - NuGet System.Reactive and add using System.Reactive.Linq; - then you can do this:
public class ThrottleTest
{
private byte[] _internal_data = new byte[256];
private Subject<Unit> _write_to_file = new Subject<Unit>();
public ThrottleTest()
{
_write_to_file
.Throttle(TimeSpan.FromSeconds(1.0))
.Subscribe(_ => File.WriteAllBytes("path/to/file.data", _internal_data));
}
public void write_to_file()
{
_write_to_file.OnNext(Unit.Default);
}
public void operation_that_update_internal_data()
{
/*
* operate on _internal_data
*/
write_to_file();
}
public void another_operation_that_update_internal_data()
{
/*
* operate on _internal_data
*/
write_to_file();
}
}
Your context seems a little odd to me. You are writing all of the bytes, and not using a stream. Putting aside your issue with the cancellation token, delaying a write by 1 second won't reduce the overall load or overall throughput to disk.
This answer has the following assumptions:
You are using an SSD and are concerned about hardware lifetime
This is a low priority activity, where some data loss will be tolerated
This is not a logging activity (otherwise an append to file would work better with a BufferedStream)
This is likely the saving of a serialized C# object tree to disk in case the power goes out
You don't want every change made to the object tree to result in a write to disk.
You don't want to write to disk every second if there has been no change to the object tree.
It should write to disk right away if there hasn't been a write for N seconds
It should wait if there has been a write recently.
Having the WriteAllBytes step as the throttle point is not ideal.
Usage:
rootObject.subObject.value = 9;
rootObject.Save(token);
Support code:
TimeSpan minimumDiskInterval = TimeSpan.FromSeconds(60);
DateTime lastSaveAt = DateTime.MinValue;
bool alreadyQueued = false;
public void Save(CancellationToken token)
{
if (alreadyQueued) //TODO: Interlocked with long instead for atomic memory barrier
return;
alreadyQueued = true; //TODO: Interlocked with long instead for atomic memory barrier
var thisSaveAt = DateTime.UtcNow;
var sinceLastSave = thisSaveAt.Subtract(lastSaveAt);
var difference = TimeSpan.TotalSeconds - sinceLastSave.TotalSeconds;
if (difference < 0)
{
//It has been a while - no need to delay
SaveNow();
}
else
{
//It was done recently
T Task.Delay(TimeSpan.FromSeconds(difference).ContinueWith((task) =>
{
SaveNow();
}, _cancel_saving.Token);
}
}
object fileAccessSync = new object();
public void SaveNow()
{
alreadyQueued = false; //TODO: Interlocked with long instead for atomic memory barrier
byte[] serializedBytes = Serialise(this)
lock (fileAccessSync)
{
File.WriteAllBytes("path/to/file.data", serializedBytes);
}
}
Related
I've built a program that
takes in a list of record data from a file
parses and cleans up each record in a parsing object
outputs it to an output file
So far this has worked on a single thread, but considering the fact that records can exceed 1 million in some cases, we want to implement this in a multi threading context. Multi threading is new to me in .Net, and I've given it a shot but its not working. Below I will provide more details and code:
Main Class (simplified):
public class MainClass
{
parseObject[] parseObjects;
Thread[] threads;
List<InputLineItem> inputList = new List<InputLineItem>();
FileUtils fileUtils = new FileUtils();
public GenParseUtilsThreaded(int threadCount)
{
this.threadCount = threadCount;
Init();
}
public void Init()
{
inputList = fileUtils.GetInputList();
parseObjects = new parseObject[threadCount - 1];
threads = new Thread[threadCount - 1];
InitParseObjects();
Parse();
}
private void InitParseObjects()
{
//using a ref of fileUtils to use as my lock expression
parseObjects[0] = new ParseObject(ref fileUtils);
parseObjects[0].InitValues();
for (int i = 1; i < threadCount - 1; i++)
{
parseObjects[i] = new parseObject(ref fileUtils);
parseObjects[i].InitValues();
}
}
private void InitThreads()
{
for (int i = 0; i < threadCount - 1; i++)
{
Thread t = new Thread(new ThreadStart(parseObjects[0].CleanupAndParseInput));
threads[i] = t;
}
}
public void Parse()
{
try
{
InitThreads();
int objectIndex = 0;
foreach (InputLineItem inputLineItem in inputList)
{
parseObjects[0].inputLineItem = inputLineItem;
threads[objectIndex].Start();
objectIndex++;
if (objectIndex == threadCount)
{
objectIndex = 0;
InitThreads(); //do i need to re-init the threads after I've already used them all once?
}
}
}
catch (Exception e)
{
Console.WriteLine("(286) The following error occured: " + e);
}
}
}
}
And my Parse object class (also simplified):
public class ParseObject
{
public ParserLibrary parser { get; set; }
public FileUtils fileUtils { get; set; }
public InputLineItem inputLineItem { get; set; }
public ParseObject( ref FileUtils fileUtils)
{
this.fileUtils = fileUtils;
}
public void InitValues()
{
//relevant config of parser library object occurs here
}
public void CleanupFields()
{
parser.Clean(inputLineItem.nameValue);
inputLineItem.nameValue = GetCleanupUpValueFromParser();
}
private string GetCleanupFieldValue()
{
//code to extract cleanup up value from parses
}
public void CleanupAndParseInput()
{
CleanupFields();
ParseInput();
}
public void ParseInput()
{
try
{
parser.Parse(InputLineItem.NameValue);
}
catch (Exception e)
{
}
try
{
lock (fileUtils)
{
WriteOutputToFile(inputLineItem);
}
}
catch (Exception e)
{
Console.WriteLine("(414) Failed to write to output: " + e);
}
}
public void WriteOutputToFile(InputLineItem inputLineItem)
{
//writes updated value to output file
}
}
The error I get is when trying to run the Parse function, I get this message:
An unhandled exception of type 'System.AccessViolationException' occurred in GenParse.NET.dll
Attempted to read or write protected memory. This is often an indication that other memory is corrupt.
That being said, I feel like there's a whole lot more that I'm doing wrong here aside from what is causing that error.
I also have further questions:
Do I create multiple parse objects and iteratively feed them to each thread as I'm attempting to do, or should I use one Parse object that gets shared or cloned across each thread?
If, outside the thread, I change a value in the object that I'm passing to the thread, will that change reflect in the object passed to the thread? i.e, is the object passed by value or reference?
Is there a more efficient way for each record to be assigned to a thread and its parse object than I am currently doing with the objectIndex iterator?
THANKS!
Do I create multiple parse objects and iteratively feed them to each thread as I'm attempting to do, or should I use one Parse object that gets shared or cloned across each thread?
You initialize each thread with new ThreadStart(parseObjects[0].CleanupAndParseInput) so all threads will share the same parse object. It is a fairly safe bet that the parse objects are not threadsafe. So each thread should have a separate object. Note that this might not be sufficient, if the parse library uses any global fields it might be non-threadsafe even when using separate objects.
If, outside the thread, I change a value in the object that I'm passing to the thread, will that change reflect in the object passed to the thread? i.e, is the object passed by value or reference?
Objects (i.e. classes) are passed by reference. But any changes to an object are not guaranteed to be visible in other threads unless a memoryBarrier is issued. Most synchronization code (like lock) will issue memory barriers. Keep in mind that any non-atomic operation is unsafe if a field is written an read concurrently.
Is there a more efficient way for each record to be assigned to a thread and its parse object than I am currently doing with the objectIndex iterator?
Using manual threads in this way is very old-school. The modern, easier, and probably faster way is to use a parallel-for loop. This will try to be smart about how many threads it will use and try to adapt chunk sizes to keep the synchronization overhead low.
var items = new List<int>();
ParseObject LocalInit()
{
// Do initalization, This is run once for each thread used
return new ParseObject();
}
ParseObject ThreadMain(int value, ParallelLoopState state, ParseObject threadLocalObject)
{
// Do whatever you need to do
// This is run on multiple threads
return threadLocalObject;
}
void LocalFinally(ParseObject obj)
{
// Do Cleanup for each thread
}
Parallel.ForEach(items, LocalInit, ThreadMain, LocalFinally);
As a final note, I would advice against using multithreading unless you are familiar with the potential dangers and pitfalls it involves, at least for any project where the result is important. There are many ways to screw up and make a program that will work 99.9% of the time, and silently corrupt data the remaining 0.1% of the time.
I'm trying to create a stream application based on Spotify's libspotify SDK.
To achieve this in C# I'm using the ohLibspotify bindings and wrapper. This is only a thin abstraction layer so most of it will be a 1:1 mapping to the libspotify SDK. To play the incoming PCM data I'm using the NAudio library.
Most of the times I can play the first track. Then when I load the second one I get a AccessViolationException whilst trying to call sp_session_player_load(). Also this sometimes happens the first time I try to play a track and sometimes it happens the third time.
This is the function I use to play a track.
public void playTrack(string track, string juke)
{
new Thread(new ThreadStart(() =>
{
var playable = Link.CreateFromString(string.Format("spotify:track:{0}", track)).AsTrack();
if (playing)
{
player.Pause();
App.Logic.spotify.sp_session.PlayerUnload();
}
buffer = new BufferedWaveProvider(new WaveFormat())
{
BufferDuration = TimeSpan.FromSeconds(120),
DiscardOnBufferOverflow = false
};
var waitEvent = new AutoResetEvent(false);
while (!playable.IsLoaded())
{
waitEvent.WaitOne(30);
}
App.Logic.spotify.sp_session.PlayerLoad(playable);
App.Logic.spotify.sp_session.PlayerPlay(true);
player = new WaveOut();
player.Init(buffer);
player.Play();
playing = true;
})).Start();
}
The AccessViolationException occurs on line 6 of the following piece of code within the wrapper library.
[DllImport("libspotify")]
internal static extern SpotifyError sp_session_player_load(IntPtr #session, IntPtr #track);
public void PlayerLoad(Track #track)
{
SpotifyError errorValue;
errorValue = NativeMethods.sp_session_player_load(this._handle, track._handle);
SpotifyMarshalling.CheckError(errorValue);
}
The streaming callbacks:
public override void GetAudioBufferStats(SpotifySession session, out AudioBufferStats stats)
{
stats = new AudioBufferStats()
{
samples = App.Logic.spotify.player.buffer.BufferedBytes / 2,
stutter = 0
};
}
public override int MusicDelivery(SpotifySession session, AudioFormat format, IntPtr frames, int num_frames) {
int incoming_size = num_frames * format.channels * 2;
try
{
if (incoming_size > sample_buffer.Length)
{
short rendered_frames = Convert.ToInt16(Math.Floor((sample_buffer.Length / format.channels / 2d)));
short rendered_size = Convert.ToInt16(rendered_frames * format.channels * 2);
Marshal.Copy(frames, sample_buffer, 0, rendered_size);
App.Logic.spotify.player.buffer.AddSamples(sample_buffer, 0, rendered_size);
return rendered_frames;
}
else
{
Marshal.Copy(frames, sample_buffer, 0, incoming_size);
App.Logic.spotify.player.buffer.AddSamples(sample_buffer, 0, incoming_size);
return num_frames;
}
}
catch (Exception e)
{
return 0;
}
}
As #Weeble said in his comment to my question. It is very hard to diagnose the possible source of an AccessViolationException. In my case it was threading, there were multiple threads accessing the Spotify session object at once.
If this could be your problem as well you might want to look at this article. It talks about thread synchronization in C# and VB.Net. It's very easy really.
lock(sessionLock)
{
App.Logic.spotify.sp_session.PlayerLoad(playable);
App.Logic.spotify.sp_session.PlayerPlay(true);
}
The sessionLock object can be a simple instantiation of the Object type. Though it's not recommended to use any "actual" objects for this. In my case I did the following:
public class Player
{
private object sessionLock = new object();
// Rest of code with locks
}
This way you can lock the sessionLock object every time you want to access the, in this case, SpotifySession object. So that when thread 1 is loading a song and thread 2 wants to process the events at the same time, thread 2 will be blocked until the lock on sessionLock has been lifted.
As #Weeble suggested there are possibly some other things you might want to look into if threading is not the issue.
You might not be handling spotify's ref-counting correctly, and accidentally decreasing a ref-count too early or forgetting to increment one somewhere it is necessary.
You might be corrupting memory in the music callbacks that deal with native pointers.
There might be a bug in ohLibspotify or libspotify. If you think this is the case then go to the ohLibspotify repo or the openhome forum to report your issue.
There's already a question about this issue, but it's not telling me what I need to know:
Let's assume I have a web application, and there's a lot of logging on every roundtrip. I don't want to open a debate about why there's so much logging, or how can I do less loggin operations. I want to know what possibilities I have in order to make this logging issue performant and clean.
So far, I've implemented declarative (attribute based) and imperative logging, which seems to be a cool and clean way of doing it... now, what can I do about performance, assuming I can expect those logs to take more time than expected. Is it ok to open a thread and leave that work to it?
Things I would consider:
Use an efficient file format to minimise the quantity of data to be written (e.g. XML and text formats are easy to read but usually terribly inefficient - the same information can be stored in a binary format in a much smaller space). But don't spend lots of CPU time trying to pack data "optimally". Just go for a simple format that is compact but fast to write.
Test use of compression on the logs. This may not be the case with a fast SSD but in most I/O situations the overhead of compressing data is less than the I/O overhead, so compression gives a net gain (although it is a compromise - raising CPU usage to lower I/O usage).
Only log useful information. No matter how important you think everything is, it's likely you can find something to cut out.
Eliminate repeated data. e.g. Are you logging a client's IP address or domain name repeatedly? Can these be reported once for a session and then not repeated? Or can you store them in a map file and use a compact index value whenever you need to reference them? etc
Test whether buffering the logged data in RAM helps improve performance (e.g. writing a thousand 20 byte log records will mean 1,000 function calls and could cause a lot of disk seeking and other write overheads, while writing a single 20,000 byte block in one burst means only one function call and could give significant performance increase and maximise the burst rate you get out to disk). Often writing blocks in sizes like (4k, 16k, 32, 64k) of data works well as it tends to fit the disk and I/O architecture (but check your specific architecture for clues about what sizes might improve efficiency). The down side of a RAM buffer is that if there is a power outage you will lose more data. So you may have to balance performance against robustness.
(Especially if you are buffering...) Dump the information to an in-memory data structure and pass it to another thread to stream it out to disk. This will help stop your primary thread being held up by log I/O. Take care with threads though - for example, you may have to consider how you will deal with times when you are creating data faster than it can be logged for short bursts - do you need to implement a queue, etc?
Are you logging multiple streams? Can these be multiplexed into a single log to possibly reduce disk seeking and the number of open files?
Is there a hardware solution that will give a large bang for your buck? e.g. Have you used SSD or RAID disks? Will dumping the data to a different server help or hinder? It may not always make much sense to spend $10,000 of developer time making something perform better if you can spend $500 to simply upgrade the disk.
I use the code below to Log. It is a singleton that accepts Logging and puts every message into a concurrentqueue. Every two seconds it writes all that has come in to the disk. Your app is now only delayed by the time it takes to put every message in the list. It's my own code, feel free to use it.
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using System.Threading;
using System.Windows.Forms;
namespace FastLibrary
{
public enum Severity : byte
{
Info = 0,
Error = 1,
Debug = 2
}
public class Log
{
private struct LogMsg
{
public DateTime ReportedOn;
public string Message;
public Severity Seriousness;
}
// Nice and Threadsafe Singleton Instance
private static Log _instance;
public static Log File
{
get { return _instance; }
}
static Log()
{
_instance = new Log();
_instance.Message("Started");
_instance.Start("");
}
~Log()
{
Exit();
}
public static void Exit()
{
if (_instance != null)
{
_instance.Message("Stopped");
_instance.Stop();
_instance = null;
}
}
private ConcurrentQueue<LogMsg> _queue = new ConcurrentQueue<LogMsg>();
private Thread _thread;
private string _logFileName;
private volatile bool _isRunning;
public void Message(string msg)
{
_queue.Enqueue(new LogMsg { ReportedOn = DateTime.Now, Message = msg, Seriousness = Severity.Info });
}
public void Message(DateTime time, string msg)
{
_queue.Enqueue(new LogMsg { ReportedOn = time, Message = msg, Seriousness = Severity.Info });
}
public void Message(Severity seriousness, string msg)
{
_queue.Enqueue(new LogMsg { ReportedOn = DateTime.Now, Message = msg, Seriousness = seriousness });
}
public void Message(DateTime time, Severity seriousness, string msg)
{
_queue.Enqueue(new LogMsg { ReportedOn = time, Message = msg, Seriousness = seriousness });
}
private void Start(string fileName = "", bool oneLogPerProcess = false)
{
_isRunning = true;
// Unique FileName with date in it. And ProcessId so the same process running twice will log to different files
string lp = oneLogPerProcess ? "_" + System.Diagnostics.Process.GetCurrentProcess().Id : "";
_logFileName = fileName == ""
? DateTime.Now.Year.ToString("0000") + DateTime.Now.Month.ToString("00") +
DateTime.Now.Day.ToString("00") + lp + "_" +
System.IO.Path.GetFileNameWithoutExtension(Application.ExecutablePath) + ".log"
: fileName;
_thread = new Thread(LogProcessor);
_thread.IsBackground = true;
_thread.Start();
}
public void Flush()
{
EmptyQueue();
}
private void EmptyQueue()
{
while (_queue.Any())
{
var strList = new List<string>();
//
try
{
// Block concurrent writing to file due to flush commands from other context
lock (_queue)
{
LogMsg l;
while (_queue.TryDequeue(out l)) strList.Add(l.ReportedOn.ToLongTimeString() + "|" + l.Seriousness + "|" + l.Message);
if (strList.Count > 0)
{
System.IO.File.AppendAllLines(_logFileName, strList);
strList.Clear();
}
}
}
catch
{
//ignore errors on errorlogging ;-)
}
}
}
public void LogProcessor()
{
while (_isRunning)
{
EmptyQueue();
// Sleep while running so we write in efficient blocks
if (_isRunning) Thread.Sleep(2000);
else break;
}
}
private void Stop()
{
// This is never called in the singleton.
// But we made it a background thread so all will be killed anyway
_isRunning = false;
if (_thread != null)
{
_thread.Join(5000);
_thread.Abort();
_thread = null;
}
}
}
}
Check if the logger is debug enabled before calling logger.debug, this means your code does not have to evaluate the message string when debug is turned off.
if (_logger.IsDebugEnabled) _logger.Debug($"slow old string {this.foo} {this.bar}");
I'm building a multithreaded app in .net.
I have a thread that listens to a connection (abstract, serial, tcp...).
When it receives a new message, it adds it to via AddMessage. Which then call startSpool. startSpool checks to see if the spool is already running and if it is, returns, otherwise, starts it in a new thread. The reason for this is, the messages HAVE to be processed serially, FIFO.
So, my questions are...
Am I going about this the right way?
Are there better, faster, cheaper patterns out there?
My apologies if there is a typo in my code, I was having problems copying and pasting.
ConcurrentQueue<IMyMessage > messages = new ConcurrentQueue<IMyMessage>();
const int maxSpoolInstances = 1;
object lcurrentSpoolInstances;
int currentSpoolInstances = 0;
Thread spoolThread;
public void AddMessage(IMyMessage message)
{
this.messages.Add(message);
this.startSpool();
}
private void startSpool()
{
bool run = false;
lock (lcurrentSpoolInstances)
{
if (currentSpoolInstances <= maxSpoolInstances)
{
this.currentSpoolInstances++;
run = true;
}
else
{
return;
}
}
if (run)
{
this.spoolThread = new Thread(new ThreadStart(spool));
this.spoolThread.Start();
}
}
private void spool()
{
Message.ITimingMessage message;
while (this.messages.Count > 0)
{
// TODO: Is this below line necessary or does the TryDequeue cover this?
message = null;
this.messages.TryDequeue(out message);
if (message != null)
{
// My long running thing that does something with this message.
}
}
lock (lcurrentSpoolInstances)
{
this.currentSpoolInstances--;
}
}
This would be easier using BlockingCollection<T> instead of ConcurrentQueue<T>.
Something like this should work:
class MessageProcessor : IDisposable
{
BlockingCollection<IMyMessage> messages = new BlockingCollection<IMyMessage>();
public MessageProcessor()
{
// Move this to constructor to prevent race condition in existing code (you could start multiple threads...
Task.Factory.StartNew(this.spool, TaskCreationOptions.LongRunning);
}
public void AddMessage(IMyMessage message)
{
this.messages.Add(message);
}
private void Spool()
{
foreach(IMyMessage message in this.messages.GetConsumingEnumerable())
{
// long running thing that does something with this message.
}
}
public void FinishProcessing()
{
// This will tell the spooling you're done adding, so it shuts down
this.messages.CompleteAdding();
}
void IDisposable.Dispose()
{
this.FinishProcessing();
}
}
Edit: If you wanted to support multiple consumers, you could handle that via a separate constructor. I'd refactor this to:
public MessageProcessor(int numberOfConsumers = 1)
{
for (int i=0;i<numberOfConsumers;++i)
StartConsumer();
}
private void StartConsumer()
{
// Move this to constructor to prevent race condition in existing code (you could start multiple threads...
Task.Factory.StartNew(this.spool, TaskCreationOptions.LongRunning);
}
This would allow you to start any number of consumers. Note that this breaks the rule of having it be strictly FIFO - the processing will potentially process "numberOfConsumer" elements in blocks with this change.
Multiple producers are already supported. The above is thread safe, so any number of threads can call Add(message) in parallel, with no changes.
I think that Reed's answer is the best way to go, but for the sake of academics, here is an example using the concurrent queue -- you had some races in the code that you posted (depending upon how you handle incrementing currnetSpoolInstances)
The changes I made (below) were:
Switched to a Task instead of a Thread (uses thread pool instead of incurring the cost of creating a new thread)
added the code to increment/decrement your spool instance count
changed the "if currentSpoolInstances <= max ... to just < to avoid having one too many workers (probably just a typo)
changed the way that empty queues were handled to avoid a race: I think you had a race, where your while loop could have tested false, (you thread begins to exit), but at that moment, a new item is added (so your spool thread is exiting, but your spool count > 0, so your queue stalls).
private ConcurrentQueue<IMyMessage> messages = new ConcurrentQueue<IMyMessage>();
const int maxSpoolInstances = 1;
object lcurrentSpoolInstances = new object();
int currentSpoolInstances = 0;
public void AddMessage(IMyMessage message)
{
this.messages.Enqueue(message);
this.startSpool();
}
private void startSpool()
{
lock (lcurrentSpoolInstances)
{
if (currentSpoolInstances < maxSpoolInstances)
{
this.currentSpoolInstances++;
Task.Factory.StartNew(spool, TaskCreationOptions.LongRunning);
}
}
}
private void spool()
{
IMyMessage message;
while (true)
{
// you do not need to null message because it is an "out" parameter, had it been a "ref" parameter, you would want to null it.
if(this.messages.TryDequeue(out message))
{
// My long running thing that does something with this message.
}
else
{
lock (lcurrentSpoolInstances)
{
if (this.messages.IsEmpty)
{
this.currentSpoolInstances--;
return;
}
}
}
}
}
Check 'Pipelines pattern': http://msdn.microsoft.com/en-us/library/ff963548.aspx
Use BlockingCollection for the 'buffers'.
Each Processor (e.g. ReadStrings, CorrectCase, ..), should run in a Task.
HTH..
Environment : .net 4.0
I have a task that transforms XML files with a XSLT stylesheet, here is my code
public string TransformFileIntoTempFile(string xsltPath,
string xmlPath)
{
var transform = new MvpXslTransform();
transform.Load(xsltPath, new XsltSettings(true, false),
new XmlUrlResolver());
string tempPath = Path.GetTempFileName();
using (var writer = new StreamWriter(tempPath))
{
using (XmlReader reader = XmlReader.Create(xmlPath))
{
transform.Transform(new XmlInput(reader), null,
new XmlOutput(writer));
}
}
return tempPath;
}
I have X threads that can launch this task in parallel.
Sometimes my input file are about 300 MB, sometimes it's only a few MB.
My problem : I get OutOfMemoryException when my program try to transform some big XML files in the same time.
How can I avoid these OutOfMemoryEception ? My idea is to stop a thread before executing the task until there is enough available memory, but I don't know how to do that. Or there is some other solution (like putting my task in a distinct application).
Thanks
I don't recommend blocking a thread. In worst case, you'll just end up starving the task that could potentially free the memory you needed, leading to deadlock or very bad performance in general.
Instead, I suggest you keep a work queue with priorities. Get the tasks from the Queue scheduled fairly across a thread pool. Make sure no thread ever blocks on a wait operation, instead repost the task to the queue (with a lower priority).
So what you'd do (e.g. on receiving an OutOfMemory exception), is post the same job/task onto the queue and terminate the current task, freeing up the thread for another task.
A simplistic approach is to use LIFO which ensures that a task posted to the queue will have 'lower priority' than any other jobs already on that queue.
Since .NET Framework 4 we have API to work with good old Memory-Mapped Files feature which is available many years within from Win32API, so now you can use it from the .NET Managed Code.
For your task better fit "Persisted memory-mapped files" option,
MSDN:
Persisted files are memory-mapped files that are associated with a
source file on a disk. When the last process has finished working with
the file, the data is saved to the source file on the disk. These
memory-mapped files are suitable for working with extremely large
source files.
On the page of MemoryMappedFile.CreateFromFile() method description you can find a nice example describing how to create a memory mapped Views for the extremely large file.
EDIT: Update regarding considerable notes in comments
Just found method MemoryMappedFile.CreateViewStream() which creates a stream of type MemoryMappedViewStream which is inherited from a System.IO.Stream.
I believe you can create an instance of XmlReader from this stream and then instantiate your custom implementation of the XslTransform using this reader/stream.
EDIT2: remi bourgarel (OP) already tested this approach and looks like this particular XslTransform implementation (I wonder whether ANY would) wont work with MM-View stream in way which was supposed
The main problem is that you are loading the entire Xml file. If you were to just transform-as-you-read the out of memory problem should not normally appear.
That being said I found a MS support article which suggests how it can be done:
http://support.microsoft.com/kb/300934
Disclaimer: I did not test this so if you use it and it works please let us know.
You could consider using a queue to throttle how many concurrent transforms are being done based on some sort of artificial memory boundary e.g. file size. Something like the following could be used.
This sort of throttling strategy can be combined with maximum number of concurrent files being processed to ensure your disk is not being thrashed too much.
NB I have not included necessary try\catch\finally around execution to ensure that exceptions are propogated to calling thread and Waithandles are always released. I could go into further detail here.
public static class QueuedXmlTransform
{
private const int MaxBatchSizeMB = 300;
private const double MB = (1024 * 1024);
private static readonly object SyncObj = new object();
private static readonly TaskQueue Tasks = new TaskQueue();
private static readonly Action Join = () => { };
private static double _CurrentBatchSizeMb;
public static string Transform(string xsltPath, string xmlPath)
{
string tempPath = Path.GetTempFileName();
using (AutoResetEvent transformedEvent = new AutoResetEvent(false))
{
Action transformTask = () =>
{
MvpXslTransform transform = new MvpXslTransform();
transform.Load(xsltPath, new XsltSettings(true, false),
new XmlUrlResolver());
using (StreamWriter writer = new StreamWriter(tempPath))
using (XmlReader reader = XmlReader.Create(xmlPath))
{
transform.Transform(new XmlInput(reader), null,
new XmlOutput(writer));
}
transformedEvent.Set();
};
double fileSizeMb = new FileInfo(xmlPath).Length / MB;
lock (SyncObj)
{
if ((_CurrentBatchSizeMb += fileSizeMb) > MaxBatchSizeMB)
{
_CurrentBatchSizeMb = fileSizeMb;
Tasks.Queue(isParallel: false, task: Join);
}
Tasks.Queue(isParallel: true, task: transformTask);
}
transformedEvent.WaitOne();
}
return tempPath;
}
private class TaskQueue
{
private readonly object _syncObj = new object();
private readonly Queue<QTask> _tasks = new Queue<QTask>();
private int _runningTaskCount;
public void Queue(bool isParallel, Action task)
{
lock (_syncObj)
{
_tasks.Enqueue(new QTask { IsParallel = isParallel, Task = task });
}
ProcessTaskQueue();
}
private void ProcessTaskQueue()
{
lock (_syncObj)
{
if (_runningTaskCount != 0) return;
while (_tasks.Count > 0 && _tasks.Peek().IsParallel)
{
QTask parallelTask = _tasks.Dequeue();
QueueUserWorkItem(parallelTask);
}
if (_tasks.Count > 0 && _runningTaskCount == 0)
{
QTask serialTask = _tasks.Dequeue();
QueueUserWorkItem(serialTask);
}
}
}
private void QueueUserWorkItem(QTask qTask)
{
Action completionTask = () =>
{
qTask.Task();
OnTaskCompleted();
};
_runningTaskCount++;
ThreadPool.QueueUserWorkItem(_ => completionTask());
}
private void OnTaskCompleted()
{
lock (_syncObj)
{
if (--_runningTaskCount == 0)
{
ProcessTaskQueue();
}
}
}
private class QTask
{
public Action Task { get; set; }
public bool IsParallel { get; set; }
}
}
}
Update
Fixed bug in maintaining batch size when rolling over to next batch window:
_CurrentBatchSizeMb = fileSizeMb;