I need to convert 5,000,000 records from the DB to JSON, but I run out of memory after about 4,000 records.
I'm using a Task per record, thinking that when the task completes, the GC will clear everything the thread used from memory.
public class Program
{
public static void Main(string[] args)
{
Program p = new Program();
p.ExportUsingTask();
}
public void ExportUsingTask()
{
List<int> ids = Program.LoadID(); // trying not to keep DbContext references, so the GC can free memory
GC.Collect(); // GC frees ~130 MB here; the DbContext has no references anymore
GC.WaitForPendingFinalizers();
foreach (int item in ids)
{
Task job = new Task(() => new Program().Process(item));
job.RunSynchronously();
Task.WaitAll(job);
job.Dispose();
job = null;
GC.Collect(); // GC doesn't free memory; usage grows at each iteration until OutOfMemoryException
GC.WaitForPendingFinalizers();
}
}
public static List<int> LoadID()
{
List<int> ids = new List<int>();
using (Context db = new Context())
{
ids = db.Alfa.Where(a => a.date.Year == 2019).Select(a => a.id).ToList(); // loads 500,000 ids from the DB, uses ~130 MB of memory
// some business logic here, but it isn't the problem; memory is freed after execution anyway
db.Dispose();
}
return ids;
}
public void Process(int id)
{
Beta b = GetBetaFromAlfa(id); // Beta is the JSON model that I need to save to a file
string json = Newtonsoft.Json.JsonConvert.SerializeObject(b);
b = null;
using (StreamWriter sw = System.IO.File.AppendText(@"c:\MyFile.json"))
{
sw.Write(json);
sw.Close();
sw.Dispose();
}
GC.Collect(); // GC doesn't free memory
GC.WaitForPendingFinalizers();
}
public static Beta GetBetaFromAlfa(int idAlfa)
{
Beta b = null; // Beta is the JSON model that I need to save to a file
using (Context db = new Context())
{
Alfa a = db.Alfa.Single(x => x.id == idAlfa); // Alfa is my model in the DB
b = ConvertAlfaToBeta(a);
db.Dispose();
}
GC.Collect(); // GC doesn't free memory
GC.WaitForPendingFinalizers();
return b;
}
public static Beta ConvertAlfaToBeta(Alfa alfa)
{
// business logic: only simple property mapping (int, string, decimal, DateTime, etc.), e.g.:
Beta beta = new Beta();
// beta.id = alfa.id;
// beta.name = alfa.name;
return beta;
}
}
public class Alfa { ... }
public class Beta { ... }
In my first attempt, I used a single loop that read records one by one and, every 100 records, appended the accumulated JSON to the file. But I still ran out of memory at around 4,000 records with this loop:
public void ExportUsingLoop()
{
List<int> ids = Program.LoadID(); // trying not to keep DbContext references, so the GC can free memory
GC.Collect(); // GC frees ~130 MB here; the DbContext has no references anymore
GC.WaitForPendingFinalizers();
int count = 0;
StringBuilder content = new StringBuilder();
foreach (int item in ids)
{
count++;
Beta b = GetBetaFromAlfa(item); // Beta is the JSON model that I need to save to a file
string json = Newtonsoft.Json.JsonConvert.SerializeObject(b);
content.AppendLine(json);
b = null;
json = null;
if(count % 100 == 0)
{
using (StreamWriter sw = System.IO.File.AppendText(@"c:\MyFile.json"))
{
sw.Write(content.ToString());
content.Clear(); // just for clarification
sw.Close();
sw.Dispose();
}
GC.Collect(); // GC doesn't free memory; usage grows at each iteration until OutOfMemoryException
GC.WaitForPendingFinalizers();
}
}
}
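A minimal sketch of the pattern that usually fixes this kind of export, assuming Entity Framework's AsNoTracking is available on this Context: hold one writer open for the whole run, use a fresh Context per batch, and write each record as soon as it is serialized instead of accumulating JSON in memory.
public void ExportStreaming()
{
    List<int> ids = Program.LoadID();
    using (StreamWriter sw = System.IO.File.AppendText(@"c:\MyFile.json"))
    {
        for (int i = 0; i < ids.Count; i += 100)
        {
            List<int> batch = ids.Skip(i).Take(100).ToList(); // requires System.Linq
            // fresh context per batch, so tracked entities never accumulate
            using (Context db = new Context())
            {
                foreach (Alfa a in db.Alfa.AsNoTracking().Where(x => batch.Contains(x.id)))
                {
                    Beta b = ConvertAlfaToBeta(a);
                    sw.WriteLine(Newtonsoft.Json.JsonConvert.SerializeObject(b)); // write immediately, keep nothing around
                }
            }
        }
    }
}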
Related
I am using a basic StreamReader to loop through a CSV file of about 65 GB (450 million rows).
using (var sr = new StreamReader(currentFileName))
{
string headerLine = sr.ReadLine(); // skip the headers
string currentTick;
while ((currentTick = sr.ReadLine()) != null)
{
string[] tickValue = currentTick.Split(',');
// Ticks are formatted and added to the array in order to insert them afterwards.
}
}
This creates a list that holds the ticks belonging to a candle and then calls the insertTickBatch function.
private async static Task insertTickBatch(List<Tick> ticks)
{
if (ticks != null && ticks.Any())
{
using (DatabaseEntities db = new DatabaseEntities())
{
db.Configuration.LazyLoadingEnabled = false;
int currentCandleId = ticks.First().CandleId;
var candle = db.Candles.Where(c => c.Id == currentCandleId).FirstOrDefault();
foreach (var curTick in ticks)
{
candle.Ticks.Add(curTick);
}
await db.SaveChangesAsync();
db.Dispose();
Thread.Sleep(10);
}
}
}
This, however, takes about 15 years to complete, and my intention is to speed it up. How do I achieve this?
I am not sure which EF version you are using, but if it is available, try this instead of your foreach loop:
db.Ticks.AddRange(ticks);
Also, CsvHelper is a nice package that can convert your entire file into a list of Tick objects, and of course the Thread.Sleep has to go.
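A rough sketch of that combination (assumptions: a Tick class whose property names match the CSV headers, the DatabaseEntities context from the question, a recent CsvHelper version whose CsvReader takes a CultureInfo, and an arbitrary batch size):
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using CsvHelper;

// Stream the CSV into Tick objects and insert them in fixed-size batches,
// using a fresh context per batch so the change tracker never grows unbounded.
const int batchSize = 10000;
var batch = new List<Tick>(batchSize);

using (var reader = new StreamReader(currentFileName))
using (var csv = new CsvReader(reader, CultureInfo.InvariantCulture))
{
    foreach (var tick in csv.GetRecords<Tick>())
    {
        batch.Add(tick);
        if (batch.Count < batchSize) continue;

        using (var db = new DatabaseEntities())
        {
            db.Configuration.AutoDetectChangesEnabled = false; // EF6: skip per-entity change detection
            db.Ticks.AddRange(batch);
            db.SaveChanges();
        }
        batch.Clear();
    }
}
// ...flush the final partial batch the same way...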
I have an ASP.NET Core 3.0 API app that returns a PDF file of a WPF page. It generates the WPF page itself and then converts it to XPS so I can convert that to PDF, but when it's done the API doesn't release the memory, so usage builds up until it crashes. I call GC.Collect each time a PDF has been generated, but with no real success.
The class I use to generate the PDF from a WPF app; it implements IDisposable:
public QueryAndGenerate(int orderNumber, string XPSPath, string PDFPath, bool throwExceptions = true)
{
Helper.Log("QueryAndGenerate start");
this.XPSPath = XPSPath;
this.PDFPath = PDFPath;
List<byte[]> Bytes = new List<byte[]>();
var rows = QueryAndGenerate.GetDataRows(Properties.Resources.joborderQuery, new QueryAndGenerate.MySqlParameter("ORDERNUMBER", orderNumber));
PDFPaths = new List<string>();
Helper.Log(string.Format("rows from query: {0} length: {1}", rows, rows.Count));
try
{
foreach (var row in rows)
{
isMultipleGuidenote = true;
QueryAndGenerate queryAndGenerate = new QueryAndGenerate(orderNumber, row.Field<int>("JOBORDERNUMBER"), XPSPath, PDFPath, throwExceptions);
Bytes.Add(File.ReadAllBytes(PDFPath));
Helper.Log("generated file: "+ row);
}
}
catch (Exception e)
{
Helper.Log(e);
}
PdfDocument outputDocument = new PdfDocument();
foreach (byte[] pdfBytes in Bytes)
{
if (pdfBytes.Length != 0)
{
using (MemoryStream stream = new MemoryStream(pdfBytes))
{
PdfDocument inputDocument = PdfReader.Open(stream, PdfDocumentOpenMode.Import);
foreach (PdfPage page in inputDocument.Pages)
{
outputDocument.AddPage(page);
}
}
}
}
outputDocument.Save(this.PDFPath);
GC.Collect();
GC.WaitForPendingFinalizers();
GC.Collect();
}
public void Dispose()
{
xpsControls = null;
jobRow = null;
checkRows = null;
warrantyRows = null;
subAssemblyRows = null;
detailRows = null;
detailToolRows = null;
detailItemRows = null;
PDFPaths = null;
cadmanCheck = null;
}
The issue you are probably facing is fragmentation of the large object heap (LOH). The article The Dangers of the Large Object Heap explains the problem very well. We faced the same issue in one of our REST APIs, which generates PDF files. The problem is that the byte[] arrays used are most likely larger than 85 KB, which causes them to be placed on the LOH. A plain GC.Collect() does not compact the large object heap by default. If you want to prevent this fragmentation from occurring, you should set the GC options before calling GC.Collect(), as described in the section "Getting rid of large object heap fragmentation" of How to (not) use the large object heap in .Net.
So basically replace your GC-calls with this:
GCSettings.LargeObjectHeapCompactionMode = GCLargeObjectHeapCompactionMode.CompactOnce;
GC.Collect(generation: 2, GCCollectionMode.Forced, blocking: true, compacting: true);
GC.WaitForPendingFinalizers();
GC.Collect(generation: 2, GCCollectionMode.Forced, blocking: true, compacting: true);
I have a simple program that just reads an XPS file. I've read the following post, and it did solve part of the issue.
Opening XPS document in .Net causes a memory leak
class Program
{
static int intCounter = 0;
static object _intLock = new object();
static int getInt()
{
lock (_intLock)
{
return intCounter++;
}
}
static void Main(string[] args)
{
Console.ReadLine();
for (int i = 0; i < 100; i++)
{
Thread t = new Thread(() =>
{
var ogXps = File.ReadAllBytes(@"C:\Users\Nathan\Desktop\Objective.xps");
readXps(ogXps);
Console.WriteLine(getInt().ToString());
});
t.SetApartmentState(ApartmentState.STA);
t.Start();
Thread.Sleep(50);
}
Console.ReadLine();
}
static void readXps(byte[] originalXPS)
{
try
{
MemoryStream inputStream = new MemoryStream(originalXPS);
string memoryStreamUri = "memorystream://" + Path.GetFileName(Guid.NewGuid().ToString() + ".xps");
Uri packageUri = new Uri(memoryStreamUri);
Package oldPackage = Package.Open(inputStream);
PackageStore.AddPackage(packageUri, oldPackage);
XpsDocument xpsOld = new XpsDocument(oldPackage, CompressionOption.Normal, memoryStreamUri);
FixedDocumentSequence seqOld = xpsOld.GetFixedDocumentSequence();
//The following did solve some of the memory issue
//-----------------------------------------------
var docPager = seqOld.DocumentPaginator;
docPager.ComputePageCount();
for (int i = 0; i < docPager.PageCount; i++)
{
FixedPage fp = docPager.GetPage(i).Visual as FixedPage;
fp.UpdateLayout();
}
seqOld = null;
//-----------------------------------------------
xpsOld.Close();
oldPackage.Close();
oldPackage = null;
inputStream.Close();
inputStream.Dispose();
inputStream = null;
PackageStore.RemovePackage(packageUri);
}
catch (Exception e)
{
}
}
}
The program reads the XPS file a hundred times. (Memory-profiler screenshots, taken before and after applying the fix, were attached here.)
So the fix suggested in the post did eliminate some objects. However, I found that objects like Dispatcher, ContextLayoutManager and MediaContext still exist in memory, and their count is exactly 100. Is this normal behavior or a memory leak? How do I fix this? Thanks.
25/7/2018 Update
Adding the line Dispatcher.CurrentDispatcher.InvokeShutdown(); got rid of the Dispatcher, ContextLayoutManager and MediaContext objects. I don't know if this is the ideal way to fix it.
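Presumably that call goes at the end of each worker thread's delegate, something like this (a sketch of the placement only; it needs using System.Windows.Threading):
Thread t = new Thread(() =>
{
    var ogXps = File.ReadAllBytes(@"C:\Users\Nathan\Desktop\Objective.xps");
    readXps(ogXps);
    Console.WriteLine(getInt().ToString());

    // shut down this STA thread's Dispatcher so its Dispatcher, ContextLayoutManager
    // and MediaContext instances can be collected once the thread exits
    Dispatcher.CurrentDispatcher.InvokeShutdown();
});
t.SetApartmentState(ApartmentState.STA);
t.Start();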
It looks like the classes you're left with come from the XpsDocument, which implements IDisposable, but you never call Dispose. A few more of the classes you use implement that same interface; as a rule of thumb, either wrap them in a using statement so their Dispose method is guaranteed to be called, or call their Dispose method yourself.
An improved version of your readXps method will look like this:
static void readXps(byte[] originalXPS)
{
try
{
using (MemoryStream inputStream = new MemoryStream(originalXPS))
{
string memoryStreamUri = "memorystream://" + Path.GetFileName(Guid.NewGuid().ToString() + ".xps");
Uri packageUri = new Uri(memoryStreamUri);
using(Package oldPackage = Package.Open(inputStream))
{
PackageStore.AddPackage(packageUri, oldPackage);
using(XpsDocument xpsOld = new XpsDocument(oldPackage, CompressionOption.Normal, memoryStreamUri))
{
FixedDocumentSequence seqOld = xpsOld.GetFixedDocumentSequence();
//The following did solve some of the memory issue
//-----------------------------------------------
var docPager = seqOld.DocumentPaginator;
docPager.ComputePageCount();
for (int i = 0; i < docPager.PageCount; i++)
{
FixedPage fp = docPager.GetPage(i).Visual as FixedPage;
fp.UpdateLayout();
}
seqOld = null;
//-----------------------------------------------
} // disposes XpsDocument
} // dispose Package
PackageStore.RemovePackage(packageUri);
} // dispose MemoryStream
}
catch (Exception e)
{
// really do something here, at least:
Debug.WriteLine(e);
}
}
This should at least clean up most of the objects. I'm not sure whether you're going to see the effects in your profiling, as that depends on whether the objects are actually collected during your analysis. Profiling a debug build might give unanticipated results.
As the remaining object instances seem to be bound to the System.Windows.Threading.Dispatcher, I suggest you keep a reference to your Threads (though at this point you might consider looking into Tasks) and, once all threads are done, call the static ExitAllFrames on the Dispatcher.
Your main method will then look like this:
Console.ReadLine();
Thread[] all = new Thread[100];
for (int i = 0; i < all.Length; i++)
{
var t = new Thread(() =>
{
var ogXps = File.ReadAllBytes(@"C:\Users\Nathan\Desktop\Objective.xps");
readXps(ogXps);
Console.WriteLine(getInt().ToString());
});
t.SetApartmentState(ApartmentState.STA);
t.Start();
all[i] = t; // keep reference
Thread.Sleep(50);
}
foreach(var t in all) t.Join(); // https://stackoverflow.com/questions/263116/c-waiting-for-all-threads-to-complete
all = null; // meh
Dispatcher.ExitAllFrames(); // https://stackoverflow.com/a/41953265/578411
Console.ReadLine();
I'm working on a utility to read through a JSON file I've been given and transform it into SQL Server. My weapon of choice is a .NET Core console app (I'm trying to do all of my new work with .NET Core unless there is a compelling reason not to). I have the whole thing "working", but there is clearly a problem somewhere because the performance is truly horrifying, almost to the point of being unusable.
The JSON file is approximately 27 MB and contains a main array of 214 elements, each of which contains a couple of fields along with an array of 150-350 records (that array has several fields and potentially a small <5-record array or two). Total records are approximately 35,000.
In the code below I've changed some names and stripped out a few of the fields to keep it more readable but all of the logic and code that does actual work is unchanged.
Keep in mind, I've done a lot of testing with the placement and number of calls to SaveChanges(), thinking initially that the number of trips to the DB was the problem. Although the version below calls SaveChanges() once for each iteration of the 214-element loop, I've tried moving it outside the entire looping structure and there is no discernible change in performance. In other words, with zero trips to the DB, this is still SLOW. How slow, you ask? How does more than 24 hours to run sound? I'm willing to try anything at this point and am even considering moving the whole process into SQL Server, but I would much rather work in C# than T-SQL.
static void Main(string[] args)
{
string statusMsg = String.Empty;
JArray sets = JArray.Parse(File.ReadAllText(@"C:\Users\Public\Downloads\ImportFile.json"));
try
{
using (var _db = new WidgetDb())
{
for (int s = 0; s < sets.Count; s++)
{
Console.WriteLine($"{s.ToString()}: {sets[s]["name"]}");
// First we create the Set
Set eSet = new Set()
{
SetCode = (string)sets[s]["code"],
SetName = (string)sets[s]["name"],
Type = (string)sets[s]["type"],
Block = (string)sets[s]["block"] ?? ""
};
_db.Entry(eSet).State = Microsoft.EntityFrameworkCore.EntityState.Added;
JArray widgets = sets[s]["widgets"].ToObject<JArray>();
for (int c = 0; c < widgets.Count; c++)
{
Widget eWidget = new Widget()
{
WidgetId = (string)widgets[c]["id"],
Layout = (string)widgets[c]["layout"] ?? "",
WidgetName = (string)widgets[c]["name"],
WidgetNames = "",
ReleaseDate = releaseDate,
SetCode = (string)sets[s]["code"]
};
_db.Entry(eWidget).State = Microsoft.EntityFrameworkCore.EntityState.Added; // the widget itself also needs to be marked as Added
// WidgetColors
if (widgets[c]["colors"] != null)
{
JArray widgetColors = widgets[c]["colors"].ToObject<JArray>();
for (int cc = 0; cc < widgetColors.Count; cc++)
{
WidgetColor eWidgetColor = new WidgetColor()
{
WidgetId = eWidget.WidgetId,
Color = (string)widgets[c]["colors"][cc]
};
_db.Entry(eWidgetColor).State = Microsoft.EntityFrameworkCore.EntityState.Added;
}
}
// WidgetTypes
if (widgets[c]["types"] != null)
{
JArray widgetTypes = widgets[c]["types"].ToObject<JArray>();
for (int ct = 0; ct < widgetTypes.Count; ct++)
{
WidgetType eWidgetType = new WidgetType()
{
WidgetId = eWidget.WidgetId,
Type = (string)widgets[c]["types"][ct]
};
_db.Entry(eWidgetType).State = Microsoft.EntityFrameworkCore.EntityState.Added;
}
}
// WidgetVariations
if (widgets[c]["variations"] != null)
{
JArray widgetVariations = widgets[c]["variations"].ToObject<JArray>();
for (int cv = 0; cv < widgetVariations.Count; cv++)
{
WidgetVariation eWidgetVariation = new WidgetVariation()
{
WidgetId = eWidget.WidgetId,
Variation = (string)widgets[c]["variations"][cv]
};
_db.Entry(eWidgetVariation).State = Microsoft.EntityFrameworkCore.EntityState.Added;
}
}
}
_db.SaveChanges();
}
}
statusMsg = "Import Complete";
}
catch (Exception ex)
{
statusMsg = ex.Message + " (" + ex.InnerException + ")";
}
Console.WriteLine(statusMsg);
Console.ReadKey();
}
I had an issue with that kind of code: lots of loops and tons of changing state.
Any change/manipulation you make in the _db context generates tracking state for it, and that makes your context slower each time. Read more here.
The fix for me was to create a new EF context (_db) at some key points. It saved me a few hours per run!
You could try to create a new instance of _db for each iteration of this loop:
contains a main array of 214 elements
If that makes no difference, try adding some Stopwatch timing to get a better idea of what/where is taking so long.
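A minimal sketch of that idea against the loop above (a hypothetical restructuring; WidgetDb and the entity-building code are from the question, and AutoDetectChangesEnabled is the EF Core change-tracker switch):
for (int s = 0; s < sets.Count; s++)
{
    // fresh context per set: the change tracker only ever holds one set's worth of entities
    using (var _db = new WidgetDb())
    {
        // entities are tracked explicitly via Entry(...).State = Added,
        // so automatic DetectChanges scans can be turned off
        _db.ChangeTracker.AutoDetectChangesEnabled = false;

        // ... build and add eSet, eWidget, colors, types and variations exactly as in the question ...

        _db.SaveChanges();
    }
}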
If you're making thousands of updates then EF is not really the way to go. Something like SqlBulkCopy will do the trick.
You could try the BulkWriter library.
IEnumerable<string> ReadFile(string path)
{
using (var stream = File.OpenRead(path))
using (var reader = new StreamReader(stream))
{
while (reader.Peek() >= 0)
{
yield return reader.ReadLine();
}
}
}
var items =
from line in ReadFile(@"C:\products.csv")
let values = line.Split(',')
select new Product {Sku = values[0], Name = values[1]};
then
using (var bulkWriter = new BulkWriter<Product>(connectionString)) {
bulkWriter.WriteToDatabase(items);
}
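For the SqlBulkCopy route mentioned above, a minimal sketch that reuses the items projection (the destination table and column names are assumptions):
using System.Data;
using System.Data.SqlClient;

// buffer the projected rows into a DataTable and push them with one bulk insert;
// for very large files, fill and flush the table in chunks instead of all at once
var table = new DataTable();
table.Columns.Add("Sku", typeof(string));
table.Columns.Add("Name", typeof(string));

foreach (var product in items)
{
    table.Rows.Add(product.Sku, product.Name);
}

using (var bulkCopy = new SqlBulkCopy(connectionString))
{
    bulkCopy.DestinationTableName = "dbo.Products"; // assumed table name
    bulkCopy.ColumnMappings.Add("Sku", "Sku");
    bulkCopy.ColumnMappings.Add("Name", "Name");
    bulkCopy.WriteToServer(table);
}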
I have a DbContext with a dataset of more than 20M records that has to be converted to a different data format. Therefore, I read the data into memory, perform some tasks, and then dispose of the DbContext. The code works fine, but after a while I get OutOfMemoryExceptions. I have been able to narrow it down to the following piece of code, where I retrieve 2M records, then release them and fetch them again. The first retrieval works just fine; the second one throws an exception.
// first call runs fine
using (var dbContext = new CustomDbContext())
{
var list = dbContext.Items.Take(2000000).ToArray();
foreach (var item in list)
{
// perform conversion tasks...
item.Converted = true;
}
}
// second call throws exception
using (var dbContext = new CustomDbContext())
{
var list = dbContext.Items.Take(2000000).ToArray();
foreach (var item in list)
{
// perform conversion tasks...
item.Converted = true;
}
}
Shouldn't the GC automatically release all memory allocated in the first using block, so that the second block runs as well as the first one?
In my actual code, I do not retrieve 2 million records at once, but something between 0 and 30K in each iteration. However, after about 15 minutes, I run out of memory, although all objects should have been released.
I suspect you have hit the LOH. Your objects are probably bigger than the threshold (~85 KB) and end up there, so the GC doesn't help by default.
Try this: https://www.simple-talk.com/dotnet/.net-framework/large-object-heap-compaction-should-you-use-it/
and see if your exception goes away.
i.e. add this between first and second part:
GCSettings.LargeObjectHeapCompactionMode = GCLargeObjectHeapCompactionMode.CompactOnce;
GC.Collect();
IEnumerable has GetEnumerator(), so you could try this to avoid .ToArray() or .ToList(), which aren't necessary if you just want to read:
// first call
using (var dbContext = new CustomDbContext())
{
foreach (var item in dbContext.Items.Take(2000000))
{
// perform conversion tasks...
item.Converted = true;
}
}
// second call
using (var dbContext = new CustomDbContext())
{
foreach (var item in dbContext.Items.Take(2000000))
{
// perform conversion tasks...
item.Converted = true;
}
}
Running the GC will not help you; you have to run each iteration in a different context. And dispose of your context.
// ID is your primary key
long startID = 0;
while(true){
using(var db = new CustomDbContext()){
var slice = db.Items.Where(x=>x.ID > startID)
.OrderBy(x=>x.ID)
.Take(1000).ToList();
// stop if there is nothing to process
if(!slice.Any())
break;
foreach(var item in slice){
// your logic...
item.Converted = true;
}
startID = slice.Last().ID;
}
}
If you want to process these things faster, an alternative approach would be to run slices in parallel...
Alternate Approach
I would recommend dividing the data into slices of 100x100, so that 100 slices of 100 items can be processed in parallel.
You can always easily customize slicing to meet your speed needs.
public static IEnumerable<IEnumerable<T>> Slice<T>(this IEnumerable<T> src, int size){
while(src.Any()){
var s = src.Take(size);
src = src.Skip(size);
yield return s;
}
}
long startID = 0;
while(true){
using(var db = new CustomDbContext()){
var src = db.Items.Where(x=>x.ID > startID)
.OrderBy(x=>x.ID)
.Take(10000).Select(x=>x.ID).ToList();
// stop if there is nothing to process
if(!src.Any())
break;
Parallel.ForEach(src.Slice(100), slice => {
using(var sdb = new CustomDbContext()){
foreach(var item in sdb.Items.Where(x=> slice.Contains(x.ID))){
item.Converted = true;
}
}
} );
startID = src.Last();
}
}
After refactoring, memory gets released. I don't know why, but it works.
private static void Debug()
{
var iteration = 0;
while(true)
{
Console.WriteLine("Iteration {0}", iteration++);
Convert();
}
}
private static void Convert()
{
using (var dbContext = new CustomDbContext(args[0]))
{
var list = dbContext.Items.Take(2000000).ToList();
foreach (var item in list)
{
item.Converted = true;
}
}
}
When I move the content of Convert() into the while loop in Debug(), the OutOfMemoryException is thrown.
private static void Debug()
{
var iteration = 0;
while(true)
{
Console.WriteLine("Iteration {0}", iteration++);
using (var dbContext = new CustomDbContext(args[0]))
{
// OutOfMemoryException in second iteration
var list = dbContext.Items.Take(2000000).ToList();
foreach (var item in list)
{
item.Converted = true;
}
}
}
}