I have a system which does the following:
Upload documents to SharePoint
Event receiver will add job to DB, Create a folder for job in Document Conversion Directory
A directory watcher will trigger Document Conversion windows service
Windows service will get batch of 10 jobs from DB (using main thread)
On start Windows service creates X number of threads based on processor's cores (using Parallel For)
Then creates worker thread with timeouts for every db jobs (this is different from parallel For threads)
and it carries on...
Oh while converting... in worker threads.. we are calling ActiveDirectory, logging to DB (Read, Write) and uploading document back to SharePoint
I managed to break it: if I upload a password-protected document and soon after upload a PowerPoint document, the PowerPoint document throws a password-incorrect exception, etc.
But if there is a gap between the two uploads — even 60 seconds — it all works fine, which means the PowerPoint document does convert to PDF.
Following is the code but I had to trim unnecessary parts out of it,
Here is the main class where things start from,
// Fan out document processing across the PDF server's allotted thread count;
// each iteration drives one ProcessDocuments worker.
// NOTE(review): every iteration shares the same this.docConvService instance,
// so any mutable state inside it (e.g. a captured-exception field) is shared
// across these threads — confirm the service is thread-safe.
Parallel.For(0, noOfThreadsToRunOnPDFServer, new ParallelOptions { MaxDegreeOfParallelism = noOfThreadsToRunOnPDFServer },
i =>
{
    this.docConvService.ProcessDocuments(i);
});
Then the conversion is happening here...
using System;
using System.IO;
using System.Runtime.ExceptionServices;
using System.Threading;

namespace PDFService
{
    /// <summary>
    /// Converts supported document types to PDF using the Aspose libraries.
    /// Conversion work runs on a dedicated worker thread so it can be timed out.
    ///
    /// FIX: the failure of a conversion is now captured in a local variable per
    /// ConvertToPDF call instead of the shared instance field. With the field, one
    /// instance used from several threads (Parallel.For) let a failure from one
    /// document (e.g. a password-protected file) be re-thrown from an unrelated,
    /// concurrently running conversion — the exact symptom reported above.
    /// </summary>
    public class AsposePDFConverter : IPDFConverter
    {
        private IDocConversionSettings settings;

        /// <summary>File extensions this converter accepts (matched case-insensitively).</summary>
        public enum SupportedExtensions
        {
            Doc,
            Docx,
            Xls,
            Xlsx,
            Pdf,
            Pps,
            Ppsx,
            Ppt,
            Pptx,
            Txt,
            Html,
            Mhtml,
            Xhtml,
            Msg,
            Eml,
            Emlx,
            One,
            Vsd,
            Vsdx,
            Vss,
            Vssx
        }

        public AsposePDFConverter(IDocConversionSettings settings)
        {
            this.settings = settings;
        }

        /// <summary>
        /// Runs <paramref name="threadStart"/> on a dedicated thread and blocks until it
        /// finishes or <paramref name="timeout"/> elapses.
        /// </summary>
        /// <exception cref="ConversionTimeoutException">The work did not finish in time.</exception>
        private void SyncThreadStartWithTimeout(ThreadStart threadStart, TimeSpan timeout)
        {
            Thread workerThread = new Thread(threadStart);
            workerThread.Start();
            bool finished = workerThread.Join(timeout);
            if (!finished)
            {
                // NOTE(review): Thread.Abort is deprecated and can leave the Aspose engine
                // in an inconsistent state; prefer cooperative cancellation if available.
                workerThread.Abort();
                throw new ConversionTimeoutException("PDF Conversion exceeded timeout value");
            }
        }

        /// <summary>
        /// Converts <paramref name="docContent"/> to PDF, dispatching on the extension of
        /// <paramref name="documentName"/>.
        /// </summary>
        /// <param name="documentName">Used only for its extension.</param>
        /// <param name="docContent">Source document content; fully buffered before conversion.</param>
        /// <param name="timeoutMS">Per-document conversion timeout in milliseconds.</param>
        /// <returns>The PDF stream, or null when the extension is recognized but not convertible here.</returns>
        /// <exception cref="FormatNotSupportedException">Extension is not in <see cref="SupportedExtensions"/>.</exception>
        /// <exception cref="ConversionTimeoutException">Conversion exceeded <paramref name="timeoutMS"/>.</exception>
        public MemoryStream ConvertToPDF(string documentName, Stream docContent, double timeoutMS)
        {
            MemoryStream outStream = null;
            // Captured per call (local, not an instance field) so concurrent conversions
            // on the same converter cannot observe each other's failures.
            ExceptionDispatchInfo conversionException = null;

            MemoryStream inStream = new MemoryStream();
            docContent.CopyTo(inStream);
            inStream.Seek(0, SeekOrigin.Begin);

            SupportedExtensions documentExtension;
            string szExtension = Path.GetExtension(documentName).TrimStart('.');
            if (!Enum.TryParse(szExtension, true, out documentExtension))
            {
                throw new FormatNotSupportedException("Document type is not supported");
            }

            // Pick the conversion routine for this extension; stays null for
            // extensions that are recognized but have no converter wired up yet.
            ThreadStart conversion = null;
            switch (documentExtension)
            {
                case SupportedExtensions.Doc:
                case SupportedExtensions.Docx:
                case SupportedExtensions.Txt:
                case SupportedExtensions.Html:
                case SupportedExtensions.Mhtml:
                case SupportedExtensions.Xhtml:
                    conversion = () => { outStream = ConvertWordsToPDF(inStream); };
                    break;
                case SupportedExtensions.Pps:
                case SupportedExtensions.Ppsx:
                case SupportedExtensions.Ppt:
                case SupportedExtensions.Pptx:
                    conversion = () => { outStream = ConvertSlidesToPDF(inStream); };
                    break;
            }

            if (conversion != null)
            {
                SyncThreadStartWithTimeout(
                    () =>
                    {
                        try
                        {
                            conversion();
                        }
                        catch (Exception ex)
                        {
                            // Preserve the worker's stack trace for the rethrow below.
                            conversionException = ExceptionDispatchInfo.Capture(ex);
                        }
                    },
                    TimeSpan.FromMilliseconds(timeoutMS));
            }

            // Conversion happens on a sub-thread so it can time out; if it threw,
            // surface the failure from this thread with the original stack trace.
            if (conversionException != null)
                conversionException.Throw();
            return outStream;
        }

        /// <summary>Word/text/HTML family conversion via Aspose.Words. Lets exceptions propagate to the caller.</summary>
        private MemoryStream ConvertWordsToPDF(Stream docContent)
        {
            Aspose.Words.License lic = new Aspose.Words.License();
            lic.SetLicense(this.settings.AsposeLicensePath);
            Aspose.Words.Document doc = new Aspose.Words.Document(docContent);
            MemoryStream stream = new MemoryStream();
            doc.Save(stream, Aspose.Words.SaveFormat.Pdf);
            return stream;
        }

        /// <summary>PowerPoint family conversion via Aspose.Slides. Lets exceptions propagate to the caller.</summary>
        private MemoryStream ConvertSlidesToPDF(Stream docContent)
        {
            Aspose.Slides.License lic = new Aspose.Slides.License();
            lic.SetLicense(this.settings.AsposeLicensePath);
            using (Aspose.Slides.Presentation presentation = new Aspose.Slides.Presentation(docContent))
            {
                MemoryStream stream = new MemoryStream();
                presentation.Save(stream, Aspose.Slides.Export.SaveFormat.Pdf);
                return stream;
            }
        }
    }
}
Error is,
Error during Document PDF Conversion. Details are: PDFConversionID:
6061, DocumentName: powerpoint.ppsx, WebURL: REMOVED, UploadedBy:
REMOVED, ConversionDuration: 00:01:06.3072410
Aspose.Words.IncorrectPasswordException: The document password is
incorrect. at Aspose.Words.Document. (Stream , LoadOptions )
at Aspose.Words.Document. (Stream , LoadOptions ) at
DocumentPDFConversionService.AsposePDFConverter.ConvertWordsToPDF(Stream
docContent) in...
As you can see there is something very fishy going on
You are using the same instance of this.docConvService in multiple threads, so your conversionException field is probably written by the password-protected document while your other document is still processing. You should instantiate a new AsposePDFConverter per request, or change the way you return exceptions — e.g. a result object returned by ConvertToPDF that contains both the MemoryStream and the error.
Separate instance for each request:
// One converter instance per Parallel.For iteration, so no conversion state
// (such as a captured exception) is shared between concurrently processed documents.
Parallel.For(0, noOfThreadsToRunOnPDFServer, new ParallelOptions { MaxDegreeOfParallelism = noOfThreadsToRunOnPDFServer },
i =>
{
    new AsposePdfConverter(settings).ProcessDocuments(i);
});
Returning a result-object :
// Return the stream and any captured exception together, instead of stashing
// the exception in an instance field that another thread could overwrite.
public ConversionResult ConvertToPDF(string documentName, Stream docContent, double timeoutMS)
{
    /** Your code **/
    return new ConversionResult()
    {
        MemoryStream = memoryStream,
        ConversionException = conversionException
    };
}
/// <summary>
/// Outcome of a single PDF conversion: the produced stream plus any captured
/// failure, so error state never has to live in a shared field.
/// FIX: the properties must be public — class members default to private, so the
/// object-initializer usage shown above would not compile otherwise.
/// </summary>
class ConversionResult
{
    /// <summary>The converted PDF content; null when conversion failed.</summary>
    public MemoryStream MemoryStream { get; set; }

    /// <summary>The captured conversion failure; null on success.</summary>
    public ExceptionDispatchInfo ConversionException { get; set; }
}
Related
I'm updating the image for a record in the database (the server we use doesn't allow saving images in the file system). It updates the image the first time, but if I re-upload a picture shortly afterwards, the byte[] in the database is not updated — even though all the code executes without errors. If I use breakpoints and step through the code, the data is saved.
The first suspicion was the MemoryStream, the fact that it might not have read everything and there is malformed data that is being discarded from the DB, so I moved out the definition for fileBytes and fileName, as well as service call and return, but it didn't help.
It seems as if the EF was caching the data, but I have no proof for the statement at this moment
/// <summary>
/// Accepts an uploaded image and stores its bytes against the given target.
/// </summary>
/// <param name="file">The uploaded form file; must be non-null and non-empty.</param>
/// <param name="targetId">Key of the target record to attach the image to.</param>
/// <returns>An ImageResponse carrying the stored byte count.</returns>
/// <exception cref="ArgumentNullException">No file was posted.</exception>
/// <exception cref="ArgumentException">The posted file is empty.</exception>
public ImageResponse Post(IFormFile file, int targetId)
{
    // TODO save content Type
    // TODO save content size
    // FIX: throw meaningful exception types instead of bare "new Exception()"
    // (both derive from Exception, so existing catch blocks still work).
    if (file == null)
    {
        throw new ArgumentNullException(nameof(file));
    }

    long size = file.Length;
    if (size <= 0)
    {
        throw new ArgumentException("Uploaded file is empty.", nameof(file));
    }

    // Buffer the upload fully before handing it to the service.
    // FIX: removed the unused "fileName" local (a Guid-based name was computed
    // but never used anywhere).
    byte[] fileBytes;
    using (var memoryStream = new MemoryStream())
    {
        file.CopyTo(memoryStream);
        fileBytes = memoryStream.ToArray();
    }

    this.targetService.AddImage(new TargetImage { TargetId = targetId, ImageFile = fileBytes });
    return new ImageResponse
    {
        Size = size
    };
}
/// <summary>
/// Replaces the stored picture bytes of an existing target with the image
/// carried by <paramref name="targetImage"/> and persists the change.
/// </summary>
public GeneralResult AddImage(TargetImage targetImage)
{
    var target = this.targets.Find(targetImage.TargetId);
    if (target == null)
    {
        throw new Exception();
    }

    target.TargetPictureUrl = targetImage.ImageFile;
    this.targets.Save(target);

    return new GeneralResult { Success = true };
}
Under the hood it looks like this:
/// <summary>
/// Dispatches to the strongly-typed Save overload when the runtime type of
/// <paramref name="entity"/> matches T; rejects anything else.
/// </summary>
public void Save(DatabaseEntity entity)
{
    // Pattern match narrows and casts in one step.
    if (entity is T typedEntity)
    {
        this.Save(typedEntity);
    }
    else
    {
        throw new ApplicationException($"{typeof(T).Name} is required instance type.");
    }
}
and the "entities" is:
// Backing field for the lazily-resolved set; null until first access.
private DbSet<T> entities;
// Resolves the DbSet from the context on first use and caches it thereafter.
protected virtual DbSet<T> Entities => this.entities ?? (this.entities = this.context.Set<T>());
I have a batch of PDFs that I want to convert to Text. It's easy to get text with something like this from iTextSharp:
PdfTextExtractor.GetTextFromPage(reader, pageNumber);
It's easy to get Images using this answer (or similar answers in the thread).
What I can't figure out easily... is how to interleave image placeholders in the text.
Given a PDF, a page # and GetTextFromPage I expect the output to be:
line 1
line 2
line 3
When I'd like it to be (Where 1.1 means page 1, image 1... Page 1, image 2):
line 1
[1.1]
line 2
[1.2]
line 3
Is there a way to get an "image placeholder" for iTextSharp, PdfSharp or anything similar? I'd like a GetTextAndPlaceHoldersFromPage method (or similar).
PS: Hrm... it's not letting me tag iTextSHARP - not iText. C# not Java.
C# Pdf to Text with image placeholder
https://stackoverflow.com/a/28087521/
https://stackoverflow.com/a/33697745/
Although this doesn't have the exact layout mentioned in my question (Since that was a simplified version of what I really wanted anyways), it does have the starting parts as listed by the second note (translated from iText Java)... with extra information pulled from the third note (Some of the reflection used in Java didn't seem to work in C#, so that info came from #3).
Working from this, I'm able to get a List of Strings representing lines in the PDF (all pages, instead of just page 1)... with text added where images should be (Huzzah!). ByteArrayToFile extension method added for flavor (Although I didn't include other parts/extensions that may break a copy/paste usages of this code).
I've also been able to greatly simplify other parts of my process and gut half of the garbage I had working before. Huzzah!!! Thanks #Mkl
internal class Program
{
    /// <summary>
    /// Extracts page 1 of the configured test PDF, interleaving image placeholders
    /// with the text via SimpleMixedExtractionStrategy, then splits the result into lines.
    /// </summary>
    public static void Main(string[] args)
    {
        var outputDirectory = Settings.TestDirectory;
        var inputFile = Settings.TestFile;
        Log.Info($"File to Process: {inputFile.FullName}");

        using (var pdfReader = new PdfReader(inputFile.FullName))
        {
            var contentParser = new PdfReaderContentParser(pdfReader);
            var extractionStrategy = new SimpleMixedExtractionStrategy(inputFile, outputDirectory);
            contentParser.ProcessContent(1, extractionStrategy);
            var lines = extractionStrategy.GetResultantText().Split('\n');
        }
    }
}
/// <summary>
/// LocationTextExtractionStrategy that, in addition to the normal text, saves each
/// rendered image to disk and injects a "[image-file-path]" text chunk at the image's
/// page location, so placeholders come out interleaved with the extracted text.
/// </summary>
public class SimpleMixedExtractionStrategy : LocationTextExtractionStrategy
{
    public static readonly ILog Log = LogManager.GetLogger(MethodBase.GetCurrentMethod().DeclaringType);

    // Directory the extracted images are written into.
    public DirectoryInfo OutputPath { get; }
    // Base file whose full name seeds each extracted image's file name.
    public FileInfo OutputFile { get; }

    // Unit-length line; transformed by the image's CTM to find where the image sits on the page.
    private static readonly LineSegment UNIT_LINE = new LineSegment(new Vector(0, 0, 1), new Vector(1, 0, 1));

    // Sequential counter appended to each image file name.
    private int _counter;

    public SimpleMixedExtractionStrategy(FileInfo outputFile, DirectoryInfo outputPath)
    {
        OutputPath = outputPath;
        OutputFile = outputFile;
    }

    /// <summary>
    /// Saves the image bytes to "&lt;OutputFile&gt;-&lt;n&gt;.&lt;type&gt;" and appends a placeholder
    /// text chunk at the image's position. Failures are logged and swallowed so one bad
    /// image does not abort the whole page extraction.
    /// </summary>
    public override void RenderImage(ImageRenderInfo renderInfo)
    {
        try
        {
            var image = renderInfo.GetImage();
            if (image == null) return;
            var number = _counter++;
            var imageFile = new FileInfo($"{OutputFile.FullName}-{number}.{image.GetFileType()}");
            imageFile.ByteArrayToFile(image.GetImageAsBytes());
            // Map the unit line through the image's current transformation matrix
            // to get the placeholder's start/end points on the page.
            var segment = UNIT_LINE.TransformBy(renderInfo.GetImageCTM());
            var location = new TextChunk("[" + imageFile + "]", segment.GetStartPoint(), segment.GetEndPoint(), 0f);
            // NOTE(review): reflection reaches into the base class's private
            // "locationalResult" list — fragile across iTextSharp versions; re-verify
            // the field name on any library upgrade.
            var locationalResultField = typeof(LocationTextExtractionStrategy).GetField("locationalResult", BindingFlags.NonPublic | BindingFlags.Instance);
            var LocationalResults = (List<TextChunk>)locationalResultField.GetValue(this);
            LocationalResults.Add(location);
        }
        catch (Exception ex)
        {
            Log.Debug($"{ex.Message}");
            Log.Verbose($"{ex.StackTrace}");
        }
    }
}
public static class ByteArrayExtensions
{
    /// <summary>
    /// Writes <paramref name="byteArray"/> to the file identified by
    /// <paramref name="fileName"/>, creating or overwriting it.
    /// </summary>
    /// <returns>true when the write succeeded; false when an exception was logged.</returns>
    public static bool ByteArrayToFile(this FileInfo fileName, byte[] byteArray)
    {
        try
        {
            // FIX: wrap the stream in "using" — the original leaked the handle
            // whenever Write threw, because Close was never reached.
            // (Also fixed the comment: the file is opened for WRITING.)
            using (var fileStream = new FileStream(fileName.FullName, FileMode.Create, FileAccess.Write))
            {
                // Writes a block of bytes to this stream using data from a byte array.
                fileStream.Write(byteArray, 0, byteArray.Length);
            }
            return true;
        }
        catch (Exception exception)
        {
            Log.Error($"Exception caught in process: {exception.Message}", exception);
        }
        // error occured, return false
        return false;
    }
}
I've got this code.
const int maxbooks = 5;
Book[] booklist = new Book[maxbooks];

// FIX: "#" was a mangled verbatim-string prefix; the literal must be @"books.txt"
// or the snippet does not compile.
FileStream fs = File.Open(@"books.txt", FileMode.Open, FileAccess.Read);
SoapFormatter sf = new SoapFormatter();
try
{
    // something here, deserializing file and assigning to the array
}
catch (Exception e)
{
    Console.WriteLine(e.Message);
}
finally
{
    // Always release the file handle, even when deserialization fails.
    fs.Close();
}
I've figured out (or at least I THINK I've figured out) how to serialize the original array of objects in a separate program. I'm now looking to deserialize that file and create a new array from the deserialized data. For reference, here's the other program where I serialized the original array:
Book firstbook = new Book("Jimbob", "Jimbob book", "first edition", "Jimbob publishing", "1991");
Book secondbook = new Book("Greg", "Greg book", "third edition", "Unholy publishing", "2010");
Book thirdbook = new Book("Pingu", "Pingu book", "tenth edition", "Antarctic publishing", "1897");
Book fourthbook = new Book("Patrick", "Patrick book", "seventh edition", "underwater publishing", "1991");
Book fifthbook = new Book("Sally", "Sally book", "first edition", "Wowpublishing", "2015");

const int maxbooks = 5;
Book[] booklist = new Book[maxbooks];
booklist[0] = firstbook;
booklist[1] = secondbook;
booklist[2] = thirdbook;
booklist[3] = fourthbook;
booklist[4] = fifthbook;

// writing to a file
// FIX: "#" was a mangled verbatim-string prefix; the literal must be @"books.txt".
FileStream fs = File.Open(@"books.txt", FileMode.Create, FileAccess.Write);
SoapFormatter sf = new SoapFormatter();
int bookindex = 0;
try
{
    // NOTE(review): serializing items one at a time appends independent SOAP
    // documents to the same file; serializing the whole array once (as the answer
    // below suggests) makes the file straightforward to read back.
    while (bookindex < maxbooks)
    {
        sf.Serialize(fs, booklist[bookindex]);
        bookindex += 1;
    }
}
catch (Exception e)
{
    Console.WriteLine(e.Message);
}
finally
{
    // Always release the file handle.
    fs.Close();
}
Using SOAP serialization for the moment with this. Any help will be appreciated.
Serialize the array itself rather than one item at a time. Your current serializer routine appends many independent serialization chunks to one file, and those chunks are not valid when read back together.
Use the XMLSerializer
For e.g.
Serialize like this:
/// <summary>
/// Serializes <paramref name="settings"/> to Settings.XML in local storage,
/// replacing any existing file. Failures are swallowed (best-effort persistence).
/// </summary>
private async Task SaveSettings(Settings settings)
{
    var folder = Windows.Storage.ApplicationData.Current.LocalFolder;
    var options = Windows.Storage.CreationCollisionOption.ReplaceExisting;
    var file = await folder.CreateFileAsync("Settings.XML", options);
    try
    {
        XmlSerializer SerializerObj = new XmlSerializer(typeof(Settings));
        // FIX: dispose the storage stream; the original never closed the handle
        // returned by OpenStreamForWriteAsync, so the write might not be flushed.
        using (var stream = await file.OpenStreamForWriteAsync())
        {
            SerializerObj.Serialize(stream, settings);
        }
    }
    catch
    {
        // Best-effort: ignore serialization/IO failures.
        // NOTE(review): consider at least logging the exception here.
    }
}
Deserialize like this:
/// <summary>
/// Loads Settings from Settings.XML in local storage. Returns a default Settings
/// instance when the file is missing or unreadable (mirrors SaveSettings's
/// best-effort behavior).
/// </summary>
private async Task<Settings> LoadSettings()
{
    Settings settings = new Settings();
    var folder = Windows.Storage.ApplicationData.Current.LocalFolder;
    try
    {
        var file = await folder.GetFileAsync("Settings.XML");
        XmlSerializer SerializerObj = new XmlSerializer(typeof(Settings));
        // FIX: dispose the storage stream; the original leaked the handle
        // returned by OpenStreamForReadAsync.
        using (var stream = await file.OpenStreamForReadAsync())
        {
            settings = SerializerObj.Deserialize(stream) as Settings;
        }
    }
    catch (Exception)
    {
        // Best-effort: fall back to the fresh Settings created above.
        // NOTE(review): consider logging the exception here.
    }
    return settings;
}
This example serializes an object called Settings. You can change it to serialize your array of objects.
This is from a windows 8 app so you may need to adapt it slightly.
I am extracting content of the Files in SQL File Table. The following code works if I do not use Parallel.
I am getting the following exception, when reading sql file stream simultaneously (Parallel).
The process cannot access the file specified because it has been opened in another transaction.
TL;DR:
When reading a file from FileTable (using GET_FILESTREAM_TRANSACTION_CONTEXT) in a Parallel.ForEach I get the above exception.
Sample Code for you to try out:
https://gist.github.com/NerdPad/6d9b399f2f5f5e5c6519
Longer Version:
Fetch Attachments, and extract content:
var documents = new List<ExtractedContent>();
using (var ts = new TransactionScope(TransactionScopeAsyncFlowOption.Enabled))
{
    var attachments = await dao.GetAttachmentsAsync();
    // Extract the content simultaneously
    // documents = attachments.ToDbDocuments().ToList(); // This works
    // NOTE(review): two problems in the parallel version — the ambient transaction
    // does not flow into Parallel.ForEach worker threads (hence "opened in another
    // transaction"), and List<T>.Add is not safe to call from multiple threads.
    Parallel.ForEach(attachments, a => documents.Add(a.ToDbDocument())); // this doesn't
    ts.Complete();
}
DAO Read File Table:
/// <summary>
/// Reads all attachment rows from the file table and materializes them into
/// SearchAttachment objects before the reader/connection are closed.
/// </summary>
/// <returns>The fully materialized attachment list.</returns>
public async Task<IEnumerable<SearchAttachment>> GetAttachmentsAsync()
{
    // FIX: removed the useless "catch (System.Exception) { throw; }" wrapper —
    // it added nothing (no logging, no translation) and only obscured the flow.
    var commandStr = "....";
    IEnumerable<SearchAttachment> attachments = null;
    using (var connection = new SqlConnection(this.DatabaseContext.Database.Connection.ConnectionString))
    using (var command = new SqlCommand(commandStr, connection))
    {
        connection.Open();
        using (var reader = await command.ExecuteReaderAsync())
        {
            // ToList() forces full enumeration while the reader is still open;
            // ToSearchAttachments is a lazy iterator.
            attachments = reader.ToSearchAttachments().ToList();
        }
    }
    return attachments;
}
Create objects for each file:
The object contains a reference to the GET_FILESTREAM_TRANSACTION_CONTEXT
/// <summary>
/// Lazily yields one SearchAttachment per reader row. ContentStream carries the
/// bytes returned by GET_FILESTREAM_TRANSACTION_CONTEXT(), used later to open the
/// SqlFileStream; UNCPath is the file's FILESTREAM path.
/// </summary>
public static IEnumerable<SearchAttachment> ToSearchAttachments(this SqlDataReader reader)
{
    // Nothing to yield for an empty result set.
    if (!reader.HasRows)
    {
        yield break;
    }
    // Convert each row to SearchAttachment
    while (reader.Read())
    {
        yield return new SearchAttachment
        {
            ...
            ...
            UNCPath = reader.To<string>(Constants.UNCPath),
            ContentStream = reader.To<byte[]>(Constants.Stream) // GET_FILESTREAM_TRANSACTION_CONTEXT()
            ...
            ...
        };
    }
}
Read the file using SqlFileStream:
Exception is thrown here
/// <summary>
/// Opens the attachment's FILESTREAM data via SqlFileStream and extracts its content.
/// NOTE(review): SqlFileStream requires the transaction that produced the context
/// bytes to be current on the calling thread; inside Parallel.ForEach the ambient
/// transaction does not flow, which is what raises "opened in another transaction".
/// </summary>
public static ExtractedContent ToDbDocument(this SearchAttachment attachment)
{
    // Read the file
    // Exception is thrown here
    using (var stream = new SqlFileStream(attachment.UNCPath, attachment.ContentStream, FileAccess.Read, FileOptions.SequentialScan, 4096))
    {
        ...
        // extract content from the file
    }
    ....
}
Update 1:
According to this article it seems like it could be an Isolation level issue. Has anyone ever faced similar issue?
The transaction does not flow in to the Parallel.ForEach, you must manually bring the transaction in.
//Switched to a thread safe collection.
//Switched to a thread safe collection.
var documents = new ConcurrentQueue<ExtractedContent>();
using (var ts = new TransactionScope(TransactionScopeAsyncFlowOption.Enabled))
{
    var attachments = await dao.GetAttachmentsAsync();
    //Grab a reference to the current transaction.
    var transaction = Transaction.Current;
    Parallel.ForEach(attachments, a =>
    {
        //Spawn a dependant clone of the transaction
        // Each worker gets its own dependent transaction; RollbackIfNotComplete
        // rolls the whole thing back unless Complete() is reached.
        using (var depTs = transaction.DependentClone(DependentCloneOption.RollbackIfNotComplete))
        {
            documents.Enqueue(a.ToDbDocument());
            depTs.Complete();
        }
    });
    ts.Complete();
}
I also switched from List<ExtractedContent> to ConcurrentQueue<ExtractedContent> because you are not allowed call .Add( on a list from multiple threads at the same time.
I'm trying to send some objects from a server to the client.
My problem is that when I'm sending only one object, everything works correctly. But the moment I add another object, an exception is thrown — "binary stream does not contain a valid binary header" or "No map for object (random number)".
My guess is that the deserialization does not understand where each object in the stream starts and ends, and I hoped you could help me out here.
heres my deserialization code:
/// <summary>
/// Client receive loop: deserializes TcpObject messages from serverStream and,
/// for Transfer commands, converts the antenna's frequencies into points stored
/// in SharedProperties.AntennaPoints.
/// </summary>
public void Listen()
{
    try
    {
        bool offline = true;
        // Read the Offline flag on the UI dispatcher thread.
        Dispatcher.Invoke(System.Windows.Threading.DispatcherPriority.Normal,
        new Action(() => offline = Offline));
        // NOTE(review): "offline" is captured once before the loop and never
        // refreshed inside it, so this loop only ends via an exception.
        while (!offline)
        {
            TcpObject tcpObject = new TcpObject();
            // NOTE(review): BinaryFormatter is insecure for untrusted network input
            // and is removed in .NET 9 — consider a length-prefixed safe format.
            IFormatter formatter = new BinaryFormatter();
            tcpObject = (TcpObject)formatter.Deserialize(serverStream);
            if (tcpObject.Command == Command.Transfer)
            {
                SentAntenna sentAntenna = (SentAntenna)tcpObject.Object;
                // Resolve the antenna's index from its name.
                int idx = 0;
                foreach (string name in SharedProperties.AntennaNames)
                {
                    if (name == sentAntenna.Name)
                        break;
                    idx++;
                }
                if (idx < 9)
                {
                    // One (channel, intensity) point per received frequency.
                    PointCollection pointCollection = new PointCollection();
                    foreach (Frequency f in sentAntenna.Frequencies)
                        pointCollection.Add(new Point(f.Channel, f.Intensity));
                    SharedProperties.AntennaPoints[idx] = pointCollection;
                }
            }
        }
    }
    catch (Exception ex)
    {
        MessageBox.Show(ex.Message); // raise an event
    }
}
serialization code:
// Server side: start a background thread that continuously pushes the requested
// antenna's frequency data to the client.
case Command.Transfer:
    Console.WriteLine("Transfering");
    Thread transfer = new Thread(new ThreadStart(delegate
    {
        try
        {
            // Resolve the requested antenna index from the name sent by the client.
            string aName = tcpObject.Object.ToString();
            int indx = 0;
            foreach (string name in names)
            {
                if (name == aName)
                    break;
                indx++;
            }
            if (indx < 9)
            {
                // NOTE(review): tight busy-wait — this spins at full CPU and, per the
                // comment below, is never explicitly terminated.
                while (true) // need to kill when the father thread terminates
                {
                    if (antennas[indx].Frequencies != null)
                    {
                        lock (antennas[indx].Frequencies)
                        {
                            TcpObject sendTcpObject = new TcpObject();
                            sendTcpObject.Command = Command.Transfer;
                            SentAntenna sa = new SentAntenna(antennas[indx].Frequencies, aName);
                            sendTcpObject.Object = sa;
                            // NOTE(review): if several transfer threads share networkStream,
                            // concurrent Serialize calls interleave bytes on the wire —
                            // consistent with the client's "invalid binary header" errors.
                            formatter.Serialize(networkStream, sendTcpObject);
                        }
                    }
                }
            }
        }
        catch (Exception ex) { Console.WriteLine(ex); }
    }));
    transfer.Start();
    break;
Interesting. There's nothing particularly odd in your serialization code, and I've seen people use vanilla concatenation for multiple objects in the past, although I've actually always advised against it as BinaryFormatter does not explicitly claim this scenario is OK. But: if it isn't, the only thing I can suggest is to implement your own framing; so your write code becomes:
serialize to an empty MemoryStream
note the length and write the length to the NetworkStream, for example as a simple fixed-width 32-bit network-byte-order integer
write the payload from the MemoryStream to the NetworkStream
rinse, repeat
And the read code becomes:
read exactly 4 bytes and compute the length
buffer that many bytes into a MemoryStream
deserialize from the NetworkStream
(Noting in both cases to set the MemoryStream's position back to 0 between write and read)
You can also implement a Stream subclass that caps the length if you want to avoid a buffer when reading, but that is more complex.
Apparently I came up with a really simple solution. I just made sure only one thread is allowed to transfer data at a time, so I changed this line of code:
formatter.Serialize(networkStream, sendTcpObject);
to these lines of code:
// NOTE(review): this check-then-set on "transfering" is not atomic — two threads
// can both pass the check before either sets the flag — and any send attempted
// while the flag is true is silently dropped rather than queued. A lock around
// the Serialize call would be the robust fix.
if (!transfering) // making sure only 1 thread is transfering data
{
    transfering = true;
    formatter.Serialize(networkStream, sendTcpObject);
    transfering = false;
}