I process emails just fine. However, I've come across some emails containing PDFs that must be inline, since they aren't picked up by .Attachments. Here is my code; I can't get it to work. Please help. Thanks!
var message = mainFolder.GetMessage(i - 1);
eCount++;
// Get specifics of email
var attachments = message.Attachments.ToList();
int attCnt = attachments.Count; // use Count, not Capacity
string preChk = message.From.ToString();
var msgMsg = new MimePart();
var att2 = new List<MimePart>();
var mp2 = new List<Multipart>();
var iter = new MimeIterator(message);
int mpCnt = 0;
if (attCnt == 0)
{
while (iter.MoveNext())
{
mpCnt += 1;
var mp = iter.Parent as Multipart;
var prt = iter.Current as MimePart;
if (mp != null && prt != null && prt.IsAttachment)
{ //Check if an attachment slipped through
mp2.Add(mp);
att2.Add(prt);
}
}
}
// If I expand iter.MoveNext() in the debugger, I can drill down to the images
I did figure it out, and I was able to eliminate pretty much all of the code above and condense it to just a couple of lines. If my attachments count is 0, then I know the content is inline. I did this:
var bd = message.BodyParts.ToList<MimeKit.MimeEntity>();
Usually we do the foreach (MimeKit.MimePart attachment in attachments) loop. I found out a couple of things. Do var mp = bd.ElementAt(inAttCnt - 1); and var ma2 = mp.ContentType.Name;, then check that ma2 is not null. Don't call bd.Remove inside the loop or you will throw an exception; the foreach will take care of it. Hope that helps.
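Here is a minimal sketch of what that condensed version looks like end to end. The saveFolder variable and the PDF check are placeholders of mine, and depending on your MimeKit version the decode call is part.Content.DecodeTo or part.ContentObject.DecodeTo:
// Sketch: body parts that carry a content-type name are treated as inline attachments.
foreach (var part in message.BodyParts.OfType<MimePart>())
{
    var name = part.ContentType.Name;   // null for the plain text/html body parts
    if (string.IsNullOrEmpty(name))
        continue;
    if (name.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase))
    {
        // saveFolder is a placeholder path
        using (var fs = File.Create(Path.Combine(saveFolder, name)))
            part.Content.DecodeTo(fs);  // older MimeKit: part.ContentObject.DecodeTo(fs)
    }
}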
I have difficulty understanding this example of how to use facets:
https://lucenenet.apache.org/docs/4.8.0-beta00008/api/Lucene.Net.Demo/Lucene.Net.Demo.Facet.SimpleFacetsExample.html
My goal is to create an index in which each document field has a facet, so that at search time I can choose which facets to use to navigate the data.
What I am confused about is the setup of facets at index-creation time. To summarize my question: is an index with facets compatible with ReferenceManager?
Does the DirectoryTaxonomyWriter need to actually be written and persisted on disk, or will it be embedded into the index itself and is it just temporary? Given the code indexWriter.AddDocument(config.Build(taxoWriter, doc)); from the example, I would expect it to be temporary and embedded into the index (but then the example also shows that you need the taxonomy to drill down into facets). So can the taxonomy be tied to the index in some way, so that the two are handled together by ReferenceManager?
If not, may I just use the same folder I use for storing the index?
Here is a more detailed list of the points that confuse me:
In my scenario I am indexing the documents asynchronously (in a background process) and then fetching the index as soon as possible through ReferenceManager in an ASP.NET application. I hope this way of fetching the index is compatible with the DirectoryTaxonomyWriter needed by facets.
I then modified my code to introduce the taxonomy writer as indicated in the example, but I am a bit confused: it seems I can't store the DirectoryTaxonomyWriter in the same folder as the index because the folder is locked. Do I need to persist it, or will it be embedded into the index (so that a RAMDirectory is enough)? If I need to persist it in a different directory, can I safely persist it into a subdirectory?
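For reference, here is a minimal sketch of how I read the facets example, with the taxonomy kept in a separate directory; the "taxonomy" subfolder is my assumption, not something the example mandates. I also noticed SearcherTaxonomyManager in the API, which might be the ReferenceManager counterpart for an index/taxonomy pair, but I am not sure.
// Sketch of the pattern from the facets example as I understand it.
// analyzer and targetDirectory are the same as in my code below.
using (var indexDir = FSDirectory.Open(targetDirectory))
using (var taxoDir = FSDirectory.Open(Path.Combine(targetDirectory, "taxonomy")))
using (var writer = new IndexWriter(indexDir, new IndexWriterConfig(LuceneVersion.LUCENE_48, analyzer)))
using (var taxoWriter = new DirectoryTaxonomyWriter(taxoDir))
{
    var config = new FacetsConfig();
    var doc = new Document();
    doc.Add(new FacetField("Category", "SomeValue"));
    // Build() maps the facet labels through the taxonomy before the document is indexed.
    writer.AddDocument(config.Build(taxoWriter, doc));
    writer.Commit();
    taxoWriter.Commit();
}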
Here is the code I am currently using:
private static void BuildIndex (IndexEntry entry)
{
string targetFolder = ConfigurationManager.AppSettings["IndexFolder"] ?? string.Empty;
//** LOG
if (System.IO.Directory.Exists(targetFolder) == false)
{
string message = #"Index folder not found";
_fileLogger.Error(message);
_consoleLogger.Error(message);
return;
}
var metadata = JsonConvert.DeserializeObject<IndexMetadata>(File.ReadAllText(entry.MetdataPath) ?? "{}");
string[] header = new string[0];
List<dynamic> csvRecords = new List<dynamic>();
using (var reader = new StreamReader(entry.DataPath))
{
CsvConfiguration csvConfiguration = new CsvConfiguration(CultureInfo.InvariantCulture);
csvConfiguration.AllowComments = false;
csvConfiguration.CountBytes = false;
csvConfiguration.Delimiter = ",";
csvConfiguration.DetectColumnCountChanges = false;
csvConfiguration.Encoding = Encoding.UTF8;
csvConfiguration.HasHeaderRecord = true;
csvConfiguration.IgnoreBlankLines = true;
csvConfiguration.HeaderValidated = null;
csvConfiguration.MissingFieldFound = null;
csvConfiguration.TrimOptions = CsvHelper.Configuration.TrimOptions.None;
csvConfiguration.BadDataFound = null;
using (var csvReader = new CsvReader(reader, csvConfiguration))
{
csvReader.Read();
csvReader.ReadHeader();
csvReader.Read();
header = csvReader.HeaderRecord;
csvRecords = csvReader.GetRecords<dynamic>().ToList();
}
}
string targetDirectory = Path.Combine(targetFolder, "Index__" + metadata.Boundle + "__" + DateTime.Now.ToString("yyyyMMdd_HHmmss") + "__" + Path.GetRandomFileName().Substring(0, 6));
System.IO.Directory.CreateDirectory(targetDirectory);
//** LOG
{
string message = #"..creating index : {0}";
_fileLogger.Information(message, targetDirectory);
_consoleLogger.Information(message, targetDirectory);
}
using (var dir = FSDirectory.Open(targetDirectory))
{
using (DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(dir))
{
Analyzer analyzer = metadata.GetAnalyzer();
var indexConfig = new IndexWriterConfig(LuceneVersion.LUCENE_48, analyzer);
using (IndexWriter writer = new IndexWriter(dir, indexConfig))
{
long entryNumber = csvRecords.Count();
long index = 0;
long lastPercentage = 0;
foreach (dynamic csvEntry in csvRecords)
{
Document doc = new Document();
IDictionary<string, object> dynamicCsvEntry = (IDictionary<string, object>)csvEntry;
var indexedMetadataFields = metadata.IdexedFields;
foreach (string headField in header)
{
if (indexedMetadataFields.ContainsKey(headField) == false || (indexedMetadataFields[headField].NeedToBeIndexed == false && indexedMetadataFields[headField].NeedToBeStored == false))
continue;
var field = new Field(headField,
((string)dynamicCsvEntry[headField] ?? string.Empty).ToLower(),
indexedMetadataFields[headField].NeedToBeStored ? Field.Store.YES : Field.Store.NO,
indexedMetadataFields[headField].NeedToBeIndexed ? Field.Index.ANALYZED : Field.Index.NO
);
doc.Add(field);
var facetField = new FacetField(headField, (string)dynamicCsvEntry[headField]);
doc.Add(facetField);
}
long percentage = (long)(((decimal)index / (decimal)entryNumber) * 100m);
if (percentage > lastPercentage && percentage % 10 == 0)
{
_consoleLogger.Information($"..indexing {percentage}%..");
lastPercentage = percentage;
}
writer.AddDocument(doc);
index++;
}
writer.Commit();
}
}
}
//** LOG
{
string message = #"Index Created : {0}";
_fileLogger.Information(message, targetDirectory);
_consoleLogger.Information(message, targetDirectory);
}
}
I was using this piece of code till today and it was working fine:
for (int page = 1; page <= reader.NumberOfPages; page++)
{
var cpage = reader.GetPageN(page);
var content = cpage.Get(PdfName.CONTENTS);
var ir = (PRIndirectReference)content;
var value = reader.GetPdfObject(ir.Number);
if (value.IsStream())
{
PRStream stream = (PRStream)value;
var streamBytes = PdfReader.GetStreamBytes(stream);
var tokenizer = new PRTokeniser(new RandomAccessFileOrArray(streamBytes));
try
{
while (tokenizer.NextToken())
{
if (tokenizer.TokenType == PRTokeniser.TK_STRING)
{
string strs = tokenizer.StringValue;
if (!(br = excludeList.Any(st => strs.Contains(st))))
{
//strfor += tokenizer.StringValue;
if (!string.IsNullOrWhiteSpace(strs) &&
!stringsList.Any(i => i == strs && excludeHeaders.Contains(strs)))
stringsList.Add(strs);
}
}
}
}
finally
{
tokenizer.Close();
}
}
}
But today I got an exception for some PDF file: Unable to cast object of type 'iTextSharp.text.pdf.PdfArray' to type 'iTextSharp.text.pdf.PRIndirectReference'.
On debugging I found that the error is at this line: var ir = (PRIndirectReference)content;. That's because, for this file, the page content I'm extracting comes back as a PdfArray of stream references rather than a single indirect reference.
I would be really grateful if anyone could help me with this. Thanks in advance.
EDIT :
The PDF contents are paragraphs, tables, headers & footers, and images in a few cases. But I'm not bothered about images as I'm bypassing them.
As you can see from the code, I'm trying to add the words into a string list, so I expect the output to be plain text; words, to be specific.
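For reference, here is a minimal sketch of how I think the /Contents entry could be inspected before casting, since it can apparently be either a single stream reference or an array of them (ProcessStream is a hypothetical helper standing in for the tokenizer loop above):
var content = cpage.GetDirectObject(PdfName.CONTENTS);
if (content is PRStream singleStream)
{
    ProcessStream(singleStream);             // hypothetical helper
}
else if (content is PdfArray contentArray)
{
    for (int i = 0; i < contentArray.Size; i++)
    {
        // each element should resolve to a content stream
        if (contentArray.GetDirectObject(i) is PRStream part)
            ProcessStream(part);             // hypothetical helper
    }
}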
That was really easy! I don't know why I couldn't figure it out.
PdfReader reader = new PdfReader(name);
List<string> stringsList = new List<string>();
for (int page = 1; page <= reader.NumberOfPages; page++)
{
//directly get the contents into a byte stream
var streamByte = reader.GetPageContent(page);
var tokenizer = new PRTokeniser(new RandomAccessFileOrArray(streamByte));
var sb = new StringBuilder(); //use a string builder instead
try
{
while (tokenizer.NextToken())
{
if (tokenizer.TokenType == PRTokeniser.TK_STRING)
{
var currentText = tokenizer.StringValue;
currentText = Encoding.UTF8.GetString(ASCIIEncoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(currentText)));
sb.Append(currentText); // append the converted text, not the raw token value
}
}
}
finally
{
//add appended strings into a string list
if(sb != null)
stringsList.Add(sb.ToString());
tokenizer.Close();
}
}
I have this C# code working with iTextSharp 5 and I need to port it to iText 7.
public static PdfReader Fix(PdfReader pdfReader, int pagina)
{
var dic = pdfReader.GetPageN(pagina);
var resources = dic.GetAsDict(PdfName.Resources);
var fonts = resources?.GetAsDict(PdfName.Font);
if (fonts == null) return pdfReader;
foreach (var key in fonts.Keys)
{
var font = fonts.GetAsDict(key);
var firstChar = font.Get(PdfName.FirstChar);
if (firstChar == null)
font.Put(PdfName.FirstChar, new PdfNumber(32));
var lastChar = font.Get(PdfName.LastChar);
if (lastChar == null)
font.Put(PdfName.LastChar, new PdfNumber(255));
var widths = font.GetAsArray(PdfName.Widths);
if (widths != null) continue;
var array = Enumerable.Repeat(600, 256).ToArray();
font.Put(PdfName.Widths, new PdfArray(array));
}
return pdfReader;
}
The problem I have is that the method GetPageN in this line:
var dic = pdfReader.GetPageN(pagina);
has been removed.
Has anyone faced the same problem?
Indeed, the GetPage() method is now on the PdfDocument class.
There are also some small changes in how you get the dictionary entries from the document, which I took the liberty of adjusting your code for.
public static PdfReader Fix(PdfReader pdfReader, int pagina)
{
var dic = new PdfDocument(pdfReader).GetPage(pagina);
var resources = dic.GetPdfObject().GetAsDictionary(PdfName.Resources);
var fonts = resources?.GetAsDictionary(PdfName.Font);
if (fonts == null) return pdfReader;
foreach (var key in fonts.KeySet())
{
var font = fonts.GetAsDictionary(key);
var firstChar = font.Get(PdfName.FirstChar);
if (firstChar == null)
font.Put(PdfName.FirstChar, new PdfNumber(32));
var lastChar = font.Get(PdfName.LastChar);
if (lastChar == null)
font.Put(PdfName.LastChar, new PdfNumber(255));
var widths = font.GetAsArray(PdfName.Widths);
if (widths != null) continue;
var array = Enumerable.Repeat(600, 256).ToArray();
font.Put(PdfName.Widths, new PdfArray(array));
}
return pdfReader;
}
(I haven't checked your code, just made sure that at least what you posted now compiles)
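One hedged follow-up note: in iText 7, changes made with Put are only written back to a file when the document is opened in stamping mode, i.e. with both a reader and a writer. A minimal sketch, with placeholder file names:
// Sketch: "input.pdf" / "output.pdf" are placeholder names.
using (var pdfDoc = new PdfDocument(new PdfReader("input.pdf"), new PdfWriter("output.pdf")))
{
    // Apply the same font fixes as in Fix(), but against pdfDoc,
    // so they are flushed to output.pdf when pdfDoc is disposed.
    var resources = pdfDoc.GetPage(1).GetPdfObject().GetAsDictionary(PdfName.Resources);
    var fonts = resources?.GetAsDictionary(PdfName.Font);
    // ... same loop over fonts as above ...
}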
I created a tool which iterates through all commits of a repository.
It then diffs each commit against all of its parents and reads the content for some checks.
It turns out that this slows down very quickly. It always slows down after the same specific commit, which is quite large as it is a merge commit.
Here is how I iterate through the commits. The code below is slightly simplified for focus.
var repo = new Repository(path);
foreach (LibGit2Sharp.Commit commit in repo.Commits)
{
IEnumerable<FileChanges> changed = repo.GetChangedToAllParents(commit);
var files = ResolveChangeFileInfos(changed);
var entry = new Commit(commit.Id.ToString(), commit.Author.Email, commit.Committer.When, commit.Message, files);
yield return entry;
}
In GetChangedToAllParents I basically make a diff for each parent, like this:
foreach (var parent in commit.Parents)
{
var options = new CompareOptions
{
Algorithm = DiffAlgorithm.Minimal,
IncludeUnmodified = false
};
var patches = repo.Diff.Compare<Patch>(parent.Tree, commit.Tree, options); // Difference
}
and later I read the content of the files in this way:
var blob = repo.Lookup<Blob>(patchEntry.Oid.Sha); // Find blob
Stream contentStream = blob.GetContentStream();
string result = null;
using (var tr = new StreamReader(contentStream, Encoding.UTF8))
{
result = tr.ReadToEnd();
}
Are there any known issues? Am I missing any leaks?
Update
I found out that most of the time (about 90%) is taken by the diff, and it gets constantly slower:
var options = new CompareOptions
{
Algorithm = DiffAlgorithm.Minimal,
IncludeUnmodified = false
};
var patches = repo.Diff.Compare<Patch>(parent.Tree, commit.Tree, options); // Difference
I can reproduce it with this code:
var repo = new Repository(path);
foreach (var commit in repo.Commits)
{
pos++;
if (pos%100 == 0)
{
Console.WriteLine(pos);
}
var options = new CompareOptions
{
Algorithm = DiffAlgorithm.Minimal,
IncludeUnmodified = false,
Similarity = new SimilarityOptions
{
RenameDetectionMode = RenameDetectionMode.None,
WhitespaceMode = WhitespaceMode.IgnoreAllWhitespace
}
};
foreach (var parent in commit.Parents)
{
var changedFiles=
repo.Diff.Compare<TreeChanges>(parent.Tree, commit.Tree, options).ToList();
}
}
It reserves about 500 MB for every 1000 commits and at some point it just crashes. So I also posted it here:
https://github.com/libgit2/libgit2sharp/issues/1359
Is there a faster way to get all the files that were changed in a specific commit?
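One thing I am experimenting with, assuming a LibGit2Sharp version where the diff results implement IDisposable (newer releases expose TreeChanges and Patch that way), is disposing each diff result as soon as it has been consumed so the native memory does not accumulate:
using (var repo = new Repository(path))
{
    var options = new CompareOptions
    {
        Algorithm = DiffAlgorithm.Minimal,
        IncludeUnmodified = false
    };
    foreach (var commit in repo.Commits)
    {
        foreach (var parent in commit.Parents)
        {
            // Dispose the TreeChanges right away to release the native diff memory.
            using (var changes = repo.Diff.Compare<TreeChanges>(parent.Tree, commit.Tree, options))
            {
                foreach (var change in changes)
                {
                    // change.Path and change.Status are enough to know which files changed.
                }
            }
        }
    }
}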
In an ASP.NET MVC4 application, I'm using the following code to process a GoToWebinar Attendees report (CSV format).
For some reason, the file that is being loaded is not being released by IIS and it is causing issues when attempting to process another file.
Do you see anything out of the ordinary here?
The CSVHelper (CsvReader) is from https://joshclose.github.io/CsvHelper/
public AttendeesData GetRecords(string filename, string webinarKey)
{
StreamReader sr = new StreamReader(Server.MapPath(filename));
CsvReader csvread = new CsvReader(sr);
csvread.Configuration.HasHeaderRecord = false;
List<AttendeeRecord> record = csvread.GetRecords<AttendeeRecord>().ToList();
record.RemoveRange(0, 7);
AttendeesData attdata = new AttendeesData();
attdata.Attendees = new List<Attendee>();
foreach (var rec in record)
{
Attendee aa = new Attendee();
aa.Webinarkey = webinarKey;
aa.FullName = String.Concat(rec.First_Name, " ", rec.Last_Name);
aa.AttendedWebinar = 0;
aa.Email = rec.Email_Address;
aa.JoinTime = rec.Join_Time.Replace(" CST", "");
aa.LeaveTime = rec.Leave_Time.Replace(" CST", "");
aa.TimeInSession = rec.Time_in_Session.Replace("hour", "hr").Replace("minute", "min");
aa.Makeup = 0;
aa.RegistrantKey = Registrants.Where(x => x.email == rec.Email_Address).FirstOrDefault().registrantKey;
List<string> firstPolls = new List<string>()
{
rec.Poll_1.Trim(), rec.Poll_2.Trim(),rec.Poll_3.Trim(),rec.Poll_4.Trim()
};
int pass1 = firstPolls.Count(x => x != "");
List<string> secondPolls = new List<string>()
{
rec.Poll_5.Trim(), rec.Poll_6.Trim(),rec.Poll_7.Trim(),rec.Poll_8.Trim()
};
int pass2 = secondPolls.Count(x => x != "");
aa.FirstPollCount = pass1;
aa.SecondPollCount = pass2;
if (aa.TimeInSession != "")
{
aa.AttendedWebinar = 1;
}
if (aa.FirstPollCount == 0 || aa.SecondPollCount == 0)
{
aa.AttendedWebinar = 0;
}
attdata.Attendees.Add(aa);
attendeeToDB(aa); // adds to Oracle DB using EF6.
}
// Should I call csvread.Dispose() here?
sr.Close();
return attdata;
}
Yes. You have to dispose those objects too.
sr.Close();
csvread.Dispose();
sr.Dispose();
A better strategy is to use the using keyword (see the sketch below).
You should use using blocks for your stream readers and writers.
You should follow naming conventions (lists always contain multiple entries, so rename record to records).
You should use clear names (not aa).
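A minimal sketch of the using pattern applied to the top of GetRecords, keeping the rest of the method body as it is (only the reader setup changes):
using (var sr = new StreamReader(Server.MapPath(filename)))
using (var csvread = new CsvReader(sr))
{
    csvread.Configuration.HasHeaderRecord = false;
    var records = csvread.GetRecords<AttendeeRecord>().ToList();
    records.RemoveRange(0, 7);
    // ... build AttendeesData exactly as before ...
}   // both readers are disposed here, so the file handle is released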