I have a PDF with buttons that take you out to web links. I used iTextSharp to split it into separate PDFs (one per page) to meet outside requirements. ISSUE: Any button that has multiple positions loses its actions.
QUESTION: Does anyone know how to restore these actions? I can open the new file, but I'm not sure how to go about using PdfStamper to add an /AA entry back onto the annotation.
When opening the original file, you can get to the additional action by doing this:
var r = new PdfReader(f.FullName);
var positionsOfThisButton = r.AcroFields.GetFieldPositions("14");
var field = r.AcroForm.GetField("14");
var targetObject = PdfReader.GetPdfObject(field.Ref);
var kids = targetObject.GetAsArray(PdfName.KIDS);
foreach (var k in kids){
var ko = (PdfDictionary)(k.IsIndirect() ? PdfReader.GetPdfObject(k) : k);
var aaObj = ko.Get(PdfName.AA);
//(aaObj is NULL in the new file)
var aa = (PdfDictionary)(aaObj.IsIndirect() ? PdfReader.GetPdfObject(aaObj) : aaObj);
var dObj = aa.Get(PdfName.D);
var d = (PdfDictionary)(dObj.IsIndirect() ? PdfReader.GetPdfObject(dObj) : dObj);
Debug.WriteLine("S:" + d.GetAsName(PdfName.S).ToString() );
//returns S:/Uri
Debug.WriteLine("URI:" + d.GetAsString(PdfName.URI).ToString() );
//returns URI:http://www.somesite.com/etc
}
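What I think I need is something along these lines with PdfStamper, re-attaching an /AA entry to each widget of the field in the split file. This is only an untested sketch; the field name "14", the URL, and the file-name variables are just the values from above:
var stampReader = new PdfReader(splitFileName);
using (var fs = new FileStream(outputFileName, FileMode.Create))
using (var stamper = new PdfStamper(stampReader, fs))
{
    // Every widget (position) of the field gets its own /AA dictionary back.
    AcroFields.Item item = stampReader.AcroFields.GetFieldItem("14");
    for (int i = 0; i < item.Size; i++)
    {
        PdfDictionary widget = item.GetWidget(i); // the widget annotation dictionary
        var aa = new PdfDictionary();
        aa.Put(PdfName.D, new PdfAction("http://www.somesite.com/etc")); // /S /URI action on mouse-down
        widget.Put(PdfName.AA, aa);
    }
}
stampReader.Close();
Is that roughly the right approach, or is there a better way to do this with PdfStamper?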
Thanks for any help.
FYI ONLY - The following is how I split the files:
List<byte[]> Get(FileInfo f) {
List<byte[]> outputFiles = new List<byte[]>();
var reader = new PdfReader(f.FullName);
int n = reader.NumberOfPages;
reader.Close();
for (int i = n; i > 0; i--) {
reader = new PdfReader(f.FullName);
using (var document = new Document(reader.GetPageSizeWithRotation(1))) {
using (var outputStream = new MemoryStream()) {
using (var writer = new PdfCopy(document, outputStream)) {
writer.SetMergeFields();
writer.PdfVersion = '6';
document.Open();
writer.AddDocument(reader, new List<int> { i });
document.Close();
writer.Close();
}
outputFiles.Insert(0, outputStream.ToArray());
}
}
reader.Close();
}
return outputFiles;
}
I have difficulty understanding this example of how to use facets:
https://lucenenet.apache.org/docs/4.8.0-beta00008/api/Lucene.Net.Demo/Lucene.Net.Demo.Facet.SimpleFacetsExample.html
My goal is to create an index in which each document field has a facet, so that at search time I can choose which facets to use to navigate the data.
What I am confused about is the setup of facets at index-creation time. To summarize my question: is an index with facets compatible with ReferenceManager?
Does the DirectoryTaxonomyWriter actually need to be written and persisted to disk, or is it embedded into the index itself and just temporary? Given the line indexWriter.AddDocument(config.Build(taxoWriter, doc)); from the example, I expected it to be temporary and embedded into the index (but then the example also shows you need the taxonomy to drill down into facets). So can the taxonomy be tied to the index in some way so that the two are handled together by ReferenceManager?
If not, may I just use the same folder I use for storing the index?
Here is a more detailed list of the points that confuse me:
In my scenario I index documents asynchronously (in a background process) and then fetch the index as soon as possible through a ReferenceManager in an ASP.NET application. I hope this way of fetching the index is compatible with the DirectoryTaxonomyWriter needed by facets.
I then modified my code to introduce the taxonomy writer as indicated in the example, but I am a bit confused: it seems I can't store the DirectoryTaxonomyWriter in the same folder as the index because the folder is locked. Do I need to persist it, or is it embedded into the index (so that a RAMDirectory is enough)? If I need to persist it in a different directory, can I safely persist it into a subdirectory?
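To make the intent concrete, this is roughly the setup I have in mind. It is untested and based only on my reading of the Lucene.NET 4.8 facet API: the index in an "index" subfolder, the taxonomy in a sibling "taxonomy" subfolder, and a SearcherTaxonomyManager (which, as far as I understand, is the facet-aware counterpart of ReferenceManager<IndexSearcher>) to refresh both together:
// Needs the Lucene.Net.Facet.* namespaces in addition to the usual index/search/store/util ones.
private static void IndexWithFacets(string targetDirectory, Analyzer analyzer, IEnumerable<Document> docs)
{
    using (var indexDir = FSDirectory.Open(Path.Combine(targetDirectory, "index")))
    using (var taxoDir = FSDirectory.Open(Path.Combine(targetDirectory, "taxonomy")))
    using (var indexWriter = new IndexWriter(indexDir, new IndexWriterConfig(LuceneVersion.LUCENE_48, analyzer)))
    using (var taxoWriter = new DirectoryTaxonomyWriter(taxoDir))
    {
        var facetsConfig = new FacetsConfig();
        foreach (var doc in docs)
        {
            // Build() translates the FacetFields into taxonomy ordinals,
            // so the taxonomy writer has to be committed alongside the index writer.
            indexWriter.AddDocument(facetsConfig.Build(taxoWriter, doc));
        }
        indexWriter.Commit();
        taxoWriter.Commit();

        // Near-real-time view over both the index and the taxonomy.
        var manager = new SearcherTaxonomyManager(indexWriter, true, null, taxoWriter);
        var searcherAndTaxonomy = manager.Acquire();
        try
        {
            IndexSearcher searcher = searcherAndTaxonomy.Searcher;
            TaxonomyReader taxoReader = searcherAndTaxonomy.TaxonomyReader;
            // ... run FacetsCollector / DrillDownQuery searches here ...
        }
        finally
        {
            manager.Release(searcherAndTaxonomy);
        }
    }
}
Is this the intended pattern, and is a persisted "taxonomy" folder (instead of a RAMDirectory) actually required?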
Here is the code I am actually using:
private static void BuildIndex (IndexEntry entry)
{
string targetFolder = ConfigurationManager.AppSettings["IndexFolder"] ?? string.Empty;
//** LOG
if (System.IO.Directory.Exists(targetFolder) == false)
{
string message = #"Index folder not found";
_fileLogger.Error(message);
_consoleLogger.Error(message);
return;
}
var metadata = JsonConvert.DeserializeObject<IndexMetadata>(File.ReadAllText(entry.MetdataPath) ?? "{}");
string[] header = new string[0];
List<dynamic> csvRecords = new List<dynamic>();
using (var reader = new StreamReader(entry.DataPath))
{
CsvConfiguration csvConfiguration = new CsvConfiguration(CultureInfo.InvariantCulture);
csvConfiguration.AllowComments = false;
csvConfiguration.CountBytes = false;
csvConfiguration.Delimiter = ",";
csvConfiguration.DetectColumnCountChanges = false;
csvConfiguration.Encoding = Encoding.UTF8;
csvConfiguration.HasHeaderRecord = true;
csvConfiguration.IgnoreBlankLines = true;
csvConfiguration.HeaderValidated = null;
csvConfiguration.MissingFieldFound = null;
csvConfiguration.TrimOptions = CsvHelper.Configuration.TrimOptions.None;
csvConfiguration.BadDataFound = null;
using (var csvReader = new CsvReader(reader, csvConfiguration))
{
csvReader.Read();
csvReader.ReadHeader();
csvReader.Read();
header = csvReader.HeaderRecord;
csvRecords = csvReader.GetRecords<dynamic>().ToList();
}
}
string targetDirectory = Path.Combine(targetFolder, "Index__" + metadata.Boundle + "__" + DateTime.Now.ToString("yyyyMMdd_HHmmss") + "__" + Path.GetRandomFileName().Substring(0, 6));
System.IO.Directory.CreateDirectory(targetDirectory);
//** LOG
{
string message = #"..creating index : {0}";
_fileLogger.Information(message, targetDirectory);
_consoleLogger.Information(message, targetDirectory);
}
using (var dir = FSDirectory.Open(targetDirectory))
{
using (DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(dir))
{
Analyzer analyzer = metadata.GetAnalyzer();
var indexConfig = new IndexWriterConfig(LuceneVersion.LUCENE_48, analyzer);
using (IndexWriter writer = new IndexWriter(dir, indexConfig))
{
long entryNumber = csvRecords.Count();
long index = 0;
long lastPercentage = 0;
foreach (dynamic csvEntry in csvRecords)
{
Document doc = new Document();
IDictionary<string, object> dynamicCsvEntry = (IDictionary<string, object>)csvEntry;
var indexedMetadataFiled = metadata.IdexedFields;
foreach (string headField in header)
{
if (indexedMetadataFiled.ContainsKey(headField) == false || (indexedMetadataFiled[headField].NeedToBeIndexed == false && indexedMetadataFiled[headField].NeedToBeStored == false))
continue;
var field = new Field(headField,
((string)dynamicCsvEntry[headField] ?? string.Empty).ToLower(),
indexedMetadataFiled[headField].NeedToBeStored ? Field.Store.YES : Field.Store.NO,
indexedMetadataFiled[headField].NeedToBeIndexed ? Field.Index.ANALYZED : Field.Index.NO
);
doc.Add(field);
var facetField = new FacetField(headField, (string)dynamicCsvEntry[headField]);
doc.Add(facetField);
}
long percentage = (long)(((decimal)index / (decimal)entryNumber) * 100m);
if (percentage > lastPercentage && percentage % 10 == 0)
{
_consoleLogger.Information($"..indexing {percentage}%..");
lastPercentage = percentage;
}
writer.AddDocument(doc);
index++;
}
writer.Commit();
}
}
}
//** LOG
{
string message = #"Index Created : {0}";
_fileLogger.Information(message, targetDirectory);
_consoleLogger.Information(message, targetDirectory);
}
}
I'm trying to extract some text from an image using Tesseract, and I've noticed that if I divide the image into 9 smaller pieces the results are more accurate. What I'm trying to accomplish is to process all 9 images at once (in parallel), and this is how I wanted to do it:
private static int GetImageText(Image src)
{
string[] words = { words-to-check };
List<string> found = new();
string path = Environment.CurrentDirectory;
try
{
using (var engine = new TesseractEngine(path, "eng", EngineMode.LstmOnly))
{
Parallel.ForEach(CutUpImage(src), (img) =>
{
using (var ms = new MemoryStream())
{
img.Save(ms, System.Drawing.Imaging.ImageFormat.Jpeg);
using (var loadedImg = Pix.LoadFromMemory(ms.ToArray()))
using (var page = engine.Process(loadedImg))
{
var c = page.GetText();
foreach (var word in words) if (c.Contains(word)) found.Add(word);
}
}
});
return found.Count;
}
}
catch (Exception ex)
{
throw ex;
}
}
But I'm getting an error: "Only one image can be processed at once. Please make sure you dispose of the page once your finished with it."
So I had to move the new TesseractEngine inside the loop, like this:
private static int GetImageText(Image src)
{
string[] words = { words-to-check };
List<string> found = new();
string path = Environment.CurrentDirectory;
Parallel.ForEach(CutUpImage(src), (img) =>
{
using (var engine = new TesseractEngine(path, "eng", EngineMode.LstmOnly))
{
using (var ms = new MemoryStream())
{
img.Save(ms, System.Drawing.Imaging.ImageFormat.Jpeg);
using (var loadedImg = Pix.LoadFromMemory(ms.ToArray()))
using (var page = engine.Process(loadedImg))
{
var c = page.GetText();
foreach (var word in words) if (c.Contains(word)) found.Add(word);
}
}
}
});
return found.Count;
}
But then it takes a full minute to finish processing all the images.
So my question is: how do I make the TesseractEngine work outside the loop, and, more generally, how do I make this faster?
OK, so the solution to my problem was simple... don't use parallel processing!
I switched the Parallel.ForEach to a traditional foreach (I don't know why I decided to try parallel processing first...), and it now takes 12 seconds to process them all. This is the code:
private static int GetImageText(Image src)
{
string[] words = { words-to-check };
List<string> found = new();
string path = Environment.CurrentDirectory;
using (var engine = new TesseractEngine(path, "eng", EngineMode.LstmOnly))
using (var ms = new MemoryStream())
foreach (var img in CutUpImage(src))
{
img.Save(ms, System.Drawing.Imaging.ImageFormat.Jpeg);
using (var loadedImg = Pix.LoadFromMemory(ms.ToArray()))
using (var page = engine.Process(loadedImg))
{
var c = page.GetText();
foreach (var word in words) if (c.Contains(word)) found.Add(word);
}
ms.SetLength(0);
}
return found.Count;
}
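For what it's worth, the "Only one image can be processed at once" error comes from sharing a single engine across threads, so if anyone still wants the parallel route, one option I considered (but have not benchmarked) is giving each worker its own engine via Parallel.ForEach's localInit overload and collecting matches in a ConcurrentBag instead of a List. An untested sketch, with the word list passed in as a parameter:
// Needs System.Collections.Concurrent and System.Threading.Tasks.
private static int GetImageTextParallel(Image src, string[] words)
{
    var found = new ConcurrentBag<string>();     // thread-safe, unlike List<string>
    string path = Environment.CurrentDirectory;
    Parallel.ForEach(
        CutUpImage(src),
        () => new TesseractEngine(path, "eng", EngineMode.LstmOnly),  // one engine per worker
        (img, state, engine) =>
        {
            using (var ms = new MemoryStream())
            {
                img.Save(ms, System.Drawing.Imaging.ImageFormat.Jpeg);
                using (var loadedImg = Pix.LoadFromMemory(ms.ToArray()))
                using (var page = engine.Process(loadedImg))
                {
                    var text = page.GetText();
                    foreach (var word in words)
                        if (text.Contains(word)) found.Add(word);
                }
            }
            return engine;                        // keep the engine for this worker's next image
        },
        engine => engine.Dispose());              // dispose each worker's engine at the end
    return found.Count;
}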
P.S. This is the CutUpImage code, in case anyone ever wants to use it:
private static Image[] CutUpImage(Image src)
{
int widthThird = (int)((double)src.Width / 3.0 + 0.5);
int heightThird = (int)((double)src.Height / 3.0 + 0.5);
var imgarray = new Image[9];
for (int i = 0; i < 3; i++)
for (int j = 0; j < 3; j++)
{
var index = i*3+j;
imgarray[index] = new Bitmap(widthThird, heightThird);
Graphics g = Graphics.FromImage(imgarray[index]);
g.DrawImage(src, new Rectangle(0, 0, widthThird, heightThird),
new Rectangle(j * widthThird, i * heightThird, widthThird, heightThird),
GraphicsUnit.Pixel);
g.Dispose();
}
return imgarray;
}
My code is below, but I cannot get the result I need.
DataTable dtWB = new DataTable();
dtWB.Columns.Add("Name");
dtWB.Columns.Add("DOS");
dtWB.Columns.Add("CPT");
dtWB.AcceptChanges();
DataRow drWB = null;
// Auto-detect format, supports:
byte[] ExcelData = File.ReadAllBytes(AppDomain.CurrentDomain.BaseDirectory + path);
MemoryStream stream = new MemoryStream(ExcelData);
using (var reader = ExcelReaderFactory.CreateReader(stream))
{
// Choose one of either 1 or 2:
// 1. Use the reader methods
do
{
while (reader.Read())
{
// reader.GetDouble(0);
}
} while (reader.NextResult());
// 2. Use the AsDataSet extension method
var result = reader.AsDataSet();
//The result of each spreadsheet is in result.Tables
var Tables = result.Tables;
var CT = Tables.Count;
DataTable WBdt = Tables[1];
}
I have data in an Excel sheet like this:
https://i.stack.imgur.com/nZl4K.png
I want to show the data like this using C#:
https://i.stack.imgur.com/BjzZd.png
Your "Code" simply consists of the ExcelDataReader documentation sample. This will parse the dataset into a collection with the format you requested.
var records = new Dictionary<string, Dictionary<DateTime, double?>>();
using (var stream = File.Open(filePath, FileMode.Open, FileAccess.Read))
{
using (var reader = ExcelReaderFactory.CreateReader(stream))
{
var firstTable = reader.AsDataSet().Tables[0];
for (int i = 1; i < firstTable.Rows.Count; i++)
{
string name = firstTable.Rows[i].Field<string>(0);
var dt_values = new Dictionary<DateTime, double?>();
for (int j = 1; j < firstTable.Columns.Count; j++)
{
DateTime dt = firstTable.Rows[0].Field<DateTime>(j);
double? value = firstTable.Rows[i].Field<double?>(j);
dt_values.Add(dt, value);
}
records.Add(name, dt_values);
}
}
}
var formatted_data = records.SelectMany(x => x.Value.Select(y => new
{
Name = x.Key,
Dos = y.Key.ToShortDateString(),
CPT = y.Value
}));
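If you then need it in the DataTable from your question (Name, DOS, CPT columns), a short follow-up could look like this (untested):
var dtWB = new DataTable();
dtWB.Columns.Add("Name");
dtWB.Columns.Add("DOS");
dtWB.Columns.Add("CPT");
foreach (var row in formatted_data)
{
    // One DataRow per (name, date) pair from the flattened projection above.
    DataRow drWB = dtWB.NewRow();
    drWB["Name"] = row.Name;
    drWB["DOS"] = row.Dos;
    drWB["CPT"] = row.CPT.HasValue ? row.CPT.Value.ToString() : string.Empty;
    dtWB.Rows.Add(drWB);
}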
In an ASP.NET MVC4 application, I'm using the following code to process a Go To Webinar attendees report (CSV format).
For some reason, the file that is being loaded is not being released by IIS and it is causing issues when attempting to process another file.
Do you see anything out of the ordinary here?
The CSVHelper (CsvReader) is from https://joshclose.github.io/CsvHelper/
public AttendeesData GetRecords(string filename, string webinarKey)
{
StreamReader sr = new StreamReader(Server.MapPath(filename));
CsvReader csvread = new CsvReader(sr);
csvread.Configuration.HasHeaderRecord = false;
List<AttendeeRecord> record = csvread.GetRecords<AttendeeRecord>().ToList();
record.RemoveRange(0, 7);
AttendeesData attdata = new AttendeesData();
attdata.Attendees = new List<Attendee>();
foreach (var rec in record)
{
Attendee aa = new Attendee();
aa.Webinarkey = webinarKey;
aa.FullName = String.Concat(rec.First_Name, " ", rec.Last_Name);
aa.AttendedWebinar = 0;
aa.Email = rec.Email_Address;
aa.JoinTime = rec.Join_Time.Replace(" CST", "");
aa.LeaveTime = rec.Leave_Time.Replace(" CST", "");
aa.TimeInSession = rec.Time_in_Session.Replace("hour", "hr").Replace("minute", "min");
aa.Makeup = 0;
aa.RegistrantKey = Registrants.Where(x => x.email == rec.Email_Address).FirstOrDefault().registrantKey;
List<string> firstPolls = new List<string>()
{
rec.Poll_1.Trim(), rec.Poll_2.Trim(),rec.Poll_3.Trim(),rec.Poll_4.Trim()
};
int pass1 = firstPolls.Count(x => x != "");
List<string> secondPolls = new List<string>()
{
rec.Poll_5.Trim(), rec.Poll_6.Trim(),rec.Poll_7.Trim(),rec.Poll_8.Trim()
};
int pass2 = secondPolls.Count(x => x != "");
aa.FirstPollCount = pass1;
aa.SecondPollCount = pass2;
if (aa.TimeInSession != "")
{
aa.AttendedWebinar = 1;
}
if (aa.FirstPollCount == 0 || aa.SecondPollCount == 0)
{
aa.AttendedWebinar = 0;
}
attdata.Attendees.Add(aa);
attendeeToDB(aa); // adds to Oracle DB using EF6.
}
// Should I call csvread.Dispose() here?
sr.Close();
return attdata;
}
Yes. You have to dispose of those objects too.
sr.Close();
csvread.Dispose();
sr.Dispose();
A better strategy is to use the using keyword.
You should use usings for your stream readers and writers.
You should follow naming conventions (a List always contains multiple entries, so rename record to records).
You should use clear names (not aa).
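For example, the reading part of GetRecords could be rewritten along these lines (an untested sketch applying the suggestions above), so the file handle is released even if an exception is thrown:
public AttendeesData GetRecords(string filename, string webinarKey)
{
    List<AttendeeRecord> records;
    using (var sr = new StreamReader(Server.MapPath(filename)))
    using (var csvread = new CsvReader(sr))
    {
        csvread.Configuration.HasHeaderRecord = false;
        records = csvread.GetRecords<AttendeeRecord>().ToList();
    } // both the CsvReader and the StreamReader are disposed here, releasing the file
    records.RemoveRange(0, 7);
    var attdata = new AttendeesData { Attendees = new List<Attendee>() };
    foreach (var rec in records)
    {
        // ... build each Attendee from rec exactly as in the question ...
    }
    return attdata;
}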
The project is in C# and uses iTextSharp.
I have a dictionary with a title (string) and the file content (byte array). I loop through this dictionary and merge all the files together. What I need now is to add bookmarks that point to the start of the first page of each file, but I should not add any new pages or text to the final document. I have tried different solutions, but they all seem to add a table of contents page, a new page before each page, or some text at the start of the page.
None of the files have bookmarks originally.
I am looking for a bookmarks structure that looks something like this:
File1
File2
SomeCategory
    File3
    File4
I would very much appreciate it if anyone could point me in the right direction.
My function for merging the files looks like this:
/// <summary>
/// Merge PDF files, and stamp certificates. This is a modified version of the example in the link below.
/// See: http://www.codeproject.com/Articles/28283/Simple-NET-PDF-Merger for more information.
/// </summary>
/// <param name="sourceFiles">Files to be merged</param>
/// <returns>Byte array with the combined files.</returns>
public static byte[] MergeFiles(Dictionary<string, byte[]> sourceFiles)
{
var document = new Document();
var output = new MemoryStream();
try
{
// Initialize pdf writer
var writer = PdfWriter.GetInstance(document, output);
writer.PageEvent = new PdfPageEvents();
// Open document to write
document.Open();
var content = writer.DirectContent;
// Iterate through all pdf documents
foreach (var sourceFile in sourceFiles)
{
// Create pdf reader
var reader = new PdfReader(sourceFile.Value);
var numberOfPages = reader.NumberOfPages;
// Iterate through all pages
for (var currentPageIndex = 1; currentPageIndex <=
numberOfPages; currentPageIndex++)
{
// Determine page size for the current page
document.SetPageSize(
reader.GetPageSizeWithRotation(currentPageIndex));
// Create page
document.NewPage();
var importedPage =
writer.GetImportedPage(reader, currentPageIndex);
// Determine page orientation
var pageOrientation = reader.GetPageRotation(currentPageIndex);
if ((pageOrientation == 90) || (pageOrientation == 270))
{
content.AddTemplate(importedPage, 0, -1f, 1f, 0, 0,
reader.GetPageSizeWithRotation(currentPageIndex).Height);
}
else
{
content.AddTemplate(importedPage, 1f, 0, 0, 1f, 0, 0);
}
// Add stamp to certificates
if (sourceFile.Key.IsValidDocumentReference())
AddStamp(content, document, sourceFile.Key, currentPageIndex, numberOfPages);
}
}
}
catch (Exception exception)
{
throw new Exception("An unexpected exception occured during the merging process", exception);
}
finally
{
document.Close();
}
return output.GetBuffer();
}
Thanks to Bruno Lowagie, who pointed me in the right direction, I was able to produce a solution to my problem.
This is my solution:
public static byte[] MergeFilesAndAddBookmarks(Dictionary<PrintDocument, byte[]> sourceFiles)
{
using (var ms = new MemoryStream())
{
using (var document = new Document())
{
using (var copy = new PdfCopy(document, ms))
{
// Group the files by chapter number
var files = sourceFiles.GroupBy(f => f.Key.ChapterNumber);
document.Open();
var outlines = new List<Dictionary<string, object>>();
var pageIndex = 1;
foreach (var chapterGroup in files)
{
var map = new Dictionary<string, object>();
outlines.Add(map);
map.Add("Title", chapterGroup.First().Key.ChapterName);
var kids = new List<Dictionary<string, object>>();
map.Add("Kids", kids);
foreach (var sourceFile in chapterGroup)
{
using (var reader = new PdfReader(sourceFile.Value))
{
// add the pages
var n = reader.NumberOfPages;
for (var page = 0; page < n;)
{
if (page == 0)
{
var kid = new Dictionary<string, object>();
kids.Add(kid);
kid["Title"] = sourceFile.Key.Title;
kid["Action"] = "GoTo";
kid["Page"] = String.Format("{0} Fit", pageIndex);
}
copy.AddPage(copy.GetImportedPage(reader, ++page));
}
pageIndex += n;
reader.Close();
}
}
}
copy.Outlines = outlines;
document.Close();
copy.Close();
ms.Close();
}
}
return ms.ToArray();
}
}
}
public class PrintDocument
{
public string Title { get; set; }
public string ChapterName { get; set; }
public int ChapterNumber { get; set; }
}
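For reference, calling it could look something like this (the file names, titles, and chapter values here are made up):
// Hypothetical usage of MergeFilesAndAddBookmarks; chapters drive the outline grouping.
var sourceFiles = new Dictionary<PrintDocument, byte[]>
{
    { new PrintDocument { Title = "File1", ChapterName = "General", ChapterNumber = 1 }, File.ReadAllBytes("file1.pdf") },
    { new PrintDocument { Title = "File2", ChapterName = "General", ChapterNumber = 1 }, File.ReadAllBytes("file2.pdf") },
    { new PrintDocument { Title = "File3", ChapterName = "SomeCategory", ChapterNumber = 2 }, File.ReadAllBytes("file3.pdf") },
    { new PrintDocument { Title = "File4", ChapterName = "SomeCategory", ChapterNumber = 2 }, File.ReadAllBytes("file4.pdf") }
};
byte[] merged = MergeFilesAndAddBookmarks(sourceFiles);
File.WriteAllBytes("merged.pdf", merged);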