I'm a little bit lost with PDF standards for data extraction.
I'm using iText and looking for extraction of clickable HTML link within a PDF + Text that contains those links.
I achieved to get HTML links, but how can I get text from those links ?
public static List<string> linkCollector(string pdfPath)
{
List<string> linkCollection = new List<string>();
var reader = new PdfReader(pdfPath);
PdfDocument pdfDoc = new PdfDocument(reader);
for (int page = 1; page <= pdfDoc.GetNumberOfPages(); page++)
{
var pageDict = pdfDoc.GetPage(page);
var annotList = pageDict.GetAnnotations();
for (int i=0;i< annotList.Count(); i++)
{
PdfObject B = annotList[i].GetPdfObject();
var annotDict = B as PdfDictionary;
var linkDict = (PdfDictionary)annotDict.GetAsDictionary(PdfName.A);
var sUri = linkDict.Get(PdfName.URI).ToString();
linkCollection.Add(sUri);
}
}
return linkCollection;
}
Related
I was using this piece of code till today and it was working fine:
for (int page = 1; page <= reader.NumberOfPages; page++)
{
var cpage = reader.GetPageN(page);
var content = cpage.Get(PdfName.CONTENTS);
var ir = (PRIndirectReference)content;
var value = reader.GetPdfObject(ir.Number);
if (value.IsStream())
{
PRStream stream = (PRStream)value;
var streamBytes = PdfReader.GetStreamBytes(stream);
var tokenizer = new PRTokeniser(new RandomAccessFileOrArray(streamBytes));
try
{
while (tokenizer.NextToken())
{
if (tokenizer.TokenType == PRTokeniser.TK_STRING)
{
string strs = tokenizer.StringValue;
if (!(br = excludeList.Any(st => strs.Contains(st))))
{
//strfor += tokenizer.StringValue;
if (!string.IsNullOrWhiteSpace(strs) &&
!stringsList.Any(i => i == strs && excludeHeaders.Contains(strs)))
stringsList.Add(strs);
}
}
}
}
finally
{
tokenizer.Close();
}
}
}
But today I got an exception for some pdf file: Unable to cast object of type 'iTextSharp.text.pdf.PdfArray' to type 'iTextSharp.text.pdf.PRIndirectReference
On debugging I got to know that the error is at this line: var ir = (PRIndirectReference)content;. That's because the pdf content that I'm extracting, I get it in the form of ArrayList, as you can see from the below image:
It would be really grateful if anyone can help me with this. Thanks in advance.
EDIT :
The pdf contents are paragraphs, tables, headers & footers, images in few cases. But I'm not bothered of images as I'm bypassing them.
As you can see from the code I'm trying to add the words into a string list, so I expect the output as plain text; words to be specific.
That was real easy! Don't know why I couldn't make out.
PdfReader reader = new PdfReader(name);
List<string> stringsList = new List<string>();
for (int page = 1; page <= reader.NumberOfPages; page++)
{
//directly get the contents into a byte stream
var streamByte = reader.GetPageContent(page);
var tokenizer = new PRTokeniser(new RandomAccessFileOrArray(streamByte));
var sb = new StringBuilder(); //use a string builder instead
try
{
while (tokenizer.NextToken())
{
if (tokenizer.TokenType == PRTokeniser.TK_STRING)
{
var currentText = tokenizer.StringValue;
currentText = Encoding.UTF8.GetString(ASCIIEncoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(currentText)));
sb.Append(tokenizer.StringValue);
}
}
}
finally
{
//add appended strings into a string list
if(sb != null)
stringsList.Add(sb.ToString());
tokenizer.Close();
}
}
I have an input string which is html. It contains images and I want to change the src property on the img
My code so far is as below:
if (htmlStr.Contains("img"))
{
var html = new HtmlDocument();
html.LoadHtml(htmlStr);
var images = html.DocumentNode.SelectNodes("//img");
if (images != null && images.Count > 0)
{
for (int i = 0; i < images.Count; i++)
{
string imageSrc = images[i].Attributes["src"].Value;
string newSrc = "MyNewValue";
images[i].SetAttributeValue("src", newSrc);
}
}
//htmlStr= ???
}
return htmlStr;
What I am missing is how to update the htmlStr I am returning with the newSrc value each image.
As far as I can tell, you have two options:
// Will give you a raw string.
// Not ideal if you are planning to
// send this over the network, or save as a file.
var updatedStr = html.DocumentNode.OuterHtml;
// Will let you write to any stream.
// Here, I'm just writing to a string builder as an example.
var sb = new StringBuilder();
using (var writer = new StringWriter(sb))
{
html.Save(writer);
}
// These two methods generate the same result, though.
Debug.Assert(string.Equals(updatedStr, sb.ToString()));
I'm struggling with flattening using iTextSharp 5.5.10. This code represent a variety of efforts to flatten this PDF following code found on SO and Google searches.
void Process()
{
PdfReader reader = new PdfReader(mInFileName);
// 1 type of fields
AcroFields formFields = reader.AcroFields;
int countAcroFields = formFields.Fields.Count;
// 2 another type of fields
PRAcroForm form = reader.AcroForm;
List<PRAcroForm.FieldInformation> fieldList = form.Fields;
// 3 i think this is the same as 2
PdfDictionary acroForm = reader.Catalog.GetAsDict(PdfName.ACROFORM);
PdfArray acroFields = (PdfArray)PdfReader.GetPdfObject(acroForm.Get(PdfName.FIELDS), acroForm);
// 4 another type of fields
XfaForm xfaForm = formFields.Xfa;
bool flatten = false;
if (countAcroFields > 0)
{
//No fields found
}
else if (fieldList.Count > 0)
{
//No fields found
}
else if (xfaForm.XfaPresent == true)
{
//No xfaForms found
ReadXfa(reader);
}
else
{
// Yes, there are annotations and this code extracts them but does NOT flatten them
PdfDictionary page = reader.GetPageN(1);
PdfArray annotsArray = page.GetAsArray(PdfName.ANNOTS);
if ( annotsArray == null)
return;
else
{
List<string> namedFieldToFlatten = new List<string>();
foreach (var item in annotsArray)
{
var annot = (PdfDictionary)PdfReader.GetPdfObject(item);
var type = PdfReader.GetPdfObject(annot.Get(PdfName.TYPE)).ToString(); //Expecting be /Annot
var subtype = PdfReader.GetPdfObject(annot.Get(PdfName.SUBTYPE)).ToString(); //Expecting be /Widget
var fieldType = PdfReader.GetPdfObject(annot.Get(PdfName.FT)).ToString(); //Expecting be /Tx
if (annot.Get(PdfName.TYPE).Equals(PdfName.ANNOT) &&
annot.Get(PdfName.SUBTYPE).Equals(PdfName.WIDGET))
{
if (annot.Get(PdfName.FT).Equals(PdfName.TX))
{
flatten = true;
var textLabel = PdfReader.GetPdfObject(annot.Get(PdfName.T)).ToString(); //Name of textbox field
namedFieldToFlatten.Add(textLabel);
var fieldValue = PdfReader.GetPdfObject(annot.Get(PdfName.V)).ToString(); //Value of textbox
Console.WriteLine($"Found Label={textLabel} Value={fieldValue}");
}
}
}
if (flatten == true)
{
// Flatten the PDF [11/9/2016 15:10:06]
string foldername = Path.GetDirectoryName(mInFileName);
string basename = Path.GetFileNameWithoutExtension(mInFileName);
string outName = $"{foldername}\\{basename}_flat.pdf";
using (var fStream = new FileStream(outName, FileMode.Create))
{
//This totally removes the fields instead of flattening them
var stamper = new PdfStamper(reader, fStream) { FormFlattening = true, FreeTextFlattening = true, AnnotationFlattening = true };
var stamperForm = stamper.AcroFields;
stamperForm.GenerateAppearances = true;
foreach (var item in namedFieldToFlatten)
{
stamper.PartialFormFlattening(item);
}
stamper.Close();
reader.Close();
}
}
}
}
}
Any tips on how to turn the "fields" into text as flatten does in Acrobat?
This is the PDF I'm trying to flatten.
This is what the output of this code.
The problem is that your form is broken. It indeed has /Annots, and those annotations are widget annotations that also have entries that are meant to be entries in a field dictionary, but there is no form in the PDF. Or rather: there is a form, but the /Fields array is empty.
As the /Fields array is empty, there are no fields to flatten. The widget annotations are removed and you don't see anything. This is not an iText bug: you need to fix the form before filling it out.
In Java, you'd fix the form like this:
PdfReader reader = new PdfReader(src);
PdfDictionary root = reader.getCatalog();
PdfDictionary form = root.getAsDict(PdfName.ACROFORM);
PdfArray fields = form.getAsArray(PdfName.FIELDS);
PdfDictionary page;
PdfArray annots;
for (int i = 1; i <= reader.getNumberOfPages(); i++) {
page = reader.getPageN(i);
annots = page.getAsArray(PdfName.ANNOTS);
for (int j = 0; j < annots.size(); j++) {
fields.add(annots.getAsIndirectObject(j));
}
}
PdfStamper stamper = new PdfStamper(reader, new FileOutputStream(dest));
stamper.close();
reader.close();
It should be fairly easy to port this to C#. I'm not a C# developer, but this is an (untested) attempt at making a port:
PdfReader reader = new PdfReader(src);
PdfDictionary root = reader.Catalog;
PdfDictionary form = root.GetAsDict(PdfName.ACROFORM);
PdfArray fields = form.GetAsArray(PdfName.FIELDS);
PdfDictionary page;
PdfArray annots;
for (int i = 1; i <= reader.NumberOfPages; i++) {
page = reader.GetPageN(i);
annots = page.GetAsArray(PdfName.ANNOTS);
for (int j = 0; j < annots.Size; j++) {
fields.Add(annots.GetAsIndirectObject(j));
}
}
PdfStamper stamper = new PdfStamper(reader, new FileStream(dest, FileMode.Create));
stamper.Close();
reader.Close();
Once you've fixed the form like this, you'll be able to flatten it correctly.
I have a pdf with buttons that take you out to web links. I used iTextSharp to split these into separate PDFs (1 per page) per outside requirements. ISSUE: Any button that has multiple positions, lost the actions.
QUESTION: Does anyone know how to update these actions? I can open the new file, but I'm not sure how to go about using the PdfStamper to add an AA to this Annotation
So when opening the original file, you could get to the Additional Action by doing this:
var r = new PdfReader(f.FullName);
var positionsOfThisButton = r.AcroFields.GetFieldPositions("14");
var field = r.AcroForm.GetField("14")
var targetObject = PdfReader.GetPdfObject(field.Ref);
var kids = targetObject.GetAsArray(PdfName.KIDS);
foreach (var k in kids){
var ko = (PdfDictionary)(k.IsIndirect() ? PdfReader.GetPdfObject(k) : k);
var aaObj = ko.Get(PdfName.AA);
//(aaObj is NULL in the new file)
var aa = (PdfDictionary)(aaObj.IsIndirect() ? PdfReader.GetPdfObject(aaObj) : aaObj);
var dObj = aa.Get(PdfName.D);
var d = (PdfDictionary)(dObj.IsIndirect() ? PdfReader.GetPdfObject(dObj) : dObj);
Debug.WriteLine("S:" + d.GetAsName(PdfName.S).ToString() );
//returns S:/Uri
Debug.WriteLine("URI:" + d.GetAsString(PdfName.URI).ToString() );
//returns URI:http://www.somesite.com/etc
}
Thanks for any help.
FYI ONLY - The following is how I split the files:
List<byte[]> Get(FileInfo f) {
List<byte[]> outputFiles = new List<byte[]>();
var reader = new PdfReader(f.FullName);
int n = reader.NumberOfPages;
reader.Close();
for (int i = n; i > 0; i--) {
reader = new PdfReader(f.FullName);
using (var document = new Document(reader.GetPageSizeWithRotation(1))) {
using (var outputStream = new MemoryStream()) {
using (var writer = new PdfCopy(document, outputStream)) {
writer.SetMergeFields();
writer.PdfVersion = '6';
document.Open();
writer.AddDocument(reader, new List<int> { i });
document.Close();
writer.Close();
}
outputFiles.Insert(0, outputStream.ToArray());
}
}
reader.Close();
}
return outputFiles;
}
The project is in C# and use iTextSharp.
I have a dictionary with a title (string) and file content (byte array). I loop through this dictionary and merge all files together. What I need now is to add bookmarks to the start of the first page in each file, but I should not add any new pages or text to the final document. I have tried different solutions, but all seem to add a table of contents page, a new page before each page or some text at the start of the page.
None of the files have bookmarks originally.
I am looking for a bookmarks structure that looks something like this:
File1
File2
SomeCategory
File3
File4
I would very much appreciate it if anyone could point me in the right direction.
My function for merging the files looks like this:
/// <summary>
/// Merge PDF files, and stamp certificates. This is a modified version of the example in the link below.
/// See: http://www.codeproject.com/Articles/28283/Simple-NET-PDF-Merger for more information.
/// </summary>
/// <param name="sourceFiles">Files to be merged</param>
/// <returns>Byte array with the combined files.</returns>
public static byte[] MergeFiles(Dictionary<string, byte[]> sourceFiles)
{
var document = new Document();
var output = new MemoryStream();
try
{
// Initialize pdf writer
var writer = PdfWriter.GetInstance(document, output);
writer.PageEvent = new PdfPageEvents();
// Open document to write
document.Open();
var content = writer.DirectContent;
// Iterate through all pdf documents
foreach (var sourceFile in sourceFiles)
{
// Create pdf reader
var reader = new PdfReader(sourceFile.Value);
var numberOfPages = reader.NumberOfPages;
// Iterate through all pages
for (var currentPageIndex = 1; currentPageIndex <=
numberOfPages; currentPageIndex++)
{
// Determine page size for the current page
document.SetPageSize(
reader.GetPageSizeWithRotation(currentPageIndex));
// Create page
document.NewPage();
var importedPage =
writer.GetImportedPage(reader, currentPageIndex);
// Determine page orientation
var pageOrientation = reader.GetPageRotation(currentPageIndex);
if ((pageOrientation == 90) || (pageOrientation == 270))
{
content.AddTemplate(importedPage, 0, -1f, 1f, 0, 0,
reader.GetPageSizeWithRotation(currentPageIndex).Height);
}
else
{
content.AddTemplate(importedPage, 1f, 0, 0, 1f, 0, 0);
}
// Add stamp to certificates
if (sourceFile.Key.IsValidDocumentReference())
AddStamp(content, document, sourceFile.Key, currentPageIndex, numberOfPages);
}
}
}
catch (Exception exception)
{
throw new Exception("An unexpected exception occured during the merging process", exception);
}
finally
{
document.Close();
}
return output.GetBuffer();
}
Thanks to Bruno Lowagie who pointed me in the right direction, I was able to produce a solution to my problem.
This is my solution:
public static byte[] MergeFilesAndAddBookmarks(Dictionary<PrintDocument, byte[]> sourceFiles)
{
using (var ms = new MemoryStream())
{
using (var document = new Document())
{
using (var copy = new PdfCopy(document, ms))
{
//Order the files by chapternumber
var files = sourceFiles.GroupBy(f => f.Key.ChapterNumber);
document.Open();
var outlines = new List<Dictionary<string, object>>();
var pageIndex = 1;
foreach (var chapterGroup in files)
{
var map = new Dictionary<string, object>();
outlines.Add(map);
map.Add("Title", chapterGroup.First().Key.ChapterName);
var kids = new List<Dictionary<string, object>>();
map.Add("Kids", kids);
foreach (var sourceFile in chapterGroup)
{
using (var reader = new PdfReader(sourceFile.Value))
{
// add the pages
var n = reader.NumberOfPages;
for (var page = 0; page < n;)
{
if (page == 0)
{
var kid = new Dictionary<string, object>();
kids.Add(kid);
kid["Title"] = sourceFile.Key.Title;
kid["Action"] = "GoTo";
kid["Page"] = String.Format("{0} Fit", pageIndex);
}
copy.AddPage(copy.GetImportedPage(reader, ++page));
}
pageIndex += n;
reader.Close();
}
}
}
copy.Outlines = outlines;
document.Close();
copy.Close();
ms.Close();
}
}
return ms.ToArray();
}
}
}
public class PrintDocument
{
public string Title { get; set; }
public string ChapterName { get; set; }
public int ChapterNumber { get; set; }
}