I need to create a single PDF from a few HTML pages. The HTML pages contain tables, and each table has a different number of columns, so the PDF has to be exported with different page orientations.
Eg :
htmlPg1 --> 4 columns
htmlPg2 --> 15 columns
In the scenario above, the first HTML page needs to be rendered in portrait mode and the second one in landscape.
In 'Code block 02' below, lst is a list whose items have 2 properties (please see 'Code Block 01').
If an item's Oriantation is 0, it is treated as landscape; otherwise it is treated as portrait.
Code Block 01
/// <summary>
/// One entry of the export list: an HTML fragment plus the numeric flag that
/// selects the page orientation used when the fragment is rendered.
/// </summary>
public class PdfExportDoc
{
    /// <summary>
    /// Numeric orientation flag. Which value means landscape is decided by the
    /// export loop that consumes this object, not by this class.
    /// </summary>
    public int Oriantation { set; get; }

    /// <summary>HTML markup to be converted.</summary>
    public string Html { set; get; }
}
All are working correctly except the orientation.
Code Block 02
// Renders every entry of lst into one in-memory PDF and copies the finished
// document into bytes.
using (var ms = new MemoryStream())
{
using (var doc = new Document())
{
using (var writer = PdfWriter.GetInstance(doc, ms))
{
doc.Open();
foreach (var ele in lst)
{
using (var srHtml = new StringReader(ele.Html))
{
// Oriantation == 0 selects rotated (landscape) A4; any other value selects upright A4.
if (ele.Oriantation == 0)
{
doc.SetPageSize(PageSize.A4.Rotate());
}
else
{
doc.SetPageSize(PageSize.A4);
}
// NOTE(review): no new page is requested between SetPageSize and the parse
// call, so the size presumably only applies to pages started later and this
// entry begins on a page that still has the previous size — confirm against
// the iTextSharp Document.SetPageSize / NewPage behaviour.
XMLWorkerHelper.GetInstance().ParseXHtml(writer, doc, srHtml);
// Request a page break after each entry.
doc.NewPage();
}
}
doc.Close();
}
}
// Copy the generated PDF out of the memory stream.
bytes = ms.ToArray();
}
The htmlPg1 data stretches over 2 pages and htmlPg2 takes only one. This is how the data currently comes out in the PDF.
But actually I need the pdf like this.
Please show me a direction for doing this.
This solved my problem.
I get your point, Bruno. You said in your deleted answer that NewPage will not add a new page if the current page is blank. So I added doc.NewPage both before and after the ParseXHtml call. Anyway, thanks for your earlier direction.
// Renders each entry of lst with its own page size, asking for a page break
// both before and after the HTML is parsed.
foreach (var ele in lst)
{
    using (var srHtml = new StringReader(ele.Html))
    {
        // Oriantation == 1 selects rotated (landscape) A4; everything else upright A4.
        var pageSize = ele.Oriantation == 1
            ? PageSize.A4.Rotate()
            : PageSize.A4;
        doc.SetPageSize(pageSize);

        // Page break requested before the content so the size chosen above is
        // the one this entry starts with (see the explanation in the text).
        doc.NewPage();
        XMLWorkerHelper.GetInstance().ParseXHtml(writer, doc, srHtml);
        doc.NewPage();
    }
}
Related
I'm having difficulty understanding how to obtain the content from a PdfDocument. I've learned from previous questions that PdfDocument flushes the content to optimize working with large documents. If my function returns a new PdfDocument, how do I get the byte[] to pass into my other functions?
Even with PdfDocument.GetReader() - I can't seem to find what I'm looking for.
My use-case is as follows:
Get pdf content from an email attachment
Pass the pdf to a helper function, which extracts specific pages from the initial attachment
Pass the new PdfDocument into a function which calls Azure's Forms Recognizer API to read the fields into an object
To summarize: given a PdfDocument only, how can I get/create a byte[] from it?
Here is my code:
/// <summary>
/// Handles a received e-mail: takes the first attachment, extracts the relevant
/// pages, has them analyzed, and stores the analyzed document.
/// </summary>
/// <param name="command">Command carrying the e-mail attachments.</param>
/// <param name="cancellationToken">Cancellation token (not used by the visible code).</param>
public async Task<BaseResponse> Handle(ReceiveEmailCommand command, CancellationToken cancellationToken) {
    // Fixed: this statement was missing its terminating semicolon.
    var ms = new MemoryStream(command.attachments.First().Content);
    var extractedDocument = pdfService.PreparePdfDocument(ms);
    var analyzedDocument = await formsRecognizerService.AnalyzeDocument(extractedDocument);
    // Do stuff with the analyzed document...
    var response = await FileWebService.AddAnalyzedDocumentToFileSystem(analyzedDocument);
    // NOTE(review): no value is returned although the method is declared as
    // Task<BaseResponse>; the type of response is not visible here, so the
    // missing return statement is left for the author to supply.
}
The function AnalyzeDocument expects a Stream parameter. I want to pass something like
new Stream(extractedDocument.GetReader().Stream)
Helper function implementations are below:
/// <summary>
/// Opens the PDF contained in <paramref name="ms"/> and copies the pages of
/// interest into a new <c>PdfDocument</c>.
/// </summary>
/// <remarks>
/// Pages whose parsed number equals "125" define the range to copy (first such
/// page to last such page). When no such page is found, the first 4 pages are
/// copied, falling back to 3 and then 2 pages when the copy fails with an
/// <c>ITextException</c>.
/// </remarks>
/// <param name="ms">Stream holding the source PDF.</param>
/// <returns>The document returned by <c>SplitPages</c>.</returns>
public PdfDocument PreparePdfDocument(MemoryStream ms)
{
    const string targetNumber = "125";

    // The previously created (and never used) layout Document has been removed.
    var pdf = new PdfDocument(new PdfReader(ms));

    var pageRange = GetNumberWithPages(pdf)
        .Where(x => x.Number == targetNumber)
        .Select(x => Convert.ToInt32(x.PageIndex))
        .ToList();

    // Previously only "no matches at all" led to the fallback; matches that did
    // not include the target number made First() throw InvalidOperationException.
    if (pageRange.Count > 0)
    {
        return SplitPages(pdf, pageRange.First(), pageRange.Last());
    }

    // If we couldn't parse the PDF then just take first 4, 3 or 2 pages.
    foreach (var lastPage in new[] { 4, 3 })
    {
        try
        {
            return SplitPages(pdf, 1, lastPage);
        }
        catch (ITextException)
        {
            // Deliberate best effort: retry with a shorter range.
        }
    }

    // Last attempt: any failure here propagates to the caller unchanged.
    return SplitPages(pdf, 1, 2);
}
/// <summary>
/// Scans every page of <paramref name="doc"/> for the number/version/"Page x of y"
/// footer pattern and returns one entry per page on which it was found and
/// accepted by <c>EvaluateMatch</c>.
/// </summary>
/// <param name="doc">Document whose page text is extracted.</param>
/// <returns>The accepted matches in page order; empty when nothing matched.</returns>
private static List<Match> GetNumberWithPages(PdfDocument doc)
{
    // Fixed: the verbatim-string prefix was garbled to '#'.
    // NOTE(review): the page group "([0-9])" accepts a single digit only, so
    // "Page 10 of 12" would not match — confirm whether that is intended.
    var regex = new Regex(@"\s+([0-9]+)\s+(\([0-9]+\/[0-9]+\))\s+Page\s+([0-9])\s+of\s+([0-9]+)");
    var matches = new List<Match>();
    var totalPages = doc.GetNumberOfPages();

    for (int pageNumber = 1; pageNumber <= totalPages; pageNumber++)
    {
        var text = PdfTextExtractor.GetTextFromPage(doc.GetPage(pageNumber));
        if (string.IsNullOrEmpty(text))
        {
            continue;
        }

        var regexMatch = regex.Match(text);
        if (!regexMatch.Success)
        {
            continue;
        }

        // Fixed: the result used to be declared as a second local named "match"
        // inside the scope of the first one, which does not compile.
        var evaluated = EvaluateMatch(regexMatch, pageNumber, totalPages);
        if (evaluated != null)
        {
            matches.Add(evaluated);
        }
    }

    return matches;
}
/// <summary>
/// Converts a successful regex match into a result entry for the given page.
/// </summary>
/// <param name="match">Regex match; must have exactly one capture and five groups.</param>
/// <param name="pageIndex">Number of the page the match was found on.</param>
/// <param name="totalPages">Total number of pages in the document.</param>
/// <returns>The populated entry, or <c>null</c> when the match has an unexpected shape.</returns>
private static Match? EvaluateMatch(Match match, int pageIndex, int totalPages)
{
    if (match.Captures.Count != 1 || match.Groups.Count != 5)
    {
        return null;
    }

    // Fixed: the result used to be stored in a local also called "match",
    // which shadows the parameter (compile error) and would have made the
    // initializer read the groups from the wrong object.
    // NOTE(review): the parameter and the result share the type name "Match";
    // presumably one is the regex type and the other a project type — confirm
    // the namespaces/aliases used by the real file.
    return new Match
    {
        Number = match.Groups[1].Value,
        Version = match.Groups[2].Value,
        PageIndex = pageIndex.ToString(),
        TotalPages = totalPages.ToString()
    };
}
/// <summary>
/// Copies the pages <paramref name="startIndex"/> through <paramref name="endIndex"/>
/// of <paramref name="doc"/> into a document obtained from <c>CreatePdfDocument</c>.
/// </summary>
/// <returns>The target document; it is returned without being closed.</returns>
public PdfDocument SplitPages(PdfDocument doc, int startIndex, int endIndex)
{
    var target = CreatePdfDocument();

    doc.CopyPagesTo(startIndex, endIndex, target);

    return target;
}
/// <summary>
/// Creates an empty <c>PdfDocument</c> whose writer targets a new
/// <c>ByteArrayOutputStream</c>. The stream is not exposed to the caller.
/// </summary>
public PdfDocument CreatePdfDocument()
{
    return new PdfDocument(new PdfWriter(new ByteArrayOutputStream()));
}
I'm having difficulty understanding how to obtain the content from a PdfDocument.
You don't!
When you create a PdfDocument to write to, you initialize it with a PdfWriter. That PdfWriter in turn has been initialized to write somewhere. If you want to access the final PDF, you have to close the PdfDocument and look at that somewhere. Also it is not easy to retrieve that somewhere from the PdfWriter as it is wrapped in a number of layers therein. Thus, you should keep a reference to that somewhere close by.
Thus, your ByteArrayOutputStream usually wouldn't be created hidden in some method CreatePdfDocument but instead in the base method and forwarded to other methods as parameter. Then you can eventually retrieve its data. If you need to create your ByteArrayOutputStream hidden like that, you can return a Pair of PdfDocument and ByteArrayOutputStream instead of the plain PdfDocument.
By the way, the idea behind this architecture is that iText tries to write as much PDF content as possible to that somewhere output as early as possible and free the memory. This allows it to create large documents without requiring a similarly large amount of memory.
when I return the stream I cannot access a closed stream
The ByteArrayOutputStream essentially is a MemoryStream; so you can in particular call ToArray to retrieve the finished PDF even if it's closed.
If you need the ByteArrayOutputStream as a regular stream, simply call PdfWriter.SetCloseStream(false) for your writer to prevent the close of the PdfDocument from also closing the stream.
I want to change the links in a PDF file so that they use a visible rectangle:
If I use Acrobat, I can edit the link type to "Visible Rectangle".
But how can I change this value with iText7?
I tried the following settings, but it is not working:
// Opens C:\test1.pdf, restyles every URI link annotation on page 1 and writes
// the result to C:\result.pdf.
// Fixed: the verbatim-string prefix of both paths was garbled to '#'.
string strPage = @"C:\test1.pdf";
string strPageNew = @"C:\result.pdf";
PdfReader reader = new PdfReader(strPage);
using (PdfWriter writer = new PdfWriter(strPageNew))
using (PdfDocument pdfDoc = new PdfDocument(reader, writer))
{
    PdfPage pdfPage = pdfDoc.GetPage(1);
    var annotations = pdfPage.GetAnnotations();
    if (annotations != null)
    {
        foreach (PdfAnnotation a in annotations)
        {
            // Constant-first Equals: a missing subtype no longer throws.
            if (!PdfName.Link.Equals(a.GetSubtype()))
            {
                continue;
            }

            PdfLinkAnnotation link = (PdfLinkAnnotation)a;
            var action = link.GetAction();
            if (action == null || !PdfName.URI.Equals(action.Get(PdfName.S)))
            {
                continue;
            }

            // PdfAnnotation.PRINT replaces the magic number 4 used before.
            link.SetFlag(PdfAnnotation.PRINT);
            link.SetHighlightMode(PdfAnnotation.HIGHLIGHT_OUTLINE);
            link.SetBorderStyle(PdfAnnotation.STYLE_SOLID);
            link.SetColor(iText.Kernel.Colors.ColorConstants.RED);
        }
    }
}
reader.Close();
I am trying to merge several Word documents using OpenXML in ASP.NET MVC 5, but I constantly get a message from Microsoft Word that the document is corrupt.
// Opens the template file for the given document type and returns the stream
// produced by documentGenerator.Generate().
private Stream GenerateDocument(DocumentType documentType)
{
// The template file stays open only for the duration of this using block.
using (var templateStream = File.OpenRead(GetTemplatePath(documentType)))
{
//some code
// NOTE(review): documentGenerator is not declared in the visible code;
// presumably it is built from templateStream in the elided part — confirm.
var result = documentGenerator.Generate();
return result;
}
}
// Generates one document per entry of documentLibrary.DocumentTypes and merges
// their bodies into the first generated document, whose stream is returned.
private Stream MergeDocuments(DocumentLibraryModel documentLibrary)
{
var documentTypes = documentLibrary.DocumentTypes.GetEnumerator();
// NOTE(review): Current is read before the first MoveNext() call, so it
// presumably does not yet refer to the first element — confirm.
var mainStream = GenerateDocument(documentTypes.Current);
using (WordprocessingDocument mainDocument = WordprocessingDocument.Open(mainStream, true))
{
// Working copy of the main body as an XElement.
XElement newBody = XElement.Parse(mainDocument.MainDocumentPart.Document.Body.OuterXml);
documentTypes.MoveNext();
while (documentTypes.MoveNext())
{
// NOTE(review): tempDocument is never disposed in the visible code.
WordprocessingDocument tempDocument = WordprocessingDocument.Open(GenerateDocument(documentTypes.Current), true);
XElement tempBody = XElement.Parse(tempDocument.MainDocumentPart.Document.Body.OuterXml);
// Adds the other document's whole body element as a child of the main body.
newBody.Add(tempBody);
mainDocument.MainDocumentPart.Document.Body = new Body(newBody.ToString());
mainDocument.MainDocumentPart.Document.Save();
mainDocument.Package.Flush();
}
}
return mainStream;
}
However the document opens as corrupted.
Any ideas?
Problem lies in this:
XElement tempBody = XElement.Parse(tempDocument.MainDocumentPart.Document.Body.OuterXml);
newBody.Add(tempBody);
You are adding a body inside a body, which produces an invalid Word document. A Word document can contain only one Body at a time.
I would recommend cloning elements instead of parsing XML.
You can do this:
// Appends deep clones of every body-level element of the remaining documents
// to the main document, then saves the main document once.
using (WordprocessingDocument mainDocument = WordprocessingDocument.Open(mainStream, true))
{
    // NOTE(review): assigning a fresh Body replaces whatever the main document
    // contained before — confirm that dropping that content is intended.
    mainDocument.MainDocumentPart.Document.Body = new Body();

    // Skip the first document type, then walk the remaining ones.
    documentTypes.MoveNext();
    while (documentTypes.MoveNext())
    {
        // Fixed: Open(Stream) has no single-argument overload; the source
        // documents are only read, so they are opened non-editable.
        using (WordprocessingDocument tempDocument = WordprocessingDocument.Open(GenerateDocument(documentTypes.Current), false))
        {
            // Fixed: Elements is a method, not a property.
            foreach (var element in tempDocument.MainDocumentPart.Document.Body.Elements())
            {
                // An element can have only one parent, hence the deep clone.
                mainDocument.MainDocumentPart.Document.Body.AppendChild(element.CloneNode(true));
            }
        }
    }

    mainDocument.MainDocumentPart.Document.Save();
}
I'm trying to generate PDF reports using iTextSharp with customer information, header and footer etc. All these reports are already generated using EVO APIs. As part of a migration process, we are planning to generate these reports using iTextSharp APIs.
I need to know if there is any possibility to provide a ready to render HTML string to iTextSharp PDF header (Existing EVO design accepts HTML string and build PDF), instead of using PageEvents to design with PDFPTable and PDFPCell (as the number of reports are huge and to avoid rework)
I need to know if there is any possibility to provide a ready to render HTML string to iTextSharp PDF header (Existing EVO design accepts HTML string and build PDF), instead of using PageEvents to design with PDFPTable and PDFPCell
You will have to use page events to draw header or footers but there is no need to use PdfPTable explicitly there. You actually can render html during a page event, e.g. like this:
/// <summary>
/// Creates a two-page PDF in which every page header is rendered from an HTML
/// table by <c>HtmlPageEventHelper</c>.
/// </summary>
[Test]
public void CreatePdfWithHtmlHeader()
{
    string htmlHeader = "<!DOCTYPE html><html><body><table style=\"width: 100%; border: 1px solid black;\"><tr><td>A</td><td>B</td></tr></table></body></html>";

    // Fixed: the verbatim-string prefix of the output path was garbled to '#'.
    using (FileStream output = new FileStream(@"C:\Temp\test-results\content\html-header.pdf", FileMode.Create, FileAccess.Write))
    using (Document document = new Document(PageSize.A4))
    {
        PdfWriter writer = PdfWriter.GetInstance(document, output);
        // The page event must be registered before content is added.
        writer.PageEvent = new HtmlPageEventHelper(htmlHeader);

        document.Open();
        document.Add(new Paragraph("1"));
        document.NewPage();
        document.Add(new Paragraph("2"));
    }
}
making use the following two small helper classes.
HtmlPageEventHelper is a page event listener that draws a given HTML snippet into the page header. Obviously it can alternatively or additionally write into the page footer; simply use the appropriate column coordinates.
/// <summary>
/// Page event handler that renders a fixed HTML string into a column at the
/// top of every page when the page ends.
/// </summary>
public class HtmlPageEventHelper : PdfPageEventHelper
{
    private readonly string _html;

    /// <param name="html">HTML markup to render on each page.</param>
    public HtmlPageEventHelper(string html)
    {
        _html = html;
    }

    /// <summary>
    /// Parses the HTML into a <c>ColumnText</c> on the writer's direct content
    /// and draws it in the band between the document top and 20 units above it.
    /// </summary>
    public override void OnEndPage(PdfWriter writer, Document document)
    {
        base.OnEndPage(writer, document);

        var column = new ColumnText(writer.DirectContent);
        var handler = new ColumnTextElementHandler(column);
        XMLWorkerHelper.GetInstance().ParseXHtml(handler, new StringReader(_html));

        // GetTop(-20) lies 20 units above document.Top, i.e. inside the top margin.
        column.SetSimpleColumn(
            document.Left,
            document.Top,
            document.Right,
            document.GetTop(-20),
            10,
            Element.ALIGN_MIDDLE);
        column.Go();
    }
}
For more complex HTML snippets you may want to replace the XMLWorkerHelper.GetInstance().ParseXHtml call by a customized parser call as presented in @Skary's answer.
ColumnTextElementHandler is an IElementHandler implementation that adds content (generated e.g. by parsing HTML) to a ColumnText
/// <summary>
/// Element handler that forwards every element of a received
/// <c>WritableElement</c> to a <c>ColumnText</c>.
/// </summary>
public class ColumnTextElementHandler : IElementHandler
{
    private readonly ColumnText _column;

    /// <param name="ct">Column that receives the elements.</param>
    public ColumnTextElementHandler(ColumnText ct)
    {
        _column = ct;
    }

    /// <summary>
    /// Adds the elements of <paramref name="w"/> to the column; writables of
    /// any other kind are ignored.
    /// </summary>
    public void Add(IWritable w)
    {
        var writable = w as WritableElement;
        if (writable == null)
        {
            return;
        }

        foreach (IElement element in writable.Elements())
        {
            _column.AddElement(element);
        }
    }
}
By the way, the test above produces a PDF with this content:
...
...
Disclaimer: I predominantly work with Java and have not used the XmlWorker before. Thus, this code may have considerable potential for improvement.
I am not sure I have understood your question correctly.
If you are asking how to parse HTML to PDF using iTextSharp, here is the solution I found some time ago:
// Converts srcFileData (HTML) styled by srcCssData into a PDF written to stream,
// using an XML Worker pipeline with a custom <img> tag processor.
using (Document document = new Document(size))
{
    PdfWriter writer = PdfWriter.GetInstance(document, stream);
    document.Open();
    document.NewPage();
    // An empty chunk is added before parsing starts.
    document.Add(new Chunk(""));

    // Swap the stock <img> processor for the custom one.
    var tagFactory = (DefaultTagProcessorFactory)Tags.GetHtmlTagProcessorFactory();
    tagFactory.RemoveProcessor(HTML.Tag.IMG);
    tagFactory.AddProcessor(HTML.Tag.IMG, new CustomImageTagProcessor());

    // Default CSS first, then the caller-supplied style sheet.
    var styleSheets = new CssFilesImpl();
    styleSheets.Add(XMLWorkerHelper.GetInstance().GetDefaultCSS());
    var cssResolver = new StyleAttrCSSResolver(styleSheets);
    cssResolver.AddCss(srcCssData, "utf-8", true);

    var htmlContext = new HtmlPipelineContext(new CssAppliersImpl(new XMLWorkerFontProvider()));
    htmlContext.SetAcceptUnknown(true).AutoBookmark(true).SetTagFactory(tagFactory);

    // CSS resolver -> HTML -> PDF writer.
    var pipeline = new CssResolverPipeline(
        cssResolver,
        new HtmlPipeline(htmlContext, new PdfWriterPipeline(document, writer)));

    var parser = new XMLParser(true, new XMLWorker(pipeline, true), Encoding.UTF8);
    parser.Parse(new StringReader(srcFileData));
    document.Close();
}
To get it to work you need to add a custom image tag processor, so that inline images in the HTML you pass to the conversion code above are handled:
/// <summary>
/// Image tag processor that additionally understands inline <c>data:image/</c>
/// sources; every other source is delegated to the base implementation.
/// </summary>
public class CustomImageTagProcessor : iTextSharp.tool.xml.html.Image
{
    private const string DataImagePrefix = "data:image/";

    /// <summary>
    /// Produces the elements for an image tag.
    /// </summary>
    /// <returns>
    /// An empty list when the tag has no usable <c>src</c>; a single image chunk
    /// for a data URI; otherwise whatever the base processor returns.
    /// </returns>
    public override IList<IElement> End(IWorkerContext ctx, Tag tag, IList<IElement> currentContent)
    {
        string src;
        if (!tag.Attributes.TryGetValue(HTML.Attribute.SRC, out src) || string.IsNullOrEmpty(src))
        {
            return new List<IElement>(1);
        }

        // The prefix is a protocol token, so it is compared ordinally rather
        // than with culture-aware rules.
        if (!src.StartsWith(DataImagePrefix, StringComparison.OrdinalIgnoreCase))
        {
            return base.End(ctx, tag, currentContent);
        }

        // data:[<MIME-type>][;charset=<encoding>][;base64],<data>
        // The char overload of IndexOf is ordinal; the string overload used
        // before is culture-sensitive.
        var base64Data = src.Substring(src.IndexOf(',') + 1);
        var imageData = Convert.FromBase64String(base64Data);
        var image = iTextSharp.text.Image.GetInstance(imageData);

        var htmlPipelineContext = GetHtmlPipelineContext(ctx);
        var styledImage = (iTextSharp.text.Image)GetCssAppliers().Apply(image, tag, htmlPipelineContext);
        var imageChunk = new Chunk(styledImage, 0, 0, true);

        var list = new List<IElement>();
        list.Add(GetCssAppliers().Apply(imageChunk, tag, htmlPipelineContext));
        return list;
    }
}
I'm developing an application in which a Word document is converted to PDF. My problem is rather complicated, so please help me out.
My Word document has a TOC, bookmarks, endnotes and hyperlinks. When I save this document as PDF, only the bookmarks are converted. After long research I found that PDF documents do not support bookmark-to-bookmark hyperlinks; they need either page numbers or named destinations.
So I chose named destinations for this purpose, but I am stuck again, because a simple "Save As" cannot generate named destinations in the PDF document. So I printed the Word document to the Adobe PDF printer and got the named destinations as required, but that document has neither bookmarks nor hyperlinks in it. So I decided to generate two PDFs from the Word document: the first with the "Save As" option and the second by printing.
test.pdf (by save as) (contains bookmarks, hyperlinks)
test_p.pdf( by printing) (only contains named destination)
Then I researched once again and found a way to extract all named destinations from test_p.pdf into XML with an iTextSharp function, but unfortunately I did not find any way to import this XML back into test.pdf. That's why I came here.
Please guide me on what to do next if this approach is OK; otherwise suggest any other approach to accomplish this.
I wrote a class to replace urls in my PDF files some times ago:
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using iTextSharp.text.pdf;
namespace ReplaceLinks
{
    /// <summary>
    /// Rewrites URI link annotations of a PDF so that selected links jump to a
    /// named destination inside the document instead of opening the URI.
    /// </summary>
    public class ReplacePdfLinks
    {
        Dictionary<string, PdfObject> _namedDestinations;
        PdfReader _reader;

        /// <summary>Path of the PDF to read.</summary>
        public string InputPdf { set; get; }

        /// <summary>Path the modified PDF is written to.</summary>
        public string OutputPdf { set; get; }

        /// <summary>
        /// Maps the URI of a link to the name of the destination that should
        /// replace it. A null, empty or whitespace result leaves the link as is.
        /// </summary>
        public Func<Uri, string> UriToNamedDestination { set; get; }

        /// <summary>
        /// Reads <see cref="InputPdf"/>, rewrites the matching links and writes
        /// the result to <see cref="OutputPdf"/>.
        /// </summary>
        public void Start()
        {
            try
            {
                updatePdfLinks();
                saveChanges();
            }
            finally
            {
                // The reader used to stay open, also when an exception was thrown.
                if (_reader != null)
                {
                    _reader.Close();
                    _reader = null;
                }
            }
        }

        // Returns the /Annots array of the given page, or null when the page has none.
        private PdfArray getAnnotationsOfCurrentPage(int pageNumber)
        {
            var pageDictionary = _reader.GetPageN(pageNumber);
            return pageDictionary.GetAsArray(PdfName.ANNOTS);
        }

        // True when the annotation's subtype is /Link. Constant-first Equals so
        // that a missing /Subtype entry yields false instead of throwing.
        private static bool isLinkAnnotation(PdfDictionary annotationDictionary)
        {
            return PdfName.LINK.Equals(annotationDictionary.Get(PdfName.SUBTYPE));
        }

        // True when the action's type (/S) is /URI; a missing /S yields false.
        private static bool isUriAction(PdfDictionary annotationAction)
        {
            return PdfName.URI.Equals(annotationAction.Get(PdfName.S));
        }

        // Turns a URI action into a GoTo action pointing at the named destination
        // selected by UriToNamedDestination. The action is only modified after
        // every lookup has succeeded, so a failed lookup cannot leave it half-edited.
        private void replaceUriWithLocalDestination(PdfDictionary annotationAction)
        {
            var uri = annotationAction.Get(PdfName.URI) as PdfString;
            if (uri == null)
            {
                return;
            }

            var uriText = uri.ToString();
            if (string.IsNullOrWhiteSpace(uriText))
            {
                return;
            }

            // Links whose text is not a valid absolute URI are skipped; the
            // Uri constructor used before threw UriFormatException for them.
            Uri parsedUri;
            if (!Uri.TryCreate(uriText, UriKind.Absolute, out parsedUri))
            {
                return;
            }

            var namedDestination = UriToNamedDestination(parsedUri);
            if (string.IsNullOrWhiteSpace(namedDestination))
            {
                return;
            }

            PdfObject entry;
            if (!_namedDestinations.TryGetValue(namedDestination, out entry))
            {
                return;
            }

            var destinationArray = entry as PdfArray;
            if (destinationArray == null)
            {
                return;
            }

            var pageReference = destinationArray.FirstOrDefault(x => x is PdfIndirectReference);
            if (pageReference == null)
            {
                return;
            }

            annotationAction.Remove(PdfName.S);
            annotationAction.Remove(PdfName.URI);
            annotationAction.Put(PdfName.S, PdfName.GOTO);

            var newLocalDestination = new PdfArray();
            newLocalDestination.Add(pageReference);
            newLocalDestination.Add(PdfName.FITH);
            annotationAction.Put(PdfName.D, newLocalDestination);
        }

        // Writes the (already modified) reader content to OutputPdf.
        private void saveChanges()
        {
            using (var fileStream = new FileStream(OutputPdf, FileMode.Create, FileAccess.Write, FileShare.None))
            using (var stamper = new PdfStamper(_reader, fileStream))
            {
                // Nothing to stamp; disposing the stamper closes it, which
                // replaces the explicit Close() call that was here.
            }
        }

        // Opens InputPdf and rewrites every URI link action on every page.
        private void updatePdfLinks()
        {
            _reader = new PdfReader(InputPdf);
            _namedDestinations = _reader.GetNamedDestinationFromStrings();

            var pageCount = _reader.NumberOfPages;
            for (var i = 1; i <= pageCount; i++)
            {
                var annotations = getAnnotationsOfCurrentPage(i);
                if (annotations == null || !annotations.Any())
                {
                    continue;
                }

                foreach (var annotation in annotations.ArrayList)
                {
                    // Entries that do not resolve to a dictionary are skipped
                    // instead of failing on the cast.
                    var annotationDictionary = PdfReader.GetPdfObject(annotation) as PdfDictionary;
                    if (annotationDictionary == null || !isLinkAnnotation(annotationDictionary))
                    {
                        continue;
                    }

                    var annotationAction = annotationDictionary.Get(PdfName.A) as PdfDictionary;
                    if (annotationAction == null || !isUriAction(annotationAction))
                    {
                        continue;
                    }

                    replaceUriWithLocalDestination(annotationAction);
                }
            }
        }
    }
}
To use it:
// Redirects every link whose host contains "google.com" to the named
// destination "entry1"; all other links are left unchanged.
new ReplacePdfLinks
{
    // Fixed: the verbatim-string prefix was garbled to '#'.
    InputPdf = @"test.pdf",
    OutputPdf = "mod.pdf",
    UriToNamedDestination = uri =>
    {
        if (uri.Host.ToLowerInvariant().Contains("google.com"))
        {
            return "entry1";
        }

        // An empty name tells the replacer to keep the original link.
        return string.Empty;
    }
}.Start();
This sample will modify all of the urls containing google.com to point to a specific named destination "entry1".
And this is the sample file to test the above class:
/// <summary>
/// Writes the three-page sample file "test.pdf": a URL link, a local link to
/// "entry1", and the page carrying the "entry1" destination.
/// </summary>
void WriteFile()
{
    using (var doc = new Document(PageSize.LETTER))
    using (var fs = new FileStream("test.pdf", FileMode.Create))
    using (var writer = PdfWriter.GetInstance(doc, fs))
    {
        doc.Open();

        var blueFont = FontFactory.GetFont("Arial", 12, Font.NORMAL, BaseColor.BLUE);

        // Page 1: external link.
        var urlLink = new Chunk("Go to URL", blueFont)
            .SetAction(new PdfAction("http://www.google.com/", false));
        doc.Add(urlLink);
        doc.NewPage();

        // Page 2: link to the local destination.
        var localLink = new Chunk("Go to Test", blueFont).SetLocalGoto("entry1");
        doc.Add(localLink);
        doc.NewPage();

        // Page 3: the destination itself.
        var target = new Chunk("Test").SetLocalDestination("entry1");
        doc.Add(target);

        // Closed explicitly while the writer and file stream are still open.
        doc.Close();
    }
}