I m developing an application in which a word document is converted in pdf. My problem is too complicated please help me out.
My word doc has a toc, bookmarks, endnotes and hyperlinks. when I save this doc as pdf, only bookmarks are converted. After a long research I found that PDF documents does not support bookmark to bookmark hyperlinks, it needs either page number or named destinations.
So I choose named destinations for this purpose, but I am stuck again , because simple "save as" cannot generate named destinations in the pdf doc. So I print the word doc on adobe PDF printer and I got named destination as required, but again this document neither have bookmarks in it nor hyperlinks. so what I decided that I generate two pdf from a word, first by save as option and second one is by printing.
test.pdf (by save as) (contains bookmarks, hyperlinks)
test_p.pdf( by printing) (only contains named destination)
then I research ones again and found a way to extract all named destination from test_p.pdf into XML by a function of itextsharp.but unfortunately I dont get any way to import back this xml in test.pdf.. thats why I came here.
Guide me what to do next if this approach is ok. else suggest me any ohter approach to accomplish this mission.
I wrote a class to replace urls in my PDF files some times ago:
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using iTextSharp.text.pdf;
namespace ReplaceLinks
{
public class ReplacePdfLinks
{
Dictionary<string, PdfObject> _namedDestinations;
PdfReader _reader;
public string InputPdf { set; get; }
public string OutputPdf { set; get; }
public Func<Uri, string> UriToNamedDestination { set; get; }
public void Start()
{
updatePdfLinks();
saveChanges();
}
private PdfArray getAnnotationsOfCurrentPage(int pageNumber)
{
var pageDictionary = _reader.GetPageN(pageNumber);
var annotations = pageDictionary.GetAsArray(PdfName.ANNOTS);
return annotations;
}
private static bool hasAction(PdfDictionary annotationDictionary)
{
return annotationDictionary.Get(PdfName.SUBTYPE).Equals(PdfName.LINK);
}
private static bool isUriAction(PdfDictionary annotationAction)
{
return annotationAction.Get(PdfName.S).Equals(PdfName.URI);
}
private void replaceUriWithLocalDestination(PdfDictionary annotationAction)
{
var uri = annotationAction.Get(PdfName.URI) as PdfString;
if (uri == null)
return;
if (string.IsNullOrWhiteSpace(uri.ToString()))
return;
var namedDestination = UriToNamedDestination(new Uri(uri.ToString()));
if (string.IsNullOrWhiteSpace(namedDestination))
return;
PdfObject entry;
if (!_namedDestinations.TryGetValue(namedDestination, out entry))
return;
annotationAction.Remove(PdfName.S);
annotationAction.Remove(PdfName.URI);
var newLocalDestination = new PdfArray();
annotationAction.Put(PdfName.S, PdfName.GOTO);
var xRef = ((PdfArray)entry).First(x => x is PdfIndirectReference);
newLocalDestination.Add(xRef);
newLocalDestination.Add(PdfName.FITH);
annotationAction.Put(PdfName.D, newLocalDestination);
}
private void saveChanges()
{
using (var fileStream = new FileStream(OutputPdf, FileMode.Create, FileAccess.Write, FileShare.None))
using (var stamper = new PdfStamper(_reader, fileStream))
{
stamper.Close();
}
}
private void updatePdfLinks()
{
_reader = new PdfReader(InputPdf);
_namedDestinations = _reader.GetNamedDestinationFromStrings();
var pageCount = _reader.NumberOfPages;
for (var i = 1; i <= pageCount; i++)
{
var annotations = getAnnotationsOfCurrentPage(i);
if (annotations == null || !annotations.Any())
continue;
foreach (var annotation in annotations.ArrayList)
{
var annotationDictionary = (PdfDictionary)PdfReader.GetPdfObject(annotation);
if (!hasAction(annotationDictionary))
continue;
var annotationAction = annotationDictionary.Get(PdfName.A) as PdfDictionary;
if (annotationAction == null)
continue;
if (!isUriAction(annotationAction))
continue;
replaceUriWithLocalDestination(annotationAction);
}
}
}
}
}
To use it:
new ReplacePdfLinks
{
InputPdf = #"test.pdf",
OutputPdf = "mod.pdf",
UriToNamedDestination = uri =>
{
if (uri.Host.ToLowerInvariant().Contains("google.com"))
{
return "entry1";
}
return string.Empty;
}
}.Start();
This sample will modify all of the urls containing google.com to point to a specific named destination "entry1".
And this is the sample file to test the above class:
void WriteFile()
{
using (var doc = new Document(PageSize.LETTER))
{
using (var fs = new FileStream("test.pdf", FileMode.Create))
{
using (var writer = PdfWriter.GetInstance(doc, fs))
{
doc.Open();
var blueFont = FontFactory.GetFont("Arial", 12, Font.NORMAL, BaseColor.BLUE);
doc.Add(new Chunk("Go to URL", blueFont).SetAction(new PdfAction("http://www.google.com/", false)));
doc.NewPage();
doc.Add(new Chunk("Go to Test", blueFont).SetLocalGoto("entry1"));
doc.NewPage();
doc.Add(new Chunk("Test").SetLocalDestination("entry1"));
doc.Close();
}
}
}
}
Related
I am able to copy a vbaproject part from xlsm to another xlsm through memory stream and add it through addnew part. Is there any possibility to import/add only the bas file into the xlsm file using open xml.
Can some one assist.Below code helped to take a clone of vba part and update the same.
private static void cloneVbaPart(string src, string dst)
{
using (WordprocessingDocument srcDoc = WordprocessingDocument.Open(src, false))
{
var vbaPart = srcDoc.MainDocumentPart.VbaProjectPart;
using (WordprocessingDocument dstDoc = WordprocessingDocument.Open(dst, true))
{
var partsToRemove = new List<OpenXmlPart>();
foreach (var part in dstDoc.MainDocumentPart.GetPartsOfType<VbaProjectPart>())
{
partsToRemove.Add(part);
}
foreach (var part in dstDoc.MainDocumentPart.GetPartsOfType<CustomizationPart>())
{
partsToRemove.Add(part);
}
foreach (var part in partsToRemove)
{ dstDoc.MainDocumentPart.DeletePart(part); }
var vbaProjectPart = dstDoc.MainDocumentPart.AddNewPart<VbaProjectPart>();
var vbaDataPart = vbaProjectPart.AddNewPart<VbaDataPart>();
using (Stream data = vbaPart.GetStream())
{
vbaProjectPart.FeedData(data);
}
using (Stream data = vbaPart.VbaDataPart.GetStream())
{
vbaDataPart.FeedData(data);
}
}
}
}
I want edit to visible rectangle of links in file pdf:
If i use acrobat ,i can edit link type to "Visible Rectangle"
But with iText7 , How can change this value?
I try setting, but it not working:
string strPage = #"C:\test1.pdf";
string strPageNew = #"C:\result.pdf";
PdfReader reader = new PdfReader(strPage);
using (PdfWriter writer = new PdfWriter(strPageNew))
{
using (PdfDocument pdfDoc = new PdfDocument(reader, writer))
{
PdfPage pdfPage = pdfDoc.GetPage(1);
var annotations = pdfPage.GetAnnotations();
if (annotations != null)
{
foreach (PdfAnnotation a in annotations)
{
if (a.GetSubtype().Equals(PdfName.Link))
{
PdfLinkAnnotation link = (PdfLinkAnnotation)a;
var action = link.GetAction();
if (action != null)
{
if (action.Get(PdfName.S).Equals(PdfName.URI))
{
PdfString destination = action.GetAsString(PdfName.URI);
link.SetFlag(4);
link.SetHighlightMode(PdfAnnotation.HIGHLIGHT_OUTLINE);
link.SetBorderStyle(PdfAnnotation.STYLE_SOLID);
link.SetColor(iText.Kernel.Colors.ColorConstants.RED );
}
}
}
}
}
}
}
reader.Close();
I need to create single pdf using few html pages. Actually HTMLs have tables. Each HTMLs(table) has different number of columns, hence it should have to export pdf with difference oriontations.
Eg :
htmlPg1 --> 4 columns
htmlPg2 --> 15 columns
According to above scenario, it is needed to comes up the first html page with portrait mode and second html with landscape.
in below 'Code block 02' lst is a list which having 2 attributes. (Please see 'Code Block 01')
If the lst Oriantation is assigned 0, it is considered as Landscape and otherwise portrait.
Code Block 01
public class PdfExportDoc
{
public int Oriantation { get; set; }
public string Html { get; set; }
}
All are working correctly except the orientation.
Code Block 02
using (var ms = new MemoryStream())
{
using (var doc = new Document())
{
using (var writer = PdfWriter.GetInstance(doc, ms))
{
doc.Open();
foreach (var ele in lst)
{
using (var srHtml = new StringReader(ele.Html))
{
if (ele.Oriantation == 0)
{
doc.SetPageSize(PageSize.A4.Rotate());
}
else
{
doc.SetPageSize(PageSize.A4);
}
XMLWorkerHelper.GetInstance().ParseXHtml(writer, doc, srHtml);
doc.NewPage();
}
}
doc.Close();
}
}
bytes = ms.ToArray();
}
htmlPg1 data dragged for 2 pages and htmlPg2 has only one. This is how the data comes as pdf.
But actually I need the pdf like this.
Please show me a direction for doing this.
This solved my problem.
I get your point Bruno. You have said in your deleted answer NewPage will not added a new page if it is blank. So I added doc.NewPage to both before and after creating paraseXHtml. How ever thanks for your previous direction.
foreach (var ele in lst)
{
using (var srHtml = new StringReader(ele.Html))
{
if (ele.Oriantation == 1)
{
doc.SetPageSize(PageSize.A4.Rotate());
}
else
{
doc.SetPageSize(PageSize.A4);
}
doc.NewPage();
XMLWorkerHelper.GetInstance().ParseXHtml(writer, doc, srHtml);
doc.NewPage();
}
}
I am trying merge several Word Document using OpenXML on ASP.NET MVC 5. But I am constantly getting a message from Microsoft Word that the document is corrupt.
private Stream GenerateDocument(DocumentType documentType)
{
using (var templateStream = File.OpenRead(GetTemplatePath(documentType)))
{
//some code
var result = documentGenerator.Generate();
return result;
}
}
private Stream MergeDocuments(DocumentLibraryModel documentLibrary)
{
var documentTypes = documentLibrary.DocumentTypes.GetEnumerator();
var mainStream = GenerateDocument(documentTypes.Current);
using (WordprocessingDocument mainDocument = WordprocessingDocument.Open(mainStream, true))
{
XElement newBody = XElement.Parse(mainDocument.MainDocumentPart.Document.Body.OuterXml);
documentTypes.MoveNext();
while (documentTypes.MoveNext())
{
WordprocessingDocument tempDocument = WordprocessingDocument.Open(GenerateDocument(documentTypes.Current), true);
XElement tempBody = XElement.Parse(tempDocument.MainDocumentPart.Document.Body.OuterXml);
newBody.Add(tempBody);
mainDocument.MainDocumentPart.Document.Body = new Body(newBody.ToString());
mainDocument.MainDocumentPart.Document.Save();
mainDocument.Package.Flush();
}
}
return mainStream;
}
However the document opens as corrupted.
Any ideas?
Problem lies in this:
XElement tempBody = XElement.Parse(tempDocument.MainDocumentPart.Document.Body.OuterXml);
newBody.Add(tempBody);
You are adding body to body which generates invalid Word document. Word document can contain only one Body at the time.
I would recommend cloning elements instead of parsing XML.
You can do this:
using (WordprocessingDocument mainDocument = WordprocessingDocument.Open(mainStream, true))
{
mainDocument.MainDocumentPart.Document.Body = new Body();
documentTypes.MoveNext();
while (documentTypes.MoveNext())
{
using (WordprocessingDocument tempDocument = WordprocessingDocument.Open(GenerateDocument(documentTypes.Current)))
{
foreach (var element in tempDocument.MainDocumentPart.Document.Body.Elements)
{
mainDocument.MainDocumentPart.Document.Body.AppendChild(element.CloneNode(true));
}
}
}
mainDocument.MainDocumentPart.Document.Save();
}
I'm trying to transfer data from a treenode (at least I think that's what it is) which contains much more data than I need. It would be very difficult for me to manipulate the data within the treenode. I would much rather have an array which provides me with only the necessary data for data manipulation.
I would like higher rates have following variables:
1. BookmarkNumber (integer)
2. Date (string)
3. DocumentType (string)
4. BookmarkPageNumberString (string)
5. BookmarkPageNumberInteger (integer)
I would like to the above defined rate from the data from variable book_mark (as can be seen in my code).
I've been wrestling with this for two days. Any help would be much appreciated. I'm probably sure that the question wasn't phrased correctly so please ask questions so that I may explain further if needed.
Thanks so much
BTW what I'm trying to do is create a Windows Form program which parses a PDF file which has multiple bookmarks into discrete PDF files for each bookmark/chapter while saving the bookmark in the correct folder with the correct naming convention, the folder and naming convention dependent upon the PDF name and title name of the bookmark/chapter being parsed.
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
using System.IO;
using itextsharp.pdfa;
using iTextSharp.awt;
using iTextSharp.testutils;
using iTextSharp.text;
using iTextSharp.xmp;
using iTextSharp.xtra;
namespace WindowsFormsApplication1
{
public partial class Form1 : Form
{
public Form1()
{
InitializeComponent();
}
private void ChooseImageFileWrapper_Click(object sender, EventArgs e)
{
OpenFileDialog openFileDialog1 = new OpenFileDialog();
openFileDialog1.InitialDirectory = GlobalVariables.InitialDirectory;
openFileDialog1.Filter = "Pdf Files|*.pdf";
openFileDialog1.RestoreDirectory = true;
openFileDialog1.Title = "Image File Wrapper Chooser";
if (openFileDialog1.ShowDialog() == DialogResult.OK)
{
try
{
GlobalVariables.ImageFileWrapperPath = openFileDialog1.FileName;
}
catch (Exception ex)
{
MessageBox.Show("Error: Could not read file from disk. Original error: " + ex.Message);
}
}
ImageFileWrapperPath.Text = GlobalVariables.ImageFileWrapperPath;
}
private void ImageFileWrapperPath_TextChanged(object sender, EventArgs e)
{
}
private void button2_Click(object sender, EventArgs e)
{
iTextSharp.text.pdf.PdfReader pdfReader = new iTextSharp.text.pdf.PdfReader(GlobalVariables.ImageFileWrapperPath);
IList<Dictionary<string, object>> book_mark = iTextSharp.text.pdf.SimpleBookmark.GetBookmark(pdfReader);
List<ImageFileWrapperBookmarks> IFWBookmarks = new List<ImageFileWrapperBookmarks>();
foreach (Dictionary<string, object> bk in book_mark) // bk is a single instance of book_mark
{
ImageFileWrapperBookmarks.BookmarkNumber = ImageFileWrapperBookmarks.BookmarkNumber + 1;
foreach (KeyValuePair<string, object> kvr in bk) // kvr is the key/value in bk
{
if (kvr.Key == "Kids" || kvr.Key == "kids")
{
//create recursive program for children
}
else if (kvr.Key == "Title" || kvr.Key == "title")
{
}
else if (kvr.Key == "Page" || kvr.Key == "page")
{
}
}
}
MessageBox.Show(GlobalVariables.ImageFileWrapperPath);
}
}
}
Here's one way to parse a PDF and create a data structure similar to what you describe. First the data structure:
public class BookMark
{
static int _number;
public BookMark() { Number = ++_number; }
public int Number { get; private set; }
public string Title { get; set; }
public string PageNumberString { get; set; }
public int PageNumberInteger { get; set; }
public static void ResetNumber() { _number = 0; }
// bookmarks title may have illegal filename character(s)
public string GetFileName()
{
var fileTitle = Regex.Replace(
Regex.Replace(Title, #"\s+", "-"),
#"[^-\w]", ""
);
return string.Format("{0:D4}-{1}.pdf", Number, fileTitle);
}
}
A method to create a list of Bookmark (above):
List<BookMark> ParseBookMarks(IList<Dictionary<string, object>> bookmarks)
{
int page;
var result = new List<BookMark>();
foreach (var bookmark in bookmarks)
{
// add top-level bookmarks
var stringPage = bookmark["Page"].ToString();
if (Int32.TryParse(stringPage.Split()[0], out page))
{
result.Add(new BookMark() {
Title = bookmark["Title"].ToString(),
PageNumberString = stringPage,
PageNumberInteger = page
});
}
// recurse
if (bookmark.ContainsKey("Kids"))
{
var kids = bookmark["Kids"] as IList<Dictionary<string, object>>;
if (kids != null && kids.Count > 0)
{
result.AddRange(ParseBookMarks(kids));
}
}
}
return result;
}
Call method above like this to dump the results to a text file:
void DumpResults(string path)
{
using (var reader = new PdfReader(path))
{
// need this call to parse page numbers
reader.ConsolidateNamedDestinations();
var bookmarks = ParseBookMarks(SimpleBookmark.GetBookmark(reader));
var sb = new StringBuilder();
foreach (var bookmark in bookmarks)
{
sb.AppendLine(string.Format(
"{0, -4}{1, -100}{2, -25}{3}",
bookmark.Number, bookmark.Title,
bookmark.PageNumberString, bookmark.PageNumberInteger
));
}
File.WriteAllText(outputTextFile, sb.ToString());
}
}
The bigger problem is how to extract each Bookmark into a separate file. If every Bookmark starts a new page it's easy:
Iterate over the return value of ParseBookMarks()
Select a page range that begins with the current BookMark.Number, and ends with the next BookMark.Number - 1
Use that page range to create separate files.
Something like this:
void ProcessPdf(string path)
{
using (var reader = new PdfReader(path))
{
// need this call to parse page numbers
reader.ConsolidateNamedDestinations();
var bookmarks = ParseBookMarks(SimpleBookmark.GetBookmark(reader));
for (int i = 0; i < bookmarks.Count; ++i)
{
int page = bookmarks[i].PageNumberInteger;
int nextPage = i + 1 < bookmarks.Count
// if not top of page will be missing content
? bookmarks[i + 1].PageNumberInteger - 1
/* alternative is to potentially add redundant content:
? bookmarks[i + 1].PageNumberInteger
*/
: reader.NumberOfPages;
string range = string.Format("{0}-{1}", page, nextPage);
// DEMO!
if (i < 10)
{
var outputPath = Path.Combine(OUTPUT_DIR, bookmarks[i].GetFileName());
using (var readerCopy = new PdfReader(reader))
{
var number = bookmarks[i].Number;
readerCopy.SelectPages(range);
using (FileStream stream = new FileStream(outputPath, FileMode.Create))
{
using (var document = new Document())
{
using (var copy = new PdfCopy(document, stream))
{
document.Open();
int n = readerCopy.NumberOfPages;
for (int j = 0; j < n; )
{
copy.AddPage(copy.GetImportedPage(readerCopy, ++j));
}
}
}
}
}
}
}
}
}
The problem is that it's highly unlikely all bookmarks are going to be at the top of every page of the PDF. To see what I mean, experiment with commenting / uncommenting the bookmarks[i + 1].PageNumberInteger lines.