how to copy highlighted text from pdf file

how to copy highlighted text from pdf file - c#

I am using the iTextSharp library to develop a C# application that merges all annotation comments from two different PDF files into another PDF file. Please help; thanks in advance.
I have tried the code below. With it I am able to find the highlighted text, but not with proper formatting.
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace PdfFileApp
{
    /// <summary>
    /// Console helper that prints the text located under the highlight annotations of a PDF.
    /// </summary>
    public class pdftotext
    {
        /// <summary>
        /// Opens the hard-coded demo PDF, walks the annotations of every page and, for each
        /// highlight annotation, writes the text found inside the annotation rectangle to the
        /// console, then waits for Enter before continuing.
        /// </summary>
        public static void ReadAnnotation()
        {
            try
            {
                using (PdfReader reader = new PdfReader("D:\\DEMO_Supp_First Proof.pdf"))
                {
                    for (int pageNumber = 1; pageNumber <= reader.NumberOfPages; pageNumber++)
                    {
                        PdfDictionary page = reader.GetPageN(pageNumber);
                        PdfArray annots = page.GetAsArray(PdfName.ANNOTS);
                        if (annots == null)
                        {
                            continue;
                        }

                        foreach (PdfObject annot in annots.ArrayList)
                        {
                            // Resolve indirect references. The previous code additionally looked the
                            // annotation up with annots.GetAsDict(pageNumber), i.e. it used the page
                            // number as an index into the annotation array, which is unrelated.
                            PdfDictionary annotationDic = PdfReader.GetPdfObject(annot) as PdfDictionary;
                            if (annotationDic == null)
                            {
                                continue;
                            }

                            // PdfName.Equals returns false for null, so a missing /Subtype is skipped.
                            if (!PdfName.HIGHLIGHT.Equals(annotationDic.Get(PdfName.SUBTYPE)))
                            {
                                continue;
                            }

                            PdfArray coordinates = annotationDic.GetAsArray(PdfName.RECT);
                            if (coordinates == null || coordinates.Size < 4)
                            {
                                continue;
                            }

                            // NOTE(review): /Rect is the bounding box of the whole annotation; for a
                            // highlight spanning several lines it presumably covers unrelated text as
                            // well - consider /QuadPoints if the output is too broad.
                            iTextSharp.text.Rectangle rect = new iTextSharp.text.Rectangle(
                                coordinates.GetAsNumber(0).FloatValue,
                                coordinates.GetAsNumber(1).FloatValue,
                                coordinates.GetAsNumber(2).FloatValue,
                                coordinates.GetAsNumber(3).FloatValue);

                            RenderFilter[] filter = { new RegionTextRenderFilter(rect) };
                            ITextExtractionStrategy strategy =
                                new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filter);

                            StringBuilder sb = new StringBuilder();
                            sb.AppendLine(PdfTextExtractor.GetTextFromPage(reader, pageNumber, strategy));
                            Console.WriteLine(sb.ToString());
                            Console.ReadLine();
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                // Report the failure instead of silently discarding it.
                Console.Error.WriteLine("Could not read annotations: " + ex.Message);
            }
        }
    }
}

Install the Aspose.PDF library from the NuGet package manager and use the code below.
// Collects the marked text of the highlight annotations found on pages 1 to 2 of the document.
string highlightedText = "";
var document = new Aspose.Pdf.Document(@"Path");
Aspose.Pdf.Facades.PdfAnnotationEditor annotationEditor = new Aspose.Pdf.Facades.PdfAnnotationEditor();
annotationEditor.BindPdf(document);
// Extract annotations
var annotationTypes = new[] { Aspose.Pdf.Annotations.AnnotationType.FreeText, Aspose.Pdf.Annotations.AnnotationType.Highlight };
var annotations = annotationEditor.ExtractAnnotations(1, 2, annotationTypes);
foreach (var annotation in annotations)
{
    // FreeText annotations are requested as well, so not every result is a highlight;
    // an unconditional cast would throw InvalidCastException on those.
    var highlight = annotation as Aspose.Pdf.Annotations.HighlightAnnotation;
    if (highlight == null)
    {
        continue;
    }
    highlightedText += highlight.GetMarkedText();
}

Related

link extraction using HtmlAgilityPack and c#

I want to extract the links of Google search results.
My code works and does extract links, but they are not the links I expected.
My program extracts every link found in an "a href" tag, but not all links on the results page are relevant: ad links and Google's own links are included as well.
What should I do?
using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Net;
using System.ServiceModel.Syndication;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
using System.Xml;
namespace Search
{
    /// <summary>
    /// Form that runs a Google search for the entered keywords and lists the links found
    /// on the result page.
    /// </summary>
    public partial class Form1 : Form
    {
        // load snippet
        HtmlAgilityPack.HtmlDocument htmlSnippet = new HtmlAgilityPack.HtmlDocument();

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Downloads the search result page for the text in txtKeyWords and adds the target of
        /// every anchor that carries a query string to listBox1.
        /// </summary>
        private void btn1_Click(object sender, EventArgs e)
        {
            listBox1.Items.Clear();

            // Escape the keywords so characters such as '&', '#' or '+' do not corrupt the query.
            string SearchResults = "http://google.com/search?q=" + Uri.EscapeDataString(txtKeyWords.Text.Trim());
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(SearchResults);

            string pageSource;
            // Dispose the response and its stream; decoding through a reader avoids the loss of
            // every non-ASCII character caused by chunk-wise ASCII decoding.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream resStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(resStream, Encoding.UTF8))
            {
                pageSource = reader.ReadToEnd();
            }

            HtmlAgilityPack.HtmlDocument html = new HtmlAgilityPack.HtmlDocument();
            html.OptionOutputAsXml = true;
            html.LoadHtml(pageSource);
            HtmlNode doc = html.DocumentNode;

            // XPath attribute tests are written with '@'. SelectNodes returns null, not an
            // empty collection, when nothing matches.
            HtmlNodeCollection links = doc.SelectNodes("//a[@href]");
            if (links == null)
            {
                return;
            }

            foreach (HtmlNode link in links)
            {
                //HtmlAttribute att = link.Attributes["href"];
                string hrefValue = link.GetAttributeValue("href", string.Empty);
                // if ()
                {
                    // Keep only the part before the first '&' (drops the tracking parameters).
                    int index = hrefValue.IndexOf('&');
                    if (index > 0)
                    {
                        hrefValue = hrefValue.Substring(0, index);
                        listBox1.Items.Add(hrefValue.Replace("/url?q=", ""));
                    }
                }
            }
        }
    }
}
If I want to keep working with the "a href" tags, I have to add some condition to the if statement,
but I don't know what condition I should use here:
if ()
I have also read somewhere about extracting the cite tags instead of the a href tags. Can anybody help?

To get the links that are contained in the cite elements, simply access their inner text, like:
// Prints the text of every <cite> element of the result page.
HtmlWeb w = new HtmlWeb();
var hd = w.Load("http://www.google.com/search?q=veverke");
var cites = hd.DocumentNode.SelectNodes("//cite");
// SelectNodes returns null (not an empty collection) when no element matches.
if (cites != null)
{
    foreach (var cite in cites)
    {
        Console.WriteLine(cite.InnerText);
    }
}

get title tag by html agility pack

I'm trying to use HtmlAgilityPack to get the links and titles of search results.
I have this code:
using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Net;
using System.ServiceModel.Syndication;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
using System.Xml;
namespace Search
{
    /// <summary>
    /// Form that runs a Google search for the entered keywords and lists the external
    /// result links found on the result page.
    /// </summary>
    public partial class Form1 : Form
    {
        // load snippet
        HtmlAgilityPack.HtmlDocument htmlSnippet = new HtmlAgilityPack.HtmlDocument();

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Downloads the search result page for the text in txtKeyWords and adds every
        /// redirect link ("/url?q=...") that does not point to Google itself to listBox1.
        /// </summary>
        private void btn1_Click(object sender, EventArgs e)
        {
            listBox1.Items.Clear();

            // Escape the keywords so characters such as '&', '#' or '+' do not corrupt the query.
            string SearchResults = "http://google.com/search?q=" + Uri.EscapeDataString(txtKeyWords.Text.Trim());
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(SearchResults);

            string pageSource;
            // Dispose the response and its stream; decoding through a reader avoids the loss of
            // every non-ASCII character caused by chunk-wise ASCII decoding.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream resStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(resStream, Encoding.UTF8))
            {
                pageSource = reader.ReadToEnd();
            }

            HtmlAgilityPack.HtmlDocument html = new HtmlAgilityPack.HtmlDocument();
            html.OptionOutputAsXml = true;
            html.LoadHtml(pageSource);
            HtmlNode doc = html.DocumentNode;

            // XPath attribute tests are written with '@'. SelectNodes returns null, not an
            // empty collection, when nothing matches.
            HtmlNodeCollection links = doc.SelectNodes("//a[@href]");
            if (links == null)
            {
                return;
            }

            foreach (HtmlNode link in links)
            {
                //HtmlAttribute att = link.Attributes["href"];
                string hrefValue = link.GetAttributeValue("href", string.Empty);

                // Ordinal, case-insensitive checks replace ToUpper().Contains(...), which depends
                // on the current culture. NOTE(review): the "HTTP://" test also rejects https
                // targets - confirm whether that is intended.
                bool mentionsGoogle = hrefValue.IndexOf("GOOGLE", StringComparison.OrdinalIgnoreCase) >= 0;
                bool isRedirect = hrefValue.IndexOf("/url?q=", StringComparison.Ordinal) >= 0;
                bool hasHttpTarget = hrefValue.IndexOf("HTTP://", StringComparison.OrdinalIgnoreCase) >= 0;

                if (!mentionsGoogle && isRedirect && hasHttpTarget)
                {
                    // Keep only the part before the first '&' (drops the tracking parameters).
                    int index = hrefValue.IndexOf('&');
                    if (index > 0)
                    {
                        hrefValue = hrefValue.Substring(0, index);
                        listBox1.Items.Add(hrefValue.Replace("/url?q=", ""));
                    }
                }
            }
        }
    }
}
This code returns the result links for a query. I also want to get the title tag for each link. How can I get the title of each link?
Can anybody help?

If, by 'title', you mean the displayed text of the link, then you can get it from InnerText property of each HtmlNode link :
// '@' selects an attribute in XPath: every anchor that has an href.
foreach (HtmlNode link in doc.SelectNodes("//a[@href]"))
{
    // ..... (existing handling of the href value)
    // The displayed text of the link, without surrounding whitespace.
    var title = link.InnerText.Trim();
}

This will give output: links and titles of the links. This will not return the anchor text of links. You will get only the title of each link.
// Loads every linked page and reads its <title> element.
HtmlNodeCollection anchors = doc.SelectNodes("//a[@href]");
if (anchors != null) // SelectNodes returns null when nothing matches
{
    // One HtmlWeb instance is enough for all downloads.
    HtmlWeb htmlWeb = new HtmlWeb();
    foreach (HtmlNode link in anchors)
    {
        // HtmlWeb.Load expects the URL, not the anchor node itself.
        string url = link.GetAttributeValue("href", string.Empty);
        Uri target;
        if (!Uri.TryCreate(url, UriKind.Absolute, out target))
        {
            continue; // relative or malformed href: nothing to download
        }

        HtmlAgilityPack.HtmlDocument htmlDocument = htmlWeb.Load(url);
        // A page without a <title> yields null here.
        HtmlNode titleNode = htmlDocument.DocumentNode.SelectSingleNode("html/head/title");
        var title = titleNode != null ? titleNode.InnerText : string.Empty;
    }
}

How to create an array and fill from tree node variable

I'm trying to transfer data from a treenode (at least I think that's what it is) which contains much more data than I need. It would be very difficult for me to manipulate the data within the treenode. I would much rather have an array which provides me with only the necessary data for data manipulation.
I would like the array to have the following variables:
1. BookmarkNumber (integer)
2. Date (string)
3. DocumentType (string)
4. BookmarkPageNumberString (string)
5. BookmarkPageNumberInteger (integer)
I would like to fill the array defined above with the data from the variable book_mark (as can be seen in my code).
I've been wrestling with this for two days. Any help would be much appreciated. I'm probably sure that the question wasn't phrased correctly so please ask questions so that I may explain further if needed.
Thanks so much
BTW what I'm trying to do is create a Windows Form program which parses a PDF file which has multiple bookmarks into discrete PDF files for each bookmark/chapter while saving the bookmark in the correct folder with the correct naming convention, the folder and naming convention dependent upon the PDF name and title name of the bookmark/chapter being parsed.
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
using System.IO;
using itextsharp.pdfa;
using iTextSharp.awt;
using iTextSharp.testutils;
using iTextSharp.text;
using iTextSharp.xmp;
using iTextSharp.xtra;
namespace WindowsFormsApplication1
{
    /// <summary>
    /// Form that lets the user pick a PDF and walks the bookmarks (outline) of that file.
    /// </summary>
    public partial class Form1 : Form
    {
        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Shows a file picker restricted to PDF files and stores the chosen path in
        /// GlobalVariables.ImageFileWrapperPath and in the ImageFileWrapperPath text box.
        /// </summary>
        private void ChooseImageFileWrapper_Click(object sender, EventArgs e)
        {
            // The dialog is a disposable component; release it once the choice has been read.
            using (OpenFileDialog openFileDialog1 = new OpenFileDialog())
            {
                openFileDialog1.InitialDirectory = GlobalVariables.InitialDirectory;
                openFileDialog1.Filter = "Pdf Files|*.pdf";
                openFileDialog1.RestoreDirectory = true;
                openFileDialog1.Title = "Image File Wrapper Chooser";
                if (openFileDialog1.ShowDialog() == DialogResult.OK)
                {
                    try
                    {
                        GlobalVariables.ImageFileWrapperPath = openFileDialog1.FileName;
                    }
                    catch (Exception ex)
                    {
                        MessageBox.Show("Error: Could not read file from disk. Original error: " + ex.Message);
                    }
                }
            }
            ImageFileWrapperPath.Text = GlobalVariables.ImageFileWrapperPath;
        }

        private void ImageFileWrapperPath_TextChanged(object sender, EventArgs e)
        {
        }

        /// <summary>
        /// Reads the bookmarks of the chosen PDF and visits the "Kids", "Title" and "Page"
        /// entries of every top-level bookmark; the handling of each entry is still to be written.
        /// </summary>
        private void button2_Click(object sender, EventArgs e)
        {
            IList<Dictionary<string, object>> book_mark;
            // Dispose the reader so the PDF file is not kept open after the bookmarks are read.
            using (iTextSharp.text.pdf.PdfReader pdfReader = new iTextSharp.text.pdf.PdfReader(GlobalVariables.ImageFileWrapperPath))
            {
                book_mark = iTextSharp.text.pdf.SimpleBookmark.GetBookmark(pdfReader);
            }

            List<ImageFileWrapperBookmarks> IFWBookmarks = new List<ImageFileWrapperBookmarks>();

            // GetBookmark returns null for a document without bookmarks.
            if (book_mark != null)
            {
                foreach (Dictionary<string, object> bk in book_mark) // bk is a single instance of book_mark
                {
                    ImageFileWrapperBookmarks.BookmarkNumber = ImageFileWrapperBookmarks.BookmarkNumber + 1;
                    foreach (KeyValuePair<string, object> kvr in bk) // kvr is the key/value in bk
                    {
                        if (kvr.Key == "Kids" || kvr.Key == "kids")
                        {
                            //create recursive program for children
                        }
                        else if (kvr.Key == "Title" || kvr.Key == "title")
                        {
                        }
                        else if (kvr.Key == "Page" || kvr.Key == "page")
                        {
                        }
                    }
                }
            }
            MessageBox.Show(GlobalVariables.ImageFileWrapperPath);
        }
    }
}

Here's one way to parse a PDF and create a data structure similar to what you describe. First the data structure:
/// <summary>
/// One bookmark (outline entry) of a PDF together with a running number that is assigned
/// in creation order.
/// </summary>
public class BookMark
{
    // Shared counter; every new instance takes the next value.
    static int _number;

    /// <summary>Creates a bookmark and assigns it the next running number.</summary>
    public BookMark() { Number = ++_number; }

    /// <summary>Running number of this bookmark (1 for the first instance after a reset).</summary>
    public int Number { get; private set; }

    /// <summary>Title of the bookmark.</summary>
    public string Title { get; set; }

    /// <summary>Page information of the bookmark as text.</summary>
    public string PageNumberString { get; set; }

    /// <summary>Page number of the bookmark.</summary>
    public int PageNumberInteger { get; set; }

    /// <summary>Restarts the numbering so that the next instance gets number 1.</summary>
    public static void ResetNumber() { _number = 0; }

    /// <summary>
    /// Builds a file name of the form "NNNN-title.pdf" from the number and the title.
    /// </summary>
    /// <returns>The file name; the title part is empty when no title was set.</returns>
    // bookmarks title may have illegal filename character(s)
    public string GetFileName()
    {
        // Runs of whitespace become a single '-', then everything that is neither '-' nor a
        // word character is dropped. A missing title is treated as empty instead of throwing.
        var fileTitle = Regex.Replace(
            Regex.Replace(Title ?? string.Empty, @"\s+", "-"),
            @"[^-\w]", ""
        );
        return string.Format("{0:D4}-{1}.pdf", Number, fileTitle);
    }
}
A method to create a list of Bookmark (above):
/// <summary>
/// Flattens a bookmark tree into a list: every bookmark whose "Page" entry starts with a
/// number is added, followed by the bookmarks found recursively under its "Kids" entry.
/// </summary>
/// <param name="bookmarks">Bookmark dictionaries; null is treated as an empty list.</param>
/// <returns>The bookmarks in document (depth-first) order; never null.</returns>
List<BookMark> ParseBookMarks(IList<Dictionary<string, object>> bookmarks)
{
    var result = new List<BookMark>();
    if (bookmarks == null)
    {
        return result;
    }

    foreach (var bookmark in bookmarks)
    {
        // add top-level bookmarks
        // Not every bookmark has a "Page" entry, so look it up without throwing.
        object pageValue;
        if (bookmark.TryGetValue("Page", out pageValue) && pageValue != null)
        {
            var stringPage = pageValue.ToString();
            int page;
            // The first whitespace-separated token of the page string is the page number.
            if (Int32.TryParse(stringPage.Split()[0], out page))
            {
                object titleValue;
                bookmark.TryGetValue("Title", out titleValue);
                result.Add(new BookMark() {
                    Title = titleValue != null ? titleValue.ToString() : string.Empty,
                    PageNumberString = stringPage,
                    PageNumberInteger = page
                });
            }
        }

        // recurse
        object kidsValue;
        if (bookmark.TryGetValue("Kids", out kidsValue))
        {
            var kids = kidsValue as IList<Dictionary<string, object>>;
            if (kids != null && kids.Count > 0)
            {
                result.AddRange(ParseBookMarks(kids));
            }
        }
    }
    return result;
}
Call method above like this to dump the results to a text file:
/// <summary>
/// Parses the bookmarks of the PDF at <paramref name="path"/> and writes one formatted line
/// per bookmark (number, title, page string, page number) to outputTextFile.
/// </summary>
/// <param name="path">Path of the PDF to read.</param>
void DumpResults(string path)
{
    using (var reader = new PdfReader(path))
    {
        // need this call to parse page numbers
        reader.ConsolidateNamedDestinations();

        // GetBookmark returns null for a document without bookmarks; use an empty list then.
        var rawBookmarks = SimpleBookmark.GetBookmark(reader)
            ?? new List<Dictionary<string, object>>();
        var bookmarks = ParseBookMarks(rawBookmarks);

        var sb = new StringBuilder();
        foreach (var bookmark in bookmarks)
        {
            sb.AppendLine(string.Format(
                "{0, -4}{1, -100}{2, -25}{3}",
                bookmark.Number, bookmark.Title,
                bookmark.PageNumberString, bookmark.PageNumberInteger
            ));
        }
        File.WriteAllText(outputTextFile, sb.ToString());
    }
}
The bigger problem is how to extract each Bookmark into a separate file. If every Bookmark starts a new page it's easy:
Iterate over the return value of ParseBookMarks()
Select a page range that begins with the current BookMark.PageNumberInteger, and ends with the next BookMark.PageNumberInteger - 1
Use that page range to create separate files.
Something like this:
/// <summary>
/// Splits the PDF at <paramref name="path"/> into one file per bookmark. Each file holds the
/// pages from the bookmark's page up to the page before the next bookmark (or the last page).
/// Output files are written to OUTPUT_DIR and named by BookMark.GetFileName().
/// </summary>
/// <param name="path">Path of the PDF to split.</param>
void ProcessPdf(string path)
{
    using (var reader = new PdfReader(path))
    {
        // need this call to parse page numbers
        reader.ConsolidateNamedDestinations();

        // GetBookmark returns null for a document without bookmarks; use an empty list then.
        var rawBookmarks = SimpleBookmark.GetBookmark(reader)
            ?? new List<Dictionary<string, object>>();
        var bookmarks = ParseBookMarks(rawBookmarks);

        for (int i = 0; i < bookmarks.Count; ++i)
        {
            int page = bookmarks[i].PageNumberInteger;
            int nextPage = i + 1 < bookmarks.Count
                // if not top of page will be missing content
                ? bookmarks[i + 1].PageNumberInteger - 1
                /* alternative is to potentially add redundant content:
                ? bookmarks[i + 1].PageNumberInteger
                */
                : reader.NumberOfPages;

            // When the next bookmark is on the same page, the computed end lies before the
            // start; export at least the bookmark's own page instead.
            nextPage = Math.Max(page, nextPage);
            string range = string.Format("{0}-{1}", page, nextPage);

            // DEMO!
            if (i < 10)
            {
                var outputPath = Path.Combine(OUTPUT_DIR, bookmarks[i].GetFileName());
                // Work on a copy so the page selection does not affect the shared reader.
                using (var readerCopy = new PdfReader(reader))
                {
                    readerCopy.SelectPages(range);
                    using (FileStream stream = new FileStream(outputPath, FileMode.Create))
                    {
                        using (var document = new Document())
                        {
                            using (var copy = new PdfCopy(document, stream))
                            {
                                document.Open();
                                int n = readerCopy.NumberOfPages;
                                // Page numbers are 1-based.
                                for (int j = 1; j <= n; ++j)
                                {
                                    copy.AddPage(copy.GetImportedPage(readerCopy, j));
                                }
                            }
                        }
                    }
                }
            }
        }
    }
}
The problem is that it's highly unlikely all bookmarks are going to be at the top of every page of the PDF. To see what I mean, experiment with commenting / uncommenting the bookmarks[i + 1].PageNumberInteger lines.

Extract powerpoint titles with C#

I have PowerPoint 97-2003 files (.ppt extension) and I need to extract the slide titles programmatically using C#.
I have tried using Microsoft.Office.Interop but without success.
I have searched with Google, and the most I have found is how to obtain a reference to a PowerPoint.Slide:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Microsoft.Office.Core;
using PowerPoint = Microsoft.Office.Interop.PowerPoint;
namespace Tester
{
    /// <summary>
    /// Test program that opens a .ppt file through PowerPoint interop and visits its slides.
    /// </summary>
    class Program
    {
        static void Main(string[] args)
        {
            Microsoft.Office.Interop.PowerPoint.Application presentationApp = new Microsoft.Office.Interop.PowerPoint.Application();
            try
            {
                string pptPath = @"D:\somefile.ppt";
                TestReadingTitles(presentationApp, pptPath);
            }
            finally
            {
                // Always shut the PowerPoint instance down again.
                presentationApp.Quit();
            }
        }

        /// <summary>
        /// Opens the presentation read-only and without a window, then visits every slide once.
        /// </summary>
        /// <param name="presentationApp">Running PowerPoint application.</param>
        /// <param name="pptPath">Path of the presentation to open.</param>
        private static void TestReadingTitles(Microsoft.Office.Interop.PowerPoint.Application presentationApp, string pptPath)
        {
            presentationApp.Visible = Microsoft.Office.Core.MsoTriState.msoTrue;
            Microsoft.Office.Interop.PowerPoint.Presentations presentations = presentationApp.Presentations;
            Microsoft.Office.Core.MsoTriState readOnly = Microsoft.Office.Core.MsoTriState.msoTrue;
            Microsoft.Office.Core.MsoTriState untitled = Microsoft.Office.Core.MsoTriState.msoTrue;
            Microsoft.Office.Core.MsoTriState withWindow = Microsoft.Office.Core.MsoTriState.msoFalse;
            Microsoft.Office.Interop.PowerPoint.Presentation presentation = presentations.Open(pptPath, readOnly, untitled, withWindow);

            // A single pass is enough; wrapping this foreach in a counting for loop visited
            // every slide Slides.Count times.
            foreach (PowerPoint.Slide slide in presentation.Slides)
            {
                // Placeholder from the question: the expression that yields the title is what is being asked for.
                string slidetitle = ??????????????????;
            }
        }
    }
}

You can extract the titles without looping through the shapes.
/// <summary>
/// Returns the text of the slide's title shape.
/// </summary>
/// <param name="slide">Slide to inspect.</param>
/// <param name="defaultValue">Value returned when the slide has no title shape or the title has no text.</param>
/// <returns>The title text, or <paramref name="defaultValue"/>.</returns>
private static string ExtractSlideTitlefromSlide(Microsoft.Office.Interop.PowerPoint.Slide slide, string defaultValue)
{
    // No title shape on this slide.
    if (slide.Shapes.HasTitle != Office.MsoTriState.msoTrue)
    {
        return defaultValue;
    }

    // Title shape exists but is empty.
    if (slide.Shapes.Title.TextFrame.HasText != Office.MsoTriState.msoTrue)
    {
        return defaultValue;
    }

    return slide.Shapes.Title.TextFrame.TextRange.Text;
}

I have no solution for extracting slide titles directly from a .ppt file. This is a workaround: first temporarily convert the file to .pptx and then extract the titles using Open XML.
For the conversion from .ppt to .pptx I have used Microsoft Interop, which I do not like, but I have no better solution.
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Text;
using PowerPoint = Microsoft.Office.Interop.PowerPoint;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Presentation;
using D = DocumentFormat.OpenXml.Drawing;
using Shape = DocumentFormat.OpenXml.Presentation.Shape;
namespace Tester
{
    /// <summary>
    /// Reads the slide titles of a .ppt file by converting it to a temporary .pptx through
    /// PowerPoint interop and then reading the titles with the Open XML SDK.
    /// </summary>
    class Program
    {
        static void Main(string[] args)
        {
            string pptPath = @"D:\mypresentation.ppt";
            ReadTitles(pptPath);
        }

        /// <summary>Prints the slide titles of the given .ppt file to the debug output.</summary>
        private static void ReadTitles(string pptPath)
        {
            IList<string> slideTitles = GetSlidesTitles(pptPath);
            Debug.Print("SLIDES TITLES FOR {0}:", pptPath);
            foreach (string slideTitle in slideTitles)
            {
                Debug.Print("\t {0}", slideTitle);
            }
        }

        /// <summary>
        /// Converts the .ppt to a temporary .pptx, reads the titles and deletes the temporary file.
        /// </summary>
        /// <returns>The titles in slide order; an empty list when the conversion failed.</returns>
        private static IList<string> GetSlidesTitles(string pptPath)
        {
            string pptxPath = SaveAsPptx(pptPath);
            if (pptxPath == null)
            {
                return new List<string>();
            }

            try
            {
                return GetSlideTitles(pptxPath);
            }
            finally
            {
                // Remove the temporary file even when reading the titles failed.
                try
                {
                    File.Delete(pptxPath);
                    Debug.Print("Temporary pptx file {0} deleted.", pptxPath);
                }
                catch (Exception e)
                {
                    Debug.Print("Error deleting file {0}. ERROR: {1}", pptxPath, e.Message);
                }
            }
        }

        /// <summary>
        /// Saves a copy of the .ppt as .pptx in the temp directory.
        /// </summary>
        /// <returns>Path of the created .pptx, or null when the conversion failed.</returns>
        private static string SaveAsPptx(string pptPathIn)
        {
            PowerPoint.Application presentationApp = new PowerPoint.Application();
            PowerPoint.Presentation presentation = null;
            try
            {
                // Use a unique name in the temp directory: writing "<name>.pptx" next to the
                // source and deleting it afterwards would destroy an existing file of that name.
                string pptFileNameOnly = Path.GetFileNameWithoutExtension(pptPathIn);
                string pptxPathOut = Path.Combine(
                    Path.GetTempPath(),
                    pptFileNameOnly + "-" + Guid.NewGuid().ToString("N") + ".pptx");

                presentationApp.Visible = Microsoft.Office.Core.MsoTriState.msoTrue;
                PowerPoint.Presentations presentations = presentationApp.Presentations;
                Microsoft.Office.Core.MsoTriState readOnly = Microsoft.Office.Core.MsoTriState.msoFalse;
                Microsoft.Office.Core.MsoTriState untitled = Microsoft.Office.Core.MsoTriState.msoFalse;
                Microsoft.Office.Core.MsoTriState withWindow = Microsoft.Office.Core.MsoTriState.msoFalse;

                Debug.Print("Opening ppt file {0} ...", pptPathIn);
                presentation = presentations.Open(pptPathIn, readOnly, untitled, withWindow);

                Debug.Print("Starting creation of pptx from ppt {0}", pptPathIn);
                presentation.SaveCopyAs(pptxPathOut, PowerPoint.PpSaveAsFileType.ppSaveAsOpenXMLPresentation, Microsoft.Office.Core.MsoTriState.msoFalse);
                Debug.Print("Successfully created pptx {0} from ppt {1}", pptxPathOut, pptPathIn);
                return pptxPathOut;
            }
            catch (Exception e)
            {
                // The message needs placeholders; passing the exception as a bare extra
                // argument dropped it from the output.
                Debug.Print("Error during creating pptx from ppt {0}. ERROR: {1}", pptPathIn, e.Message);
                return null;
            }
            finally
            {
                try
                {
                    if (presentation != null)
                    {
                        presentation.Close();
                    }
                }
                finally
                {
                    presentationApp.Quit();
                }
            }
        }

        // Get a list of the titles of all the slides in the presentation.
        public static IList<string> GetSlideTitles(string presentationFile)
        {
            // Open the presentation as read-only.
            using (PresentationDocument presentationDocument = PresentationDocument.Open(presentationFile, false))
            {
                return GetSlideTitles(presentationDocument);
            }
        }

        // Get a list of the titles of all the slides in the presentation.
        // Returns an empty list (never null) when the document has no slide list.
        public static IList<string> GetSlideTitles(PresentationDocument presentationDocument)
        {
            if (presentationDocument == null)
            {
                throw new ArgumentNullException("presentationDocument");
            }

            List<string> titlesList = new List<string>();

            // Get a PresentationPart object from the PresentationDocument object.
            PresentationPart presentationPart = presentationDocument.PresentationPart;
            if (presentationPart == null || presentationPart.Presentation == null)
            {
                return titlesList;
            }

            // Get a Presentation object from the PresentationPart object.
            Presentation presentation = presentationPart.Presentation;
            if (presentation.SlideIdList == null)
            {
                return titlesList;
            }

            // Get the title of each slide in the slide order.
            foreach (var slideId in presentation.SlideIdList.Elements<SlideId>())
            {
                SlidePart slidePart = presentationPart.GetPartById(slideId.RelationshipId) as SlidePart;
                // Get the slide title.
                string title = GetSlideTitle(slidePart);
                // An empty title can also be added.
                titlesList.Add(title);
            }
            return titlesList;
        }

        // Get the title string of the slide. Paragraphs of the title shapes are joined with "\n".
        public static string GetSlideTitle(SlidePart slidePart)
        {
            if (slidePart == null)
            {
                throw new ArgumentNullException("slidePart");
            }
            if (slidePart.Slide == null)
            {
                return string.Empty;
            }

            // Declare a paragraph separator; it stays null until the first paragraph was written.
            string paragraphSeparator = null;

            // Find all the title shapes.
            var shapes = from shape in slidePart.Slide.Descendants<Shape>()
                         where IsTitleShape(shape)
                         select shape;

            StringBuilder paragraphText = new StringBuilder();
            foreach (var shape in shapes)
            {
                // A title placeholder without any text has no text body.
                if (shape.TextBody == null)
                {
                    continue;
                }

                // Get the text in each paragraph in this shape.
                foreach (var paragraph in shape.TextBody.Descendants<D.Paragraph>())
                {
                    // Add a line break.
                    paragraphText.Append(paragraphSeparator);
                    foreach (var text in paragraph.Descendants<D.Text>())
                    {
                        paragraphText.Append(text.Text);
                    }
                    paragraphSeparator = "\n";
                }
            }
            return paragraphText.ToString();
        }

        // Determines whether the shape is a title shape.
        private static bool IsTitleShape(Shape shape)
        {
            // Either level of the non-visual properties may be absent.
            var nonVisual = shape.NonVisualShapeProperties;
            if (nonVisual == null || nonVisual.ApplicationNonVisualDrawingProperties == null)
            {
                return false;
            }

            var placeholderShape = nonVisual.ApplicationNonVisualDrawingProperties.GetFirstChild<PlaceholderShape>();
            if (placeholderShape != null && placeholderShape.Type != null && placeholderShape.Type.HasValue)
            {
                switch ((PlaceholderValues)placeholderShape.Type)
                {
                    // Any title shape.
                    case PlaceholderValues.Title:
                    // A centered title.
                    case PlaceholderValues.CenteredTitle:
                        return true;
                    default:
                        return false;
                }
            }
            return false;
        }
    }
}

Finally I have found a way to get out powerpoint presentation titles from .ppt file without converting it to .pptx. Here is a solution:
using System;
using System.Collections.Generic;
using Microsoft.Office.Core;
using PowerPoint = Microsoft.Office.Interop.PowerPoint;
namespace Mintra.Publisher.DocumentConverter.Core.Utils
{
    /// <summary>
    /// Reads slide titles from a presentation through PowerPoint interop.
    /// </summary>
    class InteropUtility
    {
        /// <summary>
        /// Opens the presentation read-only and returns one title per slide. Slides without a
        /// title shape get the default title "Slide N", where N is the 1-based position.
        /// </summary>
        /// <param name="pptPath">Path of the presentation.</param>
        /// <returns>The titles in slide order.</returns>
        public static IList<string> GetPresentationTitles(string pptPath)
        {
            IList<string> result = new List<string>();
            var presentationApp = new Microsoft.Office.Interop.PowerPoint.Application();
            try
            {
                presentationApp.Visible = Microsoft.Office.Core.MsoTriState.msoTrue;
                Microsoft.Office.Interop.PowerPoint.Presentations presentations = presentationApp.Presentations;
                var readOnly = Microsoft.Office.Core.MsoTriState.msoTrue;
                var untitled = Microsoft.Office.Core.MsoTriState.msoTrue;
                var withWindow = Microsoft.Office.Core.MsoTriState.msoFalse;
                Microsoft.Office.Interop.PowerPoint.Presentation presentation = presentations.Open(pptPath, readOnly, untitled, withWindow);

                int i = 0;
                foreach (PowerPoint.Slide slide in presentation.Slides)
                {
                    // Advance the counter for every slide; it was never incremented before,
                    // so every default title read "Slide 0".
                    i++;
                    string defaultTitle = String.Format("Slide {0}", i);
                    String shapeTitle = ExtractSlideTitlefromShape(slide, defaultTitle);
                    result.Add(shapeTitle);
                }
            }
            finally
            {
                presentationApp.Quit();
            }
            return result;
        }

        /// <summary>
        /// Returns the text of the first placeholder shape whose name contains "title" and
        /// which has text, or <paramref name="defaultValue"/> when there is none.
        /// </summary>
        private static string ExtractSlideTitlefromShape(PowerPoint.Slide slide, string defaultValue)
        {
            // Shapes is a 1-based collection.
            for (int i = 1; i <= slide.Shapes.Count; i++)
            {
                PowerPoint.Shape shape = slide.Shapes[i];

                if (shape.Type != MsoShapeType.msoPlaceholder)
                {
                    continue;
                }
                // Check HasTextFrame before touching TextFrame, so that shapes without a text
                // frame are skipped instead of being queried for text.
                if (shape.HasTextFrame != MsoTriState.msoTrue)
                {
                    continue;
                }
                if (shape.TextFrame.HasText != MsoTriState.msoTrue)
                {
                    continue;
                }
                // Culture-independent, case-insensitive replacement for ToLower().Contains("title").
                if (shape.Name.IndexOf("title", StringComparison.OrdinalIgnoreCase) < 0)
                {
                    continue;
                }

                return shape.TextFrame.TextRange.Text;
            }
            return defaultValue;
        }
    }
}

// Creates a new presentation and inserts two slides, setting the title text of each.
Microsoft.Office.Interop.PowerPoint.Application pptApplication = new Microsoft.Office.Interop.PowerPoint.Application();
Microsoft.Office.Interop.PowerPoint.Slides slides;
Microsoft.Office.Interop.PowerPoint._Slide slide;
// Create the Presentation File
Presentation pptPresentation = pptApplication.Presentations.Add(MsoTriState.msoTrue);
for (int i = 0; i < 2; i++)
{
    Microsoft.Office.Interop.PowerPoint.CustomLayout customLayout = pptPresentation.SlideMaster.CustomLayouts[Microsoft.Office.Interop.PowerPoint.PpSlideLayout.ppLayoutChartAndText];
    // customLayout.t
    // Create new Slide
    slides = pptPresentation.Slides;
    // Every new slide is inserted at position 1.
    slide = slides.AddSlide(1, customLayout);
    slide.Shapes.Title.Top = 0;
    slide.Shapes.Title.TextFrame.TextRange.Text = "Welcome!";
}
All you need to do is change the "Welcome!" text.

import named destinations in pdf

I'm developing an application in which a Word document is converted to PDF. My problem is rather complicated; please help me out.
My Word document has a TOC, bookmarks, endnotes and hyperlinks. When I save this document as PDF, only the bookmarks are converted. After long research I found that PDF documents do not support bookmark-to-bookmark hyperlinks; they need either a page number or a named destination.
So I chose named destinations for this purpose, but I am stuck again, because a simple "Save As" cannot generate named destinations in the PDF document. So I printed the Word document to the Adobe PDF printer and got the named destinations as required, but this document has neither bookmarks nor hyperlinks in it. So I decided to generate two PDFs from one Word document: the first with the "Save As" option and the second by printing.
test.pdf (by save as) (contains bookmarks, hyperlinks)
test_p.pdf( by printing) (only contains named destination)
Then I researched once again and found a way to extract all named destinations from test_p.pdf into XML with an iTextSharp function. Unfortunately, I have not found any way to import this XML back into test.pdf; that's why I came here.
Please guide me on what to do next if this approach is OK; otherwise, suggest another approach to accomplish this.

I wrote a class to replace URLs in my PDF files some time ago:
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using iTextSharp.text.pdf;
namespace ReplaceLinks
{
    /// <summary>
    /// Rewrites URI link annotations of a PDF into local "go to" actions that point at named
    /// destinations of the same document, and saves the result as a new file.
    /// </summary>
    public class ReplacePdfLinks
    {
        Dictionary<string, PdfObject> _namedDestinations;
        PdfReader _reader;

        /// <summary>Path of the PDF to read.</summary>
        public string InputPdf { set; get; }

        /// <summary>Path of the PDF to write.</summary>
        public string OutputPdf { set; get; }

        /// <summary>
        /// Maps a link URI to the name of the destination it should jump to; returning null or
        /// whitespace leaves the link unchanged.
        /// </summary>
        public Func<Uri, string> UriToNamedDestination { set; get; }

        /// <summary>Updates the links of <see cref="InputPdf"/> and writes <see cref="OutputPdf"/>.</summary>
        public void Start()
        {
            try
            {
                updatePdfLinks();
                saveChanges();
            }
            finally
            {
                // Release the input file whether or not processing succeeded.
                if (_reader != null)
                {
                    _reader.Close();
                    _reader = null;
                }
            }
        }

        // Returns the /Annots array of the page, or null when the page has none.
        private PdfArray getAnnotationsOfCurrentPage(int pageNumber)
        {
            var pageDictionary = _reader.GetPageN(pageNumber);
            var annotations = pageDictionary.GetAsArray(PdfName.ANNOTS);
            return annotations;
        }

        // True for link annotations. Comparing from the constant side tolerates a missing /Subtype.
        private static bool hasAction(PdfDictionary annotationDictionary)
        {
            return PdfName.LINK.Equals(annotationDictionary.Get(PdfName.SUBTYPE));
        }

        // True when the action is a URI action. Tolerates a missing /S entry.
        private static bool isUriAction(PdfDictionary annotationAction)
        {
            return PdfName.URI.Equals(annotationAction.Get(PdfName.S));
        }

        // Replaces the URI action by a GoTo action when the URI maps to a known named destination.
        // The action is only modified after every lookup has succeeded.
        private void replaceUriWithLocalDestination(PdfDictionary annotationAction)
        {
            if (UriToNamedDestination == null)
                return;

            var uri = annotationAction.Get(PdfName.URI) as PdfString;
            if (uri == null)
                return;

            var uriText = uri.ToString();
            if (string.IsNullOrWhiteSpace(uriText))
                return;

            // Links may carry text that is not a valid absolute URI; skip those instead of
            // failing on the whole document.
            Uri parsedUri;
            if (!Uri.TryCreate(uriText, UriKind.Absolute, out parsedUri))
                return;

            var namedDestination = UriToNamedDestination(parsedUri);
            if (string.IsNullOrWhiteSpace(namedDestination))
                return;

            PdfObject entry;
            if (!_namedDestinations.TryGetValue(namedDestination, out entry))
                return;

            // The page reference is the indirect reference inside the destination array.
            var destinationArray = entry as PdfArray;
            if (destinationArray == null)
                return;
            var xRef = destinationArray.FirstOrDefault(x => x is PdfIndirectReference);
            if (xRef == null)
                return;

            annotationAction.Remove(PdfName.S);
            annotationAction.Remove(PdfName.URI);
            annotationAction.Put(PdfName.S, PdfName.GOTO);

            var newLocalDestination = new PdfArray();
            newLocalDestination.Add(xRef);
            newLocalDestination.Add(PdfName.FITH);
            annotationAction.Put(PdfName.D, newLocalDestination);
        }

        // Writes the (modified) document held by the reader to OutputPdf.
        private void saveChanges()
        {
            using (var fileStream = new FileStream(OutputPdf, FileMode.Create, FileAccess.Write, FileShare.None))
            using (var stamper = new PdfStamper(_reader, fileStream))
            {
                stamper.Close();
            }
        }

        // Opens InputPdf and rewrites the matching link annotations on every page.
        private void updatePdfLinks()
        {
            _reader = new PdfReader(InputPdf);
            _namedDestinations = _reader.GetNamedDestinationFromStrings();
            var pageCount = _reader.NumberOfPages;
            for (var i = 1; i <= pageCount; i++)
            {
                var annotations = getAnnotationsOfCurrentPage(i);
                if (annotations == null || !annotations.Any())
                    continue;

                foreach (var annotation in annotations.ArrayList)
                {
                    // Resolve indirect references; skip entries that are not dictionaries.
                    var annotationDictionary = PdfReader.GetPdfObject(annotation) as PdfDictionary;
                    if (annotationDictionary == null)
                        continue;
                    if (!hasAction(annotationDictionary))
                        continue;

                    var annotationAction = annotationDictionary.Get(PdfName.A) as PdfDictionary;
                    if (annotationAction == null)
                        continue;
                    if (!isUriAction(annotationAction))
                        continue;

                    replaceUriWithLocalDestination(annotationAction);
                }
            }
        }
    }
}
To use it:
// Redirects every link whose host contains "google.com" to the named destination "entry1".
new ReplacePdfLinks
{
    InputPdf = @"test.pdf",
    OutputPdf = "mod.pdf",
    UriToNamedDestination = uri =>
    {
        // Ordinal, case-insensitive host check without allocating a lowered copy.
        if (uri.Host.IndexOf("google.com", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            return "entry1";
        }
        // An empty name leaves the link unchanged.
        return string.Empty;
    }
}.Start();
This sample will modify all of the urls containing google.com to point to a specific named destination "entry1".
And this is the sample file to test the above class:
/// <summary>
/// Writes the three-page sample "test.pdf": page 1 holds a link to an external URL, page 2 a
/// local jump to "entry1", and page 3 the text that defines the destination "entry1".
/// </summary>
void WriteFile()
{
    using (var pdfDocument = new Document(PageSize.LETTER))
    using (var output = new FileStream("test.pdf", FileMode.Create))
    using (var pdfWriter = PdfWriter.GetInstance(pdfDocument, output))
    {
        pdfDocument.Open();

        var linkFont = FontFactory.GetFont("Arial", 12, Font.NORMAL, BaseColor.BLUE);

        // Page 1: external link.
        var urlChunk = new Chunk("Go to URL", linkFont);
        pdfDocument.Add(urlChunk.SetAction(new PdfAction("http://www.google.com/", false)));

        // Page 2: jump to the local destination.
        pdfDocument.NewPage();
        var jumpChunk = new Chunk("Go to Test", linkFont);
        pdfDocument.Add(jumpChunk.SetLocalGoto("entry1"));

        // Page 3: the destination itself.
        pdfDocument.NewPage();
        var targetChunk = new Chunk("Test");
        pdfDocument.Add(targetChunk.SetLocalDestination("entry1"));

        pdfDocument.Close();
    }
}

Develop Reference

C# (C-Sharp) is a programming language developed by Microsoft that runs on the .NET Framework.

how to copy highlighted text from pdf file - c#

Related

link extraction using HtmlAgilityPack and c#

get title tag by html agility pack

How to create an array and fill from tree node variable

Extract powerpoint titles with C#

import named destinations in pdf

Categories

Resources