Extract powerpoint titles with C#

Extract powerpoint titles with C# - c#

I have PowerPoint 97-2003 files (.ppt extension) and I need to extract slide titles programmatically using C#.
I have tried using Microsoft.Office.Interop but without success.
I have searched with Google, and the most I have found is how to obtain a reference to PowerPoint.Slide:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Microsoft.Office.Core;
using PowerPoint = Microsoft.Office.Interop.PowerPoint;
namespace Tester
{
    /// <summary>
    /// Console harness from the question: opens a .ppt file through PowerPoint
    /// interop and walks its slides. The expression that yields a slide's title
    /// is the open question and is left as the asker's placeholder.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Starts PowerPoint, runs the title-reading test against a hard-coded file
        /// and always quits the automated PowerPoint instance afterwards.
        /// </summary>
        static void Main(string[] args)
        {
            Microsoft.Office.Interop.PowerPoint.Application presentationApp = new Microsoft.Office.Interop.PowerPoint.Application();
            try
            {
                // Verbatim string literal (@"...") so the backslash is taken literally.
                string pptPath = @"D:\somefile.ppt";
                TestReadingTitles(presentationApp, pptPath);
            }
            finally
            {
                presentationApp.Quit();
            }
        }

        /// <summary>
        /// Opens <paramref name="pptPath"/> read-only and without a window, then
        /// visits every slide exactly once.
        /// </summary>
        /// <param name="presentationApp">Running PowerPoint application used to open the file.</param>
        /// <param name="pptPath">Path of the .ppt file to open.</param>
        private static void TestReadingTitles(Microsoft.Office.Interop.PowerPoint.Application presentationApp, string pptPath)
        {
            presentationApp.Visible = Microsoft.Office.Core.MsoTriState.msoTrue;
            Microsoft.Office.Interop.PowerPoint.Presentations presentations = presentationApp.Presentations;
            Microsoft.Office.Core.MsoTriState readOnly = Microsoft.Office.Core.MsoTriState.msoTrue;
            Microsoft.Office.Core.MsoTriState untitled = Microsoft.Office.Core.MsoTriState.msoTrue;
            Microsoft.Office.Core.MsoTriState withWindow = Microsoft.Office.Core.MsoTriState.msoFalse;
            Microsoft.Office.Interop.PowerPoint.Presentation presentation = presentations.Open(pptPath, readOnly, untitled, withWindow);

            // A single foreach already visits every slide; the extra counting loop
            // that used to wrap it only repeated the whole pass Slides.Count times.
            foreach (PowerPoint.Slide slide in presentation.Slides)
            {
                // Placeholder kept from the question: this is the expression being asked for.
                string slidetitle = ??????????????????;
            }
        }
    }
}

You can extract the titles without looping through the shapes.
/// <summary>
/// Reads the text of a slide's title placeholder.
/// </summary>
/// <param name="slide">Slide to inspect.</param>
/// <param name="defaultValue">Value returned when the slide has no title shape or the title has no text.</param>
/// <returns>The title text, or <paramref name="defaultValue"/> when none is available.</returns>
private static string ExtractSlideTitlefromSlide(Microsoft.Office.Interop.PowerPoint.Slide slide, string defaultValue)
{
    // The slide has no title shape at all.
    if (slide.Shapes.HasTitle != Office.MsoTriState.msoTrue)
    {
        return defaultValue;
    }

    // The title shape exists but holds no text.
    if (slide.Shapes.Title.TextFrame.HasText != Office.MsoTriState.msoTrue)
    {
        return defaultValue;
    }

    return slide.Shapes.Title.TextFrame.TextRange.Text;
}

I have no solution for extracting slide titles directly from a .ppt file. This is a workaround: first temporarily convert it to .pptx and then extract the titles using Open XML.
For conversion from ppt to pptx I have used Microsoft Interop which I do not like but I have no better solution.
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Text;
using PowerPoint = Microsoft.Office.Interop.PowerPoint;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Presentation;
using D = DocumentFormat.OpenXml.Drawing;
using Shape = DocumentFormat.OpenXml.Presentation.Shape;
namespace Tester
{
class Program
{
/// <summary>
/// Entry point: prints the slide titles of a hard-coded sample presentation.
/// </summary>
static void Main(string[] args)
{
    // Verbatim string literal (@"...") so the backslash is taken literally.
    string pptPath = @"D:\mypresentation.ppt";
    ReadTitles(pptPath);
}
/// <summary>
/// Writes the titles of every slide in the given presentation to the debug output,
/// one tab-indented line per slide under a header line.
/// </summary>
/// <param name="pptPath">Path of the .ppt file whose titles are listed.</param>
private static void ReadTitles(string pptPath)
{
    IList<string> titles = GetSlidesTitles(pptPath);

    Debug.Print("SLIDES TITLES FOR {0}:", pptPath);
    for (int index = 0; index < titles.Count; index++)
    {
        Debug.Print("\t {0}", titles[index]);
    }
}
/// <summary>
/// Converts the .ppt file to a temporary .pptx, reads the slide titles from it
/// and removes the temporary file again.
/// </summary>
/// <param name="pptPath">Path of the source .ppt file.</param>
/// <returns>The slide titles in slide order.</returns>
private static IList<string> GetSlidesTitles(string pptPath)
{
    string pptxPath = SaveAsPptx(pptPath);
    try
    {
        return GetSlideTitles(pptxPath);
    }
    finally
    {
        // Clean up in a finally block so the temporary file is also removed when
        // reading the titles throws. Deletion itself stays best-effort.
        try
        {
            File.Delete(pptxPath);
            Debug.Print("Temporary pptx file {0} deleted.", pptxPath);
        }
        catch (Exception e)
        {
            Debug.Print("Error deleting file {0}. ERROR: {1}", pptxPath, e.Message);
        }
    }
}
/// <summary>
/// Uses PowerPoint interop to save a copy of a .ppt file in Open XML (.pptx) format.
/// </summary>
/// <param name="pptPathIn">Path of the source .ppt file.</param>
/// <returns>Path of the newly created temporary .pptx file.</returns>
/// <remarks>
/// The copy is written to a uniquely named file in the temp directory. Writing it next to
/// the source would overwrite an existing .pptx with the same base name, which the caller
/// then deletes as a "temporary" file. Conversion errors are logged and rethrown so the
/// caller does not go on to open a file that was never created.
/// </remarks>
private static string SaveAsPptx(string pptPathIn)
{
    Microsoft.Office.Interop.PowerPoint.Application presentationApp = new Microsoft.Office.Interop.PowerPoint.Application();
    Microsoft.Office.Interop.PowerPoint.Presentation presentation = null;
    string pptxPathOut = Path.Combine(Path.GetTempPath(), Guid.NewGuid().ToString("N") + ".pptx");
    try
    {
        presentationApp.Visible = Microsoft.Office.Core.MsoTriState.msoTrue;
        Microsoft.Office.Interop.PowerPoint.Presentations presentations = presentationApp.Presentations;
        Microsoft.Office.Core.MsoTriState readOnly = Microsoft.Office.Core.MsoTriState.msoFalse;
        Microsoft.Office.Core.MsoTriState untitled = Microsoft.Office.Core.MsoTriState.msoFalse;
        Microsoft.Office.Core.MsoTriState withWindow = Microsoft.Office.Core.MsoTriState.msoFalse;

        Debug.Print("Opening ppt file {0} ...", pptPathIn);
        presentation = presentations.Open(pptPathIn, readOnly, untitled, withWindow);

        Debug.Print("Starting creation of pptx from ppt {0}", pptPathIn);
        presentation.SaveCopyAs(pptxPathOut, PowerPoint.PpSaveAsFileType.ppSaveAsOpenXMLPresentation, Microsoft.Office.Core.MsoTriState.msoFalse);
        Debug.Print("Successfully created pptx {0} from ppt {1}", pptxPathOut, pptPathIn);
    }
    catch (Exception e)
    {
        // Placeholders are required: without them the exception argument is silently
        // dropped, and a path containing '{' would break the format string.
        Debug.Print("Error during creating pptx from ppt {0}. ERROR: {1}", pptPathIn, e);
        throw;
    }
    finally
    {
        try
        {
            if (presentation != null)
            {
                presentation.Close();
            }
        }
        finally
        {
            // Quit even if closing the presentation fails.
            presentationApp.Quit();
        }
    }
    return pptxPathOut;
}
/// <summary>
/// Opens the given .pptx file read-only and returns the titles of all its slides.
/// </summary>
/// <param name="presentationFile">Path of the .pptx file.</param>
/// <returns>The slide titles as produced by the <see cref="PresentationDocument"/> overload.</returns>
public static IList<string> GetSlideTitles(string presentationFile)
{
    IList<string> titles;

    // Second argument 'false' opens the package as non-editable.
    using (PresentationDocument document = PresentationDocument.Open(presentationFile, false))
    {
        titles = GetSlideTitles(document);
    }

    return titles;
}
/// <summary>
/// Gets the titles of all slides in the presentation, in slide order.
/// </summary>
/// <param name="presentationDocument">An open presentation document.</param>
/// <returns>
/// One entry per slide (possibly an empty string). An empty list is returned, rather
/// than null, when the document has no presentation part or no slide list, so callers
/// can enumerate the result unconditionally.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="presentationDocument"/> is null.</exception>
public static IList<string> GetSlideTitles(PresentationDocument presentationDocument)
{
    if (presentationDocument == null)
    {
        throw new ArgumentNullException("presentationDocument");
    }

    List<string> titlesList = new List<string>();

    // Get a PresentationPart object from the PresentationDocument object.
    PresentationPart presentationPart = presentationDocument.PresentationPart;
    if (presentationPart == null || presentationPart.Presentation == null)
    {
        return titlesList;
    }

    // Get a Presentation object from the PresentationPart object.
    Presentation presentation = presentationPart.Presentation;
    if (presentation.SlideIdList == null)
    {
        return titlesList;
    }

    // Get the title of each slide in the slide order.
    foreach (var slideId in presentation.SlideIdList.Elements<SlideId>())
    {
        SlidePart slidePart = presentationPart.GetPartById(slideId.RelationshipId) as SlidePart;

        // An empty title is added as well, so list positions match slide positions.
        titlesList.Add(GetSlideTitle(slidePart));
    }
    return titlesList;
}
/// <summary>
/// Gets the title text of a slide.
/// </summary>
/// <param name="slidePart">The slide part to read.</param>
/// <returns>
/// The text of all title shapes on the slide, with paragraphs separated by "\n",
/// or an empty string when the slide has no title text.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="slidePart"/> is null.</exception>
public static string GetSlideTitle(SlidePart slidePart)
{
    if (slidePart == null)
    {
        throw new ArgumentNullException("slidePart");
    }

    if (slidePart.Slide == null)
    {
        return string.Empty;
    }

    // Find all the title shapes.
    var titleShapes = slidePart.Slide.Descendants<Shape>().Where(IsTitleShape);

    StringBuilder paragraphText = new StringBuilder();

    // Null before the first paragraph so no leading separator is emitted.
    string paragraphSeparator = null;

    foreach (var shape in titleShapes)
    {
        // A title placeholder can exist without any text body; skip it
        // instead of dereferencing null.
        if (shape.TextBody == null)
        {
            continue;
        }

        // Get the text in each paragraph in this shape.
        foreach (var paragraph in shape.TextBody.Descendants<D.Paragraph>())
        {
            paragraphText.Append(paragraphSeparator);
            foreach (var text in paragraph.Descendants<D.Text>())
            {
                paragraphText.Append(text.Text);
            }
            paragraphSeparator = "\n";
        }
    }
    return paragraphText.ToString();
}
/// <summary>
/// Tells whether a shape is a title placeholder (either a regular or a centered title).
/// </summary>
/// <param name="shape">The shape to classify.</param>
/// <returns>True for Title and CenteredTitle placeholders; false for everything else.</returns>
private static bool IsTitleShape(Shape shape)
{
    var placeholder = shape.NonVisualShapeProperties.ApplicationNonVisualDrawingProperties.GetFirstChild<PlaceholderShape>();

    // Not a placeholder, or a placeholder without an explicit type.
    if (placeholder == null || placeholder.Type == null || !placeholder.Type.HasValue)
    {
        return false;
    }

    PlaceholderValues placeholderKind = (PlaceholderValues)placeholder.Type;
    return placeholderKind == PlaceholderValues.Title
        || placeholderKind == PlaceholderValues.CenteredTitle;
}
}
}

Finally I have found a way to get out powerpoint presentation titles from .ppt file without converting it to .pptx. Here is a solution:
using System;
using System.Collections.Generic;
using Microsoft.Office.Core;
using PowerPoint = Microsoft.Office.Interop.PowerPoint;
namespace Mintra.Publisher.DocumentConverter.Core.Utils
{
class InteropUtility
{
/// <summary>
/// Opens a .ppt file through PowerPoint interop and collects one title per slide.
/// </summary>
/// <param name="pptPath">Path of the presentation to read.</param>
/// <returns>
/// The title of each slide in slide order. Slides without a usable title get the
/// fallback "Slide N", where N is the 1-based position of the slide.
/// </returns>
public static IList<string> GetPresentationTitles(string pptPath)
{
    IList<string> result = new List<string>();
    var presentationApp = new Microsoft.Office.Interop.PowerPoint.Application();
    try
    {
        presentationApp.Visible = Microsoft.Office.Core.MsoTriState.msoTrue;
        Microsoft.Office.Interop.PowerPoint.Presentations presentations = presentationApp.Presentations;
        var readOnly = Microsoft.Office.Core.MsoTriState.msoTrue;
        var untitled = Microsoft.Office.Core.MsoTriState.msoTrue;
        var withWindow = Microsoft.Office.Core.MsoTriState.msoFalse;
        Microsoft.Office.Interop.PowerPoint.Presentation presentation = presentations.Open(pptPath, readOnly, untitled, withWindow);

        int slideNumber = 0;
        foreach (PowerPoint.Slide slide in presentation.Slides)
        {
            // Advance the counter for every slide; previously it was never
            // incremented, so every fallback title read "Slide 0".
            slideNumber++;
            string defaultTitle = String.Format("Slide {0}", slideNumber);
            String shapeTitle = ExtractSlideTitlefromShape(slide, defaultTitle);
            result.Add(shapeTitle);
        }
    }
    finally
    {
        presentationApp.Quit();
    }
    return result;
}
/// <summary>
/// Finds the first placeholder shape on the slide whose name contains "title"
/// and that holds text, and returns that text.
/// </summary>
/// <param name="slide">Slide whose shapes are scanned (PowerPoint shape indices are 1-based).</param>
/// <param name="defaultValue">Value returned when no such shape exists.</param>
/// <returns>The text of the matching shape, or <paramref name="defaultValue"/>.</returns>
private static string ExtractSlideTitlefromShape(PowerPoint.Slide slide, string defaultValue)
{
    for (int i = 1; i <= slide.Shapes.Count; i++)
    {
        PowerPoint.Shape shape = slide.Shapes[i];

        if (shape.Type != MsoShapeType.msoPlaceholder)
        {
            continue;
        }

        // Check HasTextFrame before touching TextFrame: the original read
        // TextFrame.HasText unconditionally, even for shapes reporting no text frame.
        if (shape.HasTextFrame != MsoTriState.msoTrue)
        {
            continue;
        }

        // Ordinal, case-insensitive match. ToLower() depends on the current culture
        // (for example "TITLE" does not lower-case to "title" under tr-TR).
        bool isTitleShape = shape.Name.IndexOf("title", StringComparison.OrdinalIgnoreCase) >= 0;
        if (isTitleShape && shape.TextFrame.HasText == MsoTriState.msoTrue)
        {
            return shape.TextFrame.TextRange.Text;
        }
    }
    return defaultValue;
}
}
}

// Fragment from the answer: creates a presentation and sets slide titles through interop.
// NOTE(review): the snippet is cut off - the for loop below is never closed in the source.
Microsoft.Office.Interop.PowerPoint.Application pptApplication = new Microsoft.Office.Interop.PowerPoint.Application();
Microsoft.Office.Interop.PowerPoint.Slides slides;
Microsoft.Office.Interop.PowerPoint._Slide slide;
// Create the Presentation File
Presentation pptPresentation = pptApplication.Presentations.Add(MsoTriState.msoTrue);
// Two iterations, each adding one slide.
for (int i = 0; i < 2; i++)
{
// NOTE(review): CustomLayouts is indexed here with a PpSlideLayout value - confirm this picks the intended layout.
Microsoft.Office.Interop.PowerPoint.CustomLayout customLayout = pptPresentation.SlideMaster.CustomLayouts[Microsoft.Office.Interop.PowerPoint.PpSlideLayout.ppLayoutChartAndText];
// customLayout.t
// Create new Slide
slides = pptPresentation.Slides;
// Index 1 is passed on every iteration, so each new slide is added at position 1.
slide = slides.AddSlide(1, customLayout);
// Set the title shape's Top to 0 and assign the title text.
slide.Shapes.Title.Top = 0;
slide.Shapes.Title.TextFrame.TextRange.Text = "Welcome!";
All you need to do is change the "Welcome!" text.

Related

How to choose multiple image from Gallery at one time (tickbox) in Xamarin Android?

Hi, I have a Xamarin Android project using C#. Currently I am using the await CrossMedia.Current.PickPhotoAsync() method to upload an image. However, it does not provide a tickbox beside the images that would let me select several of them. How can I select multiple images and upload them together?

You could implement it by yourself.
1.Add these methods to your MainActivity.cs file
// Request code used when launching the gallery picker.
public static int OPENGALLERYCODE = 100;

/// <summary>
/// Receives the gallery picker's result and resolves every selected image to a file path.
/// Handles both a multi-selection (ClipData) and a single selection (Data).
/// </summary>
protected override void OnActivityResult(int requestCode, Result resultCode, Intent data)
{
    base.OnActivityResult(requestCode, resultCode, data);

    // Only react to a successful result of our own gallery request.
    if (requestCode != OPENGALLERYCODE || resultCode != Result.Ok)
    {
        return;
    }

    List<string> images = new List<string>();
    if (data == null)
    {
        return;
    }

    ClipData selection = data.ClipData;
    if (selection == null)
    {
        // Single selection: the URI is delivered directly on the intent.
        var singlePath = GetRealPathFromURI(data.Data);
        if (singlePath != null)
        {
            images.Add(singlePath);
        }
        return;
    }

    // Multiple selection: resolve each clip item individually.
    for (int index = 0; index < selection.ItemCount; index++)
    {
        Android.Net.Uri itemUri = selection.GetItemAt(index).Uri;
        var itemPath = GetRealPathFromURI(itemUri);
        if (itemPath != null)
        {
            images.Add(itemPath);
        }
    }
}
/// <summary>
/// Get the real file path for the image identified by the given content URI.
/// </summary>
/// <param name="contentURI">Content URI returned by the picker.</param>
/// <returns>
/// The file path, or null when it cannot be determined (a toast is shown in that case).
/// </returns>
/// <remarks>
/// First reads the Data column directly from the URI. If that yields nothing, falls back
/// to looking the image up by its document id in the internal and then the external
/// media store. Every cursor that was opened is closed before returning.
/// </remarks>
public String GetRealPathFromURI(Android.Net.Uri contentURI)
{
    ICursor imageCursor = null;
    ICursor cursor = null;
    try
    {
        imageCursor = ContentResolver.Query(contentURI, null, null, null, null);
        if (imageCursor != null && imageCursor.MoveToFirst())
        {
            int idx = imageCursor.GetColumnIndex(MediaStore.Images.ImageColumns.Data);
            if (idx != -1)
            {
                return imageCursor.GetString(idx);
            }
        }

        // Fallback: the document id has the form "<type>:<id>"; query the media store by id.
        var docID = DocumentsContract.GetDocumentId(contentURI);
        var id = docID.Split(':')[1];
        var whereSelect = MediaStore.Images.ImageColumns.Id + "=?";
        var projections = new string[] { MediaStore.Images.ImageColumns.Data };

        cursor = ContentResolver.Query(MediaStore.Images.Media.InternalContentUri, projections, whereSelect, new string[] { id }, null);
        if (cursor == null || cursor.Count == 0)
        {
            // Release the empty internal-store cursor before replacing it.
            if (cursor != null)
            {
                cursor.Close();
            }
            cursor = ContentResolver.Query(MediaStore.Images.Media.ExternalContentUri, projections, whereSelect, new string[] { id }, null);
        }

        if (cursor == null || !cursor.MoveToFirst())
        {
            return null;
        }

        var colData = cursor.GetColumnIndexOrThrow(MediaStore.Images.ImageColumns.Data);
        return cursor.GetString(colData);
    }
    catch (Exception ex)
    {
        // Keep the best-effort contract (toast + null) but do not lose the cause.
        Console.WriteLine(ex.ToString());
        Toast.MakeText(Xamarin.Forms.Forms.Context, "Unable to get path", ToastLength.Long).Show();
    }
    finally
    {
        if (imageCursor != null)
        {
            imageCursor.Close();
        }
        if (cursor != null)
        {
            cursor.Close();
        }
    }
    return null;
}
2. Invoke the following in the specific Activity from which you want to open the picker
/// <summary>
/// Launches the system image chooser with multi-selection enabled and shows a hint
/// on how to select several photos. Failures are logged to the console and reported via a toast.
/// </summary>
public void OpenGallery()
{
    try
    {
        var pickerIntent = new Intent(Intent.ActionPick);
        pickerIntent.SetType("image/*");
        pickerIntent.PutExtra(Intent.ExtraAllowMultiple, true);
        pickerIntent.SetAction(Intent.ActionGetContent);

        Intent chooser = Intent.CreateChooser(pickerIntent, "Select photo");
        this.StartActivityForResult(chooser, OPENGALLERYCODE);

        Toast.MakeText(this, "Tap and hold to select multiple photos.", ToastLength.Short).Show();
    }
    catch (Exception failure)
    {
        Console.WriteLine(failure.ToString());
        Toast.MakeText(this, "Error. Can not continue, try again.", ToastLength.Long).Show();
    }
}
/// <summary>
/// Deletes every file in the image collection folder under the public Pictures directory.
/// Does nothing when the folder does not exist.
/// </summary>
void ClearFileDirectory()
{
    var picturesRoot = Android.OS.Environment.GetExternalStoragePublicDirectory(Android.OS.Environment.DirectoryPictures);
    var directory = new Java.IO.File(picturesRoot, ImageHelpers.collectionName).Path.ToString();

    if (!Directory.Exists(directory))
    {
        return;
    }

    foreach (var filePath in Directory.GetFiles(directory, "*"))
    {
        File.Delete(filePath);
    }
}
// Name of the folder inside the Android Pictures directory that holds the saved images.
public static readonly string collectionName = "TmpPictures";

/// <summary>
/// Writes the given bytes to a file inside the collection folder, creating the folder if needed.
/// </summary>
/// <param name="imageByte">File content to write.</param>
/// <param name="fileName">Name of the file to create inside the collection folder.</param>
/// <returns>Path of the written file.</returns>
public string SaveFile(byte[] imageByte, string fileName)
{
    var picturesRoot = Android.OS.Environment.GetExternalStoragePublicDirectory(Android.OS.Environment.DirectoryPictures);
    var targetDirectory = new Java.IO.File(picturesRoot, collectionName);
    if (targetDirectory.Exists() == false)
    {
        targetDirectory.Mkdirs();
    }

    var targetPath = new Java.IO.File(targetDirectory, fileName).Path;
    System.IO.File.WriteAllBytes(targetPath, imageByte);
    return targetPath;
}
/// <summary>
/// Scales the image at <paramref name="path"/> to 86% of its size, re-encodes it as
/// JPEG at quality 67 and saves the result under a new GUID file name.
/// </summary>
/// <param name="path">Path of the image file to compress.</param>
/// <returns>Path of the compressed copy, as returned by SaveFile.</returns>
/// <exception cref="ArgumentException">The file could not be decoded as an image.</exception>
public string CompressImage(string path)
{
    // Set imageSize and imageCompression parameters.
    var imageSize = .86;
    var imageCompression = 67;

    // Get the bitmap. DecodeFile returns null when the file cannot be decoded.
    var originalImage = BitmapFactory.DecodeFile(path);
    if (originalImage == null)
    {
        throw new ArgumentException("Unable to decode an image from: " + path, "path");
    }

    Bitmap scaledImage = null;
    try
    {
        // Clamp to at least one pixel: a 1-pixel dimension would otherwise truncate to 0.
        int width = Math.Max(1, (int)(originalImage.Width * imageSize));
        int height = Math.Max(1, (int)(originalImage.Height * imageSize));

        // Resize it and then compress it to Jpeg.
        scaledImage = Bitmap.CreateScaledBitmap(originalImage, width, height, true);

        byte[] imageBytes;
        using (MemoryStream ms = new MemoryStream())
        {
            scaledImage.Compress(Bitmap.CompressFormat.Jpeg, imageCompression, ms);
            imageBytes = ms.ToArray();
        }
        return SaveFile(imageBytes, Guid.NewGuid().ToString());
    }
    finally
    {
        // Release both bitmaps deterministically rather than relying on a forced GC.
        // Guard against the scaled bitmap being the same instance as the source.
        if (scaledImage != null && !ReferenceEquals(scaledImage, originalImage))
        {
            scaledImage.Recycle();
            scaledImage.Dispose();
        }
        originalImage.Recycle();
        originalImage.Dispose();
    }
}

The name 'Core' does not exist in the current context

I am new to Xamarin and I am following Xamarin University tutorial. Everything was working fine until I started running into the error core does not exist. I looked over an old question that was posted and I followed the instructions but this did not help me at all. I have posted the link to the old question below.
The type or namespace name 'Core' does not exist in the namespace
code for MainPage.xml.cs
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Xamarin.Forms;
namespace MyTelephone
{
// Page from the question: an entry for a phoneword, a Translate button and a Call button,
// all built in code inside a StackLayout.
public partial class MainPage : ContentPage
{
Entry phoneNumberText;
Button translateButton;
Button callButton;
// Result of the last translation; assigned in OnTranslate.
string translatedNumber;
// Builds the UI in code (InitializeComponent is commented out at the end).
public MainPage()
{
this.Padding = new Thickness(20, 20, 20, 20);
StackLayout panel = new StackLayout
{
Spacing = 15
};
panel.Children.Add(new Label
{
Text = "Enter a password:",
FontSize = Device.GetNamedSize(NamedSize.Large, typeof(Label))
});
panel.Children.Add(phoneNumberText = new Entry
{
Text = "1-855-XAMARIN",
});
panel.Children.Add(translateButton = new Button
{
Text = "Translate"
});
// The Call button starts disabled; OnTranslate enables it after a successful translation.
panel.Children.Add(callButton = new Button
{
Text = "Call",
IsEnabled = false,
});
translateButton.Clicked += OnTranslate;
this.Content = panel;
//InitializeComponent();
}
// Translates the entered text and enables the Call button when a number was produced.
private void OnTranslate(object sender, EventArgs e)
{
// This is the line the question is about: it refers to a namespace named Core, while the
// PhonewordTranslator.cs shown with the question declares namespace MyTelephone.
translatedNumber =
Core.PhonewordTranslator.ToNumber(phoneNumberText.Text);
if (!string.IsNullOrEmpty(translatedNumber))
{
callButton.IsEnabled = true;
// NOTE(review): no space between "Call" and the number - confirm whether that is intended.
callButton.Text = "Call" + translatedNumber;
}
else
{
callButton.IsEnabled = false;
callButton.Text = "Call";
}
}
}
}
code for PhonewordTranslator.cs
using System.Text;
namespace MyTelephone
{
    /// <summary>
    /// Converts alphanumeric "phonewords" (e.g. 1-855-XAMARIN) into their numeric form
    /// using the standard telephone keypad letter groups.
    /// </summary>
    public static class PhonewordTranslator
    {
        // Keypad letter groups; the group at index i belongs to digit i + 2.
        static readonly string[] digits = {
            "ABC","DEF","GHI","JKL","MNO","PQRS","TUV","WXYZ"
        };

        /// <summary>
        /// Translates a phoneword into digits.
        /// </summary>
        /// <param name="raw">Text made of digits, dashes and letters (any case).</param>
        /// <returns>
        /// The numeric form with digits and dashes kept as-is, or null when the input is
        /// null/blank or contains a character that is neither a digit, a dash nor a keypad letter.
        /// </returns>
        public static string ToNumber(string raw)
        {
            if (string.IsNullOrWhiteSpace(raw))
            {
                return null;
            }

            var translated = new StringBuilder();
            foreach (char symbol in raw.ToUpperInvariant())
            {
                // Digits and dashes pass through unchanged.
                if ("-0123456789".Contains(symbol))
                {
                    translated.Append(symbol);
                    continue;
                }

                int? digit = TranslateToNumber(symbol);
                if (digit == null)
                {
                    // Any untranslatable character invalidates the whole input.
                    return null;
                }
                translated.Append(digit.Value);
            }
            return translated.ToString();
        }

        // True when the character occurs anywhere in the key string.
        static bool Contains(this string keyString, char c)
        {
            return keyString.IndexOf(c) >= 0;
        }

        // Maps a keypad letter to its digit (2-9); null when the character is on no key.
        static int? TranslateToNumber(char c)
        {
            int digit = 2;
            foreach (string letters in digits)
            {
                if (letters.Contains(c))
                {
                    return digit;
                }
                digit++;
            }
            return null;
        }
    }
}

The problem has to do with xamarin, and nothing with the question you linked.
This is the reason for the error:
translatedNumber =
Core.PhonewordTranslator.ToNumber(phoneNumberText.Text);
Check your PhonewordTranslator.cs file. You need to rename its namespace to "Core" (in the code above it is MyTelephone).
Source:
https://forums.xamarin.com/discussion/13839/core-phonewordtranslator-tonumber-compilation-error
For those needing more information: this issue can be caused by not renaming the namespace in the PhonewordTranslator.cs file to "Core".

How to create an array and fill from tree node variable

I'm trying to transfer data from a treenode (at least I think that's what it is) which contains much more data than I need. It would be very difficult for me to manipulate the data within the treenode. I would much rather have an array which provides me with only the necessary data for data manipulation.
I would like the array to have the following variables:
1. BookmarkNumber (integer)
2. Date (string)
3. DocumentType (string)
4. BookmarkPageNumberString (string)
5. BookmarkPageNumberInteger (integer)
I would like to fill the array defined above with the data from the variable book_mark (as can be seen in my code).
I've been wrestling with this for two days. Any help would be much appreciated. I'm probably sure that the question wasn't phrased correctly so please ask questions so that I may explain further if needed.
Thanks so much
BTW what I'm trying to do is create a Windows Form program which parses a PDF file which has multiple bookmarks into discrete PDF files for each bookmark/chapter while saving the bookmark in the correct folder with the correct naming convention, the folder and naming convention dependent upon the PDF name and title name of the bookmark/chapter being parsed.
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
using System.IO;
using itextsharp.pdfa;
using iTextSharp.awt;
using iTextSharp.testutils;
using iTextSharp.text;
using iTextSharp.xmp;
using iTextSharp.xtra;
namespace WindowsFormsApplication1
{
    /// <summary>
    /// Main form from the question: lets the user pick a PDF and walks its bookmarks.
    /// </summary>
    public partial class Form1 : Form
    {
        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Shows a file picker restricted to PDF files and stores the chosen path in
        /// GlobalVariables.ImageFileWrapperPath, then mirrors it into the path text box.
        /// </summary>
        private void ChooseImageFileWrapper_Click(object sender, EventArgs e)
        {
            // OpenFileDialog is disposable; release it once the choice has been read.
            using (OpenFileDialog openFileDialog1 = new OpenFileDialog())
            {
                openFileDialog1.InitialDirectory = GlobalVariables.InitialDirectory;
                openFileDialog1.Filter = "Pdf Files|*.pdf";
                openFileDialog1.RestoreDirectory = true;
                openFileDialog1.Title = "Image File Wrapper Chooser";
                if (openFileDialog1.ShowDialog() == DialogResult.OK)
                {
                    try
                    {
                        GlobalVariables.ImageFileWrapperPath = openFileDialog1.FileName;
                    }
                    catch (Exception ex)
                    {
                        MessageBox.Show("Error: Could not read file from disk. Original error: " + ex.Message);
                    }
                }
            }
            ImageFileWrapperPath.Text = GlobalVariables.ImageFileWrapperPath;
        }

        // Designer-generated handler; intentionally empty.
        private void ImageFileWrapperPath_TextChanged(object sender, EventArgs e)
        {
        }

        /// <summary>
        /// Reads the bookmarks of the selected PDF and iterates over their key/value entries.
        /// The per-key handling is still a stub, as in the question.
        /// </summary>
        private void button2_Click(object sender, EventArgs e)
        {
            iTextSharp.text.pdf.PdfReader pdfReader = new iTextSharp.text.pdf.PdfReader(GlobalVariables.ImageFileWrapperPath);
            try
            {
                IList<Dictionary<string, object>> book_mark = iTextSharp.text.pdf.SimpleBookmark.GetBookmark(pdfReader);
                List<ImageFileWrapperBookmarks> IFWBookmarks = new List<ImageFileWrapperBookmarks>();

                // NOTE(review): GetBookmark is expected to return null for a PDF without an
                // outline - guard so the loop does not throw in that case.
                if (book_mark != null)
                {
                    foreach (Dictionary<string, object> bk in book_mark) // bk is a single instance of book_mark
                    {
                        ImageFileWrapperBookmarks.BookmarkNumber = ImageFileWrapperBookmarks.BookmarkNumber + 1;
                        foreach (KeyValuePair<string, object> kvr in bk) // kvr is the key/value in bk
                        {
                            if (kvr.Key == "Kids" || kvr.Key == "kids")
                            {
                                //create recursive program for children
                            }
                            else if (kvr.Key == "Title" || kvr.Key == "title")
                            {
                            }
                            else if (kvr.Key == "Page" || kvr.Key == "page")
                            {
                            }
                        }
                    }
                }
            }
            finally
            {
                // Release the file handle held by the reader.
                pdfReader.Close();
            }
            MessageBox.Show(GlobalVariables.ImageFileWrapperPath);
        }
    }
}

Here's one way to parse a PDF and create a data structure similar to what you describe. First the data structure:
/// <summary>
/// A flattened PDF bookmark: a sequential number, its title and its target page.
/// </summary>
public class BookMark
{
    // Shared counter that hands out sequential numbers; reset with ResetNumber().
    static int _number;

    /// <summary>Creates a bookmark and assigns it the next sequential number (starting at 1).</summary>
    public BookMark() { Number = ++_number; }

    /// <summary>Sequential number assigned at construction time.</summary>
    public int Number { get; private set; }

    /// <summary>Bookmark title as found in the PDF.</summary>
    public string Title { get; set; }

    /// <summary>Raw page string as found in the bookmark's "Page" entry.</summary>
    public string PageNumberString { get; set; }

    /// <summary>Page number parsed from <see cref="PageNumberString"/>.</summary>
    public int PageNumberInteger { get; set; }

    /// <summary>Restarts numbering so the next bookmark created gets number 1.</summary>
    public static void ResetNumber() { _number = 0; }

    /// <summary>
    /// Builds a file name of the form "NNNN-title.pdf" from the bookmark.
    /// </summary>
    /// <returns>
    /// The zero-padded number followed by the title, with whitespace runs replaced by "-"
    /// and every character other than "-" or a word character removed. A null title is
    /// treated as empty.
    /// </returns>
    public string GetFileName()
    {
        // bookmarks title may have illegal filename character(s)
        var fileTitle = Regex.Replace(
            Regex.Replace(Title ?? string.Empty, @"\s+", "-"),
            @"[^-\w]", ""
        );
        return string.Format("{0:D4}-{1}.pdf", Number, fileTitle);
    }
}
A method to create a list of Bookmark (above):
/// <summary>
/// Flattens a bookmark tree into a list, parents before their children.
/// </summary>
/// <param name="bookmarks">
/// Bookmark dictionaries with "Title", "Page" and optional "Kids" entries; may be null.
/// </param>
/// <returns>
/// One <see cref="BookMark"/> per entry whose "Page" value starts with an integer.
/// Entries without a usable page are skipped, but their children are still visited.
/// Never null.
/// </returns>
List<BookMark> ParseBookMarks(IList<Dictionary<string, object>> bookmarks)
{
    var result = new List<BookMark>();
    if (bookmarks == null)
    {
        return result;
    }

    foreach (var bookmark in bookmarks)
    {
        // Use TryGetValue: indexing a missing "Page" or "Title" key would throw
        // KeyNotFoundException for entries that do not carry it.
        object pageValue;
        if (bookmark.TryGetValue("Page", out pageValue) && pageValue != null)
        {
            var stringPage = pageValue.ToString();

            // The page number is the first whitespace-separated token of the page string.
            int page;
            if (Int32.TryParse(stringPage.Split()[0], out page))
            {
                object titleValue;
                bookmark.TryGetValue("Title", out titleValue);
                result.Add(new BookMark() {
                    Title = titleValue == null ? string.Empty : titleValue.ToString(),
                    PageNumberString = stringPage,
                    PageNumberInteger = page
                });
            }
        }

        // recurse
        object kidsValue;
        if (bookmark.TryGetValue("Kids", out kidsValue))
        {
            var kids = kidsValue as IList<Dictionary<string, object>>;
            if (kids != null && kids.Count > 0)
            {
                result.AddRange(ParseBookMarks(kids));
            }
        }
    }
    return result;
}
Call method above like this to dump the results to a text file:
/// <summary>
/// Parses the bookmarks of the PDF at <paramref name="path"/> and writes one formatted
/// line per bookmark (number, title, page string, page number) to the output text file.
/// </summary>
/// <param name="path">Path of the PDF to read.</param>
void DumpResults(string path)
{
    using (var reader = new PdfReader(path))
    {
        // need this call to parse page numbers
        reader.ConsolidateNamedDestinations();

        var report = new StringBuilder();
        foreach (var entry in ParseBookMarks(SimpleBookmark.GetBookmark(reader)))
        {
            report.AppendFormat(
                "{0, -4}{1, -100}{2, -25}{3}",
                entry.Number, entry.Title,
                entry.PageNumberString, entry.PageNumberInteger
            );
            report.AppendLine();
        }
        File.WriteAllText(outputTextFile, report.ToString());
    }
}
The bigger problem is how to extract each Bookmark into a separate file. If every Bookmark starts a new page it's easy:
Iterate over the return value of ParseBookMarks()
Select a page range that begins with the current BookMark.PageNumberInteger, and ends with the next BookMark.PageNumberInteger - 1
Use that page range to create separate files.
Something like this:
/// <summary>
/// Splits the PDF at <paramref name="path"/> into one file per bookmark. Each file holds
/// the pages from the bookmark's page up to the page before the next bookmark (or the
/// last page of the document for the final bookmark).
/// </summary>
/// <param name="path">Path of the PDF to split.</param>
/// <remarks>
/// Demo limitation kept from the original: only the first ten bookmarks are written.
/// Content is cut at page boundaries, so a bookmark that does not start at the top of
/// a page loses its beginning to the previous file.
/// </remarks>
void ProcessPdf(string path)
{
    using (var reader = new PdfReader(path))
    {
        // need this call to parse page numbers
        reader.ConsolidateNamedDestinations();
        var bookmarks = ParseBookMarks(SimpleBookmark.GetBookmark(reader));
        for (int i = 0; i < bookmarks.Count; ++i)
        {
            // DEMO!
            if (i >= 10)
            {
                break;
            }

            int page = bookmarks[i].PageNumberInteger;
            int nextPage = i + 1 < bookmarks.Count
                // if not top of page will be missing content
                ? bookmarks[i + 1].PageNumberInteger - 1
                /* alternative is to potentially add redundant content:
                ? bookmarks[i + 1].PageNumberInteger
                */
                : reader.NumberOfPages;

            // When the next bookmark is on the same (or an earlier) page, the computed end
            // falls before the start and would form a reversed range; clamp to a single page.
            if (nextPage < page)
            {
                nextPage = page;
            }
            string range = string.Format("{0}-{1}", page, nextPage);

            var outputPath = Path.Combine(OUTPUT_DIR, bookmarks[i].GetFileName());
            using (var readerCopy = new PdfReader(reader))
            {
                readerCopy.SelectPages(range);
                using (FileStream stream = new FileStream(outputPath, FileMode.Create))
                using (var document = new Document())
                using (var copy = new PdfCopy(document, stream))
                {
                    document.Open();

                    // Imported page numbers are 1-based.
                    int n = readerCopy.NumberOfPages;
                    for (int j = 1; j <= n; j++)
                    {
                        copy.AddPage(copy.GetImportedPage(readerCopy, j));
                    }
                }
            }
        }
    }
}
The problem is that it's highly unlikely all bookmarks are going to be at the top of every page of the PDF. To see what I mean, experiment with commenting / uncommenting the bookmarks[i + 1].PageNumberInteger lines.

How to implement and do OCR in a C# project?

I've been searching for a while, and all I've seen are requests for OCR libraries. I would like to know how to set up the simplest OCR library, one that is easy to install and use, with detailed instructions for installing it into a C# project.
If possible, I just want to use it like a usual DLL reference...
Example:
using org.pdfbox.pdmodel;
using org.pdfbox.util;
Also a little OCR code example would be nice, such as:
/// <summary>
/// Runs OCR on a bitmap by saving it as a temporary TIFF file and analyzing that file.
/// </summary>
/// <param name="Bmp">Image to recognize.</param>
/// <returns>The text returned by Analyze for the temporary file.</returns>
public string OCRFromBitmap(Bitmap Bmp)
{
    try
    {
        Bmp.Save(temppath, System.Drawing.Imaging.ImageFormat.Tiff);
        return Analyze(temppath);
    }
    finally
    {
        // Remove the temporary file even when saving or analysis fails.
        // File.Delete does not throw when the file does not exist.
        File.Delete(temppath);
    }
}
So please consider that I'm not familiar with OCR projects, and give me an answer as if you were talking to a dummy.
Edit:
I guess people misunderstood my request. I wanted to know how to implement those open source OCR libraries to a C# project and how to use them. The link given as dup is not giving answers that I requested at all.

If anyone is looking into this, I've been trying different options and the following approach yields very good results. The following are the steps to get a working example:
Add .NET Wrapper for tesseract to your project. It can be added via NuGet package Install-Package Tesseract(https://github.com/charlesw/tesseract).
Go to the Downloads section of the official Tesseract project (https://code.google.com/p/tesseract-ocr/ EDIT: It's now located here: https://github.com/tesseract-ocr/langdata).
Download the preferred language data, example: tesseract-ocr-3.02.eng.tar.gz English language data for Tesseract 3.02.
Create tessdata directory in your project and place the language data files in it.
Go to Properties of the newly added files and set them to copy on build.
Add a reference to System.Drawing.
From .NET Wrapper repository, in the Samples directory copy the sample phototest.tif file into your project directory and set it to copy on build.
Create the following two files in your project (just to get started):
Program.cs
using System;
using Tesseract;
using System.Diagnostics;
namespace ConsoleApplication
{
class Program
{
/// <summary>
/// Runs Tesseract OCR on an image and logs the recognized text, the mean confidence
/// and a word-by-word walk over every second text line.
/// </summary>
/// <param name="args">Optional: args[0] is the image path; defaults to ./phototest.tif.</param>
public static void Main(string[] args)
{
    var testImagePath = "./phototest.tif";
    if (args.Length > 0)
    {
        testImagePath = args[0];
    }
    try
    {
        var logger = new FormattedConsoleLogger();

        // Stacked usings dispose in reverse order: page, log scope, image, engine.
        using (var engine = new TesseractEngine(@"./tessdata", "eng", EngineMode.Default))
        using (var img = Pix.LoadFromFile(testImagePath))
        using (logger.Begin("Process image"))
        using (var page = engine.Process(img))
        {
            var text = page.GetText();
            logger.Log("Text: {0}", text);
            logger.Log("Mean confidence: {0}", page.GetMeanConfidence());
            using (var iter = page.GetIterator())
            {
                iter.Begin();
                var i = 1;
                do
                {
                    // Only every second line is expanded word by word.
                    if (i % 2 == 0)
                    {
                        using (logger.Begin("Line {0}", i))
                        {
                            do
                            {
                                using (logger.Begin("Word Iteration"))
                                {
                                    if (iter.IsAtBeginningOf(PageIteratorLevel.Block))
                                    {
                                        logger.Log("New block");
                                    }
                                    if (iter.IsAtBeginningOf(PageIteratorLevel.Para))
                                    {
                                        logger.Log("New paragraph");
                                    }
                                    if (iter.IsAtBeginningOf(PageIteratorLevel.TextLine))
                                    {
                                        logger.Log("New line");
                                    }
                                    // Pass the OCR text as an argument, never as part of the
                                    // format string: recognized text containing '{' or '}'
                                    // would otherwise make String.Format throw.
                                    logger.Log("word: {0}", iter.GetText(PageIteratorLevel.Word));
                                }
                            } while (iter.Next(PageIteratorLevel.TextLine, PageIteratorLevel.Word));
                        }
                    }
                    i++;
                } while (iter.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine));
            }
        }
    }
    catch (Exception e)
    {
        Trace.TraceError(e.ToString());
        Console.WriteLine("Unexpected Error: " + e.Message);
        Console.WriteLine("Details: ");
        Console.WriteLine(e.ToString());
    }
    Console.Write("Press any key to continue . . . ");
    Console.ReadKey(true);
}
/// <summary>
/// Logs, for the iterator's current position, whether it is at the beginning of each
/// layout level and the text found at each level.
/// </summary>
private class ResultPrinter
{
    readonly FormattedConsoleLogger logger;

    public ResultPrinter(FormattedConsoleLogger logger)
    {
        this.logger = logger;
    }

    /// <summary>Prints the five "is beginning of" flags, then the five text values.</summary>
    public void Print(ResultIterator iter)
    {
        // Levels from coarsest to finest; the format arrays below are index-aligned with it.
        var levels = new[]
        {
            PageIteratorLevel.Block,
            PageIteratorLevel.Para,
            PageIteratorLevel.TextLine,
            PageIteratorLevel.Word,
            PageIteratorLevel.Symbol
        };
        var beginningFormats = new[]
        {
            "Is beginning of block: {0}",
            "Is beginning of para: {0}",
            "Is beginning of text line: {0}",
            "Is beginning of word: {0}",
            "Is beginning of symbol: {0}"
        };
        var textFormats = new[]
        {
            "Block text: \"{0}\"",
            "Para text: \"{0}\"",
            "TextLine text: \"{0}\"",
            "Word text: \"{0}\"",
            "Symbol text: \"{0}\""
        };

        for (int index = 0; index < levels.Length; index++)
        {
            logger.Log(beginningFormats[index], iter.IsAtBeginningOf(levels[index]));
        }
        for (int index = 0; index < levels.Length; index++)
        {
            logger.Log(textFormats[index], iter.GetText(levels[index]));
        }
    }
}
}
}
FormattedConsoleLogger.cs
using System;
using System.Collections.Generic;
using System.Text;
using Tesseract;
namespace ConsoleApplication
{
    /// <summary>
    /// Console logger that indents messages according to nested scopes. Each call to
    /// <see cref="Begin(string, object[])"/> opens a scope one level deeper; disposing the
    /// returned object closes it. Scopes must be disposed in reverse order of creation.
    /// </summary>
    public class FormattedConsoleLogger
    {
        const string Tab = " ";

        /// <summary>One indentation level; writes messages prefixed with its indent.</summary>
        private class Scope : DisposableBase
        {
            private int indentLevel;
            private string indent;
            private FormattedConsoleLogger container;

            public Scope(FormattedConsoleLogger container, int indentLevel)
            {
                this.container = container;
                this.indentLevel = indentLevel;

                // Pre-compute the indent string: one Tab per level.
                StringBuilder indent = new StringBuilder();
                for (int i = 0; i < indentLevel; i++)
                {
                    indent.Append(Tab);
                }
                this.indent = indent.ToString();
            }

            /// <summary>
            /// Formats the message and writes it with this scope's indent applied to
            /// every line. "\r\n", "\r" and "\n" are all treated as line breaks.
            /// </summary>
            public void Log(string format, object[] args)
            {
                var message = String.Format(format, args);
                StringBuilder indentedMessage = new StringBuilder(message.Length + indent.Length * 10);
                int i = 0;
                bool isNewLine = true;
                while (i < message.Length)
                {
                    // Look ahead only when a next character exists; the previous bound check
                    // was always true, so a message ending in '\r' read past the end.
                    if (message[i] == '\r' && i + 1 < message.Length && message[i + 1] == '\n')
                    {
                        indentedMessage.AppendLine();
                        isNewLine = true;
                        i += 2;
                    }
                    else if (message[i] == '\r' || message[i] == '\n')
                    {
                        indentedMessage.AppendLine();
                        isNewLine = true;
                        i++;
                    }
                    else
                    {
                        // Indent is emitted lazily, just before the first character of a line.
                        if (isNewLine)
                        {
                            indentedMessage.Append(indent);
                            isNewLine = false;
                        }
                        indentedMessage.Append(message[i]);
                        i++;
                    }
                }
                Console.WriteLine(indentedMessage.ToString());
            }

            /// <summary>Creates a child scope one level deeper (the caller pushes it).</summary>
            public Scope Begin()
            {
                return new Scope(container, indentLevel + 1);
            }

            protected override void Dispose(bool disposing)
            {
                if (disposing)
                {
                    // The scope being disposed must be the innermost one.
                    var scope = container.scopes.Pop();
                    if (scope != this)
                    {
                        throw new InvalidOperationException("Format scope removed out of order.");
                    }
                }
            }
        }

        // Currently open scopes, innermost on top.
        private Stack<Scope> scopes = new Stack<Scope>();

        /// <summary>
        /// Logs the title at the current level and opens a new, deeper scope.
        /// </summary>
        /// <returns>An object that closes the scope when disposed.</returns>
        public IDisposable Begin(string title = "", params object[] args)
        {
            Log(title, args);
            Scope scope;
            if (scopes.Count == 0)
            {
                scope = new Scope(this, 1);
            }
            else
            {
                scope = ActiveScope.Begin();
            }
            scopes.Push(scope);
            return scope;
        }

        /// <summary>
        /// Writes a formatted message at the current indentation (none when no scope is open).
        /// </summary>
        public void Log(string format, params object[] args)
        {
            if (scopes.Count > 0)
            {
                ActiveScope.Log(format, args);
            }
            else
            {
                Console.WriteLine(String.Format(format, args));
            }
        }

        /// <summary>The innermost open scope.</summary>
        /// <exception cref="InvalidOperationException">No scope is open.</exception>
        private Scope ActiveScope
        {
            get
            {
                // Stack.Peek throws on an empty stack and never returns null here,
                // so check the count to produce the intended message.
                if (scopes.Count == 0) throw new InvalidOperationException("No current scope");
                return scopes.Peek();
            }
        }
    }
}

Here's one: (check out http://hongouru.blogspot.ie/2011/09/c-ocr-optical-character-recognition.html or http://www.codeproject.com/Articles/41709/How-To-Use-Office-2007-OCR-Using-C for more info)
using MODI;
// Runs MODI OCR over a TIFF file and prints the recognized text of each page image.
static void Main(string[] args)
{
    DocumentClass myDoc = new DocumentClass();
    // Verbatim string literal: the prefix is '@' ('#' is not valid C#).
    myDoc.Create(@"theDocumentName.tiff"); // we work with the .tiff extension
    myDoc.OCR(MiLANGUAGES.miLANG_ENGLISH, true, true);
    foreach (Image anImage in myDoc.Images)
    {
        Console.WriteLine(anImage.Layout.Text); // write the recognized text to the console
    }
}

I'm using tesseract OCR engine with TessNet2 (a C# wrapper - http://www.pixel-technology.com/freeware/tessnet2/).
Some basic code:
using tessnet2;
...
// Verbatim string literals use the '@' prefix ('#' is not valid C#); they keep
// the backslashes in these Windows paths literal.
Bitmap image = new Bitmap(@"u:\user files\bwalker\2849257.tif");
tessnet2.Tesseract ocr = new tessnet2.Tesseract();
ocr.SetVariable("tessedit_char_whitelist", "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz.,$-/#&=()\"':?"); // Accepted characters
ocr.Init(@"C:\Users\bwalker\Documents\Visual Studio 2010\Projects\tessnetWinForms\tessnetWinForms\bin\Release\", "eng", false); // Directory of your tessdata folder
List<tessnet2.Word> result = ocr.DoOCR(image, System.Drawing.Rectangle.Empty);
// One line per recognized word: confidence, text, then the bounding-box edges.
string Results = "";
foreach (tessnet2.Word word in result)
{
    Results += word.Confidence + ", " + word.Text + ", " + word.Left + ", " + word.Top + ", " + word.Bottom + ", " + word.Right + "\n";
}

Some online APIs work pretty well: OCR.space and Google Cloud Vision. Both of these are free, as long as you make fewer than 1000 OCR requests per month. You can drag & drop an image to do a quick manual test to see how they perform for your images.
I find OCR.space easier to use (no messing around with nuget libraries), but, for my purpose, Google Cloud Vision provided slightly better results than OCR.space.
Google Cloud Vision example:
// Authenticate with a service-account JSON string and open a channel to the Vision endpoint.
GoogleCredential cred = GoogleCredential.FromJson(json);
Channel channel = new Channel(ImageAnnotatorClient.DefaultEndpoint.Host, ImageAnnotatorClient.DefaultEndpoint.Port, cred.ToChannelCredentials());
ImageAnnotatorClient client = ImageAnnotatorClient.Create(channel);
Image image = Image.FromStream(stream);
// FirstOrDefault instead of First: if no annotation comes back, First() would
// throw InvalidOperationException instead of simply printing nothing.
EntityAnnotation googleOcrText = client.DetectText(image).FirstOrDefault();
if (googleOcrText != null)
{
    Console.Write(googleOcrText.Description);
}
OCR.space example:
// The image URL is a query-string value, so it must be percent-encoded; otherwise
// any '&', '?', '#' or space inside it corrupts the request. Interpolating first
// keeps this valid whether imageUri is a string or a Uri.
string encodedImageUrl = Uri.EscapeDataString($"{imageUri}");
string uri = $"https://api.ocr.space/parse/imageurl?apikey=helloworld&url={encodedImageUrl}";
string responseString = WebUtilities.DoGetRequest(uri);
OcrSpaceResult result = JsonConvert.DeserializeObject<OcrSpaceResult>(responseString);
// Only return text when the service reported success and produced something.
if ((!result.IsErroredOnProcessing) && !String.IsNullOrEmpty(result.ParsedResults[0].ParsedText))
{
    return result.ParsedResults[0].ParsedText;
}

A new API is OcrEngine.RecognizeAsync from WinRT/UWP. It can also be used in WinForms:
...
//for AsBuffer
using System.Runtime.InteropServices.WindowsRuntime;
...
// Click handler: loads "1.png", converts it to a BGRA8 SoftwareBitmap and runs the
// WinRT OCR engine on it. async void is used because this is an event handler.
private async void button5_Click(object sender, EventArgs e)
{
    OcrEngine ocrEngine = OcrEngine.TryCreateFromUserProfileLanguages();
    if (ocrEngine == null) return;

    //convert the image to BGRA8 format which is needed by SoftwareBitmap
    //is there a better method for this?
    int width;
    int height;
    byte[] ba;
    // Verbatim literal prefix is '@' ('#' is not valid C#). The Bitmap holds GDI+
    // resources and a file handle, so dispose it as soon as the pixels are copied.
    using (Bitmap img = new Bitmap(@"1.png"))
    {
        width = img.Width;
        height = img.Height;
        ba = new byte[width * height * 4];
        int o = 0;
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
            {
                var p = img.GetPixel(x, y);
                // Byte order per pixel: blue, green, red, alpha.
                ba[o++] = p.B;
                ba[o++] = p.G;
                ba[o++] = p.R;
                ba[o++] = p.A;
            }
        }
    }

    var buffer = ba.AsBuffer();
    var outputBitmap = SoftwareBitmap.CreateCopyFromBuffer(
        buffer,
        BitmapPixelFormat.Bgra8,
        width,
        height);
    var ocrResult = await ocrEngine.RecognizeAsync(outputBitmap);
}
To use the WinRT/UWP API in WinForms, add the NuGet package "Microsoft.Windows.SDK.Contracts" (version 10.0.17134.100 for Win10 1803 SDK tested here) as described here.

Programmatically convert Word (docx) to PDF

Ok before you think "Not another question like this" please read this first.
I have an application (a web application in ASP.NET MVC 3) that generates Word files in DocX using the DocX library.
The application takes a template and fills it in with all the data from a database.
Now I want to create a PDF version of that created docx-file.
I know Aspose.Words is an option, but not for me since I have little budget. Other libraries that I would have to pay for are also out of the question.
I don't have a sharepoint server so Word Automation Services isn't an option either.
So I have 2 options (that I know) left and they both use iTextSharp. Don't know which is better.
I could use the generated XML from the docx file and transform it to a version that is usable by iTextSharp.
I could create the PDF like I create the docx with a template.
Does anybody have any idea how much work each option is, which of the two has better performance, and whether it is even possible to do?
I know that the second option has the downside that when I change a template I have to change it for both versions.
If you have a better solution (free that is), you are welcome to share it.

Another option, even if it needs some work: install OpenOffice on server and, using UNO libraries (including them as assemblies in your app), you can open docx document and save it in PDF directly.
In a few minutes I post an example...
PARTIAL EXAMPLE:
This is a class I created a long time ago and used to convert files to pdf
using unoidl.com.sun.star.lang;
using unoidl.com.sun.star.uno;
using unoidl.com.sun.star.container;
using unoidl.com.sun.star.frame;
using unoidl.com.sun.star.beans;
using unoidl.com.sun.star.view;
using System.Collections.Generic;
using System.IO;
namespace QOpenOffice
{
/// <summary>
/// Kind of empty document that OpenOffice.New can create. Each member is mapped
/// there to a "private:factory/..." URL.
/// </summary>
public enum AppType
{
// Mapped to "private:factory/swriter".
Writer,
// Mapped to "private:factory/scalc".
Calc,
// Mapped to "private:factory/simpress".
Impress,
// Mapped to "private:factory/sdraw".
Draw,
// Mapped to "private:factory/smath".
Math
}
/// <summary>
/// Export formats accepted by OpenOffice.Save. OpenOffice.FilterToString converts
/// each member to the filter-name string passed as the "FilterName" property.
/// </summary>
public enum ExportFilter{
// Converted to "MS Word 97".
Word97,
// Converted to "writer_pdf_Export".
WriterPDF,
// Converted to "calc_pdf_Export".
CalcPDF,
// Converted to "draw_pdf_Export".
DrawPDF,
// Converted to "impress_pdf_Export".
ImpressPDF,
// Converted to "math_pdf_Export".
MathPDF
}
/// <summary>
/// Thin wrapper around an OpenOffice.org instance reached through the UNO bridge.
/// Holds at most one loaded document at a time and can load, create, print,
/// save/export and close it.
/// </summary>
class OpenOffice
{
    // PDF export filters tried by ExportToPdf, in order, until one succeeds.
    private static readonly ExportFilter[] PdfExportOrder = new ExportFilter[]
    {
        ExportFilter.WriterPDF,
        ExportFilter.ImpressPDF,
        ExportFilter.CalcPDF,
        ExportFilter.DrawPDF,
        ExportFilter.MathPDF
    };

    private readonly XComponentContext context;
    private readonly XMultiServiceFactory service;
    private readonly XComponentLoader component;
    // Currently loaded document, or null when none is loaded.
    private XComponent doc;
    private readonly List<string> filters = new List<string>();

    #region Constructors
    /// <summary>
    /// Connects to OpenOffice.org, creates a Desktop loader and caches the names
    /// of all filters reported by the FilterFactory service.
    /// </summary>
    public OpenOffice()
    {
        // This will start a new instance of OpenOffice.org if it is not running,
        // or it will obtain an existing instance if it is already open.
        context = uno.util.Bootstrap.bootstrap();
        // The next step is to create a new OpenOffice.org service manager
        service = (XMultiServiceFactory)context.getServiceManager();
        // Create a new Desktop instance using our service manager
        component = (XComponentLoader)service.createInstance("com.sun.star.frame.Desktop");
        // Getting filters
        XNameContainer filterFactory = (XNameContainer)service.createInstance("com.sun.star.document.FilterFactory");
        foreach (string filter in filterFactory.getElementNames())
        {
            filters.Add(filter);
        }
    }

    ~OpenOffice()
    {
        // A finalizer must never throw: an escaping exception here would bring
        // down the process, so failures while releasing the document are ignored.
        try
        {
            if (doc != null)
            {
                doc.dispose();
            }
        }
        catch
        {
        }
        doc = null;
    }
    #endregion

    #region Private methods
    /// <summary>Converts an <see cref="ExportFilter"/> to its filter-name string ("" if unknown).</summary>
    private string FilterToString(ExportFilter filter)
    {
        switch (filter)
        {
            case ExportFilter.Word97: return "MS Word 97";
            case ExportFilter.WriterPDF: return "writer_pdf_Export";
            case ExportFilter.CalcPDF: return "calc_pdf_Export";
            case ExportFilter.DrawPDF: return "draw_pdf_Export";
            case ExportFilter.ImpressPDF: return "impress_pdf_Export";
            case ExportFilter.MathPDF: return "math_pdf_Export";
        }
        return "";
    }

    /// <summary>Turns a Windows-style path into the "file:///" URL form passed to UNO.</summary>
    private static string ToFileUrl(string filename)
    {
        return "file:///" + filename.Replace('\\', '/');
    }
    #endregion

    #region Public methods
    /// <summary>Loads a document, letting OpenOffice pick the import filter.</summary>
    public bool Load(string filename, bool hidden)
    {
        return Load(filename, hidden, "", "");
    }

    /// <summary>Loads a document using the filter at <paramref name="filter_index"/> in <see cref="Filters"/>.</summary>
    public bool Load(string filename, bool hidden, int filter_index, string filter_options)
    {
        return Load(filename, hidden, filters[filter_index], filter_options);
    }

    /// <summary>
    /// Loads a document into a new frame and keeps it as the current document.
    /// </summary>
    /// <param name="filename">Path of the file to open.</param>
    /// <param name="hidden">Value of the "Hidden" load property.</param>
    /// <param name="filter_name">Import filter name; null or "" means no explicit filter.</param>
    /// <param name="filter_options">Value of the "FilterOptions" load property.</param>
    /// <returns>true if a document was loaded; false otherwise.</returns>
    public bool Load(string filename, bool hidden, string filter_name, string filter_options)
    {
        List<PropertyValue> pv = new List<PropertyValue>();
        pv.Add(new PropertyValue("Hidden", 0, new uno.Any(hidden), PropertyState.DIRECT_VALUE));
        // Treat null like "" so a missing filter name never reaches uno.Any.
        if (!string.IsNullOrEmpty(filter_name))
        {
            pv.Add(new PropertyValue("FilterName", 0, new uno.Any(filter_name), PropertyState.DIRECT_VALUE));
            pv.Add(new PropertyValue("FilterOptions", 0, new uno.Any(filter_options), PropertyState.DIRECT_VALUE));
        }
        try
        {
            doc = component.loadComponentFromURL(ToFileUrl(filename), "_blank", 0, pv.ToArray());
            // loadComponentFromURL can hand back null instead of throwing when
            // loading fails, so success means "we actually hold a document".
            return doc != null;
        }
        catch
        {
            doc = null;
            return false;
        }
    }

    /// <summary>Prints one copy of all pages of the current document.</summary>
    public bool Print()
    {
        return Print(1, "");
    }

    /// <summary>
    /// Prints the current document.
    /// </summary>
    /// <param name="copies">Value of the "CopyCount" print property.</param>
    /// <param name="pages">Value of the "Pages" print property; null or "" omits it.</param>
    /// <returns>true if the print call completed; false if it failed or no document is loaded.</returns>
    public bool Print(int copies, string pages)
    {
        if (doc == null)
        {
            return false;
        }
        List<PropertyValue> pv = new List<PropertyValue>();
        pv.Add(new PropertyValue("CopyCount", 0, new uno.Any(copies), PropertyState.DIRECT_VALUE));
        if (!string.IsNullOrEmpty(pages))
        {
            pv.Add(new PropertyValue("Pages", 0, new uno.Any(pages), PropertyState.DIRECT_VALUE));
        }
        try
        {
            ((XPrintable)doc).print(pv.ToArray());
            return true;
        }
        catch
        {
            // Best effort by design: callers only get a success flag.
            return false;
        }
    }

    /// <summary>Saves the current document using one of the predefined export filters.</summary>
    public bool Save(string filename, ExportFilter filter)
    {
        return Save(filename, FilterToString(filter));
    }

    /// <summary>
    /// Stores the current document to <paramref name="filename"/>, overwriting any
    /// existing file, using the named export filter.
    /// </summary>
    /// <returns>true if the store call completed; false if it failed or no document is loaded.</returns>
    public bool Save(string filename, string filter)
    {
        if (doc == null)
        {
            return false;
        }
        List<PropertyValue> pv = new List<PropertyValue>();
        pv.Add(new PropertyValue("FilterName", 0, new uno.Any(filter), PropertyState.DIRECT_VALUE));
        pv.Add(new PropertyValue("Overwrite", 0, new uno.Any(true), PropertyState.DIRECT_VALUE));
        try
        {
            ((XStorable)doc).storeToURL(ToFileUrl(filename), pv.ToArray());
            return true;
        }
        catch
        {
            // Best effort by design: ExportToPdf relies on false to try the next filter.
            return false;
        }
    }

    /// <summary>
    /// Exports the current document as PDF next to <paramref name="filename"/>
    /// (extension replaced by ".pdf"), trying each application's PDF filter in
    /// turn until one succeeds.
    /// </summary>
    /// <returns>true if any PDF filter succeeded.</returns>
    public bool ExportToPdf(string filename)
    {
        filename = Path.ChangeExtension(filename, ".pdf");
        foreach (ExportFilter filter in PdfExportOrder)
        {
            if (Save(filename, filter))
            {
                return true;
            }
        }
        return false;
    }

    /// <summary>Disposes the current document, if any.</summary>
    public void Close()
    {
        // Safe to call when nothing is loaded (or twice in a row).
        if (doc == null)
        {
            return;
        }
        doc.dispose();
        doc = null;
    }

    /// <summary>
    /// Creates a new empty document of the given application type and keeps it
    /// as the current document.
    /// </summary>
    /// <returns>true if a document was created; false otherwise.</returns>
    public bool New(AppType app, bool hidden)
    {
        try
        {
            string sapp = "private:factory/";
            switch (app)
            {
                case AppType.Writer:
                    sapp += "swriter";
                    break;
                case AppType.Calc:
                    sapp += "scalc";
                    break;
                case AppType.Impress:
                    sapp += "simpress";
                    break;
                case AppType.Draw:
                    sapp += "sdraw";
                    break;
                case AppType.Math:
                    sapp += "smath";
                    break;
            }
            PropertyValue pv = new PropertyValue("Hidden", 0, new uno.Any(hidden), PropertyState.DIRECT_VALUE);
            doc = component.loadComponentFromURL(sapp, "_blank", 0, new PropertyValue[] { pv });
            // As in Load: a null result means nothing was created.
            return doc != null;
        }
        catch
        {
            doc = null;
            return false;
        }
    }
    #endregion

    #region Properties
    /// <summary>Names of the filters reported by OpenOffice when this instance was constructed.</summary>
    public List<string> Filters
    {
        get { return filters; }
    }
    #endregion
}
}

Develop Reference

C# (C-Sharp) is a programming language developed by Microsoft that runs on the .NET Framework.

Extract powerpoint titles with C# - c#

Related

How to choose multiple image from Gallery at one time (tickbox) in Xamarin Android?

The name 'Core' does not exist in the current context

How to create an array and fill from tree node variable

How to implement and do OCR in a C# project?

Programmatically convert Word (docx) to PDF

Categories

Resources