How to implement and do OCR in a C# project? - c#

I ve been searching for a while and all that i ve seen some OCR library requests. I would like to know how to implement the purest, easy to install and use OCR library with detailed info for installation into a C# project.
If posible, I just wanna implement it like a usual dll reference...
Example:
using org.pdfbox.pdmodel;
using org.pdfbox.util;
Also a little OCR code example would be nice, such as:
public string OCRFromBitmap(Bitmap Bmp)
{
Bmp.Save(temppath, System.Drawing.Imaging.ImageFormat.Tiff);
string OcrResult = Analyze(temppath);
File.Delete(temppath);
return OcrResult;
}
So please consider that I'm not familiar to OCR projects and give me an answer like talking to a dummy.
Edit:
I guess people misunderstood my request. I wanted to know how to implement those open source OCR libraries to a C# project and how to use them. The link given as dup is not giving answers that I requested at all.

If anyone is looking into this, I've been trying different options and the following approach yields very good results. The following are the steps to get a working example:
Add .NET Wrapper for tesseract to your project. It can be added via NuGet package Install-Package Tesseract(https://github.com/charlesw/tesseract).
Go to the Downloads section of the official Tesseract project (https://code.google.com/p/tesseract-ocr/ EDIT: It's now located here: https://github.com/tesseract-ocr/langdata).
Download the preferred language data, example: tesseract-ocr-3.02.eng.tar.gz English language data for Tesseract 3.02.
Create tessdata directory in your project and place the language data files in it.
Go to Properties of the newly added files and set them to copy on build.
Add a reference to System.Drawing.
From .NET Wrapper repository, in the Samples directory copy the sample phototest.tif file into your project directory and set it to copy on build.
Create the following two files in your project (just to get started):
Program.cs
using System;
using Tesseract;
using System.Diagnostics;
namespace ConsoleApplication
{
class Program
{
public static void Main(string[] args)
{
var testImagePath = "./phototest.tif";
if (args.Length > 0)
{
testImagePath = args[0];
}
try
{
var logger = new FormattedConsoleLogger();
var resultPrinter = new ResultPrinter(logger);
using (var engine = new TesseractEngine(#"./tessdata", "eng", EngineMode.Default))
{
using (var img = Pix.LoadFromFile(testImagePath))
{
using (logger.Begin("Process image"))
{
var i = 1;
using (var page = engine.Process(img))
{
var text = page.GetText();
logger.Log("Text: {0}", text);
logger.Log("Mean confidence: {0}", page.GetMeanConfidence());
using (var iter = page.GetIterator())
{
iter.Begin();
do
{
if (i % 2 == 0)
{
using (logger.Begin("Line {0}", i))
{
do
{
using (logger.Begin("Word Iteration"))
{
if (iter.IsAtBeginningOf(PageIteratorLevel.Block))
{
logger.Log("New block");
}
if (iter.IsAtBeginningOf(PageIteratorLevel.Para))
{
logger.Log("New paragraph");
}
if (iter.IsAtBeginningOf(PageIteratorLevel.TextLine))
{
logger.Log("New line");
}
logger.Log("word: " + iter.GetText(PageIteratorLevel.Word));
}
} while (iter.Next(PageIteratorLevel.TextLine, PageIteratorLevel.Word));
}
}
i++;
} while (iter.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine));
}
}
}
}
}
}
catch (Exception e)
{
Trace.TraceError(e.ToString());
Console.WriteLine("Unexpected Error: " + e.Message);
Console.WriteLine("Details: ");
Console.WriteLine(e.ToString());
}
Console.Write("Press any key to continue . . . ");
Console.ReadKey(true);
}
private class ResultPrinter
{
readonly FormattedConsoleLogger logger;
public ResultPrinter(FormattedConsoleLogger logger)
{
this.logger = logger;
}
public void Print(ResultIterator iter)
{
logger.Log("Is beginning of block: {0}", iter.IsAtBeginningOf(PageIteratorLevel.Block));
logger.Log("Is beginning of para: {0}", iter.IsAtBeginningOf(PageIteratorLevel.Para));
logger.Log("Is beginning of text line: {0}", iter.IsAtBeginningOf(PageIteratorLevel.TextLine));
logger.Log("Is beginning of word: {0}", iter.IsAtBeginningOf(PageIteratorLevel.Word));
logger.Log("Is beginning of symbol: {0}", iter.IsAtBeginningOf(PageIteratorLevel.Symbol));
logger.Log("Block text: \"{0}\"", iter.GetText(PageIteratorLevel.Block));
logger.Log("Para text: \"{0}\"", iter.GetText(PageIteratorLevel.Para));
logger.Log("TextLine text: \"{0}\"", iter.GetText(PageIteratorLevel.TextLine));
logger.Log("Word text: \"{0}\"", iter.GetText(PageIteratorLevel.Word));
logger.Log("Symbol text: \"{0}\"", iter.GetText(PageIteratorLevel.Symbol));
}
}
}
}
FormattedConsoleLogger.cs
using System;
using System.Collections.Generic;
using System.Text;
using Tesseract;
namespace ConsoleApplication
{
public class FormattedConsoleLogger
{
const string Tab = " ";
private class Scope : DisposableBase
{
private int indentLevel;
private string indent;
private FormattedConsoleLogger container;
public Scope(FormattedConsoleLogger container, int indentLevel)
{
this.container = container;
this.indentLevel = indentLevel;
StringBuilder indent = new StringBuilder();
for (int i = 0; i < indentLevel; i++)
{
indent.Append(Tab);
}
this.indent = indent.ToString();
}
public void Log(string format, object[] args)
{
var message = String.Format(format, args);
StringBuilder indentedMessage = new StringBuilder(message.Length + indent.Length * 10);
int i = 0;
bool isNewLine = true;
while (i < message.Length)
{
if (message.Length > i && message[i] == '\r' && message[i + 1] == '\n')
{
indentedMessage.AppendLine();
isNewLine = true;
i += 2;
}
else if (message[i] == '\r' || message[i] == '\n')
{
indentedMessage.AppendLine();
isNewLine = true;
i++;
}
else
{
if (isNewLine)
{
indentedMessage.Append(indent);
isNewLine = false;
}
indentedMessage.Append(message[i]);
i++;
}
}
Console.WriteLine(indentedMessage.ToString());
}
public Scope Begin()
{
return new Scope(container, indentLevel + 1);
}
protected override void Dispose(bool disposing)
{
if (disposing)
{
var scope = container.scopes.Pop();
if (scope != this)
{
throw new InvalidOperationException("Format scope removed out of order.");
}
}
}
}
private Stack<Scope> scopes = new Stack<Scope>();
public IDisposable Begin(string title = "", params object[] args)
{
Log(title, args);
Scope scope;
if (scopes.Count == 0)
{
scope = new Scope(this, 1);
}
else
{
scope = ActiveScope.Begin();
}
scopes.Push(scope);
return scope;
}
public void Log(string format, params object[] args)
{
if (scopes.Count > 0)
{
ActiveScope.Log(format, args);
}
else
{
Console.WriteLine(String.Format(format, args));
}
}
private Scope ActiveScope
{
get
{
var top = scopes.Peek();
if (top == null) throw new InvalidOperationException("No current scope");
return top;
}
}
}
}

Here's one: (check out http://hongouru.blogspot.ie/2011/09/c-ocr-optical-character-recognition.html or http://www.codeproject.com/Articles/41709/How-To-Use-Office-2007-OCR-Using-C for more info)
using MODI;
static void Main(string[] args)
{
DocumentClass myDoc = new DocumentClass();
myDoc.Create(#"theDocumentName.tiff"); //we work with the .tiff extension
myDoc.OCR(MiLANGUAGES.miLANG_ENGLISH, true, true);
foreach (Image anImage in myDoc.Images)
{
Console.WriteLine(anImage.Layout.Text); //here we cout to the console.
}
}

I'm using tesseract OCR engine with TessNet2 (a C# wrapper - http://www.pixel-technology.com/freeware/tessnet2/).
Some basic code:
using tessnet2;
...
Bitmap image = new Bitmap(#"u:\user files\bwalker\2849257.tif");
tessnet2.Tesseract ocr = new tessnet2.Tesseract();
ocr.SetVariable("tessedit_char_whitelist", "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz.,$-/#&=()\"':?"); // Accepted characters
ocr.Init(#"C:\Users\bwalker\Documents\Visual Studio 2010\Projects\tessnetWinForms\tessnetWinForms\bin\Release\", "eng", false); // Directory of your tessdata folder
List<tessnet2.Word> result = ocr.DoOCR(image, System.Drawing.Rectangle.Empty);
string Results = "";
foreach (tessnet2.Word word in result)
{
Results += word.Confidence + ", " + word.Text + ", " + word.Left + ", " + word.Top + ", " + word.Bottom + ", " + word.Right + "\n";
}

Some online API's work pretty well: ocr.space and Google Cloud Vision. Both of these are free, as long as you do less than 1000 OCR's per month. You can drag & drop an image to do a quick manual test to see how they perform for your images.
I find OCR.space easier to use (no messing around with nuget libraries), but, for my purpose, Google Cloud Vision provided slightly better results than OCR.space.
Google Cloud Vision example:
GoogleCredential cred = GoogleCredential.FromJson(json);
Channel channel = new Channel(ImageAnnotatorClient.DefaultEndpoint.Host, ImageAnnotatorClient.DefaultEndpoint.Port, cred.ToChannelCredentials());
ImageAnnotatorClient client = ImageAnnotatorClient.Create(channel);
Image image = Image.FromStream(stream);
EntityAnnotation googleOcrText = client.DetectText(image).First();
Console.Write(googleOcrText.Description);
OCR.space example:
string uri = $"https://api.ocr.space/parse/imageurl?apikey=helloworld&url={imageUri}";
string responseString = WebUtilities.DoGetRequest(uri);
OcrSpaceResult result = JsonConvert.DeserializeObject<OcrSpaceResult>(responseString);
if ((!result.IsErroredOnProcessing) && !String.IsNullOrEmpty(result.ParsedResults[0].ParsedText))
return result.ParsedResults[0].ParsedText;

A new API is OcrEngine.RecognizeAsync from WinRT/UWP. It can also be used in WinForms:
...
//for AsBuffer
using System.Runtime.InteropServices.WindowsRuntime;
...
async private void button5_Click(object sender, EventArgs e)
{
OcrEngine ocrEngine = null;
ocrEngine = OcrEngine.TryCreateFromUserProfileLanguages();
if (ocrEngine == null) return;
//convert the image to BGRA8 format which is needed by SoftwareBitmap
//is there a better method for this?
Bitmap img = new Bitmap(#"1.png");
byte[] ba = new byte[img.Width * img.Height * 4];
int o = 0;
for (int y = 0; y < img.Height; y++)
{
for (int x = 0; x < img.Width; x++)
{
var p = img.GetPixel(x, y);
ba[o++] = p.B;
ba[o++] = p.G;
ba[o++] = p.R;
ba[o++] = p.A;
}
}
var buffer = ba.AsBuffer();
var outputBitmap = SoftwareBitmap.CreateCopyFromBuffer(
buffer,
BitmapPixelFormat.Bgra8,
img.Width,
img.Height);
var ocrResult = await ocrEngine.RecognizeAsync(outputBitmap);
}
To use WinRT/UWP API in WinForms, add Nuget package "Microsoft.Windows.SDK.Contracts" (version 10.0.17134.100 for Win10 1803 SDK tested here) as described here.

Related

Selenium: web application dead and not running launched with Selenium using C#

I am new to automation, I just learned about Selenium for 3 days and built a simple C# console application with Selenium which trying to automate "sell" request on a trading website.
I noticed when I login the trading website with normal Chrome, the site is working as normal. But when I launch it with my C# console app, it seems like the trading website is not functioning as normal, seems dead.
Here I attached with both developer console picture as reference.
Normal Chrome browser:
enter image description here
Launched with C# console app:
enter image description here
Am I missing any part of my code, and cause the launched Chrome is being "locked".
Please help me
below is my code:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using OpenQA.Selenium;
using OpenQA.Selenium.Chrome;
using OpenQA.Selenium.Support.UI;
using WebDriverManager;
using WebDriverManager.DriverConfigs.Impl;
namespace ii.tfxBot
{
class Program
{
static void Main(string[] args)
{
string ExecuteUrl = "https://tex.tfxi.com/#/exchange/tfx_usdt";
Console.WriteLine("test case started ");
//create the reference for the browser
//new DriverManager().SetUpDriver(new ChromeConfig());
var ccOptions = new ChromeOptions();
ccOptions.LeaveBrowserRunning = true;
//ccOptions.AttachToEdgeChrome = true;
//change the path accordingly
//ccOptions.EdgeExecutablePath = "C:/Program Files (x86)/Microsoft/Edge/Application/msedge.exe";
new DriverManager().SetUpDriver(new ChromeConfig());
var driver = new ChromeDriver(#"C:\WebDriver\bin", ccOptions);
// navigate to URL
driver.Navigate().GoToUrl("https://tex.tfxi.com/#/login");
Thread.Sleep(2000);
// identify the Google search text box
//IWebElement ele = driver.FindElement(By.Name("q"));
////enter the value in the google search text box
//ele.SendKeys("javatpoint tutorials");
//Thread.Sleep(2000);
////identify the google search button
//IWebElement ele1 = driver.FindElement(By.Name("btnK"));
//// click on the Google search button
//ele1.Click();
bool isUrl = false;
while (isUrl == false)
{
if (driver.Url == ExecuteUrl)
{
StartExecution(driver);
break;
}
Console.WriteLine("Execute Url not matched... try again in 1 second");
Thread.Sleep(5000);
}
//close the browser
//driver.Close();
Thread.Sleep(8000000);
Console.WriteLine("test case ended ");
}
private static void StartExecution(ChromeDriver driver)
{
var inputElements = driver.FindElements(By.CssSelector(".ivu-input.ivu-input-default"));
int i = 0;
foreach (var inputElement in inputElements)
{
Console.WriteLine("input count " + i.ToString() + " : " + inputElement.GetAttribute("placeholder").ToString());
string plText = inputElement.GetAttribute("placeholder").ToString();
if (plText == "出售數量")
{
Console.WriteLine("found the input number: " + i.ToString());
}
else
{
//inputElement.SendKeys("28000");
}
i++;
}
WebDriverWait wait = new WebDriverWait(driver, TimeSpan.FromMilliseconds(300));
IWebElement divLimit;
bool found = false;
while (found == false)
{
//divLimit = driver.FindElement(By.ClassName("bd bd_limited"));
//divLimit = wait.Until(SeleniumExtras.WaitHelpers.ExpectedConditions.ElementIsVisible(By.CssSelector("bd bd_limited")));
divLimit = wait.Until(SeleniumExtras.WaitHelpers.ExpectedConditions.ElementExists(By.CssSelector(".bd.bd_limited"))); ;
if (divLimit != null)
{
Console.WriteLine("item found, proceed to get selling price!");
GetSellingPrice(divLimit);
break;
}
Console.WriteLine("item NOT found!");
Thread.Sleep(300);
}
int tfxBal = GetTfxBalance(driver);
WaitforElement(driver, By.CssSelector(".mask2"), tfxBal);
}
public static int GetTfxBalance(ChromeDriver driver)
{
int TfxBal = 0;
WebDriverWait wait = new WebDriverWait(driver, TimeSpan.FromMilliseconds(300));
IWebElement divHdLogin;
divHdLogin = wait.Until(SeleniumExtras.WaitHelpers.ExpectedConditions.ElementExists(By.CssSelector(".hd.hd_login")));
//(IWebElement)driver.FindElements(By.CssSelector(".hd.hd_login"));
var targetKeys = divHdLogin.FindElements(By.TagName("span"));
Console.WriteLine("targetKeys 0 : " + targetKeys[0].Text);
Console.WriteLine("targetKeys 1 : " + targetKeys[1].Text);
Console.WriteLine("targetKeys 2 : " + targetKeys[2].Text);
decimal o = Convert.ToDecimal(targetKeys[1].Text);
if (targetKeys[1].Text.Contains("."))
{
var a = targetKeys[1].Text.Split('.');
TfxBal = Convert.ToInt32(a[0]);
}
else
{
TfxBal = Convert.ToInt32(targetKeys[1].Text);
}
return TfxBal;
}
public static void GetSellingPrice(IWebElement divLimit)
{
IWebElement divFormContent = divLimit.FindElement(By.CssSelector(".ivu-form-item-content"));
IWebElement ipSellPrice = divFormContent.FindElement(By.CssSelector(".ivu-input.ivu-input-default"));
Console.WriteLine(ipSellPrice.GetAttribute("value"));
}
public static bool Elexists(By by, WebDriver driver)
{
try
{
driver.FindElement(by);
IWebElement divPopup = driver.FindElement(by);
WebDriverWait wait = new WebDriverWait(driver, TimeSpan.FromMilliseconds(300));
// textLeft countNumber
IWebElement divcountNumber;
divcountNumber = wait.Until(SeleniumExtras.WaitHelpers.ExpectedConditions.ElementExists(By.CssSelector(".textLeft.countNumber")));
//Console.WriteLine("count number: " + divcountNumber.Text);
string popupStyle = divPopup.GetAttribute("style");
//Console.WriteLine("div Popup Style : " + popupStyle);
if (popupStyle == "display: none;")
{
//Console.WriteLine("No Pop up..");
return false;
}
else
{
Console.WriteLine("Pop up display has changed!");
return true;
}
}
catch (NoSuchElementException)
{
return false;
}
}
public static void WaitforElement(WebDriver driver, By by, int tfxBalance = 0)
{
// waitforMin = 2 hours
int waitforMin = 60 * 1000 * 2 * 2 * 2;
for (int i = 0; i < waitforMin; i++)
{
System.Threading.Thread.Sleep(250);
if (Elexists(by, driver))
{
Console.WriteLine("Pop up appears! Performing selling now");
var inputElements = driver.FindElements(By.CssSelector(".ivu-input.ivu-input-default"));
IWebElement SellBtn = driver.FindElement(By.CssSelector(".bg-red.ivu-btn.ivu-btn-default.ivu-btn-long"));
foreach (var inputElement in inputElements)
{
string plText = inputElement.GetAttribute("placeholder").ToString();
if (plText == "出售數量")
{
Console.WriteLine("found the input box");
inputElement.SendKeys("28000");
SellBtn.Click();
}
}
break;
}
Console.Write(".");
}
}
public static void SellTfx()
{
//
}
}
}
I am expecting some solution to fix my code

Updating app to support Android API 33 and OnActivityResult always retuns a resultcode of Result.Cancelled

I'm updating a working app that targeted API 9 (Pie) to API 33 (Tiramisu) and the camera always returns Result.Cancelled to the OnActivityResult code. I'm using Visual Studio 2022 Version 17.3.6 on a Windows 11 pro machine. This is my camera code:
camerabutton.Click += (sender, evt) =>
{
var cameraispresent = checkCameraHardware(this);
if (cameraispresent)
{
try
{
CreateDirectoryForPictures();
MySubjectInfo.Name = MySubjectInfo.Name.Replace(",", "");
MySubjectInfo.Name = MySubjectInfo.Name.Replace(" ", "");
MySubjectInfo.Name = MySubjectInfo.Name.Replace(".", "");
var filename = MySubjectInfo.Name + ".jpg";
Intent intent = new Intent(MediaStore.ActionImageCapture);
App._file = new File(App._dir, String.Format(filename, Guid.NewGuid()));
intent.PutExtra(MediaStore.ExtraOutput, Uri.FromFile(App._file));
StartActivityForResult(intent, TakePictureRequestCode);
}
catch (Exception e)
{
var result = e.Message;
};
}
};
The create directory code, this code was also updated to reflect changes in API 33, however I can't find the folder on my test device via file explorer where the photos should be stored yet App._dir.Exists() returns true saying it's there:
private void CreateDirectoryForPictures()
{
int version = (int)Android.OS.Build.VERSION.SdkInt;
var root = "";
if (Android.OS.Environment.IsExternalStorageEmulated)
{
root = Android.OS.Environment.ExternalStorageDirectory.ToString();
}
else
{
root = System.Environment.GetFolderPath(System.Environment.SpecialFolder.MyPictures);
}
if (version >= Convert.ToInt32(BuildVersionCodes.Q))
{
App._dir = new File(root + "/PhotoManager");
}
else
{
App._dir = new File(
Environment.GetExternalStoragePublicDirectory(
Environment.DirectoryPictures), "PhotoManager");
}
if (!App._dir.Exists())
{
App._dir.Mkdirs();
}
}
This is the app class where I store the data:
public static class App
{
public static File _file;
public static File _dir;
public static Bitmap bitmap;
}
This is the OnActivityResult1 code, I left my API 9 code in there commented out so you can see where I was and where I'm going. With API 9 I needed to resize the picture to scale it down to 160 X 100 as these photos are used for ID cards, I commented that code out until I can figure out why the camera intent is returning null:
protected override void OnActivityResult(int requestCode, Result resultCode, Intent data)
{
base.OnActivityResult(requestCode, resultCode, data);
if (requestCode == TakePictureRequestCode && resultCode == Result.Ok)
{
//Intent mediaScanIntent = new Intent(Intent.ActionMediaScannerScanFile);
//Uri contentUri = Uri.FromFile(App._file);
//mediaScanIntent.SetData(contentUri);
//SendBroadcast(mediaScanIntent);
Bitmap photo = (Bitmap)data.GetByteArrayExtra("data");
App.bitmap = photo;
//int height = 160; //Resources.DisplayMetrics.HeightPixels;
//int width = 100; //MyImageView.Height;
//App.bitmap = App._file.Path.LoadAndResizeBitmap(width, height);
if (App.bitmap != null)
{
MyImageView.SetImageBitmap(App.bitmap);
Bitmap bitmap = App.bitmap;
var filePath = App._file.AbsolutePath;
var stream = new System.IO.FileStream(filePath, System.IO.FileMode.Create);
bitmap.Compress(Bitmap.CompressFormat.Jpeg, 100, stream);
stream.Close();
bitmap = null;
App.bitmap = null;
PictureTaken = true;
}
// Dispose of the Java side bitmap.
GC.Collect();
}
}
So what am I doing wrong?
You cannot use Uri.FromFile anymore since Android 7.
Use FileProvider to serve a writable uri for the camera app.
Example: EnsureBluetoothEnabled called from a permission request
Setup
activityResultCallback = new ActivityResultCallback();
activityResultCallback.OnActivityResultCalled += ActivityResultCallback_ActivityResultCalled;
activityResultLauncher = RegisterForActivityResult(new ActivityResultContracts.StartActivityForResult(), activityResultCallback);
private void EnsureBluetoothEnabled()
{
if ((bluetoothAdapter != null) && (!bluetoothAdapter.IsEnabled))
activityResultLauncher.Launch(new Intent(BluetoothAdapter.ActionRequestEnable));
}
Handler:
private void ActivityResultCallback_ActivityResultCalled(object sender, ActivityResult result)
{
if (result.ResultCode == (int)Result.Ok)
{
if (bluetoothAdapter.State == State.On)
ShowBluetoothConfirmationDialog(true);
}
else if (result.ResultCode == (int)Result.Canceled)
ShowBluetoothConfirmationDialog(false);
}
ActivityResultCallback class
public class ActivityResultCallback : Java.Lang.Object, IActivityResultCallback
{
public EventHandler<ActivityResult> OnActivityResultCalled;
public void OnActivityResult(Java.Lang.Object result)
{
ActivityResult activityResult = result as ActivityResult;
OnActivityResultCalled?.Invoke(this, activityResult);
}
}

Problem with multithreading and File options c#

my goal is to create a output file which parse all url in result.
My problem :
I use Paralelism and he can't be associated with File because it's secured,
i try to put a lock, and i think i don't put it correctly
message error is :
The process cannot access because it is being used by another process the file
What i want :
result is saved in a txt file
Code :
//VARIABLE
List<string> dorklist = File.ReadAllLines("dorks.txt").ToList();
int numdork = File.ReadAllLines("dorks.txt").Length;
int totalurls = File.ReadAllLines("Queue.txt").Length;
int chekeddork = 0;
int errors = 0;
File.Create("Queue.txt");
//TITLE
Task.Factory.StartNew(() =>
{
while (true) {
Console.Title = ("PARSER | " + chekeddork + "/" + numdork + " URLS : " + totalurls);
Thread.Sleep(100);
}
});
//BOUCLE
Parallel.ForEach(dorklist, new ParallelOptions { MaxDegreeOfParallelism = 100 }, dorklist => {
try
{
using (HttpRequest req = new HttpRequest())
{
//HEADERS
req.AddHeader(HttpHeader.UserAgent, "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0");
//REQUESTS
var content = req.Get("https://www.google.com/search?q="+dorklist);
if (Convert.ToString(content.StatusCode) == "OK")
{
chekeddork++;
Console.WriteLine("Dork used : " + dorklist);
HtmlDocument doc = new HtmlDocument();
doc.LoadHtml(content.ToString());
var links = doc.DocumentNode.SelectNodes("//div[#class='r']//a");
foreach (var link in links)
{
var href = link.Attributes["href"];
var result = href.Value.ToString();
using (StreamWriter streamaze = new StreamWriter("Queue.txt", true))
{
streamaze.WriteLine(results);
streamaze.Close();
}
}
}
else
{ Console.WriteLine("Bad");
errors++;
}
}
}
catch (HttpException)
{
Console.WriteLine("Banned");
errors++;
}
});
}
}
}
AND the problem is here : `
using (StreamWriter streamaze = new StreamWriter("Queue.txt", true))
{
streamaze.WriteLine(results);
streamaze.Close();
Because there is Parallel.ForEach and the maxdegree
Thanks to all help me
you cannot be writing to one file from several threads at once via separate opens.
Open separate files per thread and merge afterwards.
Or open the file once before the parallel foreach and , under lock, write to the one open file.
Or put that using inside a lock
lock(dorkList) // or create a lock object
{
using (StreamWriter streamaze = new StreamWriter("Queue.txt", true))
{
streamaze.WriteLine(results);
streamaze.Close();
}
}
or use a specifc lock objecct like this
int chekeddork = 0;
int errors = 0;
File.Create("Queue.txt");
object locker = new Object();
....
lock(locker) // or create a lock object
{
using (StreamWriter streamaze = new StreamWriter("Queue.txt", true))
{
streamaze.WriteLine(results);
streamaze.Close();
}
}

OpenXml Excel: throw error in any word after mail address

I read Excel files using OpenXml. all work fine but if the spreadsheet contains one cell that has an address mail and after it a space and another word, such as:
abc#abc.com abc
It throws an exception immediately at the opening of the spreadsheet:
var _doc = SpreadsheetDocument.Open(_filePath, false);
exception:
DocumentFormat.OpenXml.Packaging.OpenXmlPackageException
Additional information:
Invalid Hyperlink: Malformed URI is embedded as a
hyperlink in the document.
There is an open issue on the OpenXml forum related to this problem: Malformed Hyperlink causes exception
In the post they talk about encountering this issue with a malformed "mailto:" hyperlink within a Word document.
They propose a work-around here: Workaround for malformed hyperlink exception
The workaround is essentially a small console application which locates the invalid URL and replaces it with a hard-coded value; here is the code snippet from their sample that does the replacement; you could augment this code to attempt to correct the passed brokenUri:
private static Uri FixUri(string brokenUri)
{
return new Uri("http://broken-link/");
}
The problem I had was actually with an Excel document (like you) and it had to do with a malformed http URL; I was pleasantly surprised to find that their code worked just fine with my Excel file.
Here is the entire work-around source code, just in case one of these links goes away in the future:
void Main(string[] args)
{
var fileName = #"C:\temp\corrupt.xlsx";
var newFileName = #"c:\temp\Fixed.xlsx";
var newFileInfo = new FileInfo(newFileName);
if (newFileInfo.Exists)
newFileInfo.Delete();
File.Copy(fileName, newFileName);
WordprocessingDocument wDoc;
try
{
using (wDoc = WordprocessingDocument.Open(newFileName, true))
{
ProcessDocument(wDoc);
}
}
catch (OpenXmlPackageException e)
{
e.Dump();
if (e.ToString().Contains("The specified package is not valid."))
{
using (FileStream fs = new FileStream(newFileName, FileMode.OpenOrCreate, FileAccess.ReadWrite))
{
UriFixer.FixInvalidUri(fs, brokenUri => FixUri(brokenUri));
}
}
}
}
private static Uri FixUri(string brokenUri)
{
brokenUri.Dump();
return new Uri("http://broken-link/");
}
private static void ProcessDocument(WordprocessingDocument wDoc)
{
var elementCount = wDoc.MainDocumentPart.Document.Descendants().Count();
Console.WriteLine(elementCount);
}
}
public static class UriFixer
{
public static void FixInvalidUri(Stream fs, Func<string, Uri> invalidUriHandler)
{
XNamespace relNs = "http://schemas.openxmlformats.org/package/2006/relationships";
using (ZipArchive za = new ZipArchive(fs, ZipArchiveMode.Update))
{
foreach (var entry in za.Entries.ToList())
{
if (!entry.Name.EndsWith(".rels"))
continue;
bool replaceEntry = false;
XDocument entryXDoc = null;
using (var entryStream = entry.Open())
{
try
{
entryXDoc = XDocument.Load(entryStream);
if (entryXDoc.Root != null && entryXDoc.Root.Name.Namespace == relNs)
{
var urisToCheck = entryXDoc
.Descendants(relNs + "Relationship")
.Where(r => r.Attribute("TargetMode") != null && (string)r.Attribute("TargetMode") == "External");
foreach (var rel in urisToCheck)
{
var target = (string)rel.Attribute("Target");
if (target != null)
{
try
{
Uri uri = new Uri(target);
}
catch (UriFormatException)
{
Uri newUri = invalidUriHandler(target);
rel.Attribute("Target").Value = newUri.ToString();
replaceEntry = true;
}
}
}
}
}
catch (XmlException)
{
continue;
}
}
if (replaceEntry)
{
var fullName = entry.FullName;
entry.Delete();
var newEntry = za.CreateEntry(fullName);
using (StreamWriter writer = new StreamWriter(newEntry.Open()))
using (XmlWriter xmlWriter = XmlWriter.Create(writer))
{
entryXDoc.WriteTo(xmlWriter);
}
}
}
}
}
The fix by #RMD works great. I've been using it for years. But there is a new fix.
You can see the fix here in the changelog for issue #793
Upgrade OpenXML to 2.12.0.
Right click solution and select Manage NuGet Packages.
Implement the fix
It is helpful to have a unit test. Create an excel file with a bad email address like test#gmail,com. (Note the comma instead of the dot).
Make sure the stream you open and the call to SpreadsheetDocument.Open allows Read AND Write.
You need to implement a RelationshipErrorHandlerFactory and use it in the options when you open. Here is the code I used:
public class UriRelationshipErrorHandler : RelationshipErrorHandler
{
public override string Rewrite(Uri partUri, string id, string uri)
{
return "https://broken-link";
}
}
Then you need to use it when you open the document like this:
var openSettings = new OpenSettings
{
RelationshipErrorHandlerFactory = package =>
{
return new UriRelationshipErrorHandler();
}
};
using var document = SpreadsheetDocument.Open(stream, true, openSettings);
One of the nice things about this solution is that it does not require you to create a temporary "fixed" version of your file and it is far less code.
Unfortunately solution where you have to open file as zip and replace broken hyperlink would not help me.
I just was wondering how it is posible that it works fine when your target framework is 4.0 even if your only installed .Net Framework has version 4.7.2.
I have found out that there is private static field inside System.UriParser that selects version of URI's RFC specification. So it is possible to set it to V2 as it is set for .net 4.0 and lower versions of .Net Framework. Only problem that it is private static readonly.
Maybe someone will want to set it globally for whole application. But I wrote UriQuirksVersionPatcher that will update this version and restore it back in Dispose method. It is obviously not thread-safe but it is acceptable for my purpose.
using System;
using System.Diagnostics;
using System.Reflection;
namespace BarCap.RiskServices.RateSubmissions.Utility
{
#if (NET20 || NET35 || NET40)
public class UriQuirksVersionPatcher : IDisposable
{
public void Dispose()
{
}
}
#else
public class UriQuirksVersionPatcher : IDisposable
{
private const string _quirksVersionFieldName = "s_QuirksVersion"; //See Source\ndp\fx\src\net\System\_UriSyntax.cs in NexFX sources
private const string _uriQuirksVersionEnumName = "UriQuirksVersion";
/// <code>
/// private enum UriQuirksVersion
/// {
/// V1 = 1, // RFC 1738 - Not supported
/// V2 = 2, // RFC 2396
/// V3 = 3, // RFC 3986, 3987
/// }
/// </code>
private const string _oldQuirksVersion = "V2";
private static readonly Lazy<FieldInfo> _targetFieldInfo;
private static readonly Lazy<int?> _patchValue;
private readonly int _oldValue;
private readonly bool _isEnabled;
static UriQuirksVersionPatcher()
{
var targetType = typeof(UriParser);
_targetFieldInfo = new Lazy<FieldInfo>(() => targetType.GetField(_quirksVersionFieldName, BindingFlags.Static | BindingFlags.NonPublic));
_patchValue = new Lazy<int?>(() => GetUriQuirksVersion(targetType));
}
public UriQuirksVersionPatcher()
{
int? patchValue = _patchValue.Value;
_isEnabled = patchValue.HasValue;
if (!_isEnabled) //Disabled if it failed to get enum value
{
return;
}
int originalValue = QuirksVersion;
_isEnabled = originalValue != patchValue;
if (!_isEnabled) //Disabled if value is proper
{
return;
}
_oldValue = originalValue;
QuirksVersion = patchValue.Value;
}
private int QuirksVersion
{
get
{
return (int)_targetFieldInfo.Value.GetValue(null);
}
set
{
_targetFieldInfo.Value.SetValue(null, value);
}
}
private static int? GetUriQuirksVersion(Type targetType)
{
int? result = null;
try
{
result = (int)targetType.GetNestedType(_uriQuirksVersionEnumName, BindingFlags.Static | BindingFlags.NonPublic)
.GetField(_oldQuirksVersion, BindingFlags.Static | BindingFlags.Public)
.GetValue(null);
}
catch
{
#if DEBUG
Debug.WriteLine("ERROR: Failed to find UriQuirksVersion.V2 enum member.");
throw;
#endif
}
return result;
}
public void Dispose()
{
if (_isEnabled)
{
QuirksVersion = _oldValue;
}
}
}
#endif
}
Usage:
using(new UriQuirksVersionPatcher())
{
using(var document = SpreadsheetDocument.Open(fullPath, false))
{
//.....
}
}
P.S. Later I found that someone already implemented this pathcher: https://github.com/google/google-api-dotnet-client/blob/master/Src/Support/Google.Apis.Core/Util/UriPatcher.cs
I haven't use OpenXml but if there's no specific reason for using it then I highly recommend LinqToExcel from LinqToExcel. Example of code is here:
var sheet = new ExcelQueryFactory("filePath");
var allRows = from r in sheet.Worksheet() select r;
foreach (var r in allRows) {
var cella = r["Header"].ToString();
}

Extract powerpoint titles with C#

I have powerponint 97-2003 files(.ppt extension) and I need to extract slide titles programatically using C#.
I have tried using Microsoft.Office.Interop but without success.
I have search with google and as a maximum I have found how to obtain reference to PowerPoint.Slide:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Microsoft.Office.Core;
using PowerPoint = Microsoft.Office.Interop.PowerPoint;
namespace Tester
{
class Program
{
static void Main(string[] args)
{
Microsoft.Office.Interop.PowerPoint.Application presentationApp = new Microsoft.Office.Interop.PowerPoint.Application();
try
{
string pptPath = #"D:\somefile.ppt";
TestReadingTitles(presentationApp, pptPath);
}
finally
{
presentationApp.Quit();
}
}
private static void TestReadingTitles(Microsoft.Office.Interop.PowerPoint.Application presentationApp, string pptPath)
{
presentationApp.Visible = Microsoft.Office.Core.MsoTriState.msoTrue;
Microsoft.Office.Interop.PowerPoint.Presentations presentations = presentationApp.Presentations;
Microsoft.Office.Core.MsoTriState readOnly = Microsoft.Office.Core.MsoTriState.msoTrue;
Microsoft.Office.Core.MsoTriState untitled = Microsoft.Office.Core.MsoTriState.msoTrue;
Microsoft.Office.Core.MsoTriState withWindow = Microsoft.Office.Core.MsoTriState.msoFalse;
Microsoft.Office.Interop.PowerPoint.Presentation presentation = presentations.Open(pptPath, readOnly, untitled, withWindow);
for (int i = 0; i < presentation.Slides.Count; i++)
{
foreach (PowerPoint.Slide slide in presentation.Slides)
{
string slidetitle = ??????????????????;
}
}
}
}
}
You can extract the titles without looping through the shapes.
private static string ExtractSlideTitlefromSlide(Microsoft.Office.Interop.PowerPoint.Slide slide, string defaultValue)
{
if (slide.Shapes.HasTitle == Office.MsoTriState.msoTrue)
{
if (slide.Shapes.Title.TextFrame.HasText == Office.MsoTriState.msoTrue)
{
return slide.Shapes.Title.TextFrame.TextRange.Text;
}
}
return defaultValue;
}
I have no solution for direct extract slide titles from ppt. This is a workarround - first temproaly convert it into pptx and then extract titles using openxml.
For conversion from ppt to pptx I have used Microsoft Interop which I do not like but I have no better solution.
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Text;
using PowerPoint = Microsoft.Office.Interop.PowerPoint;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Presentation;
using D = DocumentFormat.OpenXml.Drawing;
using Shape = DocumentFormat.OpenXml.Presentation.Shape;
namespace Tester
{
class Program
{
static void Main(string[] args)
{
string pptPath = #"D:\mypresentation.ppt";
ReadTitles(pptPath);
}
private static void ReadTitles(string pptPath)
{
IList<string> slideTitles = GetSlidesTitles(pptPath);
Debug.Print("SLIDES TITLES FOR {0}:", pptPath);
foreach (string slideTitle in slideTitles)
{
Debug.Print("\t {0}", slideTitle);
}
}
private static IList<string> GetSlidesTitles(string pptPath)
{
string pptxPath = SaveAsPptx(pptPath);
IList<string> titles = GetSlideTitles(pptxPath);
try
{
File.Delete(pptxPath);
Debug.Print("Temporary pptx file {0} deleted.", pptxPath);
}
catch (Exception e)
{
Debug.Print("Error deleting file {0}. ERROR: {1}", pptxPath, e.Message);
}
return titles;
}
private static string SaveAsPptx(string pptPathIn)
{
Microsoft.Office.Interop.PowerPoint.Application presentationApp = new Microsoft.Office.Interop.PowerPoint.Application();
string pptxPathOut = null;
try
{
string pptDir = Path.GetDirectoryName(pptPathIn);
string pptFileNameOnly = Path.GetFileNameWithoutExtension(pptPathIn);
pptxPathOut = Path.Combine(pptDir, pptFileNameOnly + ".pptx");
presentationApp.Visible = Microsoft.Office.Core.MsoTriState.msoTrue;
Microsoft.Office.Interop.PowerPoint.Presentations presentations = presentationApp.Presentations;
Microsoft.Office.Core.MsoTriState readOnly = Microsoft.Office.Core.MsoTriState.msoFalse;
Microsoft.Office.Core.MsoTriState untitled = Microsoft.Office.Core.MsoTriState.msoFalse;
Microsoft.Office.Core.MsoTriState withWindow = Microsoft.Office.Core.MsoTriState.msoFalse;
Debug.Print("Opening ppt file {0} ...", pptPathIn);
Microsoft.Office.Interop.PowerPoint.Presentation presentation = presentations.Open(pptPathIn, readOnly, untitled, withWindow);
Debug.Print("Starting creation of pptx from ppt {0}", pptPathIn);
presentation.SaveCopyAs(pptxPathOut, PowerPoint.PpSaveAsFileType.ppSaveAsOpenXMLPresentation, Microsoft.Office.Core.MsoTriState.msoFalse);
Debug.Print("Successfully created pptx {0} from ppt {1}", pptxPathOut, pptPathIn);
}
catch (Exception e)
{
Debug.Print("Error during creating pptx from ppt " + pptPathIn, e);
}
finally
{
presentationApp.Quit();
}
return pptxPathOut;
}
// Get a list of the titles of all the slides in the presentation.
public static IList<string> GetSlideTitles(string presentationFile)
{
// Open the presentation as read-only.
using (PresentationDocument presentationDocument = PresentationDocument.Open(presentationFile, false))
{
return GetSlideTitles(presentationDocument);
}
}
// Get a list of the titles of all the slides in the presentation.
public static IList<string> GetSlideTitles(PresentationDocument presentationDocument)
{
if (presentationDocument == null)
{
throw new ArgumentNullException("presentationDocument");
}
// Get a PresentationPart object from the PresentationDocument object.
PresentationPart presentationPart = presentationDocument.PresentationPart;
if (presentationPart != null &&
presentationPart.Presentation != null)
{
// Get a Presentation object from the PresentationPart object.
Presentation presentation = presentationPart.Presentation;
if (presentation.SlideIdList != null)
{
List<string> titlesList = new List<string>();
// Get the title of each slide in the slide order.
foreach (var slideId in presentation.SlideIdList.Elements<SlideId>())
{
SlidePart slidePart = presentationPart.GetPartById(slideId.RelationshipId) as SlidePart;
// Get the slide title.
string title = GetSlideTitle(slidePart);
// An empty title can also be added.
titlesList.Add(title);
}
return titlesList;
}
}
return null;
}
// Get the title string of the slide.
public static string GetSlideTitle(SlidePart slidePart)
{
if (slidePart == null)
{
throw new ArgumentNullException("slidePart");
}
// Declare a paragraph separator.
string paragraphSeparator = null;
if (slidePart.Slide != null)
{
// Find all the title shapes.
var shapes = from shape in slidePart.Slide.Descendants<Shape>()
where IsTitleShape(shape)
select shape;
StringBuilder paragraphText = new StringBuilder();
foreach (var shape in shapes)
{
// Get the text in each paragraph in this shape.
foreach (var paragraph in shape.TextBody.Descendants<D.Paragraph>())
{
// Add a line break.
paragraphText.Append(paragraphSeparator);
foreach (var text in paragraph.Descendants<D.Text>())
{
paragraphText.Append(text.Text);
}
paragraphSeparator = "\n";
}
}
return paragraphText.ToString();
}
return string.Empty;
}
// Determines whether the shape is a title shape.
private static bool IsTitleShape(Shape shape)
{
var placeholderShape = shape.NonVisualShapeProperties.ApplicationNonVisualDrawingProperties.GetFirstChild<PlaceholderShape>();
if (placeholderShape != null && placeholderShape.Type != null && placeholderShape.Type.HasValue)
{
switch ((PlaceholderValues)placeholderShape.Type)
{
// Any title shape.
case PlaceholderValues.Title:
// A centered title.
case PlaceholderValues.CenteredTitle:
return true;
default:
return false;
}
}
return false;
}
}
}
Finally I have found a way to get out powerpoint presentation titles from .ppt file without converting it to .pptx. Here is a solution:
using System;
using System.Collections.Generic;
using Microsoft.Office.Core;
using PowerPoint = Microsoft.Office.Interop.PowerPoint;
namespace Mintra.Publisher.DocumentConverter.Core.Utils
{
class InteropUtility
{
public static IList<string> GetPresentationTitles(string pptPath)
{
IList<string> result = new List<string>();
var presentationApp = new Microsoft.Office.Interop.PowerPoint.Application();
try
{
presentationApp.Visible = Microsoft.Office.Core.MsoTriState.msoTrue;
Microsoft.Office.Interop.PowerPoint.Presentations presentations = presentationApp.Presentations;
var readOnly = Microsoft.Office.Core.MsoTriState.msoTrue;
var untitled = Microsoft.Office.Core.MsoTriState.msoTrue;
var withWindow = Microsoft.Office.Core.MsoTriState.msoFalse;
Microsoft.Office.Interop.PowerPoint.Presentation presentation = presentations.Open(pptPath, readOnly, untitled, withWindow);
int i = 0;
foreach (PowerPoint.Slide slide in presentation.Slides)
{
string defaultTitle = String.Format("Slide {0}", i);
String shapeTitle = ExtractSlideTitlefromShape(slide, defaultTitle);
result.Add(shapeTitle);
}
}
finally
{
presentationApp.Quit();
}
return result;
}
private static string ExtractSlideTitlefromShape(PowerPoint.Slide slide, string defaultValue)
{
PowerPoint.HeadersFooters headersFooters = slide.HeadersFooters;
PowerPoint.Shapes mastershapes = slide.Master.Shapes;
for (int i = 1; i <= slide.Shapes.Count; i++)
{
PowerPoint.Shape shape = slide.Shapes[i];
bool hasTextFrame = shape.HasTextFrame == MsoTriState.msoTrue;
bool isTypePlaceholder = shape.Type.Equals(MsoShapeType.msoPlaceholder);
bool hasTextInTextFrame = shape.TextFrame.HasText == MsoTriState.msoTrue;
bool isTitleShape = shape.Name.ToLower().Contains("title");
if (isTypePlaceholder && hasTextFrame && hasTextInTextFrame && isTitleShape)
{
return shape.TextFrame.TextRange.Text;
}
}
return defaultValue;
}
}
}
Microsoft.Office.Interop.PowerPoint.Application pptApplication = new Microsoft.Office.Interop.PowerPoint.Application();
Microsoft.Office.Interop.PowerPoint.Slides slides;
Microsoft.Office.Interop.PowerPoint._Slide slide;
// Create the Presentation File
Presentation pptPresentation = pptApplication.Presentations.Add(MsoTriState.msoTrue);
for (int i = 0; i < 2; i++)
{
Microsoft.Office.Interop.PowerPoint.CustomLayout customLayout = pptPresentation.SlideMaster.CustomLayouts[Microsoft.Office.Interop.PowerPoint.PpSlideLayout.ppLayoutChartAndText];
// customLayout.t
// Create new Slide
slides = pptPresentation.Slides;
slide = slides.AddSlide(1, customLayout);
slide.Shapes.Title.Top = 0;
slide.Shapes.Title.TextFrame.TextRange.Text = "Welcome!";
All you need is change Welcome text.

Categories

Resources