How to process multiple images at once - c#

I'm trying to extract some text from an image using tesseract, and I've noticed that if I divide the image into 9 smaller pieces the system is more accurate. So what I'm trying to accomplish is to process all 9 images at once (in parallel), and this is the way I wanted to do it:
/// <summary>
/// Cuts <paramref name="src"/> into tiles with CutUpImage, runs OCR on every tile inside
/// Parallel.ForEach using ONE shared TesseractEngine, and counts the search-word hits.
/// </summary>
/// <param name="src">Image to split and scan.</param>
/// <returns>
/// found.Count. One entry is added per (tile, word) hit, so a word that shows up in
/// several tiles is counted once per tile.
/// </returns>
private static int GetImageText(Image src)
{
string[] words = { words-to-check };
List<string> found = new();
// Passed to TesseractEngine as its first argument (presumably the tessdata location - confirm).
string path = Environment.CurrentDirectory;
try
{
using (var engine = new TesseractEngine(path, "eng", EngineMode.LstmOnly))
{
Parallel.ForEach(CutUpImage(src), (img) =>
{
using (var ms = new MemoryStream())
{
// The tile is re-encoded as JPEG in memory and handed to the engine as a Pix.
img.Save(ms, System.Drawing.Imaging.ImageFormat.Jpeg);
using (var loadedImg = Pix.LoadFromMemory(ms.ToArray()))
// NOTE(review): every parallel iteration calls Process on the same engine instance,
// so a second page can be requested while another one is still open.
using (var page = engine.Process(loadedImg))
{
var c = page.GetText();
// NOTE(review): found is a plain List<string> that several iterations mutate without a lock.
foreach (var word in words) if (c.Contains(word)) found.Add(word);
}
}
});
return found.Count;
}
}
catch (Exception ex)
{
// NOTE(review): `throw ex;` restarts the stack trace at this line; `throw;` would keep the original one.
throw ex;
}
}
but I'm getting an error (Only one image can be processed at once. Please make sure you dispose of the page once your finished with it.).
So I had to move the new TesseractEngine into the loop like this:
/// <summary>
/// Cuts <paramref name="src"/> into tiles with CutUpImage and OCRs the tiles in parallel,
/// creating a separate TesseractEngine for every tile.
/// </summary>
/// <param name="src">Image to split and scan.</param>
/// <returns>
/// Number of (tile, word) hits: a search word found in several tiles is counted once per tile.
/// </returns>
private static int GetImageText(Image src)
{
    string[] words = { words-to-check };
    List<string> found = new();
    // List<string> is not safe for concurrent Add calls; every write goes through this lock.
    object foundLock = new();
    string path = Environment.CurrentDirectory;

    Parallel.ForEach(CutUpImage(src), (img) =>
    {
        // The tiles are created for this call only, so each one is disposed once it has been processed.
        using (img)
        // One engine per tile: this is what lets the iterations run concurrently,
        // at the price of constructing an engine for every tile.
        using (var engine = new TesseractEngine(path, "eng", EngineMode.LstmOnly))
        using (var ms = new MemoryStream())
        {
            img.Save(ms, System.Drawing.Imaging.ImageFormat.Jpeg);
            using (var loadedImg = Pix.LoadFromMemory(ms.ToArray()))
            using (var page = engine.Process(loadedImg))
            {
                var c = page.GetText();
                foreach (var word in words)
                {
                    if (c.Contains(word))
                    {
                        lock (foundLock)
                        {
                            found.Add(word);
                        }
                    }
                }
            }
        }
    });

    return found.Count;
}
but then it takes a full minute to finish processing all images.
so my question is how do I make the new TesseractEngine work outside the loop, and more generally how do I make this work faster?

ok so the solution to my problem is simple... don't use parallel processing!
I switched the Parallel.ForEach to a traditional foreach (idk why I decided to try parallel processing first...) and it now takes 12 seconds to process them all, this is the code :
/// <summary>
/// Cuts <paramref name="src"/> into tiles with CutUpImage and OCRs them one after another
/// with a single TesseractEngine, counting the search-word hits.
/// </summary>
/// <param name="src">Image to split and scan.</param>
/// <returns>
/// Number of (tile, word) hits: a search word found in several tiles is counted once per tile.
/// </returns>
private static int GetImageText(Image src)
{
    string[] words = { words-to-check };
    List<string> found = new();
    string path = Environment.CurrentDirectory;

    Image[] tiles = CutUpImage(src);
    try
    {
        using (var engine = new TesseractEngine(path, "eng", EngineMode.LstmOnly))
        using (var ms = new MemoryStream())
        {
            foreach (var img in tiles)
            {
                // The buffer is shared by all tiles; empty it before encoding the next one.
                ms.SetLength(0);
                img.Save(ms, System.Drawing.Imaging.ImageFormat.Jpeg);

                using (var loadedImg = Pix.LoadFromMemory(ms.ToArray()))
                using (var page = engine.Process(loadedImg))
                {
                    var c = page.GetText();
                    foreach (var word in words)
                    {
                        if (c.Contains(word))
                        {
                            found.Add(word);
                        }
                    }
                }
            }
        }
    }
    finally
    {
        // The tiles are bitmaps created for this call only; release them even when OCR throws.
        foreach (var tile in tiles)
        {
            tile?.Dispose();
        }
    }

    return found.Count;
}
p.s. this is the CutUpImage code if someone ever wants to use it...
/// <summary>
/// Splits <paramref name="src"/> into a 3x3 grid and returns the nine tiles as new bitmaps.
/// </summary>
/// <param name="src">Image to cut up.</param>
/// <returns>
/// Nine bitmaps in row-major order (index = row * 3 + column). The caller owns them and
/// is responsible for disposing them.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="src"/> is null.</exception>
/// <remarks>
/// The tile size is the source size divided by three and rounded to the nearest integer.
/// When a dimension is not a multiple of three, three tiles span one pixel less or one
/// pixel more than the source, so the last row/column either omits one source pixel line
/// or requests one line beyond the source edge.
/// </remarks>
private static Image[] CutUpImage(Image src)
{
    if (src is null)
    {
        throw new ArgumentNullException(nameof(src));
    }

    int widthThird = (int)((double)src.Width / 3.0 + 0.5);
    int heightThird = (int)((double)src.Height / 3.0 + 0.5);
    var imgarray = new Image[9];

    for (int i = 0; i < 3; i++)
    {
        for (int j = 0; j < 3; j++)
        {
            var index = i * 3 + j;
            imgarray[index] = new Bitmap(widthThird, heightThird);

            // `using` releases the Graphics object even when DrawImage throws.
            using (Graphics g = Graphics.FromImage(imgarray[index]))
            {
                g.DrawImage(
                    src,
                    new Rectangle(0, 0, widthThird, heightThird),
                    new Rectangle(j * widthThird, i * heightThird, widthThird, heightThird),
                    GraphicsUnit.Pixel);
            }
        }
    }

    return imgarray;
}

Related

Double values get converted to string values when the double values has decimal points

I am trying to use CSVHelper library to write data into a MemoryStream and then generate a CSV file using that MemoryStream.
The problem is Double values get converted to weird string values when the double values have decimal points. The expected output and weird output are there at the bottom.
Does anyone know how to overcome this issue? Or is there a mistake in the code below?
/// <summary>
/// Row model for the CSV export: two numeric columns.
/// </summary>
public class Foo
{
    /// <summary>Value written to the first CSV column.</summary>
    public double valOne { get; set; }

    /// <summary>Value written to the second CSV column.</summary>
    public double valTwo { get; set; }
}
/// <summary>
/// CsvHelper mapping for <see cref="Foo"/>: fixes the column order and the header captions.
/// </summary>
public class FooMap : ClassMap<Foo>
{
    public FooMap()
    {
        // Column 0, header "Val One".
        Map(row => row.valOne)
            .Index(0)
            .Name("Val One");

        // Column 1, header "Val Two".
        Map(row => row.valTwo)
            .Index(1)
            .Name("Val Two");
    }
}
// Builds two sample rows, writes them as CSV into a MemoryStream and returns the bytes as a file download.
var records = new List<Foo> {
new Foo{valOne = 3224.12, valTwo = 4122},
new Foo{valOne = 2030.20, valTwo = 5555},
};
// NOTE(review): the writer is configured with CultureInfo.CurrentCulture, so numbers are formatted
// with whatever culture the executing thread has, while the delimiter is fixed to ",".
var config = new CsvConfiguration(CultureInfo.CurrentCulture) { Delimiter = ",", HasHeaderRecord = true };
using (var memoryStream = new MemoryStream())
using (var writer = new StreamWriter(memoryStream))
using (var csv = new CsvWriter(writer, config))
{
csv.Context.RegisterClassMap<FooMap>();
// Header row first, then one row per record.
csv.WriteHeader<Foo>();
csv.NextRecord();
foreach (var record in records)
{
csv.WriteRecord(record);
csv.NextRecord();
}
// Flush so the bytes are in memoryStream before it is copied below.
writer.Flush();
// NOTE(review): the content is decoded as UTF-8 and re-encoded as ASCII before being returned.
var result = Encoding.UTF8.GetString(memoryStream.ToArray());
byte[] bytes = Encoding.ASCII.GetBytes(result);
return new FileContentResult(bytes, "text/csv")
{
FileDownloadName = "Sample_Report_Name"
};
}
Expected Output:
Val One, Val Two
3224.12,4122
2030.20,5555
Weird Output:
Val One, Val Two
"3224,12",4122
"2030,20",5555
The issue is the CurrentCulture of the computer running the code uses commas instead of periods to indicate the decimal point. Using CultureInfo.InvariantCulture instead of CultureInfo.CurrentCulture should fix the formatting issue.
Also, you can simplify your code by using csv.WriteRecords(records).
// Builds two sample rows, writes them as CSV into a MemoryStream and returns the bytes as a file download.
var records = new List<Foo>
{
    new Foo { valOne = 3224.12, valTwo = 4122 },
    new Foo { valOne = 2030.20, valTwo = 5555 },
};

// InvariantCulture formats doubles with '.' as the decimal separator regardless of the
// machine's regional settings; with CurrentCulture a comma-decimal locale yields "3224,12",
// which then has to be quoted because ',' is also the field delimiter.
var config = new CsvConfiguration(CultureInfo.InvariantCulture) { Delimiter = ",", HasHeaderRecord = true };

using (var memoryStream = new MemoryStream())
using (var writer = new StreamWriter(memoryStream))
using (var csv = new CsvWriter(writer, config))
{
    csv.Context.RegisterClassMap<FooMap>();
    // Writes the header (HasHeaderRecord = true) followed by one row per record.
    csv.WriteRecords(records);
    // Flush so the bytes are in memoryStream before it is copied.
    writer.Flush();

    // Return the bytes exactly as written. Decoding them and re-encoding as ASCII
    // would replace every non-ASCII character with '?'.
    byte[] bytes = memoryStream.ToArray();
    return new FileContentResult(bytes, "text/csv")
    {
        FileDownloadName = "Sample_Report_Name"
    };
}

Lucene.Net (4.8) AutoComplete / AutoSuggestion

I'd like to implement a searchable index using Lucene.Net 4.8 that supplies a user with suggestions / autocomplete for single words & phrases.
The index has been created successfully; the suggestions are where I've stalled.
Version 4.8 seems to have introduced a substantial number of breaking changes, and none of the available samples I've found work.
Where I stand
For reference, LuceneVersion is this:
// Lucene compatibility version handed to the analyzers and IndexWriterConfig instances in the samples below.
private readonly LuceneVersion LuceneVersion = LuceneVersion.LUCENE_48;
Solution 1
I've tried this, but can't get past reader.Terms:
/// <summary>
/// Indexes three sample values into an in-memory directory and tries to print up to five
/// terms of the "text" field that start with "co".
/// </summary>
public void TryAutoComplete()
{
var analyzer = new EnglishAnalyzer(LuceneVersion);
var config = new IndexWriterConfig(LuceneVersion, analyzer);
RAMDirectory dir = new RAMDirectory();
using (IndexWriter iw = new IndexWriter(dir, config))
{
// One Document/Field pair is reused: the field value is replaced before each AddDocument call.
Document d = new Document();
TextField f = new TextField("text","",Field.Store.YES);
d.Add(f);
f.SetStringValue("abc");
iw.AddDocument(d);
f.SetStringValue("colorado");
iw.AddDocument(d);
f.SetStringValue("coloring book");
iw.AddDocument(d);
iw.Commit();
using (IndexReader reader = iw.GetReader(false))
{
// NOTE(review): TermEnum / reader.Terms(Term) is the call reported as no longer existing in 4.8.
TermEnum terms = reader.Terms(new Term("text", "co"));
// Counts printed suggestions; the loop stops after five.
int maxSuggestsCpt = 0;
// will print:
// colorado
// coloring book
do
{
Console.WriteLine(terms.Term.Text);
maxSuggestsCpt++;
if (maxSuggestsCpt >= 5)
break;
}
// Continue only while there is a next term and it still begins with the prefix.
while (terms.Next() && terms.Term.Text.StartsWith("co"));
}
}
}
reader.Terms no longer exists. Being new to Lucene, it's unclear how to refactor this.
Solution 2
Trying this, I'm thrown an error:
/// <summary>
/// Indexes one sample document, builds a SpellChecker dictionary from its "text" field,
/// asks for up to five suggestions for "dcument" and writes the "id" of every matching
/// document to the debug output.
/// </summary>
public void TryAutoComplete2()
{
using(var analyzer = new EnglishAnalyzer(LuceneVersion))
{
IndexWriterConfig config = new IndexWriterConfig(LuceneVersion, analyzer);
RAMDirectory dir = new RAMDirectory();
// First use of config: it is handed to this IndexWriter.
using(var iw = new IndexWriter(dir,config))
{
Document d = new Document()
{
new TextField("text", "this is a document with a some words",Field.Store.YES),
new Int32Field("id", 42, Field.Store.YES)
};
iw.AddDocument(d);
iw.Commit();
using (IndexReader reader = iw.GetReader(false))
// The spell-check index lives in its own, separate RAMDirectory.
using (SpellChecker speller = new SpellChecker(new RAMDirectory()))
{
//ERROR HERE!!!
// NOTE(review): the same config instance that was already given to the IndexWriter above is
// passed again here; an IndexWriterConfig looks to be single-use, which would explain
// "The object cannot be set twice!" - confirm by passing a fresh IndexWriterConfig.
speller.IndexDictionary(new LuceneDictionary(reader, "text"), config, false);
string[] suggestions = speller.SuggestSimilar("dcument", 5);
IndexSearcher searcher = new IndexSearcher(reader);
foreach (string suggestion in suggestions)
{
// Look up every document whose "text" field contains the suggested term.
TopDocs docs = searcher.Search(new TermQuery(new Term("text", suggestion)), null, Int32.MaxValue);
foreach (var doc in docs.ScoreDocs)
{
System.Diagnostics.Debug.WriteLine(searcher.Doc(doc.Doc).Get("id"));
}
}
}
}
}
}
When debugging, speller.IndexDictionary(new LuceneDictionary(reader, "text"), config, false); throws a The object cannot be set twice! error, which I can't explain.
Any thoughts are welcome.
Clarification
I'd like to return a list of suggested terms for a given input, not the documents or their full content.
For example, if a document contains "Hello, my name is Clark. I'm from Atlanta," and I submit "Atl," then "Atlanta" should come back as a suggestion.
If I am understanding you correctly you may be over-complicating your index design a bit. If your goal is to use Lucene for auto-complete, you want to create an index of the terms you consider complete. Then simply query the index using a PrefixQuery using a partial word or phrase.
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.En;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Search;
using Lucene.Net.Store;
using Lucene.Net.Util;
using System;
using System.Linq;
namespace LuceneDemoApp
{
    /// <summary>
    /// In-memory auto-complete index: each added term is stored as one document and
    /// looked up again with a <see cref="PrefixQuery"/>.
    /// </summary>
    class LuceneAutoCompleteIndex : IDisposable
    {
        const LuceneVersion Version = LuceneVersion.LUCENE_48;

        readonly RAMDirectory Directory;
        readonly Analyzer Analyzer;

        /// <summary>Creates an empty index.</summary>
        /// <param name="fieldName">Name of the document field the terms are stored in.</param>
        /// <param name="maxResults">Maximum number of matches returned by <see cref="WhereStartsWith"/>.</param>
        /// <exception cref="ArgumentNullException"><paramref name="fieldName"/> is null.</exception>
        public LuceneAutoCompleteIndex(string fieldName, int maxResults)
        {
            FieldName = fieldName ?? throw new ArgumentNullException(nameof(fieldName));
            MaxResults = maxResults;
            Directory = new RAMDirectory();
            Analyzer = new EnglishAnalyzer(Version);
        }

        /// <summary>Name of the field every term is stored in.</summary>
        public string FieldName { get; }

        /// <summary>Maximum number of matches returned by <see cref="WhereStartsWith"/>.</summary>
        public int MaxResults { get; set; }

        /// <summary>Adds a single term to the index.</summary>
        /// <exception cref="ArgumentNullException"><paramref name="term"/> is null.</exception>
        public void Add(string term)
        {
            if (term is null)
            {
                throw new ArgumentNullException(nameof(term));
            }

            using (var writer = new IndexWriter(Directory, CreateWriterConfig()))
            {
                IndexDoc(writer, term);
            }
        }

        /// <summary>Adds every term of <paramref name="terms"/> with one writer.</summary>
        /// <exception cref="ArgumentNullException"><paramref name="terms"/> is null.</exception>
        public void AddRange(string[] terms)
        {
            if (terms is null)
            {
                throw new ArgumentNullException(nameof(terms));
            }

            using (var writer = new IndexWriter(Directory, CreateWriterConfig()))
            {
                foreach (string term in terms)
                {
                    IndexDoc(writer, term);
                }
            }
        }

        /// <summary>
        /// Returns up to <see cref="MaxResults"/> stored terms that start with <paramref name="term"/>.
        /// </summary>
        /// <returns>The matching terms; an empty array when nothing has been indexed yet.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="term"/> is null.</exception>
        public string[] WhereStartsWith(string term)
        {
            if (term is null)
            {
                throw new ArgumentNullException(nameof(term));
            }

            // Opening a reader on a directory that has never been written to fails,
            // so an empty index simply has no suggestions.
            if (!DirectoryReader.IndexExists(Directory))
            {
                return new string[0];
            }

            using (var reader = DirectoryReader.Open(Directory))
            {
                IndexSearcher searcher = new IndexSearcher(reader);
                var query = new PrefixQuery(new Term(FieldName, term));
                TopDocs foundDocs = searcher.Search(query, MaxResults);
                return foundDocs.ScoreDocs
                    .Select(scoreDoc => searcher.Doc(scoreDoc.Doc).Get(FieldName))
                    .ToArray();
            }
        }

        /// <summary>Releases the in-memory directory and the analyzer.</summary>
        public void Dispose()
        {
            Directory.Dispose();
            Analyzer.Dispose();
        }

        // An IndexWriterConfig is bound to the first IndexWriter it is given to; handing the
        // same instance to a second writer fails with "The object cannot be set twice!".
        // Every writer therefore gets its own configuration.
        private IndexWriterConfig CreateWriterConfig()
        {
            return new IndexWriterConfig(Version, Analyzer)
            {
                OpenMode = OpenMode.CREATE_OR_APPEND
            };
        }

        // Stores the term as a single StringField so the stored value is returned as entered.
        private void IndexDoc(IndexWriter writer, string term)
        {
            Document doc = new Document();
            doc.Add(new StringField(FieldName, term, Field.Store.YES));
            writer.AddDocument(doc);
        }
    }
}
Running this:
// Demo: index a few terms and print those that start with "app".
var indexValues = new string[] { "apple fruit", "appricot", "ape", "avacado", "banana", "pear" };

// The index owns a directory and an analyzer, so it is disposed when the demo is done.
using (var index = new LuceneAutoCompleteIndex("fn", 10))
{
    index.AddRange(indexValues);
    var matches = index.WhereStartsWith("app");
    foreach (var match in matches)
    {
        Console.WriteLine(match);
    }
}
You get this:
apple fruit
appricot

amazonS3client.SelectObjectContentAsync - downloading the large jsonline formate file - unwanted line break

I am trying to download a file content from the S3 bucket using the SelectObjectContentAsync method from AWSSDK for C#.
But there are some unwanted line breaks (\n) in the middle of the raw data.
Data Example :
{"Id":1,"Name":"aaa"}, {"Id":2,"N
\name":"bbb"}
My Code :
// Runs an S3 Select query against a gzip-compressed JSON Lines object and appends every
// received records payload to the file at `path`.
var amazonS3Client = new AmazonS3Client(awsAccessKeyId, awsSecretAccessKey, region);
SelectObjectContentRequest selectObjectContentRequest = new SelectObjectContentRequest()
{
Bucket = bucketName,
Key = key,
ExpressionType = ExpressionType.SQL,
Expression = query,
// Input: one JSON document per line, gzip-compressed.
InputSerialization = new InputSerialization()
{
JSON = new JSONInput()
{
JsonType = JsonType.Lines
},
CompressionType = CompressionType.Gzip
},
// Output: JSON records separated by "," instead of a newline.
OutputSerialization = new OutputSerialization()
{
JSON = new JSONOutput()
{
RecordDelimiter = ","
}
}
};
// NOTE(review): .Result blocks the calling thread until the request has completed.
using (var content = amazonS3Client.SelectObjectContentAsync(selectObjectContentRequest).Result.Payload)
{
foreach (var item in content)
{
// Only RecordsEvent items are written; every other event type is ignored.
if (item is RecordsEvent recordsEvent)
{
using (var reader = new StreamReader(recordsEvent.Payload, Encoding.UTF8))
{
// The output file is re-opened in append mode for every event.
using (var file = new StreamWriter(path, true))
{
// NOTE(review): WriteLine appends a line terminator after EVERY event payload. If a payload
// can end in the middle of a record (presumably it can for large results - confirm against
// the S3 Select documentation), that terminator lands inside the record; file.Write would not add it.
file.WriteLine(reader.ReadToEnd());
}
}
}
}
}

iTextSharp AcroForm - multi-field not copying

I have a pdf with buttons that take you out to web links. I used iTextSharp to split these into separate PDFs (1 per page) per outside requirements. ISSUE: Any button that has multiple positions, lost the actions.
QUESTION: Does anyone know how to update these actions? I can open the new file, but I'm not sure how to go about using the PdfStamper to add an AA to this Annotation
So when opening the original file, you could get to the Additional Action by doing this:
// Walks the widget annotations (kids) of form field "14" and prints the URI action stored
// in each kid's Additional Actions (/AA -> /D) dictionary.
var r = new PdfReader(f.FullName);
var positionsOfThisButton = r.AcroFields.GetFieldPositions("14");
var field = r.AcroForm.GetField("14");
// GetPdfObject returns a generic PdfObject; the field is a dictionary, and GetAsArray is a dictionary member.
var targetObject = (PdfDictionary)PdfReader.GetPdfObject(field.Ref);
var kids = targetObject.GetAsArray(PdfName.KIDS);
if (kids != null)
{
    foreach (var k in kids)
    {
        var ko = (PdfDictionary)(k.IsIndirect() ? PdfReader.GetPdfObject(k) : k);
        var aaObj = ko.Get(PdfName.AA);
        //(aaObj is NULL in the new file)
        if (aaObj == null)
        {
            // No Additional Actions on this kid; dereferencing it would throw.
            continue;
        }

        var aa = (PdfDictionary)(aaObj.IsIndirect() ? PdfReader.GetPdfObject(aaObj) : aaObj);
        var dObj = aa.Get(PdfName.D);
        if (dObj == null)
        {
            // The /AA dictionary has no /D entry to read an action from.
            continue;
        }

        var d = (PdfDictionary)(dObj.IsIndirect() ? PdfReader.GetPdfObject(dObj) : dObj);
        Debug.WriteLine("S:" + d.GetAsName(PdfName.S).ToString() );
        //returns S:/Uri
        Debug.WriteLine("URI:" + d.GetAsString(PdfName.URI).ToString() );
        //returns URI:http://www.somesite.com/etc
    }
}
Thanks for any help.
FYI ONLY - The following is how I split the files:
// Splits the PDF at f into one single-page PDF per page and returns the pages' bytes in page order.
List<byte[]> Get(FileInfo f) {
    var pages = new List<byte[]>();

    // Open the file once just to learn how many pages it has.
    var source = new PdfReader(f.FullName);
    int pageCount = source.NumberOfPages;
    source.Close();

    // Pages are processed last-to-first and inserted at the front, which restores page order.
    for (int pageNumber = pageCount; pageNumber > 0; pageNumber--) {
        // A fresh reader is opened for every page.
        source = new PdfReader(f.FullName);
        using (var document = new Document(source.GetPageSizeWithRotation(1)))
        using (var buffer = new MemoryStream()) {
            using (var copier = new PdfCopy(document, buffer)) {
                copier.SetMergeFields();
                copier.PdfVersion = '6';
                document.Open();
                copier.AddDocument(source, new List<int> { pageNumber });
                document.Close();
                copier.Close();
            }
            pages.Insert(0, buffer.ToArray());
        }
        source.Close();
    }

    return pages;
}

retrieve the full list of text in isolated storage Windows Phone 7

I am trying to retrieve multiple images and texts through data binding, but I only manage to retrieve the first text in isolated storage (code below).
Is it possible to retrieve multiple text through data binding into a ListBox?
// Image path taken from the application object.
string imageFileName = App.imagePath;
// Note title copied out of the parsed file content.
string a;
// Most recently loaded bitmap; passed to the Items constructor.
object b;
// Title passed to the Items constructor (copied from `a`).
string h;
// Index used while stepping through the '^'-separated values.
int i;
// Current value picked from the '^'-separated values.
string noteSeparate;
/// <summary>
/// Loaded handler: reads the saved notes file to obtain a note title, then creates one
/// Items entry per file returned by isolated storage and binds the list to MainListBox.
/// </summary>
private void Library_Loaded(object sender, RoutedEventArgs e)
{
if (MainListBox.Items.Count == 0)
{
// Holds the file content split at '^'.
string[] noteSeparated;
//Read the file and display it line by line.
IsolatedStorageFile myStore = IsolatedStorageFile.GetUserStoreForApplication();
//Read the note saved in myFile.txt
StreamReader readFile = new StreamReader(new IsolatedStorageFileStream("ViewFolder\\myFile.txt", FileMode.Open, myStore));
try
{
// NOTE(review): ReadLine returns only the first line of the file.
String fileText = readFile.ReadLine();
// noteSeparated receives the notes read from myFile.txt, split at '^'.
noteSeparated = fileText.Split(new char[] { '^' });
// NOTE(review): the loop steps through every third value, but the unconditional break leaves it
// during the first iteration, so only noteSeparated[0] is ever copied - a single title.
for (i = 0; i < noteSeparated.Length; i = i + 3)
{
noteSeparate = noteSeparated[i];
a = noteSeparate;
break;
}
h = a;
// NOTE(review): Close is only reached when nothing above throws.
readFile.Close();
}
catch (Exception)
{
// Any failure while reading or parsing just shows the "no note" placeholder.
noNoteBlock.Visibility = Visibility.Visible;
}
}
string imageFolder = "imageFolder";
var isoFile = IsolatedStorageFile.GetUserStoreForApplication();
// Check if directory exists
if (!isoFile.DirectoryExists(imageFolder))
{
//isoFile.CreateDirectory(imageFolder);
throw new Exception("Image directory not found");
}
ObservableCollection<Items> LibraryItems = new ObservableCollection<Items>();
// Get files
// NOTE(review): GetFileNames() is called without a search pattern while the path below is built
// under imageFolder - confirm the names returned really are files inside imageFolder.
foreach (string fileName in isoFile.GetFileNames())
{
//string filePath = Path.Combine(imageFolder, imageFileName);
string filePath = Path.Combine(imageFolder, fileName);
using (var imageStream = isoFile.OpenFile(filePath, FileMode.Open, FileAccess.Read))
{
// NOTE(review): imageSource is never used afterwards, and the same stream is handed to SetSource below.
var imageSource = PictureDecoder.DecodeJpeg(imageStream);
BitmapImage bi = new BitmapImage();
ListBoxItem item = new ListBoxItem();
bi.SetSource(imageStream);
item.Content = new Image() { Source = bi, MaxHeight = 100, MaxWidth = 100, Margin = new Thickness(0, 0, 0, 20) };
//MainListBox.Items.Add(item);
b = bi;
}
// Every entry receives the same title h, because h is assigned once above.
LibraryItems.Add(new Items(b, h));
// NOTE(review): ItemsSource is reassigned on every iteration of the loop.
MainListBox.ItemsSource = LibraryItems;
}
}
Can anyone help me retrieving all the text saved in isolated storage. The text in isolated file is in the format of "noteTitle^note^imagePath^noteTitle^note^imagePath^...." and so on.. I am trying to retrieve all the noteTitle only.
Can anyone help me with getting all the noteTitle only?
With Regex:
// Reads the whole notes file and extracts the title of every "title^note^imagePath" triple.
using (var streamReader = new StreamReader(new IsolatedStorageFileStream("ViewFolder\\myFile.txt", FileMode.Open, myStore)))
{
    var text = streamReader.ReadToEnd();
    // Verbatim string (@"...") so the backslashes reach the regex engine unchanged.
    var titles = Regex.Matches(text, @"(?<title>[^\^]+)\^(?<note>[^\^]+)\^(?<imagePath>[^\^]+)")
        .Cast<Match>()
        // .Value yields the matched text; without it the list would hold Group objects.
        .Select(match => match.Groups["title"].Value)
        .ToList();
}
or with Split
// Reads the whole notes file and keeps every third '^'-separated value, starting with the first.
using (var streamReader = new StreamReader(new IsolatedStorageFileStream("ViewFolder\\myFile.txt", FileMode.Open, myStore)))
{
    var text = streamReader.ReadToEnd();
    var titles = text
        .Split('^')
        // Positions 0, 3, 6, ... hold the titles.
        .Where((value, position) => position % 3 == 0)
        .ToList();
}
[EDIT] To bind the list to the ListBox:
/// <summary>
/// Loaded handler: reads the notes file and binds the list of note titles to MainListBox.
/// </summary>
private void Library_Loaded(object sender, RoutedEventArgs e)
{
    // The store has to be obtained here; nothing else in this method declares it.
    using (var myStore = IsolatedStorageFile.GetUserStoreForApplication())
    using (var streamReader = new StreamReader(new IsolatedStorageFileStream("ViewFolder\\myFile.txt", FileMode.Open, myStore)))
    {
        var text = streamReader.ReadToEnd();
        // The file is laid out as title^note^imagePath^..., so positions 0, 3, 6, ... are titles.
        MainListBox.ItemsSource = text
            .Split('^')
            .Where((value, position) => position % 3 == 0)
            .ToList();
    }
}
[EDIT]
Replace this piece of code:
// Reads only the first line of the file.
String fileText = readFile.ReadLine();
// noteSeparated receives the notes read from myFile.txt, split at '^'.
noteSeparated = fileText.Split(new char[] { '^' });
// The unconditional break leaves the loop in its first iteration, so only noteSeparated[0] is copied.
for (i = 0; i < noteSeparated.Length; i = i + 3)
{
noteSeparate = noteSeparated[i];
a = noteSeparate;
break;
}
h = a;
with:
// Read the whole file and keep every third '^'-separated value (positions 0, 3, 6, ...).
var fileText = readFile.ReadToEnd();
var titles = fileText
    .Split('^')
    .Where((value, position) => position % 3 == 0)
    .ToList();
titles will be a list of the noteTitle values.

Categories

Resources