Parse a persian pdf file to txt and its images

Parse a persian pdf file to txt and its images - c#

I used this code for convert a English pdf and it work perfectly, but when i use it for Persian file, its output has no Persian character !! how i can parse a Unicode pdf to a text file and a folder contains image files?
using System;
using System.IO;
using iTextSharp.text.pdf;
using System.Text.RegularExpressions;
namespace Spider.Utils
{
/// <summary>
/// Parses a PDF file and extracts the text from it.
/// </summary>
public class PDFParser
{
/// BT = Beginning of a text object operator
/// ET = End of a text object operator
/// Td move to the start of next line
/// 5 Ts = superscript
/// -5 Ts = subscript
#region Fields
#region _numberOfCharsToKeep
/// <summary>
/// The number of characters to keep, when extracting text.
/// </summary>
private static int _numberOfCharsToKeep = 15;
#endregion
#endregion
#region ExtractText
/// <summary>
/// Extracts a text from a PDF file.
/// </summary>
/// <param name="inFileName">the full path to the pdf file.</param>
/// <param name="outFileName">the output file name.</param>
/// <returns>the extracted text</returns>
public bool ExtractText(string inFileName, string outFileName)
{
StreamWriter outFile = null;
try
{
// Create a reader for the given PDF file
PdfReader reader = new PdfReader(inFileName);
//outFile = File.CreateText(outFileName);
outFile = new StreamWriter(outFileName, false, System.Text.Encoding.UTF8);
Console.Write("Processing: ");
int totalLen = 68;
float charUnit = ((float)totalLen) / (float)reader.NumberOfPages;
int totalWritten = 0;
float curUnit = 0;
for (int page = 1; page <= reader.NumberOfPages; page++)
{
outFile.Write(ExtractTextFromPDFBytes(reader.GetPageContent(page)) + " ");
// Write the progress.
if (charUnit >= 1.0f)
{
for (int i = 0; i < (int)charUnit; i++)
{
Console.Write("#");
totalWritten++;
}
}
else
{
curUnit += charUnit;
if (curUnit >= 1.0f)
{
for (int i = 0; i < (int)curUnit; i++)
{
Console.Write("#");
totalWritten++;
}
curUnit = 0;
}
}
}
if (totalWritten < totalLen)
{
for (int i = 0; i < (totalLen - totalWritten); i++)
{
Console.Write("#");
}
}
return true;
}
catch
{
return false;
}
finally
{
if (outFile != null) outFile.Close();
}
}
#endregion
#region ExtractTextFromPDFBytes
/// <summary>
/// This method processes an uncompressed Adobe (text) object
/// and extracts text.
/// </summary>
/// <param name="input">uncompressed</param>
/// <returns></returns>
public string ExtractTextFromPDFBytes(byte[] input)
{
if (input == null || input.Length == 0) return "";
try
{
string resultString = "";
// Flag showing if we are we currently inside a text object
bool inTextObject = false;
// Flag showing if the next character is literal
// e.g. '\\' to get a '\' character or '\(' to get '('
bool nextLiteral = false;
// () Bracket nesting level. Text appears inside ()
int bracketDepth = 0;
// Keep previous chars to get extract numbers etc.:
char[] previousCharacters = new char[_numberOfCharsToKeep];
for (int j = 0; j < _numberOfCharsToKeep; j++) previousCharacters[j] = ' ';
for (int i = 0; i < input.Length; i++)
{
char c = (char)input[i];
if (input[i] == 213)
c = "'".ToCharArray()[0];
if (inTextObject)
{
// Position the text
if (bracketDepth == 0)
{
if (CheckToken(new string[] { "TD", "Td" }, previousCharacters))
{
resultString += "\n\r";
}
else
{
if (CheckToken(new string[] { "'", "T*", "\"" }, previousCharacters))
{
resultString += "\n";
}
else
{
if (CheckToken(new string[] { "Tj" }, previousCharacters))
{
resultString += " ";
}
}
}
}
// End of a text object, also go to a new line.
if (bracketDepth == 0 &&
CheckToken(new string[] { "ET" }, previousCharacters))
{
inTextObject = false;
resultString += " ";
}
else
{
// Start outputting text
if ((c == '(') && (bracketDepth == 0) && (!nextLiteral))
{
bracketDepth = 1;
}
else
{
// Stop outputting text
if ((c == ')') && (bracketDepth == 1) && (!nextLiteral))
{
bracketDepth = 0;
}
else
{
// Just a normal text character:
if (bracketDepth == 1)
{
// Only print out next character no matter what.
// Do not interpret.
if (c == '\\' && !nextLiteral)
{
resultString += c.ToString();
nextLiteral = true;
}
else
{
if (((c >= ' ') && (c <= '~')) ||
((c >= 128) && (c < 255)))
{
resultString += c.ToString();
}
nextLiteral = false;
}
}
}
}
}
}
// Store the recent characters for
// when we have to go back for a checking
for (int j = 0; j < _numberOfCharsToKeep - 1; j++)
{
previousCharacters[j] = previousCharacters[j + 1];
}
previousCharacters[_numberOfCharsToKeep - 1] = c;
// Start of a text object
if (!inTextObject && CheckToken(new string[] { "BT" }, previousCharacters))
{
inTextObject = true;
}
}
return CleanupContent(resultString);
}
catch
{
return "";
}
}
private string CleanupContent(string text)
{
string[] patterns = { #"\\\(", #"\\\)", #"\\226", #"\\222", #"\\223", #"\\224", #"\\340", #"\\342", #"\\344", #"\\300", #"\\302", #"\\304", #"\\351", #"\\350", #"\\352", #"\\353", #"\\311", #"\\310", #"\\312", #"\\313", #"\\362", #"\\364", #"\\366", #"\\322", #"\\324", #"\\326", #"\\354", #"\\356", #"\\357", #"\\314", #"\\316", #"\\317", #"\\347", #"\\307", #"\\371", #"\\373", #"\\374", #"\\331", #"\\333", #"\\334", #"\\256", #"\\231", #"\\253", #"\\273", #"\\251", #"\\221"};
string[] replace = { "(", ")", "-", "'", "\"", "\"", "à", "â", "ä", "À", "Â", "Ä", "é", "è", "ê", "ë", "É", "È", "Ê", "Ë", "ò", "ô", "ö", "Ò", "Ô", "Ö", "ì", "î", "ï", "Ì", "Î", "Ï", "ç", "Ç", "ù", "û", "ü", "Ù", "Û", "Ü", "®", "™", "«", "»", "©", "'" };
for (int i = 0; i < patterns.Length; i++)
{
string regExPattern = patterns[i];
Regex regex = new Regex(regExPattern, RegexOptions.IgnoreCase);
text = regex.Replace(text, replace[i]);
}
return text;
}
#endregion
#region CheckToken
/// <summary>
/// Check if a certain 2 character token just came along (e.g. BT)
/// </summary>
/// <param name="tokens">the searched token</param>
/// <param name="recent">the recent character array</param>
/// <returns></returns>
private bool CheckToken(string[] tokens, char[] recent)
{
foreach (string token in tokens)
{
if ((recent[_numberOfCharsToKeep - 3] == token[0]) &&
(recent[_numberOfCharsToKeep - 2] == token[1]) &&
((recent[_numberOfCharsToKeep - 1] == ' ') ||
(recent[_numberOfCharsToKeep - 1] == 0x0d) ||
(recent[_numberOfCharsToKeep - 1] == 0x0a)) &&
((recent[_numberOfCharsToKeep - 4] == ' ') ||
(recent[_numberOfCharsToKeep - 4] == 0x0d) ||
(recent[_numberOfCharsToKeep - 4] == 0x0a))
)
{
return true;
}
}
return false;
}
#endregion
}
}

Is there a specific reason you're not using the relatively new parsing classes? I don't know the Persian language, but the first Persian PDF I found on Google works, and it's much less code:
PdfReader reader = new PdfReader(pdfPath);
PdfReaderContentParser parser = new PdfReaderContentParser(reader);
StringBuilder sb = new StringBuilder();
for (int i = 1; i <= reader.NumberOfPages; i++) {
ITextExtractionStrategy strategy = parser.ProcessContent(
i, new SimpleTextExtractionStrategy()
);
sb.Append(strategy.GetResultantText());
}
A number of bugs have been fixed recently, so I'm using the latest iTextSharp SVN build. Also your question title includes parsing images, but your code doesn't, so the example above is not extracting any images.

Related

I need to check if a string is a pangram string or not is my code correct?

public static class Inova
{
public static bool IsPangram(string str)
{
int compteur = 26;
for (int i = 0; i <= str.Length; i++)
{
if (('A' <= str[i] && str[i] <= 'Z') || ('a' <= str[i] && str[i] <= 'z'))
{
for (int j = str[i + 1]; j <= str.Length; j++)
{
if (compteur != 0 && str[i] != str[j])
{
compteur = compteur - 1;
}
}
}
if (compteur == 0)
{
return true;
}
else
{
return false;
}
}
}
}

There are multiple things incorrect:
for (int j = str[i + 1]; j <= str.Length; j++)
this does not do what you think, it will convert the next char to an int, you want to loop all letters until end, beginning from the current letter + 1.
The if ... else belong to the end of the method, outside of the loop, otherwise you return false after the first iteration in the for-loop
So you want to know if it's a perfect pangram? First we need to say what a pangram is: a sentence containing every letter of the alphabet. It seems you want to check if it's even a perfect pangram, so every letter should appear exactly once. Here is a method not using any fancy LINQ(which might not be allowed) that supports perfect/imperfect pangrams:
public static class Inova
{
private const string alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
public static bool IsPangram(string str, bool mustBePerfect)
{
HashSet<char> remaingLetters = new HashSet<char>(alphabet);
foreach (char c in str)
{
char letter = char.ToUpperInvariant(c);
if (!alphabet.Contains(letter)) continue;
bool repeatingLetter = !remaingLetters.Remove(letter);
if (mustBePerfect && repeatingLetter)
{
return false; // already removed
}
}
return remaingLetters.Count == 0;
}
}
Usage:
bool isPangram = Inova.IsPangram("abcdefghijklmnopqrstuvwxyZZ", false);
Since z appears twice this method returns false for perfect and true for not perfect.
Demo: https://dotnetfiddle.net/gEXuvG
Side-note: i wanted to keep it simple, if you want you can still improve it. You can return true in the loop: if(!mustBePerfect && remaingLetters.Count == 0) return true.

I would check for existence of each letter in the string, so
public static bool IsPangram(string str) {
str = str.ToLower();
for (int i = 0; i < 26; i++) {
char c = Convert.ToChar(i + 97);
if (!str.Contains(c)) {
return false;
}
}
return true;
}
Console.WriteLine(IsPangram("hello world"));
Console.WriteLine(IsPangram("abcdefghi jkl mno pqrstuvwxyz"));
// output:
False
True

C# Split ignore commas in input string

So I've finished Huffman compression algorithm, but it will work only if a .txt file doesn't contain , / ;
So I want to ignore only these symbols
What I've got:
Dictionary of a character and its code, separated by comma. Each pair separated by ;
/ at the end of a dictionary (because next symbol is number of added zeros)
Example:
c,1;a,00;t,01;/3
Of course, when I type in file something like "I love dogs, cats", I've got problems:) Like this one: System.FormatException: "String must be exactly one character long."
Here's my code:
Dictionary<char, string> forDecoding = new Dictionary<char, string>();
using (sr = new StreamReader(fileName))
{
string line = sr.ReadToEnd();
string[] lines = line.Split('/');
string[] dict = lines[0].Split(';');
string[] item = null;
foreach (var v in lines[0])
{
encoded.Add(v);
}
count = Convert.ToInt32(lines[1]);
for (int i = 0; i < dict.Length - 1; i++)
{
item = dict[i].Split(',');
forDecoding.Add(Convert.ToChar(item[0]), item[1]);
}
}
Is there any way to modify it in order to see that comma inside like another symbol of input information that has to be encoded?

Try the following:
Dictionary<char, string> forDecoding = new Dictionary<char, string>();
...
private void LoadEncodedData(string filename)
{
//remove existing items
forDecoding.Clear();
using (System.IO.StreamReader sr = new System.IO.StreamReader(filename))
{
int fileLength = 0;
char previousChar = char.MinValue;
char secondPreviousChar = char.MinValue;
char dictKey = char.MinValue;
string dictVal = string.Empty;
bool isDictVal = false;
bool isNextCharEOF = false;
bool isNumberOfZeros = false;
int numberOfAddedZeros = 0;
string numberOfZerosStr = string.Empty;
while (!sr.EndOfStream)
{
//read char and move position
char currentChar = (char)sr.Read();
//read next char without moving position
char nextChar = (char)sr.Peek();
//65535 = EOF
if (nextChar == 65535)
{
//set val
isNextCharEOF = true;
}
if (!isNextCharEOF && !isDictVal && nextChar == ',')
{
//set value
dictKey = currentChar;
}
else if (!isDictVal && previousChar == ',' && currentChar != ',')
{
//start saving chars for dictionary value
dictVal = currentChar.ToString();
//set value
isDictVal = true;
}
else if (isDictVal && currentChar == ';')
{
System.Diagnostics.Debug.WriteLine("dictKey: '" + dictKey.ToString() + "' dictVal: '" + dictVal + "'");
//add to dictionary
forDecoding.Add(dictKey, dictVal);
//re-initialize
dictVal = string.Empty;
//set value
isDictVal = false;
}
else if (isDictVal)
{
//append
dictVal += currentChar.ToString();
}
else if (!isDictVal && secondPreviousChar == ';' && previousChar == '/' && currentChar != ',')
{
if (!isNextCharEOF)
{
//set value
isNumberOfZeros = true;
numberOfZerosStr = currentChar.ToString();
}
else
{
//set value
numberOfZerosStr = currentChar.ToString();
numberOfAddedZeros = 0;
Int32.TryParse(numberOfZerosStr, out numberOfAddedZeros);
System.Diagnostics.Debug.WriteLine(" numberOfAddedZeros: " + numberOfAddedZeros + System.Environment.NewLine);
//set value
isNumberOfZeros = false;
numberOfZerosStr = string.Empty;
}
}
else if (isNumberOfZeros && (isNextCharEOF || nextChar != ','))
{
//append
numberOfZerosStr += currentChar;
numberOfAddedZeros = 0;
Int32.TryParse(numberOfZerosStr, out numberOfAddedZeros);
System.Diagnostics.Debug.WriteLine(" numberOfAddedZeros: " + numberOfAddedZeros + System.Environment.NewLine);
//set value
isNumberOfZeros = false;
numberOfZerosStr = string.Empty;
}
else if (isNumberOfZeros)
{
//append
numberOfZerosStr += currentChar;
}
//set value
secondPreviousChar = previousChar;
previousChar = currentChar;
fileLength += 1; //increment
}
}
}
Example data: c,1;/,00;t,01;/3

How to detect any NON UTF8 character in a file in C#?

How can I identify all NON UTF8 characters from a given file?
We need to write it in C# and be able to execute it in a SSIS environment.
After the execution we need to find out and check all the wrong occurrences given eventually their line number into the input file.
Assumptions:
- file is a csv well formatted (in our case),
- new line has CR LF

When you load your file into byte array and then attempt to load it to the string invalid UTF8 characters will be replaced by ? (question marks). Your code should look something like this:
byte[] data = File.ReadAllBytes(pathToYourFile);
string result = Encoding.UTF8.GetString(data);
Next, you can take for example cleaning steps???

After a bit of research, we collected some hints:
Stackoverflow: Determine a string's encoding in C#
utf8check: https://archive.codeplex.com/?p=utf8checker
Daniel Lemire's blog: https://lemire.me/blog/2018/05/09/how-quickly-can-you-check-that-a-string-is-valid-unicode-utf-8/
Here's what we have learned:
we needed to scan byte by byte,
the class from which to start
the algorithm for checking UTF8 (well implemented from point 2)
SO: we needed to improve the version of utf8checker class in order to keep scanning the entire file and not finishing at first wrong occurrence.
After the complete scanning the code produces a log file listing all the NON utf8 occurrences.
The following code is working in our case. It's execute in a SSIS Script Task and it reads the filename from the input parameter.
Maybe could be improved further.
/*
Microsoft SQL Server Integration Services Script Task
Write scripts using Microsoft Visual C# 2008.
The ScriptMain is the entry point class of the script.
*/
using System;
using System.Data;
using Microsoft.SqlServer.Dts.Runtime;
using System.Windows.Forms;
using System.IO;
using System.Text;
using System.Linq;
using System.Collections.Generic;
namespace ST_5c3d8ec1340c4ab9bbb71cb975760e42.csproj
{
[System.AddIn.AddIn("ScriptMain", Version = "1.0", Publisher = "", Description = "")]
public partial class ScriptMain : Microsoft.SqlServer.Dts.Tasks.ScriptTask.VSTARTScriptObjectModelBase
{
public void Main()
{
String fileToCheck, logFileName;
bool OK_UTF8;
IUtf8Checker fileCheckerUtf8 = new Utf8Checker();
List<IErrorUtf8Checker> errorsList;
System.IO.StreamWriter logFile;
try
{
fileToCheck = Dts.Variables["User::InputFile"].Value.ToString();
logFileName = fileToCheck + "_utf8check.log";
if (File.Exists(fileToCheck))
{
OK_UTF8 = fileCheckerUtf8.Check(fileToCheck);
if (OK_UTF8 == false)
{
errorsList = fileCheckerUtf8.GetErrorList();
logFile = new StreamWriter(logFileName);
int i = 0;
foreach (ErrorUtf8Checker e in errorsList)
{
logFile.WriteLine(++i + ") " + e.ToString());
}
logFile.Close();
}
}
//exit always with success. It writes a log file if any warning occurs
Dts.TaskResult = (int)ScriptResults.Success;
}
catch (DecoderFallbackException eUTF)
{
Console.Write(eUTF.ToString());
Dts.TaskResult = (int)ScriptResults.Failure;
}
catch (Exception e)
{
Console.Write(e.ToString());
Dts.TaskResult = (int)ScriptResults.Failure;
}
}
#region VSTA generated code
enum ScriptResults
{
Success = Microsoft.SqlServer.Dts.Runtime.DTSExecResult.Success,
Failure = Microsoft.SqlServer.Dts.Runtime.DTSExecResult.Failure
};
#endregion
/**
* PrintOnSSISConsole
* Used to print a string s into the immediate console of SSIS
*/
public void PrintOnSSISConsole(String s)
{
System.Diagnostics.Debug.WriteLine(s);
}
/// <summary>
/// Interface for checking for utf8.
/// </summary>
public interface IUtf8Checker
{
/// <summary>
/// Check if file is utf8 encoded.
/// </summary>
/// <param name="fileName"></param>
/// <returns>true if utf8 encoded, otherwise false.</returns>
bool Check(string fileName);
/// <summary>
/// Check if stream is utf8 encoded.
/// </summary>
/// <param name="stream"></param>
/// <returns>true if utf8 encoded, otherwise false.</returns>
bool IsUtf8(Stream stream);
/// <summary>
/// Return a list of found errors of type of IErrorUtf8Checker
/// </summary>
/// <returns>List of errors found through the Check metod</returns>
List<IErrorUtf8Checker> GetErrorList();
}
public interface IErrorUtf8Checker
{
}
/// <summary>
/// http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335
///
/// http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-8.html
///
/// http://www.unicode.org/versions/corrigendum1.html
///
/// http://www.ietf.org/rfc/rfc2279.txt
///
/// </summary>
public class Utf8Checker : IUtf8Checker
{
// newLineArray = used to understand the new line sequence
private static byte[] newLineArray = new byte[2] { 13, 10 };
private int line = 1;
private byte[] lineArray = new byte[2] { 0, 0 };
// used to keep trak of number of errors found into the file
private List<IErrorUtf8Checker> errorsList;
public Utf8Checker()
{
this.errorsList = new List<IErrorUtf8Checker>();
}
public int getNumberOfErrors()
{
return errorsList.Count();
}
public bool Check(string fileName)
{
using (BufferedStream fstream = new BufferedStream(File.OpenRead(fileName)))
{
return this.IsUtf8(fstream);
}
}
public int getLine()
{
return line;
}
public List<IErrorUtf8Checker> GetErrorList()
{
return errorsList;
}
/// <summary>
/// Check if stream is utf8 encoded.
/// Notice: stream is read completely in memory!
/// </summary>
/// <param name="stream">Stream to read from.</param>
/// <returns>True if the whole stream is utf8 encoded.</returns>
public bool IsUtf8(Stream stream)
{
int count = 4 * 1024;
byte[] buffer;
int read;
while (true)
{
buffer = new byte[count];
stream.Seek(0, SeekOrigin.Begin);
read = stream.Read(buffer, 0, count);
if (read < count)
{
break;
}
buffer = null;
count *= 2;
}
return IsUtf8(buffer, read);
}
/// <summary>
///
/// </summary>
/// <param name="buffer"></param>
/// <param name="length"></param>
/// <returns></returns>
public bool IsUtf8(byte[] buffer, int length)
{
int position = 0;
int bytes = 0;
bool ret = true;
while (position < length)
{
if (!IsValid(buffer, position, length, ref bytes))
{
ret = false;
errorsList.Add(new ErrorUtf8Checker(getLine(), buffer[position]));
}
position += bytes;
}
return ret;
}
/// <summary>
///
/// </summary>
/// <param name="buffer"></param>
/// <param name="position"></param>
/// <param name="length"></param>
/// <param name="bytes"></param>
/// <returns></returns>
public bool IsValid(byte[] buffer, int position, int length, ref int bytes)
{
if (length > buffer.Length)
{
throw new ArgumentException("Invalid length");
}
if (position > length - 1)
{
bytes = 0;
return true;
}
byte ch = buffer[position];
char ctest = (char)ch; // for debug only
this.detectNewLine(ch);
if (ch <= 0x7F)
{
bytes = 1;
return true;
}
if (ch >= 0xc2 && ch <= 0xdf)
{
if (position >= length - 2)
{
bytes = 0;
return false;
}
if (buffer[position + 1] < 0x80 || buffer[position + 1] > 0xbf)
{
//bytes = 0;
return false;
}
bytes = 2;
return true;
}
if (ch == 0xe0)
{
if (position >= length - 3)
{
//bytes = 0;
return false;
}
if (buffer[position + 1] < 0xa0 || buffer[position + 1] > 0xbf ||
buffer[position + 2] < 0x80 || buffer[position + 2] > 0xbf)
{
//bytes = 0;
return false;
}
bytes = 3;
return true;
}
if (ch >= 0xe1 && ch <= 0xef)
{
if (position >= length - 3)
{
//bytes = 0;
return false;
}
if (buffer[position + 1] < 0x80 || buffer[position + 1] > 0xbf ||
buffer[position + 2] < 0x80 || buffer[position + 2] > 0xbf)
{
//bytes = 0;
return false;
}
bytes = 3;
return true;
}
if (ch == 0xf0)
{
if (position >= length - 4)
{
//bytes = 0;
return false;
}
if (buffer[position + 1] < 0x90 || buffer[position + 1] > 0xbf ||
buffer[position + 2] < 0x80 || buffer[position + 2] > 0xbf ||
buffer[position + 3] < 0x80 || buffer[position + 3] > 0xbf)
{
//bytes = 0;
return false;
}
bytes = 4;
return true;
}
if (ch == 0xf4)
{
if (position >= length - 4)
{
//bytes = 0;
return false;
}
if (buffer[position + 1] < 0x80 || buffer[position + 1] > 0x8f ||
buffer[position + 2] < 0x80 || buffer[position + 2] > 0xbf ||
buffer[position + 3] < 0x80 || buffer[position + 3] > 0xbf)
{
//bytes = 0;
return false;
}
bytes = 4;
return true;
}
if (ch >= 0xf1 && ch <= 0xf3)
{
if (position >= length - 4)
{
//bytes = 0;
return false;
}
if (buffer[position + 1] < 0x80 || buffer[position + 1] > 0xbf ||
buffer[position + 2] < 0x80 || buffer[position + 2] > 0xbf ||
buffer[position + 3] < 0x80 || buffer[position + 3] > 0xbf)
{
//bytes = 0;
return false;
}
bytes = 4;
return true;
}
return false;
}
private void detectNewLine(byte ch)
{
// looking for second char for new line (char 13 feed)
if (this.lineArray[0] == newLineArray[0])
{
if (ch == newLineArray[1])
{
// found new line
this.lineArray[1] = ch;
line++;
// reset work array: lineArray
this.lineArray[1] = 0;
}
// we have to reset work array because CR(13)LF(10) must be in sequence
this.lineArray[0] = 0;
}
else
{
// found first character (char 10 return)
if (ch == newLineArray[0])
{
this.lineArray[0] = ch;
}
}
}
}
public class ErrorUtf8Checker : IErrorUtf8Checker
{
private int line;
private byte ch;
public ErrorUtf8Checker(int line, byte character)
{
this.line = line;
this.ch = character;
}
public ErrorUtf8Checker(int line)
{
this.line = line;
}
public override string ToString()
{
string s;
try
{
if (ch > 0)
{
s = "line: " + line + " code: " + ch + ", char: " + (char)ch;
}
else
{
s = "line: " + line;
}
return s;
}
catch (Exception e)
{
Console.Write(e.ToString());
return base.ToString();
}
}
}
}
}
Given the example:
Hello world test UTF8
err 1: °
text ok line 3
err 2: ò
errs 3: à è § °
end file
the code posted will create a new file containing:
1) line: 2 code: 176, char: °
2) line: 4 code: 242, char: ò
3) line: 5 code: 224, char: à
4) line: 5 code: 232, char: è
5) line: 5 code: 167, char: §
6) line: 5 code: 176, char: °

Function, which deletes all code comments

I need to do a function which deletes all comments from the text(code). My code is almost finished, but it doesn't work if comment starts in the first line of the file. It says index out of bounds, I tried changing for loops to start from 1 and then if to(text[i] == '/' && text[i - 1] == '/') but it doesn't work.
Any suggestion how can I fix that or improve my code because it looks weird.
public void RemoveComments(string text)
{
for (int i = 0; i < text.Length; i++)
{
if (text[i] == '/' && text[i + 1] == '/')
{
text = text.Remove(i, 2);
for (int j = i; j < text.Length; j++)
{
if (text[j] != '\n')
{
text = text.Remove(j, 1);
j--;
}
else if (text[j] == '\n')
{
text = text.Remove(j, 1);
j--;
while (text[j] == ' ')
{
text = text.Remove(j, 1);
j--;
}
i = j;
break;
}
}
}
else if (text[i] == '/' && text[i + 1] == '*')
{
text = text.Remove(i, 2);
for (int j = i; j < text.Length; j++)
{
if (text[j] != '*' && text[j + 1] != '/')
{
text = text.Remove(j, 1);
j--;
}
else if (text[j] == '*' && text[j + 1] == '/')
{
text = text.Remove(j, 2);
j = j - 2;
while (text[j] == ' ')
{
text = text.Remove(j, 1);
j--;
if (text[j] == '\n')
{
text = text.Remove(j, 1);
j--;
}
}
i = j;
break;
}
}
}
}
Console.WriteLine(text);
}
EDIT: Now I did many experiments and I found that the problem is with(in // loop) I needed this loop this to fix some small aligment problems:
while (text[j] == ' ')
{
text = text.Remove(j, 1);
j--;
}
Test.txt file.
//int a;
int c; //int d;
Console.Write/*Line*/("Hhehehe");
if(1>0)
/*ConsoleWriteLine("Yes")*/
//Nooo

Looks like you have C# code files. Thus you can use the power of Roslyn. Simply parse code file into syntax tree and then visit that tree with visitor which skips comments:
var code = File.ReadAllText("Code.cs");
SyntaxTree tree = CSharpSyntaxTree.ParseText(code);
var root = (CompilationUnitSyntax)tree.GetRoot();
var codeWithoutComments = new CommentsRemover().Visit(root).ToString();
Console.WriteLine(codeWithoutComments);
Visitor:
class CommentsRemover : CSharpSyntaxRewriter
{
public override SyntaxTrivia VisitTrivia(SyntaxTrivia trivia)
{
switch(trivia.Kind())
{
case SyntaxKind.SingleLineCommentTrivia:
case SyntaxKind.MultiLineCommentTrivia:
return default; // new SyntaxTrivia() // if C# <= 7.0
default:
return trivia;
}
}
}
Sample code file:
using System;
using System.Collections.Generic;
using System.Text;
namespace ConsoleApp
{
/* Sample
Multiline Comment */
class Program
{
static void Main(string[] args)
{
// Comment
Console.Write/*Line*/("Hello, World!"); // Print greeting
/*ConsoleWriteLine("Yes")*/
}
}
}
Output:
using System;
using System.Collections.Generic;
using System.Text;
namespace ConsoleApp
{
class Program
{
static void Main(string[] args)
{
Console.Write("Hello, World!");
}
}
}
Notes: As you can see, after removing comments from the lines which had nothing except comment, you get empty lines. You can create one more visitor to remove empty lines. Also consider to remove XML comments as well.

You have a loop based on text.Length
for (int i = 0; i < text.Length; i++)
But inside of the loop you are shorten the text. At a certain point it is smaller as the origin text.Length and you running out of index I guiess

changing data to xml

I have been tasked with finishing a demo for a client as our main developer is currently unavailable. All the programming experience I have is that I briefly crossed swords with C# 5 years ago but haven't used it since.
I need help with turning exported data into XML if that makes sense. Currently we have a build which takes a PDF form extracts the required data which is shown to work via command prompt. I need to be able to turn this data into XML, so it can be queried to a database. The idea of the program is to take required data from a PDF convert it to XML and query it to a database where the data is stored. We are using the C# language in conjunction with the iTextsharp library. I would post the code but I'm not allowed to.
So I am asking can anyone help me out? Maybe point me towards an example of how this is done and or explain as simply as possible how I would go about doing this? I wouldn't usually ask for help from others but because the fact I haven't coded in years, it has left me feeling intimidated.

This may prove usefull to you... Taken from a way to parse PDF data using iTextSharp
using iTextSharp.text.pdf;
using iTextSharp.text;
private void openPDF()
{
string str = "";
string newFile = "c:\\New Document.pdf";
Document doc = new Document();
PdfReader reader = new PdfReader("c:\\New Document.pdf");
for (int i = 1; i <= reader.NumberOfPages; i++)
{
byte[] bt = reader.GetPageContent(i);
str += ExtractTextFromPDFBytes(bt);
}
}
private string ExtractTextFromPDFBytes(byte[] input)
{
if (input == null || input.Length == 0) return "";
try
{
string resultString = "";
// Flag showing if we are we currently inside a text object
bool inTextObject = false;
// Flag showing if the next character is literal
// e.g. '\\' to get a '\' character or '\(' to get '('
bool nextLiteral = false;
// () Bracket nesting level. Text appears inside ()
int bracketDepth = 0;
// Keep previous chars to get extract numbers etc.:
char[] previousCharacters = new char[_numberOfCharsToKeep];
for (int j = 0; j < _numberOfCharsToKeep; j++) previousCharacters[j] = ' ';
for (int i = 0; i < input.Length; i++)
{
char c = (char)input[i];
if (inTextObject)
{
// Position the text
if (bracketDepth == 0)
{
if (CheckToken(new string[] { "TD", "Td" }, previousCharacters))
{
resultString += "\n\r";
}
else
{
if (CheckToken(new string[] { "'", "T*", "\"" }, previousCharacters)) {
resultString += "\n";
}
else
{
if (CheckToken(new string[] { "Tj" }, previousCharacters))
{
resultString += " ";
}
}
}
}
// End of a text object, also go to a new line.
if (bracketDepth == 0 && CheckToken(new string[] { "ET" }, previousCharacters))
{
inTextObject = false;
resultString += " ";
}
else
{
// Start outputting text
if ((c == '(') && (bracketDepth == 0) && (!nextLiteral))
{
bracketDepth = 1;
}
else
{
// Stop outputting text
if ((c == ')') && (bracketDepth == 1) && (!nextLiteral))
{
bracketDepth = 0;
}
else
{
// Just a normal text character:
if (bracketDepth == 1)
{
// Only print out next character no matter what.
// Do not interpret.
if (c == '\\' && !nextLiteral)
{
nextLiteral = true;
}
else
{
if (((c >= ' ') && (c <= '~')) || ((c >= 128) && (c < 255)))
{
resultString += c.ToString();
}
nextLiteral = false;
}
}
}
}
}
}
// Store the recent characters for
// when we have to go back for a checking
for (int j = 0; j < _numberOfCharsToKeep - 1; j++)
{
previousCharacters[j] = previousCharacters[j + 1];
}
previousCharacters[_numberOfCharsToKeep - 1] = c;
// Start of a text object
if (!inTextObject && CheckToken(new string[] { "BT" }, previousCharacters))
{
inTextObject = true;
}
}
return resultString;
}
catch
{
return string.Empty;
}
}
private bool CheckToken(string[] tokens, char[] recent)
{
foreach (string token in tokens)
{
if ((recent[_numberOfCharsToKeep - 3] == token[0]) &&
(recent[_numberOfCharsToKeep - 2] == token[1]) &&
((recent[_numberOfCharsToKeep - 1] == ' ') ||
(recent[_numberOfCharsToKeep - 1] == 0x0d) ||
(recent[_numberOfCharsToKeep - 1] == 0x0a)) &&
((recent[_numberOfCharsToKeep - 4] == ' ') ||
(recent[_numberOfCharsToKeep - 4] == 0x0d) ||
(recent[_numberOfCharsToKeep - 4] == 0x0a)))
{
return true;
}
}
return false;
}
Once the data is parsed it's then a matter of converting each line to a class (as recommended by Andrei V in your post's comment), serializing it to XML and then storing either the XML file itself to the database or the XML data to the database.

Develop Reference

C# (C-Sharp) is a programming language developed by Microsoft that runs on the .NET Framework.

Parse a persian pdf file to txt and its images - c#

Related

I need to check if a string is a pangram string or not is my code correct?

C# Split ignore commas in input string

How to detect any NON UTF8 character in a file in C#?

Function, which deletes all code comments

changing data to xml

Categories

Resources