How to detect any NON UTF8 character in a file in C#? - c#

How can I identify all NON UTF8 characters from a given file?
We need to write it in C# and be able to execute it in a SSIS environment.
After the execution we need to be able to find and check every invalid occurrence, ideally together with its line number in the input file.
Assumptions:
- file is a csv well formatted (in our case),
- new line has CR LF

When you load the file into a byte array and then decode it into a string, every invalid UTF-8 byte sequence is replaced by the Unicode replacement character U+FFFD (usually displayed as a question mark). Your code should look something like this:
// Read the whole file as raw bytes; nothing is decoded at this point.
byte[] data = File.ReadAllBytes(pathToYourFile);
// Decode the bytes as UTF-8. NOTE(review): with the default Encoding.UTF8 instance,
// invalid sequences are presumably substituted with U+FFFD rather than raising an error.
string result = Encoding.UTF8.GetString(data);
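Next, you can search the resulting string for the replacement character (U+FFFD) to locate the invalid bytes and apply whatever cleaning steps you need.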

After a bit of research, we collected some hints:
Stackoverflow: Determine a string's encoding in C#
utf8check: https://archive.codeplex.com/?p=utf8checker
Daniel Lemire's blog: https://lemire.me/blog/2018/05/09/how-quickly-can-you-check-that-a-string-is-valid-unicode-utf-8/
Here's what we have learned:
1. we need to scan the file byte by byte;
2. the utf8checker class (second link above) is a good class to start from;
3. the algorithm for checking UTF-8 is already well implemented in the class from point 2.
So we needed to improve the utf8checker class so that it keeps scanning the entire file instead of stopping at the first invalid occurrence.
After the complete scanning the code produces a log file listing all the NON utf8 occurrences.
The following code works in our case. It is executed in an SSIS Script Task and reads the file name from an input parameter.
It could probably be improved further.
/*
Microsoft SQL Server Integration Services Script Task
Write scripts using Microsoft Visual C# 2008.
The ScriptMain is the entry point class of the script.
*/
using System;
using System.Data;
using Microsoft.SqlServer.Dts.Runtime;
using System.Windows.Forms;
using System.IO;
using System.Text;
using System.Linq;
using System.Collections.Generic;
namespace ST_5c3d8ec1340c4ab9bbb71cb975760e42.csproj
{
[System.AddIn.AddIn("ScriptMain", Version = "1.0", Publisher = "", Description = "")]
public partial class ScriptMain : Microsoft.SqlServer.Dts.Tasks.ScriptTask.VSTARTScriptObjectModelBase
{
/// <summary>
/// Entry point of the Script Task. Reads the path stored in the SSIS variable
/// "User::InputFile", checks that file for invalid UTF-8 and, when errors are found,
/// writes them (numbered, one per line) to "&lt;input file&gt;_utf8check.log".
/// The task reports Success even when invalid bytes are found; it reports Failure
/// only when an exception is thrown.
/// </summary>
public void Main()
{
    try
    {
        string fileToCheck = Dts.Variables["User::InputFile"].Value.ToString();
        string logFileName = fileToCheck + "_utf8check.log";
        if (File.Exists(fileToCheck))
        {
            IUtf8Checker fileCheckerUtf8 = new Utf8Checker();
            if (!fileCheckerUtf8.Check(fileToCheck))
            {
                // "using" closes (and flushes) the log even if a write throws.
                using (StreamWriter logFile = new StreamWriter(logFileName))
                {
                    int i = 0;
                    // Iterate through the interface type so any IErrorUtf8Checker
                    // implementation can be logged without an invalid cast.
                    foreach (IErrorUtf8Checker e in fileCheckerUtf8.GetErrorList())
                    {
                        logFile.WriteLine(++i + ") " + e.ToString());
                    }
                }
            }
        }
        //exit always with success. It writes a log file if any warning occurs
        Dts.TaskResult = (int)ScriptResults.Success;
    }
    catch (Exception e)
    {
        // Covers DecoderFallbackException too; both cases were handled identically.
        Console.Write(e.ToString());
        Dts.TaskResult = (int)ScriptResults.Failure;
    }
}
#region VSTA generated code
// Result codes assigned to Dts.TaskResult by Main; each member aliases the
// corresponding Microsoft.SqlServer.Dts.Runtime.DTSExecResult value.
enum ScriptResults
{
Success = Microsoft.SqlServer.Dts.Runtime.DTSExecResult.Success,
Failure = Microsoft.SqlServer.Dts.Runtime.DTSExecResult.Failure
};
#endregion
/// <summary>
/// Sends a message to the debugger output (the SSIS immediate console).
/// </summary>
/// <param name="s">Text to print; passed through to Debug.WriteLine unchanged.</param>
public void PrintOnSSISConsole(String s)
{
    string message = s;
    System.Diagnostics.Debug.WriteLine(message);
}
/// <summary>
/// Contract for a checker that tells whether a file or stream is UTF-8 encoded
/// and exposes the list of errors it found.
/// </summary>
public interface IUtf8Checker
{
/// <summary>
/// Check if file is utf8 encoded.
/// </summary>
/// <param name="fileName">Path of the file to check.</param>
/// <returns>true if utf8 encoded, otherwise false.</returns>
bool Check(string fileName);
/// <summary>
/// Check if stream is utf8 encoded.
/// </summary>
/// <param name="stream">Stream whose content is checked.</param>
/// <returns>true if utf8 encoded, otherwise false.</returns>
bool IsUtf8(Stream stream);
/// <summary>
/// Return the list of errors found, as IErrorUtf8Checker items.
/// </summary>
/// <returns>List of errors found through the Check method.</returns>
List<IErrorUtf8Checker> GetErrorList();
}
/// <summary>
/// Marker interface (no members) for the error items returned by
/// IUtf8Checker.GetErrorList.
/// </summary>
public interface IErrorUtf8Checker
{
}
/// <summary>
/// Validates that a byte sequence is well-formed UTF-8. Instead of stopping at the
/// first problem it keeps scanning and records every offending byte together with
/// the line it was found on. Lines start at 1 and are counted by CR LF (13, 10) pairs.
///
/// The line counter and the error list accumulate for the lifetime of the instance,
/// so use a fresh instance for every file.
///
/// http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335
///
/// http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-8.html
///
/// http://www.unicode.org/versions/corrigendum1.html
///
/// http://www.ietf.org/rfc/rfc2279.txt
///
/// </summary>
public class Utf8Checker : IUtf8Checker
{
    private const byte CarriageReturn = 13;
    private const byte LineFeed = 10;

    // Current line number (1-based), advanced on every CR LF pair.
    private int line = 1;
    // True when the byte examined just before the current one was a CR.
    private bool previousWasCarriageReturn;
    // Every invalid byte found so far.
    private List<IErrorUtf8Checker> errorsList;

    public Utf8Checker()
    {
        this.errorsList = new List<IErrorUtf8Checker>();
    }

    /// <summary>Number of errors recorded so far.</summary>
    public int getNumberOfErrors()
    {
        return errorsList.Count;
    }

    /// <summary>
    /// Check if a file is UTF-8 encoded.
    /// </summary>
    /// <param name="fileName">Path of the file to check.</param>
    /// <returns>true if the whole file is valid UTF-8, otherwise false.</returns>
    public bool Check(string fileName)
    {
        using (BufferedStream fstream = new BufferedStream(File.OpenRead(fileName)))
        {
            return this.IsUtf8(fstream);
        }
    }

    /// <summary>Line number reached by the scan so far (1-based).</summary>
    public int getLine()
    {
        return line;
    }

    /// <summary>Errors recorded so far (the live list, not a copy).</summary>
    public List<IErrorUtf8Checker> GetErrorList()
    {
        return errorsList;
    }

    /// <summary>
    /// Check if stream is utf8 encoded.
    /// Notice: stream is read completely in memory!
    /// A seekable stream is rewound to its start first; a non-seekable stream is
    /// read from its current position.
    /// </summary>
    /// <param name="stream">Stream to read from.</param>
    /// <returns>True if the whole stream is utf8 encoded.</returns>
    public bool IsUtf8(Stream stream)
    {
        if (stream == null)
        {
            throw new ArgumentNullException("stream");
        }
        if (stream.CanSeek)
        {
            stream.Seek(0, SeekOrigin.Begin);
        }
        using (MemoryStream content = new MemoryStream())
        {
            byte[] chunk = new byte[4 * 1024];
            int read;
            // Stream.Read may return fewer bytes than requested before the end of
            // the stream, so only a return value of 0 is treated as end-of-stream.
            while ((read = stream.Read(chunk, 0, chunk.Length)) > 0)
            {
                content.Write(chunk, 0, read);
            }
            return IsUtf8(content.GetBuffer(), (int)content.Length);
        }
    }

    /// <summary>
    /// Scans the first <paramref name="length"/> bytes of <paramref name="buffer"/>
    /// and records one error for every byte that does not start a valid UTF-8 sequence.
    /// </summary>
    /// <param name="buffer">Bytes to validate.</param>
    /// <param name="length">Number of bytes of the buffer to examine.</param>
    /// <returns>true when no invalid byte was found.</returns>
    public bool IsUtf8(byte[] buffer, int length)
    {
        if (buffer == null)
        {
            throw new ArgumentNullException("buffer");
        }
        int position = 0;
        bool ret = true;
        while (position < length)
        {
            int bytes = 0;
            if (!IsValid(buffer, position, length, ref bytes))
            {
                ret = false;
                errorsList.Add(new ErrorUtf8Checker(getLine(), buffer[position]));
            }
            // Always move forward by at least one byte so an invalid byte can never
            // stall the scan; after an error the very next byte is re-examined.
            position += (bytes > 0) ? bytes : 1;
        }
        return ret;
    }

    /// <summary>
    /// Tests whether a well-formed UTF-8 sequence starts at <paramref name="position"/>.
    /// As a side effect the byte at that position is fed to the CR LF line counter.
    /// </summary>
    /// <param name="buffer">Bytes being validated.</param>
    /// <param name="position">Index of the lead byte to examine.</param>
    /// <param name="length">Number of valid bytes in the buffer.</param>
    /// <param name="bytes">
    /// Receives the number of bytes to advance: the sequence length (1-4) when valid,
    /// 1 when invalid, 0 when <paramref name="position"/> is at or past the end.
    /// </param>
    /// <returns>true if the sequence is valid (or there is nothing left to examine).</returns>
    /// <exception cref="ArgumentException">length exceeds the buffer size.</exception>
    public bool IsValid(byte[] buffer, int position, int length, ref int bytes)
    {
        if (length > buffer.Length)
        {
            throw new ArgumentException("Invalid length");
        }
        if (position > length - 1)
        {
            bytes = 0;
            return true;
        }
        byte ch = buffer[position];
        this.detectNewLine(ch);
        if (ch <= 0x7F)
        {
            bytes = 1;
            return true;
        }

        // Sequence length and the allowed range of the SECOND byte depend on the
        // lead byte; any further continuation byte must be in 0x80..0xBF.
        int needed;
        byte secondMin = 0x80;
        byte secondMax = 0xBF;
        if (ch >= 0xC2 && ch <= 0xDF)
        {
            needed = 2;
        }
        else if (ch == 0xE0)
        {
            needed = 3;
            secondMin = 0xA0; // rejects overlong 3-byte forms
        }
        else if (ch == 0xED)
        {
            needed = 3;
            secondMax = 0x9F; // rejects UTF-16 surrogates U+D800..U+DFFF
        }
        else if (ch >= 0xE1 && ch <= 0xEF)
        {
            needed = 3;
        }
        else if (ch == 0xF0)
        {
            needed = 4;
            secondMin = 0x90; // rejects overlong 4-byte forms
        }
        else if (ch >= 0xF1 && ch <= 0xF3)
        {
            needed = 4;
        }
        else if (ch == 0xF4)
        {
            needed = 4;
            secondMax = 0x8F; // rejects code points above U+10FFFF
        }
        else
        {
            // 0x80..0xC1 and 0xF5..0xFF can never start a sequence.
            bytes = 1;
            return false;
        }

        // The sequence is truncated only if it runs past the end of the data;
        // a sequence ending exactly on the last byte is valid.
        if (position + needed > length)
        {
            bytes = 1;
            return false;
        }
        if (buffer[position + 1] < secondMin || buffer[position + 1] > secondMax)
        {
            bytes = 1;
            return false;
        }
        for (int offset = 2; offset < needed; offset++)
        {
            if (buffer[position + offset] < 0x80 || buffer[position + offset] > 0xBF)
            {
                bytes = 1;
                return false;
            }
        }
        bytes = needed;
        return true;
    }

    /// <summary>
    /// Advances the line counter when <paramref name="ch"/> is the LF (10) that
    /// immediately follows a CR (13).
    /// </summary>
    private void detectNewLine(byte ch)
    {
        if (this.previousWasCarriageReturn && ch == LineFeed)
        {
            line++;
        }
        // Remember a CR even when the previous byte was also a CR, so that
        // the sequence CR CR LF still counts as one line break.
        this.previousWasCarriageReturn = (ch == CarriageReturn);
    }
}
/// <summary>
/// One error found by the UTF-8 checker: the line number and, optionally,
/// the offending byte (0 means "no byte recorded").
/// </summary>
public class ErrorUtf8Checker : IErrorUtf8Checker
{
    private int line;
    private byte ch;

    /// <summary>Creates an error for a given line and offending byte.</summary>
    public ErrorUtf8Checker(int line, byte character)
    {
        this.line = line;
        this.ch = character;
    }

    /// <summary>Creates an error that only carries a line number.</summary>
    public ErrorUtf8Checker(int line)
        : this(line, 0)
    {
    }

    /// <summary>
    /// Formats the error as "line: N" or, when a byte was recorded,
    /// "line: N code: B, char: C".
    /// </summary>
    public override string ToString()
    {
        try
        {
            string location = "line: " + line;
            if (ch == 0)
            {
                return location;
            }
            return location + " code: " + ch + ", char: " + (char)ch;
        }
        catch (Exception e)
        {
            Console.Write(e.ToString());
            return base.ToString();
        }
    }
}
}
}
Given the example:
Hello world test UTF8
err 1: °
text ok line 3
err 2: ò
errs 3: à è § °
end file
the code posted will create a new file containing:
1) line: 2 code: 176, char: °
2) line: 4 code: 242, char: ò
3) line: 5 code: 224, char: à
4) line: 5 code: 232, char: è
5) line: 5 code: 167, char: §
6) line: 5 code: 176, char: °

Related

C# Iterate through binary file and build a text file with found bytes

I try to be more specific.
I have a binary file which has some portions of text inside.
I want to search for some byte sequence in the binary file, if the sequences are found take the byte arrays and build a text file with them.
So the step has to be repeated till the end of the binary file.
I used BinaryReader to search for a byte sequence, in order to validate the binary file, but now I am stuck on how to proceed with this combination.
My other issue is that I have to skip certain portions of the binary file until the next sequence is found.
So for example, I find the first sequence at 0x10 and it lasts for 10 bytes. Then I have to skip 32 bytes where another byte sequence then starts for x bytes till a tail byte that marks the end of the sequence.
Each time a byte sequence is found I have to save it in a text file, finally writing it to disk.
Any help?
Something like this, then:
/// <summary>
/// Scans a binary file for any of a fixed set of ASCII markers and prints the
/// offset of every match, skipping a fixed number of bytes after each one.
/// </summary>
class Program
{
    const string filename = "some file";

    static void Main(string[] args)
    {
        byte[] data = System.IO.File.ReadAllBytes(filename);
        string[] markers = new string[] { "me", "you" };
        int skipAfterMatch = 32;
        int cursor = 0;
        while (cursor < data.Length)
        {
            // Index of the first marker that matches at the cursor, or -1.
            int matched = -1;
            for (int m = 0; m < markers.Length && matched < 0; m++)
            {
                string marker = markers[m];
                bool same = true;
                for (int k = 0; k < marker.Length; k++)
                {
                    if (cursor + k >= data.Length || data[cursor + k] != marker[k])
                    {
                        same = false;
                        break;
                    }
                }
                if (same)
                {
                    matched = m;
                }
            }

            if (matched < 0)
            {
                cursor++;
                continue;
            }
            Console.WriteLine(String.Format("Found {0} at {1}", markers[matched], cursor));
            cursor += markers[matched].Length + skipAfterMatch;
        }
    }
}
All right. I managed to do it and maybe this will be useful to someone else:
/// <summary>
/// Reads the binary subtitle file and writes every "[timestamp]" entry it finds,
/// followed by its sentence, as a numbered entry to the output file.
/// NOTE(review): sentenceSize, srtTimeRaw, span, coursePath, hashedSubFileName, ext
/// and Video.outPath are not declared in this method; they are presumably members
/// declared elsewhere - confirm.
/// </summary>
public static void ConvertToSRTSubs()
{
    byte[] openingTimeWindow = Encoding.ASCII.GetBytes("["); // Timespan in the binary is wrapped in square brackets
    byte[] nextOpening = Encoding.ASCII.GetBytes("[00"); // Marks the next timespan; used to find the end of the sentence, because there is a fixed size between a sentence and the next timespan.
    byte[] closingTimeWindow = Encoding.ASCII.GetBytes("]"); // End of the timespan
    int found = 0; // Counts every timespan match
    int backPos = 0; // Position of the current opening bracket
    int nextPos = 0;
    int sentenceStartPos = 0;
    string srtTime = String.Empty;
    string srtSentence = String.Empty;
    byte[] array = File.ReadAllBytes(Path.Combine(coursePath, hashedSubFileName));
    try
    {
        using (StreamWriter s = new StreamWriter(Video.outPath + ext, false))
        {
            // Stop 12 bytes before the end: the closing bracket is read at i + 12.
            for (int i = 0; i + 12 < array.Length; i++)
            {
                if (openingTimeWindow[0] != array[i] || closingTimeWindow[0] != array[i + 12])
                {
                    continue;
                }
                found++;
                s.WriteLine(found);
                try
                {
                    backPos = i;
                    // Look for the next "[00" using a separate index so the outer
                    // position is untouched; stop 2 bytes early because three bytes are compared.
                    for (int j = backPos + 12; j + 2 < array.Length; j++)
                    {
                        if (nextOpening[0] == array[j] && nextOpening[1] == array[j + 1] && nextOpening[2] == array[j + 2])
                        {
                            nextPos = j - 19;
                            break;
                        }
                    }
                    // NOTE(review): when no further "[00" exists, nextPos keeps its
                    // previous value, so the last sentence is clamped to 1 byte below - confirm intent.
                    sentenceStartPos = backPos + 27;
                    sentenceSize = nextPos - sentenceStartPos;
                    if (sentenceSize < 0) sentenceSize = 1;
                    byte[] startTime = new byte[11];
                    byte[] sentence = new byte[sentenceSize];
                    Array.Copy(array, backPos + 1, startTime, 0, 11);
                    Array.Copy(array, sentenceStartPos, sentence, 0, sentenceSize);
                    srtTimeRaw = srtTime = Encoding.UTF8.GetString(startTime);
                    srtTime = srtTimeRaw.Replace('.', ',') + "0" + " --> " + span;
                    s.WriteLine(srtTime);
                    srtSentence = Encoding.UTF8.GetString(sentence);
                    s.WriteLine(srtSentence);
                    s.WriteLine();
                }
                catch (ArgumentException argex)
                {
                    MessageBox.Show(argex.ToString());
                }
            }
        }
    }
    catch (DirectoryNotFoundException dex)
    {
        MessageBox.Show(dex.ToString());
    }
}
Maybe not the cleanest code, but it works :)

Display entire IP Address range knowing Minimum and Maximum values? [duplicate]

How do I iterate through a range of IP addresses provided by the user?
I'm flexible on the format, provided it allows all ranges to be specified. Perhaps something like the nmap-style:
'192.0.2.1' # one IP address
'192.0.2.0-31' # one block with 32 IP addresses.
'192.0.2-3.1-254' # two blocks with 254 IP addresses.
'0-255.0-255.0-255.0-255' # the whole IPv4 address space
For example, if the user entered 192.0.2-3.1-254, I would like to know how to generate a list of all the valid IP addresses in this range so that I could iterate through them.
For example:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net;
using System.Text.RegularExpressions;
namespace IpRanges
{
/// <summary>
/// An IPv4 address range given either in CIDR notation ("12.15.0.0/16") or as
/// per-octet ranges ("12.15-16.1-30.10-255").
/// </summary>
public class IPRange
{
    // First and last value of each of the four octets (inclusive).
    private byte[] beginIP;
    private byte[] endIP;

    /// <summary>
    /// Parses a range string.
    /// </summary>
    /// <param name="ipRange">CIDR block or dotted per-octet range.</param>
    /// <exception cref="ArgumentNullException">ipRange is null.</exception>
    /// <exception cref="ArgumentException">ipRange is in neither supported format.</exception>
    public IPRange(string ipRange)
    {
        if (ipRange == null)
        {
            throw new ArgumentNullException("ipRange");
        }
        if (!TryParseCIDRNotation(ipRange) && !TryParseSimpleRange(ipRange))
        {
            throw new ArgumentException("Not a valid CIDR block or dotted IP range: " + ipRange, "ipRange");
        }
    }

    /// <summary>
    /// Returns every address of the range, ordered octet by octet.
    /// </summary>
    public IEnumerable<IPAddress> GetAllIP()
    {
        // Computed as long: the whole IPv4 space (2^32 addresses) overflows an int.
        long capacity = 1;
        for (int i = 0; i < 4; i++)
        {
            capacity *= endIP[i] - beginIP[i] + 1;
        }
        List<IPAddress> ips = capacity <= int.MaxValue
            ? new List<IPAddress>((int)capacity)
            : new List<IPAddress>();
        for (int i0 = beginIP[0]; i0 <= endIP[0]; i0++)
        {
            for (int i1 = beginIP[1]; i1 <= endIP[1]; i1++)
            {
                for (int i2 = beginIP[2]; i2 <= endIP[2]; i2++)
                {
                    for (int i3 = beginIP[3]; i3 <= endIP[3]; i3++)
                    {
                        ips.Add(new IPAddress(new byte[] { (byte)i0, (byte)i1, (byte)i2, (byte)i3 }));
                    }
                }
            }
        }
        return ips;
    }

    /// <summary>
    /// Parse IP-range string in CIDR notation.
    /// For example "12.15.0.0/16".
    /// </summary>
    /// <param name="ipRange">Candidate string.</param>
    /// <returns>
    /// true when the string is a CIDR block whose address is the block's network
    /// address; false (without throwing) for anything else.
    /// </returns>
    private bool TryParseCIDRNotation(string ipRange)
    {
        string[] x = ipRange.Split('/');
        if (x.Length != 2)
        {
            return false;
        }
        byte bits;
        if (!byte.TryParse(x[1], out bits) || bits > 32)
        {
            return false;
        }
        string[] octets = x[0].Split('.');
        if (octets.Length != 4)
        {
            return false;
        }
        uint ip = 0;
        for (int i = 0; i < 4; i++)
        {
            byte octet;
            if (!byte.TryParse(octets[i], out octet))
            {
                return false;
            }
            ip = (ip << 8) | octet;
        }
        // A shift by 32 is masked to a shift by 0 in C#, so /0 is handled explicitly.
        uint mask = (bits == 0) ? 0u : uint.MaxValue << (32 - bits);
        if ((ip & mask) != ip) // Check correct subnet address
        {
            return false;
        }
        uint last = ip | ~mask;
        byte[] first4 = new byte[4];
        byte[] last4 = new byte[4];
        for (int i = 0; i < 4; i++)
        {
            first4[i] = (byte)((ip >> ((3 - i) * 8)) & 255);
            last4[i] = (byte)((last >> ((3 - i) * 8)) & 255);
        }
        beginIP = first4;
        endIP = last4;
        return true;
    }

    /// <summary>
    /// Parse IP-range string "12.15-16.1-30.10-255"
    /// </summary>
    /// <param name="ipRange">Candidate string.</param>
    /// <returns>
    /// true when the string has four dot-separated parts, each a byte or a
    /// "low-high" pair with low &lt;= high; false (without throwing) otherwise.
    /// </returns>
    private bool TryParseSimpleRange(string ipRange)
    {
        string[] ipParts = ipRange.Split('.');
        if (ipParts.Length != 4)
        {
            return false;
        }
        byte[] first4 = new byte[4];
        byte[] last4 = new byte[4];
        for (int i = 0; i < 4; i++)
        {
            string[] rangeParts = ipParts[i].Split('-');
            if (rangeParts.Length < 1 || rangeParts.Length > 2)
            {
                return false;
            }
            if (!byte.TryParse(rangeParts[0], out first4[i]))
            {
                return false;
            }
            if (rangeParts.Length == 1)
            {
                last4[i] = first4[i];
            }
            else if (!byte.TryParse(rangeParts[1], out last4[i]) || last4[i] < first4[i])
            {
                return false;
            }
        }
        beginIP = first4;
        endIP = last4;
        return true;
    }
}
}
Check out the snippet here. Keep the credits in place if you use this please.
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Linq;
using System.Net;
/* ====================================================================================
C# IP address range finder helper class (C) Nahum Bazes
* Free for private & commercial use - no restriction applied, please leave credits.
* DO NOT REMOVE THIS COMMENT
* ==================================================================================== */
namespace IPAddressTools
{
/// <summary>
/// Enumerates every IPv4 address between two addresses (both inclusive).
/// </summary>
public class RangeFinder
{
    /// <summary>
    /// Returns the dotted-decimal text of every address from
    /// <paramref name="startIP"/> to <paramref name="endIP"/>, in ascending order.
    /// The sequence is empty when startIP is greater than endIP.
    /// </summary>
    /// <exception cref="ArgumentNullException">An argument is null.</exception>
    /// <exception cref="ArgumentException">An argument is not an IPv4 address.</exception>
    public IEnumerable<string> GetIPRange(IPAddress startIP,
        IPAddress endIP)
    {
        if (startIP == null)
        {
            throw new ArgumentNullException("startIP");
        }
        if (endIP == null)
        {
            throw new ArgumentNullException("endIP");
        }
        if (startIP.AddressFamily != System.Net.Sockets.AddressFamily.InterNetwork ||
            endIP.AddressFamily != System.Net.Sockets.AddressFamily.InterNetwork)
        {
            throw new ArgumentException("Only IPv4 addresses are supported.");
        }
        // Validation happens here, eagerly; the iterator below runs lazily.
        return enumerateRange(ipToUint(startIP.GetAddressBytes()), ipToUint(endIP.GetAddressBytes()));
    }

    /* lazily yields every address from first to last (inclusive) */
    private IEnumerable<string> enumerateRange(uint first, uint last)
    {
        if (first > last)
        {
            yield break;
        }
        for (uint current = first; ; current++)
        {
            yield return new IPAddress((long)reverseBytesArray(current)).ToString();
            // Test BEFORE incrementing: when last is 255.255.255.255 the counter
            // would otherwise wrap to 0 and the loop would never end.
            if (current == last)
            {
                yield break;
            }
        }
    }

    /* reverse byte order of a 32 bit value */
    protected uint reverseBytesArray(uint ip)
    {
        return (ip << 24)
             | ((ip & 0x0000FF00u) << 8)
             | ((ip & 0x00FF0000u) >> 8)
             | (ip >> 24);
    }

    /* Convert a 4-byte array (first octet first) to a 32 bit value */
    protected uint ipToUint(byte[] ipBytes)
    {
        if (ipBytes == null)
        {
            throw new ArgumentNullException("ipBytes");
        }
        if (ipBytes.Length != 4)
        {
            throw new ArgumentException("An IPv4 address has exactly 4 bytes.", "ipBytes");
        }
        uint ipUint = 0;
        foreach (byte b in ipBytes)
        {
            ipUint = (ipUint << 8) | b;
        }
        return ipUint;
    }
}
}
I'm late to the game, but your question was mentioned in duplicate, so I just add the answer here. Using the IPAddressRange library, you can enumerate your IPs like that:
// Parse the first and last address of the range.
var start = IPAddress.Parse("192.168.0.2");
var end = IPAddress.Parse("192.168.0.254");
// IPAddressRange comes from the IPAddressRange library mentioned in the answer text.
var range = new IPAddressRange(start, end);
// Print each item the range yields; presumably one per address, bounds included - confirm against the library docs.
foreach (var ip in range)
{
Console.WriteLine(ip);
}
The library also supports CIDR notation and range strings
I think this should do it.
/// <summary>
/// Demo: parses a sample nmap-style range and prints every address in it.
/// </summary>
static void TestFunc()
{
    foreach (IPAddress address in Enumerate(ParseRange("192.0.2-5.14-28")))
    {
        Console.WriteLine(address);
    }
}
/// <summary>
/// Parses a dotted range such as "192.0.2-5.14-28" into a 4x2 table holding,
/// for each octet, its first value (column 0) and last value (column 1).
/// </summary>
/// <param name="str">Four dot-separated parts, each a byte or "low-high".</param>
/// <returns>The [octet, bound] table.</returns>
/// <exception cref="ArgumentException">str is null or empty.</exception>
/// <exception cref="FormatException">Wrong number of parts, or high is below low.</exception>
static byte[,] ParseRange(string str)
{
    if (string.IsNullOrEmpty(str))
    {
        throw new ArgumentException("str");
    }
    string[] octets = str.Split('.');
    if (octets.Length != 4)
    {
        throw new FormatException();
    }
    byte[,] bounds = new byte[4, 2];
    for (int octet = 0; octet < 4; octet++)
    {
        string[] pair = octets[octet].Split('-');
        if (pair.Length > 2)
        {
            throw new FormatException();
        }
        byte low = byte.Parse(pair[0]);
        // A part without '-' is a single value: its high bound equals its low bound.
        byte high = (pair.Length == 2) ? byte.Parse(pair[1]) : low;
        // Remove this to allow ranges to wrap around.
        // For example: 254-4 = 254, 255, 0, 1, 2, 3, 4
        if (high < low)
        {
            throw new FormatException();
        }
        bounds[octet, 0] = low;
        bounds[octet, 1] = high;
    }
    return bounds;
}
/// <summary>
/// Lazily yields every address described by a 4x2 [octet, bound] table, where
/// column 0 is the first value of the octet and column 1 the last (inclusive).
/// A last value below the first is treated as a wrap-around range
/// (254-4 = 254, 255, 0, 1, 2, 3, 4).
/// </summary>
/// <param name="range">Table with 4 rows (octets) and 2 columns (first, last).</param>
/// <exception cref="ArgumentException">The table is not 4x2 (raised on first enumeration).</exception>
static IEnumerable<IPAddress> Enumerate(byte[,] range)
{
    if (range.GetLength(0) != 4) throw new ArgumentException("range");
    if (range.GetLength(1) != 2) throw new ArgumentException("range");

    // Count the values of each octet modulo 256 and loop on an int counter.
    // Comparing a byte loop variable against (byte)(last + 1) cannot express
    // a full 0-255 octet: the end marker wraps to 0 and equals the start.
    int[] counts = new int[4];
    for (int i = 0; i < 4; i++)
    {
        counts[i] = ((range[i, 1] - range[i, 0]) & 0xFF) + 1;
    }

    for (int a = 0; a < counts[0]; a++)
    {
        for (int b = 0; b < counts[1]; b++)
        {
            for (int c = 0; c < counts[2]; c++)
            {
                for (int d = 0; d < counts[3]; d++)
                {
                    yield return new IPAddress(new byte[]
                    {
                        unchecked((byte)(range[0, 0] + a)),
                        unchecked((byte)(range[1, 0] + b)),
                        unchecked((byte)(range[2, 0] + c)),
                        unchecked((byte)(range[3, 0] + d))
                    });
                }
            }
        }
    }
}

Unicode to Mazovia Encoding redundant char

I've been dealing with this for a few hours. I'm saving a string containing Polish diacritics ąśółńźć etc. to a file, but the software I must use to read that file reads only in Mazovia encoding, a pretty old encoding and not supported by the Microsoft Encoding class.
A .Net string consists of UTF-16 characters, so I've been using this code to convert from Unicode to Mazovia.
// Drop the last character of the line and terminate the record with a newline.
string rekord = (linia.Substring(0, linia.Length - 1)) + Environment.NewLine;
// Replace each Polish letter with the char whose code equals its Mazovia byte value.
string rekordMazovia = Kodowanie.UnicodeNaMazovia(rekord);
// Write with an explicit single-byte encoding. Without one, File.AppendAllText
// encodes as UTF-8, which turns every char in 0x80-0xFF into TWO bytes - the likely
// source of the extra character before each diacritic. ISO-8859-1 writes each
// char 0x00-0xFF as the single byte with the same value.
File.AppendAllText(sciezka, rekordMazovia, System.Text.Encoding.GetEncoding("iso-8859-1"));
/// <summary>
/// Helpers for converting text to the Mazovia code page.
/// </summary>
public static class Kodowanie {
    /// <summary>
    /// Returns a copy of <paramref name="tekst"/> in which each of the 18 Polish
    /// diacritic letters is replaced by the char whose code is that letter's
    /// Mazovia byte value. All other characters are left untouched.
    /// </summary>
    /// <param name="tekst">Text to convert; must not be null.</param>
    /// <returns>The converted text (still a UTF-16 string, not bytes).</returns>
    public static string UnicodeNaMazovia(string tekst) {
        char[] znaki = tekst.ToCharArray();
        for (int i = 0; i < znaki.Length; i++) {
            switch ((int)znaki[i]) {
                case 0x104: znaki[i] = (char)0x8F; break; //Ą
                case 0x106: znaki[i] = (char)0x95; break; //Ć
                case 0x118: znaki[i] = (char)0x90; break; //Ę
                case 0x141: znaki[i] = (char)0x9C; break; //Ł
                case 0x143: znaki[i] = (char)0xA5; break; //Ń
                case 0xD3: znaki[i] = (char)0xA3; break; //Ó
                case 0x15A: znaki[i] = (char)0x98; break; //Ś
                case 0x179: znaki[i] = (char)0xA0; break; //Ź
                case 0x17B: znaki[i] = (char)0xA1; break; //Ż
                case 0x105: znaki[i] = (char)0x86; break; //ą
                case 0x107: znaki[i] = (char)0x8D; break; //ć
                case 0x119: znaki[i] = (char)0x91; break; //ę
                case 0x142: znaki[i] = (char)0x92; break; //ł
                case 0x144: znaki[i] = (char)0xA4; break; //ń
                case 0xF3: znaki[i] = (char)0xA2; break; //ó
                case 0x15B: znaki[i] = (char)0x9E; break; //ś
                case 0x17A: znaki[i] = (char)0xA6; break; //ź
                case 0x17C: znaki[i] = (char)0xA7; break; //ż
                default: break;
            }
        }
        return new string(znaki);
    }
}
Everything would be fine except after reading the generated file in the application I get one redundant char > before every diacritic. It looks like this
How to get rid of it? How to do it better?
Mazovia encoding is like code page 437 but it has different letters at some positions so you can't use 437.
If you implement MazoviaEncoding, you can easily use
// Encode through the custom Encoding so each character becomes one Mazovia byte.
Encoding encoding = new MazoviaEncoding();
String output = "ąśółńźć";
File.WriteAllText(@"test.txt", output, encoding);
//File.AppendAllText(@"test.txt", output, encoding);
// will work just as well, just pass the encoding as 3rd parameter
The file will contain:
0x86 0x9E 0xA2 0x92 0xA4 0xA6 0x8D
Which is correct according to http://en.wikipedia.org/wiki/Mazovia_encoding
The implementation can then be used like other Encoding in C#. For instance, reading the file back works as well:
// Decode the file through the same custom Encoding to get the original text back.
Encoding encoding = new MazoviaEncoding();
String result = File.ReadAllText(@"test.txt", encoding);
Here's my implementation:
using System.Collections.Generic;
using System.Text;
namespace System.Text {
/// <summary>
/// Single-byte Encoding backed by a 256-entry table: byte value N decodes to the
/// character at index N of the table, and a character encodes to its table index.
/// Characters that are not in the table encode as '?' (0x3F).
/// </summary>
class MazoviaEncoding : Encoding
{
    // Index = byte value, element = Unicode code point for that byte.
    private static int[] codePoints = {
        0x0000,0x0001,0x0002,0x0003,0x0004,0x0005,0x0006,0x0007,0x0008,0x0009,0x000A,0x000B,0x000C,0x000D,0x000E,0x000F,
        0x0010,0x0011,0x0012,0x0013,0x0014,0x0015,0x0016,0x0017,0x0018,0x0019,0x001A,0x001B,0x001C,0x001D,0x001E,0x001F,
        0x0020,0x0021,0x0022,0x0023,0x0024,0x0025,0x0026,0x0027,0x0028,0x0029,0x002A,0x002B,0x002C,0x002D,0x002E,0x002F,
        0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037,0x0038,0x0039,0x003A,0x003B,0x003C,0x003D,0x003E,0x003F,
        0x0040,0x0041,0x0042,0x0043,0x0044,0x0045,0x0046,0x0047,0x0048,0x0049,0x004A,0x004B,0x004C,0x004D,0x004E,0x004F,
        0x0050,0x0051,0x0052,0x0053,0x0054,0x0055,0x0056,0x0057,0x0058,0x0059,0x005A,0x005B,0x005C,0x005D,0x005E,0x005F,
        0x0060,0x0061,0x0062,0x0063,0x0064,0x0065,0x0066,0x0067,0x0068,0x0069,0x006A,0x006B,0x006C,0x006D,0x006E,0x006F,
        0x0070,0x0071,0x0072,0x0073,0x0074,0x0075,0x0076,0x0077,0x0078,0x0079,0x007A,0x007B,0x007C,0x007D,0x007E,0x007F,
        0x00C7,0x00FC,0x00E9,0x00E2,0x00E4,0x00E0,0x0105,0x00E7,0x00EA,0x00EB,0x00E8,0x00EF,0x00EE,0x0107,0x00C4,0x0104,
        0x0118,0x0119,0x0142,0x00F4,0x00F6,0x0106,0x00FB,0x00F9,0x015A,0x00D6,0x00DC,0x00A2,0x0141,0x00A5,0x015B,0x0192,
        0x0179,0x017B,0x00F3,0x00D3,0x0144,0x0143,0x017A,0x017C,0x00BF,0x2310,0x00AC,0x00BD,0x00BC,0x00A1,0x00AB,0x00BB,
        0x2591,0x2592,0x2593,0x2502,0x2524,0x2561,0x2562,0x2556,0x2555,0x2563,0x2551,0x2557,0x255D,0x255C,0x255B,0x2510,
        0x2514,0x2534,0x252C,0x251C,0x2500,0x253C,0x255E,0x255F,0x255A,0x2554,0x2569,0x2566,0x2560,0x2550,0x256C,0x2567,
        0x2568,0x2564,0x2565,0x2559,0x2558,0x2552,0x2553,0x256B,0x256A,0x2518,0x250C,0x2588,0x2584,0x258C,0x2590,0x2580,
        0x03B1,0x00DF,0x0393,0x03C0,0x03A3,0x03C3,0x00B5,0x03C4,0x03A6,0x0398,0x03A9,0x03B4,0x221E,0x03C6,0x03B5,0x2229,
        0x2261,0x00B1,0x2265,0x2264,0x2320,0x2321,0x00F7,0x2248,0x00B0,0x2219,0x00B7,0x221A,0x207F,0x00B2,0x25A0,0x00A0
    };

    // Reverse lookup (character -> byte value), filled once by the static constructor.
    private static Dictionary<char, byte> unicodeToByte;

    static MazoviaEncoding()
    {
        unicodeToByte = new Dictionary<char, byte>();
        int value = 0;
        foreach (int codePoint in codePoints)
        {
            unicodeToByte.Add((char)codePoint, (byte)value);
            value++;
        }
    }

    /// <summary>One byte per character, so the maximum equals the character count.</summary>
    public override int GetMaxByteCount(int charCount)
    {
        if (charCount >= 0)
        {
            return charCount;
        }
        throw new ArgumentOutOfRangeException();
    }

    /// <summary>One character per byte, so the maximum equals the byte count.</summary>
    public override int GetMaxCharCount(int byteCount)
    {
        if (byteCount >= 0)
        {
            return byteCount;
        }
        throw new ArgumentOutOfRangeException();
    }

    /// <summary>
    /// Encodes <paramref name="charCount"/> characters starting at
    /// <paramref name="charIndex"/> into <paramref name="bytes"/> starting at
    /// <paramref name="byteIndex"/>; unmapped characters become '?'.
    /// </summary>
    /// <returns>Number of bytes written.</returns>
    public override int GetBytes(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex)
    {
        if (chars == null || bytes == null)
        {
            throw new ArgumentNullException();
        }
        bool outOfRange = charIndex + charCount > chars.Length
            || charIndex < 0
            || byteIndex < 0
            || byteIndex + charCount > bytes.Length;
        if (outOfRange)
        {
            throw new ArgumentOutOfRangeException();
        }

        int written = 0;
        while (written < charCount)
        {
            byte encoded;
            if (!unicodeToByte.TryGetValue(chars[charIndex + written], out encoded))
            {
                encoded = (byte)0x003F; // "?"
            }
            bytes[byteIndex + written] = encoded;
            written++;
        }
        return written;
    }

    /// <summary>
    /// Decodes <paramref name="byteCount"/> bytes starting at
    /// <paramref name="byteIndex"/> into <paramref name="chars"/> starting at
    /// <paramref name="charIndex"/>.
    /// </summary>
    /// <returns>Number of characters written.</returns>
    public override int GetChars(byte[] bytes, int byteIndex, int byteCount, char[] chars, int charIndex)
    {
        if (chars == null || bytes == null)
        {
            throw new ArgumentNullException();
        }
        bool outOfRange = byteIndex + byteCount > bytes.Length
            || charIndex < 0
            || byteIndex < 0
            || charIndex + byteCount > chars.Length;
        if (outOfRange)
        {
            throw new ArgumentOutOfRangeException();
        }

        int written = 0;
        while (written < byteCount)
        {
            chars[charIndex + written] = (char)codePoints[bytes[byteIndex + written]];
            written++;
        }
        return written;
    }

    /// <summary>Bytes needed for a character span: exactly one per character.</summary>
    public override int GetByteCount(char[] charArray, int index, int count)
    {
        if (charArray == null)
        {
            throw new ArgumentNullException();
        }
        if (index + count > charArray.Length || index < 0 || count < 0)
        {
            throw new ArgumentOutOfRangeException();
        }
        return count;
    }

    /// <summary>Characters produced by a byte span: exactly one per byte.</summary>
    public override int GetCharCount(byte[] bytes, int index, int count)
    {
        if (bytes == null)
        {
            throw new ArgumentNullException();
        }
        bool validSpan = index >= 0 && count >= 0 && index + count <= bytes.Length;
        if (!validSpan)
        {
            throw new ArgumentOutOfRangeException();
        }
        return count;
    }
}
}

Parse a persian pdf file to txt and its images

I used this code to convert an English PDF and it works perfectly, but when I use it on a Persian file the output contains no Persian characters. How can I parse a Unicode PDF into a text file plus a folder containing its image files?
using System;
using System.IO;
using iTextSharp.text.pdf;
using System.Text.RegularExpressions;
namespace Spider.Utils
{
/// <summary>
/// Parses a PDF file and extracts the text from it.
/// </summary>
public class PDFParser
{
/// BT = Beginning of a text object operator
/// ET = End of a text object operator
/// Td move to the start of next line
/// 5 Ts = superscript
/// -5 Ts = subscript
#region Fields
#region _numberOfCharsToKeep
/// <summary>
/// The number of characters to keep, when extracting text.
/// </summary>
private static int _numberOfCharsToKeep = 15;
#endregion
#endregion
#region ExtractText
/// <summary>
/// Extracts the text of every page of a PDF file and writes it, UTF-8 encoded,
/// to an output file, printing a '#' progress bar to the console while it works.
/// </summary>
/// <param name="inFileName">the full path to the pdf file.</param>
/// <param name="outFileName">the output file name.</param>
/// <returns>true when all pages were written; false if any exception occurred.</returns>
public bool ExtractText(string inFileName, string outFileName)
{
    PdfReader reader = null;
    StreamWriter outFile = null;
    try
    {
        // Create a reader for the given PDF file
        reader = new PdfReader(inFileName);
        //outFile = File.CreateText(outFileName);
        outFile = new StreamWriter(outFileName, false, System.Text.Encoding.UTF8);
        Console.Write("Processing: ");
        // Width of the progress bar in characters, and the share of it one page is worth.
        int totalLen = 68;
        float charUnit = ((float)totalLen) / (float)reader.NumberOfPages;
        int totalWritten = 0;
        float curUnit = 0;
        for (int page = 1; page <= reader.NumberOfPages; page++)
        {
            outFile.Write(ExtractTextFromPDFBytes(reader.GetPageContent(page)) + " ");
            // Write the progress.
            if (charUnit >= 1.0f)
            {
                for (int i = 0; i < (int)charUnit; i++)
                {
                    Console.Write("#");
                    totalWritten++;
                }
            }
            else
            {
                // A page is worth less than one character: accumulate the
                // fractions and print once they add up to a whole character.
                curUnit += charUnit;
                if (curUnit >= 1.0f)
                {
                    for (int i = 0; i < (int)curUnit; i++)
                    {
                        Console.Write("#");
                        totalWritten++;
                    }
                    curUnit = 0;
                }
            }
        }
        // Pad the bar to its full width to make up for rounding.
        if (totalWritten < totalLen)
        {
            for (int i = 0; i < (totalLen - totalWritten); i++)
            {
                Console.Write("#");
            }
        }
        return true;
    }
    catch
    {
        // Best effort: every failure is reported to the caller as "false".
        return false;
    }
    finally
    {
        try
        {
            if (outFile != null) outFile.Close();
        }
        finally
        {
            // Release the PDF file as well; the reader was previously never closed.
            if (reader != null) reader.Close();
        }
    }
}
#endregion
#region ExtractTextFromPDFBytes
/// <summary>
/// This method processes an uncompressed Adobe (text) object
/// and extracts text.
/// It walks the bytes one at a time, tracks whether it is inside a BT ... ET text
/// object and inside a ( ... ) string, and collects the characters found inside
/// strings. Each byte is treated as one character, so only single-byte text can
/// be recovered.
/// </summary>
/// <param name="input">uncompressed page content</param>
/// <returns>
/// The extracted text after CleanupContent; an empty string when the input is
/// null or empty, or when any exception occurs.
/// </returns>
public string ExtractTextFromPDFBytes(byte[] input)
{
    if (input == null || input.Length == 0) return "";
    try
    {
        // A StringBuilder keeps the cost linear in the size of the page;
        // repeated string concatenation copies the whole result on every append.
        System.Text.StringBuilder resultString = new System.Text.StringBuilder();
        // Flag showing if we are we currently inside a text object
        bool inTextObject = false;
        // Flag showing if the next character is literal
        // e.g. '\\' to get a '\' character or '\(' to get '('
        bool nextLiteral = false;
        // () Bracket nesting level. Text appears inside ()
        int bracketDepth = 0;
        // Operator groups, created once instead of once per input byte.
        string[] moveOperators = new string[] { "TD", "Td" };
        string[] nextLineOperators = new string[] { "'", "T*", "\"" };
        string[] showTextOperators = new string[] { "Tj" };
        string[] endTextOperators = new string[] { "ET" };
        string[] beginTextOperators = new string[] { "BT" };
        // Keep previous chars to get extract numbers etc.:
        char[] previousCharacters = new char[_numberOfCharsToKeep];
        for (int j = 0; j < _numberOfCharsToKeep; j++) previousCharacters[j] = ' ';
        for (int i = 0; i < input.Length; i++)
        {
            char c = (char)input[i];
            // Byte 213 is emitted as an apostrophe.
            if (input[i] == 213)
                c = '\'';
            if (inTextObject)
            {
                // Position the text
                if (bracketDepth == 0)
                {
                    if (CheckToken(moveOperators, previousCharacters))
                    {
                        // NOTE(review): this is LF CR, not CR LF; kept as in the original output.
                        resultString.Append("\n\r");
                    }
                    else if (CheckToken(nextLineOperators, previousCharacters))
                    {
                        resultString.Append("\n");
                    }
                    else if (CheckToken(showTextOperators, previousCharacters))
                    {
                        resultString.Append(" ");
                    }
                }
                // End of a text object, also go to a new line.
                if (bracketDepth == 0 &&
                    CheckToken(endTextOperators, previousCharacters))
                {
                    inTextObject = false;
                    resultString.Append(" ");
                }
                else if ((c == '(') && (bracketDepth == 0) && (!nextLiteral))
                {
                    // Start outputting text
                    bracketDepth = 1;
                }
                else if ((c == ')') && (bracketDepth == 1) && (!nextLiteral))
                {
                    // Stop outputting text
                    bracketDepth = 0;
                }
                else if (bracketDepth == 1)
                {
                    // Just a normal text character:
                    // Only print out next character no matter what.
                    // Do not interpret.
                    if (c == '\\' && !nextLiteral)
                    {
                        // The backslash is kept so CleanupContent can resolve the escape.
                        resultString.Append(c);
                        nextLiteral = true;
                    }
                    else
                    {
                        if (((c >= ' ') && (c <= '~')) ||
                            ((c >= 128) && (c < 255)))
                        {
                            resultString.Append(c);
                        }
                        nextLiteral = false;
                    }
                }
            }
            // Store the recent characters for
            // when we have to go back for a checking
            for (int j = 0; j < _numberOfCharsToKeep - 1; j++)
            {
                previousCharacters[j] = previousCharacters[j + 1];
            }
            previousCharacters[_numberOfCharsToKeep - 1] = c;
            // Start of a text object
            if (!inTextObject && CheckToken(beginTextOperators, previousCharacters))
            {
                inTextObject = true;
            }
        }
        return CleanupContent(resultString.ToString());
    }
    catch
    {
        // Best effort: a page that cannot be processed contributes no text.
        return "";
    }
}
/// <summary>
/// Replaces the escape sequences left in the extracted text (backslash followed
/// by a parenthesis or by a three-digit octal code) with the characters they stand for.
/// </summary>
/// <param name="text">Text as collected from the page content.</param>
/// <returns>The text with every known escape sequence replaced.</returns>
private string CleanupContent(string text)
{
    // Verbatim (@) strings: each entry is a regular expression matching one
    // literal backslash followed by the escaped character or octal code.
    // patterns[i] is replaced by replace[i].
    string[] patterns = { @"\\\(", @"\\\)", @"\\226", @"\\222", @"\\223", @"\\224", @"\\340", @"\\342", @"\\344", @"\\300", @"\\302", @"\\304", @"\\351", @"\\350", @"\\352", @"\\353", @"\\311", @"\\310", @"\\312", @"\\313", @"\\362", @"\\364", @"\\366", @"\\322", @"\\324", @"\\326", @"\\354", @"\\356", @"\\357", @"\\314", @"\\316", @"\\317", @"\\347", @"\\307", @"\\371", @"\\373", @"\\374", @"\\331", @"\\333", @"\\334", @"\\256", @"\\231", @"\\253", @"\\273", @"\\251", @"\\221"};
    string[] replace = { "(", ")", "-", "'", "\"", "\"", "à", "â", "ä", "À", "Â", "Ä", "é", "è", "ê", "ë", "É", "È", "Ê", "Ë", "ò", "ô", "ö", "Ò", "Ô", "Ö", "ì", "î", "ï", "Ì", "Î", "Ï", "ç", "Ç", "ù", "û", "ü", "Ù", "Û", "Ü", "®", "™", "«", "»", "©", "'" };
    for (int i = 0; i < patterns.Length; i++)
    {
        string regExPattern = patterns[i];
        Regex regex = new Regex(regExPattern, RegexOptions.IgnoreCase);
        text = regex.Replace(text, replace[i]);
    }
    return text;
}
#endregion
#region CheckToken
/// <summary>
/// Check if a certain token just came along (e.g. BT): the token must sit right
/// before the most recent character, and both the most recent character and the
/// character in front of the token must be a space, CR or LF.
/// Tokens of any length are supported, including the one-character operators
/// ' and "; reading a second character from those used to throw.
/// </summary>
/// <param name="tokens">the searched tokens</param>
/// <param name="recent">the recent character array (_numberOfCharsToKeep long, newest last)</param>
/// <returns>true if any of the tokens matches</returns>
private bool CheckToken(string[] tokens, char[] recent)
{
    int newest = _numberOfCharsToKeep - 1;
    if (!IsTokenDelimiter(recent[newest]))
    {
        return false;
    }
    foreach (string token in tokens)
    {
        // Layout: [delimiter][token characters][newest = delimiter]
        int before = newest - token.Length - 1;
        if (token.Length == 0 || before < 0 || !IsTokenDelimiter(recent[before]))
        {
            continue;
        }
        bool matches = true;
        for (int k = 0; k < token.Length; k++)
        {
            if (recent[before + 1 + k] != token[k])
            {
                matches = false;
                break;
            }
        }
        if (matches)
        {
            return true;
        }
    }
    return false;
}

/// <summary>True for the characters that may surround a token: space, CR, LF.</summary>
private static bool IsTokenDelimiter(char c)
{
    return c == ' ' || c == 0x0d || c == 0x0a;
}
#endregion
}
}
Is there a specific reason you're not using the relatively new parsing classes? I don't know the Persian language, but the first Persian PDF I found on Google works, and it's much less code:
// Open the document and wrap it in a content parser.
PdfReader reader = new PdfReader(pdfPath);
PdfReaderContentParser contentParser = new PdfReaderContentParser(reader);
StringBuilder sb = new StringBuilder();

// Page numbers start at 1; collect the text of every page in order.
int pageNumber = 1;
while (pageNumber <= reader.NumberOfPages)
{
    SimpleTextExtractionStrategy listener = new SimpleTextExtractionStrategy();
    ITextExtractionStrategy pageStrategy = contentParser.ProcessContent(pageNumber, listener);
    string pageText = pageStrategy.GetResultantText();
    sb.Append(pageText);
    pageNumber++;
}
A number of bugs have been fixed recently, so I'm using the latest iTextSharp SVN build. Also your question title includes parsing images, but your code doesn't, so the example above is not extracting any images.

C# implementation of Google's 'Encoded Polyline Algorithm'

Does anyone have a concise and robust implementation of Google's Encoded Polyline Algorithm in C#?
I essentially want the implementation of this signature:
public string Encode(IEnumerable<Point> points);
Here's the implementation I settled on:
/// <summary>
/// Encodes a sequence of points as an encoded polyline string.
/// </summary>
/// <param name="points">The points to encode, in order.</param>
/// <returns>The encoded polyline; empty when there are no points.</returns>
public static string Encode(IEnumerable<GeoLocation> points)
{
    var encoded = new StringBuilder();

    // Writes one signed delta as a run of 5-bit chunks, lowest chunk first.
    Action<int> appendDelta = delta =>
    {
        // Shift left by one; negative values are inverted so that the
        // remaining value is non-negative and the sign sits in the low bit.
        int value = delta << 1;
        if (delta < 0)
        {
            value = ~value;
        }

        // Every chunk except the last carries the continuation flag 0x20.
        for (; value >= 0x20; value >>= 5)
        {
            encoded.Append((char)((0x20 | (value & 0x1f)) + 63));
        }
        encoded.Append((char)(value + 63));
    };

    int previousLat = 0;
    int previousLng = 0;
    foreach (var point in points)
    {
        // Coordinates are scaled by 1E5 and rounded before differencing.
        int currentLat = (int)Math.Round(point.Latitude * 1E5);
        int currentLng = (int)Math.Round(point.Longitude * 1E5);

        appendDelta(currentLat - previousLat);
        appendDelta(currentLng - previousLng);

        previousLat = currentLat;
        previousLng = currentLng;
    }

    return encoded.ToString();
}
Hope that helps someone else out.
Maybe it's late, but I've just solved the same problem, both encoding a list of locations and decoding polylines. I used http://json2csharp.com/ to generate the corresponding C# classes so that the response can be deserialized with JsonConvert, like this:
var googleDirectionsResponse = JsonConvert.DeserializeObject<RootObject>(responseString);
This gave me this definition of Location (I'll clean it sooner or later):
/// <summary>
/// A latitude/longitude pair. The lowercase member names come from the
/// generated class used to deserialize the response, so they are kept as is.
/// </summary>
public class Location
{
/// <summary>Latitude of the location.</summary>
public double lat { get; set; }
/// <summary>Longitude of the location.</summary>
public double lng { get; set; }
}
And I created a converter class to do the trick in both directions (it's not original; it is just a refactoring of this class: https://gist.github.com/shinyzhu/4617989 ):
/// <summary>
/// Google Polyline Converter (Encoder and Decoder)
/// </summary>
public static class GooglePolylineConverter
{
    /// <summary>
    /// Decodes the specified polyline string.
    /// </summary>
    /// <remarks>
    /// The argument is validated as soon as this method is called; the
    /// decoding itself runs lazily while the result is enumerated.
    /// A trailing latitude without a complete longitude is ignored.
    /// </remarks>
    /// <param name="polylineString">The polyline string.</param>
    /// <returns>The decoded locations, in order.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="polylineString"/> is null or empty.
    /// </exception>
    public static IEnumerable<Location> Decode(string polylineString)
    {
        // Validate here rather than inside the iterator: an iterator body
        // does not start running until the first MoveNext call, which would
        // postpone the exception to wherever the result is first enumerated.
        if (string.IsNullOrEmpty(polylineString))
        {
            throw new ArgumentNullException(nameof(polylineString));
        }

        return DecodeIterator(polylineString);
    }

    /// <summary>
    /// Lazily decodes an already validated polyline string.
    /// </summary>
    private static IEnumerable<Location> DecodeIterator(string encoded)
    {
        int index = 0;
        int currentLat = 0;
        int currentLng = 0;

        while (index < encoded.Length)
        {
            // Next lat
            int latDelta;
            ReadDelta(encoded, ref index, out latDelta);
            if (index >= encoded.Length)
            {
                // Nothing left for the longitude: drop the dangling latitude.
                yield break;
            }
            currentLat += latDelta;

            // Next lng
            int lngDelta;
            if (!ReadDelta(encoded, ref index, out lngDelta))
            {
                // The input ended in the middle of the longitude value.
                yield break;
            }
            currentLng += lngDelta;

            yield return new Location
            {
                lat = Convert.ToDouble(currentLat) / 1E5,
                lng = Convert.ToDouble(currentLng) / 1E5
            };
        }
    }

    /// <summary>
    /// Reads one variable-length signed value starting at <paramref name="index"/>
    /// and advances <paramref name="index"/> past the characters consumed.
    /// </summary>
    /// <returns>
    /// true when the value was terminated by a chunk without the continuation
    /// flag; false when the input ended while more chunks were expected.
    /// </returns>
    private static bool ReadDelta(string encoded, ref int index, out int delta)
    {
        int sum = 0;
        int shifter = 0;
        int nextFiveBits;
        do
        {
            nextFiveBits = encoded[index++] - 63;
            sum |= (nextFiveBits & 31) << shifter;
            shifter += 5;
        } while (nextFiveBits >= 32 && index < encoded.Length);

        // The lowest bit carries the sign; the remaining bits the magnitude.
        delta = (sum & 1) == 1 ? ~(sum >> 1) : (sum >> 1);
        return nextFiveBits < 32;
    }

    /// <summary>
    /// Encodes the specified locations list.
    /// </summary>
    /// <param name="locations">The locations.</param>
    /// <returns>The polyline string; empty when there are no locations.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="locations"/> is null.
    /// </exception>
    public static string Encode(IEnumerable<Location> locations)
    {
        if (locations == null)
        {
            throw new ArgumentNullException(nameof(locations));
        }

        var str = new StringBuilder();
        var lastLat = 0;
        var lastLng = 0;
        foreach (var point in locations)
        {
            // Scale and round each coordinate first, then encode the
            // difference to the previous rounded coordinate.
            var lat = (int)Math.Round(point.lat * 1E5);
            var lng = (int)Math.Round(point.lng * 1E5);
            AppendDelta(str, lat - lastLat);
            AppendDelta(str, lng - lastLng);
            lastLat = lat;
            lastLng = lng;
        }
        return str.ToString();
    }

    /// <summary>
    /// Appends one signed delta to <paramref name="str"/> as a run of
    /// 5-bit chunks, lowest chunk first, each offset by 63.
    /// </summary>
    private static void AppendDelta(StringBuilder str, int diff)
    {
        var shifted = diff << 1;
        if (diff < 0)
        {
            shifted = ~shifted;
        }

        var rem = shifted;
        while (rem >= 0x20)
        {
            // 0x20 flags that another chunk follows.
            str.Append((char)((0x20 | (rem & 0x1f)) + 63));
            rem >>= 5;
        }
        str.Append((char)(rem + 63));
    }
}
I hope it helps.
Javascript implementation, in case anyone is interested:
// Encoder for Google's Encoded Polyline Algorithm Format.
// Exposes:
//   encode_polyline(points)      - points is an array of [lat, lng] pairs
//   encode_polyline_value(value) - encodes a single unscaled coordinate value
var polyline_encoder = (function() {
    var _ = {};

    // Encodes one signed integer, i.e. a value that has already been
    // multiplied by 1e5 and rounded (or the difference of two such values).
    var encode_integer = function(num) {
        var ret = "";
        // Shift left by one; a negative result is bit-inverted so that the
        // remaining value is non-negative and the sign sits in the low bit.
        var rem = num << 1;
        if(rem < 0)
            rem = ~rem;
        // Emit 5-bit chunks, lowest first. 0x20 flags "another chunk follows".
        // String.fromCharCode covers the whole 63..126 range, including
        // "?" (63, a zero chunk) and "@" (64).
        while(rem >= 32) {
            ret += String.fromCharCode((0x20 | (rem & 0x1f)) + 63);
            rem >>= 5;
        }
        ret += String.fromCharCode(rem + 63);
        return ret;
    };

    // Encodes a list of [lat, lng] pairs. Returns "" for an empty list.
    _.encode_polyline = function(points) {
        var ret = "";
        var last_lat = 0, last_lng = 0;
        for(var i=0; i<points.length; i++) {
            // Round every coordinate first and difference the rounded
            // integers; rounding the float difference instead would let
            // rounding errors accumulate along the line.
            var lat = Math.round(points[i][0] * 100000);
            var lng = Math.round(points[i][1] * 100000);
            ret += encode_integer(lat - last_lat) + encode_integer(lng - last_lng);
            last_lat = lat;
            last_lng = lng;
        }
        return ret;
    };

    // Encodes a single unscaled value: scales by 1e5, rounds, then encodes.
    _.encode_polyline_value = function(value) {
        return encode_integer(Math.round(value * 100000));
    };

    return _;
})();

Categories

Resources