.NET 6 System.Text.Json.JsonSerializer Deserialize UTF-8 escaped string - c#

I have some JSON files (a Facebook backup) which are UTF-8 encoded, but special characters are escaped. The escaped characters are also UTF-8 encoded, but written as hexadecimal escape sequences. For example:
{
"sender_name": "Tam\u00c3\u00a1s"
}
I want to use System.Text.Json.JsonSerializer for deserialization. The problem is that it interprets the escaped hex values as UTF-16 characters.
So it is deserialized as "TamÃ¡s", not as "Tamás" as it should be.
Code to repro:
using System;
using System.Text.Json;
using System.Text.Json.Serialization;
class Msg
{
[JsonPropertyName("sender_name")]
public string SenderName { get; set; }
}
public class Program
{
public static void Main()
{
var data = #"{
""sender_name"": ""Tam\u00c3\u00a1s""
}";
var msg = JsonSerializer.Deserialize<Msg>(data);
Console.WriteLine(msg.SenderName);
}
}
Can I change the serializer to interpret these escapes as UTF-8?

The problem here is that the sender of your JSON has used the wrong values \u00c3 and \u00a1 as the numeric escape codes for á inside their string literal. The meaning of the \uXXXX escape sequences is specified by the JSON Proposal as well as the JSON Standard. It is defined such that XXXX is the character's "4HEXDIG" UTF-16 Unicode codepoint value [1], which, for á, is \u00E1. Instead, the provider of your JSON file (Facebook's "backup your data" feature, apparently) is using UTF-8 byte values for the \uXXXX escape sequences, rather than UTF-16 code units as required by the standard.
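To see the mismatch concretely, here is a small stand-alone snippet (an illustration only, not part of the fix) that prints both representations of á. The UTF-8 bytes C3 A1 are exactly the values that appear in the file as \u00c3\u00a1, while the standard JSON escape would be \u00e1:
using System;
using System.Text;
class EscapeDemo
{
    public static void Main()
    {
        // UTF-8 encodes 'á' (U+00E1) as the two bytes 0xC3 0xA1.
        byte[] utf8 = Encoding.UTF8.GetBytes("á");
        Console.WriteLine(BitConverter.ToString(utf8)); // prints C3-A1
        // The JSON standard escape uses the UTF-16 code point instead.
        Console.WriteLine(((int)'á').ToString("X4")); // prints 00E1
    }
}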
There is no built-in way to tell System.Text.Json (or Json.NET for that matter) that the \uXXXX escape sequences use nonstandard values, however Utf8JsonReader provides access to the underlying, raw byte stream via the ValueSpan and ValueSequence properties, so it is possible to create a custom JsonConverter<string> that does the necessary decoding and unescaping itself.
First, create the following converter:
public class StringConverterForUtf8EscapedCharValues : JsonConverter<string>
{
public override string? Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
{
if (reader.TokenType != JsonTokenType.String)
throw new JsonException();
if (!reader.ValueIsEscaped)
return reader.GetString();
ReadOnlySpan<byte> span = reader.HasValueSequence ? reader.ValueSequence.ToArray() : reader.ValueSpan;
// Normally a JSON string will be a UTF-8 byte sequence with embedded UTF-16 escape codes.
// These improperly encoded JSON strings are UTF-8 byte sequences with embedded UTF-8 escape codes.
var encoding = Encoding.UTF8;
var decoder = encoding.GetDecoder();
var sb = new StringBuilder();
var maxCharCount = Encoding.UTF8.GetMaxCharCount(4);
for (int i = 0; i < span.Length; i++)
{
if (span[i] != '\\')
{
Span<char> chars = stackalloc char[maxCharCount];
var n = decoder.GetChars(span.Slice(i, 1), chars, false);
sb.Append(chars.Slice(0, n));
}
else if (i < span.Length - 1 && span[i+1] == '"')
{
sb.Append('"');
i++;
}
else if (i < span.Length - 1 && span[i+1] == '\\')
{
sb.Append('\\');
i++;
}
else if (i < span.Length - 1 && span[i+1] == '/')
{
sb.Append('/');
i++;
}
else if (i < span.Length - 1 && span[i+1] == 'b')
{
sb.Append('\u0008');
i++;
}
else if (i < span.Length - 1 && span[i+1] == 'f')
{
sb.Append('\u000C');
i++;
}
else if (i < span.Length - 1 && span[i+1] == 'n')
{
sb.Append('\n');
i++;
}
else if (i < span.Length - 1 && span[i+1] == 'r')
{
sb.Append('\r');
i++;
}
else if (i < span.Length - 1 && span[i+1] == 't')
{
sb.Append('\t');
i++;
}
else if (i < span.Length - 5 && span[i+1] == 'u')
{
Span<char> hexchars = stackalloc char[4] { (char)span[i+2], (char)span[i+3], (char)span[i+4], (char)span[i+5] };
if (!byte.TryParse(hexchars, NumberStyles.HexNumber, NumberFormatInfo.InvariantInfo, out var b))
{
throw new JsonException();
}
Span<char> chars = stackalloc char[maxCharCount];
Span<byte> bytes = stackalloc byte[1] { b };
var n = decoder.GetChars(bytes, chars, false);
sb.Append(chars.Slice(0, n));
i += 5;
}
else
{
throw new JsonException();
}
}
var s = sb.ToString();
return s;
}
public override void Write(Utf8JsonWriter writer, string value, JsonSerializerOptions options) => writer.WriteStringValue(value);
}
And now you will be able to do
var options = new JsonSerializerOptions
{
Converters = { new StringConverterForUtf8EscapedCharValues() },
};
var msg = JsonSerializer.Deserialize<Msg>(data, options);
Assert.That(msg?.SenderName?.StartsWith("Tamás") == true); // Succeeds
Console.WriteLine(msg?.SenderName); // Prints Tamás
Notes:
Since a JSON file is generally a UTF-8 encoded character stream, decoding a single string literal in a well-formed JSON file can require decoding a mixture of UTF-8 and UTF-16 values.
The converter may not work if the underlying byte stream was not encoded using UTF-8.
Writing with (incorrect) UTF-8 values for escaped characters is not implemented.
The incorrect escaped values should be fixed before the JSON string literal is decoded to a C# string, because the presence or absence of escape sequences is lost once decoding and unescaping are complete.
I haven't tested performance. It might be more performant to use the decoder returned by Encoding.UTF8.GetDecoder() to decode in chunks, rather than byte-by-byte as is done in this prototype.
[1] Characters not in the Basic Multilingual Plane should use two sequential escape sequences, e.g. \uD834\uDD1E

Try this code:
var msg = JsonSerializer.Deserialize<Msg>(data);
msg.SenderName = DecodeFromUtf16ToUtf8(msg.SenderName); // Tamás
public string DecodeFromUtf16ToUtf8(string utf16String)
{
// reinterpret each UTF-16 code unit as a raw byte, then decode those bytes as UTF-8
byte[] utf8Bytes = new byte[utf16String.Length];
for (int i = 0; i < utf16String.Length; ++i)
utf8Bytes[i] = (byte)utf16String[i];
return Encoding.UTF8.GetString(utf8Bytes, 0, utf8Bytes.Length);
}
Or you can add a JSON constructor:
var msg = System.Text.Json.JsonSerializer.Deserialize<Msg>(data);
public class Msg
{
[JsonPropertyName("sender_name")]
public string SenderName { get; set; }
public Msg(string SenderName)
{
this.SenderName = DecodeFromUtf16ToUtf8(SenderName);
}
}
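For completeness, here is a self-contained sketch of the constructor approach with the helper made a static method so the constructor can reach it (a minimal sketch of the idea above, not tested against a full Facebook export; the [JsonConstructor] attribute just makes the intent explicit):
using System.Text;
using System.Text.Json.Serialization;
public class Msg
{
    [JsonPropertyName("sender_name")]
    public string SenderName { get; set; }
    [JsonConstructor]
    public Msg(string senderName)
    {
        // The serializer hands us the mis-decoded UTF-16 string; reinterpret it as raw UTF-8 bytes.
        SenderName = DecodeFromUtf16ToUtf8(senderName);
    }
    private static string DecodeFromUtf16ToUtf8(string utf16String)
    {
        byte[] utf8Bytes = new byte[utf16String.Length];
        for (int i = 0; i < utf16String.Length; ++i)
            utf8Bytes[i] = (byte)utf16String[i];
        return Encoding.UTF8.GetString(utf8Bytes);
    }
}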

Related

Is there built-in support for automatically escaping invalid HTTP Header characters in .NET Core?

In .NET Framework, there is the method System.Web.Util.HttpEncoder.HeaderNameValueEncode(), which (as the name suggests) encodes characters (such as newlines) that would be invalid in HTTP headers so that they are appropriately escaped.
Is there an equivalent method in .NET Core? I'm aware of System.Text.Encodings.Web.HtmlEncoder.Default.Encode(), but this encodes more than is needed (such as single quotes) for HTTP Headers.
EDIT:
This question is NOT a duplicate: I'm not looking for escaping characters in URLs; I'm looking for built-in functionality for escaping characters in HTTP header values.
.NET Core doesn't implement .NET Framework's HeaderNameValueEncode(), not even in version 3.
You can find and use the Mono implementation here: https://github.com/mono/mono/blob/master/mcs/class/System.Web/System.Web.Util/HttpEncoder.cs
I prefer a single-method implementation using LINQ, such as:
public static void HeaderNameValueEncode(string headerName, string headerValue, out string encodedHeaderName, out string encodedHeaderValue)
{
if (string.IsNullOrEmpty(headerName))
{
encodedHeaderName = headerName;
}
else
{
var sb = new StringBuilder();
headerName.All(ch => { if ((ch == 9 || ch >= 32) && ch != 127) sb.Append(ch); return true; });
encodedHeaderName = sb.ToString();
}
if (string.IsNullOrEmpty(headerValue))
{
encodedHeaderValue = headerValue;
}
else
{
var sb = new StringBuilder();
headerValue.All(ch => { if ((ch == 9 || ch >= 32) && ch != 127) sb.Append(ch); return true; });
encodedHeaderValue = sb.ToString();
}
}
or, if you prefer to convert only one string:
public static string HeaderNameOrValueEncode(string headerString)
{
if (string.IsNullOrEmpty(headerString))
{
return headerString;
}
else
{
var sb = new StringBuilder();
headerString.All(ch => { if ((ch == 9 || ch >= 32) && ch != 127) sb.Append(ch); return true; });
return sb.ToString();
}
}
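A quick usage sketch (the header value here is just an illustrative example):
var raw = "attachment; filename=report.csv\r\nX-Injected: 1";
var safe = HeaderNameOrValueEncode(raw);
// CR/LF and other control characters are stripped, so the value can no longer smuggle an extra header line;
// tabs and printable characters are kept.
Console.WriteLine(safe);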

How to exchange numbers to alphabet and alphabet to numbers in a string?

How do I convert each digit in a string to its equivalent alphabet character, and each alphabet character to its numeric value (except 0, which should stay 0 for obvious reasons)?
So basically if there is a string
string content="D93AK0F5I";
How can I convert it to this?
string new_content="4IC11106E9";
I'm assuming you're aware this is not reversible, and that you're only using upper case and digits. Here you go...
private string Transpose(string input)
{
StringBuilder result = new StringBuilder();
foreach (var character in input)
{
if (character == '0')
{
result.Append(character);
}
else if (character >= '1' && character <= '9')
{
int offset = character - '1';
char replacement = (char)('A' + offset);
result.Append(replacement);
}
else if (character >= 'A' && character <= 'Z') // I'm assuming upper case only; feel free to duplicate for lower case
{
int offset = character - 'A' + 1;
result.Append(offset);
}
else
{
throw new ApplicationException($"Unexpected character: {character}");
}
}
return result.ToString();
}
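For example, calling it with the string from the question gives the expected result:
string content = "D93AK0F5I";
Console.WriteLine(Transpose(content)); // prints 4IC11106E9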
Well, if you only need a one-way translation, here is quite a simple way to do it using LINQ:
string convert(string input)
{
var chars = "0abcdefghijklmnopqrstuvwxyz";
return string.Join("",
input.Select(
c => char.IsDigit(c) ?
chars[int.Parse(c.ToString())].ToString() :
(chars.IndexOf(char.ToLowerInvariant(c))).ToString())
);
}
You can see a live demo on rextester.
You can use an ArrayList of alphabet letters. For example:
ArrayList alphabets = new ArrayList();
alphabets.Add("A");
alphabets.Add("B");
and so on.
Now parse your string character by character:
string s = "1BC34D";
char[] characters = s.ToCharArray();
for (int i = 0; i < characters.Length; i++)
{
if (characters[i] == '0')
{
// '0' stays '0', as the question requires
continue;
}
if (Char.IsNumber(characters[i]))
{
var index = characters[i] - '1'; // '1' maps to index 0, i.e. "A"
var stringAlphabet = alphabets[index];
}
else
{
var digitCharacter = alphabets.IndexOf(characters[i].ToString()) + 1;
}
}
This way you can get the alphabet representation of a number and the numeric representation of an alphabet letter.

Reversing of an Arabic string resulting from abcpdf .net

I have used abcpdf.net to convert an Arabic PDF through its read(pdfpath) and gettext() functions. The resulting text (string) looks like the sample below; it's unreadable, since Arabic is an RTL language. My question is: I now have to reverse the Arabic parts of the string to make it readable, but I don't know how to do that. How can I extract only the Arabic parts and then reverse them?
I am using C#, and here is a sample of the string extracted from my PDF using the abcpdf .NET library:
0.00
KCCUSER1
6:17:19PM28/10/2010ةعابطلا خيرات
(200) لوادتملا زكارمو تاكرح
ةصاقملل ةيتيوكلا ةكرشلا
28/10/2010
RBKPI012
لمعلا خيرات
عمجم/ ح - 88لجلا عيبلل افيا ةيلودلا ةيلاملا تاراشتسلا ةكرش - 65646
C023
يحاتتفلا ديصرلا
private string Convert(string source)
{
string arabicWord = string.Empty;
StringBuilder sbDestination = new StringBuilder();
foreach (var ch in source)
{
if (IsArabic(ch))
arabicWord += ch;
else
{
if (arabicWord != string.Empty)
sbDestination.Append(Reverse(arabicWord));
sbDestination.Append(ch);
arabicWord = string.Empty;
}
}
// if the last word was arabic
if (arabicWord != string.Empty)
sbDestination.Append(Reverse(arabicWord));
return sbDestination.ToString();
}
IsArabic method from here
private bool IsArabic(char character)
{
if (character >= 0x600 && character <= 0x6ff)
return true;
if (character >= 0x750 && character <= 0x77f)
return true;
if (character >= 0xfb50 && character <= 0xfc3f)
return true;
if (character >= 0xfe70 && character <= 0xfefc)
return true;
return false;
}
// Reverse the characters of string
string Reverse(string source)
{
return new string(source.ToCharArray().Reverse().ToArray());
}
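A minimal usage sketch, taking one of the extracted lines above as input (only the contiguous Arabic runs are reversed; Latin text, digits and spaces stay where they are):
string line = "6:17:19PM28/10/2010ةعابطلا خيرات";
Console.WriteLine(Convert(line));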
Good luck!

What is the best algorithm for arbitrary delimiter/escape character processing?

I'm a little surprised that there isn't some information on this on the web, and I keep finding that the problem is a little stickier than I thought.
Here's the rules:
You are starting with delimited/escaped data to split into an array.
The delimiter is one arbitrary character
The escape character is one arbitrary character
Both the delimiter and the escape character could occur in data
Regex is fine, but a good-performance solution is best
Edit: Empty elements (including leading or ending delimiters) can be ignored
The code signature (in C#) would be, basically:
public static string[] smartSplit(
string delimitedData,
char delimiter,
char escape) {}
The stickiest part of the problem is the escaped consecutive escape character case, of course, since (calling / the escape character and , the delimiter): ////////, = ////,
Am I missing somewhere this is handled on the web or in another SO question? If not, put your big brains to work... I think this problem is something that would be nice to have on SO for the public good. I'm working on it myself, but don't have a good solution yet.
A simple state machine is usually the easiest and fastest way. Example in Python:
def extract(input, delim, escape):
# states
parsing = 0
escaped = 1
state = parsing
found = []
parsed = ""
for c in input:
if state == parsing:
if c == delim:
found.append(parsed)
parsed = ""
elif c == escape:
state = escaped
else:
parsed += c
else: # state == escaped
parsed += c
state = parsing
if parsed:
found.append(parsed)
return found
void smartSplit(string const& text, char delim, char esc, vector<string>& tokens)
{
enum State { NORMAL, IN_ESC };
State state = NORMAL;
string frag;
for (size_t i = 0; i<text.length(); ++i)
{
char c = text[i];
switch (state)
{
case NORMAL:
if (c == delim)
{
if (!frag.empty())
tokens.push_back(frag);
frag.clear();
}
else if (c == esc)
state = IN_ESC;
else
frag.append(1, c);
break;
case IN_ESC:
frag.append(1, c);
state = NORMAL;
break;
}
}
if (!frag.empty())
tokens.push_back(frag);
}
private static string[] Split(string input, char delimiter, char escapeChar, bool removeEmpty)
{
if (input == null)
{
return new string[0];
}
char[] specialChars = new char[]{delimiter, escapeChar};
var tokens = new List<string>();
var token = new StringBuilder();
for (int i = 0; i < input.Length; i++)
{
var c = input[i];
if (c.Equals(escapeChar))
{
if (i >= input.Length - 1)
{
throw new ArgumentException("Uncompleted escape sequence has been encountered at the end of the input");
}
var nextChar = input[i + 1];
if (nextChar != escapeChar && nextChar != delimiter)
{
throw new ArgumentException("Unknown escape sequence has been encountered: " + c + nextChar);
}
token.Append(nextChar);
i++;
}
else if (c.Equals(delimiter))
{
if (!removeEmpty || token.Length > 0)
{
tokens.Add(token.ToString());
token.Length = 0;
}
}
else
{
var index = input.IndexOfAny(specialChars, i);
if (index < 0)
{
token.Append(c);
}
else
{
token.Append(input.Substring(i, index - i));
i = index - 1;
}
}
}
if (!removeEmpty || token.Length > 0)
{
tokens.Add(token.ToString());
}
return tokens.ToArray();
}
The implementation of this kind of tokenizer in terms of an FSM is fairly straightforward.
You do have a few decisions to make (like, what do I do with leading delimiters? strip or emit NULL tokens).
Here is an abstract version which ignores leading and multiple delimiters, and doesn't allow escaping the newline:
state(input) action
========================
BEGIN(*): token.clear(); state=START;
END(*): return;
*(\n\0): token.emit(); state=END;
START(DELIMITER): ; // NB: the input is *not* added to the token!
START(ESCAPE): state=ESC; // NB: the input is *not* added to the token!
START(*): token.append(input); state=NORM;
NORM(DELIMITER): token.emit(); token.clear(); state=START;
NORM(ESCAPE): state=ESC; // NB: the input is *not* added to the token!
NORM(*): token.append(input);
ESC(*): token.append(input); state=NORM;
This kind of implementation has the advantage of dealing with consecutive escapes naturally, and can be easily extended to give special meaning to more escape sequences (i.e. add a rule like ESC(t): token.append(TAB)).
Here's my ported function in C#
public static void smartSplit(string text, char delim, char esc, ref List<string> listToBuild)
{
bool currentlyEscaped = false;
StringBuilder fragment = new StringBuilder();
for (int i = 0; i < text.Length; i++)
{
char c = text[i];
if (currentlyEscaped)
{
fragment.Append(c);
currentlyEscaped = false;
}
else
{
if (c == delim)
{
if (fragment.Length > 0)
{
listToBuild.Add(fragment.ToString());
fragment.Remove(0, fragment.Length);
}
}
else if (c == esc)
currentlyEscaped = true;
else
fragment.Append(c);
}
}
if (fragment.Length > 0)
{
listToBuild.Add(fragment.ToString());
}
}
Hope this helps someone in the future. Thanks to KenE for pointing me in the right direction.
Here's a more idiomatic and readable way to do it:
public IEnumerable<string> SplitAndUnescape(
string encodedString,
char separator,
char escape)
{
var inEscapeSequence = false;
var currentToken = new StringBuilder();
foreach (var currentCharacter in encodedString)
if (inEscapeSequence)
{
currentToken.Append(currentCharacter);
inEscapeSequence = false;
}
else
if (currentCharacter == escape)
inEscapeSequence = true;
else
if (currentCharacter == separator)
{
yield return currentToken.ToString();
currentToken.Clear();
}
else
currentToken.Append(currentCharacter);
yield return currentToken.ToString();
}
Note that this doesn't remove empty elements. I don't think that should be the responsibility of the parser. If you want to remove them, just call Where(item => item.Any()) on the result.
I think this is too much logic for a single method; it gets hard to follow. If someone has time, I think it would be better to break it up into multiple methods and maybe its own class.
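A short usage sketch, with ',' as the separator and '/' as the escape character (hypothetical input; requires using System.Linq for Where/Any):
var tokens = SplitAndUnescape("a/,b,c//d,", ',', '/')
    .Where(item => item.Any()) // drop the empty trailing element, as suggested above
    .ToList();
// tokens now contains "a,b" and "c/d"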
You're looking for something like a "string tokenizer". There's a version I found quickly that's similar. Or look at getopt.

How do you remove invalid hexadecimal characters from an XML-based data source prior to constructing an XmlReader or XPathDocument that uses the data?

Is there any easy/general way to clean an XML based data source prior to using it in an XmlReader so that I can gracefully consume XML data that is non-conformant to the hexadecimal character restrictions placed on XML?
Note:
The solution needs to handle XML data sources that use character encodings other than UTF-8, e.g. by specifying the character encoding in the XML document declaration. Not mangling the character encoding of the source while stripping invalid hexadecimal characters has been a major sticking point.
The removal of invalid hexadecimal characters should only remove hexadecimal encoded values, as you can often find href values in data that happen to contain a string that would match a hexadecimal character reference.
Background:
I need to consume an XML-based data source that conforms to a specific format (think Atom or RSS feeds), but want to be able to consume data sources that have been published which contain invalid hexadecimal characters per the XML specification.
In .NET if you have a Stream that represents the XML data source, and then attempt to parse it using an XmlReader and/or XPathDocument, an exception is raised due to the inclusion of invalid hexadecimal characters in the XML data. My current attempt to resolve this issue is to parse the Stream as a string and use a regular expression to remove and/or replace the invalid hexadecimal characters, but I am looking for a more performant solution.
It may not be perfect (emphasis added since people keep missing this disclaimer), but what I've done in that case is below. You can adjust it to use with a stream.
/// <summary>
/// Removes control characters and other non-UTF-8 characters
/// </summary>
/// <param name="inString">The string to process</param>
/// <returns>A string with no control characters or entities above 0x00FD</returns>
public static string RemoveTroublesomeCharacters(string inString)
{
if (inString == null) return null;
StringBuilder newString = new StringBuilder();
char ch;
for (int i = 0; i < inString.Length; i++)
{
ch = inString[i];
// remove any characters outside the valid UTF-8 range as well as all control characters
// except tabs and new lines
//if ((ch < 0x00FD && ch > 0x001F) || ch == '\t' || ch == '\n' || ch == '\r')
//if using .NET version prior to 4, use above logic
if (XmlConvert.IsXmlChar(ch)) //this method is new in .NET 4
{
newString.Append(ch);
}
}
return newString.ToString();
}
I like Eugene's whitelist concept. I needed to do a similar thing as the original poster, but I needed to support all Unicode characters, not just up to 0x00FD. The XML spec is:
Char = #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
In .NET, the internal representation of Unicode characters is only 16 bits, so we can't "allow" 0x10000-0x10FFFF explicitly. The XML spec explicitly disallows the surrogate code points starting at 0xD800 from appearing. However, it is possible that if we allowed these surrogate code points in our whitelist, UTF-8 encoding our string might still produce valid XML in the end, as long as proper UTF-8 encoding was produced from the surrogate pairs of UTF-16 characters in the .NET string. I haven't explored this, though, so I went with the safer bet and didn't allow the surrogates in my whitelist.
The comments in Eugene's solution are misleading, though; the problem is that the characters we are excluding are not valid in XML, even though they are perfectly valid Unicode code points. We are not removing "non-UTF-8 characters"; we are removing UTF-8 characters that may not appear in well-formed XML documents.
public static string XmlCharacterWhitelist( string in_string ) {
if( in_string == null ) return null;
StringBuilder sbOutput = new StringBuilder();
char ch;
for( int i = 0; i < in_string.Length; i++ ) {
ch = in_string[i];
if( ( ch >= 0x0020 && ch <= 0xD7FF ) ||
( ch >= 0xE000 && ch <= 0xFFFD ) ||
ch == 0x0009 ||
ch == 0x000A ||
ch == 0x000D ) {
sbOutput.Append( ch );
}
}
return sbOutput.ToString();
}
As a way to remove invalid XML characters, I suggest using the XmlConvert.IsXmlChar method. It was added in .NET Framework 4 and is present in Silverlight too. Here is a small sample:
void Main() {
string content = "\v\f\0";
Console.WriteLine(IsValidXmlString(content)); // False
content = RemoveInvalidXmlChars(content);
Console.WriteLine(IsValidXmlString(content)); // True
}
static string RemoveInvalidXmlChars(string text) {
char[] validXmlChars = text.Where(ch => XmlConvert.IsXmlChar(ch)).ToArray();
return new string(validXmlChars);
}
static bool IsValidXmlString(string text) {
try {
XmlConvert.VerifyXmlChars(text);
return true;
} catch {
return false;
}
}
DRY implementation of this answer's solution (using a different constructor - feel free to use the one you need in your application):
public class InvalidXmlCharacterReplacingStreamReader : StreamReader
{
private readonly char _replacementCharacter;
public InvalidXmlCharacterReplacingStreamReader(string fileName, char replacementCharacter) : base(fileName)
{
this._replacementCharacter = replacementCharacter;
}
public override int Peek()
{
int ch = base.Peek();
if (ch != -1 && IsInvalidChar(ch))
{
return this._replacementCharacter;
}
return ch;
}
public override int Read()
{
int ch = base.Read();
if (ch != -1 && IsInvalidChar(ch))
{
return this._replacementCharacter;
}
return ch;
}
public override int Read(char[] buffer, int index, int count)
{
int readCount = base.Read(buffer, index, count);
for (int i = index; i < readCount + index; i++)
{
char ch = buffer[i];
if (IsInvalidChar(ch))
{
buffer[i] = this._replacementCharacter;
}
}
return readCount;
}
private static bool IsInvalidChar(int ch)
{
return (ch < 0x0020 || ch > 0xD7FF) &&
(ch < 0xE000 || ch > 0xFFFD) &&
ch != 0x0009 &&
ch != 0x000A &&
ch != 0x000D;
}
}
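A hypothetical usage sketch, assuming a file named dirty.xml on disk and '?' as the replacement character:
using (var reader = new InvalidXmlCharacterReplacingStreamReader("dirty.xml", '?'))
{
    var doc = XDocument.Load(reader); // XmlReader.Create(reader) works the same way
    Console.WriteLine(doc.Root.Name);
}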
Modernising dnewcombe's answer, you could take a slightly simpler approach
public static string RemoveInvalidXmlChars(string input)
{
var isValid = new Predicate<char>(value =>
(value >= 0x0020 && value <= 0xD7FF) ||
(value >= 0xE000 && value <= 0xFFFD) ||
value == 0x0009 ||
value == 0x000A ||
value == 0x000D);
return new string(Array.FindAll(input.ToCharArray(), isValid));
}
or, with Linq
public static string RemoveInvalidXmlChars(string input)
{
return new string(input.Where(value =>
(value >= 0x0020 && value <= 0xD7FF) ||
(value >= 0xE000 && value <= 0xFFFD) ||
value == 0x0009 ||
value == 0x000A ||
value == 0x000D).ToArray());
}
I'd be interested to know how the performance of these methods compares and how they all compare to a black list approach using Buffer.BlockCopy.
Here is dnewcome's answer in a custom StreamReader. It simply wraps a real stream reader and replaces the characters as they are read.
I only implemented a few methods to save myself time. I used this in conjunction with XDocument.Load and a file stream and only the Read(char[] buffer, int index, int count) method was called, so it worked like this. You may need to implement additional methods to get this to work for your application. I used this approach because it seems more efficient than the other answers. I also only implemented one of the constructors, you could obviously implement any of the StreamReader constructors that you need, since it is just a pass through.
I chose to replace the characters rather than removing them because it greatly simplifies the solution. In this way the length of the text stays the same, so there is no need to keep track of a separate index.
public class InvalidXmlCharacterReplacingStreamReader : TextReader
{
private StreamReader implementingStreamReader;
private char replacementCharacter;
public InvalidXmlCharacterReplacingStreamReader(Stream stream, char replacementCharacter)
{
implementingStreamReader = new StreamReader(stream);
this.replacementCharacter = replacementCharacter;
}
public override void Close()
{
implementingStreamReader.Close();
}
public override ObjRef CreateObjRef(Type requestedType)
{
return implementingStreamReader.CreateObjRef(requestedType);
}
public void Dispose()
{
implementingStreamReader.Dispose();
}
public override bool Equals(object obj)
{
return implementingStreamReader.Equals(obj);
}
public override int GetHashCode()
{
return implementingStreamReader.GetHashCode();
}
public override object InitializeLifetimeService()
{
return implementingStreamReader.InitializeLifetimeService();
}
public override int Peek()
{
int ch = implementingStreamReader.Peek();
if (ch != -1)
{
if (
(ch < 0x0020 || ch > 0xD7FF) &&
(ch < 0xE000 || ch > 0xFFFD) &&
ch != 0x0009 &&
ch != 0x000A &&
ch != 0x000D
)
{
return replacementCharacter;
}
}
return ch;
}
public override int Read()
{
int ch = implementingStreamReader.Read();
if (ch != -1)
{
if (
(ch < 0x0020 || ch > 0xD7FF) &&
(ch < 0xE000 || ch > 0xFFFD) &&
ch != 0x0009 &&
ch != 0x000A &&
ch != 0x000D
)
{
return replacementCharacter;
}
}
return ch;
}
public override int Read(char[] buffer, int index, int count)
{
int readCount = implementingStreamReader.Read(buffer, index, count);
for (int i = index; i < readCount+index; i++)
{
char ch = buffer[i];
if (
(ch < 0x0020 || ch > 0xD7FF) &&
(ch < 0xE000 || ch > 0xFFFD) &&
ch != 0x0009 &&
ch != 0x000A &&
ch != 0x000D
)
{
buffer[i] = replacementCharacter;
}
}
return readCount;
}
public override Task<int> ReadAsync(char[] buffer, int index, int count)
{
throw new NotImplementedException();
}
public override int ReadBlock(char[] buffer, int index, int count)
{
throw new NotImplementedException();
}
public override Task<int> ReadBlockAsync(char[] buffer, int index, int count)
{
throw new NotImplementedException();
}
public override string ReadLine()
{
throw new NotImplementedException();
}
public override Task<string> ReadLineAsync()
{
throw new NotImplementedException();
}
public override string ReadToEnd()
{
throw new NotImplementedException();
}
public override Task<string> ReadToEndAsync()
{
throw new NotImplementedException();
}
public override string ToString()
{
return implementingStreamReader.ToString();
}
}
Regex based approach
public static string StripInvalidXmlCharacters(string str)
{
var invalidXmlCharactersRegex = new Regex("[^\u0009\u000a\u000d\u0020-\ud7ff\ue000-\ufffd]|([\ud800-\udbff](?![\udc00-\udfff]))|((?<![\ud800-\udbff])[\udc00-\udfff])");
return invalidXmlCharactersRegex.Replace(str, "");
}
See my blogpost for more details
I created a slightly updated version of @Neolisk's answer, which supports the *Async functions and uses the .NET 4.0 XmlConvert.IsXmlChar function.
public class InvalidXmlCharacterReplacingStreamReader : StreamReader
{
private readonly char _replacementCharacter;
public InvalidXmlCharacterReplacingStreamReader(string fileName, char replacementCharacter) : base(fileName)
{
_replacementCharacter = replacementCharacter;
}
public InvalidXmlCharacterReplacingStreamReader(Stream stream, char replacementCharacter) : base(stream)
{
_replacementCharacter = replacementCharacter;
}
public override int Peek()
{
var ch = base.Peek();
if (ch != -1 && IsInvalidChar(ch))
{
return _replacementCharacter;
}
return ch;
}
public override int Read()
{
var ch = base.Read();
if (ch != -1 && IsInvalidChar(ch))
{
return _replacementCharacter;
}
return ch;
}
public override int Read(char[] buffer, int index, int count)
{
var readCount = base.Read(buffer, index, count);
ReplaceInBuffer(buffer, index, readCount);
return readCount;
}
public override async Task<int> ReadAsync(char[] buffer, int index, int count)
{
var readCount = await base.ReadAsync(buffer, index, count).ConfigureAwait(false);
ReplaceInBuffer(buffer, index, readCount);
return readCount;
}
private void ReplaceInBuffer(char[] buffer, int index, int readCount)
{
for (var i = index; i < readCount + index; i++)
{
var ch = buffer[i];
if (IsInvalidChar(ch))
{
buffer[i] = _replacementCharacter;
}
}
}
private static bool IsInvalidChar(int ch)
{
return IsInvalidChar((char)ch);
}
private static bool IsInvalidChar(char ch)
{
return !XmlConvert.IsXmlChar(ch);
}
}
The above solutions seem to be for removing invalid characters prior to converting to XML.
Use this code to remove invalid XML characters from an XML string, e.g. character references such as &#x1A;:
public static string CleanInvalidXmlChars( string Xml, string XMLVersion )
{
string pattern = String.Empty;
switch( XMLVersion )
{
case "1.0":
pattern = #"&#x((10?|[2-F])FFF[EF]|FDD[0-9A-F]|7F|8[0-46-9A-F]9[0-9A-F]);";
break;
case "1.1":
pattern = #"&#x((10?|[2-F])FFF[EF]|FDD[0-9A-F]|[19][0-9A-F]|7F|8[0-46-9A-F]|0?[1-8BCEF]);";
break;
default:
throw new Exception( "Error: Invalid XML Version!" );
}
Regex regex = new Regex( pattern, RegexOptions.IgnoreCase );
if( regex.IsMatch( Xml ) )
Xml = regex.Replace( Xml, String.Empty );
return Xml;
}
http://balajiramesh.wordpress.com/2008/05/30/strip-illegal-xml-characters-based-on-w3c-standard/
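A brief usage sketch (hypothetical input; &#x7F; is one of the references matched by the 1.0 pattern):
string dirty = "<note>bad reference: &#x7F; end</note>";
string clean = CleanInvalidXmlChars(dirty, "1.0");
// clean no longer contains the &#x7F; character reference; the rest of the markup is untouched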
Modified version of the original answer by Neolisk above.
Changes: if the \0 character is passed as the replacement, invalid characters are removed rather than replaced one-for-one; it also makes use of the XmlConvert.IsXmlChar(char) method.
/// <summary>
/// Replaces invalid Xml characters from input file, NOTE: if replacement character is \0, then invalid Xml character is removed, instead of 1-for-1 replacement
/// </summary>
public class InvalidXmlCharacterReplacingStreamReader : StreamReader
{
private readonly char _replacementCharacter;
public InvalidXmlCharacterReplacingStreamReader(string fileName, char replacementCharacter)
: base(fileName)
{
_replacementCharacter = replacementCharacter;
}
public override int Peek()
{
int ch = base.Peek();
if (ch != -1 && IsInvalidChar(ch))
{
if ('\0' == _replacementCharacter)
{
base.Read(); // consume the invalid character (Read would skip it anyway)
return Peek(); // and peek at the next one
}
return _replacementCharacter;
}
return ch;
}
public override int Read()
{
int ch = base.Read();
if (ch != -1 && IsInvalidChar(ch))
{
if ('\0' == _replacementCharacter)
return Read(); // read next one
return _replacementCharacter;
}
return ch;
}
public override int Read(char[] buffer, int index, int count)
{
int readCount= 0, ch;
for (int i = 0; i < count && (ch = Read()) != -1; i++)
{
readCount++;
buffer[index + i] = (char)ch;
}
return readCount;
}
private static bool IsInvalidChar(int ch)
{
return !XmlConvert.IsXmlChar((char)ch);
}
}
Use this function to remove invalid xml characters.
public static string CleanInvalidXmlChars(string text)
{
string re = #"[^\x09\x0A\x0D\x20-\xD7FF\xE000-\xFFFD\x10000-x10FFFF]";
return Regex.Replace(text, re, "");
}
private static String removeNonUtf8CompliantCharacters( final String inString ) {
if (null == inString ) return null;
byte[] byteArr = inString.getBytes();
for ( int i=0; i < byteArr.length; i++ ) {
byte ch= byteArr[i];
// remove any characters outside the valid UTF-8 range as well as all control characters
// except tabs and new lines
if ( !( (ch > 31 && ch < 253 ) || ch == '\t' || ch == '\n' || ch == '\r') ) {
byteArr[i]=' ';
}
}
return new String( byteArr );
}
You can keep characters outside the allowed range by writing them out as numeric character references with the following:
string sFinalString = "";
string hex = "";
foreach (char ch in UTFCHAR)
{
int tmp = ch;
if ((ch < 0x00FD && ch > 0x001F) || ch == '\t' || ch == '\n' || ch == '\r')
{
sFinalString += ch;
}
else
{
sFinalString += "&#" + tmp+";";
}
}
Try this for PHP!
$goodUTF8 = iconv("utf-8", "utf-8//IGNORE", $badUTF8);
