I have a string[] in which every element ends with a numeric value.
string[] partNumbers = new string[]
{
"ABC10", "ABC1","ABC2", "ABC11","ABC10", "AB1", "AB2", "Ab11"
};
I am trying to sort the above array as follows using LINQ but I am not getting the expected result.
var result = partNumbers.OrderBy(x => x);
Actual Result:
AB1
Ab11
AB2
ABC1
ABC10
ABC10
ABC11
ABC2
Expected Result
AB1
AB2
AB11
..
That is because the default ordering for string is standard alpha numeric dictionary (lexicographic) ordering, and ABC11 will come before ABC2 because ordering always proceeds from left to right.
To get what you want, you need to pad the numeric portion in your order by clause, something like:
var result = partNumbers.OrderBy(x => PadNumbers(x));
where PadNumbers could be defined as:
/// <summary>
/// Builds a sort key by left-padding every run of ASCII digits in
/// <paramref name="input"/> with '0' up to a width of 10 characters.
/// </summary>
/// <param name="input">The text whose digit runs should be padded.</param>
/// <returns>The input with each digit run zero-padded; digit runs of 10 or more characters are unchanged.</returns>
public static string PadNumbers(string input)
{
    MatchEvaluator zeroPad = digits => digits.Value.PadLeft(10, '0');
    return Regex.Replace(input, "[0-9]+", zeroPad);
}
This pads zeros for any number (or numbers) that appear in the input string so that OrderBy sees:
ABC0000000010
ABC0000000001
...
AB0000000011
The padding only happens on the key used for comparison. The original strings (without padding) are preserved in the result.
Note that this approach assumes a maximum number of digits for numbers in the input.
If you want to sort a list of objects by a specific property using LINQ and a custom comparer like the one by Dave Koelle you would do something like this:
...
items = items.OrderBy(x => x.property, new AlphanumComparator()).ToList();
...
You also have to alter Dave's class to inherit from System.Collections.Generic.IComparer<object> instead of the basic IComparer so the class signature becomes:
...
public class AlphanumComparator : System.Collections.Generic.IComparer<object>
{
...
Personally, I prefer the implementation by James McCormack because it implements IDisposable, though my benchmarking shows that it is slightly slower.
You can use PInvoke to get fast and good result:
/// <summary>
/// String comparer that delegates ordering to the native
/// <c>StrCmpLogicalW</c> function exported by <c>shlwapi.dll</c>.
/// </summary>
class AlphanumericComparer : IComparer<string>
{
    // Native entry point; strings are marshalled as Unicode.
    [DllImport("shlwapi.dll", CharSet = CharSet.Unicode)]
    static extern int StrCmpLogicalW(string s1, string s2);

    /// <summary>Returns the value reported by <c>StrCmpLogicalW</c> for the two strings.</summary>
    public int Compare(string x, string y)
    {
        int nativeOrder = StrCmpLogicalW(x, y);
        return nativeOrder;
    }
}
You can use it like AlphanumComparatorFast from the answer above.
You can PInvoke to StrCmpLogicalW (the windows function) to do this. See here: Natural Sort Order in C#
/// <summary>
/// Non-generic comparer that orders strings chunk by chunk, where a chunk is a
/// maximal run of digits or a maximal run of letters. Digit chunks that fit in
/// an <see cref="int"/> are compared as numbers; everything else is compared
/// with <see cref="string.CompareTo(string)"/>.
/// </summary>
/// <remarks>
/// Ordering rules for two numeric chunks, as implemented below:
/// equal values - the longer (more zero-padded) chunk sorts first;
/// different values - the chunk with more leading zeros sorts first, and only
/// when the leading-zero counts match are the values compared numerically.
/// Zero-valued chunks ("0", "00") follow the same rules as any other number.
/// </remarks>
public class AlphanumComparatorFast : IComparer
{
    /// <summary>
    /// Splits <paramref name="s1"/> into alternating digit / non-digit chunks.
    /// An apostrophe always starts a new chunk. Characters that are neither
    /// digits nor letters are dropped from the chunks.
    /// </summary>
    /// <returns>The chunk list; it always contains at least one (possibly empty) entry.</returns>
    List<string> GetList(string s1)
    {
        List<string> chunks = new List<string>();
        string current = "";

        // Guard the first-character probe so an empty string does not throw.
        bool inDigits = s1.Length > 0 && char.IsDigit(s1[0]);

        foreach (char c in s1)
        {
            bool isDigit = char.IsDigit(c);
            if (inDigits != isDigit || c == '\'')
            {
                if (current != "")
                {
                    chunks.Add(current);
                }
                current = "";
                inDigits = isDigit;
            }

            if (isDigit || char.IsLetter(c))
            {
                current += c;
            }
        }

        chunks.Add(current);
        return chunks;
    }

    /// <summary>
    /// Compares two objects as strings using the chunk rules described on the class.
    /// </summary>
    /// <returns>
    /// A negative, zero or positive value. Returns 0 when either argument is
    /// null or is not a string, and when both strings are equal.
    /// </returns>
    public int Compare(object x, object y)
    {
        string s1 = x as string;
        if (s1 == null)
        {
            return 0;
        }
        string s2 = y as string;
        if (s2 == null)
        {
            return 0;
        }
        if (s1 == s2)
        {
            return 0;
        }

        List<string> str1 = GetList(s1);
        List<string> str2 = GetList(s2);

        // Pad the shorter list with empty chunks so both can be walked in lock-step.
        while (str1.Count < str2.Count)
        {
            str1.Add("");
        }
        while (str2.Count < str1.Count)
        {
            str2.Add("");
        }

        for (int i = 0; i < str1.Count; i++)
        {
            int x1;
            int x2;
            // Use TryParse's return value (not the parsed value) to decide whether
            // a chunk is numeric, so that "0" and "00" count as numbers too.
            bool isNumber1 = int.TryParse(str1[i], out x1);
            bool isNumber2 = int.TryParse(str2[i], out x2);

            int result;
            if (isNumber1 && isNumber2)
            {
                if (x1 == x2)
                {
                    // Same value: the longer, more zero-padded spelling sorts first.
                    result = str2[i].Length.CompareTo(str1[i].Length);
                }
                else
                {
                    int zeros1 = str1[i].Length - str1[i].TrimStart('0').Length;
                    int zeros2 = str2[i].Length - str2[i].TrimStart('0').Length;
                    if (zeros1 > zeros2)
                    {
                        result = -1;
                    }
                    else if (zeros1 < zeros2)
                    {
                        result = 1;
                    }
                    else
                    {
                        result = x1.CompareTo(x2);
                    }
                }
            }
            else
            {
                // At least one side is text (or a digit run too long for int).
                result = str1[i].CompareTo(str2[i]);
            }

            if (result != 0)
            {
                return result;
            }
        }

        return 0;
    }
}
USAGE of this Class:
List<string> marks = new List<string>();
marks.Add("M'00Z1");
marks.Add("M'0A27");
marks.Add("M'00Z0");
marks.Add("0000A27");
marks.Add("100Z0");
string[] Markings = marks.ToArray();
Array.Sort(Markings, new AlphanumComparatorFast());
For those who like a generic approach, adjust Dave Koelle's AlphanumComparator slightly.
Step one (I rename the class to non-abbreviated and taking a TCompareType generic type argument):
public class AlphanumericComparator<TCompareType> : IComparer<TCompareType>
The next adjustment is to import the following namespace:
using System.Collections.Generic;
And we change the signature of the Compare method from object to TCompareType:
public int Compare(TCompareType x, TCompareType y)
{ .... no further modifications
Now we can specify the right type for the AlphanumericComparator.
(It should actually be called AlphanumericComparer I think), when we use it.
Example usage from my code:
if (result.SearchResults.Any()) {
result.SearchResults = result.SearchResults.OrderBy(item => item.Code, new AlphanumericComparator<string>()).ToList();
}
Now you have an alphanumeric comparator (comparer) that accepts generic arguments and can be used on different types.
And here is an extension method for using the comparator:
/// <summary>
/// Orders a sequence by the selected key using the generic alphanumeric comparer.
/// </summary>
/// <typeparam name="T">The item type of the sequence.</typeparam>
/// <typeparam name="TKey">The type of the key produced by <paramref name="keySelector"/>.</typeparam>
/// <param name="coll">The source sequence.</param>
/// <param name="keySelector">Function that extracts the sort key from each item.</param>
/// <returns>The items of <paramref name="coll"/> ordered by key (evaluation is deferred, as with OrderBy).</returns>
public static IEnumerable<T> OrderByMember<T, TKey>(this IEnumerable<T> coll, Func<T, TKey> keySelector)
{
    // The generic comparer in this answer is declared as AlphanumericComparator<T>;
    // the previous reference to AlphanumericComparer<TKey> named a type that is never defined.
    return coll.OrderBy(keySelector, new AlphanumericComparator<TKey>());
}
Well, it looks like it's doing a lexicographical ordering, regardless of whether the characters are lowercase or uppercase.
You can try using some custom expression in that lambda to do that.
There's no natural way to do this in .NET, but have a look at this blog post on natural sorting
You could put this into an extension method and use that instead of OrderBy
Looks like Dave Koelle's code link is dead. I got the last version from WebArchive.
/*
* The Alphanum Algorithm is an improved sorting algorithm for strings
* containing numbers. Instead of sorting numbers in ASCII order like
* a standard sort, this algorithm sorts numbers in numeric order.
*
* The Alphanum Algorithm is discussed at http://www.DaveKoelle.com
*
* Based on the Java implementation of Dave Koelle's Alphanum algorithm.
* Contributed by Jonathan Ruckwood <jonathan.ruckwood#gmail.com>
*
* Adapted by Dominik Hurnaus <dominik.hurnaus#gmail.com> to
* - correctly sort words where one word starts with another word
* - have slightly better performance
*
* Released under the MIT License - https://opensource.org/licenses/MIT
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*
*/
using System;
using System.Collections;
using System.Text;
/*
* Please compare against the latest Java version at http://www.DaveKoelle.com
* to see the most recent modifications
*/
namespace AlphanumComparator
{
    /// <summary>
    /// Compares strings chunk by chunk, where a chunk is a maximal run of digits
    /// or a maximal run of non-digits. Two digit chunks are ordered by numeric
    /// value; any other pair is ordered with <see cref="string.CompareTo(string)"/>.
    /// </summary>
    public class AlphanumComparator : IComparer
    {
        private enum ChunkType { Alphanumeric, Numeric };

        /// <summary>
        /// Tells whether <paramref name="ch"/> belongs to the chunk that started
        /// with <paramref name="otherCh"/> (both digits, or both non-digits).
        /// </summary>
        private bool InChunk(char ch, char otherCh)
        {
            ChunkType type = char.IsDigit(otherCh) ? ChunkType.Numeric : ChunkType.Alphanumeric;
            bool chIsDigit = char.IsDigit(ch);
            return type == ChunkType.Numeric ? chIsDigit : !chIsDigit;
        }

        /// <summary>
        /// Reads the chunk starting at <paramref name="marker"/> and advances the
        /// marker to the first character after it.
        /// </summary>
        private string ReadChunk(string s, ref int marker)
        {
            int start = marker;
            char first = s[start];
            marker++;
            while (marker < s.Length && InChunk(s[marker], first))
            {
                marker++;
            }
            return s.Substring(start, marker - start);
        }

        /// <summary>
        /// Orders two digit runs by numeric value without converting them to an
        /// integer type, so runs of any length are handled without overflow.
        /// </summary>
        /// <returns>-1, 0 or 1.</returns>
        private static int CompareNumericChunks(string thisChunk, string thatChunk)
        {
            // Leading zeros do not change the value; once they are removed, a longer
            // run is the larger number and equal-length runs order digit by digit.
            string thisDigits = thisChunk.TrimStart('0');
            string thatDigits = thatChunk.TrimStart('0');

            if (thisDigits.Length != thatDigits.Length)
            {
                return thisDigits.Length < thatDigits.Length ? -1 : 1;
            }

            int ordinal = string.CompareOrdinal(thisDigits, thatDigits);
            if (ordinal < 0)
            {
                return -1;
            }
            return ordinal > 0 ? 1 : 0;
        }

        /// <summary>
        /// Compares two objects as strings in alphanumeric order.
        /// </summary>
        /// <returns>
        /// A negative, zero or positive value. Returns 0 when either argument is
        /// null or is not a string.
        /// </returns>
        public int Compare(object x, object y)
        {
            String s1 = x as string;
            String s2 = y as string;
            if (s1 == null || s2 == null)
            {
                return 0;
            }

            int thisMarker = 0;
            int thatMarker = 0;

            while ((thisMarker < s1.Length) || (thatMarker < s2.Length))
            {
                // Whichever string runs out first sorts first.
                if (thisMarker >= s1.Length)
                {
                    return -1;
                }
                else if (thatMarker >= s2.Length)
                {
                    return 1;
                }

                string thisChunk = ReadChunk(s1, ref thisMarker);
                string thatChunk = ReadChunk(s2, ref thatMarker);

                int result;
                // If both chunks contain numeric characters, sort them numerically
                if (char.IsDigit(thisChunk[0]) && char.IsDigit(thatChunk[0]))
                {
                    result = CompareNumericChunks(thisChunk, thatChunk);
                }
                else
                {
                    result = thisChunk.CompareTo(thatChunk);
                }

                if (result != 0)
                {
                    return result;
                }
            }

            return 0;
        }
    }
}
Since the number of characters at the beginning is variable, a regular expression would help:
var re = new Regex(@"\d+$"); // finds the consecutive digits at the end of the string
// Orders by the trailing number only; the text before it does not take part in the ordering.
var result = partNumbers.OrderBy(x => int.Parse(re.Match(x).Value));
If there were a fixed number of prefix characters, then you could use the Substring method to extract starting from the relevant characters:
// parses the string as a number starting from the 5th character
var result = partNumbers.OrderBy(x => int.Parse(x.Substring(4)));
If the numbers might contain a decimal separator or thousands separator, then the regular expression needs to allow those characters as well:
var re = new Regex(@"[\d,]*\.?\d+$"); // trailing number, optionally with thousands separators and a decimal part
// Parse the text the regular expression matched, rather than a fixed-offset substring.
var result = partNumbers.OrderBy(x => double.Parse(re.Match(x).Value));
If the string returned by the regular expression or Substring might be unparseable by int.Parse / double.Parse, then use the relevant TryParse variant:
var re = new Regex(@"\d+$"); // finds the consecutive digits at the end of the string
var result = partNumbers.OrderBy(x => {
    // Key is null when there are no trailing digits or they do not fit in an int.
    int? parsed = null;
    if (int.TryParse(re.Match(x).Value, out var temp)) {
        parsed = temp;
    }
    return parsed;
});
Just extending @Nathan's answer here.
// Length of the longest input string: no digit run can be longer than this,
// so padding every run to this width is always sufficient.
var maxStringLength = partNumbers.Max(x => x.Length);
var result = partNumbers.OrderBy(x => PadNumbers(x, maxStringLength));
Then pass that value as a parameter to PadNumbers, so the padding width is dynamic:
/// <summary>
/// Builds a sort key by left-padding every run of ASCII digits in
/// <paramref name="input"/> with '0' up to <paramref name="maxStringLength"/> characters.
/// </summary>
/// <param name="input">The text whose digit runs should be padded.</param>
/// <param name="maxStringLength">Total width each digit run is padded to.</param>
/// <returns>The input with each digit run zero-padded; runs already at least that wide are unchanged.</returns>
public static string PadNumbers(string input, int maxStringLength)
{
    MatchEvaluator zeroPad = digits => digits.Value.PadLeft(maxStringLength, '0');
    return Regex.Replace(input, "[0-9]+", zeroPad);
}
I don't know how to do that in LINQ, but maybe you'd like this way too:
Array.Sort(partNumbers, new AlphanumComparatorFast());
// Display the results
foreach (string h in partNumbers )
{
Console.WriteLine(h);
}
Related
I have a string[] in which every element ends with a numeric value.
string[] partNumbers = new string[]
{
"ABC10", "ABC1","ABC2", "ABC11","ABC10", "AB1", "AB2", "Ab11"
};
I am trying to sort the above array as follows using LINQ but I am not getting the expected result.
var result = partNumbers.OrderBy(x => x);
Actual Result:
AB1
Ab11
AB2
ABC1
ABC10
ABC10
ABC11
ABC2
Expected Result
AB1
AB2
AB11
..
That is because the default ordering for string is standard alpha numeric dictionary (lexicographic) ordering, and ABC11 will come before ABC2 because ordering always proceeds from left to right.
To get what you want, you need to pad the numeric portion in your order by clause, something like:
var result = partNumbers.OrderBy(x => PadNumbers(x));
where PadNumbers could be defined as:
/// <summary>
/// Builds a sort key by left-padding every run of ASCII digits in
/// <paramref name="input"/> with '0' up to a width of 10 characters.
/// </summary>
/// <param name="input">The text whose digit runs should be padded.</param>
/// <returns>The input with each digit run zero-padded; digit runs of 10 or more characters are unchanged.</returns>
public static string PadNumbers(string input)
{
    MatchEvaluator zeroPad = digits => digits.Value.PadLeft(10, '0');
    return Regex.Replace(input, "[0-9]+", zeroPad);
}
This pads zeros for any number (or numbers) that appear in the input string so that OrderBy sees:
ABC0000000010
ABC0000000001
...
AB0000000011
The padding only happens on the key used for comparison. The original strings (without padding) are preserved in the result.
Note that this approach assumes a maximum number of digits for numbers in the input.
If you want to sort a list of objects by a specific property using LINQ and a custom comparer like the one by Dave Koelle you would do something like this:
...
items = items.OrderBy(x => x.property, new AlphanumComparator()).ToList();
...
You also have to alter Dave's class to inherit from System.Collections.Generic.IComparer<object> instead of the basic IComparer so the class signature becomes:
...
public class AlphanumComparator : System.Collections.Generic.IComparer<object>
{
...
Personally, I prefer the implementation by James McCormack because it implements IDisposable, though my benchmarking shows that it is slightly slower.
You can use PInvoke to get fast and good result:
/// <summary>
/// String comparer that delegates ordering to the native
/// <c>StrCmpLogicalW</c> function exported by <c>shlwapi.dll</c>.
/// </summary>
class AlphanumericComparer : IComparer<string>
{
    // Native entry point; strings are marshalled as Unicode.
    [DllImport("shlwapi.dll", CharSet = CharSet.Unicode)]
    static extern int StrCmpLogicalW(string s1, string s2);

    /// <summary>Returns the value reported by <c>StrCmpLogicalW</c> for the two strings.</summary>
    public int Compare(string x, string y)
    {
        int nativeOrder = StrCmpLogicalW(x, y);
        return nativeOrder;
    }
}
You can use it like AlphanumComparatorFast from the answer above.
You can PInvoke to StrCmpLogicalW (the windows function) to do this. See here: Natural Sort Order in C#
/// <summary>
/// Non-generic comparer that orders strings chunk by chunk, where a chunk is a
/// maximal run of digits or a maximal run of letters. Digit chunks that fit in
/// an <see cref="int"/> are compared as numbers; everything else is compared
/// with <see cref="string.CompareTo(string)"/>.
/// </summary>
/// <remarks>
/// Ordering rules for two numeric chunks, as implemented below:
/// equal values - the longer (more zero-padded) chunk sorts first;
/// different values - the chunk with more leading zeros sorts first, and only
/// when the leading-zero counts match are the values compared numerically.
/// Zero-valued chunks ("0", "00") follow the same rules as any other number.
/// </remarks>
public class AlphanumComparatorFast : IComparer
{
    /// <summary>
    /// Splits <paramref name="s1"/> into alternating digit / non-digit chunks.
    /// An apostrophe always starts a new chunk. Characters that are neither
    /// digits nor letters are dropped from the chunks.
    /// </summary>
    /// <returns>The chunk list; it always contains at least one (possibly empty) entry.</returns>
    List<string> GetList(string s1)
    {
        List<string> chunks = new List<string>();
        string current = "";

        // Guard the first-character probe so an empty string does not throw.
        bool inDigits = s1.Length > 0 && char.IsDigit(s1[0]);

        foreach (char c in s1)
        {
            bool isDigit = char.IsDigit(c);
            if (inDigits != isDigit || c == '\'')
            {
                if (current != "")
                {
                    chunks.Add(current);
                }
                current = "";
                inDigits = isDigit;
            }

            if (isDigit || char.IsLetter(c))
            {
                current += c;
            }
        }

        chunks.Add(current);
        return chunks;
    }

    /// <summary>
    /// Compares two objects as strings using the chunk rules described on the class.
    /// </summary>
    /// <returns>
    /// A negative, zero or positive value. Returns 0 when either argument is
    /// null or is not a string, and when both strings are equal.
    /// </returns>
    public int Compare(object x, object y)
    {
        string s1 = x as string;
        if (s1 == null)
        {
            return 0;
        }
        string s2 = y as string;
        if (s2 == null)
        {
            return 0;
        }
        if (s1 == s2)
        {
            return 0;
        }

        List<string> str1 = GetList(s1);
        List<string> str2 = GetList(s2);

        // Pad the shorter list with empty chunks so both can be walked in lock-step.
        while (str1.Count < str2.Count)
        {
            str1.Add("");
        }
        while (str2.Count < str1.Count)
        {
            str2.Add("");
        }

        for (int i = 0; i < str1.Count; i++)
        {
            int x1;
            int x2;
            // Use TryParse's return value (not the parsed value) to decide whether
            // a chunk is numeric, so that "0" and "00" count as numbers too.
            bool isNumber1 = int.TryParse(str1[i], out x1);
            bool isNumber2 = int.TryParse(str2[i], out x2);

            int result;
            if (isNumber1 && isNumber2)
            {
                if (x1 == x2)
                {
                    // Same value: the longer, more zero-padded spelling sorts first.
                    result = str2[i].Length.CompareTo(str1[i].Length);
                }
                else
                {
                    int zeros1 = str1[i].Length - str1[i].TrimStart('0').Length;
                    int zeros2 = str2[i].Length - str2[i].TrimStart('0').Length;
                    if (zeros1 > zeros2)
                    {
                        result = -1;
                    }
                    else if (zeros1 < zeros2)
                    {
                        result = 1;
                    }
                    else
                    {
                        result = x1.CompareTo(x2);
                    }
                }
            }
            else
            {
                // At least one side is text (or a digit run too long for int).
                result = str1[i].CompareTo(str2[i]);
            }

            if (result != 0)
            {
                return result;
            }
        }

        return 0;
    }
}
USAGE of this Class:
List<string> marks = new List<string>();
marks.Add("M'00Z1");
marks.Add("M'0A27");
marks.Add("M'00Z0");
marks.Add("0000A27");
marks.Add("100Z0");
string[] Markings = marks.ToArray();
Array.Sort(Markings, new AlphanumComparatorFast());
For those who like a generic approach, adjust Dave Koelle's AlphanumComparator slightly.
Step one (I rename the class to non-abbreviated and taking a TCompareType generic type argument):
public class AlphanumericComparator<TCompareType> : IComparer<TCompareType>
The next adjustment is to import the following namespace:
using System.Collections.Generic;
And we change the signature of the Compare method from object to TCompareType:
public int Compare(TCompareType x, TCompareType y)
{ .... no further modifications
Now we can specify the right type for the AlphanumericComparator.
(It should actually be called AlphanumericComparer I think), when we use it.
Example usage from my code:
if (result.SearchResults.Any()) {
result.SearchResults = result.SearchResults.OrderBy(item => item.Code, new AlphanumericComparator<string>()).ToList();
}
Now you have an alphanumeric comparator (comparer) that accepts generic arguments and can be used on different types.
And here is an extension method for using the comparator:
/// <summary>
/// Orders a sequence by the selected key using the generic alphanumeric comparer.
/// </summary>
/// <typeparam name="T">The item type of the sequence.</typeparam>
/// <typeparam name="TKey">The type of the key produced by <paramref name="keySelector"/>.</typeparam>
/// <param name="coll">The source sequence.</param>
/// <param name="keySelector">Function that extracts the sort key from each item.</param>
/// <returns>The items of <paramref name="coll"/> ordered by key (evaluation is deferred, as with OrderBy).</returns>
public static IEnumerable<T> OrderByMember<T, TKey>(this IEnumerable<T> coll, Func<T, TKey> keySelector)
{
    // The generic comparer in this answer is declared as AlphanumericComparator<T>;
    // the previous reference to AlphanumericComparer<TKey> named a type that is never defined.
    return coll.OrderBy(keySelector, new AlphanumericComparator<TKey>());
}
Well, it looks like it's doing a lexicographical ordering, regardless of whether the characters are lowercase or uppercase.
You can try using some custom expression in that lambda to do that.
There's no natural way to do this in .NET, but have a look at this blog post on natural sorting
You could put this into an extension method and use that instead of OrderBy
Looks like Dave Koelle's code link is dead. I got the last version from WebArchive.
/*
* The Alphanum Algorithm is an improved sorting algorithm for strings
* containing numbers. Instead of sorting numbers in ASCII order like
* a standard sort, this algorithm sorts numbers in numeric order.
*
* The Alphanum Algorithm is discussed at http://www.DaveKoelle.com
*
* Based on the Java implementation of Dave Koelle's Alphanum algorithm.
* Contributed by Jonathan Ruckwood <jonathan.ruckwood#gmail.com>
*
* Adapted by Dominik Hurnaus <dominik.hurnaus#gmail.com> to
* - correctly sort words where one word starts with another word
* - have slightly better performance
*
* Released under the MIT License - https://opensource.org/licenses/MIT
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*
*/
using System;
using System.Collections;
using System.Text;
/*
* Please compare against the latest Java version at http://www.DaveKoelle.com
* to see the most recent modifications
*/
namespace AlphanumComparator
{
    /// <summary>
    /// Compares strings chunk by chunk, where a chunk is a maximal run of digits
    /// or a maximal run of non-digits. Two digit chunks are ordered by numeric
    /// value; any other pair is ordered with <see cref="string.CompareTo(string)"/>.
    /// </summary>
    public class AlphanumComparator : IComparer
    {
        private enum ChunkType { Alphanumeric, Numeric };

        /// <summary>
        /// Tells whether <paramref name="ch"/> belongs to the chunk that started
        /// with <paramref name="otherCh"/> (both digits, or both non-digits).
        /// </summary>
        private bool InChunk(char ch, char otherCh)
        {
            ChunkType type = char.IsDigit(otherCh) ? ChunkType.Numeric : ChunkType.Alphanumeric;
            bool chIsDigit = char.IsDigit(ch);
            return type == ChunkType.Numeric ? chIsDigit : !chIsDigit;
        }

        /// <summary>
        /// Reads the chunk starting at <paramref name="marker"/> and advances the
        /// marker to the first character after it.
        /// </summary>
        private string ReadChunk(string s, ref int marker)
        {
            int start = marker;
            char first = s[start];
            marker++;
            while (marker < s.Length && InChunk(s[marker], first))
            {
                marker++;
            }
            return s.Substring(start, marker - start);
        }

        /// <summary>
        /// Orders two digit runs by numeric value without converting them to an
        /// integer type, so runs of any length are handled without overflow.
        /// </summary>
        /// <returns>-1, 0 or 1.</returns>
        private static int CompareNumericChunks(string thisChunk, string thatChunk)
        {
            // Leading zeros do not change the value; once they are removed, a longer
            // run is the larger number and equal-length runs order digit by digit.
            string thisDigits = thisChunk.TrimStart('0');
            string thatDigits = thatChunk.TrimStart('0');

            if (thisDigits.Length != thatDigits.Length)
            {
                return thisDigits.Length < thatDigits.Length ? -1 : 1;
            }

            int ordinal = string.CompareOrdinal(thisDigits, thatDigits);
            if (ordinal < 0)
            {
                return -1;
            }
            return ordinal > 0 ? 1 : 0;
        }

        /// <summary>
        /// Compares two objects as strings in alphanumeric order.
        /// </summary>
        /// <returns>
        /// A negative, zero or positive value. Returns 0 when either argument is
        /// null or is not a string.
        /// </returns>
        public int Compare(object x, object y)
        {
            String s1 = x as string;
            String s2 = y as string;
            if (s1 == null || s2 == null)
            {
                return 0;
            }

            int thisMarker = 0;
            int thatMarker = 0;

            while ((thisMarker < s1.Length) || (thatMarker < s2.Length))
            {
                // Whichever string runs out first sorts first.
                if (thisMarker >= s1.Length)
                {
                    return -1;
                }
                else if (thatMarker >= s2.Length)
                {
                    return 1;
                }

                string thisChunk = ReadChunk(s1, ref thisMarker);
                string thatChunk = ReadChunk(s2, ref thatMarker);

                int result;
                // If both chunks contain numeric characters, sort them numerically
                if (char.IsDigit(thisChunk[0]) && char.IsDigit(thatChunk[0]))
                {
                    result = CompareNumericChunks(thisChunk, thatChunk);
                }
                else
                {
                    result = thisChunk.CompareTo(thatChunk);
                }

                if (result != 0)
                {
                    return result;
                }
            }

            return 0;
        }
    }
}
Since the number of characters at the beginning is variable, a regular expression would help:
var re = new Regex(@"\d+$"); // finds the consecutive digits at the end of the string
// Orders by the trailing number only; the text before it does not take part in the ordering.
var result = partNumbers.OrderBy(x => int.Parse(re.Match(x).Value));
If there were a fixed number of prefix characters, then you could use the Substring method to extract starting from the relevant characters:
// parses the string as a number starting from the 5th character
var result = partNumbers.OrderBy(x => int.Parse(x.Substring(4)));
If the numbers might contain a decimal separator or thousands separator, then the regular expression needs to allow those characters as well:
var re = new Regex(@"[\d,]*\.?\d+$"); // trailing number, optionally with thousands separators and a decimal part
// Parse the text the regular expression matched, rather than a fixed-offset substring.
var result = partNumbers.OrderBy(x => double.Parse(re.Match(x).Value));
If the string returned by the regular expression or Substring might be unparseable by int.Parse / double.Parse, then use the relevant TryParse variant:
var re = new Regex(@"\d+$"); // finds the consecutive digits at the end of the string
var result = partNumbers.OrderBy(x => {
    // Key is null when there are no trailing digits or they do not fit in an int.
    int? parsed = null;
    if (int.TryParse(re.Match(x).Value, out var temp)) {
        parsed = temp;
    }
    return parsed;
});
Just extending @Nathan's answer here.
// Length of the longest input string: no digit run can be longer than this,
// so padding every run to this width is always sufficient.
var maxStringLength = partNumbers.Max(x => x.Length);
var result = partNumbers.OrderBy(x => PadNumbers(x, maxStringLength));
Then pass that value as a parameter to PadNumbers, so the padding width is dynamic:
/// <summary>
/// Builds a sort key by left-padding every run of ASCII digits in
/// <paramref name="input"/> with '0' up to <paramref name="maxStringLength"/> characters.
/// </summary>
/// <param name="input">The text whose digit runs should be padded.</param>
/// <param name="maxStringLength">Total width each digit run is padded to.</param>
/// <returns>The input with each digit run zero-padded; runs already at least that wide are unchanged.</returns>
public static string PadNumbers(string input, int maxStringLength)
{
    MatchEvaluator zeroPad = digits => digits.Value.PadLeft(maxStringLength, '0');
    return Regex.Replace(input, "[0-9]+", zeroPad);
}
I don't know how to do that in LINQ, but maybe you'd like this way too:
Array.Sort(partNumbers, new AlphanumComparatorFast());
// Display the results
foreach (string h in partNumbers )
{
Console.WriteLine(h);
}
I currently have the following 2 methods:
/// <summary>
/// Reads a polygon from a text file of space-separated coordinate values.
/// </summary>
/// <param name="file">Path of the file to read.</param>
/// <returns>
/// One <c>Point</c> per consecutive (X, Y) pair of values found after the first
/// two lines; a trailing unpaired value is ignored.
/// </returns>
public static Point[] PolygonFromFile(string file)
{
    // The first 2 lines never contain any polygon data, so skip them
    // (Skip also copes with files that have fewer than two lines).
    List<string> points = File.ReadAllLines(file)
        .Skip(2)
        .SelectMany(line => line.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries))
        .Where(token => token != "END")
        .ToList();

    List<Point> polygon = new List<Point>(points.Count / 2);

    // Step two values at a time so each value is used exactly once:
    // (0,1), (2,3), ... rather than the overlapping (0,1), (1,2), ...
    for (int i = 0; i + 1 < points.Count; i += 2)
    {
        polygon.Add(new Point
        {
            X = PointParse(points[i]),
            Y = PointParse(points[i + 1])
        });
    }

    return polygon.ToArray();
}
/// <summary>
/// Converts a number written in plain or scientific notation
/// (for example "5.807600E-02") to a <see cref="double"/>.
/// </summary>
/// <param name="value">The text to convert; '.' is the decimal separator.</param>
/// <returns>The parsed value, e.g. 0.058076 for "5.807600E-02".</returns>
/// <exception cref="System.FormatException">The text is not a valid number.</exception>
private static double PointParse(string value)
{
    // NumberStyles.Float accepts an optional sign, decimal point and exponent, so
    // inputs without an exponent or without an exponent sign parse as well.
    // The invariant culture keeps '.' as the decimal separator on every machine.
    return double.Parse(
        value,
        System.Globalization.NumberStyles.Float,
        System.Globalization.CultureInfo.InvariantCulture);
}
That will convert a value such as 5.807600E-02 to the true value of 0.058076.
I feel like this is an extremely verbose way of achieving what I need to, is there a function in C# to achieve this or do I need to go through the above process to convert the given value to the necessary one?
.NET can parse these values on its own
Double.Parse("5.807600E-02");
will return the value 0.058076 as a double.
What about this?
double d = Double.Parse("1.2345E-03", System.Globalization.NumberStyles.Float);
You can use parse function. Try this.
decimal d = Decimal.Parse("5.807600E-02",System.Globalization.NumberStyles.Float);
How can I compare 2 strings in C# ignoring the case, spaces and any line-breaks. I also need to check if both strings are null then they are marked as same.
Thanks!
You should normalize each string by removing the characters that you don't want to compare and then you can perform a String.Equals with a StringComparison that ignores case.
Something like this:
string s1 = "HeLLo wOrld!";
string s2 = "Hello\n WORLd!";
// Strip every white-space character (spaces, tabs, line breaks) before comparing.
string normalized1 = Regex.Replace(s1, @"\s", "");
string normalized2 = Regex.Replace(s2, @"\s", "");
bool stringEquals = String.Equals(
    normalized1,
    normalized2,
    StringComparison.OrdinalIgnoreCase);
Console.WriteLine(stringEquals);
Here Regex.Replace is used first to remove all whitespace characters. The special case of both strings being null is not treated here but you can easily handle that case before performing the string normalization.
This may also work.
String.Compare(s1, s2, CultureInfo.CurrentCulture, CompareOptions.IgnoreCase | CompareOptions.IgnoreSymbols) == 0
Edit:
IgnoreSymbols: Indicates that the string comparison must ignore symbols, such as
white-space characters, punctuation, currency symbols, the percent
sign, mathematical symbols, the ampersand, and so on.
Remove all the characters you don't want and then use the ToLower() method to ignore case.
edit: While the above works, it's better to use StringComparison.OrdinalIgnoreCase. Just pass it as the second argument to the Equals method.
First replace all whitespace via regular expression from both string and then use the String.Compare method with parameter ignoreCase = true.
// Remove all white space from both strings, then compare ignoring case.
string a = System.Text.RegularExpressions.Regex.Replace("void foo", @"\s", "");
string b = System.Text.RegularExpressions.Regex.Replace("voidFoo", @"\s", "");
bool isTheSame = String.Compare(a, b, true) == 0;
If you need performance, the Regex solutions on this page may run too slowly for you, for example when you have a large list of strings to sort. (A Regex solution is more readable, however.)
I have a class that looks at each individual char in both strings and compares them while ignoring case and whitespace. It doesn't allocate any new strings. It uses the char.IsWhiteSpace(ch) to determine whitespace, and char.ToLowerInvariant(ch) for case-insensitivity (if required). In my testing, my solution runs about 5x - 8x faster than a Regex-based solution. My class also implements IEqualityComparer's GetHashCode(obj) method using this code in another SO answer. This GetHashCode(obj) also ignores whitespace and optionally ignores case.
Here's my class:
/// <summary>
/// Equality comparer for strings that ignores white-space characters
/// (<see cref="char.IsWhiteSpace(char)"/>) and letter case
/// (<see cref="char.ToLowerInvariant(char)"/>). It allocates no new strings.
/// </summary>
private class StringCompIgnoreWhiteSpace : IEqualityComparer<string>
{
    /// <summary>
    /// Tells whether the two strings contain the same non-white-space characters
    /// in the same order, ignoring case. A null string is treated like a string
    /// that contains only white space.
    /// </summary>
    public bool Equals(string strx, string stry)
    {
        if (strx == null) //stry may contain only whitespace
        {
            return string.IsNullOrWhiteSpace(stry);
        }
        if (stry == null) //strx may contain only whitespace
        {
            return string.IsNullOrWhiteSpace(strx);
        }

        int ix = 0;
        int iy = 0;
        while (true)
        {
            // Advance each cursor to its next significant character. The bounds
            // check comes first so the index is never read past the end.
            while (ix < strx.Length && char.IsWhiteSpace(strx[ix]))
            {
                ix++;
            }
            while (iy < stry.Length && char.IsWhiteSpace(stry[iy]))
            {
                iy++;
            }

            bool endOfX = ix == strx.Length;
            bool endOfY = iy == stry.Length;
            if (endOfX || endOfY)
            {
                // Equal only when both strings ran out of significant characters together.
                return endOfX && endOfY;
            }

            //Compare via the two ToLowerInvariant calls; drop them to make the comparison case-sensitive.
            if (char.ToLowerInvariant(strx[ix]) != char.ToLowerInvariant(stry[iy]))
            {
                return false;
            }

            ix++;
            iy++;
        }
    }

    /// <summary>
    /// Hash code consistent with <see cref="Equals(string, string)"/>: white space
    /// and case do not affect it, and null hashes like a white-space-only string.
    /// </summary>
    public int GetHashCode(string obj)
    {
        int hash = 17;
        if (obj == null)
        {
            // Same value an empty or white-space-only string produces below.
            return hash;
        }

        unchecked // Overflow is fine, just wrap
        {
            for (int i = 0; i < obj.Length; i++)
            {
                char ch = obj[i];
                if (!char.IsWhiteSpace(ch))
                {
                    //case-insensitive; use ch.GetHashCode() instead for case-sensitivity
                    hash = hash * 23 + char.ToLowerInvariant(ch).GetHashCode();
                }
            }
        }
        return hash;
    }
}
/// <summary>
/// Prints the result of a fixed set of Equals and GetHashCode calls on
/// StringCompIgnoreWhiteSpace to the console, one result per line.
/// </summary>
private static void TestComp()
{
    var comparer = new StringCompIgnoreWhiteSpace();

    // Each row is one pair passed to Equals; the trailing comment is the
    // result noted by the original author.
    string[][] equalityCases =
    {
        new string[] { "abcd", "abcd" },                                          //true
        new string[] { "abCd", "Abcd" },                                          //true
        new string[] { "ab Cd", "Ab\n\r\tcd " },                                  //true
        new string[] { " ab Cd", " A b" + Environment.NewLine + "cd " },          //true
        new string[] { null, " \t\n\r " },                                        //true
        new string[] { " \t\n\r ", null },                                        //true
        new string[] { "abcd", "abcd h" },                                        //false
    };

    foreach (string[] pair in equalityCases)
    {
        bool same = comparer.Equals(pair[0], pair[1]);
        Console.WriteLine(same);
    }

    // Original author's note: -699568861
    int firstHash = comparer.GetHashCode(" a b c d");
    Console.WriteLine(firstHash);

    // Original author's note: -699568861 when hashing is case-insensitive,
    // otherwise -1555613149.
    int secondHash = comparer.GetHashCode("A B c \t d");
    Console.WriteLine(secondHash);
}
Here's my testing code (with a Regex example):
// Times 100000 Equals calls on the char-based comparer and then on the
// Regex-based comparer, printing the elapsed time of each run.
private static void SpeedTest()
{
    const int iterations = 100000;
    string lhs = "a bc d";
    string rhs = "ABC D";

    // Character-by-character comparer.
    var charComparer = new StringCompIgnoreWhiteSpace();
    Stopwatch charTimer = new Stopwatch();
    charTimer.Start();
    for (int pass = 0; pass < iterations; pass++)
    {
        charComparer.Equals(lhs, rhs);
    }
    charTimer.Stop();
    Console.WriteLine(string.Format("char time = {0}", charTimer.Elapsed)); //char time = 00:00:00.0361159

    // Regex-based comparer.
    var regexComparer = new StringCompIgnoreWhiteSpaceRegex();
    Stopwatch regexTimer = new Stopwatch();
    regexTimer.Start();
    for (int pass = 0; pass < iterations; pass++)
    {
        regexComparer.Equals(lhs, rhs);
    }
    regexTimer.Stop();
    Console.WriteLine(string.Format("regex time = {0}", regexTimer.Elapsed)); //regex time = 00:00:00.2773072
}
/// <summary>
/// Equality comparer that ignores all white space and character case.
/// Both strings are stripped of white space with a regular expression and then
/// compared with ordinal, case-insensitive semantics. Null is treated like an
/// empty string, so it equals any white-space-only string.
/// </summary>
private class StringCompIgnoreWhiteSpaceRegex : IEqualityComparer<string>
{
    /// <summary>Returns true when both strings match after removing white space and ignoring case.</summary>
    public bool Equals(string strx, string stry)
    {
        return StringComparer.OrdinalIgnoreCase.Equals(Canonical(strx), Canonical(stry));
    }

    /// <summary>
    /// Returns a hash consistent with <see cref="Equals(string, string)"/>: it is computed
    /// from the same stripped text with the same case-insensitive comparer, so equal
    /// strings always land in the same bucket.
    /// </summary>
    public int GetHashCode(string obj)
    {
        return StringComparer.OrdinalIgnoreCase.GetHashCode(Canonical(obj));
    }

    // Removes every white-space character; null becomes the empty string so that
    // null and white-space-only input compare (and hash) as equal.
    private static string Canonical(string value)
    {
        if (value == null)
        {
            return string.Empty;
        }
        return System.Text.RegularExpressions.Regex.Replace(value, @"\s", "");
    }
}
I would probably start by removing the characters you don't want to compare from the string before comparing. If performance is a concern, you might look at storing a version of each string with the characters already removed.
Alternatively, you could write a compare routine that would skip over the characters you want to ignore. But that just seems like more work to me.
You can also use the following custom function
/// <summary>
/// Returns a copy of <paramref name="str"/> with every character that appears in
/// <paramref name="toExclude"/> removed. The order of the remaining characters is kept.
/// </summary>
/// <param name="str">The source string; must not be null.</param>
/// <param name="toExclude">The characters to drop; must not be null.</param>
/// <returns>The filtered string.</returns>
public static string ExceptChars(this string str, IEnumerable<char> toExclude)
{
    // Materialize the exclusion list once. The sequence is enumerated a single
    // time (safe for lazy or one-shot sequences) and each membership test is O(1)
    // instead of a linear scan per character.
    HashSet<char> excluded = new HashSet<char>(toExclude);

    StringBuilder sb = new StringBuilder(str.Length);
    foreach (char c in str)
    {
        if (!excluded.Contains(c))
        {
            sb.Append(c);
        }
    }
    return sb.ToString();
}
/// <summary>
/// Compares two strings while ignoring case and the characters space, tab,
/// line feed and carriage return.
/// </summary>
/// <param name="stringa">First string; may be null.</param>
/// <param name="stringb">Second string; may be null.</param>
/// <returns>
/// True when both are null, or when both are non-null and equal after
/// lower-casing and removing the ignored characters. False when exactly one is null.
/// </returns>
public static bool SpaceCaseInsenstiveComparision(this string stringa, string stringb)
{
    // Handle null up front: previously a single null argument caused a
    // NullReferenceException on the ToLower() call.
    if (stringa == null || stringb == null)
    {
        return stringa == null && stringb == null;
    }

    char[] ignored = new[] { ' ', '\t', '\n', '\r' };

    // Invariant lower-casing keeps the result independent of the current culture
    // (for example the Turkish dotted/dotless I).
    string left = stringa.ToLowerInvariant().ExceptChars(ignored);
    string right = stringb.ToLowerInvariant().ExceptChars(ignored);
    return left.Equals(right);
}
And then use it following way
"Te st".SpaceCaseInsenstiveComparision("Te st");
Another option is the LINQ SequenceEquals method which according to my tests is more than twice as fast as the Regex approach used in other answers and very easy to read and maintain.
// Compares two strings ignoring white space and case: each string is filtered to
// its non-white-space characters, upper-cased with the invariant culture, and the
// two resulting sequences are compared element by element.
public static bool Equals_Linq(string s1, string s2)
{
    var left = s1.Where(ch => !char.IsWhiteSpace(ch)).Select(ch => char.ToUpperInvariant(ch));
    var right = s2.Where(ch => !char.IsWhiteSpace(ch)).Select(ch => char.ToUpperInvariant(ch));
    return left.SequenceEqual(right);
}
// Compares two strings ignoring white space and case: all white space is removed
// from both with a regular expression, then the remainders are compared with
// ordinal, case-insensitive semantics.
public static bool Equals_Regex(string s1, string s2)
{
    // The pattern must be a verbatim string (@"\s"); the '#' prefix that had
    // replaced '@' does not compile.
    return string.Equals(
        Regex.Replace(s1, @"\s", ""),
        Regex.Replace(s2, @"\s", ""),
        StringComparison.OrdinalIgnoreCase);
}
Here the simple performance test code I used:
// Micro-benchmark: calls each comparison one million times on the same pair of
// strings and prints the elapsed time. The figures in the trailing comments are
// the author's measurements.
var s1 = "HeLLo wOrld!";
var s2 = "Hello\n WORLd!";
// Time the LINQ-based comparison.
var watch = Stopwatch.StartNew();
for (var i = 0; i < 1000000; i++)
{
Equals_Linq(s1, s2);
}
Console.WriteLine(watch.Elapsed); // ~1.7 seconds
// Restart the clock and time the Regex-based comparison.
watch = Stopwatch.StartNew();
for (var i = 0; i < 1000000; i++)
{
Equals_Regex(s1, s2);
}
Console.WriteLine(watch.Elapsed); // ~4.6 seconds
An approach not optimized for performance, but for completeness.
normalizes null
normalizes unicode, combining characters, diacritics
normalizes new lines
normalizes white space
normalizes casing
code snippet:
/// <summary>
/// String comparison helpers that normalize both inputs before comparing them.
/// </summary>
public static class StringHelper
{
    /// <summary>
    /// Tells whether two strings are equal after normalization (Unicode form C,
    /// white space removed, lower-cased with the invariant culture).
    /// </summary>
    /// <param name="source">First string; may be null.</param>
    /// <param name="target">Second string; may be null.</param>
    /// <returns>
    /// True when both are null, or both are non-null and their normalized forms are
    /// identical. A null string is never equivalent to a non-null one.
    /// </returns>
    public static bool AreEquivalent(string source, string target)
    {
        if (source == null) return target == null;
        if (target == null) return false;
        var normForm1 = Normalize(source);
        var normForm2 = Normalize(target);
        return string.Equals(normForm1, normForm2, StringComparison.Ordinal);
    }

    // Produces the canonical form used by AreEquivalent. The caller guarantees a
    // non-null value.
    private static string Normalize(string value)
    {
        Debug.Assert(value != null);
        // normalize unicode: compose combining sequences (form C)
        value = value.Normalize(NormalizationForm.FormC);
        // normalize new lines to a single line-feed character
        value = value.Replace("\r\n", "\n").Replace("\r", "\n");
        // remove all white space; the pattern must be a verbatim string (@"\s"),
        // the '#' prefix that had replaced '@' does not compile
        value = Regex.Replace(value, @"\s", string.Empty);
        // normalize casing
        return value.ToLowerInvariant();
    }
}
I would Trim the string using Trim() to remove all the
whitespace.
Use StringComparison.OrdinalIgnoreCase to ignore case sensitivity ex. stringA.Equals(stringB, StringComparison.OrdinalIgnoreCase)
The question is complicated but I will explain it in details.
The goal is to make a function which will return next "step" of the given string.
For example
String.Step("a"); // = "b"
String.Step("b"); // = "c"
String.Step("g"); // = "h"
String.Step("z"); // = "A"
String.Step("A"); // = "B"
String.Step("B"); // = "C"
String.Step("G"); // = "H"
Until here its quite easy, But taking in mind that input IS string it can contain more than 1 characters and the function must behave like this.
String.Step("Z"); // = "aa";
String.Step("aa"); // = "ab";
String.Step("ag"); // = "ah";
String.Step("az"); // = "aA";
String.Step("aA"); // = "aB";
String.Step("aZ"); // = "ba";
String.Step("ZZ"); // = "aaa";
and so on...
This doesn't exactly need to extend the base String class.
I tried to work it out by each characters ASCII values but got stuck with strings containing 2 characters.
I would really appreciate if someone can provide full code of the function.
Thanks in advance.
EDIT
*I'm sorry I forgot to mention earlier that the function "reparse" the self generated string when its length reaches n.
The continuation of this function will be something like this, for example with n = 3:
String.Step("aaa"); // = "aab";
String.Step("aaZ"); // = "aba";
String.Step("aba"); // = "abb";
String.Step("abb"); // = "abc";
String.Step("abZ"); // = "aca";
.....
String.Step("zzZ"); // = "zAa";
String.Step("zAa"); // = "zAb";
........
I'm sorry I didn't mention it earlier, after reading some answers I realised that the problem was in question.
Without this the function will always produce character "a" n times after the end of the step.
NOTE: This answer is incorrect, as "aa" should follow after "Z"... (see comments below)
Here is an algorithm that might work:
each "string" represents a number to a given base (here: twice the count of letters in the alphabet).
The next step can thus be computed by parsing the "number"-string back into a int, adding 1 and then formatting it back to the base.
Example:
"a" == 1 -> step("a") == step(1) == 1 + 1 == 2 == "b"
Now your problem is reduced to parsing the string as a number to a given base and reformatting it. A quick googling suggests this page: http://everything2.com/title/convert+any+number+to+decimal
How to implement this?
a lookup table for letters to their corresponding number: a=1, b=2, c=3, ... Y = ?, Z = 0
to parse a string to number, read the characters in reverse order, looking up the numbers and adding them up:
"ab" -> 2*BASE^0 + 1*BASE^1
with BASE being the number of "digits" (2 count of letters in alphabet, is that 48?)
EDIT: This link looks even more promising: http://www.citidel.org/bitstream/10117/20/12/convexp.html
Quite collection of approaches, here is mine:-
The Function:
// Returns the next string in the sequence a..z, A..Z, aa, ab, ...
// The input is converted to ASCII bytes and incremented from the right:
// a trailing 'Z' rolls over to 'a' and carries to the left; a string made only
// of 'Z' grows by one character and becomes all 'a'.
private static string IncrementString(string s)
{
    const byte UpperA = 65;
    const byte UpperZ = 90;
    const byte LowerA = 97;
    const byte LowerZ = 122;

    byte[] codes = System.Text.Encoding.ASCII.GetBytes(s);
    int pos = codes.Length - 1;

    // Carry: every 'Z' reached from the right wraps to 'a'.
    while (pos >= 0 && codes[pos] == UpperZ)
    {
        if (pos == 0)
        {
            // The carry ran off the left edge: the result is one character longer.
            return new String('a', codes.Length + 1);
        }
        codes[pos] = LowerA;
        pos--;
    }

    // Bump the first position that did not carry: 'z' (or anything above it)
    // becomes 'A', every other value simply moves to the next code.
    if (pos >= 0)
    {
        if (codes[pos] >= LowerZ)
        {
            codes[pos] = UpperA;
        }
        else
        {
            codes[pos] += 1;
        }
    }

    return System.Text.Encoding.ASCII.GetString(codes);
}
The Tests
// Each row is { input, expected }; one comparison result is printed per row.
string[,] cases =
{
    { "a", "b" },
    { "z", "A" },
    { "Z", "aa" },
    { "aa", "ab" },
    { "az", "aA" },
    { "aZ", "ba" },
    { "zZ", "Aa" },
    { "Za", "Zb" },
    { "ZZ", "aaa" },
};
for (int row = 0; row < cases.GetLength(0); row++)
{
    Console.WriteLine(IncrementString(cases[row, 0]) == cases[row, 1]);
}
/// <summary>
/// Computes the successor of a string over the alphabet a..z, A..Z
/// (for example "a" -> "b", "z" -> "A", "Z" -> "aa", "aZ" -> "ba").
/// </summary>
public static class StringStep
{
    /// <summary>Returns the string that follows <paramref name="str"/> in the sequence.</summary>
    /// <param name="str">A non-empty string of ASCII letters.</param>
    /// <returns>The next string; one character longer when every position carries.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
    /// <exception cref="ArgumentException">
    /// <paramref name="str"/> is empty, or a character that has to be incremented is not a letter.
    /// </exception>
    public static string Next(string str)
    {
        if (str == null)
        {
            throw new ArgumentNullException("str");
        }
        if (str.Length == 0)
        {
            // Previously this surfaced as an IndexOutOfRangeException.
            throw new ArgumentException("Input must contain at least one letter.", "str");
        }

        // Work in place on a character buffer instead of rebuilding the string
        // with repeated concatenation.
        char[] letters = str.ToCharArray();
        for (int index = letters.Length - 1; index >= 0; index--)
        {
            bool carry;
            letters[index] = Increment(letters[index], out carry);
            if (!carry)
            {
                // Positions to the left are untouched (and not validated).
                return new string(letters);
            }
        }

        // Every position carried: prepend a new leading letter.
        return "a" + new string(letters);
    }

    // Advances one letter: a..y and A..Y move to the next letter, 'z' becomes 'A',
    // and 'Z' wraps to 'a' with carry set.
    private static char Increment(char value, out bool carry)
    {
        carry = false;
        if (value >= 'a' && value < 'z' || value >= 'A' && value < 'Z')
        {
            return (char)((int)value + 1);
        }
        if (value == 'z') return 'A';
        if (value == 'Z')
        {
            carry = true;
            return 'a';
        }
        // ArgumentException derives from Exception, so existing catch blocks still apply.
        throw new ArgumentException(String.Format("Invalid character value: {0}", value));
    }
}
Split the input string into columns and process each, right-to-left, like you would if it was basic arithmetic. Apply whatever code you've got that works with a single column to each column. When you get a Z, you 'increment' the next-left column using the same algorithm. If there's no next-left column, stick in an 'a'.
I'm sorry the question is stated partly.
I edited the question so that it meets the requirements, without the edit the function would end up with a n times by step by step increasing each word from lowercase a to uppercase z without "re-parsing" it.
Please consider re-reading the question, including the edited part
This is what I came up with. I'm not relying on ASCII int conversion, and am rather using an array of characters. This should do precisely what you're looking for.
// Returns the next string in the sequence a..z, A..Z, aa, ab, ...
// Only the last character is advanced; when it is the final letter of the
// alphabet it wraps to the first letter and the remaining prefix is stepped
// recursively (or a new leading letter is added when there is no prefix).
// A last character outside the alphabet leaves the string unchanged.
public static string Step(this string s)
{
    const string alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";

    int lastIndex = s.Length - 1;
    int position = alphabet.IndexOf(s[lastIndex]);
    if (position < 0)
    {
        return s;
    }

    string head = s.Substring(0, lastIndex);
    if (position < alphabet.Length - 1)
    {
        // Ordinary case: move the last character one letter forward.
        return head + alphabet[position + 1];
    }

    // Roll-over: the last character restarts at the first letter and the carry
    // goes to the prefix.
    string carried = head.Length > 0 ? Step(head) : alphabet[0].ToString();
    return carried + alphabet[0];
}
This is a special case of a numeral system with base 52. If you write a parser and the matching output logic, you can do any kind of arithmetic on it, including the +1 (++) needed here.
The digits are "a"-"z" and "A" to "Z" where "a" is zero and "Z" is 51
So you have to write a parser who takes the string and builds an int or long from it. This function is called StringToInt() and is implemented straight forward (transform char to number (0..51) multiply with 52 and take the next char)
And you need the reverse function IntToString, which is also implemented in a straightforward way (take the int modulo 52 and transform the result into a digit, divide the int by 52, and repeat until the int is zero).
With this functions you can do stuff like this:
IntToString( StringToInt("ZZ") +1 ) // Will be "aaa"
You need to account for A) the fact that capital letters have a lower decimal value in the Ascii table than lower case ones. B) The table is not continuous A-Z-a-z - there are characters inbetween Z and a.
// Steps the last character of the string.
public static string stepChar(string str)
{
    int lastPos = str.Length - 1;
    return stepChar(str, lastPos);
}

// Steps the character at charPos, working on the ASCII bytes of the string.
public static string stepChar(string str, int charPos)
{
    byte[] ascii = Encoding.ASCII.GetBytes(str);
    return stepChar(ascii, charPos);
}

// Increments the byte at charPos in place and returns the decoded string.
// 'Z' wraps to 'a' and the carry moves one position to the left; when the carry
// passes the first position an extra "a" is prepended. 'z' becomes 'A'.
public static string stepChar(byte[] strBytes, int charPos)
{
    for (int pos = charPos; pos >= 0; pos--)
    {
        strBytes[pos]++;

        if (strBytes[pos] != 91)
        {
            // No carry needed; only the z -> A wrap remains to be handled.
            if (strBytes[pos] == 123)
            {
                strBytes[pos] = 65;
            }
            return Encoding.ASCII.GetString(strBytes);
        }

        // Z -> a, then continue with the previous character.
        strBytes[pos] = 97;
    }

    // Escape case: nothing left to carry into, so just prepend with a and return.
    return "a" + Encoding.ASCII.GetString(strBytes);
}
You'll probably want some checking in place to ensure that the input string only contains chars A-Za-z
Edit Tidied up code and added new overload to remove redundant byte[] -> string -> byte[] conversion
Proof http://geekcubed.org/random/strIncr.png
This is a lot like how Excel columns would work if they were unbounded. You could change 52 to reference chars.Length for easier modification.
/// <summary>
/// Treats strings over a..z, A..Z as numbers in a bijective base-N system
/// ("a" = 1, "Z" = 52, "aa" = 53, ...), where N is the length of the alphabet.
/// </summary>
static class AlphaInt {
    // Digit alphabet; a letter's value is its index + 1. All arithmetic below
    // derives the base from this string's length, so the alphabet can be changed here.
    private static readonly string chars =
        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";

    /// <summary>Returns the string that follows <paramref name="input"/> in the sequence.</summary>
    /// <exception cref="OverflowException">The value does not fit in an int.</exception>
    public static string StepNext(string input) {
        return IntToAlpha(checked(AlphaToInt(input) + 1));
    }

    /// <summary>Formats a number as letters; values of zero or less yield "a".</summary>
    public static string IntToAlpha(int num) {
        if(num-- <= 0) return "a";
        int radix = chars.Length;
        // After the decrement, 0..radix-1 is a single digit.
        if(num < radix) return chars.Substring(num, 1);
        return IntToAlpha(num / radix) + IntToAlpha(num % radix + 1);
    }

    /// <summary>Parses letters into their numeric value; the empty string is 0.</summary>
    /// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="str"/> contains a character outside the alphabet.</exception>
    /// <exception cref="OverflowException">The value does not fit in an int.</exception>
    public static int AlphaToInt(string str) {
        if(str == null) throw new ArgumentNullException("str");
        int radix = chars.Length;
        int num = 0;
        foreach(char letter in str) {
            int digit = chars.IndexOf(letter) + 1;
            if(digit == 0) {
                // Previously an unknown character silently counted as 0.
                throw new ArgumentException(
                    String.Format("Invalid character value: {0}", letter), "str");
            }
            // Integer Horner evaluation instead of floating-point Math.Pow; checked
            // so that strings too long for an int fail loudly instead of wrapping.
            num = checked(num * radix + digit);
        }
        return num;
    }
}
LetterToNum should be a function that maps "a" to 0 and "Z" to 51.
NumToLetter the inverse.
// Parse the letters as base-52 digits (most significant first) and add one.
// The explicit 0L seed makes the accumulator a long; without a seed Aggregate
// would try to accumulate into a char.
// NOTE(review): with "a" mapped to 0, leading "a" letters contribute nothing to
// the value and are not reproduced in the output — confirm this is acceptable.
long x = "aazeiZa".Aggregate(0L, (acc, letter) => (acc * 52) + LetterToNum(letter)) + 1;
string s = "";
do { // assertion: x > 0
    // least significant base-52 digit of what is left
    var c = x % 52;
    s = NumToLetter(c) + s;
    x = (x - c) / 52;
} while (x > 0);
// s now should contain the result
I need to decide whether file name fits to file mask. The file mask could contain * or ? characters. Is there any simple solution for this?
bool bFits = Fits("myfile.txt", "my*.txt");
private bool Fits(string sFileName, string sFileMask)
{
??? anything simple here ???
}
I appreciate finding Joel's answer--saved me some time as well ! I did, however, have to make a few changes to make the method do what most users would expect:
I removed the 'this' keyword preceding the first argument. It does nothing here (though it could be useful if the method is intended to be an extension method, in which case it needs to be public and contained within a static class and itself be a static method).
I made the regular expression case-independent to match standard Windows wildcard behavior (so e.g. "c*.*" and "C*.*" both return the same result).
I added starting and ending anchors to the regular expression, again to match standard Windows wildcard behavior (so e.g. "stuff.txt" would be matched by "stuff*" or "s*" or "s*.*" but not by just "s").
// Tells whether fileName matches a wildcard mask. The mask is turned into an
// anchored, case-insensitive regular expression: '.' is literal, '*' matches
// any run of characters and '?' matches a single character. Other regex
// metacharacters in the mask are passed through unescaped.
private bool FitsMask(string fileName, string fileMask)
{
    string body = fileMask.Replace(".", "[.]");
    body = body.Replace("*", ".*");
    body = body.Replace("?", ".");

    string pattern = "^" + body + "$";
    Regex mask = new Regex(pattern, RegexOptions.IgnoreCase);
    return mask.IsMatch(fileName);
}
2009.11.04 Update: Match one of several masks
For even more flexibility, here is a plug-compatible method built on top of the original. This version lets you pass multiple masks (hence the plural on the second parameter name fileMasks) separated by lines, commas, vertical bars, or spaces. I wanted it so that I could let the user put as many choices as desired in a ListBox and then select all files matching any of them. Note that some controls (like a ListBox) use CR-LF for line breaks while others (e.g. RichTextBox) use just LF--that is why both "\r\n" and "\n" show up in the Split list.
// Tells whether fileName matches at least one of several masks. The masks are
// given as one string, separated by line breaks, commas, vertical bars or
// spaces; empty entries are ignored.
private bool FitsOneOfMultipleMasks(string fileName, string fileMasks)
{
    string[] separators = new string[] {"\r\n", "\n", ",", "|", " "};
    string[] masks = fileMasks.Split(separators, StringSplitOptions.RemoveEmptyEntries);

    foreach (string candidate in masks)
    {
        if (FitsMask(fileName, candidate))
        {
            return true;
        }
    }
    return false;
}
2009.11.17 Update: Handle fileMask inputs more gracefully
The earlier version of FitsMask (which I have left in for comparison) does a fair job but since we are treating it as a regular expression it will throw an exception if it is not a valid regular expression when it comes in. The solution is that we actually want any regex metacharacters in the input fileMask to be considered literals, not metacharacters. But we still need to treat period, asterisk, and question mark specially. So this improved version of FitsMask safely moves these three characters out of the way, transforms all remaining metacharacters into literals, then puts the three interesting characters back, in their "regex'ed" form.
One other minor improvement is to allow for case-independence, per standard Windows behavior.
// Tells whether fileName matches a wildcard mask, treating every regex
// metacharacter in the mask as a literal. The three special characters
// ('.', '*', '?') are first swapped for placeholder tokens, the result is
// regex-escaped, and the tokens are then replaced by their regex equivalents.
// The match is anchored and case-insensitive.
// Note: a mask that literally contains one of the placeholder tokens is
// rewritten as if it contained the corresponding special character.
private bool FitsMask(string fileName, string fileMask)
{
    const string DotToken = "__DOT__";
    const string StarToken = "__STAR__";
    const string QuestionToken = "__QM__";

    // Move the special characters out of the way.
    string shielded = fileMask.Replace(".", DotToken);
    shielded = shielded.Replace("*", StarToken);
    shielded = shielded.Replace("?", QuestionToken);

    // Everything still in the mask is now a literal.
    string escaped = Regex.Escape(shielded);

    // Bring the special characters back in their regex form.
    string body = escaped.Replace(DotToken, "[.]");
    body = body.Replace(StarToken, ".*");
    body = body.Replace(QuestionToken, ".");

    Regex matcher = new Regex("^" + body + "$", RegexOptions.IgnoreCase);
    return matcher.IsMatch(fileName);
}
2010.09.30 Update: Somewhere along the way, passion ensued...
I have been remiss in not updating this earlier but these references will likely be of interest to readers who have made it to this point:
I embedded the FitsMask method as the heart of a WinForms user control aptly called a FileMask--see the API here.
I then wrote an article featuring the FileMask control published on Simple-Talk.com, entitled Using LINQ Lambda Expressions to Design Customizable Generic Components. (While the method itself does not use LINQ, the FileMask user control does, hence the title of the article.)
Try this:
// Tells whether sFileName contains a match for the wildcard mask. '.' is made
// literal, '*' becomes "any run of characters" and '?' becomes "any single
// character". The pattern is not anchored and is case-sensitive.
private bool FitsMask(string sFileName, string sFileMask)
{
    string pattern = sFileMask.Replace(".", "[.]");
    pattern = pattern.Replace("*", ".*");
    pattern = pattern.Replace("?", ".");

    Regex mask = new Regex(pattern);
    return mask.IsMatch(sFileName);
}
Many people don't know that, but .NET includes an internal class, called "PatternMatcher" (under the "System.IO" namespace).
This static class contains only 1 method:
public static bool StrictMatchPattern(string expression, string name)
This method is used by .net whenever it needs to compare files with wildcard (FileSystemWatcher, GetFiles(), etc)
Using reflector, I exposed the code here.
Didn't really go through it to understand how it works, but it works great,
So this is the code for anyone who doesn't want to work with the inefficient RegEx way:
/// <summary>
/// Wildcard matcher for file names (decompiled framework code, kept structurally
/// intact including its labels and generated variable names).
/// </summary>
public static class PatternMatcher
{
    // Fields
    // Special expression characters handled by the matcher next to '*' and '?'.
    // They are referenced below by their literal values ('<', '>', '"').
    private const char ANSI_DOS_QM = '<';
    private const char ANSI_DOS_STAR = '>';
    private const char DOS_DOT = '"';
    private const int MATCHES_ARRAY_SIZE = 16;

    // Methods
    /// <summary>
    /// Tells whether <paramref name="name"/> matches the wildcard <paramref name="expression"/>.
    /// Both inputs are lower-cased first, so the comparison ignores case.
    /// </summary>
    /// <param name="expression">The wildcard expression, e.g. "*.txt".</param>
    /// <param name="name">The name to test.</param>
    /// <returns>True on a match; false on no match or when either argument is null or empty.</returns>
    public static bool StrictMatchPattern(string expression, string name)
    {
        // Validate before touching the strings: the lower-casing calls used to run
        // first, so a null argument threw NullReferenceException and this check
        // could never return false for null.
        if (((name == null) || (name.Length == 0)) || ((expression == null) || (expression.Length == 0)))
        {
            return false;
        }
        expression = expression.ToLowerInvariant();
        name = name.ToLowerInvariant();
        int num9;
        char ch = '\0';
        char ch2 = '\0';
        // sourceArray holds the matcher states that are live for the current name
        // character; numArray2 collects the states for the next one. The two are
        // swapped at the end of every outer iteration.
        int[] sourceArray = new int[16];
        int[] numArray2 = new int[16];
        // Becomes true once the whole name has been consumed.
        bool flag = false;
        // "*" and "*.*" match every name.
        if (expression.Equals("*") || expression.Equals("*.*"))
        {
            return true;
        }
        // Fast path: a single leading '*' followed by literal text is a suffix comparison.
        if ((expression[0] == '*') && (expression.IndexOf('*', 1) == -1))
        {
            int length = expression.Length - 1;
            if ((name.Length >= length) && (string.Compare(expression, 1, name, name.Length - length, length, StringComparison.OrdinalIgnoreCase) == 0))
            {
                return true;
            }
        }
        sourceArray[0] = 0;
        // num7: number of live states; num: position in name; num8: the state
        // value that means "expression fully matched".
        int num7 = 1;
        int num = 0;
        int num8 = expression.Length * 2;
        while (!flag)
        {
            int num3;
            if (num < name.Length)
            {
                ch = name[num];
                num3 = 1;
                num++;
            }
            else
            {
                flag = true;
                if (sourceArray[num7 - 1] == num8)
                {
                    break;
                }
            }
            int index = 0;
            int num5 = 0;
            int num6 = 0;
            while (index < num7)
            {
                int num2 = (sourceArray[index++] + 1) / 2;
                num3 = 0;
            Label_00F2:
                if (num2 != expression.Length)
                {
                    num2 += num3;
                    num9 = num2 * 2;
                    if (num2 == expression.Length)
                    {
                        numArray2[num5++] = num8;
                    }
                    else
                    {
                        ch2 = expression[num2];
                        num3 = 1;
                        // Grow both state arrays before they can run out of room.
                        if (num5 >= 14)
                        {
                            int num11 = numArray2.Length * 2;
                            int[] destinationArray = new int[num11];
                            Array.Copy(numArray2, destinationArray, numArray2.Length);
                            numArray2 = destinationArray;
                            destinationArray = new int[num11];
                            Array.Copy(sourceArray, destinationArray, sourceArray.Length);
                            sourceArray = destinationArray;
                        }
                        if (ch2 == '*')
                        {
                            numArray2[num5++] = num9;
                            numArray2[num5++] = num9 + 1;
                            goto Label_00F2;
                        }
                        if (ch2 == '>')
                        {
                            // flag2: another '.' follows later in the name.
                            bool flag2 = false;
                            if (!flag && (ch == '.'))
                            {
                                int num13 = name.Length;
                                for (int i = num; i < num13; i++)
                                {
                                    char ch3 = name[i];
                                    num3 = 1;
                                    if (ch3 == '.')
                                    {
                                        flag2 = true;
                                        break;
                                    }
                                }
                            }
                            if ((flag || (ch != '.')) || flag2)
                            {
                                numArray2[num5++] = num9;
                                numArray2[num5++] = num9 + 1;
                            }
                            else
                            {
                                numArray2[num5++] = num9 + 1;
                            }
                            goto Label_00F2;
                        }
                        num9 += num3 * 2;
                        switch (ch2)
                        {
                            case '<':
                                if (flag || (ch == '.'))
                                {
                                    goto Label_00F2;
                                }
                                numArray2[num5++] = num9;
                                goto Label_028D;
                            case '"':
                                if (flag)
                                {
                                    goto Label_00F2;
                                }
                                if (ch == '.')
                                {
                                    numArray2[num5++] = num9;
                                    goto Label_028D;
                                }
                                break;
                        }
                        if (!flag)
                        {
                            // '?' accepts any character; anything else must match exactly.
                            if (ch2 == '?')
                            {
                                numArray2[num5++] = num9;
                            }
                            else if (ch2 == ch)
                            {
                                numArray2[num5++] = num9;
                            }
                        }
                    }
                }
            Label_028D:
                if ((index < num7) && (num6 < num5))
                {
                    while (num6 < num5)
                    {
                        int num14 = sourceArray.Length;
                        while ((index < num14) && (sourceArray[index] < numArray2[num6]))
                        {
                            index++;
                        }
                        num6++;
                    }
                }
            }
            // No state survived this character: the name cannot match.
            if (num5 == 0)
            {
                return false;
            }
            int[] numArray4 = sourceArray;
            sourceArray = numArray2;
            numArray2 = numArray4;
            num7 = num5;
        }
        // A match means the last live state is the "expression fully matched" state.
        num9 = sourceArray[num7 - 1];
        return (num9 == num8);
    }
}
None of these answers quite seem to do the trick, and msorens's is needlessly complex. This one should work just fine:
// Tells whether fileName matches a wildcard mask. The mask is regex-escaped,
// the escaped '*' and '?' are turned into ".*" and ".", and the result is
// anchored at both ends. The match ignores case.
public static Boolean MatchesMask(string fileName, string fileMask)
{
    String escaped = Regex.Escape(fileMask);
    String wildcarded = escaped.Replace("\\*", ".*").Replace("\\?", ".");
    String anchored = String.Concat("^", wildcarded, "$");

    return new Regex(anchored, RegexOptions.IgnoreCase).IsMatch(fileName);
}
This makes sure possible regex chars in the mask are escaped, replaces the \* and \?, and surrounds it all by ^ and $ to mark the boundaries.
Of course, in most situations, it's far more useful to simply make this into a FileMaskToRegex tool function which returns the Regex object, so you just got it once and can then make a loop in which you check all strings from your files list on it.
// Builds a reusable, case-insensitive Regex for a wildcard mask. The mask is
// regex-escaped, the escaped '*' and '?' become ".*" and ".", and the pattern
// is anchored at both ends.
public static Regex FileMaskToRegex(string fileMask)
{
    String escaped = Regex.Escape(fileMask);
    String wildcarded = escaped.Replace("\\*", ".*").Replace("\\?", ".");
    String anchored = String.Concat("^", wildcarded, "$");

    return new Regex(anchored, RegexOptions.IgnoreCase);
}
Use WildCardPattern class from System.Management.Automation available as NuGet package or in Windows PowerShell SDK.
WildcardPattern pattern = new WildcardPattern("my*.txt");
bool fits = pattern.IsMatch("myfile.txt");
From Windows 7 using P/Invoke (without 260 char count limit):
// UNICODE_STRING for Rtl... method
// Managed counterpart of the structure passed by reference to
// NtDll.RtlIsNameInExpression. Sequential layout: the field order below is the
// order in which the fields are laid out for marshaling.
[StructLayout(LayoutKind.Sequential, CharSet = CharSet.Unicode)]
public struct UNICODE_STRING
{
// Both lengths are set by the constructor to buffer.Length * 2
// (presumably the size in bytes at two bytes per char — confirm against the native definition).
public ushort Length;
public ushort MaximumLength;
// The text itself, marshaled as a pointer to a wide-character string.
[MarshalAs(UnmanagedType.LPWStr)]
string Buffer;
// Wraps the given string; a null buffer produces zero lengths.
public UNICODE_STRING(string buffer)
{
if (buffer == null)
Length = MaximumLength = 0;
else
// unchecked cast: a product larger than ushort.MaxValue is truncated silently
// instead of throwing.
Length = MaximumLength = unchecked((ushort)(buffer.Length * 2));
Buffer = buffer;
}
}
// RtlIsNameInExpression method from NtDll.dll system library
// P/Invoke declaration for the RtlIsNameInExpression export of NtDll.dll.
public static class NtDll
{
// ExactSpelling = true: the entry point is looked up under exactly this name.
[DllImport("NtDll.dll", CharSet=CharSet.Unicode, ExactSpelling=true)]
// The native result is marshaled as a one-byte boolean.
[return: MarshalAs(UnmanagedType.U1)]
public extern static bool RtlIsNameInExpression(
// The wildcard expression and the name to test, both passed by reference.
ref UNICODE_STRING Expression,
ref UNICODE_STRING Name,
// Also marshaled as a one-byte boolean.
[MarshalAs(UnmanagedType.U1)]
bool IgnoreCase,
// NOTE(review): the caller in this file always passes IntPtr.Zero; presumably an
// optional native pointer argument — confirm against the native documentation.
IntPtr Zero
);
}
/// <summary>
/// Tells whether <paramref name="fileName"/> matches the wildcard <paramref name="mask"/>,
/// ignoring case, by calling the native RtlIsNameInExpression routine.
/// </summary>
/// <param name="mask">The wildcard expression; must not be null.</param>
/// <param name="fileName">The name to test.</param>
/// <returns>True when the native routine reports a match.</returns>
public bool MatchMask(string mask, string fileName)
{
    // Expression must be uppercase for IgnoreCase == true (see MSDN for RtlIsNameInExpression).
    // Invariant upper-casing keeps the result independent of the current culture.
    UNICODE_STRING expr = new UNICODE_STRING(mask.ToUpperInvariant());
    UNICODE_STRING name = new UNICODE_STRING(fileName);

    // The method is declared bool, so the native result has to be returned;
    // the earlier body had no return statement and did not compile.
    return NtDll.RtlIsNameInExpression(ref expr, ref name, true, IntPtr.Zero);
}
Fastest version of the previously proposed function:
/// <summary>
/// Tells whether <paramref name="filePath"/> matches at least one of the wildcard masks.
/// </summary>
/// <param name="filePath">The text to test.</param>
/// <param name="fileMasks">Wildcard masks ('*' = any run of characters, '?' = one character).</param>
/// <returns>True when any mask matches the whole of <paramref name="filePath"/>, ignoring case.</returns>
public static bool FitsMasks(string filePath, params string[] fileMasks)
{
    return FileMasksToRegex(fileMasks).IsMatch(filePath);
}

/// <summary>
/// Returns a case-insensitive Regex that matches a string fitting any of the masks.
/// Regex objects are cached and re-used for the same list of masks.
/// </summary>
/// <param name="fileMasks">Wildcard masks combined with OR.</param>
public static Regex FileMasksToRegex(params string[] fileMasks)
{
    // Key the cache by mask content. An array key compares by reference, so two
    // calls with equal masks would never share an entry.
    string cacheKey = string.Join("\0", fileMasks);

    Regex regex;
    if (!_maskRegexes.TryGetValue(cacheKey, out regex))
    {
        // The alternatives are wrapped in one group so that '^' and '$' apply to
        // every mask; "^(a)|(b)$" would anchor only the first mask at the start
        // and only the last one at the end.
        StringBuilder sb = new StringBuilder("^(?:");
        bool first = true;
        foreach (string fileMask in fileMasks)
        {
            if (first) first = false; else sb.Append("|");
            sb.Append('(');
            foreach (char c in fileMask)
            {
                switch (c)
                {
                    case '*': sb.Append(@".*"); break;
                    case '?': sb.Append(@"."); break;
                    default:
                        sb.Append(Regex.Escape(c.ToString()));
                        break;
                }
            }
            sb.Append(')');
        }
        sb.Append(")$");
        regex = new Regex(sb.ToString(), RegexOptions.IgnoreCase);
        _maskRegexes[cacheKey] = regex;
    }
    return regex;
}

// Cache of compiled mask expressions, keyed by the masks joined with '\0'.
static readonly Dictionary<string, Regex> _maskRegexes = new Dictionary<string, Regex>();
Notes:
Re-using Regex objects.
Using StringBuilder to optimize Regex creation (multiple .Replace() calls are slow).
Multiple masks, combined with OR.
Another version returning the Regex.
If PowerShell is available, it has direct support for wildcard type matching (as well as Regex).
WildcardPattern pat = new WildcardPattern("a*.b*");
if (pat.IsMatch(filename)) { ... }
I didn't want to copy the source code and, like @frankhommers, I came up with a reflection-based solution.
Notice the code comment about the use of wildcards in the name argument I found in the reference source.
// Thin wrapper that reaches the framework's internal System.IO.PatternMatcher
// through reflection instead of copying its source.
public static class PatternMatcher
{
    // Resolved once in the static constructor.
    static MethodInfo strictMatchPatternMethod;

    // Looks up System.IO.PatternMatcher.StrictMatchPattern in the assembly that
    // contains System.Uri. GetType is called with throwOnError = true, and a
    // missing method is reported with MissingMethodException.
    static PatternMatcher()
    {
        var typeName = "System.IO.PatternMatcher";
        var methodName = "StrictMatchPattern";

        var matcherType = typeof(Uri).Assembly.GetType(typeName, true);
        var method = matcherType.GetMethod(methodName, BindingFlags.Static | BindingFlags.Public);
        if (method == null)
        {
            throw new MissingMethodException($"{typeName}.{methodName} not found");
        }
        strictMatchPatternMethod = method;
    }

    /// <summary>
    /// Checks a name against an expression using strict (i.e. UNIX like) matching semantics.
    /// </summary>
    /// <param name="expression">The expression to match against.</param>
    /// <param name="name">The name to test; it must not contain a '*' wildcard.</param>
    /// <returns>The result reported by the framework's StrictMatchPattern method.</returns>
    public static bool StrictMatchPattern(string expression, string name)
    {
        // https://referencesource.microsoft.com/#system/services/io/system/io/PatternMatcher.cs
        // The reference source notes that if the class is ever exposed for generic
        // use, the name must not contain wildcards; its only caller there is
        // FileSystemWatcher, which never passes such a name. Enforce that here.
        if (name.Contains('*'))
        {
            throw new FormatException("Wildcard not allowed");
        }

        object[] arguments = new object[] { expression, name };
        object outcome = strictMatchPatternMethod.Invoke(null, arguments);
        return (bool)outcome;
    }
}
For .net Core the way microsoft does.
// Tells whether the file-name part of relativePath passes the configured
// filters. A path without a file name never matches; an empty filter list
// accepts every file name; otherwise one matching filter is enough.
private bool MatchPattern(ReadOnlySpan<char> relativePath)
{
    ReadOnlySpan<char> fileName = IO.Path.GetFileName(relativePath);
    if (fileName.Length == 0)
    {
        return false;
    }

    bool accepted = Filters.Count == 0;
    if (!accepted)
    {
        foreach (string candidate in Filters)
        {
            accepted = FileSystemName.MatchesSimpleExpression(candidate, fileName, ignoreCase: !PathInternal.IsCaseSensitive);
            if (accepted)
            {
                break;
            }
        }
    }
    return accepted;
}
The way microsoft itself seemed to do for .NET 4.6 is documented in github:
// Tells whether the file-name part of relativePath matches the current filter.
// Both the filter and the name are upper-cased with the invariant culture before
// being handed to PatternMatcher.StrictMatchPattern; a null file name never matches.
private bool MatchPattern(string relativePath) {
    string fileName = System.IO.Path.GetFileName(relativePath);
    if (fileName == null) {
        return false;
    }

    string upperFilter = filter.ToUpper(CultureInfo.InvariantCulture);
    string upperName = fileName.ToUpper(CultureInfo.InvariantCulture);
    return PatternMatcher.StrictMatchPattern(upperFilter, upperName);
}
My version, which supports ** wild card:
// Converts a file mask into a case-insensitive Regex.
//   **  matches any run of characters, including path separators
//   *   matches any run of characters without a path separator
//   ?   matches one character that is not a path separator
//   / and \ both match either path separator
// The mask has to match the end of the input; any leading directories in front
// of it are allowed.
static Regex FileMask2Regex(string mask)
{
    var sb = new StringBuilder(mask);
    // hide wildcards behind unique tokens so Regex.Escape leaves them alone
    // ("**" first, so it is not consumed as two single stars)
    sb.Replace("**", "affefa0d52e84c2db78f5510117471aa-StarStar");
    sb.Replace("*", "affefa0d52e84c2db78f5510117471aa-Star");
    sb.Replace("?", "affefa0d52e84c2db78f5510117471aa-Question");
    sb.Replace("/", "affefa0d52e84c2db78f5510117471aa-Slash");
    sb.Replace("\\", "affefa0d52e84c2db78f5510117471aa-Slash");
    sb = new StringBuilder(Regex.Escape(sb.ToString()));
    // unhide wildcards ("-StarStar" first, because "-Star" is a prefix of it).
    // The patterns must be verbatim strings (@"..."); the '#' prefix that had
    // replaced '@' does not compile.
    sb.Replace("affefa0d52e84c2db78f5510117471aa-StarStar", @".*");
    sb.Replace("affefa0d52e84c2db78f5510117471aa-Star", @"[^/\\]*");
    sb.Replace("affefa0d52e84c2db78f5510117471aa-Question", @"[^/\\]");
    sb.Replace("affefa0d52e84c2db78f5510117471aa-Slash", @"[/\\]");
    sb.Append("$");
    // allowed to have prefix
    sb.Insert(0, @"^(?:.*?[/\\])?");
    return new Regex(sb.ToString(), RegexOptions.IgnoreCase);
}
How about using reflection to get access to the function in the .NET framework?
Like this:
// Exposes the framework's internal System.IO.PatternMatcher.StrictMatchPattern
// through a delegate field that is bound by reflection in the constructor.
public class PatternMatcher
{
    public delegate bool StrictMatchPatternDelegate(string expression, string name);

    // Assigned in the constructor; forwards to the reflected method.
    public StrictMatchPatternDelegate StrictMatchPattern;

    public PatternMatcher()
    {
        // The type is looked up in the assembly that contains FileSystemWatcher.
        var hostAssembly = typeof(FileSystemWatcher).Assembly;
        Type matcherType = hostAssembly.GetType("System.IO.PatternMatcher");

        var lookupFlags = System.Reflection.BindingFlags.Static | System.Reflection.BindingFlags.Public;
        MethodInfo matchMethod = matcherType.GetMethod("StrictMatchPattern", lookupFlags);

        StrictMatchPattern = delegate(string expression, string name)
        {
            object outcome = matchMethod.Invoke(null, new object[] { expression, name });
            return (bool)outcome;
        };
    }
}
// Demo: checks "test.txt" against two masks and prints each result.
void Main()
{
    var matcher = new PatternMatcher();

    // "*.txt" displays true, "*.doc" displays false
    foreach (string mask in new string[] { "*.txt", "*.doc" })
    {
        Console.WriteLine(matcher.StrictMatchPattern(mask, "test.txt"));
    }
}