regex split with exceptions - c#

This is an extension to this SO question. This question considers two different enclosing characters, in contrast to the original question.
I would like to split by (white)spaces of any number but ignore everything between <> AND "". So this string:
string Line = "1 2 <1 2> \"hello world\" 3";
Should result in this:
1, 2, <1 2>, \"hello world\", 3

Instead of Split, I'll use Matches
// Match the tokens instead of splitting on the separators: a token is either a run
// enclosed in <...> or "..." (which may contain spaces) or a plain run of word characters.
string Line = "1 2 <1 2> \"hello world\" 3";
// The pattern must be a verbatim string (@"..."); inside it a doubled quote ("") stands for one quote.
var parts = Regex.Matches(Line, @"[<\""]{1}[\w \d]+?[>\""]{1}|[\w\d]+")
    .Cast<Match>()
    .Select(m => m.Value)
    .ToArray();
PS: This would also match "abc def>. But I ignored it to make the regex shorter

This is what I came up with so far:
/// <summary>
/// Splits <paramref name="input"/> on whitespace, treating everything between
/// <c>&lt;</c>...<c>&gt;</c> or between two double quotes as part of a single token.
/// The enclosing characters are kept in the returned tokens.
/// </summary>
/// <param name="input">Text to split; null or empty yields an empty array.</param>
/// <returns>The tokens in order of appearance.</returns>
public static string[] GetSplitStrings(string input)
{
    if (string.IsNullOrEmpty(input))
    {
        return new string[0];
    }

    var splitStrings = new List<string>();
    var sb = new StringBuilder();
    // Character that closes the enclosure we are currently inside; '\0' means "not enclosed".
    // Tracking the expected closer (rather than counting delimiters) lets <> contain "
    // and "" contain < or > without ending the token early.
    var closer = '\0';

    foreach (var character in input)
    {
        if (closer != '\0')
        {
            sb.Append(character);
            if (character == closer)
            {
                splitStrings.Add(sb.ToString());
                sb.Clear();
                closer = '\0';
            }
            continue;
        }

        if (char.IsWhiteSpace(character))
        {
            if (sb.Length > 0)
            {
                splitStrings.Add(sb.ToString());
                sb.Clear();
            }
            continue;
        }

        if (character == '<')
        {
            closer = '>';
        }
        else if (character == '"')
        {
            closer = '"';
        }
        sb.Append(character);
    }

    // Flush the last token: the input usually does not end with a separator.
    if (sb.Length > 0)
    {
        splitStrings.Add(sb.ToString());
    }
    return splitStrings.ToArray();
}
Would prefer a neat regex solution.

Related

Editing string in C#

given a string with words separated by spaces how would you go about merging two words if one of them is made by one character only ? An example should clarify:
"a bcd tttt" => "abcd tttt"
"abc d hhhh" => "abcd hhhh"
I would like to merge the single-character word with the word on its left in all cases, except when it is the first word in the string; in that case I would like to merge it with the word on its right.
I am trying to loop through the string and create some logic but it turned out to be more complex than i was expecting.
Try the below program's approach:
using System;
using System.Text;
public class Program
{
    // Rebuilds a fixed sample sentence so that every single-character word
    // (other than the first word) is glued onto the word before it, then prints it.
    public static void Main()
    {
        var source = "abc d hhhh";
        var words = source.Split(new char[] { ' ' });
        var merged = new StringBuilder();
        var position = 0;
        foreach (var current in words)
        {
            // Only multi-character words that are not first get a separating space.
            var needsSeparator = position != 0 && current.Length > 1;
            if (needsSeparator)
            {
                merged.Append(" ");
            }
            merged.Append(current);
            position++;
        }
        Console.WriteLine(merged.ToString());
    }
}
Basically, you split the string to words, then using StringBuilder, build a new string, inserting a space before a word only if the word is larger than one character.
One way to approach this is to first use string.Split(' ') to get an array of words, which is easier to deal with.
Then you can loop though the words, handling single character words by concatenating them with the previous word, with special handling for the first word.
One such approach:
// Prints the sample text with single-character words merged into a neighbour:
// a leading single-character word absorbs the word after it, any other one is
// appended to the text built so far. Multi-character words are prefixed with a space.
public static void Main()
{
    string data = "abcd hhhh";
    string[] tokens = data.Split(' ');
    var merged = new StringBuilder();
    int position = 0;
    while (position < tokens.Length)
    {
        string token = tokens[position];
        if (token.Length != 1)
        {
            merged.Append(' ' + token);
        }
        else
        {
            merged.Append(token);
            // Single character first word is a special case: merge with the next word.
            bool isLeadingOrphan = position == 0 && position < tokens.Length - 1;
            if (isLeadingOrphan)
            {
                position++; // consume the next word here so the loop skips it
                merged.Append(tokens[position]);
            }
        }
        position++;
    }
    var result = merged.ToString();
    Console.WriteLine(result);
}
Note that this will concatenate multiple instances of single-letter words, so that "a b c d e" will result in "abcde" and "ab c d e fg" will result in "abcde fg". You don't actually specify what should happen in this case.
if you want to do it with a plain for loop and string walking:
using System;
using System.Text;
public class Program
{
    // Each line prints True when MergeOrphant returns the expected text.
    public static void Main()
    {
        Console.WriteLine(MergeOrphant("bcd a tttt") == "bcda tttt");
        Console.WriteLine(MergeOrphant("bcd a tttt a") == "bcda tttta");
        Console.WriteLine(MergeOrphant("a bcd tttt") == "abcd tttt");
        Console.WriteLine(MergeOrphant("a b") == "ab");
    }

    /// <summary>
    /// Removes the whitespace that separates a single-letter word from its neighbour:
    /// a single letter at the start is joined to the following word, a single letter
    /// elsewhere is joined to the preceding word.
    /// </summary>
    /// <param name="source">Text to process; must not be null.</param>
    /// <returns>The text with the separating whitespace characters dropped.</returns>
    private static string MergeOrphant(string source)
    {
        var stringBuilder = new StringBuilder();
        for (var i = 0; i < source.Length; i++)
        {
            // Leading single letter: skip the whitespace right after it.
            if (i == 1 && char.IsWhiteSpace(source[i]) && char.IsLetter(source[i - 1]))
            {
                i++;
            }
            // Whitespace between a letter and a lone letter (followed by whitespace or the end):
            // skip it. The "i + 1 < Length" guard keeps the look-ahead inside the string, which
            // previously threw on input ending in whitespace.
            if (i > 0
                && i + 1 < source.Length
                && char.IsWhiteSpace(source[i])
                && char.IsLetter(source[i - 1])
                && char.IsLetter(source[i + 1])
                && (i + 2 == source.Length || char.IsWhiteSpace(source[i + 2])))
            {
                i++;
            }
            // The first skip above can move i past the end (e.g. "a ").
            if (i < source.Length)
            {
                stringBuilder.Append(source[i]);
            }
        }
        return stringBuilder.ToString();
    }
}
Quite short with Regex.
// Two passes: first glue a leading single-character word onto the word after it,
// then glue every later single-character word onto the word before it.
// The patterns must be verbatim strings (@"...") so the backslashes reach the regex engine.
string foo = "a bcd b tttt";
foo = Regex.Replace(foo, @"^(\w) (\w{2,})", "$1$2");
foo = Regex.Replace(foo, @"(\w{2,}) (\w)\b", "$1$2");
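Be aware that \w is roughly [a-zA-Z0-9_] (in .NET it also matches other Unicode letters and digits unless RegexOptions.ECMAScript is used); if you need another definition, you have to define your own character class.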
My answer would not be the best practice but it works for your second case, but still you should be clear about the letter merging rules.
// Runs Edit over a few sample inputs and prints True for each one that yields the expected text.
public static void Main()
{
    var cases = new[]
    {
        new[] { "abc d hhhh", "abcd hhhh" },
        new[] { "abc d hhhh a", "abcd hhhha" },
        new[] { "abc d hhhh a b", "abcd hhhhab" },
        new[] { "abc d hhhh a def g", "abcd hhhha defg" },
    };
    foreach (var pair in cases)
    {
        Console.WriteLine(Edit(pair[0]) == pair[1]);
    }
}
/// <summary>
/// Re-joins the space-separated words of <paramref name="str"/>, appending every
/// single-character word (other than the first word) directly to the text before it.
/// </summary>
/// <param name="str">Text whose words are separated by spaces.</param>
/// <returns>The rebuilt text; runs of spaces collapse because empty entries are dropped.</returns>
public static string Edit(string str)
{
    var words = str.Split(' ', StringSplitOptions.RemoveEmptyEntries);
    var result = string.Empty;
    var isFirst = true;
    foreach (var word in words)
    {
        // The first word and single-character words attach directly; the rest get a space.
        var attachDirectly = isFirst || word.Length == 1;
        result += attachDirectly ? word : $" {word}";
        isFirst = false;
    }
    return result;
}
As I have mentioned above, this does not work for your 1st case which is : Edit("a bcd") would not generate "abcd".
Expanding on Matthew's answer,
If you don't want the extra space in the output you can change the last line to;
Console.WriteLine(result.TrimStart(' '));

Need RegEx or some other way for separating quoted tokens containing escaped quotes

Basically, my task is to parse this command line:
-p "This is a string ""with quotes""" d:\1.txt "d:\some folder\1.out"
What I need is to split this string into:
-p
This is a string "with quotes"
d:\1.txt
d:\some folder\1.out
I searched (yes, I really did), but all examples I found either had not included escaped quotes or used \" for escape symbol.
I would use a real csv-parser instead, for example the only one available in .NET:
// Tokenise the command line with the framework's delimited-text parser:
// a single space is the delimiter and fields may be enclosed in double quotes.
string str = "-p \"This is a string \"\"with quotes\"\"\" d:\\1.txt \"d:\\some folder\\1.out\"";
var allLineFields = new List<string[]>();
using (var parser = new Microsoft.VisualBasic.FileIO.TextFieldParser(new StringReader(str)))
{
    parser.Delimiters = new string[] { " " };
    parser.HasFieldsEnclosedInQuotes = true; // the setting that makes quoted fields work
    // ReadFields returns null once there is nothing left to read.
    for (string[] fields = parser.ReadFields(); fields != null; fields = parser.ReadFields())
    {
        allLineFields.Add(fields);
    }
}
With your sample string the list contains a single string[] with your four tokens:
-p
This is a string "with quotes"
d:\1.txt
d:\some folder\1.out
Using a regex (if you insist on not using a parser as Tim Schmelter's answer suggested), something like this should work (it matches the given string, but I can't guarantee it's completely bullet-proof):
((?:"(?:[^"]|"")*")|\S+)
Breaking it down, you are grouping either:
A quote " followed by not a quote ^" or two quotes "", followed by a quote "
A bunch (one or more) of non-space characters \S
See here to play around with it.
A handwritten version:
/// <summary>
/// Splits a command line into arguments. Arguments are separated by whitespace;
/// text inside double quotes keeps its whitespace, and a doubled quote ("") inside
/// a quoted section stands for one literal quote. The enclosing quotes are removed.
/// </summary>
/// <param name="text">Command line to parse.</param>
/// <returns>The arguments in order; an empty array for null or blank input.</returns>
private static string[] ParseArguments(string text)
{
    if (string.IsNullOrWhiteSpace(text))
    {
        return new string[0];
    }

    var arguments = new List<string>(8);
    var current = new StringBuilder(64);
    var quoted = false;
    var index = 0;
    while (index < text.Length)
    {
        var ch = text[index];
        index++; // index now points at the character after ch

        if (ch == '"')
        {
            if (!quoted)
            {
                quoted = true;
            }
            else if (index < text.Length && text[index] == '"')
            {
                // Doubled quote inside a quoted section: emit one quote, consume both.
                current.Append('"');
                index++;
            }
            else
            {
                quoted = false;
            }
            continue;
        }

        if (quoted || !char.IsWhiteSpace(ch))
        {
            current.Append(ch);
            continue;
        }

        // Unquoted whitespace ends the current argument, if there is one.
        if (current.Length > 0)
        {
            arguments.Add(current.ToString());
            current.Length = 0;
        }
    }

    if (current.Length > 0)
    {
        arguments.Add(current.ToString());
    }
    return arguments.ToArray();
}

Split string by commas ignoring any punctuation marks (including ',') in quotation marks

How can I split string (from a textbox) by commas excluding those in double quotation marks (without getting rid of the quotation marks), along with other possible punctuation marks (e.g. ' . ' ' ; ' ' - ')?
E.g. If someone entered the following into the textbox:
apple, orange, "baboons, cows", rainbow, "unicorns, gummy bears"
How can I split the above string into the following (say, into a List)?
apple
orange
"baboons, cows"
rainbow
"Unicorns, gummy bears..."
Thank you for your help!
You could try the below regex which uses positive lookahead,
// Split on ", " only where the text that follows is either a quoted item or an
// unquoted item that runs up to the next comma or the end of the string.
// Both literals must be verbatim strings (@"..."), where "" stands for one quote.
string value = @"apple, orange, ""baboons, cows"", rainbow, ""unicorns, gummy bears""";
string[] lines = Regex.Split(value, @", (?=(?:""[^""]*?(?: [^""]*)*))|, (?=[^"",]+(?:,|$))");
foreach (string line in lines)
{
    Console.WriteLine(line);
}
Output:
apple
orange
"baboons, cows"
rainbow
"unicorns, gummy bears"
IDEONE
Try this:
// One match per field: an optional separating comma and whitespace, then group 1 holding
// either a quoted field ("" inside it is an escaped quote) or a run of non-comma characters.
// Skipping the whitespace after the comma matters: without it a quoted field that follows
// ", " is never recognised as quoted and gets cut at its inner commas.
Regex str = new Regex("(?:^|,)\\s*(\"(?:[^\"]|\"\")*\"|[^,]*)", RegexOptions.Compiled);
foreach (Match m in str.Matches(input))
{
    // Group 1 excludes the separator, so no TrimStart(',') is needed.
    Console.WriteLine(m.Groups[1].Value.Trim());
}
You may also try to look at FileHelpers
Much like a CSV parser, instead of Regex, you can loop through each character, like so:
/// <summary>
/// Splits a comma-separated string into items, ignoring commas that appear between
/// double quotes. Quote characters are dropped and spaces at the start of an item are skipped.
/// </summary>
/// <param name="inputString">Comma-separated text, optionally with quoted items.</param>
/// <returns>The items in order; a trailing empty item is not added.</returns>
public List<string> ItemStringToList(string inputString)
{
    var items = new List<string>();
    var pending = "";
    var insideQuotes = false;
    foreach (char ch in inputString)
    {
        if (ch == '"')
        {
            // Quotes only toggle the state; they are never copied to the item.
            insideQuotes = !insideQuotes;
        }
        else if (ch == ',' && !insideQuotes)
        {
            items.Add(pending);
            pending = "";
        }
        else if (ch == ' ' && pending == "")
        {
            // Skip spaces before the first character of an item.
        }
        else
        {
            pending += ch;
        }
    }
    if (pending != "")
    {
        items.Add(pending);
    }
    return items;
}
Example test usage:
// Expected results below follow from ItemStringToList as written: quotes are stripped,
// leading spaces of an item are skipped, and commas inside quotes do not split.
var test1 = ItemStringToList("one, two, three"); // one | two | three
var test2 = ItemStringToList("one, \"two\", three"); // one | two | three
var test3 = ItemStringToList("one, \"two, three\""); // one | two, three
var test4 = ItemStringToList("one, \"two, three\", four, \"five six\", seven"); // one | two, three | four | five six | seven
var test5 = ItemStringToList("one, \"two, three\", four, \"five six\", seven"); // same input as test4
var test6 = ItemStringToList("one, \"two, three\", four, \"five six, seven\""); // one | two, three | four | five six, seven
var test7 = ItemStringToList("\"one, two, three\", four, \"five six, seven\""); // one, two, three | four | five six, seven
You could change it to use StringBuilder if you want faster character joining.
Try this; it will work. You can split a string array in many ways. If you want to split by white space, just put a space in (' ').
namespace LINQExperiment1
{
    // Demonstrates the difference between Select and SelectMany when each source
    // element is itself split into several words.
    class Program
    {
        static void Main(string[] args)
        {
            string[] sentence = new string[] { "apple", "orange", "baboons cows", " rainbow", "unicorns gummy bears" };

            Console.WriteLine("option 1:"); Console.WriteLine("————-");
            // option 1: Select returns one string[] per source element
            // (five arrays here), each holding that element's words.
            IEnumerable<string[]> words1 =
                sentence.Select(w => w.Split(' '));
            // to get each word, we have to use two foreach loops
            foreach (string[] segment in words1)
                foreach (string word in segment)
                    Console.WriteLine(word);
            Console.WriteLine();

            Console.WriteLine("option 2:"); Console.WriteLine("————-");
            // option 2: SelectMany flattens the per-element arrays into nine strings
            // (" rainbow" contributes an empty string for its leading space).
            // It must split on ' ' like options 1 and 3; splitting on ',' would
            // return the five elements unchanged because none contains a comma.
            IEnumerable<string> words2 =
                sentence.SelectMany(segment => segment.Split(' '));
            // with SelectMany we have every string individually
            foreach (var word in words2)
                Console.WriteLine(word);

            // option 3: identical to option 2 above, written using
            // the query expression syntax (multiple froms).
            // The query is only declared here; this demo never enumerates it.
            IEnumerable<string> words3 = from segment in sentence
                                         from word in segment.Split(' ')
                                         select word;
        }
    }
}
This was trickier than I thought, a good practical problem I think.
Below is the solution I came up with for this. One thing I don't like about my solution is having to add double quotations back and the other one being names of the variables :p:
internal class Program
{
    // Splits a sample string on commas while leaving commas inside double quotes alone,
    // prints every non-empty item, then waits for a key press.
    private static void Main(string[] args)
    {
        // Verbatim string (@"..."): a doubled quote ("") stands for one quote character.
        string searchString =
            @"apple, orange, ""baboons, cows. dogs- hounds"", rainbow, ""unicorns, gummy bears"", abc, defghj";
        char delimeter = ',';
        char excludeSplittingWithin = '"';
        string[] splittedByExcludeSplittingWithin = searchString.Split(excludeSplittingWithin);
        List<string> splittedSearchString = new List<string>();
        for (int i = 0; i < splittedByExcludeSplittingWithin.Length; i++)
        {
            // After splitting on the quote character, even-indexed segments lie outside
            // quotes and odd-indexed segments lie inside them. Using the index (rather
            // than checking whether the segment starts with the delimiter) also handles
            // quoted text that itself begins with a comma.
            if (i % 2 == 0)
            {
                string[] splitttedByDelimeter = splittedByExcludeSplittingWithin[i].Split(delimeter);
                for (int j = 0; j < splitttedByDelimeter.Length; j++)
                {
                    splittedSearchString.Add(splitttedByDelimeter[j].Trim());
                }
            }
            else
            {
                // Put back the quotes that Split removed.
                splittedSearchString.Add(excludeSplittingWithin + splittedByExcludeSplittingWithin[i] +
                                         excludeSplittingWithin);
            }
        }
        foreach (string s in splittedSearchString)
        {
            // Splitting around quoted items leaves empty entries; skip them.
            if (s.Trim() != string.Empty)
            {
                Console.WriteLine(s);
            }
        }
        Console.ReadKey();
    }
}
Another Regex solution:
/// <summary>
/// Extracts the items of a ", "-separated list. An item is either a double-quoted
/// string (kept with its quotes; "" inside it is an escaped quote) or a run of
/// characters containing no comma or whitespace.
/// </summary>
/// <param name="input">The list text to parse.</param>
/// <returns>The matched items, in order of appearance.</returns>
private static IEnumerable<string> Parse(string input)
{
    // if used frequently, should be instantiated with Compiled option
    // Verbatim string: \"" is a backslash followed by one quote, i.e. the regex \"
    Regex regex = new Regex(@"(?<=^|,\s)(\""(?:[^\""]|\""\"")*\""|[^,\s]*)");
    // Cast<Match>() keeps this working where MatchCollection is only a non-generic collection.
    return regex.Matches(input).Cast<Match>().Select(m => m.Value);
}

Best way to split string into lines with maximum length, without breaking words

I want to break a string up into lines of a specified maximum length, without splitting any words, if possible (if there is a word that exceeds the maximum line length, then it will have to be split).
As always, I am acutely aware that strings are immutable and that one should preferably use the StringBuilder class. I have seen examples where the string is split into words and the lines are then built up using the StringBuilder class, but the code below seems "neater" to me.
I mentioned "best" in the description and not "most efficient" as I am also interested in the "eloquence" of the code. The strings will never be huge, generally splitting into 2 or three lines, and it won't be happening for thousands of lines.
Is the following code really bad?
/// <summary>
/// Breaks <paramref name="stringToSplit"/> into lines of at most
/// <paramref name="maximumLineLength"/> characters, breaking at spaces where possible.
/// A word longer than the limit is cut at the limit.
/// </summary>
/// <param name="stringToSplit">Text to wrap; null or blank yields no lines.</param>
/// <param name="maximumLineLength">Maximum characters per line; must be at least 1.</param>
/// <returns>The lines, without leading or trailing whitespace.</returns>
/// <exception cref="ArgumentOutOfRangeException">The limit is smaller than 1.</exception>
private static IEnumerable<string> SplitToLines(string stringToSplit, int maximumLineLength)
{
    if (maximumLineLength < 1)
    {
        // A limit of 0 would never consume any text and loop forever.
        throw new ArgumentOutOfRangeException(nameof(maximumLineLength));
    }

    var lines = new List<string>();
    var remaining = (stringToSplit ?? string.Empty).Trim();
    while (remaining.Length > maximumLineLength)
    {
        // Search backwards starting AT index maximumLineLength: a space exactly there means
        // the first maximumLineLength characters already form a complete line.
        var breakAt = remaining.LastIndexOf(' ', maximumLineLength);
        if (breakAt > 0)
        {
            lines.Add(remaining.Substring(0, breakAt).TrimEnd());
            remaining = remaining.Substring(breakAt + 1).TrimStart();
        }
        else
        {
            // No usable space: the word is longer than a line and has to be cut.
            lines.Add(remaining.Substring(0, maximumLineLength));
            remaining = remaining.Substring(maximumLineLength).TrimStart();
        }
    }
    if (remaining.Length > 0)
    {
        lines.Add(remaining);
    }
    return lines.ToArray();
}
Even when this post is 3 years old I wanted to give a better solution using Regex to accomplish the same:
If you want the string to be splitted and then use the text to be displayed you can use this:
/// <summary>
/// Inserts a line feed after every run of up to <paramref name="maximumLineLength"/>
/// characters that is followed by whitespace or the end of the text.
/// </summary>
/// <param name="stringToSplit">Text to wrap.</param>
/// <param name="maximumLineLength">Maximum characters per line.</param>
/// <returns>The text with each line terminated by "\n"; the whitespace at a break is consumed.</returns>
public string SplitToLines(string stringToSplit, int maximumLineLength)
{
    // The pattern pieces must be verbatim strings (@"...") so \s reaches the regex engine.
    return Regex.Replace(stringToSplit, @"(.{1," + maximumLineLength + @"})(?:\s|$)", "$1\n");
}
If on the other hand you need a collection you can use this:
/// <summary>
/// Finds every run of up to <paramref name="maximumLineLength"/> characters that is
/// followed by whitespace or the end of the text.
/// </summary>
/// <param name="stringToSplit">Text to wrap.</param>
/// <param name="maximumLineLength">Maximum characters per line.</param>
/// <returns>One match per line; group 1 of each match holds the line without the trailing whitespace.</returns>
public MatchCollection SplitToLines(string stringToSplit, int maximumLineLength)
{
    // The pattern pieces must be verbatim strings (@"...") so \s reaches the regex engine.
    return Regex.Matches(stringToSplit, @"(.{1," + maximumLineLength + @"})(?:\s|$)");
}
NOTES
Remember to import regex (using System.Text.RegularExpressions;)
You can use string interpolation on the match:
$@"(.{{1,{maximumLineLength}}})(?:\s|$)"
The MatchCollection works almost like an Array
Matching example with explanation here
How about this as a solution:
/// <summary>
/// Packs the space-separated words of <paramref name="stringToSplit"/> into lines of at
/// most <paramref name="maximumLineLength"/> characters. A word longer than the limit is
/// cut into limit-sized pieces.
/// </summary>
/// <param name="stringToSplit">Text to wrap; must not be null.</param>
/// <param name="maximumLineLength">Maximum characters per line; must be at least 1.</param>
/// <returns>The lines in order.</returns>
/// <exception cref="ArgumentOutOfRangeException">The limit is smaller than 1.</exception>
IEnumerable<string> SplitToLines(string stringToSplit, int maximumLineLength)
{
    if (maximumLineLength < 1)
    {
        // With a limit of 0 the splitting loop below would never make progress.
        throw new ArgumentOutOfRangeException(nameof(maximumLineLength));
    }

    var words = stringToSplit.Split(' ');
    var lines = new List<string> { words[0] };

    // One extra pass (next == words.Length) breaks up an over-long final line. Doing this
    // with an explicit pass instead of an empty sentinel word avoids the trailing space /
    // extra empty line that the sentinel used to leave behind.
    for (var next = 1; next <= words.Length; next++)
    {
        var last = lines[lines.Count - 1];
        while (last.Length > maximumLineLength)
        {
            lines[lines.Count - 1] = last.Substring(0, maximumLineLength);
            last = last.Substring(maximumLineLength);
            lines.Add(last);
        }

        if (next == words.Length)
        {
            break;
        }

        var candidate = last + " " + words[next];
        if (candidate.Length > maximumLineLength)
        {
            lines.Add(words[next]);
        }
        else
        {
            lines[lines.Count - 1] = candidate;
        }
    }
    return lines;
}
I reworked this, as I prefer this version:
/// <summary>
/// Lazily packs the space-separated words of <paramref name="stringToSplit"/> into lines
/// of at most <paramref name="maximumLineLength"/> characters. A word longer than the
/// limit is cut into limit-sized pieces.
/// </summary>
/// <param name="stringToSplit">Text to wrap; must not be null.</param>
/// <param name="maximumLineLength">Maximum characters per line; must be at least 1.</param>
/// <returns>The lines in order (produced on enumeration).</returns>
/// <exception cref="ArgumentOutOfRangeException">The limit is smaller than 1 (raised on first enumeration).</exception>
IEnumerable<string> SplitToLines(string stringToSplit, int maximumLineLength)
{
    if (maximumLineLength < 1)
    {
        throw new ArgumentOutOfRangeException(nameof(maximumLineLength));
    }

    var words = stringToSplit.Split(' ');
    var line = words.First();
    foreach (var word in words.Skip(1))
    {
        // "line" can only exceed the limit when it is a single over-long word:
        // emit full-width pieces and keep the remainder as the current line.
        while (line.Length > maximumLineLength)
        {
            yield return line.Substring(0, maximumLineLength);
            line = line.Substring(maximumLineLength);
        }

        var test = $"{line} {word}";
        if (test.Length > maximumLineLength)
        {
            yield return line;
            line = word;
        }
        else
        {
            line = test;
        }
    }

    // The last word may itself be over-long.
    while (line.Length > maximumLineLength)
    {
        yield return line.Substring(0, maximumLineLength);
        line = line.Substring(maximumLineLength);
    }
    yield return line;
}
I don't think your solution is too bad. I do, however, think you should break up your ternary into an if else because you are testing the same condition twice. Your code might also have a bug. Based on your description, it seems you want lines <= maxLineLength, but your code counts the space after the last word and uses it in the <= comparison resulting in effectively < behavior for the trimmed string.
Here is my solution.
/// <summary>
/// Lazily packs the space-separated words of <paramref name="stringToSplit"/> into lines
/// no longer than <paramref name="maxLineLength"/>, cutting words that exceed the limit.
/// </summary>
/// <param name="stringToSplit">Text to wrap.</param>
/// <param name="maxLineLength">Maximum characters per line.</param>
/// <returns>The trimmed lines; the final line is always yielded, even when empty.</returns>
private static IEnumerable<string> SplitToLines(string stringToSplit, int maxLineLength)
{
    // The buffer always ends with one space after the last word; that space is
    // trimmed away when the line is emitted.
    var buffer = new StringBuilder();
    foreach (string token in stringToSplit.Split(' '))
    {
        bool fits = token.Length + buffer.Length <= maxLineLength;
        if (fits)
        {
            buffer.Append(token + " ");
            continue;
        }

        if (buffer.Length > 0)
        {
            yield return buffer.ToString().Trim();
            buffer.Clear();
        }

        // Emit full-width pieces of an over-long word; the remainder starts the next line.
        string rest = token;
        while (rest.Length > maxLineLength)
        {
            string piece = rest.Substring(0, maxLineLength);
            rest = rest.Substring(maxLineLength);
            yield return piece;
        }
        buffer.Append(rest + " ");
    }
    yield return buffer.ToString().Trim();
}
It is a bit longer than your solution, but it should be more straightforward. It also uses a StringBuilder so it is much faster for large strings. I performed a benchmarking test for 20,000 words ranging from 1 to 11 characters each split into lines of 10 character width. My method completed in 14ms compared to 1373ms for your method.
Try this (untested)
/// <summary>
/// Lazily packs the space-separated words of <paramref name="value"/> into lines of at
/// most <paramref name="maximumLineLength"/> characters. Words longer than the limit are
/// not cut; each is emitted on a line of its own.
/// </summary>
/// <param name="value">Text to wrap; must not be null.</param>
/// <param name="maximumLineLength">Maximum characters per line.</param>
/// <returns>The lines in order; the final line is always yielded.</returns>
private static IEnumerable<string> SplitToLines(string value, int maximumLineLength)
{
    var line = new StringBuilder();
    foreach (var word in value.Split(' '))
    {
        // Length the line would have with this word added; the separating space
        // only counts when the line already has content.
        var needed = line.Length == 0 ? word.Length : line.Length + 1 + word.Length;

        // Flush only a non-empty line: previously a first word at least as long as
        // the limit produced a spurious empty line before it.
        if (needed > maximumLineLength && line.Length > 0)
        {
            yield return line.ToString();
            line.Clear();
        }

        if (line.Length > 0)
        {
            line.Append(' ');
        }
        line.Append(word);
    }
    yield return line.ToString();
}
~6x faster than the accepted answer
More than 1.5x faster than the Regex version in Release Mode (dependent on line length)
Optionally keep the space at the end of the line or not (the regex version always keeps it)
/// <summary>
/// Lazily breaks <paramref name="stringToSplit"/> into lines of at most
/// <paramref name="maximumLineLength"/> characters in a single pass, breaking at spaces
/// or line feeds where possible. A word longer than the limit is cut at the limit.
/// </summary>
/// <param name="stringToSplit">Text to wrap; must not be null.</param>
/// <param name="maximumLineLength">Maximum characters per line; must be at least 1.</param>
/// <param name="removeSpace">
/// When true the break character is dropped; when false it is kept at the start of the next line.
/// </param>
/// <returns>The lines in order (produced on enumeration).</returns>
/// <exception cref="ArgumentOutOfRangeException">The limit is smaller than 1 (raised on first enumeration).</exception>
static IEnumerable<string> SplitToLines(string stringToSplit, int maximumLineLength, bool removeSpace = true)
{
    if (maximumLineLength < 1)
    {
        throw new ArgumentOutOfRangeException(nameof(maximumLineLength));
    }

    int start = 0;      // index where the current line begins
    int lastBreak = -1; // most recent break candidate inside the current line, -1 when none

    // i == Length acts as a virtual break so that the tail is length-checked as well.
    for (int i = 0; i <= stringToSplit.Length; i++)
    {
        bool atEnd = i == stringToSplit.Length;
        if (!atEnd && stringToSplit[i] != ' ' && stringToSplit[i] != '\n')
        {
            continue;
        }

        // The text in [start, i) must fit on one line; emit lines until it does.
        while (i - start > maximumLineLength)
        {
            if (lastBreak > start)
            {
                yield return stringToSplit.Substring(start, lastBreak - start);
                start = removeSpace ? lastBreak + 1 : lastBreak; // + 1 to remove the space on the next line
                lastBreak = -1;
            }
            else
            {
                // No break candidate: a single word exceeds the limit and has to be cut.
                yield return stringToSplit.Substring(start, maximumLineLength);
                start += maximumLineLength;
            }
        }

        if (!atEnd)
        {
            lastBreak = i;
        }
    }

    // Remember the last line. "start == 0" keeps the single (possibly empty) line
    // for input that never needed a break.
    if (start < stringToSplit.Length || start == 0)
    {
        yield return stringToSplit.Substring(start);
    }
}
Here is the example code used to test speeds (again, run on your own machine and test in Release mode to get accurate timings)
https://dotnetfiddle.net/h5I1GC
Timings on my machine in release mode .Net 4.8
Accepted Answer: 667ms
Regex: 368ms
My Version: 117ms
My requirement was to have a line break at the last space before the 30 char limit.
So here is how i did it. Hope this helps anyone looking.
/// <summary>
/// Inserts one line break at the last space that lies before the
/// <paramref name="maxLength"/>-character limit.
/// </summary>
/// <param name="input">Text to break.</param>
/// <param name="maxLength">Limit for the first line; defaults to 30. Must be at least 1.</param>
/// <returns>
/// The text before the space, a newline, and the trimmed remainder. The input is returned
/// unchanged when it already fits or has no space before the limit.
/// </returns>
/// <exception cref="System.ArgumentOutOfRangeException">The limit is smaller than 1.</exception>
private string LineBreakLongString(string input, int maxLength = 30)
{
    if (maxLength < 1)
    {
        throw new System.ArgumentOutOfRangeException(nameof(maxLength));
    }

    // Short (or missing) text needs no break; scanning it used to throw.
    if (string.IsNullOrEmpty(input) || input.Length <= maxLength)
    {
        return input;
    }

    // Search backwards from the last index that is still inside the limit.
    var breakAt = input.LastIndexOf(' ', maxLength - 1);
    if (breakAt <= 0)
    {
        // No space to break at (or only a leading one): leave the text as it is.
        return input;
    }

    return input.Substring(0, breakAt) + System.Environment.NewLine + input.Substring(breakAt).Trim();
}
An approach using recursive method and ReadOnlySpan (Tested)
/// <summary>
/// Appends the space-separated words of <paramref name="stringToSplit"/> to
/// <paramref name="values"/>, one entry per word; a word longer than
/// <paramref name="index"/> characters is cut into pieces of that length.
/// </summary>
/// <param name="stringToSplit">Text to split.</param>
/// <param name="index">Maximum length of an entry; nothing is added when it is smaller than 1.</param>
/// <param name="values">List that receives the entries; created when null.</param>
public static void SplitToLines(ReadOnlySpan<char> stringToSplit, int index, ref List<string> values)
{
    if (stringToSplit.IsEmpty || index < 1) return;
    if (values == null)
    {
        values = new List<string>();
    }

    // Iterative rather than recursive: no stack growth on long input, and the loop ends
    // only when the text is used up. The old check "remaining length <= index" stopped
    // early and silently dropped the last words.
    var rest = stringToSplit;
    while (!rest.IsEmpty)
    {
        var space = rest.IndexOf(' ');
        var word = space < 0 ? rest : rest.Slice(0, space);
        if (word.Length <= index)
        {
            values.Add(word.ToString());
            if (space < 0) return; // that was the last word
            rest = rest.Slice(space + 1);
        }
        else
        {
            // Over-long word: take one full-width piece and continue with the remainder.
            values.Add(word.Slice(0, index).ToString());
            rest = rest.Slice(index);
        }
    }
}

C# split comma separated values

How can I split comma separated strings with quoted strings that can also contain commas?
Example input:
John, Doe, "Sid, Nency", Smith
Expected output:
John
Doe
Sid, Nency
Smith
Split by commas was ok, but I've got requirement that strings like "Sid, Nency" are allowed. I tried to use regexes to split such values. Regex ",(?=([^\"]*\"[^\"]*\")*[^\"]*$)" is from Java question and it is not working good for my .NET code. It doubles some strings, finds extra results etc.
So what is the best way to split such strings?
It's because of the capture group. Just turn it into a non-capture group:
",(?=(?:[^""]*""[^""]*"")*[^""]*$)"
^^
The capture group is including the captured part in your results.
ideone demo
// Split on every comma that is followed by an even number of quotes up to the end of
// the input, i.e. on commas outside quoted text. The group is non-capturing so that
// Split does not add it to the result. Verbatim string: "" stands for one quote.
var regexObj = new Regex(@",(?=(?:[^""]*""[^""]*"")*[^""]*$)");
// IEnumerable<string> has no ForEach method, so iterate with a plain foreach.
foreach (var field in regexObj.Split(input).Select(s => s.Trim('\"', ' ')))
{
    Console.WriteLine(field);
}
And just trim the results.
Just go through your string. As you go through your string keep track
if you're in a "block" or not. If you're - don't treat the comma as
a comma (as a separator). Otherwise do treat it as such. It's a simple
algorithm, I would write it myself. When you encounter first " you enter
a block. When you encounter next ", you end that block you were, and so on.
So you can do it with one pass through your string.
import java.util.ArrayList;
public class Test003 {
    /**
     * Splits a fixed sample string into terms in one pass and prints each term
     * between '|' characters. Outside quotes, spaces and commas both end the current
     * term; inside quotes every character belongs to the term.
     */
    public static void main(String[] args) {
        String s = " John, , , , \" Barry, John \" , , , , , Doe, \"Sid , Nency\", Smith ";
        ArrayList<String> terms = new ArrayList<String>();
        StringBuilder current = new StringBuilder();
        boolean quoted = false;

        for (char ch : s.toCharArray()) {
            boolean separator = !quoted && (ch == ' ' || ch == ',');
            if (separator) {
                // A non-empty buffer is exactly the "inside a term" state.
                if (current.length() > 0) {
                    terms.add(current.toString());
                    current.setLength(0);
                }
                continue;
            }
            if (ch == '"') {
                quoted = !quoted;
            }
            // Quote characters are kept in the term; skip the append for '"' to drop them.
            current.append(ch);
        }

        if (current.length() > 0) {
            terms.add(current.toString());
        }
        for (String t : terms) {
            System.out.println("|" + t + "|");
        }
    }
}
I use the following code within my Csv Parser class to achieve this:
/// <summary>
/// Splits one CSV line into values. Commas inside double quotes do not separate
/// values; quote characters are removed and whitespace is kept as it is.
/// </summary>
/// <param name="line">The line to parse.</param>
/// <returns>The values in order; there is always at least one (possibly empty) value.</returns>
private string[] ParseLine(string line)
{
    var values = new List<string>();
    var field = new StringBuilder(line.Length);
    var quoted = false;
    foreach (char ch in line)
    {
        if (ch == '\"')
        {
            // Quotes only toggle the state; they never become part of the value.
            quoted = !quoted;
        }
        else if (ch == ',' && !quoted)
        {
            values.Add(field.ToString());
            field.Clear();
        }
        else
        {
            field.Append(ch);
        }
    }
    // The last value has no trailing comma to flush it.
    values.Add(field.ToString());
    return values.ToArray();
}
If you find the regular expression too complex you can do it like this:
string initialString = "John, Doe, \"Sid, Nency\", Smith";
// After splitting on the quote character, even-indexed pieces lie outside quotes and are
// split further on commas; odd-indexed pieces are quoted text and are kept whole.
// Blank pieces are dropped and the rest are trimmed.
IEnumerable<string> splitted = initialString
    .Split('"')
    .SelectMany((piece, position) => position % 2 != 0 ? new[] { piece } : piece.Split(','))
    .Where(piece => !string.IsNullOrWhiteSpace(piece))
    .Select(piece => piece.Trim());

Categories

Resources