string emailBody = "sample text for NewFinancial History:\"xyz\" text NewFinancial History:\"abc\" NewEBTDI$:\"abc\" ds \"NewFinancial History:pqr\" test";
/// <summary>
/// Scans an email body for <c>NewListName:"Value"</c> tokens and groups the quoted values by list name.
/// </summary>
/// <param name="emailBody">
/// Email text to scan. It is first passed through <c>ReplaceIncompatableQuotes</c> (presumably normalising
/// typographic quotes - confirm against that helper), then every line break is collapsed to a single space.
/// </param>
/// <returns>
/// A dictionary keyed by list name (the matched key without its leading "New", trimmed) whose values are the
/// distinct, trimmed strings found between the quotes. If an exception occurs it is logged and the entries
/// collected so far are returned.
/// </returns>
private Dictionary<string, List<string>> ExtractFieldValuesForDynamicListObject(string emailBody)
{
    const string keyPrefix = "New";
    Dictionary<string, List<string>> paramValueList = new Dictionary<string, List<string>>();
    try
    {
        emailBody = ReplaceIncompatableQuotes(emailBody);
        emailBody = string.Join(" ", Regex.Split(emailBody.Trim(), @"(?:\r\n|\n|\r)"));

        // Every distinct "New<name>" that is followed by a colon; the colon itself is dropped from the key.
        var keys = Regex.Matches(emailBody, @"\bNew\B(.+?):", RegexOptions.Singleline)
            .OfType<Match>()
            .Select(m => m.Groups[0].Value.Replace(":", ""))
            .Distinct()
            .ToArray();

        foreach (string key in keys)
        {
            // A fixed group name is used because a name derived from the key can contain characters
            // (spaces, '$', ...) that are not legal in a regex group name.
            string regex = Regex.Escape(key) + ":\"(?<value>[^\"]*)\"";

            // Only the leading prefix is removed; "New" occurring later in the name is part of the name.
            string listName = key.Substring(keyPrefix.Length).Trim();

            // Keys that differ only in surrounding whitespace trim to the same list name; merge their
            // values instead of letting Dictionary.Add throw and abort the whole scan.
            List<string> valueList;
            if (!paramValueList.TryGetValue(listName, out valueList))
            {
                valueList = new List<string>();
                paramValueList.Add(listName, valueList);
            }

            foreach (Match match in Regex.Matches(emailBody, regex, RegexOptions.Singleline))
            {
                string value = match.Groups["value"].Value.Trim();
                if (!valueList.Contains(value))
                {
                    valueList.Add(value);
                }
            }
        }
    }
    catch (Exception ex)
    {
        DCULSLogger.LogError(ex);
    }
    return paramValueList;
}
My goal here is to scan through the email body and identify strings that follow the NewListName:"Value" nomenclature; this works perfectly fine using the above regex and method. Now my client has changed the nomenclature from NewListName:"Value" to "NewListName:Value". I want to grab the text between the double quotes, including the New prefix and the colon. So I need to look for an opening quote followed by the New keyword, up to the closing quote. Can anyone help me modify the above regex to scan through the email body and get the list of all values between double quotes? In the above example I want to grab \"NewFinancial History:pqr\" in my results. Any help would be appreciated.
You may use a regex that will match quote, New, some chars other than " and :, then :, and then any chars but " up to a ":
// Collects every distinct quoted token of the form "NewListName:Value" (surrounding quotes included).
// Inside a verbatim (@) string a literal double quote is written as "".
var keys = Regex.Matches(emailBody, @"""New[^"":]+:[^""]+""", RegexOptions.Singleline)
.OfType<Match>()
.Select(m => m.Value)
.Distinct()
.ToArray();
See the regex demo
Pattern details:
" - a literal double quote
New - a literal substring
[^":]+ - 1 or more characters other than " and : (the [^...] is a negated character class)
: - a literal colon
[^"]+ - 1 or more characters other than "
" - a literal double quote
Related
I am reading a text file that contains words, numbers and special characters, and I want to remove certain special characters such as: [](),'
I have this code, but it is not working!
// Reads the whole file into one string and tries to strip special characters from it with a regex.
using (var reader = new StreamReader ("C://Users//HP//Documents//result2.txt")) {
string line = reader.ReadToEnd ();
// NOTE(review): the brackets and parentheses are not escaped, so this is not one character class.
// "[^[]" parses as "any character except '['", followed by an empty group "()" and the literals "',]".
Regex rgx = new Regex ("[^[]()',]");
string res = rgx.Replace (line, "");
Message1.text = res;
}
what am I missing, thanks
Some of the characters in your Regex, specifically [ ] ( ) ^, hold special meaning in Regex and in order to use them literally they must be escaped.
Use the following properly escaped Regex:
// One character class listing every character to strip; the verbatim (@) literal keeps the regex backslashes intact.
Regex rgx = new Regex (@"[\^\[\]\(\)',]");
Note that it is necessary to use the @ verbatim string prefix, because we don't want the backslashes to be treated as string escape sequences, only as escapes for the Regex.
Alternatively, double escape the backslashes:
Regex rgx = new Regex ("[\\^\\[\\]\\(\\)',]");
But that's less readable in this case.
You could skip Regex and just maintain a list of characters you want to remove and then replace the old fashioned way:
// Characters to strip, kept as strings so string.Replace can be applied to each one directly.
string[] specialCharsToRemove = new [] { "[", "]", "(", ")", "'", "," };
using (var reader = new StreamReader ("C://Users//HP//Documents//result2.txt"))
{
    string line = reader.ReadToEnd();
    foreach(string s in specialCharsToRemove)
    {
        line = line.Replace(s, string.Empty);
    }
    // The cleaned text is accumulated in 'line'; this snippet declares no other result variable.
    Message1.text = line;
}
Ideally this would be in its own method, something like this:
/// <summary>
/// Returns <paramref name="input"/> with every occurrence of each listed string removed.
/// </summary>
/// <param name="input">Text to clean.</param>
/// <param name="specialCharactersToRemove">Strings to delete from the text, applied in array order.</param>
/// <returns>The text after all removals.</returns>
private static string RemoveCharacters(string input, string[] specialCharactersToRemove)
{
    string cleaned = input;
    for (int i = 0; i < specialCharactersToRemove.Length; i++)
    {
        cleaned = cleaned.Replace(specialCharactersToRemove[i], string.Empty);
    }
    return cleaned;
}
I made a fiddle here
Replace them one at a time with String.Replace:
using (var reader = new StreamReader ("C://Users//HP//Documents//result2.txt"))
{
    string line = reader.ReadToEnd ();
    // string.Replace is an instance method taking (oldValue, newValue); each call works on the
    // result of the previous one so that all removals accumulate in 'res'.
    string res = line.Replace("[", "");
    res = res.Replace("]", "");
    res = res.Replace("(", "");
    res = res.Replace(")", "");
    res = res.Replace("'", "");
    res = res.Replace(",", "");
    Message1.text = res;
}
I agree with avoiding regex for this, but I would not use string.Replace multiple times, either.
Consider implementing a Replace or Remove method that accepts an array of characters to replace, and scan the input string only once. For example:
// Single pass over the input: keep only the characters that are not in the removal set.
var builder = new StringBuilder();
foreach (char ch in input)
{
    if (!chars.Contains(ch))
    {
        builder.Append(ch);
    }
}
return builder.ToString();
The task:
Write a program, which counts the phrases in a text file. Any sequence of characters could be given as phrase for counting, even sequences containing separators. For instance in the text "I am a student in Sofia" the phrases "s", "stu", "a" and "I am" are found respectively 2, 1, 3 and 1 times.
I know the solution with string.IndexOf or with LINQ or with some type of algorithm like Aho-Corasick. I want to do same thing with Regex.
This is what I've done so far:
using System;
using System.Collections.Generic;
using System.IO;
using System.Text.RegularExpressions;
namespace CountThePhrasesInATextFile
{
// Console program: reads a text from "file.txt" and a list of phrases from "words.txt" (one per line),
// then prints a match count for every phrase.
class Program
{
// Entry point: loads the text and the phrases, then prints "<phrase> ----> <count>" per phrase.
static void Main(string[] args)
{
string input = ReadInput("file.txt");
// NOTE(review): strings are immutable and the result of ToLower() is discarded here,
// so 'input' keeps its original casing.
input.ToLower();
List<string> phrases = new List<string>();
using (StreamReader reader = new StreamReader("words.txt"))
{
// Read line by line until ReadLine() returns null (end of file); each phrase is stored trimmed.
string line = reader.ReadLine();
while (line != null)
{
phrases.Add(line.Trim());
line = reader.ReadLine();
}
}
foreach (string phrase in phrases)
{
// NOTE(review): the surrounding ".*" makes each match consume the rest of its line ('.' does not
// match a newline by default), so Count is at most one per line rather than one per occurrence.
// The phrase is also inserted unescaped, so regex metacharacters in it act as pattern syntax.
Regex regex = new Regex(String.Format(".*" + phrase.ToLower() + ".*"));
int mathes = regex.Matches(input).Count;
Console.WriteLine(phrase + " ----> " + mathes);
}
}
// Returns the entire content of the given file as a single string.
private static string ReadInput(string fileName)
{
string output;
using (StreamReader reader = new StreamReader(fileName))
{
output = reader.ReadToEnd();
}
return output;
}
}
}
I know my regular expression is incorrect but I don't know what to change.
The output:
Word ----> 2
S ----> 2
MissingWord ----> 0
DS ----> 2
aa ----> 0
The correct output:
Word --> 9
S --> 13
MissingWord --> 0
DS --> 2
aa --> 3
file.txt contains:
Word? We have few words: first word, second word, third word.
Some passwords: PASSWORD123, #PaSsWoRd!456, AAaA, !PASSWORD
words.txt contains:
Word
S
MissingWord
DS
aa
You need to post the file.txt contents first, otherwise it's difficult to verify if the regex is working correctly or not.
That being said, check out the Regex answer here:
Finding ALL positions of a substring in a large string in C#
and see if that helps with your code in the mean time.
edit:
So there's a simple solution, add "(?=(" and "))" to each of your phrases. This is a lookahead assertion in regex. The following code handles what you want.
foreach (string phrase in phrases) {
    // An empty phrase would produce an empty lookahead that matches at every position.
    if (phrase.Length == 0) {
        continue;
    }
    // Zero-width lookahead: the match consumes nothing, so overlapping occurrences are each counted.
    // Regex.Escape keeps phrases containing regex metacharacters (e.g. "?" or ".") literal.
    string MatchPhrase = "(?=(" + Regex.Escape(phrase.ToLower()) + "))";
    int mathes = Regex.Matches(input, MatchPhrase).Count;
    Console.WriteLine(phrase + " ----> " + mathes);
}
You also had an issue with
input.ToLower();
which should be instead
input = input.ToLower();
as strings in c# are immutable. In total, your code should be:
/// <summary>
/// Reads the text from "file.txt" and the phrases from "words.txt" (one per line), then prints, for each
/// phrase, how many times it occurs in the lower-cased text, counting overlapping occurrences.
/// </summary>
static void Main(string[] args) {
    string input = ReadInput("file.txt");
    input = input.ToLower();
    List<string> phrases = new List<string>();
    using (StreamReader reader = new StreamReader("words.txt")) {
        string line = reader.ReadLine();
        while (line != null) {
            phrases.Add(line.Trim());
            line = reader.ReadLine();
        }
    }
    foreach (string phrase in phrases) {
        // An empty phrase would produce an empty lookahead that matches at every position.
        if (phrase.Length == 0) {
            continue;
        }
        // Zero-width lookahead so overlapping occurrences are each counted; Regex.Escape keeps
        // phrases containing regex metacharacters (e.g. "?" or ".") literal.
        string MatchPhrase = "(?=(" + Regex.Escape(phrase.ToLower()) + "))";
        int mathes = Regex.Matches(input, MatchPhrase).Count;
        Console.WriteLine(phrase + " ----> " + mathes);
    }
    // Fully qualified so no extra using directive is required; presumably here only to keep the
    // console window open long enough to read the output.
    System.Threading.Thread.Sleep(50000);
}
/// <summary>
/// Reads the complete content of a text file.
/// </summary>
/// <param name="fileName">Path of the file to read.</param>
/// <returns>Everything in the file, as one string.</returns>
private static string ReadInput(string fileName) {
    using (StreamReader source = new StreamReader(fileName)) {
        return source.ReadToEnd();
    }
}
here is what happened. I am going to use Word as example.
The regex you built for "word" is ".*word.*". It tells the regex engine to match any characters, followed by "word", followed by any characters.
for your input, it matched
Word? We have few words: first word, second word, third word.
which starts with "Word? We have few words: first" and ends with ", second word, third word."
then second line starts with "Some pass" contains "word" and ends with ": PASSWORD123, #PaSsWoRd!456, AAaA, !PASSWORD"
so the count is 2
the regex you want is simple, string "word" is sufficient.
Update:
for ignore case pattern try "(?i)word"
And for the multiple matches within AAaA, try "(?i)(?<=a)a"
?<= is a Zero-width positive lookbehind assertion
Try this code:
string input = File.ReadAllText("file.txt");
foreach (string word in File.ReadLines("words.txt"))
{
    // An empty pattern matches at every position, including the very end of the input, which would
    // push startat past the input length and make Regex.Match throw.
    if (word.Length == 0)
    {
        continue;
    }
    // Escape the word so regex metacharacters in it (e.g. "?" or ".") are matched literally.
    var regex = new Regex(Regex.Escape(word), RegexOptions.IgnoreCase);
    int startat = 0;
    int count = 0;
    Match match = regex.Match(input, startat);
    while (match.Success)
    {
        count++;
        // Resume one character after the start of this hit so overlapping occurrences are found.
        startat = match.Index + 1;
        match = regex.Match(input, startat);
    }
    Console.WriteLine(word + "\t" + count);
}
To correctly find all substrings like "aa", had to use the overload Match method with startat parameter.
Note the RegexOptions.IgnoreCase parameter.
A shorter but less clear code:
Match match;
// Assign-and-test in the loop condition: each search starts one character after the start of the
// previous hit, so overlapping occurrences are counted too.
while ((match = regex.Match(input, startat)).Success)
{
count++;
startat = match.Index + 1;
}
I am trying to get the value after New : in double quote.
I can retrieve the value fine when there is no space in ListName. But if I put space between the list name (eg. NewFinancial History:\"xyz\"), it throws the error below:
parsing "NewFinancial History:"(?[^"]*)"" - Invalid group name: Group names must begin with a word character.
it throws error at below line
var matches = Regex.Matches(contents, regex, RegexOptions.Singleline);
Below is my code.
string contents = " testing NewFinancial History:\"xyz\" ";
// Collect every distinct "New<name>" that precedes a colon (the colon is removed from the key).
var keys = Regex.Matches(contents, @"New(.+?):", RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace).OfType<Match>().Select(m => m.Groups[0].Value.Trim().Replace(":", "")).Distinct().ToArray();
foreach (string key in keys)
{
    List<string> valueList = new List<string>();
    string listNameKey = key;
    // NOTE(review): the key is reused verbatim as the capture-group name. Group names must consist of
    // word characters, so a key containing a space (e.g. "NewFinancial History") makes the pattern invalid.
    string regex = "" + listNameKey + ":" + "\"(?<" + listNameKey + ">[^\"]*)\"";
    var matches = Regex.Matches(contents, regex, RegexOptions.Singleline);
    foreach (Match match in matches)
    {
        if (match.Success)
        {
            string value = match.Groups[key].Value;
            valueList.Add(value);
        }
    }
}
I don't see why you also use the "key" as name of the group.
The problem you have is that the group name
could not contain spaces, but you could simply create an anonymous group.
string contents = " testing NewFinancial History:\"xyz\" ";
// Collect every distinct "New<name>" that precedes a colon (the colon is removed from the key).
var keys = Regex.Matches(contents, @"New(.+?):", RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace).OfType<Match>().Select(m => m.Groups[0].Value.Trim().Replace(":", "")).Distinct().ToArray();
foreach (string key in keys)
{
    List<string> valueList = new List<string>();
    string listNameKey = key;
    // Escape the key so characters such as '$' or '.' in a list name are matched literally.
    string regex = Regex.Escape(listNameKey) + ":" + "\"([^\"]*)\""; //create an anonymous capture group
    var matches = Regex.Matches(contents, regex, RegexOptions.Singleline);
    foreach (Match match in matches)
    {
        if (match.Success)
        {
            // Groups[0] is the entire match (key, colon and quotes); the first capture group is Groups[1].
            string value = match.Groups[1].Value;
            valueList.Add(value);
        }
    }
}
Change your foreach block to
List<string> valueList = new List<string>();
string listNameKey = key;
// Group names may only contain word characters, so drop everything else (spaces, '$', ...) from the name.
string groupName = Regex.Replace(listNameKey, @"\W", "");
// Escape the key itself so that characters such as '$' or '.' are matched literally.
string regex = Regex.Escape(listNameKey) + ":" + "\"(?<" + groupName + ">[^\"]*)\"";
var matches = Regex.Matches(contents, regex, RegexOptions.Singleline);
foreach (Match match in matches)
{
    if (match.Success)
    {
        string value = match.Groups[groupName].Value; // same cleaned name as in the pattern
        valueList.Add(value);
    }
}
The point is that group names cannot have whitespace, so you need to replace them with empty strings in places where you declare the capture group name.
See IDEONE demo
Note that your New(.+?): regex has no whitespace to ignore, I recommend deleting RegexOptions.IgnorePatternWhitespace flag. You can replace it with a more efficient New([^:]+):.
I would appreciate help with non-working regex (does not work for special symbols % or $)
/// <summary>
/// Finds every span of <paramref name="str"/> that starts with <paramref name="beginMark"/> and ends at the
/// nearest following <paramref name="endMark"/>.
/// </summary>
/// <param name="str">Text to search.</param>
/// <param name="beginMark">Opening marker; inserted into the regex pattern as-is.</param>
/// <param name="endMark">Closing marker; inserted into the regex pattern as-is.</param>
/// <returns>
/// One tuple per match: Item1 is the whole matched text including both markers, Item2 is the text between them.
/// </returns>
public System.Collections.Generic.List<System.Tuple<string, string>> GetParts(string str, string beginMark, string endMark)
{
    // NOTE(review): the markers are concatenated into the pattern unchanged, so regex metacharacters in
    // them are interpreted as pattern syntax; wrap them in Regex.Escape if they are meant literally.
    var pattern =
        new Regex(beginMark + @"(?<val>.*?)" + endMark,
            RegexOptions.Compiled |
            RegexOptions.Singleline);
    return (from Match match in pattern.Matches(str)
            where match.Success
            select System.Tuple.Create(
                match.Value,
                match.Groups["val"].Value))
        .ToList();
}
Calling method:
string input = @"%sometext%\another text";
string replacedValue = "AAA";
var occurrences = GetParts(input, @"(%", ")");
foreach (var occurrence in occurrences)
{
    // Item1 is the full match including the markers, Item2 the captured text between them.
    Console.WriteLine(occurrence.Item1 + Environment.NewLine);
    Console.WriteLine(occurrence.Item2 + Environment.NewLine);
    // replace
    Console.WriteLine(input.Replace(occurrence.Item1, replacedValue) + Environment.NewLine);
}
Expected Output:
%sometext%
sometext
AAA\another text
You need to escape your symbols. Try changing
new Regex(beginMark + @"(?<val>.*?)" + endMark,
to
new Regex(Regex.Escape(beginMark) + @"(?<val>.*?)" + Regex.Escape(endMark),
I am currently trying to do a Regex Replace on a JSON string that looks like:
String input = "{\"`####`Answer_Options11\": \"monkey22\",\"`####`Answer_Options\": \"monkey\",\"Answer_Options2\": \"not a monkey\"}";
a
The goal is to find and replace all the value fields whose key field starts with `####`
I currently have this:
// Matches a key starting with `####`, then any-char ':' any-char, then a quoted value followed by a comma.
static Regex _FieldRegex = new Regex(@"`####`\w+" + ".:.\"(.*)\",");
/// <summary>
/// Lower-cases <paramref name="input"/>, finds the `####`-prefixed fields and replaces the matched text with "CAKE".
/// </summary>
/// <param name="input">JSON-like text to process.</param>
/// <returns>The lower-cased input after the last replacement, or an empty string when nothing matched.</returns>
static public string MatchKey(string input)
{
    MatchCollection match = _FieldRegex.Matches(input.ToLower());
    string match2 = "";
    foreach (Match k in match )
    {
        foreach (Capture cap in k.Captures)
        {
            Console.WriteLine("" + cap.Value);
            // NOTE(review): cap.Value is the text of the whole match (not of group 1), and it is passed
            // to Regex.Replace as a pattern, so the entire matched span is what gets replaced.
            match2 = Regex.Replace(input.ToLower(), cap.Value.ToString(), @"CAKE");
        }
    }
    return match2.ToString();
}
Now this isn't working. Naturally I guess since it picks up the entire `####`Answer_Options11\": \"monkey22\",\"`####`Answer_Options\": \"monkey\", as a match and replaces it. I want to just replace the match.Group[1] like you would for a single match on the string.
At the end of the day the JSON string needs to look something like this:
String input = "{\"`####`Answer_Options11\": \"CATS AND CAKE\",\"`####`Answer_Options\": \"CAKE WAS A LIE\",\"Answer_Options2\": \"not a monkey\"}";
Any idea how to do this?
you want a positive lookahead and a positive lookbehind :
(?<=####.+?:).*?(?=,)
the lookaheads and lookbehinds will verify that it matches those patterns, but not include them in the match. This site explains the concept pretty well.
Generated code from RegexHero.com :
// The lookbehind and lookahead keep the key and the trailing comma out of the match,
// leaving only the text between the colon and the next comma.
string strRegex = @"(?<=####.+?:).*?(?=,)";
Regex myRegex = new Regex(strRegex);
string strTargetString = @" ""{\""`####`Answer_Options11\"": \""monkey22\"",\""`####`Answer_Options\"": \""monkey\"",\""Answer_Options2\"": \""not a monkey\""}""";
foreach (Match myMatch in myRegex.Matches(strTargetString))
{
    if (myMatch.Success)
    {
        // Add your code here
    }
}
this will match "monkey22" and "monkey" but not "not a monkey"
Working from @Jonesy's answer, I arrived at the following, which works for what I wanted. It includes the .Replace on the groups that I required. The lookahead and lookbehind assertions were very interesting, but I needed to replace some of those values, hence the groups.
/// <summary>
/// Rewrites every field whose key starts with "__u__": the key prefix becomes "__e__", the quoted value is
/// replaced by a placeholder ("c1", "c2", ...), and the encrypted original values are appended to the result
/// as "c1{...},c2{...},".
/// </summary>
/// <param name="input">JSON text to process.</param>
/// <returns>The rewritten text followed by the placeholder-to-ciphertext entries.</returns>
public static string MatchKey(string input)
{
    // Group 1: the "__u__" prefix; group 2: rest of the key up to and including the colon and whitespace;
    // group 3: the quoted value; group 4: trailing ',' or '}' if present.
    // The value group stops at the first unescaped quote so that several fields on the same line are
    // matched one by one instead of being swallowed by a single greedy match.
    string strRegex = @"(__u__)(.+?:\s*)""((?:[^""\\]|\\.)*)""(,|})*";
    Regex myRegex = new Regex(strRegex, RegexOptions.IgnoreCase | RegexOptions.Multiline);
    IQS_Encryption.Encryption enc = new Encryption();
    // NOTE(review): hard-coded encryption key in source; it should come from secure configuration.
    byte[] keyBytes = Encoding.UTF8.GetBytes("12345678912365478912365478965412");
    int count = 1;
    StringBuilder addedJson = new StringBuilder();
    int matchCount = 0;
    // Matches are computed against the original string; reassigning 'input' below does not affect them.
    foreach (Match myMatch in myRegex.Matches(input))
    {
        if (myMatch.Success)
        {
            //Console.WriteLine("REGEX MYMATCH: " + myMatch.Value);
            input = input.Replace(myMatch.Value, "__e__" + myMatch.Groups[2].Value + "\"c" + count + "\"" + myMatch.Groups[4].Value);
            addedJson.Append("c" + count + "{" + enc.EncryptString(myMatch.Groups[3].Value, keyBytes) + "},");
        }
        count++;
        matchCount++;
    }
    Console.WriteLine("MAC" + matchCount);
    return input + addedJson.ToString();
}
Thanks again to @Jonesy for the huge help.