Is there a StartWith method for arrays in .NET? Or something similar to it in LINQ?
// Sample inputs: every element is its own, properly quoted string literal.
var arr1 = new[] { "A", "B", "C" };
var arr2 = new[] { "A", "B", "C", "D" };
var arr3 = new[] { "A", "B", "CD" };
var arr4 = new[] { "E", "A", "B", "C" };

// Desired results of the hypothetical StartWith method.
arr2.StartWith(arr1); // true
arr1.StartWith(arr2); // false
arr3.StartWith(arr1); // false
arr4.StartWith(arr1); // false
Or should I just do it the straightforward way:
/// <summary>
/// Determines whether <paramref name="arr1"/> begins with every element of
/// <paramref name="arr2"/>, in the same order.
/// </summary>
/// <param name="arr1">The array that is inspected.</param>
/// <param name="arr2">The candidate prefix.</param>
/// <returns><c>true</c> when <paramref name="arr2"/> is a prefix of <paramref name="arr1"/>.</returns>
bool StartWith(string[] arr1, string[] arr2)
{
    // A shorter array can never start with a longer one.
    if (arr1.Length < arr2.Length)
    {
        return false;
    }

    for (var i = 0; i < arr2.Length; i++)
    {
        if (arr2[i] != arr1[i])
        {
            return false;
        }
    }

    return true;
}
I'm looking for the most efficient way to do that.
bool answer = arr2.Take(arr1.Length).SequenceEqual(arr1);
Your "straightforward" way is the way most LINQ methods would do it anyway. There are a few tweaks you could make. For example, make it an extension method and use a comparer for the element comparison so that custom comparers can be supplied.
/// <summary>
/// Prefix checks for arrays.
/// </summary>
public static class ExtensionMethods
{
    /// <summary>
    /// Determines whether <paramref name="arr1"/> starts with the elements of
    /// <paramref name="arr2"/>, using the default equality comparer for <typeparamref name="T"/>.
    /// </summary>
    public static bool StartWith<T>(this T[] arr1, T[] arr2)
    {
        return StartWith(arr1, arr2, EqualityComparer<T>.Default);
    }

    /// <summary>
    /// Determines whether <paramref name="arr1"/> starts with the elements of
    /// <paramref name="arr2"/>, comparing elements with <paramref name="comparer"/>.
    /// </summary>
    /// <param name="arr1">The array that is inspected.</param>
    /// <param name="arr2">The candidate prefix.</param>
    /// <param name="comparer">The comparer used for each element pair.</param>
    /// <returns><c>true</c> when <paramref name="arr2"/> is a prefix of <paramref name="arr1"/>.</returns>
    /// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
    public static bool StartWith<T>(this T[] arr1, T[] arr2, IEqualityComparer<T> comparer)
    {
        if (arr1 == null) throw new ArgumentNullException(nameof(arr1));
        if (arr2 == null) throw new ArgumentNullException(nameof(arr2));
        if (comparer == null) throw new ArgumentNullException(nameof(comparer));

        // A shorter array can never start with a longer one.
        if (arr1.Length < arr2.Length)
        {
            return false;
        }

        for (var i = 0; i < arr2.Length; i++)
        {
            if (!comparer.Equals(arr2[i], arr1[i]))
            {
                return false;
            }
        }

        return true;
    }
}
UPDATE: For fun I decided to take the time and write a little more "advanced" version that would work with any IEnumerable<T> and not just arrays.
/// <summary>
/// Prefix checks that work on any <see cref="IEnumerable{T}"/>, not just arrays.
/// </summary>
public static class ExtensionMethods
{
    /// <summary>
    /// Determines whether the sequence starts with <paramref name="startsWith"/>,
    /// using the default equality comparer for <typeparamref name="T"/>.
    /// </summary>
    public static bool StartsWith<T>(this IEnumerable<T> @this, IEnumerable<T> startsWith)
    {
        return StartsWith(@this, startsWith, EqualityComparer<T>.Default);
    }

    /// <summary>
    /// Determines whether the sequence starts with <paramref name="startsWith"/>,
    /// comparing elements with <paramref name="comparer"/>.
    /// </summary>
    /// <param name="this">The sequence that is inspected.</param>
    /// <param name="startsWith">The candidate prefix.</param>
    /// <param name="comparer">The comparer used for each element pair.</param>
    /// <returns><c>true</c> when every element of <paramref name="startsWith"/> matches the
    /// element at the same position of the inspected sequence.</returns>
    /// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
    public static bool StartsWith<T>(this IEnumerable<T> @this, IEnumerable<T> startsWith, IEqualityComparer<T> comparer)
    {
        if (@this == null) throw new ArgumentNullException(nameof(@this));
        if (startsWith == null) throw new ArgumentNullException(nameof(startsWith));
        if (comparer == null) throw new ArgumentNullException(nameof(comparer));

        // Check to see if both types implement ICollection<T> to get a free Count check.
        var thisCollection = @this as ICollection<T>;
        var startsWithCollection = startsWith as ICollection<T>;
        if (thisCollection != null && startsWithCollection != null && (thisCollection.Count < startsWithCollection.Count))
        {
            return false;
        }

        using (var thisEnumerator = @this.GetEnumerator())
        using (var startsWithEnumerator = startsWith.GetEnumerator())
        {
            // Keep looping till the startsWithEnumerator runs out of items.
            while (startsWithEnumerator.MoveNext())
            {
                // The inspected sequence ran out of items before the prefix did.
                if (!thisEnumerator.MoveNext())
                {
                    return false;
                }

                if (!comparer.Equals(thisEnumerator.Current, startsWithEnumerator.Current))
                {
                    return false;
                }
            }
        }

        return true;
    }
}
You can do:
var result = arr2.Take(arr1.Length).SequenceEqual(arr1);
To optimize it further you can add the check arr2.Length >= arr1.Length in the start like:
var result = arr2.Length >= arr1.Length && arr2.Take(arr1.Length).SequenceEqual(arr1);
The end result would be the same.
Try Enumerable.SequenceEqual(a1, a2) but trim your first array, i.e.,
var arr1 = new[] { "A", "B", "C" };
var arr2 = new[] { "A", "B", "C", "D" };
// Compare arr1 against only the first arr1.Length elements of arr2.
if (Enumerable.SequenceEqual(arr1, arr2.Take(arr1.Length)))
{
    // arr2 starts with arr1
}
You don't want to require everything to be an array, and you don't want to call Count() on an IEnumerable<T> that may be a large query, when you only really want to sniff at the first four items or whatever.
/// <summary>
/// Prefix checks for arbitrary sequences that never call <c>Count()</c> and only
/// read as many items as the prefix candidate contains.
/// </summary>
public static class Extensions
{
    /// <summary>Prints a few sample calls to the console.</summary>
    public static void Test()
    {
        var a = new[] { "a", "b" };
        var b = new[] { "a", "b", "c" };
        var c = new[] { "a", "b", "c", "d" };
        var d = new[] { "x", "y" };
        Console.WriteLine("b.StartsWith(a): {0}", b.StartsWith(a));
        Console.WriteLine("b.StartsWith(c): {0}", b.StartsWith(c));
        Console.WriteLine("b.StartsWith(d, x => x.Length): {0}",
            b.StartsWith(d, x => x.Length));
    }

    /// <summary>
    /// Determines whether <paramref name="sequence"/> starts with the items of
    /// <paramref name="prefixCandidate"/>.
    /// </summary>
    /// <param name="sequence">The sequence that is inspected.</param>
    /// <param name="prefixCandidate">The candidate prefix.</param>
    /// <param name="compare">Optional equality test; defaults to <see cref="Object.Equals(object, object)"/>.</param>
    /// <returns><c>true</c> when every item of the prefix matches; an empty prefix always matches.</returns>
    /// <exception cref="ArgumentNullException">A sequence argument is <c>null</c>.</exception>
    public static bool StartsWith<T>(
        this IEnumerable<T> sequence,
        IEnumerable<T> prefixCandidate,
        Func<T, T, bool> compare = null)
    {
        if (sequence == null) throw new ArgumentNullException(nameof(sequence));
        if (prefixCandidate == null) throw new ArgumentNullException(nameof(prefixCandidate));

        if (compare == null)
        {
            compare = (x, y) => Object.Equals(x, y);
        }

        using (var eseq = sequence.GetEnumerator())
        using (var eprefix = prefixCandidate.GetEnumerator())
        {
            // Drive the loop from the prefix so Current is only read after a
            // successful MoveNext(); this also makes empty inputs well defined.
            while (eprefix.MoveNext())
            {
                // The sequence ended before the prefix did.
                if (!eseq.MoveNext())
                {
                    return false;
                }

                if (!compare(eseq.Current, eprefix.Current))
                {
                    return false;
                }
            }

            return true;
        }
    }

    /// <summary>
    /// Determines whether <paramref name="sequence"/> starts with the items of
    /// <paramref name="prefixCandidate"/>, comparing the values that
    /// <paramref name="selector"/> projects from each item.
    /// </summary>
    /// <param name="sequence">The sequence that is inspected.</param>
    /// <param name="prefixCandidate">The candidate prefix.</param>
    /// <param name="selector">Projects the value that is compared with <see cref="Object.Equals(object, object)"/>.</param>
    /// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
    public static bool StartsWith<T, TProperty>(
        this IEnumerable<T> sequence,
        IEnumerable<T> prefixCandidate,
        Func<T, TProperty> selector)
    {
        if (selector == null) throw new ArgumentNullException(nameof(selector));

        // Reuse the comparing overload so both share the same empty-input handling.
        return StartsWith<T>(
            sequence,
            prefixCandidate,
            (x, y) => Object.Equals(selector(x), selector(y)));
    }
}
Here are some different ways of doing that. I didn't optimize or fully validate everything, so there is room for improvement everywhere, but this should give you some idea.
The best performance will always come from going low level: if you grab the iterators and go step by step, you can get much faster results.
Methods and performance results:
StartsWith1 00:00:01.9014586
StartsWith2 00:00:02.1227468
StartsWith3 00:00:03.2222109
StartsWith4 00:00:05.5544177
Test method:
var watch = new Stopwatch();
watch.Start();
for (int i = 0; i < 10000000; i++)
{
bool test = action(arr2, arr1);
}
watch.Stop();
return watch.Elapsed;
Methods:
/// <summary>
/// Four alternative "starts with" implementations used for the benchmark.
/// Every method answers: does <c>source</c> begin with the items of the second argument?
/// </summary>
public static class IEnumerableExtender
{
    /// <summary>Manual enumerator walk after a length check.</summary>
    public static bool StartsWith1<T>(this IEnumerable<T> source, IEnumerable<T> compare)
    {
        if (source.Count() < compare.Count())
        {
            return false;
        }

        // The default comparer copes with null elements, unlike calling Equals on Current.
        var comparer = EqualityComparer<T>.Default;
        using (var se = source.GetEnumerator())
        using (var ce = compare.GetEnumerator())
        {
            while (ce.MoveNext() && se.MoveNext())
            {
                if (!comparer.Equals(ce.Current, se.Current))
                {
                    return false;
                }
            }
        }

        return true;
    }

    /// <summary>LINQ one-liner: trim <paramref name="source"/> to the prefix length and compare.</summary>
    public static bool StartsWith2<T>(this IEnumerable<T> source, IEnumerable<T> compare) =>
        // Trim the inspected sequence, not the prefix; the reverse tests the opposite relation.
        source.Take(compare.Count()).SequenceEqual(compare);

    /// <summary>LINQ version with argument validation and an explicit length check.</summary>
    /// <exception cref="ArgumentNullException">An argument is <c>null</c>.</exception>
    public static bool StartsWith3<T>(this IEnumerable<T> source, IEnumerable<T> compare)
    {
        if (source == null)
        {
            throw new ArgumentNullException(nameof(source));
        }

        if (compare == null)
        {
            throw new ArgumentNullException(nameof(compare));
        }

        // Count the prefix once instead of once per use.
        int prefixLength = compare.Count();
        if (source.Count() < prefixLength)
        {
            return false;
        }

        return compare.SequenceEqual(source.Take(prefixLength));
    }

    /// <summary>Index-based version using the default comparer.</summary>
    public static bool StartsWith4<T>(this IEnumerable<T> arr1, IEnumerable<T> arr2)
    {
        return StartsWith4(arr1, arr2, EqualityComparer<T>.Default);
    }

    /// <summary>Index-based version using <paramref name="comparer"/> for each element pair.</summary>
    public static bool StartsWith4<T>(this IEnumerable<T> arr1, IEnumerable<T> arr2, IEqualityComparer<T> comparer)
    {
        // Hoisted so the prefix is not re-counted on every loop iteration.
        int prefixLength = arr2.Count();
        if (arr1.Count() < prefixLength) return false;

        for (var i = 0; i < prefixLength; i++)
        {
            if (!comparer.Equals(arr2.ElementAt(i), arr1.ElementAt(i))) return false;
        }

        return true;
    }
}
Related
I would like to refactor my method. I also need to know which value was found first, i.e. which element of anyOf matched. Is it possible to get that from here?
Example:
// Search terms and the text they are looked up in.
List<string> anyOf = new List<string>() { "at", "near", "by", "above" };
string source = "South Branch Raritan River near High Bridge at NJ";
/// <summary>
/// Returns the lowest index at which any of the strings in <paramref name="anyOf"/>
/// occurs in <paramref name="source"/>, or -1 when none of them occurs.
/// </summary>
/// <param name="source">The text that is searched.</param>
/// <param name="anyOf">The substrings to look for.</param>
/// <param name="stringComparisonType">How substrings are compared.</param>
public static int IndexOfAny(this string source, IEnumerable<string> anyOf, StringComparison stringComparisonType = StringComparison.CurrentCultureIgnoreCase)
{
    // DefaultIfEmpty keeps this a single pass; calling Any() and then Min()
    // would run every IndexOf twice because the query is lazy.
    return anyOf
        .Select(sub => source.IndexOf(sub, stringComparisonType))
        .Where(i => i >= 0)
        .DefaultIfEmpty(-1)
        .Min();
}
I would like to get back what is first in string. "near" or "at".
You could use:
/// <summary>
/// Finds which of the strings in <paramref name="anyOf"/> occurs earliest in
/// <paramref name="source"/>.
/// </summary>
/// <param name="source">The text that is searched.</param>
/// <param name="anyOf">The substrings to look for.</param>
/// <param name="stringComparisonType">How substrings are compared.</param>
/// <returns>
/// The index and the substring of the earliest occurrence, or <c>(-1, null)</c> when
/// none of the substrings occurs. When two substrings share the lowest index, the one
/// listed first in <paramref name="anyOf"/> is returned.
/// </returns>
public static (int index, string? firstMatch) IndexOfAny(this string source, IEnumerable<string> anyOf, StringComparison stringComparisonType = StringComparison.CurrentCultureIgnoreCase)
{
    int bestIndex = -1;
    string? bestMatch = null;

    foreach (var candidate in anyOf)
    {
        int index = source.IndexOf(candidate, stringComparisonType);

        // Keep the match with the lowest position in the text, not merely the
        // first entry of anyOf that happens to occur somewhere.
        if (index >= 0 && (bestIndex < 0 || index < bestIndex))
        {
            bestIndex = index;
            bestMatch = candidate;
        }
    }

    return (bestIndex, bestMatch);
}
I couldn't resist creating a more efficient implementation.
Working here.
Whilst this looks more complicated, it's better because,
It allocates only,
an array for the valid search terms,
an array of indices for each search term and,
an array of lengths for each search term.
The source text is enumerated only once and, if a match is found,
that loop will exit early.
Additionally, the code incorporates parameter checking, which you'll want because extension methods should be reusable.
/// <summary>
/// Sequence search helpers: find the earliest position at which any of several
/// target sequences occurs inside a source sequence.
/// </summary>
public static class Extensions
{
    /// <summary>
    /// Returns the lowest zero-based index in <paramref name="source"/> at which any of
    /// the <paramref name="targets"/> occurs, or -1 when none occurs.
    /// </summary>
    /// <remarks>
    /// The source is enumerated once and the scan stops as soon as no earlier-starting
    /// match is still possible. Each target is tracked with a failure table
    /// (Knuth-Morris-Pratt), so occurrences that overlap a failed partial match are found.
    /// </remarks>
    /// <param name="source">The sequence that is searched.</param>
    /// <param name="targets">The sequences to look for; null and empty entries are ignored.</param>
    /// <param name="comparer">Element comparer; defaults to <see cref="EqualityComparer{T}.Default"/>.</param>
    /// <exception cref="ArgumentNullException"><paramref name="source"/> or <paramref name="targets"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="targets"/> has no non-empty entry.</exception>
    public static int IndexOfAny<T>(
        this IEnumerable<T> source,
        IEnumerable<IEnumerable<T>> targets,
        IEqualityComparer<T> comparer = null)
    {
        // Parameter Handling
        ArgumentNullException.ThrowIfNull(source);
        ArgumentNullException.ThrowIfNull(targets);
        comparer = comparer ?? EqualityComparer<T>.Default;

        var clean = targets
            .Where(t => t != null)
            .Select(t => t.ToArray())
            .Where(t => t.Length > 0)
            .ToArray();
        if (clean.Length == 0)
        {
            throw new ArgumentException(
                $"'{nameof(targets)}' does not contain a valid search sequence");
        }

        // Prep
        var failure = clean.Select(t => BuildFailureTable(t, comparer)).ToArray();
        var matched = new int[clean.Length]; // current partial-match length per target
        int best = -1;                       // lowest start index found so far
        int position = 0;                    // number of source items consumed

        // Process
        foreach (var item in source)
        {
            position++;
            for (var j = 0; j < clean.Length; j++)
            {
                var pattern = clean[j];
                var k = matched[j];

                // On a mismatch fall back to the longest shorter prefix that still fits.
                while (k > 0 && !comparer.Equals(pattern[k], item))
                {
                    k = failure[j][k - 1];
                }

                if (comparer.Equals(pattern[k], item))
                {
                    k++;
                }

                if (k == pattern.Length)
                {
                    var start = position - k;
                    if (best < 0 || start < best)
                    {
                        best = start;
                    }

                    k = failure[j][k - 1];
                }

                matched[j] = k;
            }

            if (best >= 0)
            {
                // A longer target may still be in progress from an earlier start;
                // only stop once no partial match begins before the best index.
                var earlierStillPossible = false;
                for (var j = 0; j < clean.Length; j++)
                {
                    if (position - matched[j] < best)
                    {
                        earlierStillPossible = true;
                        break;
                    }
                }

                if (!earlierStillPossible)
                {
                    return best;
                }
            }
        }

        return best;
    }

    /// <summary>
    /// Returns the lowest index in <paramref name="source"/> at which any of the
    /// <paramref name="targets"/> occurs, or -1 when none occurs.
    /// </summary>
    /// <remarks>Characters are compared one at a time through <see cref="CharComparerAdapter"/>.</remarks>
    /// <param name="source">The text that is searched.</param>
    /// <param name="targets">The substrings to look for; null and empty entries are ignored.</param>
    /// <param name="comparer">String comparer; defaults to <see cref="StringComparer.Ordinal"/>.</param>
    /// <exception cref="ArgumentNullException"><paramref name="source"/> or <paramref name="targets"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="targets"/> has no non-empty entry.</exception>
    public static int IndexOfAny(
        this string source,
        IEnumerable<string> targets,
        StringComparer comparer = null)
    {
        ArgumentNullException.ThrowIfNull(source);
        ArgumentNullException.ThrowIfNull(targets);
        comparer = comparer ?? StringComparer.Ordinal;

        // Strings already are character sequences, so no intermediate char[] copies are needed.
        return IndexOfAny<char>(
            source,
            targets.Where(t => t != null).Select(t => (IEnumerable<char>)t),
            new CharComparerAdapter(comparer));
    }

    // Builds the KMP failure table: entry i is the length of the longest proper prefix
    // of pattern[0..i] that is also a suffix of it.
    private static int[] BuildFailureTable<T>(T[] pattern, IEqualityComparer<T> comparer)
    {
        var table = new int[pattern.Length];
        var k = 0;
        for (var i = 1; i < pattern.Length; i++)
        {
            while (k > 0 && !comparer.Equals(pattern[k], pattern[i]))
            {
                k = table[k - 1];
            }

            if (comparer.Equals(pattern[k], pattern[i]))
            {
                k++;
            }

            table[i] = k;
        }

        return table;
    }
}

/// <summary>
/// Adapts a <see cref="StringComparer"/> so it can compare single characters.
/// </summary>
public class CharComparerAdapter : IEqualityComparer<char>
{
    private StringComparer Comparer { get; }

    /// <summary>Creates an adapter around <paramref name="comparer"/>.</summary>
    /// <exception cref="ArgumentNullException"><paramref name="comparer"/> is null.</exception>
    public CharComparerAdapter(StringComparer comparer)
    {
        ArgumentNullException.ThrowIfNull(comparer);
        Comparer = comparer;
    }

    /// <summary>Compares two characters as one-character strings.</summary>
    public bool Equals(char left, char right)
    {
        return Comparer.Equals(left.ToString(), right.ToString());
    }

    /// <summary>Returns a hash code that agrees with <see cref="Equals(char, char)"/>.</summary>
    public int GetHashCode(char v)
    {
        // Hash through the wrapped comparer so characters it treats as equal
        // (for example 'a' and 'A' when ignoring case) get the same hash code.
        return Comparer.GetHashCode(v.ToString());
    }
}
I wanted to build a fluent api to iterate on an array where I filter values and continue processing the remaining (not the filtered ones) values. Something like this pseudo-code:
int[] input = { 5, 4, 1, 3, 9, 8, 6, 7, 2, 0 };
from a in Take(3) // a = {5,4,1}
from b in Skip(4) // b = null
from c in TakeWhile(x=> x != 0) // c = {7, 2}
select new Stuff(a, b, c)
I don't know where to start looking, what are the basis for something like this. So I wanted to ask for some help.
The system should not be restricted to int numbers.. another example:
string[] input = { "how", "are", "you", "doing", "?" };
from a in OneOf("how", "what", "where") // a = "how"
from b in Match("are") // b = "are"
from c in TakeWhile(x=> x != "?") // c = { "you", "doing" }
select new Stuff(a, b, c)
The following code will allow you to do input.FirstTake(3).ThenSkip(4).ThenTakeWhile(x => x != 0); to get the sequence 5, 4, 1, 7, 2. The main idea is that you need to keep track of the takes and skips you want to do so they can be applied when you iterate. This is similar to how OrderBy and ThenBy work. Note that you cannot do other LINQ operations in between. This builds up one enumeration of consecutive skips and takes, and that sequence is then fed through any LINQ operations you tack on.
/// <summary>
/// A sequence to which further consecutive take/skip steps can be appended.
/// </summary>
public interface ITakeAndSkip<out T> : IEnumerable<T>
{
    /// <summary>Appends a step that skips the next <paramref name="number"/> items.</summary>
    ITakeAndSkip<T> ThenSkip(int number);

    /// <summary>Appends a step that yields the next <paramref name="number"/> items.</summary>
    ITakeAndSkip<T> ThenTake(int number);

    /// <summary>Appends a step that yields items while <paramref name="predicate"/> holds.</summary>
    ITakeAndSkip<T> ThenTakeWhile(Func<T, bool> predicate);

    /// <summary>Appends a step that skips items while <paramref name="predicate"/> holds.</summary>
    ITakeAndSkip<T> ThenSkipWhile(Func<T, bool> predicate);
}

/// <summary>
/// Records a list of take/skip steps and applies them, in order, in a single pass
/// over the source when the sequence is enumerated.
/// </summary>
/// <remarks>
/// The <c>Then*</c> methods add to this instance and return it, so every reference
/// to the same instance sees all steps appended so far.
/// </remarks>
public class TakeAndSkip<T> : ITakeAndSkip<T>
{
    private readonly IEnumerable<T> _source;
    private readonly List<TakeOrSkipOperation> _operations = new List<TakeOrSkipOperation>();

    // One recorded step: either count-based (Number) or predicate-based (Predicate).
    private class TakeOrSkipOperation
    {
        public bool IsSkip { get; private set; }
        public Func<T, bool> Predicate { get; private set; }
        public int Number { get; private set; }

        private TakeOrSkipOperation()
        {
        }

        public static TakeOrSkipOperation Skip(int number)
        {
            return new TakeOrSkipOperation
            {
                IsSkip = true,
                Number = number
            };
        }

        public static TakeOrSkipOperation Take(int number)
        {
            return new TakeOrSkipOperation
            {
                Number = number
            };
        }

        public static TakeOrSkipOperation SkipWhile(Func<T, bool> predicate)
        {
            return new TakeOrSkipOperation
            {
                IsSkip = true,
                Predicate = predicate
            };
        }

        public static TakeOrSkipOperation TakeWhile(Func<T, bool> predicate)
        {
            return new TakeOrSkipOperation
            {
                Predicate = predicate
            };
        }
    }

    /// <summary>Wraps <paramref name="source"/> with an initially empty list of steps.</summary>
    /// <exception cref="ArgumentNullException"><paramref name="source"/> is null.</exception>
    public TakeAndSkip(IEnumerable<T> source)
    {
        if (source == null) throw new ArgumentNullException(nameof(source));
        _source = source;
    }

    /// <summary>Enumerates the source, applying the recorded steps in order.</summary>
    public IEnumerator<T> GetEnumerator()
    {
        using (var enumerator = _source.GetEnumerator())
        {
            // move to the first item and if there are none just return
            if (!enumerator.MoveNext()) yield break;

            // Then apply all the skip and take operations
            foreach (var operation in _operations)
            {
                int n = operation.Number;

                // If we are not dealing with a while then make the predicate count
                // down the number to zero.
                var predicate = operation.Predicate ?? (x => n-- > 0);

                // Iterate the items until there are no more or the predicate is false
                bool more = true;
                while (more && predicate(enumerator.Current))
                {
                    // If this is a Take then yield the current item.
                    if (!operation.IsSkip) yield return enumerator.Current;
                    more = enumerator.MoveNext();
                }

                // If there are no more items return
                if (!more) yield break;
            }

            // Now we need to decide what to do with the rest of the items.
            // If there are no operations or the last one was a skip then
            // return the remaining items
            if (_operations.Count == 0 || _operations[_operations.Count - 1].IsSkip)
            {
                do
                {
                    yield return enumerator.Current;
                } while (enumerator.MoveNext());
            }

            // Otherwise the last operation was a take and we're done.
        }
    }

    IEnumerator IEnumerable.GetEnumerator()
    {
        return GetEnumerator();
    }

    /// <inheritdoc />
    public ITakeAndSkip<T> ThenSkip(int number)
    {
        _operations.Add(TakeOrSkipOperation.Skip(number));
        return this;
    }

    /// <inheritdoc />
    /// <exception cref="ArgumentNullException"><paramref name="predicate"/> is null.</exception>
    public ITakeAndSkip<T> ThenSkipWhile(Func<T, bool> predicate)
    {
        // A null predicate would otherwise be mistaken for a count-based step of zero items.
        if (predicate == null) throw new ArgumentNullException(nameof(predicate));
        _operations.Add(TakeOrSkipOperation.SkipWhile(predicate));
        return this;
    }

    /// <inheritdoc />
    public ITakeAndSkip<T> ThenTake(int number)
    {
        _operations.Add(TakeOrSkipOperation.Take(number));
        return this;
    }

    /// <inheritdoc />
    /// <exception cref="ArgumentNullException"><paramref name="predicate"/> is null.</exception>
    public ITakeAndSkip<T> ThenTakeWhile(Func<T, bool> predicate)
    {
        // A null predicate would otherwise be mistaken for a count-based step of zero items.
        if (predicate == null) throw new ArgumentNullException(nameof(predicate));
        _operations.Add(TakeOrSkipOperation.TakeWhile(predicate));
        return this;
    }
}

/// <summary>
/// Entry points that start a take/skip chain on any sequence.
/// </summary>
public static class TakeAndSkipExtensions
{
    /// <summary>Starts a chain that first yields <paramref name="number"/> items.</summary>
    public static ITakeAndSkip<T> FirstTake<T>(this IEnumerable<T> source, int number)
    {
        return new TakeAndSkip<T>(source).ThenTake(number);
    }

    /// <summary>Starts a chain that first skips <paramref name="number"/> items.</summary>
    public static ITakeAndSkip<T> FirstSkip<T>(this IEnumerable<T> source, int number)
    {
        return new TakeAndSkip<T>(source).ThenSkip(number);
    }

    /// <summary>Starts a chain that first yields items while <paramref name="predicate"/> holds.</summary>
    public static ITakeAndSkip<T> FirstTakeWhile<T>(this IEnumerable<T> source, Func<T, bool> predicate)
    {
        return new TakeAndSkip<T>(source).ThenTakeWhile(predicate);
    }

    /// <summary>Starts a chain that first skips items while <paramref name="predicate"/> holds.</summary>
    public static ITakeAndSkip<T> FirstSkipWhile<T>(this IEnumerable<T> source, Func<T, bool> predicate)
    {
        return new TakeAndSkip<T>(source).ThenSkipWhile(predicate);
    }
}
/// <summary>
/// Joins the two product lists on the field names taken from
/// <paramref name="DuplicateExpression"/> and passes an "any joined rows" check to
/// <c>CompareTwoLists</c>, whose result is returned.
/// </summary>
public static bool CompareLists(List<Product> lstProduct1, List<Product> lstProduct2, List<DuplicateExpression> DuplicateExpression)
{
    // Field names to join on, e.g. "ProductName", "ProductCode".
    string[] fieldNames = DuplicateExpression
        .Select(expression => expression.ExpressionName)
        .ToArray();

    // NOTE(review): Join(list, string[]) is a custom overload not shown here — confirm its semantics.
    var joined = lstProduct1.Join(lstProduct2, fieldNames);

    return CompareTwoLists(lstProduct1, lstProduct2, (left, right) => joined.Any());
}
How can I convert the above function into a generic <T> function? It is actually a list comparison function.
SequenceEqual solves your problem.
new[] { "A", "B" }.SequenceEqual(new[] { "A", "B" }).Should().BeTrue();
Here is the source code.
/// <summary>
/// Determines whether two sequences have the same length and pairwise equal elements.
/// </summary>
/// <param name="first">The first sequence.</param>
/// <param name="second">The second sequence.</param>
/// <param name="comparer">Element comparer; the default comparer is used when null.</param>
public static bool SequenceEqual<TSource>(this IEnumerable<TSource> first, IEnumerable<TSource> second, IEqualityComparer<TSource> comparer)
{
    comparer = comparer ?? EqualityComparer<TSource>.Default;
    if (first == null) throw Error.ArgumentNull("first");
    if (second == null) throw Error.ArgumentNull("second");

    using (IEnumerator<TSource> left = first.GetEnumerator())
    using (IEnumerator<TSource> right = second.GetEnumerator())
    {
        while (true)
        {
            if (!left.MoveNext())
            {
                // First is exhausted: equal only if second is exhausted too.
                return !right.MoveNext();
            }

            if (!right.MoveNext())
            {
                return false;
            }

            if (!comparer.Equals(left.Current, right.Current))
            {
                return false;
            }
        }
    }
}
In your case you could elect to replace IEnumerable<TSource> with IList<TSource> or even List<TSource> ideally the highest level of abstraction is preferred.
I got a lot of data from a database, which are results from a search function. Now I've a List<string[]> which has duplicated elements of type string[]. The string[] in the list are the search results.
I know that every newly created array is a different instance, so I can't use MyListOfArrays.Distinct().ToList().
Maybe it's a very basic question...
My question is: are there any built-in functions to remove duplicate string[] entries from the List<string[]>? Or do I have to write it myself?
Thank you
You can use distinct method with custom equalityComparer
IEnumerable<string[]> distinct = inputStringArrayList.Distinct(new EqualityComparer());
EqualityComparer
/// <summary>
/// Compares string arrays by content: two arrays are equal when they have the same
/// length and equal strings at every index.
/// </summary>
class EqualityComparer : IEqualityComparer<string[]>
{
    /// <summary>Returns true when both arrays are null or hold the same strings in the same order.</summary>
    public bool Equals(string[] x, string[] y)
    {
        if (ReferenceEquals(x, y))
        {
            return true;
        }

        if (x == null || y == null)
        {
            return false;
        }

        if (x.Length != y.Length)
        {
            return false;
        }

        for (int i = 0; i < x.Length; i++)
        {
            if (x[i] != y[i])
            {
                return false;
            }
        }

        return true;
    }

    /// <summary>Returns a hash code computed from the array's elements.</summary>
    public int GetHashCode(string[] obj)
    {
        if (obj == null)
        {
            return 0;
        }

        // The hash must come from the content. The array's own GetHashCode is
        // reference-based, so equal arrays would land in different buckets and
        // Distinct would never call Equals on them.
        int hash = 17;
        unchecked
        {
            foreach (string s in obj)
            {
                hash = hash * 23 + (s == null ? 0 : s.GetHashCode());
            }
        }

        return hash;
    }
}
Alternative Equals Method
/// <summary>
/// Returns true when both arrays are null, or both hold equal strings in the same order.
/// </summary>
public bool Equals(string[] x, string[] y)
{
    // SequenceEqual throws on a null argument, so settle the null cases first.
    if (ReferenceEquals(x, y))
    {
        return true;
    }

    if (x == null || y == null)
    {
        return false;
    }

    return x.SequenceEqual(y);
}
Here I am assuming you are having exact same string arrays with same content at same index.
Correction from Matthew Watson
/// <summary>
/// Computes a hash code from the array's elements; a null array hashes to 0 and a
/// null element contributes 0.
/// </summary>
public int GetHashCode(string[] obj)
{
    if (obj == null)
    {
        return 0;
    }

    // Overflow is expected and harmless here, so wrap instead of throwing.
    unchecked
    {
        int result = 17;
        for (int i = 0; i < obj.Length; i++)
        {
            string element = obj[i];
            int elementHash = element != null ? element.GetHashCode() : 0;
            result = result * 23 + elementHash;
        }

        return result;
    }
}
I have corrected the answer from @Muctadir Dinar.
(He deserves credit for the answer - I am just correcting it and providing a complete test program):
using System;
using System.Collections.Generic;
using System.Linq;
namespace Demo
{
/// <summary>
/// Content-based equality for string arrays, suitable for <c>Distinct</c>.
/// </summary>
sealed class EqualityComparer : IEqualityComparer<string[]>
{
    /// <summary>Two arrays are equal when both are null or both hold the same strings in order.</summary>
    public bool Equals(string[] x, string[] y)
    {
        // With a null on either side, only "both null" counts as equal.
        if (x == null || y == null)
        {
            return ReferenceEquals(x, y);
        }

        return ReferenceEquals(x, y) || x.SequenceEqual(y);
    }

    /// <summary>Combines the element hashes; null arrays and null elements contribute 0.</summary>
    public int GetHashCode(string[] obj)
    {
        if (obj == null)
        {
            return 0;
        }

        unchecked
        {
            int result = 17;
            for (int i = 0; i < obj.Length; i++)
            {
                string element = obj[i];
                int elementHash = element != null ? element.GetHashCode() : 0;
                result = result * 23 + elementHash;
            }

            return result;
        }
    }
}
/// <summary>
/// Test program: prints a list of string arrays, then prints it again after
/// removing duplicates with the content-based comparer.
/// </summary>
class Program
{
    private void run()
    {
        // The second and fourth entries have identical content.
        var arrays = new List<string[]>();
        arrays.Add(strings(1, 10));
        arrays.Add(strings(2, 10));
        arrays.Add(strings(3, 10));
        arrays.Add(strings(2, 10));
        arrays.Add(strings(4, 10));

        dump(arrays);
        Console.WriteLine();

        IEnumerable<string[]> unique = arrays.Distinct(new EqualityComparer());
        dump(unique);
    }

    // Writes each array on its own line, elements separated by commas.
    static void dump(IEnumerable<string[]> list)
    {
        foreach (string[] row in list)
        {
            string line = string.Join(",", row);
            Console.WriteLine(line);
        }
    }

    // Builds an array holding the text of 'count' consecutive integers beginning at 'start'.
    static string[] strings(int start, int count)
    {
        return Enumerable.Range(start, count)
            .Select(number => number.ToString())
            .ToArray();
    }

    static void Main(string[] args)
    {
        var program = new Program();
        program.run();
    }
}
}
A simple and not very efficient approach would be to use string.Join on the string[]:
list = list
.GroupBy(strArr => string.Join("|", strArr))
.Select(g => g.First())
.ToList();
Read the edit below for more information.
I have some code below that I use to split a generic list of Object when the item is of a certain type.
/// <summary>
/// Splits <paramref name="tokens"/> into segments. An item acts as a delimiter when it is
/// a <c>Token</c> whose <c>TokenType</c> equals <paramref name="type"/>, or a
/// <c>TokenType</c> value equal to <paramref name="type"/>. Delimiters are not included
/// in the output; the result always holds at least one (possibly empty) segment.
/// </summary>
public static IEnumerable<object>[] Split(this IEnumerable<object> tokens, TokenType type) {
    var segments = new List<List<object>> { new List<object>() };

    foreach (object item in tokens) {
        var asToken = item as Token;
        bool isDelimiter =
            (asToken != null && asToken.TokenType == type)
            || (item is TokenType && (TokenType)item == type);

        if (isDelimiter) {
            // Every delimiter opens a new segment.
            segments.Add(new List<object>());
        }
        else {
            segments[segments.Count - 1].Add(item);
        }
    }

    return segments.ToArray();
}
I don't have a clear question so much as I am curious whether anyone knows of any ways I can optimize this code. I call it many times and it seems to be quite the beast as far as clock cycles go. Any ideas? I can also make it a Wiki if anyone is interested; maybe we can keep track of the latest changes.
Update: Im trying to parse out specific tokens. Its a list of some other class and Token classes. Token has a property (enum) of TokenType. I need to find all the Token classes and split on each of them.
{a b c T d e T f g h T i j k l T m}
would split like
{a b c}{d e}{f g h}{i j k l}{m}
EDIT UPDATE:
It seems like all of my speed problems come into the constant creation and addition of Generic Lists. Does anyone know how I can go about this without that?
This is the profile of what is happening if it helps anyone.
Your code looks fine.
My only suggestion would be replacing IEnumerable<object> with the non-generic IEnumerable. (In System.Collections)
EDIT:
On further inspection, you're casting more times than necessary.
Replace the if with the following code:
var token = list as Token;
if (token != null && token.TokenType == type) {
Also, you can get rid of your currentT variable by writing t[t.Count - 1] or t.Last(). This will make the code clearer, but might have a tiny negative effect on performance.
Alternatively, you could store a reference to the inner list in a variable and use it directly. (This will slightly improve performance)
Finally, if you can change the return type to List<List<Object>>, you can return t directly; this will avoid an array copy and will be noticeably faster for large lists.
By the way, your variable names are confusing; you should swap the names of t and list.
Type-testing and casts can be a performance killer. If at all possible, your token types should implement a common interface or abstract class. Instead of passing in an object, you should pass in an IToken which wraps your object.
Here's some concept code you can use to get started:
using System;
using System.Collections.Generic;
namespace Juliet
{
    /// <summary>A token that is either a delimiter or a carrier of a data value.</summary>
    interface IToken<T>
    {
        /// <summary>True when this token separates two segments.</summary>
        bool IsDelimeter { get; }

        /// <summary>The wrapped value; only available on data tokens.</summary>
        T Data { get; }
    }

    /// <summary>Marks a segment boundary and carries no data.</summary>
    class DelimeterToken<T> : IToken<T>
    {
        public bool IsDelimeter { get { return true; } }

        /// <exception cref="InvalidOperationException">Always; a delimiter has no data.</exception>
        public T Data { get { throw new InvalidOperationException("No data"); } }
    }

    /// <summary>Wraps a single data value.</summary>
    class DataToken<T> : IToken<T>
    {
        public DataToken(T data)
        {
            this.Data = data;
        }

        public bool IsDelimeter { get { return false; } }
        public T Data { get; private set; }
    }

    /// <summary>Creates delimiter tokens (no argument) and data tokens (one argument).</summary>
    class TokenFactory<T>
    {
        public IToken<T> Make()
        {
            return new DelimeterToken<T>();
        }

        public IToken<T> Make(T data)
        {
            return new DataToken<T>(data);
        }
    }

    class Program
    {
        /// <summary>
        /// Groups the data of consecutive data tokens into lists; every delimiter
        /// token starts a new list.
        /// </summary>
        static List<List<T>> SplitTokens<T>(IEnumerable<IToken<T>> tokens)
        {
            List<List<T>> res = new List<List<T>>();
            foreach (IToken<T> token in tokens)
            {
                if (token.IsDelimeter)
                {
                    res.Add(new List<T>());
                }
                else
                {
                    // The first data token arrives before any delimiter opened a list.
                    if (res.Count == 0)
                    {
                        res.Add(new List<T>());
                    }

                    res[res.Count - 1].Add(token.Data);
                }
            }

            return res;
        }

        static void Main(string[] args)
        {
            TokenFactory<string> factory = new TokenFactory<string>();
            IToken<string>[] tokens = new IToken<string>[]
            {
                factory.Make("a"), factory.Make("b"), factory.Make("c"), factory.Make(),
                factory.Make("d"), factory.Make("e"), factory.Make(),
                factory.Make("f"), factory.Make("g"), factory.Make("h"), factory.Make(),
                factory.Make("i"), factory.Make("j"), factory.Make("k"), factory.Make("l"), factory.Make(),
                factory.Make("m")
            };

            List<List<string>> splitTokens = SplitTokens(tokens);
            for (int i = 0; i < splitTokens.Count; i++)
            {
                Console.Write("{");
                for (int j = 0; j < splitTokens[i].Count; j++)
                {
                    Console.Write("{0}, ", splitTokens[i][j]);
                }

                Console.Write("}");
            }

            Console.ReadKey(true);
        }
    }
}
In principle, you can create instances of IToken<object> to have it generalized to tokens of multiple classes.
A: An all-lazy implementation will suffice if you just iterate through the results in a nested foreach:
using System;
using System.Collections.Generic;
/// <summary>
/// Fully lazy splitting: outer and inner sequences read from one shared enumerator.
/// </summary>
public static class Splitter
{
    /// <summary>
    /// Splits <paramref name="source"/> into segments separated by items for which
    /// <paramref name="match"/> returns true. Matching items are not part of any segment.
    /// </summary>
    /// <remarks>
    /// Each segment must be enumerated to its end before the next one is requested,
    /// because all segments advance the same enumerator.
    /// </remarks>
    public static IEnumerable<IEnumerable<T>> Split<T>(this IEnumerable<T> source, Predicate<T> match)
    {
        using (IEnumerator<T> cursor = source.GetEnumerator())
        {
            // Each successful MoveNext positions the cursor on the first item of a new segment.
            for (bool hasItem = cursor.MoveNext(); hasItem; hasItem = cursor.MoveNext())
            {
                yield return ReadSegment(cursor, match);
            }
        }
    }

    // Yields items from the cursor's current position up to, but excluding, the next
    // delimiter; stops at the end of the input.
    static IEnumerable<T> ReadSegment<T>(IEnumerator<T> cursor, Predicate<T> match)
    {
        while (true)
        {
            T item = cursor.Current;
            if (match(item))
            {
                yield break;
            }

            yield return item;

            if (!cursor.MoveNext())
            {
                yield break;
            }
        }
    }
}
Use it like this:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
namespace MyTokenizer
{
    /// <summary>
    /// Demo: splits a mixed object list on every token of a chosen type and prints
    /// each segment followed by a separator line.
    /// </summary>
    class Program
    {
        enum TokenTypes { SimpleToken, UberToken }

        class Token { public TokenTypes TokenType = TokenTypes.SimpleToken; }

        class MyUberToken : Token { public MyUberToken() { TokenType = TokenTypes.UberToken; } }

        static void Main(string[] args)
        {
            var objects = new List<object>
            {
                "A", Guid.NewGuid(), "C", new MyUberToken(), "D", new MyUberToken(), "E", new MyUberToken()
            };
            var splitOn = TokenTypes.UberToken;

            // An item delimits segments when it is a Token of the chosen type.
            var segments = objects.Split(x =>
            {
                var token = x as Token;
                return token != null && token.TokenType == splitOn;
            });

            foreach (var segment in segments)
            {
                foreach (var element in segment)
                {
                    Console.WriteLine(element);
                }

                Console.WriteLine("==============");
            }

            Console.ReadKey();
        }
    }
}
B: If you need to process the results some time later and you wish to do it out-of-order, or you partition on one thread and then possibly dispatch the segments to multiple threads, then this would probably provide a good starting point:
using System;
using System.Collections.Generic;
using System.Linq;
/// <summary>
/// Splitting over a materialized copy of the input, so segments can be enumerated
/// later and in any order.
/// </summary>
public static class Splitter2
{
    /// <summary>
    /// Copies <paramref name="source"/> to an array and yields one segment per run of
    /// items between delimiters (items for which <paramref name="match"/> returns true).
    /// </summary>
    public static IEnumerable<IEnumerable<T>> SplitToSegments<T>(this IEnumerable<T> source, Predicate<T> match)
    {
        T[] buffer = source.ToArray();
        int segmentStart = 0;

        while (segmentStart < buffer.Length)
        {
            // Advance to the next delimiter or the end of the buffer.
            int delimiterAt = segmentStart;
            while (delimiterAt < buffer.Length && !match(buffer[delimiterAt]))
            {
                delimiterAt++;
            }

            yield return EnumerateArraySegment(buffer, segmentStart, delimiterAt - 1);

            // Continue after the delimiter.
            segmentStart = delimiterAt + 1;
        }
    }

    // Yields array[startIndex..endIndex], both inclusive; yields nothing when endIndex < startIndex.
    static IEnumerable<T> EnumerateArraySegment<T>(T[] array, int startIndex, int endIndex)
    {
        int index = startIndex;
        while (index <= endIndex)
        {
            yield return array[index];
            index++;
        }
    }
}
C: If you really must return a collection of List<T> -s - which I doubt, unless you explicitly want to mutate them some time later on -, then try to initialize them to a given capacity before copying:
/// <summary>
/// Copies <paramref name="source"/> to an array and returns one pre-sized list per run
/// of items between delimiters (items for which <paramref name="match"/> returns true).
/// </summary>
public static List<List<T>> SplitToLists<T>(this IEnumerable<T> source, Predicate<T> match)
{
    var result = new List<List<T>>();
    T[] buffer = source.ToArray();
    int segmentStart = 0;

    while (segmentStart < buffer.Length)
    {
        // Advance to the next delimiter or the end of the buffer.
        int delimiterAt = segmentStart;
        while (delimiterAt < buffer.Length && !match(buffer[delimiterAt]))
        {
            delimiterAt++;
        }

        // The segment length is known, so the list never has to grow.
        var segment = new List<T>(delimiterAt - segmentStart);
        for (int index = segmentStart; index < delimiterAt; index++)
        {
            segment.Add(buffer[index]);
        }

        result.Add(segment);

        // Continue after the delimiter.
        segmentStart = delimiterAt + 1;
    }

    return result;
}
D: If this is still not enough, I suggest you roll your own lightweight List implementation that can copy a range directly to its internal array from another instance.
My first thought would be instead of looking up t[currentT] all the time, just store a currentList and add directly to that.
This is the best I could do to eliminate as much of the allocation times as possible for the function (should only allocate when it goes over the capacity, which should be no more than what is required to create the largest sub list in the results). I've tested this implementation and it works as you described.
Please note that the results of the prior sub list are destroyed when the next list in the group is accessed.
/// <summary>
/// Splits <paramref name="tokens"/> into segments, reusing one buffer for every segment.
/// An item is a delimiter when it is a <c>Token</c> whose <c>TokenType</c> equals
/// <paramref name="type"/>, or a <c>TokenType</c> value equal to <paramref name="type"/>.
/// </summary>
/// <remarks>
/// Every yielded segment is the same <see cref="ArrayList"/> instance; its content is
/// cleared when the next segment is requested, so consume each segment before moving on.
/// </remarks>
public static IEnumerable<IEnumerable> Split(this IEnumerable tokens, TokenType type)
{
    ArrayList currentT = new ArrayList();
    foreach (object item in tokens)
    {
        Token token = item as Token;
        bool isDelimiter =
            (token != null && token.TokenType == type)
            || (item is TokenType && (TokenType)item == type);

        if (isDelimiter)
        {
            yield return currentT;
            currentT.Clear();
            //currentT = new ArrayList(); <-- Use this instead of 'currentT.Clear();' if you want the returned lists to be a different instance
        }
        else
        {
            currentT.Add(item);
        }
    }

    // Items after the last delimiter form the final segment; without this
    // yield they were silently dropped.
    yield return currentT;
}
EDIT
Here's another version that doesn't make use of another list at all (shouldn't be doing any allocations). Not sure how well this will compare, but it does work as requested (also I've got no idea how this one will go if you try to cache a sub 'array').
Also, both of these require a "using System.Collections" statement (in addition to the Generic namespace).
/// <summary>
/// Yields items starting at the enumerator's current position until a delimiter or the
/// end of the input is reached. An item is a delimiter when it is a <c>Token</c> whose
/// <c>TokenType</c> equals <paramref name="type"/>, or a <c>TokenType</c> value equal
/// to <paramref name="type"/>.
/// </summary>
private static IEnumerable SplitInnerLoop(IEnumerator iter, TokenType type)
{
    while (true)
    {
        object current = iter.Current;
        Token token = current as Token;

        bool hitDelimiter =
            (token != null && token.TokenType == type)
            || (current is TokenType && (TokenType)current == type);
        if (hitDelimiter)
        {
            yield break;
        }

        yield return current;

        if (!iter.MoveNext())
        {
            yield break;
        }
    }
}
/// <summary>
/// Lazily splits <paramref name="tokens"/> into segments without buffering any items.
/// </summary>
/// <remarks>
/// All segments read from one shared enumerator, so each segment must be enumerated to
/// its end before the next one is requested.
/// </remarks>
public static IEnumerable<IEnumerable> Split(this IEnumerable tokens, TokenType type)
{
    IEnumerator iter = tokens.GetEnumerator();
    try
    {
        // Each successful MoveNext positions the enumerator on the first item of a new segment.
        while (iter.MoveNext())
        {
            yield return SplitInnerLoop(iter, type);
        }
    }
    finally
    {
        // The non-generic IEnumerator is not IDisposable itself, but the concrete
        // enumerator may be; release it as foreach would.
        (iter as IDisposable)?.Dispose();
    }
}
I think that there are broken cases for these scenarios where assuming that list items are lower case letters, and the item with matching token type is T:
{ T a b c ... };
{ ... x y z T };
{ ... j k l T T m n o ... };
{ T }; and
{ }
Which will result in:
{ { } { a b c ... } };
{ { ... x y z } { } };
{ { ... j k l } { } { } { m n o ... } };
{ { } }; and
{ }
Doing a straight refactoring:
/// <summary>
/// Splits <paramref name="tokens"/> on every <c>Token</c> whose <c>TokenType</c> equals
/// <paramref name="type"/>. Delimiters are dropped; the result always contains at least
/// one (possibly empty) segment.
/// </summary>
public static IEnumerable<object>[] Split(this IEnumerable<object> tokens,
    TokenType type) {
    var outer = new List<List<object>>();
    var inner = new List<object>();
    foreach (var item in tokens) {
        // The cast target is the Token class, not a variable name.
        Token token = item as Token;
        if (token != null && token.TokenType == type) {
            outer.Add(inner);
            inner = new List<object>();
            continue;
        }
        inner.Add(item);
    }
    // Whatever follows the last delimiter is the final segment.
    outer.Add(inner);
    return outer.ToArray();
}
To fix the broken cases (assuming that those are truly broken), I recommend:
/// <summary>
/// Splits <paramref name="tokens"/> on every <c>Token</c> whose <c>TokenType</c> equals
/// <paramref name="type"/>, never producing empty segments: leading, trailing and
/// consecutive delimiters add nothing to the result.
/// </summary>
public static IEnumerable<object>[] Split(this IEnumerable<object> tokens,
    TokenType type) {
    var outer = new List<List<object>>();
    var inner = new List<object>();
    // foreach disposes the enumerator, which the manual MoveNext loop did not.
    foreach (var item in tokens) {
        Token token = item as Token;
        if (token == null || token.TokenType != type) {
            inner.Add(item);
        }
        else if (inner.Count > 0) {
            outer.Add(inner);
            inner = new List<object>();
        }
    }
    // Items after the last delimiter still form a segment; without this
    // they were lost whenever the input did not end with a delimiter.
    if (inner.Count > 0) {
        outer.Add(inner);
    }
    return outer.ToArray();
}
Which will result in:
{ { a b c ... } };
{ { ... x y z } };
{ { ... j k l } { m n o ... } };
{ }; and
{ }
Using LINQ you could try this: (I did not test it...)
/// <summary>
/// Splits <paramref name="tokens"/> on every <c>Token</c> whose <c>TokenType</c> equals
/// <paramref name="type"/>, folding the input into a list of segments with
/// <c>Aggregate</c>. The result always contains at least one (possibly empty) segment.
/// </summary>
public static IEnumerable<object>[] Split(this IEnumerable<object> tokens, TokenType type)
{
    var seed = new List<List<object>> { new List<object>() };

    return tokens.Aggregate(seed, (segments, item) =>
    {
        var token = item as Token;
        if (token != null && token.TokenType == type)
        {
            // A delimiter opens a new segment on the accumulator (not on the token).
            segments.Add(new List<object>());
        }
        else
        {
            segments[segments.Count - 1].Add(item);
        }

        // The fold must hand back the accumulator for the next step.
        return segments;
    }).ToArray();
}
Second try:
/// <summary>
/// Splits <paramref name="tokens"/> on every <c>Token</c> whose <c>TokenType</c> equals
/// <paramref name="type"/> by first locating the delimiter positions and then cutting
/// the input between them. The result always contains at least one (possibly empty) segment.
/// </summary>
public static IEnumerable<object>[] Split(this IEnumerable<object> tokens, TokenType type)
{
    // Materialize once so positions stay valid and the input is not re-enumerated per segment.
    var items = tokens.ToList();

    var delimiterIndexes = items
        .Select((item, index) => new { Token = item as Token, Index = index })
        .Where(x => x.Token != null && x.Token.TokenType == type)
        .Select(x => x.Index)
        .ToList();

    // The method returns an array, so the segments are collected instead of yielded.
    var segments = new List<IEnumerable<object>>(delimiterIndexes.Count + 1);
    int start = 0;
    foreach (int delimiterIndex in delimiterIndexes)
    {
        segments.Add(items.GetRange(start, delimiterIndex - start));
        start = delimiterIndex + 1;
    }

    // Everything after the last delimiter (or the whole input when there is none).
    segments.Add(items.GetRange(start, items.Count - start));
    return segments.ToArray();
}
Here is one possibility
The Token class ( could be what ever class )
/// <summary>
/// A named item tagged with a <see cref="TokenType"/>.
/// </summary>
public class Token
{
/// <summary>Gets or sets the token's name.</summary>
public string Name { get; set; }
/// <summary>Gets or sets the type this token is tagged with.</summary>
public TokenType TokenType { get; set; }
}
Now the Type enum ( this could be what ever other grouping factor )
/// <summary>
/// The types a <see cref="Token"/> can be tagged with.
/// </summary>
public enum TokenType
{
Type1,
Type2,
Type3,
Type4,
Type5,
}
The Extension Method (declare this any way you choose)
/// <summary>
/// Grouping helper for token sequences.
/// </summary>
public static class TokenExtension
{
    /// <summary>
    /// Groups <paramref name="tokens"/> by their <c>TokenType</c> and returns one
    /// sequence per distinct type.
    /// </summary>
    public static IEnumerable<Token>[] Split(this IEnumerable<Token> tokens)
    {
        // Each grouping is itself a sequence of its tokens.
        IEnumerable<Token>[] groups = tokens
            .GroupBy(item => item.TokenType)
            .ToArray();
        return groups;
    }
}
Sample of use ( I used a web project to spin this )
// Sample data: thirteen tokens spread over five token types.
List<Token> tokens = new List<Token>
{
    new Token { Name = "a", TokenType = TokenType.Type1 },
    new Token { Name = "b", TokenType = TokenType.Type1 },
    new Token { Name = "c", TokenType = TokenType.Type1 },
    new Token { Name = "d", TokenType = TokenType.Type2 },
    new Token { Name = "e", TokenType = TokenType.Type2 },
    new Token { Name = "f", TokenType = TokenType.Type3 },
    new Token { Name = "g", TokenType = TokenType.Type3 },
    new Token { Name = "h", TokenType = TokenType.Type3 },
    new Token { Name = "i", TokenType = TokenType.Type4 },
    new Token { Name = "j", TokenType = TokenType.Type4 },
    new Token { Name = "k", TokenType = TokenType.Type4 },
    new Token { Name = "l", TokenType = TokenType.Type4 },
    new Token { Name = "m", TokenType = TokenType.Type5 },
};

// Print the flat list first.
StringBuilder stringed = new StringBuilder();
foreach (Token token in tokens)
{
    stringed.Append(token.Name);
    stringed.Append(", ");
}
Response.Write(stringed.ToString());
Response.Write("<br />");

// Group once and iterate that result; calling Split() a second time would redo the grouping.
var q = tokens.Split();
foreach (var list in q)
{
    stringed = new StringBuilder();
    foreach (Token token in list)
    {
        stringed.Append(token.Name);
        stringed.Append(", ");
    }
    Response.Write(stringed.ToString());
    // "<br />" is the valid line-break element; "</br>" is a stray end tag.
    Response.Write("<br />");
}
So all I am doing is using LINQ. Feel free to add or remove things; you can actually go crazy with this and group on many different properties.
Do you need to convert it to an array? You could potentially use LINQ and delayed execution to return the results.
EDIT:
With the clarified question it would be hard to bend LINQ to make it return the results the way you want. If you still want to have the execution of each cycle delayed you could write your own enumerator.
I recommend perf testing this compared to the other options to see if there is a performance gain for your scenario if you attempt this approach. It might cause more overhead managing the iterator which would be bad for cases with little data.
I hope this helps.
// This is the easy way to make your own iterator using the C# syntax
// It will return sets of separated tokens in a lazy fashion
// This sample is based on the version provided by @Ants
/// <summary>
/// Lazily splits <paramref name="tokens"/> into groups of consecutive items,
/// using every <see cref="Token"/> whose <c>TokenType</c> equals
/// <paramref name="type"/> as a separator.
/// </summary>
/// <param name="tokens">The sequence to split.</param>
/// <param name="type">The token type that acts as the separator.</param>
/// <returns>
/// The non-empty groups found between separators, produced one at a time as
/// the caller enumerates. Separators are not included and empty groups are skipped.
/// </returns>
public static IEnumerable<IEnumerable<object>> Split(this IEnumerable<object> tokens,
    TokenType type)
{
    List<object> segment = new List<object>();

    foreach (object element in tokens)
    {
        Token candidate = element as Token;
        bool isSeparator = candidate != null && candidate.TokenType == type;

        if (!isSeparator)
        {
            segment.Add(element);
            continue;
        }

        // Separator with nothing collected yet: there is no group to hand out.
        if (segment.Count == 0)
        {
            continue;
        }

        yield return segment;
        segment = new List<object>();
    }

    // Hand out whatever followed the last separator.
    if (segment.Count != 0)
    {
        yield return segment;
    }
}
Warning: This compiles but might still have hidden bugs. It is getting late here.
// This is doing the same thing but doing it all by hand.
// You could use this method as well to lazily iterate through the 'current' list as well
// This is probably overkill and substantially more complex
/// <summary>
/// Hand-written lazy splitter. Enumerating it produces the non-empty groups of
/// consecutive items found between separator tokens, where a separator is a
/// <see cref="Token"/> whose <c>TokenType</c> equals the target type.
/// Separators are never part of a group. The source is read only as far as
/// needed to complete the group being requested.
/// </summary>
public class TokenSplitter : IEnumerable<IEnumerable<object>>, IEnumerator<IEnumerable<object>>
{
    private readonly IEnumerable<object> _tokens;
    private readonly TokenType _target;
    private IEnumerator<object> _enumerator;
    private List<object> _current;
    private bool _isDone;

    /// <summary>Creates a splitter over <paramref name="tokens"/>.</summary>
    /// <param name="tokens">The sequence to split; not enumerated until <see cref="MoveNext"/> is called.</param>
    /// <param name="target">The token type that acts as the separator.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="tokens"/> is null.</exception>
    public TokenSplitter(IEnumerable<object> tokens, TokenType target)
    {
        if (tokens == null)
        {
            throw new System.ArgumentNullException("tokens");
        }

        _tokens = tokens;
        _target = target;
        _current = new List<object>();
    }

    // Cruft from the IEnumerable and generic IEnumerator

    /// <summary>
    /// Returns an independent cursor over the same source, so the splitter can
    /// be enumerated more than once without the passes interfering.
    /// </summary>
    public IEnumerator<IEnumerable<object>> GetEnumerator()
    {
        return new TokenSplitter(_tokens, _target);
    }

    System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator()
    {
        return GetEnumerator();
    }

    /// <summary>The group produced by the most recent successful <see cref="MoveNext"/>.</summary>
    public IEnumerable<object> Current { get { return _current; } }

    /// <summary>Releases the underlying source enumerator and ends the enumeration.</summary>
    public void Dispose()
    {
        ReleaseSource();
        _isDone = true;
    }

    #region IEnumerator Members

    object System.Collections.IEnumerator.Current { get { return Current; } }

    /// <summary>Advances to the next non-empty group.</summary>
    /// <returns>true when <see cref="Current"/> holds a new group; false when the source is exhausted.</returns>
    public bool MoveNext()
    {
        if (_isDone)
        {
            return false;
        }

        // The source is opened on first use, keeping construction free of side effects.
        if (_enumerator == null)
        {
            _enumerator = _tokens.GetEnumerator();
        }

        return FillCurrent();
    }

    /// <summary>Rewinds the splitter so that the next <see cref="MoveNext"/> starts from the beginning of the source.</summary>
    public void Reset()
    {
        ReleaseSource();
        _current = new List<object>();
        _isDone = false;
    }

    // Reads the source up to the next separator that closes a non-empty group
    // and stores that group in _current. Returns false when no group is left.
    private bool FillCurrent()
    {
        List<object> group = new List<object>();

        while (_enumerator.MoveNext())
        {
            object item = _enumerator.Current;
            Token token = item as Token;

            if (token == null || token.TokenType != _target)
            {
                group.Add(item);
            }
            else if (group.Count > 0)
            {
                // A separator closes the group; the source is left positioned
                // on it so the next call resumes right after it.
                _current = group;
                return true;
            }
            // A separator with an empty group (leading or repeated) is skipped.
        }

        // Source exhausted: whatever was collected is the final group.
        _isDone = true;
        ReleaseSource();
        _current = group;
        return group.Count > 0;
    }

    // Disposes the source enumerator, if one is open.
    private void ReleaseSource()
    {
        if (_enumerator != null)
        {
            _enumerator.Dispose();
            _enumerator = null;
        }
    }

    #endregion
}