So I was recently working on a string program where the strings were easily megabytes long, and I ran into problems with there being out of memory exceptions etc. So I said to myself, this would be a perfect problem which lazy execution solves. Here is a split function that essentially makes it so that only one of the string tokens is in memory at any given time, making it much more efficient in terms of memory. It might be a bit slower in terms of CPU cycles, but I wouldn't bet it would be that much slower. My program also had to split strings according to search threads, and one of the functions preforms that as well. I am happy with it, if you notice any problems with the implementation don't hesitate to point them out and suggest alterations. State machines are really cool this way.

Edited 1 Year Ago by overwraith: Changed to a code snippet

Comments
that's really good
nice to share ^^
public static class StringExt{

        public static System.Collections.Generic.IEnumerable<String> NextSplit(
            this String str, 
            char[] tokens) {

            int leftOffHere = 0;
            int[] tokLoc = new int[tokens.Length];//token location

            while (leftOffHere  < str.Length) {
                for (int tokenIndex = 0; tokenIndex < tokens.Length; tokenIndex++)
                    tokLoc[tokenIndex] = str.IndexOf(tokens[tokenIndex], leftOffHere);

                //find the closest token location
                int closest = int.MaxValue;
                foreach (int loc in tokLoc)
                    if (loc < closest && loc != -1)
                        closest = loc;

                if (closest == int.MaxValue)
                    yield break;

                yield return str.Substring(leftOffHere, closest - leftOffHere);

                //left off at closest + 1 so doesn't use same position twice
                leftOffHere = closest + 1;
            }//end loop

        }//end method

        public static System.Collections.Generic.IEnumerable<String> NextSplit(
            this String str, char[] tokens, int start, int length) {

            int leftOffHere = start;
            int[] tokLoc = new int[tokens.Length];//token location 
            
            //convert length to an index, starting index plus the length
            while (leftOffHere < length + start) {
                for (int tokenIndex = 0; tokenIndex < tokens.Length; tokenIndex++)
                    tokLoc[tokenIndex] = str.IndexOf(tokens[tokenIndex], leftOffHere);

                //find the closest token location
                int closest = int.MaxValue;
                foreach (int loc in tokLoc)
                    if (loc < closest && loc != -1)
                        closest = loc;

                if(closest == int.MaxValue)
                    yield break;

                yield return str.Substring(leftOffHere, closest - leftOffHere);

                //left off at closest + 1 so doesn't use same position twice
                leftOffHere = closest + 1;
            }//end loop

        }//end method

    }//end class

Actually this should probably be moved to snippets, that's where I wanted to put it.

There seems to be a bug. It doesn't return the segment after the last token(e.g. "9/7/2011", returns "9" and "7" only)

You are right. I will try to fix it as soon as I am able. If somebody beats me to the punch, please post.

Ok, I think I fixed it, will have to check the second method in more detail.

    public static class StringExt{

        public static System.Collections.Generic.IEnumerable<String> NextSplit(
            this String str, 
            char[] tokens) {

            int leftOffHere = 0;
            int[] tokLoc = new int[tokens.Length];//token location

            while (leftOffHere  < str.Length) {
                for (int tokenIndex = 0; tokenIndex < tokens.Length; tokenIndex++)
                    tokLoc[tokenIndex] = str.IndexOf(tokens[tokenIndex], leftOffHere);

                //find the closest token location
                int closest = int.MaxValue;
                foreach (int loc in tokLoc)
                    if (loc < closest && loc != -1)
                        closest = loc;

                if (closest == int.MaxValue)
                    closest = str.Length;

                yield return str.Substring(leftOffHere, closest - leftOffHere);

                //left off at closest + 1 so doesn't use same position twice
                leftOffHere = closest + 1;
            }//end loop

        }//end method

        public static System.Collections.Generic.IEnumerable<String> NextSplit(
            this String str, char[] tokens, int start, int length) {

            int leftOffHere = start;
            int[] tokLoc = new int[tokens.Length];//token location 

            //convert length to an index, starting index plus the length
            while (leftOffHere < length + start) {
                for (int tokenIndex = 0; tokenIndex < tokens.Length; tokenIndex++)
                    tokLoc[tokenIndex] = str.IndexOf(tokens[tokenIndex], leftOffHere);

                //find the closest token location
                int closest = int.MaxValue;
                foreach (int loc in tokLoc)
                    if (loc < closest && loc != -1)
                        closest = loc;

                if(closest == int.MaxValue)
                    yield break;

                yield return str.Substring(leftOffHere, closest - leftOffHere);

                //left off at closest + 1 so doesn't use same position twice
                leftOffHere = closest + 1;
            }//end loop

        }//end method

    }//end class

A similar change to the second method should work. Replace the yield break with closest = length + start;

Comments
Thanks for taking the time to debug my code.

Something else to consider, using the String.IndexOfAny method will shorten your code considerably. Here's some code to look at. This should work but I haven't fully tested it:

public static IEnumerable<String> NextSplit(
this string str,
char[] delimiters)
{
    int nextIndex = 0;
    int lastIndex = 0;
    while(nextIndex != -1)
    {
        nextIndex = str.IndexOfAny(delimiters,lastIndex);
        if(nextIndex != -1)
        {
            yield return str.Substring(lastIndex, nextIndex - lastIndex);
            lastIndex = nextIndex+1;
        }
        else
        {
            yield return str.Substring(lastIndex);
        }
    }
}
public static IEnumerable<String> NextSplit(
this string str, 
char[] delimiters, 
int start)
{
    int nextIndex = 0;
    int lastIndex = start;
    while (nextIndex != -1)
    {
        nextIndex = str.IndexOfAny(delimiters, lastIndex);
        if (nextIndex != -1)
        {
            yield return str.Substring(lastIndex, nextIndex - lastIndex);
            lastIndex = nextIndex + 1;
        }
        else
        {
            yield return str.Substring(lastIndex);
        }
    }
}
public static IEnumerable<String> NextSplit(
this string str, 
char[] delimiters, 
int start, 
int length)
{
    int nextIndex = 0;
    int lastIndex = start;
    int limit = start + length;
    while (nextIndex != -1)
    {
        nextIndex = str.IndexOfAny(delimiters, lastIndex);
        if (nextIndex != -1)
        {
            if(nextIndex < limit)
            {
                yield return str.Substring(lastIndex, nextIndex - lastIndex);
                lastIndex = nextIndex + 1;
            }
            else
            {
                yield return str.Substring(lastIndex, limit - lastIndex);
                break;
            }
        }
        else
        {
            yield return str.Substring(lastIndex, limit - lastIndex);
        }
    }
}

Edited 1 Year Ago by tinstaafl

Comments
Looks like a good fix thanks.

Something else to consider, using the String.IndexOfAny method will shorten your code considerably

And, perhaps more importantly, improve the runtime from being quadratic in the worst case to being linear (in the length of the string).

Comments
That's a good point.
The article starter has earned a lot of community kudos, and such articles offer a bounty for quality replies.