Skip to main content
3 of 7
added 653 characters in body
Dan W
  • 173
  • 6

Version of C# StringBuilder to allow for strings larger than 2 billion characters

In C#, 64bit Windows, .NET 4.5 (or later), and enabling gcAllowVeryLargeObjects in the App.config file allows for objects larger than two gigabyte. That's cool, but unfortunately, the maximum number of elements that C# allows in an array is still limited to about 2^31 = 2.15 billion. Testing confirmed this.

To overcome this, Microsoft recommends in Option B creating the arrays natively. Problem is we need to use unsafe code, and as far as I know, unicode won't be supported, at least not easily.

So I ended up creating my own BigStringBuilder function in the end. It's a list where each list element (or page) is a char array (type List<char[]>).

Providing you're using 64 bit Windows, you can now easily surpass the 2 billion character element limit. I managed to test creating a giant string around 32 gigabytes large (needed to increase virtual memory in the OS first, otherwise I could only get around 7GB on my 8GB RAM PC). I'm sure it handles more than 32GB easily. In theory, it should be able to handle around 1,000,000,000 * 1,000,000,000 chars or one quintillion characters, which should be enough for anyone!

I also added some common functions to provide some functionality such as fileSave(), length(), substring(), replace(), etc. Like the StringBuilder, in-place character writing (mutability), and instant truncation are possible.

Speed-wise, some quick tests show that it's not significantly slower than a StringBuilder when appending (found it was 33% slower in one test). I got similar performance if I went for a 2D jagged char array (char[][]) instead of List<char[]>, but Lists are simpler to work with, so I stuck with that.

I'm looking for advice to potentially speed up performance, particularly for the append function, and to access or write faster via the indexer (public char this[long n] {...} )

// A simplified version specially for StackOverflow / Codereview
public class BigStringBuilder
{
    List<char[]> c = new List<char[]>();
    private int pagedepth;
    private long pagesize;
    private long mpagesize;         // https://stackoverflow.com/questions/11040646/faster-modulus-in-c-c
    private int currentPage = 0;
    private int currentPosInPage = 0;

    public BigStringBuilder(int pagedepth = 12) {   // pagesize is 2^pagedepth (since must be a power of 2 for a fast indexer)
        this.pagedepth = pagedepth;
        pagesize = (long)Math.Pow(2, pagedepth);
        mpagesize = pagesize - 1;
        c.Add(new char[pagesize]);
    }

    // Indexer for this class, so you can use convenient square bracket indexing to address char elements within the array!!
    public char this[long n]    {
        get { return c[(int)(n >> pagedepth)][n & mpagesize]; }
        set { c[(int)(n >> pagedepth)][n & mpagesize] = value; }
    }

    public string[] returnPagesForTestingPurposes() {
        string[] s = new string[currentPage + 1];
        for (int i = 0; i < currentPage + 1; i++) s[i] = new string(c[i]);
        return s;
    }
    public void clear() {
        c = new List<char[]>();
        c.Add(new char[pagesize]);
        currentPage = 0;
        currentPosInPage = 0;
    }

    // See: https://stackoverflow.com/questions/373365/how-do-i-write-out-a-text-file-in-c-sharp-with-a-code-page-other-than-utf-8/373372
    public void fileSave(string path)   {
        StreamWriter sw = File.CreateText(path);
        for (int i = 0; i < currentPage; i++) sw.Write(new string(c[i]));
        sw.Write(new string(c[currentPage], 0, currentPosInPage));
        sw.Close();
    }

    public void fileOpen(string path)   {
        clear();
        StreamReader sw = new StreamReader(path);
        int len = 0;
        while ((len = sw.ReadBlock(c[currentPage], 0, (int)pagesize)) != 0){
            if (!sw.EndOfStream)    {
                currentPage++;
                if (currentPage == c.Count) c.Add(new char[pagesize]);
            }
            else    {
                currentPosInPage = len;
                break;
            }
        }
        sw.Close();
    }

    public long length()    {
        return (long)currentPage * (long)pagesize + (long)currentPosInPage;
    }

    public string ToString(long max = 2000000000)   {
        if (length() < max) return substring(0, length());
        else return substring(0, max);
    }

    public string substring(long x, long y) {
        StringBuilder sb = new StringBuilder();
        for (long n = x; n < y; n++) sb.Append(c[(int)(n >> pagedepth)][n & mpagesize]);    //8s
        return sb.ToString();
    }

    public bool match(string find, long start = 0)  {
        //if (s.Length > length()) return false;
        for (int i = 0; i < find.Length; i++) if (i + start == find.Length || this[start + i] != find[i]) return false;
        return true;
    }
    public void replace(string s, long pos) {
        for (int i = 0; i < s.Length; i++)  {
            c[(int)(pos >> pagedepth)][pos & mpagesize] = s[i];
            pos++;
        }
    }

    // Simple implementation of an append() function. Testing shows this to be about
    // as fast or faster than the more sophisticated Append2() function further below
    // despite its simplicity:
    public void Append(string s)
    {
        for (int i = 0; i < s.Length; i++)
        {
            c[currentPage][currentPosInPage] = s[i];
            currentPosInPage++;
            if (currentPosInPage == pagesize)
            {
                currentPosInPage = 0;
                currentPage++;
                if (currentPage == c.Count) c.Add(new char[pagesize]);
            }
        }
    }

    // This method is a more sophisticated version of the Append() function above.
    // Surprisingly, in real-world testing, it doesn't seem to be any faster. 
    public void Append2(string s)
    {
        if (currentPosInPage + s.Length <= pagesize)
        {
            // append s entirely to current page
            for (int i = 0; i < s.Length; i++)
            {
                c[currentPage][currentPosInPage] = s[i];
                currentPosInPage++;
            }
        }
        else
        {
            int stringpos;
            int topup = (int)pagesize - currentPosInPage;
            // Finish off current page with substring of s
            for (int i = 0; i < topup; i++)
            {
                c[currentPage][currentPosInPage] = s[i];
                currentPosInPage++;
            }
            currentPage++;
            currentPosInPage = 0;
            stringpos = topup;
            int remainingPagesToFill = (s.Length - topup) >> pagedepth; // We want the floor here
            // fill up complete pages if necessary:
            if (remainingPagesToFill > 0)
            {
                for (int i = 0; i < remainingPagesToFill; i++)
                {
                    if (currentPage == c.Count) c.Add(new char[pagesize]);
                    for (int j = 0; j < pagesize; j++)
                    {
                        c[currentPage][j] = s[stringpos];
                        stringpos++;
                    }
                    currentPage++;
                }
            }
            // finish off remainder of string s on new page:
            if (currentPage == c.Count) c.Add(new char[pagesize]);
            for (int i = stringpos; i < s.Length; i++)
            {
                c[currentPage][currentPosInPage] = s[i];
                currentPosInPage++;
            }
        }
    }
}
Dan W
  • 173
  • 6