random memes }

Continued - building a better string class - assumptions revised

[Summary] [Continued]

Changed the micro-benchmark to use shorter strings, longer inner loops, and compare just:

The following series of measurements seem to largely follow my initial (revised) theory - that a simple one-character while loop tends to win out as the strings copied become shorter and the inner loop longer. (Note that I suspect the "typical" program uses even shorter strings on average than even this benchmark - so we might expect the gain here to be more pronounced).

MSVC 6.0 optimizer set to "minimize size".

    RATE C intrinsic string calls 292.8 MB/second (1000.0 MB in 3.4 seconds)
    RATE C function string calls 293.7 MB/second (1000.0 MB in 3.4 seconds)
    RATE x1 string calls 249.6 MB/second (1000.0 MB in 4.0 seconds)

MSVC 6.0 optimizer set to "maximize speed".

    RATE C intrinsic string calls 227.0 MB/second (1000.0 MB in 4.4 seconds)
    RATE C function string calls 288.6 MB/second (1000.0 MB in 3.5 seconds)
    RATE x1 string calls 290.3 MB/second (1000.0 MB in 3.4 seconds)

Using the (most current?) Microsoft C++ compiler optimizing for Athlon and speed.

    C:\tmp\benchmarks>"c:\program files\Microsoft Visual C++ Toolkit 2003\bin\cl.exe" /G7 /O2 string\string.cpp
    Microsoft (R) 32-bit C/C++ Optimizing Compiler Version 13.10.3077 for 80x86
    Copyright (C) Microsoft Corporation 1984-2002. All rights reserved.

    string.cpp
    Microsoft (R) Incremental Linker Version 7.10.3077
    Copyright (C) Microsoft Corporation.  All rights reserved.

    /out:string.exe
    string.obj

    C:\tmp\benchmarks>string
    RATE C intrinsic string calls 230.1 MB/second (1000.0 MB in 4.3 seconds)
    RATE C function string calls 291.1 MB/second (1000.0 MB in 3.4 seconds)
    RATE x1 string calls 307.2 MB/second (1000.0 MB in 3.3 seconds)

Using the MSVC 6.0 compiler optimizing for Pentium Pro.

    C:\tmp\benchmarks>cl /nologo /G6 /O2 string\string.cpp
    string.cpp

    C:\tmp\benchmarks>string
    RATE C intrinsic string calls 236.1 MB/second (1000.0 MB in 4.2 seconds)
    RATE C function string calls 292.0 MB/second (1000.0 MB in 3.4 seconds)
    RATE x1 string calls 299.0 MB/second (1000.0 MB in 3.3 seconds)

Using the MSVC 6.0 compiler optimizing for Pentium.

    C:\tmp\benchmarks>cl /nologo /G5 /O2 string\string.cpp
    string.cpp

    C:\tmp\benchmarks>string
    RATE C intrinsic string calls 228.0 MB/second (1000.0 MB in 4.4 seconds)
    RATE C function string calls 292.8 MB/second (1000.0 MB in 3.4 seconds)
    RATE x1 string calls 285.3 MB/second (1000.0 MB in 3.5 seconds)

Using the MSVC 6.0 compiler optimizing for i486.

    C:\tmp\benchmarks>cl /nologo /G4 /O2 string\string.cpp
    string.cpp

    C:\tmp\benchmarks>string
    RATE C intrinsic string calls 229.0 MB/second (1000.0 MB in 4.4 seconds)
    RATE C function string calls 292.0 MB/second (1000.0 MB in 3.4 seconds)
    RATE x1 string calls 288.6 MB/second (1000.0 MB in 3.5 seconds)

Using the MSVC 6.0 compiler optimizing for i386.

    C:\tmp\benchmarks>cl /nologo /G3 /O2 string\string.cpp
    string.cpp

    C:\tmp\benchmarks>string
    RATE C intrinsic string calls 230.1 MB/second (1000.0 MB in 4.3 seconds)
    RATE C function string calls 292.0 MB/second (1000.0 MB in 3.4 seconds)
    RATE x1 string calls 287.8 MB/second (1000.0 MB in 3.5 seconds)

Using the GNU C++ compiler.

    C:\tmp\benchmarks>g++ -O6 -o string-gcc.exe string/string.cpp

    C:\tmp\benchmarks>string-gcc
    RATE C intrinsic string calls 238.3 MB/second (1000.0 MB in 4.2 seconds)
    RATE C function string calls 243.5 MB/second (1000.0 MB in 4.1 seconds)
    RATE x1 string calls 320.0 MB/second (1000.0 MB in 3.1 seconds)

While the results vary a bit, it seems reasonable to assume that the simple inlined while loop will equal or win over the alternatives for short string copies - especially for longer inner loops where the optimizer will have more of an opportunity to do interesting things with the inlined code.

Source for the micro-benchmark.

    #include <stdlib.h>
    #include <stdio.h>
    #include <time.h>
    #include <string.h>

    //static const char sOut[] = "ABCDEFGHJKLMNOPQRSTUVWXYZabcdefghjklmnopqrstuvwxyz123456789";
    static const char sOut1[] = "abcdefghjklmnopqrstuvwxyz";
    static const char sOut2[] = "ABCDEFGHJKLMNOPQRSTUVWXYZ";
    int nLength = ::strlen(sOut1);

    void report_times(const char* s,int dt,int cb)
    {
        //printf("TIME %s %d ticks for %d characters written\n",s,dt,cb);
        double ts = (double)dt / CLOCKS_PER_SEC;
        double mb = (double)cb / 1000000;
        double rate = mb / ts;
        printf("RATE %s %0.1f MB/second (%0.1f MB in %0.1f seconds)\n",s,rate,mb,ts);
    }

    //
    //  Perform a function for a fixed number of iterations and return time.
    //

    const int nLoop = 10000000;
    int dtLoop = 0;

    typedef void (*doit)(const char*,const char*);

    int time_function(doit fn)
    {
        clock_t t0 = ::clock();
        for (int i=0; i<nLoop; ++i) {
            const char* s1 = sOut1 + (15 & i);
            const char* s2 = sOut2 + nLength - (15 & i);
            (*fn)(s1,s2);
        }
        return (int)(::clock() - t0) - dtLoop;
    }

    //
    //  Time C string operations.
    //

    char sWork[256];
    int nTotal = 0;

    void do_total(const char* s1,const char* s2)
    {
        nTotal += 4 * nLength;
    }

    void do_c0_string_calls(const char* s1,const char* s2)
    {
        ::strcpy(sWork,s1);
        ::strcat(sWork,s2);
        ::strcpy(sWork,s2);
        ::strcat(sWork,s1);
        ::strcpy(sWork,s1);
        ::strcat(sWork,s1);
        ::strcpy(sWork,s2);
        ::strcat(sWork,s2);
    }

    // Optimization that seems to work best (at least with an Athlon).
    #pragma function(strlen,strcpy,strcat)

    void do_c1_string_calls(const char* s1,const char* s2)
    {
        ::strcpy(sWork,s1);
        ::strcat(sWork,s2);
        ::strcpy(sWork,s2);
        ::strcat(sWork,s1);
        ::strcpy(sWork,s1);
        ::strcat(sWork,s1);
        ::strcpy(sWork,s2);
        ::strcat(sWork,s2);
    }

    inline void x1_strcpy(char* s1,const char* s2)
    {
        while (*s1++ = *s2++);
    }

    inline void x1_strcat(char* s1,const char* s2)
    {
        while (*s1) ++s1;
        x1_strcpy(s1,s2);
    }

    void do_x1_string_calls(const char* s1,const char* s2)
    {
        x1_strcpy(sWork,s1);
        x1_strcat(sWork,s2);
        x1_strcpy(sWork,s2);
        x1_strcat(sWork,s1);
        x1_strcpy(sWork,s1);
        x1_strcat(sWork,s1);
        x1_strcpy(sWork,s2);
        x1_strcat(sWork,s2);
    }


    int main(int ac,char** av)
    {
        dtLoop = time_function(do_total);
        report_times("C intrinsic string calls",time_function(do_c0_string_calls),nTotal);
        report_times("C function string calls",time_function(do_c1_string_calls),nTotal);
        report_times("x1 string calls",time_function(do_x1_string_calls),nTotal);
        return 0;
    }