[Summary] [Continued]

Changed the micro-benchmark to use shorter strings, longer inner loops, and compare just:

  • String intrinsics (inline x86 string instructions)
  • String functions (from the standard C library)
  • A simple inlined while loop.

The following series of measurements seem to largely follow my initial (revised) theory - that a simple one-character while loop tends to win out as the strings copied become shorter and the inner loop longer. (Note that I suspect the “typical” program uses even shorter strings on average than even this benchmark - so we might expect the gain here to be more pronounced).

MSVC 6.0 optimizer set to “minimize size”.

RATE C intrinsic string calls 292.8 MB/second (1000.0 MB in 3.4 seconds)
RATE C function string calls 293.7 MB/second (1000.0 MB in 3.4 seconds)
RATE x1 string calls 249.6 MB/second (1000.0 MB in 4.0 seconds)

MSVC 6.0 optimizer set to “maximize speed”.

RATE C intrinsic string calls 227.0 MB/second (1000.0 MB in 4.4 seconds)
RATE C function string calls 288.6 MB/second (1000.0 MB in 3.5 seconds)
RATE x1 string calls 290.3 MB/second (1000.0 MB in 3.4 seconds)

Using the (most current?) Microsoft C++ compiler optimizing for Athlon and speed.

C:\tmp\benchmarks>"c:\program files\Microsoft Visual C++ Toolkit 2003\bin\cl.exe" /G7 /O2 string\string.cpp
Microsoft (R) 32-bit C/C++ Optimizing Compiler Version 13.10.3077 for 80x86
Copyright (C) Microsoft Corporation 1984-2002. All rights reserved.

string.cpp
Microsoft (R) Incremental Linker Version 7.10.3077
Copyright (C) Microsoft Corporation.  All rights reserved.

/out:string.exe
string.obj

C:\tmp\benchmarks>string
RATE C intrinsic string calls 230.1 MB/second (1000.0 MB in 4.3 seconds)
RATE C function string calls 291.1 MB/second (1000.0 MB in 3.4 seconds)
RATE x1 string calls 307.2 MB/second (1000.0 MB in 3.3 seconds)

Using the MSVC 6.0 compiler optimizing for Pentium Pro.

C:\tmp\benchmarks>cl /nologo /G6 /O2 string\string.cpp
string.cpp

C:\tmp\benchmarks>string
RATE C intrinsic string calls 236.1 MB/second (1000.0 MB in 4.2 seconds)
RATE C function string calls 292.0 MB/second (1000.0 MB in 3.4 seconds)
RATE x1 string calls 299.0 MB/second (1000.0 MB in 3.3 seconds)

Using the MSVC 6.0 compiler optimizing for Pentium.

C:\tmp\benchmarks>cl /nologo /G5 /O2 string\string.cpp
string.cpp

C:\tmp\benchmarks>string
RATE C intrinsic string calls 228.0 MB/second (1000.0 MB in 4.4 seconds)
RATE C function string calls 292.8 MB/second (1000.0 MB in 3.4 seconds)
RATE x1 string calls 285.3 MB/second (1000.0 MB in 3.5 seconds)

Using the MSVC 6.0 compiler optimizing for i486.

C:\tmp\benchmarks>cl /nologo /G4 /O2 string\string.cpp
string.cpp

C:\tmp\benchmarks>string
RATE C intrinsic string calls 229.0 MB/second (1000.0 MB in 4.4 seconds)
RATE C function string calls 292.0 MB/second (1000.0 MB in 3.4 seconds)
RATE x1 string calls 288.6 MB/second (1000.0 MB in 3.5 seconds)

Using the MSVC 6.0 compiler optimizing for i386.

C:\tmp\benchmarks>cl /nologo /G3 /O2 string\string.cpp
string.cpp

C:\tmp\benchmarks>string
RATE C intrinsic string calls 230.1 MB/second (1000.0 MB in 4.3 seconds)
RATE C function string calls 292.0 MB/second (1000.0 MB in 3.4 seconds)
RATE x1 string calls 287.8 MB/second (1000.0 MB in 3.5 seconds)

Using the GNU C++ compiler.

C:\tmp\benchmarks>g++ -O6 -o string-gcc.exe string/string.cpp

C:\tmp\benchmarks>string-gcc
RATE C intrinsic string calls 238.3 MB/second (1000.0 MB in 4.2 seconds)
RATE C function string calls 243.5 MB/second (1000.0 MB in 4.1 seconds)
RATE x1 string calls 320.0 MB/second (1000.0 MB in 3.1 seconds)

While the results vary a bit, it seems reasonable to assume that the simple inlined while loop will equal or win over the alternatives for short string copies - especially for longer inner loops where the optimizer will have more of an opportunity to do interesting things with the inlined code.

Source for the micro-benchmark.

#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <string.h>

//static const char sOut[] = "ABCDEFGHJKLMNOPQRSTUVWXYZabcdefghjklmnopqrstuvwxyz123456789";
static const char sOut1[] = "abcdefghjklmnopqrstuvwxyz";
static const char sOut2[] = "ABCDEFGHJKLMNOPQRSTUVWXYZ";
int nLength = ::strlen(sOut1);

void report_times(const char* s,int dt,int cb)
{
    //printf("TIME %s %d ticks for %d characters written\n",s,dt,cb);
    double ts = (double)dt / CLOCKS_PER_SEC;
    double mb = (double)cb / 1000000;
    double rate = mb / ts;
    printf("RATE %s %0.1f MB/second (%0.1f MB in %0.1f seconds)\n",s,rate,mb,ts);
}

//
//  Perform a function for a fixed number of iterations and return time.
//

const int nLoop = 10000000;
int dtLoop = 0;

typedef void (*doit)(const char*,const char*);

int time_function(doit fn)
{
    clock_t t0 = ::clock();
    for (int i=0; i<nLoop; ++i) {
        const char* s1 = sOut1 + (15 & i);
        const char* s2 = sOut2 + nLength - (15 & i);
        (*fn)(s1,s2);
    }
    return (int)(::clock() - t0) - dtLoop;
}

//
//  Time C string operations.
//

char sWork[256];
int nTotal = 0;

void do_total(const char* s1,const char* s2)
{
    nTotal += 4 * nLength;
}

void do_c0_string_calls(const char* s1,const char* s2)
{
    ::strcpy(sWork,s1);
    ::strcat(sWork,s2);
    ::strcpy(sWork,s2);
    ::strcat(sWork,s1);
    ::strcpy(sWork,s1);
    ::strcat(sWork,s1);
    ::strcpy(sWork,s2);
    ::strcat(sWork,s2);
}

// Optimization that seems to work best (at least with an Athlon).
#pragma function(strlen,strcpy,strcat)

void do_c1_string_calls(const char* s1,const char* s2)
{
    ::strcpy(sWork,s1);
    ::strcat(sWork,s2);
    ::strcpy(sWork,s2);
    ::strcat(sWork,s1);
    ::strcpy(sWork,s1);
    ::strcat(sWork,s1);
    ::strcpy(sWork,s2);
    ::strcat(sWork,s2);
}

inline void x1_strcpy(char* s1,const char* s2)
{
    while (*s1++ = *s2++);
}

inline void x1_strcat(char* s1,const char* s2)
{
    while (*s1) ++s1;
    x1_strcpy(s1,s2);
}

void do_x1_string_calls(const char* s1,const char* s2)
{
    x1_strcpy(sWork,s1);
    x1_strcat(sWork,s2);
    x1_strcpy(sWork,s2);
    x1_strcat(sWork,s1);
    x1_strcpy(sWork,s1);
    x1_strcat(sWork,s1);
    x1_strcpy(sWork,s2);
    x1_strcat(sWork,s2);
}


int main(int ac,char** av)
{
    dtLoop = time_function(do_total);
    report_times("C intrinsic string calls",time_function(do_c0_string_calls),nTotal);
    report_times("C function string calls",time_function(do_c1_string_calls),nTotal);
    report_times("x1 string calls",time_function(do_x1_string_calls),nTotal);
    return 0;
}