One Billion Row Challenge in Haskell

I’ve already looked at Danny van Kooten’s version, and I would say it isn’t optimised much at all, apart from the parallel reading and a trick with forking and un-mapping the data file. Even without the forking and piping I could not get it to run successfully; I always got a segfault in one of the worker threads.
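
As I understand it, the trick is that a forked child does all the work on the mmapped file and ships the formatted result to the parent through a pipe, so the parent can print it and exit immediately while the kernel tears down the huge mapping in the child, off the measured path. A minimal sketch of that pattern (not his actual code; the "{...}" string stands in for the real result):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    int pipefd[2];
    if (pipe(pipefd) != 0)
    {
        perror("pipe");
        exit(EXIT_FAILURE);
    }

    pid_t pid = fork();
    if (pid == -1)
    {
        perror("fork");
        exit(EXIT_FAILURE);
    }

    if (pid == 0)
    {
        // child: mmap the file, crunch the numbers, then ship the
        // formatted result through the pipe (stubbed out here)
        close(pipefd[0]);
        const char *result = "{...}";
        if (write(pipefd[1], result, strlen(result)) == -1)
        {
            perror("write");
        }
        close(pipefd[1]);
        // the implicit munmap of the huge mapping happens here,
        // after the parent has already printed and exited
        _exit(EXIT_SUCCESS);
    }

    // parent: print whatever the child sends, then exit without
    // waiting for the child's address space teardown
    close(pipefd[1]);
    char buf[4096];
    ssize_t n;
    while ((n = read(pipefd[0], buf, sizeof(buf))) > 0)
    {
        fwrite(buf, 1, (size_t)n, stdout);
    }
    close(pipefd[0]);
    return EXIT_SUCCESS;
}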

I quickly made a single-threaded version out of it to get a baseline comparison (and fixed the segfault).

C code
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#define MAX_DISTINCT_GROUPS    10000
#define MAX_GROUPBY_KEY_LENGTH 100

// Capacity of our hashmap
// Needs to be a power of 2
// so we can bit-and instead of modulo
#define HASHMAP_CAPACITY 16384
#define HASHMAP_INDEX(h) (h & (HASHMAP_CAPACITY - 1))
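// e.g. HASHMAP_INDEX(0x12345) == 0x2345 (same as 0x12345 % 16384)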

struct Group
{
    unsigned int count;
    int min;
    int max;
    long sum;
    char key[MAX_GROUPBY_KEY_LENGTH];
};

struct Result
{
    unsigned int n;
    unsigned int map[HASHMAP_CAPACITY];
    struct Group groups[MAX_DISTINCT_GROUPS];
};

// parses a fixed-point decimal number as an integer in tenths
// this only works because every value has exactly one decimal digit
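// e.g. "3.4"  → ('3' * 10 + '4') - ('0' * 11)           = 34
//      "12.3" → ('1' * 100 + '2' * 10 + '3') - ('0' * 111) = 123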
static inline const char *parse_number(int *dest, const char *s)
{
    // parse sign
    int mod = 1;
    if (*s == '-')
    {
        mod = -1;
        s++;
    }

    if (s[1] == '.')
    {
        *dest = ((s[0] * 10) + s[2] - ('0' * 11)) * mod;
        return s + 4;
    }

    *dest = (s[0] * 100 + s[1] * 10 + s[3] - ('0' * 111)) * mod;
    return s + 5;
}

static inline int min(int a, int b)
{
    return a < b ? a : b;
}
static inline int max(int a, int b)
{
    return a > b ? a : b;
}

// qsort callback
static inline int cmp(const void *ptr_a, const void *ptr_b)
{
    return strcmp(((struct Group *)ptr_a)->key, ((struct Group *)ptr_b)->key);
}

static struct Result *process_chunk(char *data, size_t data_size)
{
    // initialize result
    struct Result *result = malloc(sizeof(*result));
    if (!result)
    {
        perror("malloc error");
        exit(EXIT_FAILURE);
    }
    result->n = 0;

    // we could do this in a single call to memset
    // since the two are contiguous in memory,
    // but this code runs only once in this
    // single-threaded version, so it's not worth it
    memset(result->map, 0, HASHMAP_CAPACITY * sizeof(*result->map));
    memset(result->groups, 0, MAX_DISTINCT_GROUPS * sizeof(*result->groups));

    const char *s            = data;
    const char *end          = &data[data_size - 1];
    const char *last_but_one = end - 1;

    // flaming hot loop
    while (s < last_but_one)
    {
        const char *linestart = s;

        // find the position of ';'
        // while simultaneously hashing everything up to that point
        unsigned int len = 0;
        unsigned int h   = 0;
        while (s[len] != ';')
        {
            h = (h * 31) + (unsigned char)s[len];
            len += 1;
        }

        // parse decimal number as int
        int temperature;
        s = parse_number(&temperature, linestart + len + 1);

        // probe map until free spot or match
        // map entries store group index + 1; 0 means empty, so the
        // group at index 0 cannot be mistaken for a free slot
        unsigned int *c = &result->map[HASHMAP_INDEX(h)];
        while (*c > 0
               && memcmp(result->groups[*c - 1].key, linestart, len) != 0)
        {
            h += 1;
            c = &result->map[HASHMAP_INDEX(h)];
        }

        // new hashmap entry: claim a group and seed min/max with the
        // first temperature (the memset above left them at zero)
        if (*c == 0)
        {
            *c = ++result->n;
            memcpy(result->groups[*c - 1].key, linestart, len);
            result->groups[*c - 1].min = temperature;
            result->groups[*c - 1].max = temperature;
        }

        // update the group's statistics
        struct Group *g = &result->groups[*c - 1];
        g->count += 1;
        g->min = min(g->min, temperature);
        g->max = max(g->max, temperature);
        g->sum += temperature;
    }

    return result;
}

static void result_to_str(char *dest, const struct Result *result)
{
    char buf[128];
    *dest++ = '{';
    for (unsigned int i = 0; i < result->n; i++)
    {
        size_t n = (size_t)snprintf(
            buf, sizeof(buf), "%s=%.1f/%.1f/%.1f%s", result->groups[i].key,
            (double)result->groups[i].min / 10.0,
            ((double)result->groups[i].sum / (double)result->groups[i].count)
                / 10.0,
            (double)result->groups[i].max / 10.0,
            i < (result->n - 1) ? ", " : "");

        memcpy(dest, buf, n);
        dest += n;
    }
    *dest++ = '}';
    *dest   = 0x0;
}

int main(int argc, char **argv)
{
    char *file = "../measurements_big.txt";
    if (argc > 1)
    {
        file = argv[1];
    }

    int fd = open(file, O_RDONLY);
    if (fd == -1)
    {
        perror("error opening file");
        exit(EXIT_FAILURE);
    }

    struct stat sb;
    if (fstat(fd, &sb) == -1)
    {
        perror("error getting file size");
        exit(EXIT_FAILURE);
    }

    // mmap entire file into memory
    size_t sz  = (size_t)sb.st_size;
    char *data = mmap(NULL, sz, PROT_READ, MAP_SHARED, fd, 0);
    if (data == MAP_FAILED)
    {
        perror("error mmapping file");
        exit(EXIT_FAILURE);
    }

    struct Result *result = process_chunk(data, sz);

    // sort results alphabetically
    qsort(result->groups, (size_t)result->n, sizeof(*result->groups), cmp);

    // assumed to be large enough for the formatted result
    char *buf = malloc(240551);
    result_to_str(buf, result);
    puts(buf);

    // clean-up
    munmap((void *)data, sz);
    close(fd);

    return EXIT_SUCCESS;
}

I compiled it using his flags:

clang -std=c11 -O3 -march=native -mtune=native -flto -Wall -Wextra -Wpedantic -Wformat=2 -Wconversion -Wundef -Winline -Wimplicit-fallthrough test.c -o testc 

The output contains negative zeroes (-0.0) and a trailing comma, but is otherwise correct.
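
The negative zeroes presumably come from group means that are tiny but negative: e.g. a sum of -3 (in tenths) over 100 measurements is a mean of -0.003, which %.1f renders as -0.0. A small standalone demonstration with one possible fix (hypothetical values; link with -lm):

#include <math.h>
#include <stdio.h>

int main(void)
{
    long sum           = -3;  // tenths of a degree, summed over a group
    unsigned int count = 100; // a group whose mean is barely below zero
    double mean = (double)sum / (double)count / 10.0;
    printf("%.1f\n", mean); // prints -0.0

    // round to one decimal first, then normalize the signed zero
    double r = round(mean * 10.0) / 10.0;
    if (r == 0.0) // true for both +0.0 and -0.0
    {
        r = 0.0;
    }
    printf("%.1f\n", r); // prints 0.0
    return 0;
}
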
The benchmark: the runtime is 27.5 s and it needs 8 MB of RAM.

% hyperfine -r 5 -w 1 './testc > solution.txt'
Benchmark 1: ./testc > solution.txt
  Time (mean ± σ):     27.489 s ±  0.027 s    [User: 25.152 s, System: 2.271 s]
  Range (min … max):   27.452 s … 27.521 s    5 runs

My setup

Apple M1 Max
10 cores: 8 performance + 2 efficiency
max. clock 3206 MHz (it did run at that frequency, I checked)
48 MB L3 cache
SSD with measured speeds of 4629.7 MB/s write and 5180.3 MB/s read

My data file is 14.9 GB.

Btw. while looking for a Fortran version I found GitHub - sshestov/1brc-fortran, which (because of too many hash collisions) returns only 8350 of the 8939 stations and runs in 3.5 s using 16 threads. That’s why I always check the results :wink:

If somebody has an AVX2 CPU on Linux: this should be the fastest C solution (if the title is true)
