I’ve already looked at Danny van Kooten’s version and I would say it isn’t optimised much at all (except for the parallel reading and a trick with forking and un-mapping the data file). I could not get it to run successfully even without this forking and piping, I always get a segfault in one of the worker threads.
I quickly made a single threaded version out of it to get some baseline comparison (and fixed the segfault).
C code
#include <fcntl.h>
#include <math.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>
#define MAX_DISTINCT_GROUPS 10000
#define MAX_GROUPBY_KEY_LENGTH 100
// Capacity of our hashmap
// Needs to be a power of 2
// so we can bit-and instead of modulo
#define HASHMAP_CAPACITY 16384
#define HASHMAP_INDEX(h) (h & (HASHMAP_CAPACITY - 1))
// Aggregated statistics for one station (the group-by key).
// Temperatures are stored as integers in tenths of a degree
// (see parse_number), so divide by 10 for display.
struct Group
{
unsigned int count; // number of measurements folded into this group
int min; // minimum temperature seen, in tenths
int max; // maximum temperature seen, in tenths
long sum; // running sum in tenths; mean = sum / count
char key[MAX_GROUPBY_KEY_LENGTH]; // station name, NUL-padded (groups are zeroed on init)
};
// Aggregation state: an open-addressing hash map (map) over a dense
// array of groups. map holds positions into groups.
struct Result
{
unsigned int n; // number of distinct groups stored so far
unsigned int map[HASHMAP_CAPACITY]; // hash slot -> position in groups; NOTE(review): 0 marks an empty slot AND is a valid group index — ambiguous, verify insertion logic
struct Group groups[MAX_DISTINCT_GROUPS]; // dense group storage, qsort'ed before output
};
// parses a floating point number as an integer
// this is only possible because we know our data file has only a single decimal
// Parses a temperature of the form "d.d" or "dd.d" (optionally preceded
// by '-') into tenths of a degree. The data file guarantees exactly one
// fractional digit, which is what makes this fixed-shape parse valid.
// Returns a pointer just past the terminating newline.
static inline const char *parse_number(int *dest, const char *s)
{
// handle optional leading minus sign
int sign = 1;
if (*s == '-')
{
sign = -1;
s++;
}
if (s[1] == '.')
{
// single integer digit: "d.d\n"
*dest = sign * ((s[0] - '0') * 10 + (s[2] - '0'));
return s + 4;
}
// two integer digits: "dd.d\n"
*dest = sign * ((s[0] - '0') * 100 + (s[1] - '0') * 10 + (s[3] - '0'));
return s + 5;
}
// Returns the smaller of two ints.
static inline int min(int a, int b)
{
if (b < a)
{
return b;
}
return a;
}
// Returns the larger of two ints.
static inline int max(int a, int b)
{
if (b > a)
{
return b;
}
return a;
}
// qsort callback
static inline int cmp(const void *ptr_a, const void *ptr_b)
{
return strcmp(((struct Group *)ptr_a)->key, ((struct Group *)ptr_b)->key);
}
static struct Result *process_chunk(char *data, size_t data_size)
{
// initialize result
struct Result *result = malloc(sizeof(*result));
if (!result)
{
perror("malloc error");
exit(EXIT_FAILURE);
}
result->n = 0;
// we could do this in a single call to memset
// since the two are contiguous in memory
// but this code is only called NTHREADS times
// so not really worth it
memset(result->map, 0, HASHMAP_CAPACITY * sizeof(*result->map));
memset(result->groups, 0, MAX_DISTINCT_GROUPS * sizeof(*result->groups));
const char *s = data;
const char *end = &data[data_size - 1];
const char *last_but_one = end - 1;
// flaming hot loop
while (s < last_but_one)
{
const char *linestart = s;
// find position of ;
// while simulatenuously hashing everything up to that point
unsigned int len = 0;
unsigned int h = 0;
while (s[len] != ';')
{
h = (h * 31) + (unsigned char)s[len];
len += 1;
}
// parse decimal number as int
int temperature;
s = parse_number(&temperature, linestart + len + 1);
// probe map until free spot or match
unsigned int *c = &result->map[HASHMAP_INDEX(h)];
while (*c > 0 && memcmp(result->groups[*c].key, linestart, len) != 0)
{
h += 1;
c = &result->map[HASHMAP_INDEX(h)];
}
// new hashmap entry
if (*c == 0)
{
*c = result->n;
memcpy(result->groups[*c].key, linestart, len);
result->n++;
}
// existing entry
result->groups[*c].count += 1;
result->groups[*c].min = min(result->groups[*c].min, temperature);
result->groups[*c].max = max(result->groups[*c].max, temperature);
result->groups[*c].sum += temperature;
}
return result;
}
// Formats the sorted groups as "{name=min/mean/max, ...}" into dest.
// Temperatures are stored as tenths of a degree, hence the /10.0.
// Caller must supply a buffer large enough for the whole output.
static void result_to_str(char *dest, const struct Result *result)
{
char buf[128];
*dest++ = '{';
for (unsigned int i = 0; i < result->n; i++)
{
// Round the mean to the printed precision first, then add 0.0 to
// normalize a possible negative zero: without this, means in
// (-0.05, 0] print as "-0.0".
double mean = (double)result->groups[i].sum
              / (double)result->groups[i].count / 10.0;
mean = round(mean * 10.0) / 10.0 + 0.0;
size_t n = (size_t)snprintf(
    buf, sizeof(buf), "%s=%.1f/%.1f/%.1f%s", result->groups[i].key,
    (double)result->groups[i].min / 10.0,
    mean,
    (double)result->groups[i].max / 10.0,
    i < (result->n - 1) ? ", " : "");
// guard against snprintf truncation before copying
if (n >= sizeof(buf))
{
n = sizeof(buf) - 1;
}
memcpy(dest, buf, n);
dest += n;
}
*dest++ = '}';
*dest = 0x0;
}
// Single-threaded 1BRC driver: mmap the data file, aggregate per-station
// stats, sort alphabetically, print the formatted result.
int main(int argc, char **argv)
{
char *file = "../measurements_big.txt";
if (argc > 1)
{
file = argv[1];
}
int fd = open(file, O_RDONLY);
// open() returns -1 on failure (0 is a valid descriptor), so the old
// `if (!fd)` test never caught an error
if (fd == -1)
{
perror("error opening file");
exit(EXIT_FAILURE);
}
struct stat sb;
if (fstat(fd, &sb) == -1)
{
perror("error getting file size");
exit(EXIT_FAILURE);
}
// mmap entire file into memory; the kernel pages it in on demand
size_t sz = (size_t)sb.st_size;
char *data = mmap(NULL, sz, PROT_READ, MAP_SHARED, fd, 0);
if (data == MAP_FAILED)
{
perror("error mmapping file");
exit(EXIT_FAILURE);
}
struct Result *result = process_chunk(data, sz);
// sort results alphabetically by station name
qsort(result->groups, (size_t)result->n, sizeof(*result->groups), cmp);
// 240551 bytes is the historical output-buffer size used here —
// NOTE(review): confirm it covers MAX_DISTINCT_GROUPS worst-case lines
char *buf = malloc(240551);
if (!buf)
{
perror("malloc error");
exit(EXIT_FAILURE);
}
result_to_str(buf, result);
puts(buf);
// clean-up
free(buf);
free(result);
munmap((void *)data, sz);
close(fd);
return EXIT_SUCCESS;
}
I compiled it using his flags:
clang -std=c11 -O3 -march=native -mtune=native -flto -Wall -Wextra -Wpedantic -Wformat=2 -Wconversion -Wundef -Winline -Wimplicit-fallthrough test.c -o testc
The result contains negative zeroes (-0.0) and a stray comma at the end, but is otherwise correct.
The benchmark: runtime is 27.5s, needs 8MB of RAM.
% hyperfine -r 5 -w 1 './testc > solution.txt'
Benchmark 1: ./testc > solution.txt
Time (mean ± σ): 27.489 s ± 0.027 s [User: 25.152 s, System: 2.271 s]
Range (min … max): 27.452 s … 27.521 s 5 runs
My data
Apple M1 Max Pro
10Cores - 8 performance 2 efficiency
Max. Clock 3206 MHz (it did run at that frequency, I checked that)
L3 Cache 48MB
SSD has a measured speed of 4629.7MB/s write 5180.3 MB/s read
My data file is 14.9GB.
Btw., while looking for a Fortran version I found GitHub - sshestov/1brc-fortran, which (because of too many hash collisions) only returns 8350 instead of 8939 stations and runs in 3.5s using 16 threads. That's why I always check the results.
If somebody has a AVX2 CPU on Linux: this should be the fastest C solution (if the title is true)