Update: after adding a timerMy solution is in C, and takes about 2.3 milliseconds to process the code
million-number file on my machine (see the Github link below),
I was able to excludeincluding startup from the benchmark,time and now itparsing
consistently takes about 1.45 millisecondsall the numbers).
My solution isUsing a timer built in Cto the program, I was also able to exclude
startup time from the benchmarking, and found that the actual
parsing-and-counting portion of the program takes about 21.3 milliseconds to process the5
million-number file on my machine (including startup time and parsingmilliseconds for the numbers)test file.
Full-program performance
Timing scriptParse/tally performance
Most of my performance analysis looked at the full runtime of the
program, but I eventually added an internal timer, which I was able to
use with a shell script (included below) to generate additional
benchmarks. Here, I've replicated the first table above using the new
benchmark.
| cc | A | F | msec. | % |
|-------+---+---+-------+-------|
| clang | y | | 1.501 | 1 |
| gcc | y | y | 1.589 | 1.058 |
| gcc | y | | 1.614 | 1.075 |
| clang | y | y | 1.625 | 1.083 |
| gcc | | y | 1.775 | 1.182 |
| clang | | | 1.910 | 1.272 |
| gcc | | | 1.926 | 1.283 |
| clang | | y | 1.957 | 1.303 |
Timing scripts
import glob, subprocess, timeit
def run(fname, call=subprocess.run, out=subprocess.DEVNULL):
call([fname, '1M_random_numbers.txt'], stdout=out)
for fname in glob.glob('./bin/*'):
results = timeit.repeat(f'run({fname!r})', number=1000, repeat=10,
globals={'run': run})
# Use the minimum result, as suggested by Python's documentation:
# the slower results were probably just interrupted more.
print(f'{fname}\t{min(results)}')
This is the script I used to benchmark with the program's internal
timer:
#!/usr/bin/env zsh
autoload -Uz zmathfunc && zmathfunc
# Convert the program's time output to an integer
reformat() awk '{print $1 "000000000+" $3}'
export show_time=1 multi_max=0
for run in ./bin/*; do
totals=()
repeat 10; do
echo -n '.'
results=()
repeat 1000; do
results+=($($run 1M_random_numbers.txt | reformat) '+')
done
totals+=($(dc <<<"0 ${results} p"))
done
printf '\r%s %s\n' "${run#./bin/}" "$((min(${(j:,:)totals})))"
done
cc -O3 -DASSUME_VALID -Wall -Wextra -Wpedantic \
integer-count.c -o integer-count
nthreads, if a number, will change the number of threads used from
the default of 6.
For one million numbers, I found 6 to be optimal. For one billion
numbers, I ran out of CPUs before adding more threads stopped
helping.
multi_max, unless it is empty or 0, will cause the program to print
print all numbers that occur
the the most, instead of just one.
If 0, it will cause the program to not output a maximum number at
all (which is useful in combination with show_time).
show_time, if non-empty, causes the program to output how many
seconds and nanoseconds it took, from just before the threads are
launched until just after the final tallies are produced.
I have released this code under the GPL on GitHub,
herehere.
/* Import lists list only key imports. */
#include <errno.h>
#include <fcntl.h> /* open */
#include <limits.h> /* INT_MAX */
#include <pthread.h> /* pthread_{create,exit,join} */
#include <stdbool.h>
#include <stdint.h> /* uintmax_t */
#include <stdlib.h> /* calloc, exit, getenv, malloc, strtol */
#include <stdio.h> /* printf, fprintf, perror */
#include <string.h> /* memset */
#include <sys/mman.h> /* mmap */
#include <sys/stat.h> /* stat */
#include <time.h> /* clock_gettime, CLOCK_REALTIME */
#include <unistd.h> /* close */
#define eprintf(...) fprintf(stderr, __VA_ARGS__)
#ifndef TALLY_LEN
#define TALLY_LEN (1000)
#endif
#ifndef DEFAULT_THREADS
#define DEFAULT_THREADS (6)
#endif
#ifndef MAX_THREADS
#define MAX_THREADS (100)
#endif
typedef struct {
size_t tally[TALLY_LEN];
char const *start;
char const *limit;
/* With sizeof(size_t) == sizeof(char*) == 8, align to
8192 bytes. */
char *_align[176];
} thread_params;
typedef enum { MM_ONE, MM_ALL, MM_NONE } multi_max_opt;
int as_positive(char const *const string);
void subtract_timespecs(struct timespec const *minuend,
struct timespec const *subtrahend,
struct timespec *difference);
int split_data_for_threads(int const nthreads, thread_params *const tparams,
char const *const start, size_t size);
void *run_thread(void *params);
void count_numbers(thread_params *params);
static size_t count_numbers_internal(thread_params *params);
int main(int argc, char **argv)
{
/* **************** Argument handling and file setup **************** */
if (argc != 2) {
eprintf("usage: %s file_name (env: [nthreads=%d] [multi_max=]%s)\n",
argv[0], DEFAULT_THREADS, "[multi_max={,0,1}] [show_time=]");
return 1;
}
char const *const fname = argv[1];
char const *const s_nthreads = getenv("nthreads");
int const nthreads = s_nthreads ? as_positive(s_nthreads) : DEFAULT_THREADS;
if (!nthreads) {
eprintf("number of threads ($nthreads) must be a positive integer\n");
return 1;
}
if (nthreads > MAX_THREADS) {
eprintf("nthreads must be no greater than %d\n", MAX_THREADS);
return 1;
}
/* If multi_max is nonempty, enable the option. */
char const *const s_multi_max = getenv("multi_max");
boolmulti_max_opt const multi_max = (!s_multi_max || s_multi_max[0] == '\0')
? MM_ONE
: (s_multi_max[0] !=== '0' && s_multi_max[1] == '\0')
? MM_NONE
: false;MM_ALL;
char const *const s_show_time = getenv("show_time");
bool const show_time = s_show_time ? (s_show_time[0] != '\0') : false;
struct stat stats;
if (stat(fname, &stats)) {
perror("error getting file size");
return 1;
}
if (stats.st_size < 1) {
/* The only way to have st_size < 1 is an empty file, so all numbers
are equally common. With multi_max, print them all. */
if (multi_max != MM_NONE) {
int n = (multi_max == MM_ALL) ? 0 : TALLY_LEN;
do {
printf("%d\n", (n + 4) % TALLY_LEN);
} while (++n < TALLY_LEN);
}
return 0;
}
/* This is basically the only 100% safe way to convert off_t to size_t. */
if ((uintmax_t) stats.st_size > SIZE_MAX) {
eprintf("file too large to map\n");
return 1;
}
size_t const size = (uintmax_t) stats.st_size;
int const fd = open(fname, O_RDONLY);
if (fd < 0) {
perror("error opening file");
return 1;
}
/* This mem-mapping is only unmapped by exiting. */
char const *const mem = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
if (mem == MAP_FAILED) {
perror("error mapping file");
return 1;
}
if (close(fd)) {
perror("error closing file");
/* We don't actually NEED to close the file, so we continue regardless. */
}
/* **************** Thread setup **************** */
/* This memory is only freed by exiting. */
thread_params *const tparams = calloc((unsigned) nthreads,
sizeof(thread_params));
if (!tparams) {
perror("failed to allocate tally arrays");
return 1;
}
if ((errno = split_data_for_threads(nthreads, tparams, mem, size))) {
return errno;
}
/* **************** Counting **************** */
/* This array has 1 extra entry, because [0] is this thread.
This memory is only freed by exiting. */
pthread_t *const threads = malloc((unsigned)nthreads * sizeof(pthread_t));
if (!threads) {
perror("failed to allocate thread array");
return 1;
}
/* **************** Counting **************** */
struct timespec start, end;
if (show_time) {
if (clock_gettime(CLOCK_REALTIME, &start)) {
perror("error recording start time");
return 1;
}
}
/* Start at 1 because this is thread 0. */
for (int t = 1; t < nthreads; ++t) {
errno = pthread_create(&threads[t], NULL, run_thread, &tparams[t]);
if (errno) {
perror("error starting thread");
return 1;
}
}
count_numbers(&tparams[0]);
for (int t = 1; t < nthreads; ++t) {
if ((errno = pthread_join(threads[t], NULL))) {
perror("error joining thread");
return 1;
}
}
/* **************** Find a most-frequent number **************** */
int max_at = -1;
size_t max_val = 0;
size_t *tally = tparams[0].tally;
for (int n = 0; n < TALLY_LEN; ++n) {
for (int t = 1; t < nthreads; ++t) {
tally[n] += tparams[t].tally[n];
}
if (tally[n] > max_val) {
max_val = tally[n];
max_at = n;
}
}
if (show_time) {
if (clock_gettime(CLOCK_REALTIME, &end)) {
perror("error recording end time");
return 1;
}
struct timespec time;
subtract_timespecs(&end, &start, &time);
/* Hopefully, time_t is signed or converts cleanly. */
printf("%jd s %ld ns\n", (intmax_t) time.tv_sec, time.tv_nsec);
}
if (multi_max != MM_NONE) {
printf("%d\n", max_at);
if (multi_max) {
for (int n = max_at + 1; n < TALLY_LEN; ++n) {
if (tally[n] == max_val) {
printf("%d\n", n);
}
}
}
}
return 0;
}
/**
* Convert a whole string to a positive integer, returning zero on any failure.
*/
int as_positive(char const *const string)
{
char *end;
long result = strtol(string, &end, 10);
/* Fail if the string wasn't all digits. */
if (*string < '0' || *string > '9' || *end != '\0') {
return 0;
}
if (result > INT_MAX || result < 1) {
return 0;
}
return (int)result;
}
/**
* Subtract the subtrahend from the minuend to set the result.
*
* Error and exits if the result is negative.
*
* @param[in] minuend The timespec to subtract from.
* @param[in] subtrahend The timespec to subtract.
* @param[out] difference The timespec which will contain the result. This can
* safely be either of the other arguments.
*/
void subtract_timespecs(struct timespec const *minuend,
struct timespec const *subtrahend,
struct timespec *difference)
{
/* time_t should be signed, but fail in case it isn't. */
if (minuend->tv_sec < subtrahend->tv_sec) {
eprintf("error: negative time difference %ju - %ju",
(uintmax_t) (intmax_t) minuend->tv_sec,
(uintmax_t) (intmax_t) subtrahend->tv_sec);
exit(1);
}
difference->tv_sec = minuend->tv_sec - subtrahend->tv_sec;
difference->tv_nsec = minuend->tv_nsec - subtrahend->tv_nsec;
if (difference->tv_nsec < 0) {
difference->tv_nsec += 1000000000L;
difference->tv_sec -= 1;
}
}
/**
* Divide data among threads, initializing the thread_param structs.
*
* @param[in] nthreads The number of elements in tparams.
* @param[out] tparams An array of thread parameters to initialize.
* @param[in] start The data to divide among the threads.
* @param[in] size The length of the memory.
*
* @return Zero if successful, nonzero if an error occurs.
*/
int split_data_for_threads(int const nthreads, thread_params *const tparams,
char const *const start, size_t size)
{
char const *const limit = start + size;
size_t step = size / (unsigned int) nthreads;
int extra = (int) (size % (size_t) (unsigned int) nthreads);
char const *cursor = start;
/* Set up initial thread boundaries (ends only). */
for (int t = 0; t < nthreads; ++t) {
cursor += step;
/* Account for uneven division of memory among threads. */
if (t < extra) {
cursor += 1;
}
tparams[t].limit = cursor;
}
/* The math should guarantee this, but just to be safe... */
if (tparams[nthreads - 1].limit != limit) {
eprintf("internal error preparing threads\n");
return 2;
}
/* Adjust boundaries and set start positions. */
cursor = start;
for (int t = 0; t < nthreads; ++t) {
tparams[t].start = cursor;
/* Adjust chunk end to point just after a newline */
cursor = tparams[t].limit;
while (*(cursor - 1) != '\n' && cursor < limit) {
++cursor;
}
tparams[t].limit = cursor;
}
return 0;
}
void *run_thread(void *params)
{
count_numbers(params);
pthread_exit(NULL);
return NULL;
}
void count_numbers(thread_params *params)
{
#ifndef ASSUME_VALID
/* Skip any leading newlines so we don't count them as zeros. */
char const *const start = params->start;
char const *const limit = params->limit;
char const *cursor = start;
while (cursor < limit && *cursor == '\n') {
++cursor;
}
params->start = cursor;
size_t final_state = count_numbers_internal(params);
/* If the data ended without a newline, tally the final value read. */
if (start < limit && *(limit - 1) != '\n') {
params->tally[final_state]++;
}
#else
count_numbers_internal(params);
#endif
}
/* If this function is inlined, it sometimes takes a dramatic
performance hit when compiled with gcc -O3. I haven't observed
a dramatic effect with clang, but I have observed a small one. */
__attribute__((noinline))
static size_t count_numbers_internal(thread_params *params)
{
/* The cursor could be declared in the loop header. */
char const *cursor = params->start;
char const *const limit = params->limit;
size_t *tally = params->tally;
size_t value = 0;
for (; cursor < limit; ++cursor) {
char c = *cursor;
if (c == '\n') {
tally[value]++;
value = 0;
#ifndef FALL_LINES
continue;
#else
++cursor;
if (cursor >= limit) {
break;
}
c = *cursor;
#endif
}
#ifndef ASSUME_VALID
if (c < '0' || c > '9') {
fprintfeprintf(stderr, "not a digit: 0x%02x\n", c);
exit(1);
}
#endif
value = value * 10 + (size_t) (c - '0');
#ifndef ASSUME_VALID
/* Have to check here instead of when tallying because of overflow. */
if (value > TALLY_LEN) {
fprintfeprintf(stderr, "value out of range: %zu\n", value);
exit(1);
}
#endif
}
return value;
}