From df62bd71d6d569c01aa47e9cb2a6fba8a256b255 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Wed, 31 Mar 2021 10:27:32 -0400 Subject: [PATCH] tests: benchmark: improve benchmark scalability accuracy Testing with a fixed number of loops per-thread only works if the workload is distributed perfectly across CPUs. For instance, if a lock is held in the workload (e.g. internally by open() and close()), those may cause starvation of some threads, and therefore cause the benchmark to be wrong because it will wait for the slowest thread to complete its loops. It is also not good for testing overcommit of threads vs cpus. Change the test to report the number of loops performed in a given wall time, and use this to report the average and std.dev. of tracing overhead per event on each active CPU. Change the benchmark workload to be only CPU-bound and not generate system calls to minimize the inherent non-scalability of the workload (e.g. locks held within the kernel). Signed-off-by: Mathieu Desnoyers Change-Id: I5245f36831875bd9f87854618a4ed0cb31e56a4d --- tests/benchmark/README | 7 ++- tests/benchmark/bench.c | 83 ++++++++++++++++++++++++++++------ tests/benchmark/ptime | 2 +- tests/benchmark/test_benchmark | 33 +++++++++----- 4 files changed, 97 insertions(+), 28 deletions(-) diff --git a/tests/benchmark/README b/tests/benchmark/README index c8e33052..3294f10a 100644 --- a/tests/benchmark/README +++ b/tests/benchmark/README @@ -3,6 +3,9 @@ To run the benchmark: ./test_benchmark You can specify the number of iterations, events and threads by setting -environment variables ITERS, NR_EVENTS, NR_CPUS respectively: +environment variables ITERS, DURATION, NR_THREADS respectively: - ITERS=10 NR_EVENTS=10000 NR_CPUS=4 ./test_benchmark + ITERS=10 DURATION=20 NR_THREADS=4 ./test_benchmark + +NR_CPUS can also be configured, but by default is based on the contents of +/proc/cpuinfo. diff --git a/tests/benchmark/bench.c b/tests/benchmark/bench.c index 77f53a15..d15896cc 100644 --- a/tests/benchmark/bench.c +++ b/tests/benchmark/bench.c @@ -4,6 +4,7 @@ * LTTng Userspace Tracer (UST) - benchmark tool * * Copyright 2010 - Douglas Santos + * Copyright 2021 - Mathieu Desnoyers * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -34,8 +35,22 @@ #include "ust_tests_benchmark.h" #endif -static int nr_cpus; -static unsigned long nr_events; +#define printf_verbose(fmt, args...) \ + do { \ + if (verbose_mode) \ + printf(fmt, ## args); \ + } while (0) + +static int verbose_mode; + +struct thread_counter { + unsigned long long nr_loops; +}; + +static int nr_threads; +static unsigned long duration; + +static volatile int test_go, test_stop; void do_stuff(void) { @@ -53,21 +68,33 @@ void do_stuff(void) void *function(void *arg) { - unsigned long i; + unsigned long long nr_loops = 0; + struct thread_counter *thread_counter = arg; - for (i = 0; i < nr_events; i++) { + while (!test_go) + cmm_barrier(); + + for (;;) { do_stuff(); + nr_loops++; + if (test_stop) + break; } + thread_counter->nr_loops = nr_loops; return NULL; } void usage(char **argv) { - printf("Usage: %s nr_cpus nr_events\n", argv[0]); + printf("Usage: %s nr_threads duration(s) \n", argv[0]); + printf("OPTIONS:\n"); + printf(" [-v] (verbose output)\n"); + printf("\n"); } - int main(int argc, char **argv) { + unsigned long long total_loops = 0; + unsigned long i_thr; void *retval; int i; @@ -76,25 +103,53 @@ int main(int argc, char **argv) exit(1); } - nr_cpus = atoi(argv[1]); - printf("using %d processor(s)\n", nr_cpus); + nr_threads = atoi(argv[1]); + duration = atol(argv[2]); + + for (i = 3; i < argc; i++) { + if (argv[i][0] != '-') + continue; + switch (argv[i][1]) { + case 'v': + verbose_mode = 1; + break; + } + } + + printf_verbose("using %d thread(s)\n", nr_threads); + printf_verbose("for a duration of %lds\n", duration); - nr_events = atol(argv[2]); - printf("using %ld events per cpu\n", nr_events); + pthread_t thread[nr_threads]; + struct thread_counter thread_counter[nr_threads]; - pthread_t thread[nr_cpus]; - for (i = 0; i < nr_cpus; i++) { - if (pthread_create(&thread[i], NULL, function, NULL)) { + for (i = 0; i < nr_threads; i++) { + thread_counter[i].nr_loops = 0; + if (pthread_create(&thread[i], NULL, function, &thread_counter[i])) { fprintf(stderr, "thread create %d failed\n", i); exit(1); } } - for (i = 0; i < nr_cpus; i++) { + test_go = 1; + + for (i_thr = 0; i_thr < duration; i_thr++) { + sleep(1); + if (verbose_mode) { + fwrite(".", sizeof(char), 1, stdout); + fflush(stdout); + } + } + printf_verbose("\n"); + + test_stop = 1; + + for (i = 0; i < nr_threads; i++) { if (pthread_join(thread[i], &retval)) { fprintf(stderr, "thread join %d failed\n", i); exit(1); } + total_loops += thread_counter[i].nr_loops; } + printf("Number of loops: %llu\n", total_loops); return 0; } diff --git a/tests/benchmark/ptime b/tests/benchmark/ptime index 419f3684..92ee2980 100755 --- a/tests/benchmark/ptime +++ b/tests/benchmark/ptime @@ -16,7 +16,7 @@ def main(): os.system(cmd) t2 = time.time() - print(t2-t1) + print("Wall time: " + str(t2-t1)) if __name__ == "__main__": main() diff --git a/tests/benchmark/test_benchmark b/tests/benchmark/test_benchmark index 6f0dd7b4..fa1b1e9e 100755 --- a/tests/benchmark/test_benchmark +++ b/tests/benchmark/test_benchmark @@ -7,36 +7,43 @@ source $TESTDIR/utils/tap.sh plan_tests 1 : ${ITERS:=10} -: ${NR_EVENTS:=7000000} -: ${NR_CPUS:=1} +: ${DURATION:=2} +: ${NR_THREADS:=1} +: ${NR_CPUS:=$(lscpu | grep "^CPU(s)" | sed 's/^.*:[ \t]*//g')} : ${TIME:="./$CURDIR/ptime"} -: ${PROG_NOTRACING:="./$CURDIR/bench1 $NR_CPUS $NR_EVENTS"} -: ${PROG_TRACING:="./$CURDIR/bench2 $NR_CPUS $NR_EVENTS"} +: ${PROG_NOTRACING:="./$CURDIR/bench1 $NR_THREADS $DURATION"} +: ${PROG_TRACING:="./$CURDIR/bench2 $NR_THREADS $DURATION"} function signal_cleanup () { killall lttng-sessiond + exit } trap signal_cleanup SIGTERM SIGINT -CMD_NOTRACING="$TIME '$PROG_NOTRACING >/dev/null 2>&1'" -CMD_TRACING="$TIME '$PROG_TRACING >/dev/null 2>&1'" +CMD_NOTRACING="$TIME '$PROG_NOTRACING'" +CMD_TRACING="$TIME '$PROG_TRACING'" + +NR_ACTIVE_CPUS=$(( $NR_CPUS > $NR_THREADS ? $NR_THREADS : $NR_CPUS )) for i in $(seq $ITERS); do - time_notrace[i]=$(sh -c "$CMD_NOTRACING") + res=$(sh -c "$CMD_NOTRACING") + loops_notrace[$i]=$(echo "${res}" | grep "^Number of loops:" | sed 's/^.*: //g') + time_notrace[$i]=$(echo "${res}" | grep "^Wall time:" | sed 's/^.*: //g') done - lttng-sessiond -d --no-kernel lttng -q create --snapshot lttng -q enable-event -u -a lttng -q start for i in $(seq $ITERS); do - time_trace[i]=$(sh -c "$CMD_TRACING") + res=$(sh -c "$CMD_TRACING") + loops_trace[$i]=$(echo "${res}" | grep "^Number of loops:" | sed 's/^.*: //g') + time_trace[$i]=$(echo "${res}" | grep "^Wall time:" | sed 's/^.*: //g') done lttng -q stop @@ -45,9 +52,12 @@ killall lttng-sessiond pass "Trace benchmark" +# Multiply the wall time by the number of active CPUs to get the +# overhead of events on each active cpu. + avg_delta=0 for i in $(seq $ITERS); do - delta[$i]=$(echo "( ((${time_trace[$i]}) - (${time_notrace[$i]})) / $NR_EVENTS)" | bc -l) + delta[$i]=$(echo "((${time_trace[$i]} * ${NR_ACTIVE_CPUS} / ${loops_trace[$i]}) - (${time_notrace[$i]} * ${NR_ACTIVE_CPUS} / ${loops_notrace[$i]}))" | bc -l) avg_delta=$(echo "(${avg_delta} + ${delta[$i]})" | bc -l) done avg_delta=$(echo "(${avg_delta} / $ITERS)" | bc -l) @@ -65,6 +75,7 @@ NS_PER_EVENT=$(echo "($avg_delta * 1000000000)" | bc -l) NS_PER_EVENT=${NS_PER_EVENT%%.*} STD_DEV_NS_PER_EVENT=$(echo "($std_dev * 1000000000)" | bc -l) +# Remove fractions STD_DEV_NS_PER_EVENT=${STD_DEV_NS_PER_EVENT%%.*} -diag "Average tracing overhead per event is ${NS_PER_EVENT}ns, std.dev.: ${STD_DEV_NS_PER_EVENT}ns" +diag "Average tracing overhead per event is ${NS_PER_EVENT}ns, std.dev.: ${STD_DEV_NS_PER_EVENT}ns { NR_THREADS=${NR_THREADS}, NR_ACTIVE_CPUS=${NR_ACTIVE_CPUS} }" -- 2.34.1