From df62bd71d6d569c01aa47e9cb2a6fba8a256b255 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Wed, 31 Mar 2021 10:27:32 -0400
Subject: [PATCH] tests: benchmark: improve benchmark scalability accuracy

Testing with a fixed number of loops per thread only works if the
workload is distributed perfectly across CPUs. For instance, if the
workload takes a lock (e.g. internally within open() and close()),
contention on that lock may starve some threads. The benchmark result
is then skewed, because the run only ends once the slowest thread has
completed its loops.

A fixed loop count is also ill-suited to testing overcommit, i.e.
running more threads than there are CPUs.

Change the test to report the number of loops performed in a given wall
time, and use this to report the average and std.dev. of tracing
overhead per event on each active CPU.

Change the benchmark workload to be only CPU-bound and not generate
system calls to minimize the inherent non-scalability of the workload
(e.g. locks held within the kernel).

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Change-Id: I5245f36831875bd9f87854618a4ed0cb31e56a4d
---
 tests/benchmark/README         |  7 ++-
 tests/benchmark/bench.c        | 83 ++++++++++++++++++++++++++++------
 tests/benchmark/ptime          |  2 +-
 tests/benchmark/test_benchmark | 33 +++++++++-----
 4 files changed, 97 insertions(+), 28 deletions(-)
diff --git a/tests/benchmark/README b/tests/benchmark/README
index c8e33052..3294f10a 100644
--- a/tests/benchmark/README
+++ b/tests/benchmark/README
@@ -3,6 +3,9 @@ To run the benchmark:
     ./test_benchmark
 
-You can specify the number of iterations, events and threads by setting
-environment variables ITERS, NR_EVENTS, NR_CPUS respectively:
+You can specify the number of iterations, duration and threads by setting
+environment variables ITERS, DURATION, NR_THREADS respectively:
 
-    ITERS=10 NR_EVENTS=10000 NR_CPUS=4 ./test_benchmark
+    ITERS=10 DURATION=20 NR_THREADS=4 ./test_benchmark
+
+NR_CPUS can also be configured, but by default is taken from the CPU(s)
+count reported by lscpu.
diff --git a/tests/benchmark/bench.c b/tests/benchmark/bench.c
index 77f53a15..d15896cc 100644
--- a/tests/benchmark/bench.c
+++ b/tests/benchmark/bench.c
@@ -4,6 +4,7 @@
  * LTTng Userspace Tracer (UST) - benchmark tool
  *
  * Copyright 2010 - Douglas Santos <douglas.santos@polymtl.ca>
+ * Copyright 2021 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -34,8 +35,22 @@
 #include "ust_tests_benchmark.h"
 #endif
 
-static int nr_cpus;
-static unsigned long nr_events;
+#define printf_verbose(fmt, args...)		\
+	do {					\
+		if (verbose_mode)		\
+			printf(fmt, ## args);	\
+	} while (0)
+
+static int verbose_mode;
+
+struct thread_counter {
+	unsigned long long nr_loops;
+};
+
+static int nr_threads;
+static unsigned long duration;
+
+static volatile int test_go, test_stop;
 
 void do_stuff(void)
 {
@@ -53,21 +68,33 @@ void do_stuff(void)
 
 void *function(void *arg)
 {
-	unsigned long i;
+	unsigned long long nr_loops = 0;
+	struct thread_counter *thread_counter = arg;
 
-	for (i = 0; i < nr_events; i++) {
+	while (!test_go)
+		cmm_barrier();
+
+	for (;;) {
 		do_stuff();
+		nr_loops++;
+		if (test_stop)
+			break;
 	}
+	thread_counter->nr_loops = nr_loops;
 	return NULL;
 }
 
 void usage(char **argv) {
-	printf("Usage: %s nr_cpus nr_events\n", argv[0]);
+	printf("Usage: %s nr_threads duration(s) <OPTIONS>\n", argv[0]);
+	printf("OPTIONS:\n");
+	printf("        [-v] (verbose output)\n");
+	printf("\n");
 }
 
-
 int main(int argc, char **argv)
 {
+	unsigned long long total_loops = 0;
+	unsigned long i_thr;
 	void *retval;
 	int i;
 
@@ -76,25 +103,53 @@ int main(int argc, char **argv)
 		exit(1);
 	}
 
-	nr_cpus = atoi(argv[1]);
-	printf("using %d processor(s)\n", nr_cpus);
+	nr_threads = atoi(argv[1]);
+	duration = atol(argv[2]);
+
+	for (i = 3; i < argc; i++) {
+		if (argv[i][0] != '-')
+			continue;
+		switch (argv[i][1]) {
+		case 'v':
+			verbose_mode = 1;
+			break;
+		}
+	}
+
+	printf_verbose("using %d thread(s)\n", nr_threads);
+	printf_verbose("for a duration of %lds\n", duration);
 
-	nr_events = atol(argv[2]);
-	printf("using %ld events per cpu\n", nr_events);
+	pthread_t thread[nr_threads];
+	struct thread_counter thread_counter[nr_threads];
 
-	pthread_t thread[nr_cpus];
-	for (i = 0; i < nr_cpus; i++) {
-		if (pthread_create(&thread[i], NULL, function, NULL)) {
+	for (i = 0; i < nr_threads; i++) {
+		thread_counter[i].nr_loops = 0;
+		if (pthread_create(&thread[i], NULL, function, &thread_counter[i])) {
 			fprintf(stderr, "thread create %d failed\n", i);
 			exit(1);
 		}
 	}
 
-	for (i = 0; i < nr_cpus; i++) {
+	test_go = 1;
+
+	for (i_thr = 0; i_thr < duration; i_thr++) {
+		sleep(1);
+		if (verbose_mode) {
+			fwrite(".", sizeof(char), 1, stdout);
+			fflush(stdout);
+		}
+	}
+	printf_verbose("\n");
+
+	test_stop = 1;
+
+	for (i = 0; i < nr_threads; i++) {
 		if (pthread_join(thread[i], &retval)) {
 			fprintf(stderr, "thread join %d failed\n", i);
 			exit(1);
 		}
+		total_loops += thread_counter[i].nr_loops;
 	}
+	printf("Number of loops: %llu\n", total_loops);
 	return 0;
 }
diff --git a/tests/benchmark/ptime b/tests/benchmark/ptime
index 419f3684..92ee2980 100755
--- a/tests/benchmark/ptime
+++ b/tests/benchmark/ptime
@@ -16,7 +16,7 @@ def main():
 	os.system(cmd)
 	t2 = time.time()
 
-	print(t2-t1)
+	print("Wall time: " + str(t2-t1))
 
 if __name__ == "__main__":
 	main()
diff --git a/tests/benchmark/test_benchmark b/tests/benchmark/test_benchmark
index 6f0dd7b4..fa1b1e9e 100755
--- a/tests/benchmark/test_benchmark
+++ b/tests/benchmark/test_benchmark
@@ -7,36 +7,43 @@ source $TESTDIR/utils/tap.sh
 plan_tests 1
 
 : ${ITERS:=10}
-: ${NR_EVENTS:=7000000}
-: ${NR_CPUS:=1}
+: ${DURATION:=2}
+: ${NR_THREADS:=1}
+: ${NR_CPUS:=$(lscpu | grep "^CPU(s)" | sed 's/^.*:[ \t]*//g')}
 
 : ${TIME:="./$CURDIR/ptime"}
 
-: ${PROG_NOTRACING:="./$CURDIR/bench1 $NR_CPUS $NR_EVENTS"}
-: ${PROG_TRACING:="./$CURDIR/bench2 $NR_CPUS $NR_EVENTS"}
+: ${PROG_NOTRACING:="./$CURDIR/bench1 $NR_THREADS $DURATION"}
+: ${PROG_TRACING:="./$CURDIR/bench2 $NR_THREADS $DURATION"}
 
 function signal_cleanup ()
 {
 	killall lttng-sessiond
+	exit
 }
 
 trap signal_cleanup SIGTERM SIGINT
 
-CMD_NOTRACING="$TIME '$PROG_NOTRACING >/dev/null 2>&1'"
-CMD_TRACING="$TIME '$PROG_TRACING >/dev/null 2>&1'"
+CMD_NOTRACING="$TIME '$PROG_NOTRACING'"
+CMD_TRACING="$TIME '$PROG_TRACING'"
+
+NR_ACTIVE_CPUS=$(( $NR_CPUS > $NR_THREADS ? $NR_THREADS : $NR_CPUS ))
 
 for i in $(seq $ITERS); do
-	time_notrace[i]=$(sh -c "$CMD_NOTRACING")
+	res=$(sh -c "$CMD_NOTRACING")
+	loops_notrace[$i]=$(echo "${res}" | grep "^Number of loops:" | sed 's/^.*: //g')
+	time_notrace[$i]=$(echo "${res}" | grep "^Wall time:" | sed 's/^.*: //g')
 done
 
-
 lttng-sessiond -d --no-kernel
 lttng -q create --snapshot
 lttng -q enable-event -u -a
 lttng -q start
 
 for i in $(seq $ITERS); do
-	time_trace[i]=$(sh -c "$CMD_TRACING")
+	res=$(sh -c "$CMD_TRACING")
+	loops_trace[$i]=$(echo "${res}" | grep "^Number of loops:" | sed 's/^.*: //g')
+	time_trace[$i]=$(echo "${res}" | grep "^Wall time:" | sed 's/^.*: //g')
 done
 
 lttng -q stop
@@ -45,9 +52,12 @@ killall lttng-sessiond
 
 pass "Trace benchmark"
 
+# Multiply the wall time by the number of active CPUs to get the
+# overhead of events on each active cpu.
+
 avg_delta=0
 for i in $(seq $ITERS); do
-	delta[$i]=$(echo "( ((${time_trace[$i]}) - (${time_notrace[$i]})) / $NR_EVENTS)" | bc -l)
+	delta[$i]=$(echo "((${time_trace[$i]} * ${NR_ACTIVE_CPUS} / ${loops_trace[$i]}) - (${time_notrace[$i]} * ${NR_ACTIVE_CPUS} / ${loops_notrace[$i]}))" | bc -l)
 	avg_delta=$(echo "(${avg_delta} + ${delta[$i]})" | bc -l)
 done
 avg_delta=$(echo "(${avg_delta} / $ITERS)" | bc -l)
@@ -65,6 +75,7 @@ NS_PER_EVENT=$(echo "($avg_delta * 1000000000)" | bc -l)
 NS_PER_EVENT=${NS_PER_EVENT%%.*}
 
 STD_DEV_NS_PER_EVENT=$(echo "($std_dev * 1000000000)" | bc -l)
+# Remove fractions
 STD_DEV_NS_PER_EVENT=${STD_DEV_NS_PER_EVENT%%.*}
 
-diag "Average tracing overhead per event is ${NS_PER_EVENT}ns, std.dev.: ${STD_DEV_NS_PER_EVENT}ns"
+diag "Average tracing overhead per event is ${NS_PER_EVENT}ns, std.dev.: ${STD_DEV_NS_PER_EVENT}ns { NR_THREADS=${NR_THREADS}, NR_ACTIVE_CPUS=${NR_ACTIVE_CPUS} }"
-- 
2.34.1