Commit | Line | Data |
---|---|---|
1c13f3c9 IM |
1 | /* |
2 | * numa.c | |
3 | * | |
4 | * numa: Simulate NUMA-sensitive workload and measure their NUMA performance | |
5 | */ | |
6 | ||
7 | #include "../perf.h" | |
8 | #include "../builtin.h" | |
9 | #include "../util/util.h" | |
10 | #include "../util/parse-options.h" | |
11 | ||
12 | #include "bench.h" | |
13 | ||
14 | #include <errno.h> | |
15 | #include <sched.h> | |
16 | #include <stdio.h> | |
17 | #include <assert.h> | |
18 | #include <malloc.h> | |
19 | #include <signal.h> | |
20 | #include <stdlib.h> | |
21 | #include <string.h> | |
22 | #include <unistd.h> | |
23 | #include <pthread.h> | |
24 | #include <sys/mman.h> | |
25 | #include <sys/time.h> | |
26 | #include <sys/wait.h> | |
27 | #include <sys/prctl.h> | |
28 | #include <sys/types.h> | |
29 | ||
30 | #include <numa.h> | |
31 | #include <numaif.h> | |
32 | ||
/*
 * Regular printout to the terminal, suppressed if -q is specified.
 * Nothing is printed until the global state 'g' has been set up, and
 * only while g->p.show_details is >= 0:
 */
#define tprintf(x...) do { if (g && g->p.show_details >= 0) printf(x); } while (0)

/*
 * Debug printf: same as tprintf(), but only emitted when
 * g->p.show_details is >= 1:
 */
#define dprintf(x...) do { if (g && g->p.show_details >= 1) printf(x); } while (0)
42 | ||
/*
 * Per-task (per worker thread) state, one entry per task in g->threads[]:
 */
struct thread_data {
	/* CPU the task last reported running on (see update_curr_cpu()); negative while unset */
	int			curr_cpu;
	/* CPU affinity mask applied by the worker on startup (filled from the CPU list option) */
	cpu_set_t		bind_cpumask;
	/* Memory node to bind to, -1 means "do not bind" (see bind_to_memnode()) */
	int			bind_node;
	/* Working set used by the worker as its per-process data */
	u8			*process_data;
	/* Index of the owning process */
	int			process_nr;
	/* Index of the thread within its process */
	int			thread_nr;
	/* Index into g->threads[]; other code computes it as process_nr*nr_threads + thread_nr */
	int			task_nr;
	/* Loop counter, read by calc_convergence() to report min/max progress */
	unsigned int		loops_done;
	/* Initial value of the worker's data-dependency accumulator */
	u64			val;
	u64			runtime_ns;
	/* Lock taken by the worker around its per-loop usleep() */
	pthread_mutex_t		*process_lock;
};
56 | ||
/*
 * Parameters set by options. The option table fills the file-scope
 * instance 'p0'; the rest of the code reads the copy at g->p:
 */

struct params {
	/* Startup synchronization: */
	bool			serialize_startup;

	/* Task hierarchy: */
	int			nr_proc;
	int			nr_threads;

	/* Working set sizes, as given on the command line (strings, in MBs): */
	const char		*mb_global_str;
	const char		*mb_proc_str;
	const char		*mb_proc_locked_str;
	const char		*mb_thread_str;

	/* Working set sizes in MBs, numeric: */
	double			mb_global;
	double			mb_proc;
	double			mb_proc_locked;
	double			mb_thread;

	/* Access patterns to the working set: */
	bool			data_reads;
	bool			data_writes;
	bool			data_backwards;
	bool			data_zero_memset;
	bool			data_rand_walk;
	u32			nr_loops;
	u32			nr_secs;
	u32			sleep_usecs;

	/* Working set initialization: */
	bool			init_zero;
	bool			init_random;
	bool			init_cpu0;

	/* Misc options: */
	int			show_details;
	int			run_all;
	/* > 0: madvise(MADV_HUGEPAGE), < 0: madvise(MADV_NOHUGEPAGE), 0: leave alone */
	int			thp;

	/* Working set sizes in bytes: */
	long			bytes_global;
	long			bytes_process;
	long			bytes_process_locked;
	long			bytes_thread;

	/* Total number of tasks; g->threads[] is indexed 0..nr_tasks-1 */
	int			nr_tasks;
	bool			show_quiet;

	bool			show_convergence;
	bool			measure_convergence;

	int			perturb_secs;
	/* Number of CPUs / NUMA nodes the affinity helpers operate on: */
	int			nr_cpus;
	int			nr_nodes;

	/* Affinity options -C and -M: */
	char			*cpu_list_str;
	char			*node_list_str;
};
117 | ||
118 | ||
/* Global, read-writable area, accessible to all processes and threads: */

struct global_info {
	/* Global working set, handed to every worker */
	u8			*data;

	/* Protects nr_tasks_started while workers check in */
	pthread_mutex_t		startup_mutex;
	int			nr_tasks_started;

	/* Unlocked by the last worker to start, to wake the main process */
	pthread_mutex_t		startup_done_mutex;

	/* Workers block on this until they are released; protects nr_tasks_working */
	pthread_mutex_t		start_work_mutex;
	int			nr_tasks_working;

	pthread_mutex_t		stop_work_mutex;
	u64			bytes_done;

	/* Array of per-task state, nr_tasks entries */
	struct thread_data	*threads;

	/* Convergence latency measurement: */
	bool			all_converged;
	/* Polled by the workers' main loop; set once convergence has been measured */
	bool			stop_work;

	/* Makes the THP madvise() warning print only once */
	int			print_once;

	/* Effective parameters of the current run */
	struct params		p;
};
145 | ||
146 | static struct global_info *g = NULL; | |
147 | ||
148 | static int parse_cpus_opt(const struct option *opt, const char *arg, int unset); | |
149 | static int parse_nodes_opt(const struct option *opt, const char *arg, int unset); | |
150 | ||
151 | struct params p0; | |
152 | ||
153 | static const struct option options[] = { | |
154 | OPT_INTEGER('p', "nr_proc" , &p0.nr_proc, "number of processes"), | |
155 | OPT_INTEGER('t', "nr_threads" , &p0.nr_threads, "number of threads per process"), | |
156 | ||
157 | OPT_STRING('G', "mb_global" , &p0.mb_global_str, "MB", "global memory (MBs)"), | |
158 | OPT_STRING('P', "mb_proc" , &p0.mb_proc_str, "MB", "process memory (MBs)"), | |
159 | OPT_STRING('L', "mb_proc_locked", &p0.mb_proc_locked_str,"MB", "process serialized/locked memory access (MBs), <= process_memory"), | |
160 | OPT_STRING('T', "mb_thread" , &p0.mb_thread_str, "MB", "thread memory (MBs)"), | |
161 | ||
162 | OPT_UINTEGER('l', "nr_loops" , &p0.nr_loops, "max number of loops to run"), | |
163 | OPT_UINTEGER('s', "nr_secs" , &p0.nr_secs, "max number of seconds to run"), | |
164 | OPT_UINTEGER('u', "usleep" , &p0.sleep_usecs, "usecs to sleep per loop iteration"), | |
165 | ||
166 | OPT_BOOLEAN('R', "data_reads" , &p0.data_reads, "access the data via writes (can be mixed with -W)"), | |
167 | OPT_BOOLEAN('W', "data_writes" , &p0.data_writes, "access the data via writes (can be mixed with -R)"), | |
168 | OPT_BOOLEAN('B', "data_backwards", &p0.data_backwards, "access the data backwards as well"), | |
169 | OPT_BOOLEAN('Z', "data_zero_memset", &p0.data_zero_memset,"access the data via glibc bzero only"), | |
170 | OPT_BOOLEAN('r', "data_rand_walk", &p0.data_rand_walk, "access the data with random (32bit LFSR) walk"), | |
171 | ||
172 | ||
173 | OPT_BOOLEAN('z', "init_zero" , &p0.init_zero, "bzero the initial allocations"), | |
174 | OPT_BOOLEAN('I', "init_random" , &p0.init_random, "randomize the contents of the initial allocations"), | |
175 | OPT_BOOLEAN('0', "init_cpu0" , &p0.init_cpu0, "do the initial allocations on CPU#0"), | |
176 | OPT_INTEGER('x', "perturb_secs", &p0.perturb_secs, "perturb thread 0/0 every X secs, to test convergence stability"), | |
177 | ||
178 | OPT_INCR ('d', "show_details" , &p0.show_details, "Show details"), | |
179 | OPT_INCR ('a', "all" , &p0.run_all, "Run all tests in the suite"), | |
180 | OPT_INTEGER('H', "thp" , &p0.thp, "MADV_NOHUGEPAGE < 0 < MADV_HUGEPAGE"), | |
181 | OPT_BOOLEAN('c', "show_convergence", &p0.show_convergence, "show convergence details"), | |
182 | OPT_BOOLEAN('m', "measure_convergence", &p0.measure_convergence, "measure convergence latency"), | |
183 | OPT_BOOLEAN('q', "quiet" , &p0.show_quiet, "bzero the initial allocations"), | |
184 | OPT_BOOLEAN('S', "serialize-startup", &p0.serialize_startup,"serialize thread startup"), | |
185 | ||
186 | /* Special option string parsing callbacks: */ | |
187 | OPT_CALLBACK('C', "cpus", NULL, "cpu[,cpu2,...cpuN]", | |
188 | "bind the first N tasks to these specific cpus (the rest is unbound)", | |
189 | parse_cpus_opt), | |
190 | OPT_CALLBACK('M', "memnodes", NULL, "node[,node2,...nodeN]", | |
191 | "bind the first N tasks to these specific memory nodes (the rest is unbound)", | |
192 | parse_nodes_opt), | |
193 | OPT_END() | |
194 | }; | |
195 | ||
/* Usage line for the 'perf bench numa' collection (NULL-terminated list): */
static const char * const bench_numa_usage[] = {
	"perf bench numa <options>",
	NULL
};

/* Usage line for the 'perf bench numa mem' benchmark (NULL-terminated list): */
static const char * const numa_usage[] = {
	"perf bench numa mem [<options>]",
	NULL
};
205 | ||
206 | static cpu_set_t bind_to_cpu(int target_cpu) | |
207 | { | |
208 | cpu_set_t orig_mask, mask; | |
209 | int ret; | |
210 | ||
211 | ret = sched_getaffinity(0, sizeof(orig_mask), &orig_mask); | |
212 | BUG_ON(ret); | |
213 | ||
214 | CPU_ZERO(&mask); | |
215 | ||
216 | if (target_cpu == -1) { | |
217 | int cpu; | |
218 | ||
219 | for (cpu = 0; cpu < g->p.nr_cpus; cpu++) | |
220 | CPU_SET(cpu, &mask); | |
221 | } else { | |
222 | BUG_ON(target_cpu < 0 || target_cpu >= g->p.nr_cpus); | |
223 | CPU_SET(target_cpu, &mask); | |
224 | } | |
225 | ||
226 | ret = sched_setaffinity(0, sizeof(mask), &mask); | |
227 | BUG_ON(ret); | |
228 | ||
229 | return orig_mask; | |
230 | } | |
231 | ||
232 | static cpu_set_t bind_to_node(int target_node) | |
233 | { | |
234 | int cpus_per_node = g->p.nr_cpus/g->p.nr_nodes; | |
235 | cpu_set_t orig_mask, mask; | |
236 | int cpu; | |
237 | int ret; | |
238 | ||
239 | BUG_ON(cpus_per_node*g->p.nr_nodes != g->p.nr_cpus); | |
240 | BUG_ON(!cpus_per_node); | |
241 | ||
242 | ret = sched_getaffinity(0, sizeof(orig_mask), &orig_mask); | |
243 | BUG_ON(ret); | |
244 | ||
245 | CPU_ZERO(&mask); | |
246 | ||
247 | if (target_node == -1) { | |
248 | for (cpu = 0; cpu < g->p.nr_cpus; cpu++) | |
249 | CPU_SET(cpu, &mask); | |
250 | } else { | |
251 | int cpu_start = (target_node + 0) * cpus_per_node; | |
252 | int cpu_stop = (target_node + 1) * cpus_per_node; | |
253 | ||
254 | BUG_ON(cpu_stop > g->p.nr_cpus); | |
255 | ||
256 | for (cpu = cpu_start; cpu < cpu_stop; cpu++) | |
257 | CPU_SET(cpu, &mask); | |
258 | } | |
259 | ||
260 | ret = sched_setaffinity(0, sizeof(mask), &mask); | |
261 | BUG_ON(ret); | |
262 | ||
263 | return orig_mask; | |
264 | } | |
265 | ||
/*
 * Apply @mask as the CPU affinity of the calling task; failure is fatal.
 */
static void bind_to_cpumask(cpu_set_t mask)
{
	int err = sched_setaffinity(0, sizeof(mask), &mask);

	BUG_ON(err);
}
273 | ||
274 | static void mempol_restore(void) | |
275 | { | |
276 | int ret; | |
277 | ||
278 | ret = set_mempolicy(MPOL_DEFAULT, NULL, g->p.nr_nodes-1); | |
279 | ||
280 | BUG_ON(ret); | |
281 | } | |
282 | ||
283 | static void bind_to_memnode(int node) | |
284 | { | |
285 | unsigned long nodemask; | |
286 | int ret; | |
287 | ||
288 | if (node == -1) | |
289 | return; | |
290 | ||
291 | BUG_ON(g->p.nr_nodes > (int)sizeof(nodemask)); | |
292 | nodemask = 1L << node; | |
293 | ||
294 | ret = set_mempolicy(MPOL_BIND, &nodemask, sizeof(nodemask)*8); | |
295 | dprintf("binding to node %d, mask: %016lx => %d\n", node, nodemask, ret); | |
296 | ||
297 | BUG_ON(ret); | |
298 | } | |
299 | ||
/* Alignment and extra slack used by alloc_data(): 2MB */
#define HPSIZE (2*1024*1024)

/*
 * Format a name (printf-style arguments, truncated to fit a 20 byte
 * buffer) and set it as the name of the calling task via
 * prctl(PR_SET_NAME):
 */
#define set_taskname(fmt...)				\
do {							\
	char name[20];					\
							\
	snprintf(name, 20, fmt);			\
	prctl(PR_SET_NAME, name);			\
} while (0)
309 | ||
310 | static u8 *alloc_data(ssize_t bytes0, int map_flags, | |
311 | int init_zero, int init_cpu0, int thp, int init_random) | |
312 | { | |
313 | cpu_set_t orig_mask; | |
314 | ssize_t bytes; | |
315 | u8 *buf; | |
316 | int ret; | |
317 | ||
318 | if (!bytes0) | |
319 | return NULL; | |
320 | ||
321 | /* Allocate and initialize all memory on CPU#0: */ | |
322 | if (init_cpu0) { | |
323 | orig_mask = bind_to_node(0); | |
324 | bind_to_memnode(0); | |
325 | } | |
326 | ||
327 | bytes = bytes0 + HPSIZE; | |
328 | ||
329 | buf = (void *)mmap(0, bytes, PROT_READ|PROT_WRITE, MAP_ANON|map_flags, -1, 0); | |
330 | BUG_ON(buf == (void *)-1); | |
331 | ||
332 | if (map_flags == MAP_PRIVATE) { | |
333 | if (thp > 0) { | |
334 | ret = madvise(buf, bytes, MADV_HUGEPAGE); | |
335 | if (ret && !g->print_once) { | |
336 | g->print_once = 1; | |
337 | printf("WARNING: Could not enable THP - do: 'echo madvise > /sys/kernel/mm/transparent_hugepage/enabled'\n"); | |
338 | } | |
339 | } | |
340 | if (thp < 0) { | |
341 | ret = madvise(buf, bytes, MADV_NOHUGEPAGE); | |
342 | if (ret && !g->print_once) { | |
343 | g->print_once = 1; | |
344 | printf("WARNING: Could not disable THP: run a CONFIG_TRANSPARENT_HUGEPAGE kernel?\n"); | |
345 | } | |
346 | } | |
347 | } | |
348 | ||
349 | if (init_zero) { | |
350 | bzero(buf, bytes); | |
351 | } else { | |
352 | /* Initialize random contents, different in each word: */ | |
353 | if (init_random) { | |
354 | u64 *wbuf = (void *)buf; | |
355 | long off = rand(); | |
356 | long i; | |
357 | ||
358 | for (i = 0; i < bytes/8; i++) | |
359 | wbuf[i] = i + off; | |
360 | } | |
361 | } | |
362 | ||
363 | /* Align to 2MB boundary: */ | |
364 | buf = (void *)(((unsigned long)buf + HPSIZE-1) & ~(HPSIZE-1)); | |
365 | ||
366 | /* Restore affinity: */ | |
367 | if (init_cpu0) { | |
368 | bind_to_cpumask(orig_mask); | |
369 | mempol_restore(); | |
370 | } | |
371 | ||
372 | return buf; | |
373 | } | |
374 | ||
/*
 * Unmap @bytes bytes starting at @data. A NULL @data is ignored; a
 * failing munmap() is fatal.
 */
static void free_data(void *data, ssize_t bytes)
{
	if (data) {
		int err = munmap(data, bytes);

		BUG_ON(err);
	}
}
385 | ||
386 | /* | |
387 | * Create a shared memory buffer that can be shared between processes, zeroed: | |
388 | */ | |
389 | static void * zalloc_shared_data(ssize_t bytes) | |
390 | { | |
391 | return alloc_data(bytes, MAP_SHARED, 1, g->p.init_cpu0, g->p.thp, g->p.init_random); | |
392 | } | |
393 | ||
394 | /* | |
395 | * Create a shared memory buffer that can be shared between processes: | |
396 | */ | |
397 | static void * setup_shared_data(ssize_t bytes) | |
398 | { | |
399 | return alloc_data(bytes, MAP_SHARED, 0, g->p.init_cpu0, g->p.thp, g->p.init_random); | |
400 | } | |
401 | ||
402 | /* | |
403 | * Allocate process-local memory - this will either be shared between | |
404 | * threads of this process, or only be accessed by this thread: | |
405 | */ | |
406 | static void * setup_private_data(ssize_t bytes) | |
407 | { | |
408 | return alloc_data(bytes, MAP_PRIVATE, 0, g->p.init_cpu0, g->p.thp, g->p.init_random); | |
409 | } | |
410 | ||
/*
 * Initialize @mutex as a process-shared (global) mutex, i.e. with the
 * PTHREAD_PROCESS_SHARED attribute set.
 *
 * The attribute object is only needed while the mutex is being
 * initialized, so it is destroyed again before returning.
 */
static void init_global_mutex(pthread_mutex_t *mutex)
{
	pthread_mutexattr_t attr;

	pthread_mutexattr_init(&attr);
	pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
	pthread_mutex_init(mutex, &attr);
	pthread_mutexattr_destroy(&attr);
}
422 | ||
423 | static int parse_cpu_list(const char *arg) | |
424 | { | |
425 | p0.cpu_list_str = strdup(arg); | |
426 | ||
427 | dprintf("got CPU list: {%s}\n", p0.cpu_list_str); | |
428 | ||
429 | return 0; | |
430 | } | |
431 | ||
b81a48ea | 432 | static int parse_setup_cpu_list(void) |
1c13f3c9 IM |
433 | { |
434 | struct thread_data *td; | |
435 | char *str0, *str; | |
436 | int t; | |
437 | ||
438 | if (!g->p.cpu_list_str) | |
b81a48ea | 439 | return 0; |
1c13f3c9 IM |
440 | |
441 | dprintf("g->p.nr_tasks: %d\n", g->p.nr_tasks); | |
442 | ||
443 | str0 = str = strdup(g->p.cpu_list_str); | |
444 | t = 0; | |
445 | ||
446 | BUG_ON(!str); | |
447 | ||
448 | tprintf("# binding tasks to CPUs:\n"); | |
449 | tprintf("# "); | |
450 | ||
451 | while (true) { | |
452 | int bind_cpu, bind_cpu_0, bind_cpu_1; | |
453 | char *tok, *tok_end, *tok_step, *tok_len, *tok_mul; | |
454 | int bind_len; | |
455 | int step; | |
456 | int mul; | |
457 | ||
458 | tok = strsep(&str, ","); | |
459 | if (!tok) | |
460 | break; | |
461 | ||
462 | tok_end = strstr(tok, "-"); | |
463 | ||
464 | dprintf("\ntoken: {%s}, end: {%s}\n", tok, tok_end); | |
465 | if (!tok_end) { | |
466 | /* Single CPU specified: */ | |
467 | bind_cpu_0 = bind_cpu_1 = atol(tok); | |
468 | } else { | |
469 | /* CPU range specified (for example: "5-11"): */ | |
470 | bind_cpu_0 = atol(tok); | |
471 | bind_cpu_1 = atol(tok_end + 1); | |
472 | } | |
473 | ||
474 | step = 1; | |
475 | tok_step = strstr(tok, "#"); | |
476 | if (tok_step) { | |
477 | step = atol(tok_step + 1); | |
478 | BUG_ON(step <= 0 || step >= g->p.nr_cpus); | |
479 | } | |
480 | ||
481 | /* | |
482 | * Mask length. | |
483 | * Eg: "--cpus 8_4-16#4" means: '--cpus 8_4,12_4,16_4', | |
484 | * where the _4 means the next 4 CPUs are allowed. | |
485 | */ | |
486 | bind_len = 1; | |
487 | tok_len = strstr(tok, "_"); | |
488 | if (tok_len) { | |
489 | bind_len = atol(tok_len + 1); | |
490 | BUG_ON(bind_len <= 0 || bind_len > g->p.nr_cpus); | |
491 | } | |
492 | ||
493 | /* Multiplicator shortcut, "0x8" is a shortcut for: "0,0,0,0,0,0,0,0" */ | |
494 | mul = 1; | |
495 | tok_mul = strstr(tok, "x"); | |
496 | if (tok_mul) { | |
497 | mul = atol(tok_mul + 1); | |
498 | BUG_ON(mul <= 0); | |
499 | } | |
500 | ||
501 | dprintf("CPUs: %d_%d-%d#%dx%d\n", bind_cpu_0, bind_len, bind_cpu_1, step, mul); | |
502 | ||
b81a48ea PH |
503 | if (bind_cpu_0 >= g->p.nr_cpus || bind_cpu_1 >= g->p.nr_cpus) { |
504 | printf("\nTest not applicable, system has only %d CPUs.\n", g->p.nr_cpus); | |
505 | return -1; | |
506 | } | |
507 | ||
508 | BUG_ON(bind_cpu_0 < 0 || bind_cpu_1 < 0); | |
1c13f3c9 IM |
509 | BUG_ON(bind_cpu_0 > bind_cpu_1); |
510 | ||
511 | for (bind_cpu = bind_cpu_0; bind_cpu <= bind_cpu_1; bind_cpu += step) { | |
512 | int i; | |
513 | ||
514 | for (i = 0; i < mul; i++) { | |
515 | int cpu; | |
516 | ||
517 | if (t >= g->p.nr_tasks) { | |
518 | printf("\n# NOTE: ignoring bind CPUs starting at CPU#%d\n #", bind_cpu); | |
519 | goto out; | |
520 | } | |
521 | td = g->threads + t; | |
522 | ||
523 | if (t) | |
524 | tprintf(","); | |
525 | if (bind_len > 1) { | |
526 | tprintf("%2d/%d", bind_cpu, bind_len); | |
527 | } else { | |
528 | tprintf("%2d", bind_cpu); | |
529 | } | |
530 | ||
531 | CPU_ZERO(&td->bind_cpumask); | |
532 | for (cpu = bind_cpu; cpu < bind_cpu+bind_len; cpu++) { | |
533 | BUG_ON(cpu < 0 || cpu >= g->p.nr_cpus); | |
534 | CPU_SET(cpu, &td->bind_cpumask); | |
535 | } | |
536 | t++; | |
537 | } | |
538 | } | |
539 | } | |
540 | out: | |
541 | ||
542 | tprintf("\n"); | |
543 | ||
544 | if (t < g->p.nr_tasks) | |
545 | printf("# NOTE: %d tasks bound, %d tasks unbound\n", t, g->p.nr_tasks - t); | |
546 | ||
547 | free(str0); | |
b81a48ea | 548 | return 0; |
1c13f3c9 IM |
549 | } |
550 | ||
551 | static int parse_cpus_opt(const struct option *opt __maybe_unused, | |
552 | const char *arg, int unset __maybe_unused) | |
553 | { | |
554 | if (!arg) | |
555 | return -1; | |
556 | ||
557 | return parse_cpu_list(arg); | |
558 | } | |
559 | ||
560 | static int parse_node_list(const char *arg) | |
561 | { | |
562 | p0.node_list_str = strdup(arg); | |
563 | ||
564 | dprintf("got NODE list: {%s}\n", p0.node_list_str); | |
565 | ||
566 | return 0; | |
567 | } | |
568 | ||
b81a48ea | 569 | static int parse_setup_node_list(void) |
1c13f3c9 IM |
570 | { |
571 | struct thread_data *td; | |
572 | char *str0, *str; | |
573 | int t; | |
574 | ||
575 | if (!g->p.node_list_str) | |
b81a48ea | 576 | return 0; |
1c13f3c9 IM |
577 | |
578 | dprintf("g->p.nr_tasks: %d\n", g->p.nr_tasks); | |
579 | ||
580 | str0 = str = strdup(g->p.node_list_str); | |
581 | t = 0; | |
582 | ||
583 | BUG_ON(!str); | |
584 | ||
585 | tprintf("# binding tasks to NODEs:\n"); | |
586 | tprintf("# "); | |
587 | ||
588 | while (true) { | |
589 | int bind_node, bind_node_0, bind_node_1; | |
590 | char *tok, *tok_end, *tok_step, *tok_mul; | |
591 | int step; | |
592 | int mul; | |
593 | ||
594 | tok = strsep(&str, ","); | |
595 | if (!tok) | |
596 | break; | |
597 | ||
598 | tok_end = strstr(tok, "-"); | |
599 | ||
600 | dprintf("\ntoken: {%s}, end: {%s}\n", tok, tok_end); | |
601 | if (!tok_end) { | |
602 | /* Single NODE specified: */ | |
603 | bind_node_0 = bind_node_1 = atol(tok); | |
604 | } else { | |
605 | /* NODE range specified (for example: "5-11"): */ | |
606 | bind_node_0 = atol(tok); | |
607 | bind_node_1 = atol(tok_end + 1); | |
608 | } | |
609 | ||
610 | step = 1; | |
611 | tok_step = strstr(tok, "#"); | |
612 | if (tok_step) { | |
613 | step = atol(tok_step + 1); | |
614 | BUG_ON(step <= 0 || step >= g->p.nr_nodes); | |
615 | } | |
616 | ||
617 | /* Multiplicator shortcut, "0x8" is a shortcut for: "0,0,0,0,0,0,0,0" */ | |
618 | mul = 1; | |
619 | tok_mul = strstr(tok, "x"); | |
620 | if (tok_mul) { | |
621 | mul = atol(tok_mul + 1); | |
622 | BUG_ON(mul <= 0); | |
623 | } | |
624 | ||
625 | dprintf("NODEs: %d-%d #%d\n", bind_node_0, bind_node_1, step); | |
626 | ||
b81a48ea PH |
627 | if (bind_node_0 >= g->p.nr_nodes || bind_node_1 >= g->p.nr_nodes) { |
628 | printf("\nTest not applicable, system has only %d nodes.\n", g->p.nr_nodes); | |
629 | return -1; | |
630 | } | |
631 | ||
632 | BUG_ON(bind_node_0 < 0 || bind_node_1 < 0); | |
1c13f3c9 IM |
633 | BUG_ON(bind_node_0 > bind_node_1); |
634 | ||
635 | for (bind_node = bind_node_0; bind_node <= bind_node_1; bind_node += step) { | |
636 | int i; | |
637 | ||
638 | for (i = 0; i < mul; i++) { | |
639 | if (t >= g->p.nr_tasks) { | |
640 | printf("\n# NOTE: ignoring bind NODEs starting at NODE#%d\n", bind_node); | |
641 | goto out; | |
642 | } | |
643 | td = g->threads + t; | |
644 | ||
645 | if (!t) | |
646 | tprintf(" %2d", bind_node); | |
647 | else | |
648 | tprintf(",%2d", bind_node); | |
649 | ||
650 | td->bind_node = bind_node; | |
651 | t++; | |
652 | } | |
653 | } | |
654 | } | |
655 | out: | |
656 | ||
657 | tprintf("\n"); | |
658 | ||
659 | if (t < g->p.nr_tasks) | |
660 | printf("# NOTE: %d tasks mem-bound, %d tasks unbound\n", t, g->p.nr_tasks - t); | |
661 | ||
662 | free(str0); | |
b81a48ea | 663 | return 0; |
1c13f3c9 IM |
664 | } |
665 | ||
666 | static int parse_nodes_opt(const struct option *opt __maybe_unused, | |
667 | const char *arg, int unset __maybe_unused) | |
668 | { | |
669 | if (!arg) | |
670 | return -1; | |
671 | ||
672 | return parse_node_list(arg); | |
673 | ||
674 | return 0; | |
675 | } | |
676 | ||
/* Single-bit mask; the argument is parenthesized so expressions are safe: */
#define BIT(x) (1ul << (x))

/*
 * Advance a 32-bit linear feedback shift register by one step: shift
 * right by one and, if the bit shifted out was set, XOR in the tap
 * mask (bits 1, 5, 6 and 31).
 *
 * A state of 0 maps to 0, so callers wanting a changing sequence need a
 * non-zero seed.
 */
static inline uint32_t lfsr_32(uint32_t lfsr)
{
	const uint32_t taps = BIT(1) | BIT(5) | BIT(6) | BIT(31);
	return (lfsr>>1) ^ ((0x0u - (lfsr & 0x1u)) & taps);
}
684 | ||
685 | /* | |
686 | * Make sure there's real data dependency to RAM (when read | |
687 | * accesses are enabled), so the compiler, the CPU and the | |
688 | * kernel (KSM, zero page, etc.) cannot optimize away RAM | |
689 | * accesses: | |
690 | */ | |
691 | static inline u64 access_data(u64 *data __attribute__((unused)), u64 val) | |
692 | { | |
693 | if (g->p.data_reads) | |
694 | val += *data; | |
695 | if (g->p.data_writes) | |
696 | *data = val + 1; | |
697 | return val; | |
698 | } | |
699 | ||
700 | /* | |
701 | * The worker process does two types of work, a forwards going | |
702 | * loop and a backwards going loop. | |
703 | * | |
704 | * We do this so that on multiprocessor systems we do not create | |
705 | * a 'train' of processing, with highly synchronized processes, | |
706 | * skewing the whole benchmark. | |
707 | */ | |
708 | static u64 do_work(u8 *__data, long bytes, int nr, int nr_max, int loop, u64 val) | |
709 | { | |
710 | long words = bytes/sizeof(u64); | |
711 | u64 *data = (void *)__data; | |
712 | long chunk_0, chunk_1; | |
713 | u64 *d0, *d, *d1; | |
714 | long off; | |
715 | long i; | |
716 | ||
717 | BUG_ON(!data && words); | |
718 | BUG_ON(data && !words); | |
719 | ||
720 | if (!data) | |
721 | return val; | |
722 | ||
723 | /* Very simple memset() work variant: */ | |
724 | if (g->p.data_zero_memset && !g->p.data_rand_walk) { | |
725 | bzero(data, bytes); | |
726 | return val; | |
727 | } | |
728 | ||
729 | /* Spread out by PID/TID nr and by loop nr: */ | |
730 | chunk_0 = words/nr_max; | |
731 | chunk_1 = words/g->p.nr_loops; | |
732 | off = nr*chunk_0 + loop*chunk_1; | |
733 | ||
734 | while (off >= words) | |
735 | off -= words; | |
736 | ||
737 | if (g->p.data_rand_walk) { | |
738 | u32 lfsr = nr + loop + val; | |
739 | int j; | |
740 | ||
741 | for (i = 0; i < words/1024; i++) { | |
742 | long start, end; | |
743 | ||
744 | lfsr = lfsr_32(lfsr); | |
745 | ||
746 | start = lfsr % words; | |
747 | end = min(start + 1024, words-1); | |
748 | ||
749 | if (g->p.data_zero_memset) { | |
750 | bzero(data + start, (end-start) * sizeof(u64)); | |
751 | } else { | |
752 | for (j = start; j < end; j++) | |
753 | val = access_data(data + j, val); | |
754 | } | |
755 | } | |
756 | } else if (!g->p.data_backwards || (nr + loop) & 1) { | |
757 | ||
758 | d0 = data + off; | |
759 | d = data + off + 1; | |
760 | d1 = data + words; | |
761 | ||
762 | /* Process data forwards: */ | |
763 | for (;;) { | |
764 | if (unlikely(d >= d1)) | |
765 | d = data; | |
766 | if (unlikely(d == d0)) | |
767 | break; | |
768 | ||
769 | val = access_data(d, val); | |
770 | ||
771 | d++; | |
772 | } | |
773 | } else { | |
774 | /* Process data backwards: */ | |
775 | ||
776 | d0 = data + off; | |
777 | d = data + off - 1; | |
778 | d1 = data + words; | |
779 | ||
780 | /* Process data forwards: */ | |
781 | for (;;) { | |
782 | if (unlikely(d < data)) | |
783 | d = data + words-1; | |
784 | if (unlikely(d == d0)) | |
785 | break; | |
786 | ||
787 | val = access_data(d, val); | |
788 | ||
789 | d--; | |
790 | } | |
791 | } | |
792 | ||
793 | return val; | |
794 | } | |
795 | ||
796 | static void update_curr_cpu(int task_nr, unsigned long bytes_worked) | |
797 | { | |
798 | unsigned int cpu; | |
799 | ||
800 | cpu = sched_getcpu(); | |
801 | ||
802 | g->threads[task_nr].curr_cpu = cpu; | |
803 | prctl(0, bytes_worked); | |
804 | } | |
805 | ||
806 | #define MAX_NR_NODES 64 | |
807 | ||
808 | /* | |
809 | * Count the number of nodes a process's threads | |
810 | * are spread out on. | |
811 | * | |
812 | * A count of 1 means that the process is compressed | |
813 | * to a single node. A count of g->p.nr_nodes means it's | |
814 | * spread out on the whole system. | |
815 | */ | |
816 | static int count_process_nodes(int process_nr) | |
817 | { | |
818 | char node_present[MAX_NR_NODES] = { 0, }; | |
819 | int nodes; | |
820 | int n, t; | |
821 | ||
822 | for (t = 0; t < g->p.nr_threads; t++) { | |
823 | struct thread_data *td; | |
824 | int task_nr; | |
825 | int node; | |
826 | ||
827 | task_nr = process_nr*g->p.nr_threads + t; | |
828 | td = g->threads + task_nr; | |
829 | ||
830 | node = numa_node_of_cpu(td->curr_cpu); | |
831 | node_present[node] = 1; | |
832 | } | |
833 | ||
834 | nodes = 0; | |
835 | ||
836 | for (n = 0; n < MAX_NR_NODES; n++) | |
837 | nodes += node_present[n]; | |
838 | ||
839 | return nodes; | |
840 | } | |
841 | ||
842 | /* | |
843 | * Count the number of distinct process-threads a node contains. | |
844 | * | |
845 | * A count of 1 means that the node contains only a single | |
846 | * process. If all nodes on the system contain at most one | |
847 | * process then we are well-converged. | |
848 | */ | |
849 | static int count_node_processes(int node) | |
850 | { | |
851 | int processes = 0; | |
852 | int t, p; | |
853 | ||
854 | for (p = 0; p < g->p.nr_proc; p++) { | |
855 | for (t = 0; t < g->p.nr_threads; t++) { | |
856 | struct thread_data *td; | |
857 | int task_nr; | |
858 | int n; | |
859 | ||
860 | task_nr = p*g->p.nr_threads + t; | |
861 | td = g->threads + task_nr; | |
862 | ||
863 | n = numa_node_of_cpu(td->curr_cpu); | |
864 | if (n == node) { | |
865 | processes++; | |
866 | break; | |
867 | } | |
868 | } | |
869 | } | |
870 | ||
871 | return processes; | |
872 | } | |
873 | ||
874 | static void calc_convergence_compression(int *strong) | |
875 | { | |
876 | unsigned int nodes_min, nodes_max; | |
877 | int p; | |
878 | ||
879 | nodes_min = -1; | |
880 | nodes_max = 0; | |
881 | ||
882 | for (p = 0; p < g->p.nr_proc; p++) { | |
883 | unsigned int nodes = count_process_nodes(p); | |
884 | ||
885 | nodes_min = min(nodes, nodes_min); | |
886 | nodes_max = max(nodes, nodes_max); | |
887 | } | |
888 | ||
889 | /* Strong convergence: all threads compress on a single node: */ | |
890 | if (nodes_min == 1 && nodes_max == 1) { | |
891 | *strong = 1; | |
892 | } else { | |
893 | *strong = 0; | |
894 | tprintf(" {%d-%d}", nodes_min, nodes_max); | |
895 | } | |
896 | } | |
897 | ||
898 | static void calc_convergence(double runtime_ns_max, double *convergence) | |
899 | { | |
900 | unsigned int loops_done_min, loops_done_max; | |
901 | int process_groups; | |
902 | int nodes[MAX_NR_NODES]; | |
903 | int distance; | |
904 | int nr_min; | |
905 | int nr_max; | |
906 | int strong; | |
907 | int sum; | |
908 | int nr; | |
909 | int node; | |
910 | int cpu; | |
911 | int t; | |
912 | ||
913 | if (!g->p.show_convergence && !g->p.measure_convergence) | |
914 | return; | |
915 | ||
916 | for (node = 0; node < g->p.nr_nodes; node++) | |
917 | nodes[node] = 0; | |
918 | ||
919 | loops_done_min = -1; | |
920 | loops_done_max = 0; | |
921 | ||
922 | for (t = 0; t < g->p.nr_tasks; t++) { | |
923 | struct thread_data *td = g->threads + t; | |
924 | unsigned int loops_done; | |
925 | ||
926 | cpu = td->curr_cpu; | |
927 | ||
928 | /* Not all threads have written it yet: */ | |
929 | if (cpu < 0) | |
930 | continue; | |
931 | ||
932 | node = numa_node_of_cpu(cpu); | |
933 | ||
934 | nodes[node]++; | |
935 | ||
936 | loops_done = td->loops_done; | |
937 | loops_done_min = min(loops_done, loops_done_min); | |
938 | loops_done_max = max(loops_done, loops_done_max); | |
939 | } | |
940 | ||
941 | nr_max = 0; | |
942 | nr_min = g->p.nr_tasks; | |
943 | sum = 0; | |
944 | ||
945 | for (node = 0; node < g->p.nr_nodes; node++) { | |
946 | nr = nodes[node]; | |
947 | nr_min = min(nr, nr_min); | |
948 | nr_max = max(nr, nr_max); | |
949 | sum += nr; | |
950 | } | |
951 | BUG_ON(nr_min > nr_max); | |
952 | ||
953 | BUG_ON(sum > g->p.nr_tasks); | |
954 | ||
955 | if (0 && (sum < g->p.nr_tasks)) | |
956 | return; | |
957 | ||
958 | /* | |
959 | * Count the number of distinct process groups present | |
960 | * on nodes - when we are converged this will decrease | |
961 | * to g->p.nr_proc: | |
962 | */ | |
963 | process_groups = 0; | |
964 | ||
965 | for (node = 0; node < g->p.nr_nodes; node++) { | |
966 | int processes = count_node_processes(node); | |
967 | ||
968 | nr = nodes[node]; | |
969 | tprintf(" %2d/%-2d", nr, processes); | |
970 | ||
971 | process_groups += processes; | |
972 | } | |
973 | ||
974 | distance = nr_max - nr_min; | |
975 | ||
976 | tprintf(" [%2d/%-2d]", distance, process_groups); | |
977 | ||
978 | tprintf(" l:%3d-%-3d (%3d)", | |
979 | loops_done_min, loops_done_max, loops_done_max-loops_done_min); | |
980 | ||
981 | if (loops_done_min && loops_done_max) { | |
982 | double skew = 1.0 - (double)loops_done_min/loops_done_max; | |
983 | ||
984 | tprintf(" [%4.1f%%]", skew * 100.0); | |
985 | } | |
986 | ||
987 | calc_convergence_compression(&strong); | |
988 | ||
989 | if (strong && process_groups == g->p.nr_proc) { | |
990 | if (!*convergence) { | |
991 | *convergence = runtime_ns_max; | |
992 | tprintf(" (%6.1fs converged)\n", *convergence/1e9); | |
993 | if (g->p.measure_convergence) { | |
994 | g->all_converged = true; | |
995 | g->stop_work = true; | |
996 | } | |
997 | } | |
998 | } else { | |
999 | if (*convergence) { | |
1000 | tprintf(" (%6.1fs de-converged)", runtime_ns_max/1e9); | |
1001 | *convergence = 0; | |
1002 | } | |
1003 | tprintf("\n"); | |
1004 | } | |
1005 | } | |
1006 | ||
1007 | static void show_summary(double runtime_ns_max, int l, double *convergence) | |
1008 | { | |
1009 | tprintf("\r # %5.1f%% [%.1f mins]", | |
1010 | (double)(l+1)/g->p.nr_loops*100.0, runtime_ns_max/1e9 / 60.0); | |
1011 | ||
1012 | calc_convergence(runtime_ns_max, convergence); | |
1013 | ||
1014 | if (g->p.show_details >= 0) | |
1015 | fflush(stdout); | |
1016 | } | |
1017 | ||
1018 | static void *worker_thread(void *__tdata) | |
1019 | { | |
1020 | struct thread_data *td = __tdata; | |
1021 | struct timeval start0, start, stop, diff; | |
1022 | int process_nr = td->process_nr; | |
1023 | int thread_nr = td->thread_nr; | |
1024 | unsigned long last_perturbance; | |
1025 | int task_nr = td->task_nr; | |
1026 | int details = g->p.show_details; | |
1027 | int first_task, last_task; | |
1028 | double convergence = 0; | |
1029 | u64 val = td->val; | |
1030 | double runtime_ns_max; | |
1031 | u8 *global_data; | |
1032 | u8 *process_data; | |
1033 | u8 *thread_data; | |
1034 | u64 bytes_done; | |
1035 | long work_done; | |
1036 | u32 l; | |
1037 | ||
1038 | bind_to_cpumask(td->bind_cpumask); | |
1039 | bind_to_memnode(td->bind_node); | |
1040 | ||
1041 | set_taskname("thread %d/%d", process_nr, thread_nr); | |
1042 | ||
1043 | global_data = g->data; | |
1044 | process_data = td->process_data; | |
1045 | thread_data = setup_private_data(g->p.bytes_thread); | |
1046 | ||
1047 | bytes_done = 0; | |
1048 | ||
1049 | last_task = 0; | |
1050 | if (process_nr == g->p.nr_proc-1 && thread_nr == g->p.nr_threads-1) | |
1051 | last_task = 1; | |
1052 | ||
1053 | first_task = 0; | |
1054 | if (process_nr == 0 && thread_nr == 0) | |
1055 | first_task = 1; | |
1056 | ||
1057 | if (details >= 2) { | |
1058 | printf("# thread %2d / %2d global mem: %p, process mem: %p, thread mem: %p\n", | |
1059 | process_nr, thread_nr, global_data, process_data, thread_data); | |
1060 | } | |
1061 | ||
1062 | if (g->p.serialize_startup) { | |
1063 | pthread_mutex_lock(&g->startup_mutex); | |
1064 | g->nr_tasks_started++; | |
1065 | pthread_mutex_unlock(&g->startup_mutex); | |
1066 | ||
1067 | /* Here we will wait for the main process to start us all at once: */ | |
1068 | pthread_mutex_lock(&g->start_work_mutex); | |
1069 | g->nr_tasks_working++; | |
1070 | ||
1071 | /* Last one wake the main process: */ | |
1072 | if (g->nr_tasks_working == g->p.nr_tasks) | |
1073 | pthread_mutex_unlock(&g->startup_done_mutex); | |
1074 | ||
1075 | pthread_mutex_unlock(&g->start_work_mutex); | |
1076 | } | |
1077 | ||
1078 | gettimeofday(&start0, NULL); | |
1079 | ||
1080 | start = stop = start0; | |
1081 | last_perturbance = start.tv_sec; | |
1082 | ||
1083 | for (l = 0; l < g->p.nr_loops; l++) { | |
1084 | start = stop; | |
1085 | ||
1086 | if (g->stop_work) | |
1087 | break; | |
1088 | ||
1089 | val += do_work(global_data, g->p.bytes_global, process_nr, g->p.nr_proc, l, val); | |
1090 | val += do_work(process_data, g->p.bytes_process, thread_nr, g->p.nr_threads, l, val); | |
1091 | val += do_work(thread_data, g->p.bytes_thread, 0, 1, l, val); | |
1092 | ||
1093 | if (g->p.sleep_usecs) { | |
1094 | pthread_mutex_lock(td->process_lock); | |
1095 | usleep(g->p.sleep_usecs); | |
1096 | pthread_mutex_unlock(td->process_lock); | |
1097 | } | |
1098 | /* | |
1099 | * Amount of work to be done under a process-global lock: | |
1100 | */ | |
1101 | if (g->p.bytes_process_locked) { | |
1102 | pthread_mutex_lock(td->process_lock); | |
1103 | val += do_work(process_data, g->p.bytes_process_locked, thread_nr, g->p.nr_threads, l, val); | |
1104 | pthread_mutex_unlock(td->process_lock); | |
1105 | } | |
1106 | ||
1107 | work_done = g->p.bytes_global + g->p.bytes_process + | |
1108 | g->p.bytes_process_locked + g->p.bytes_thread; | |
1109 | ||
1110 | update_curr_cpu(task_nr, work_done); | |
1111 | bytes_done += work_done; | |
1112 | ||
1113 | if (details < 0 && !g->p.perturb_secs && !g->p.measure_convergence && !g->p.nr_secs) | |
1114 | continue; | |
1115 | ||
1116 | td->loops_done = l; | |
1117 | ||
1118 | gettimeofday(&stop, NULL); | |
1119 | ||
1120 | /* Check whether our max runtime timed out: */ | |
1121 | if (g->p.nr_secs) { | |
1122 | timersub(&stop, &start0, &diff); | |
2100f778 | 1123 | if ((u32)diff.tv_sec >= g->p.nr_secs) { |
1c13f3c9 IM |
1124 | g->stop_work = true; |
1125 | break; | |
1126 | } | |
1127 | } | |
1128 | ||
1129 | /* Update the summary at most once per second: */ | |
1130 | if (start.tv_sec == stop.tv_sec) | |
1131 | continue; | |
1132 | ||
1133 | /* | |
1134 | * Perturb the first task's equilibrium every g->p.perturb_secs seconds, | |
1135 | * by migrating to CPU#0: | |
1136 | */ | |
1137 | if (first_task && g->p.perturb_secs && (int)(stop.tv_sec - last_perturbance) >= g->p.perturb_secs) { | |
1138 | cpu_set_t orig_mask; | |
1139 | int target_cpu; | |
1140 | int this_cpu; | |
1141 | ||
1142 | last_perturbance = stop.tv_sec; | |
1143 | ||
1144 | /* | |
1145 | * Depending on where we are running, move into | |
1146 | * the other half of the system, to create some | |
1147 | * real disturbance: | |
1148 | */ | |
1149 | this_cpu = g->threads[task_nr].curr_cpu; | |
1150 | if (this_cpu < g->p.nr_cpus/2) | |
1151 | target_cpu = g->p.nr_cpus-1; | |
1152 | else | |
1153 | target_cpu = 0; | |
1154 | ||
1155 | orig_mask = bind_to_cpu(target_cpu); | |
1156 | ||
1157 | /* Here we are running on the target CPU already */ | |
1158 | if (details >= 1) | |
1159 | printf(" (injecting perturbalance, moved to CPU#%d)\n", target_cpu); | |
1160 | ||
1161 | bind_to_cpumask(orig_mask); | |
1162 | } | |
1163 | ||
1164 | if (details >= 3) { | |
1165 | timersub(&stop, &start, &diff); | |
1166 | runtime_ns_max = diff.tv_sec * 1000000000; | |
1167 | runtime_ns_max += diff.tv_usec * 1000; | |
1168 | ||
1169 | if (details >= 0) { | |
2100f778 | 1170 | printf(" #%2d / %2d: %14.2lf nsecs/op [val: %016"PRIx64"]\n", |
1c13f3c9 IM |
1171 | process_nr, thread_nr, runtime_ns_max / bytes_done, val); |
1172 | } | |
1173 | fflush(stdout); | |
1174 | } | |
1175 | if (!last_task) | |
1176 | continue; | |
1177 | ||
1178 | timersub(&stop, &start0, &diff); | |
1179 | runtime_ns_max = diff.tv_sec * 1000000000ULL; | |
1180 | runtime_ns_max += diff.tv_usec * 1000ULL; | |
1181 | ||
1182 | show_summary(runtime_ns_max, l, &convergence); | |
1183 | } | |
1184 | ||
1185 | gettimeofday(&stop, NULL); | |
1186 | timersub(&stop, &start0, &diff); | |
1187 | td->runtime_ns = diff.tv_sec * 1000000000ULL; | |
1188 | td->runtime_ns += diff.tv_usec * 1000ULL; | |
1189 | ||
1190 | free_data(thread_data, g->p.bytes_thread); | |
1191 | ||
1192 | pthread_mutex_lock(&g->stop_work_mutex); | |
1193 | g->bytes_done += bytes_done; | |
1194 | pthread_mutex_unlock(&g->stop_work_mutex); | |
1195 | ||
1196 | return NULL; | |
1197 | } | |
1198 | ||
1199 | /* | |
1200 | * A worker process starts a couple of threads: | |
1201 | */ | |
1202 | static void worker_process(int process_nr) | |
1203 | { | |
1204 | pthread_mutex_t process_lock; | |
1205 | struct thread_data *td; | |
1206 | pthread_t *pthreads; | |
1207 | u8 *process_data; | |
1208 | int task_nr; | |
1209 | int ret; | |
1210 | int t; | |
1211 | ||
1212 | pthread_mutex_init(&process_lock, NULL); | |
1213 | set_taskname("process %d", process_nr); | |
1214 | ||
1215 | /* | |
1216 | * Pick up the memory policy and the CPU binding of our first thread, | |
1217 | * so that we initialize memory accordingly: | |
1218 | */ | |
1219 | task_nr = process_nr*g->p.nr_threads; | |
1220 | td = g->threads + task_nr; | |
1221 | ||
1222 | bind_to_memnode(td->bind_node); | |
1223 | bind_to_cpumask(td->bind_cpumask); | |
1224 | ||
1225 | pthreads = zalloc(g->p.nr_threads * sizeof(pthread_t)); | |
1226 | process_data = setup_private_data(g->p.bytes_process); | |
1227 | ||
1228 | if (g->p.show_details >= 3) { | |
1229 | printf(" # process %2d global mem: %p, process mem: %p\n", | |
1230 | process_nr, g->data, process_data); | |
1231 | } | |
1232 | ||
1233 | for (t = 0; t < g->p.nr_threads; t++) { | |
1234 | task_nr = process_nr*g->p.nr_threads + t; | |
1235 | td = g->threads + task_nr; | |
1236 | ||
1237 | td->process_data = process_data; | |
1238 | td->process_nr = process_nr; | |
1239 | td->thread_nr = t; | |
1240 | td->task_nr = task_nr; | |
1241 | td->val = rand(); | |
1242 | td->curr_cpu = -1; | |
1243 | td->process_lock = &process_lock; | |
1244 | ||
1245 | ret = pthread_create(pthreads + t, NULL, worker_thread, td); | |
1246 | BUG_ON(ret); | |
1247 | } | |
1248 | ||
1249 | for (t = 0; t < g->p.nr_threads; t++) { | |
1250 | ret = pthread_join(pthreads[t], NULL); | |
1251 | BUG_ON(ret); | |
1252 | } | |
1253 | ||
1254 | free_data(process_data, g->p.bytes_process); | |
1255 | free(pthreads); | |
1256 | } | |
1257 | ||
1258 | static void print_summary(void) | |
1259 | { | |
1260 | if (g->p.show_details < 0) | |
1261 | return; | |
1262 | ||
1263 | printf("\n ###\n"); | |
1264 | printf(" # %d %s will execute (on %d nodes, %d CPUs):\n", | |
1265 | g->p.nr_tasks, g->p.nr_tasks == 1 ? "task" : "tasks", g->p.nr_nodes, g->p.nr_cpus); | |
1266 | printf(" # %5dx %5ldMB global shared mem operations\n", | |
1267 | g->p.nr_loops, g->p.bytes_global/1024/1024); | |
1268 | printf(" # %5dx %5ldMB process shared mem operations\n", | |
1269 | g->p.nr_loops, g->p.bytes_process/1024/1024); | |
1270 | printf(" # %5dx %5ldMB thread local mem operations\n", | |
1271 | g->p.nr_loops, g->p.bytes_thread/1024/1024); | |
1272 | ||
1273 | printf(" ###\n"); | |
1274 | ||
1275 | printf("\n ###\n"); fflush(stdout); | |
1276 | } | |
1277 | ||
1278 | static void init_thread_data(void) | |
1279 | { | |
1280 | ssize_t size = sizeof(*g->threads)*g->p.nr_tasks; | |
1281 | int t; | |
1282 | ||
1283 | g->threads = zalloc_shared_data(size); | |
1284 | ||
1285 | for (t = 0; t < g->p.nr_tasks; t++) { | |
1286 | struct thread_data *td = g->threads + t; | |
1287 | int cpu; | |
1288 | ||
1289 | /* Allow all nodes by default: */ | |
1290 | td->bind_node = -1; | |
1291 | ||
1292 | /* Allow all CPUs by default: */ | |
1293 | CPU_ZERO(&td->bind_cpumask); | |
1294 | for (cpu = 0; cpu < g->p.nr_cpus; cpu++) | |
1295 | CPU_SET(cpu, &td->bind_cpumask); | |
1296 | } | |
1297 | } | |
1298 | ||
1299 | static void deinit_thread_data(void) | |
1300 | { | |
1301 | ssize_t size = sizeof(*g->threads)*g->p.nr_tasks; | |
1302 | ||
1303 | free_data(g->threads, size); | |
1304 | } | |
1305 | ||
1306 | static int init(void) | |
1307 | { | |
1308 | g = (void *)alloc_data(sizeof(*g), MAP_SHARED, 1, 0, 0 /* THP */, 0); | |
1309 | ||
1310 | /* Copy over options: */ | |
1311 | g->p = p0; | |
1312 | ||
1313 | g->p.nr_cpus = numa_num_configured_cpus(); | |
1314 | ||
1315 | g->p.nr_nodes = numa_max_node() + 1; | |
1316 | ||
1317 | /* char array in count_process_nodes(): */ | |
1318 | BUG_ON(g->p.nr_nodes > MAX_NR_NODES || g->p.nr_nodes < 0); | |
1319 | ||
1320 | if (g->p.show_quiet && !g->p.show_details) | |
1321 | g->p.show_details = -1; | |
1322 | ||
1323 | /* Some memory should be specified: */ | |
1324 | if (!g->p.mb_global_str && !g->p.mb_proc_str && !g->p.mb_thread_str) | |
1325 | return -1; | |
1326 | ||
1327 | if (g->p.mb_global_str) { | |
1328 | g->p.mb_global = atof(g->p.mb_global_str); | |
1329 | BUG_ON(g->p.mb_global < 0); | |
1330 | } | |
1331 | ||
1332 | if (g->p.mb_proc_str) { | |
1333 | g->p.mb_proc = atof(g->p.mb_proc_str); | |
1334 | BUG_ON(g->p.mb_proc < 0); | |
1335 | } | |
1336 | ||
1337 | if (g->p.mb_proc_locked_str) { | |
1338 | g->p.mb_proc_locked = atof(g->p.mb_proc_locked_str); | |
1339 | BUG_ON(g->p.mb_proc_locked < 0); | |
1340 | BUG_ON(g->p.mb_proc_locked > g->p.mb_proc); | |
1341 | } | |
1342 | ||
1343 | if (g->p.mb_thread_str) { | |
1344 | g->p.mb_thread = atof(g->p.mb_thread_str); | |
1345 | BUG_ON(g->p.mb_thread < 0); | |
1346 | } | |
1347 | ||
1348 | BUG_ON(g->p.nr_threads <= 0); | |
1349 | BUG_ON(g->p.nr_proc <= 0); | |
1350 | ||
1351 | g->p.nr_tasks = g->p.nr_proc*g->p.nr_threads; | |
1352 | ||
1353 | g->p.bytes_global = g->p.mb_global *1024L*1024L; | |
1354 | g->p.bytes_process = g->p.mb_proc *1024L*1024L; | |
1355 | g->p.bytes_process_locked = g->p.mb_proc_locked *1024L*1024L; | |
1356 | g->p.bytes_thread = g->p.mb_thread *1024L*1024L; | |
1357 | ||
1358 | g->data = setup_shared_data(g->p.bytes_global); | |
1359 | ||
1360 | /* Startup serialization: */ | |
1361 | init_global_mutex(&g->start_work_mutex); | |
1362 | init_global_mutex(&g->startup_mutex); | |
1363 | init_global_mutex(&g->startup_done_mutex); | |
1364 | init_global_mutex(&g->stop_work_mutex); | |
1365 | ||
1366 | init_thread_data(); | |
1367 | ||
1368 | tprintf("#\n"); | |
b81a48ea PH |
1369 | if (parse_setup_cpu_list() || parse_setup_node_list()) |
1370 | return -1; | |
1c13f3c9 IM |
1371 | tprintf("#\n"); |
1372 | ||
1373 | print_summary(); | |
1374 | ||
1375 | return 0; | |
1376 | } | |
1377 | ||
1378 | static void deinit(void) | |
1379 | { | |
1380 | free_data(g->data, g->p.bytes_global); | |
1381 | g->data = NULL; | |
1382 | ||
1383 | deinit_thread_data(); | |
1384 | ||
1385 | free_data(g, sizeof(*g)); | |
1386 | g = NULL; | |
1387 | } | |
1388 | ||
1389 | /* | |
1390 | * Print a short or long result, depending on the verbosity setting: | |
1391 | */ | |
1392 | static void print_res(const char *name, double val, | |
1393 | const char *txt_unit, const char *txt_short, const char *txt_long) | |
1394 | { | |
1395 | if (!name) | |
1396 | name = "main,"; | |
1397 | ||
1398 | if (g->p.show_quiet) | |
1399 | printf(" %-30s %15.3f, %-15s %s\n", name, val, txt_unit, txt_short); | |
1400 | else | |
1401 | printf(" %14.3f %s\n", val, txt_long); | |
1402 | } | |
1403 | ||
1404 | static int __bench_numa(const char *name) | |
1405 | { | |
1406 | struct timeval start, stop, diff; | |
1407 | u64 runtime_ns_min, runtime_ns_sum; | |
1408 | pid_t *pids, pid, wpid; | |
1409 | double delta_runtime; | |
1410 | double runtime_avg; | |
1411 | double runtime_sec_max; | |
1412 | double runtime_sec_min; | |
1413 | int wait_stat; | |
1414 | double bytes; | |
1415 | int i, t; | |
1416 | ||
1417 | if (init()) | |
1418 | return -1; | |
1419 | ||
1420 | pids = zalloc(g->p.nr_proc * sizeof(*pids)); | |
1421 | pid = -1; | |
1422 | ||
1423 | /* All threads try to acquire it, this way we can wait for them to start up: */ | |
1424 | pthread_mutex_lock(&g->start_work_mutex); | |
1425 | ||
1426 | if (g->p.serialize_startup) { | |
1427 | tprintf(" #\n"); | |
1428 | tprintf(" # Startup synchronization: ..."); fflush(stdout); | |
1429 | } | |
1430 | ||
1431 | gettimeofday(&start, NULL); | |
1432 | ||
1433 | for (i = 0; i < g->p.nr_proc; i++) { | |
1434 | pid = fork(); | |
1435 | dprintf(" # process %2d: PID %d\n", i, pid); | |
1436 | ||
1437 | BUG_ON(pid < 0); | |
1438 | if (!pid) { | |
1439 | /* Child process: */ | |
1440 | worker_process(i); | |
1441 | ||
1442 | exit(0); | |
1443 | } | |
1444 | pids[i] = pid; | |
1445 | ||
1446 | } | |
1447 | /* Wait for all the threads to start up: */ | |
1448 | while (g->nr_tasks_started != g->p.nr_tasks) | |
1449 | usleep(1000); | |
1450 | ||
1451 | BUG_ON(g->nr_tasks_started != g->p.nr_tasks); | |
1452 | ||
1453 | if (g->p.serialize_startup) { | |
1454 | double startup_sec; | |
1455 | ||
1456 | pthread_mutex_lock(&g->startup_done_mutex); | |
1457 | ||
1458 | /* This will start all threads: */ | |
1459 | pthread_mutex_unlock(&g->start_work_mutex); | |
1460 | ||
1461 | /* This mutex is locked - the last started thread will wake us: */ | |
1462 | pthread_mutex_lock(&g->startup_done_mutex); | |
1463 | ||
1464 | gettimeofday(&stop, NULL); | |
1465 | ||
1466 | timersub(&stop, &start, &diff); | |
1467 | ||
1468 | startup_sec = diff.tv_sec * 1000000000.0; | |
1469 | startup_sec += diff.tv_usec * 1000.0; | |
1470 | startup_sec /= 1e9; | |
1471 | ||
1472 | tprintf(" threads initialized in %.6f seconds.\n", startup_sec); | |
1473 | tprintf(" #\n"); | |
1474 | ||
1475 | start = stop; | |
1476 | pthread_mutex_unlock(&g->startup_done_mutex); | |
1477 | } else { | |
1478 | gettimeofday(&start, NULL); | |
1479 | } | |
1480 | ||
1481 | /* Parent process: */ | |
1482 | ||
1483 | ||
1484 | for (i = 0; i < g->p.nr_proc; i++) { | |
1485 | wpid = waitpid(pids[i], &wait_stat, 0); | |
1486 | BUG_ON(wpid < 0); | |
1487 | BUG_ON(!WIFEXITED(wait_stat)); | |
1488 | ||
1489 | } | |
1490 | ||
1491 | runtime_ns_sum = 0; | |
1492 | runtime_ns_min = -1LL; | |
1493 | ||
1494 | for (t = 0; t < g->p.nr_tasks; t++) { | |
1495 | u64 thread_runtime_ns = g->threads[t].runtime_ns; | |
1496 | ||
1497 | runtime_ns_sum += thread_runtime_ns; | |
1498 | runtime_ns_min = min(thread_runtime_ns, runtime_ns_min); | |
1499 | } | |
1500 | ||
1501 | gettimeofday(&stop, NULL); | |
1502 | timersub(&stop, &start, &diff); | |
1503 | ||
1504 | BUG_ON(bench_format != BENCH_FORMAT_DEFAULT); | |
1505 | ||
1506 | tprintf("\n ###\n"); | |
1507 | tprintf("\n"); | |
1508 | ||
1509 | runtime_sec_max = diff.tv_sec * 1000000000.0; | |
1510 | runtime_sec_max += diff.tv_usec * 1000.0; | |
1511 | runtime_sec_max /= 1e9; | |
1512 | ||
1513 | runtime_sec_min = runtime_ns_min/1e9; | |
1514 | ||
1515 | bytes = g->bytes_done; | |
1516 | runtime_avg = (double)runtime_ns_sum / g->p.nr_tasks / 1e9; | |
1517 | ||
1518 | if (g->p.measure_convergence) { | |
1519 | print_res(name, runtime_sec_max, | |
1520 | "secs,", "NUMA-convergence-latency", "secs latency to NUMA-converge"); | |
1521 | } | |
1522 | ||
1523 | print_res(name, runtime_sec_max, | |
1524 | "secs,", "runtime-max/thread", "secs slowest (max) thread-runtime"); | |
1525 | ||
1526 | print_res(name, runtime_sec_min, | |
1527 | "secs,", "runtime-min/thread", "secs fastest (min) thread-runtime"); | |
1528 | ||
1529 | print_res(name, runtime_avg, | |
1530 | "secs,", "runtime-avg/thread", "secs average thread-runtime"); | |
1531 | ||
1532 | delta_runtime = (runtime_sec_max - runtime_sec_min)/2.0; | |
1533 | print_res(name, delta_runtime / runtime_sec_max * 100.0, | |
1534 | "%,", "spread-runtime/thread", "% difference between max/avg runtime"); | |
1535 | ||
1536 | print_res(name, bytes / g->p.nr_tasks / 1e9, | |
1537 | "GB,", "data/thread", "GB data processed, per thread"); | |
1538 | ||
1539 | print_res(name, bytes / 1e9, | |
1540 | "GB,", "data-total", "GB data processed, total"); | |
1541 | ||
1542 | print_res(name, runtime_sec_max * 1e9 / (bytes / g->p.nr_tasks), | |
1543 | "nsecs,", "runtime/byte/thread","nsecs/byte/thread runtime"); | |
1544 | ||
1545 | print_res(name, bytes / g->p.nr_tasks / 1e9 / runtime_sec_max, | |
1546 | "GB/sec,", "thread-speed", "GB/sec/thread speed"); | |
1547 | ||
1548 | print_res(name, bytes / runtime_sec_max / 1e9, | |
1549 | "GB/sec,", "total-speed", "GB/sec total speed"); | |
1550 | ||
1551 | free(pids); | |
1552 | ||
1553 | deinit(); | |
1554 | ||
1555 | return 0; | |
1556 | } | |
1557 | ||
#define MAX_ARGS 50

/*
 * Count the entries of a NULL-terminated argument vector.
 * Triggers BUG_ON() if the count is MAX_ARGS or more.
 */
static int command_size(const char **argv)
{
	const char **cursor;
	int count = 0;

	for (cursor = argv; *cursor; cursor++)
		count++;

	BUG_ON(count >= MAX_ARGS);

	return count;
}
1573 | ||
1574 | static void init_params(struct params *p, const char *name, int argc, const char **argv) | |
1575 | { | |
1576 | int i; | |
1577 | ||
1578 | printf("\n # Running %s \"perf bench numa", name); | |
1579 | ||
1580 | for (i = 0; i < argc; i++) | |
1581 | printf(" %s", argv[i]); | |
1582 | ||
1583 | printf("\"\n"); | |
1584 | ||
1585 | memset(p, 0, sizeof(*p)); | |
1586 | ||
1587 | /* Initialize nonzero defaults: */ | |
1588 | ||
1589 | p->serialize_startup = 1; | |
1590 | p->data_reads = true; | |
1591 | p->data_writes = true; | |
1592 | p->data_backwards = true; | |
1593 | p->data_rand_walk = true; | |
1594 | p->nr_loops = -1; | |
1595 | p->init_random = true; | |
40ba93e3 RR |
1596 | p->mb_global_str = "1"; |
1597 | p->nr_proc = 1; | |
1598 | p->nr_threads = 1; | |
1599 | p->nr_secs = 5; | |
0fae799e | 1600 | p->run_all = argc == 1; |
1c13f3c9 IM |
1601 | } |
1602 | ||
1603 | static int run_bench_numa(const char *name, const char **argv) | |
1604 | { | |
1605 | int argc = command_size(argv); | |
1606 | ||
1607 | init_params(&p0, name, argc, argv); | |
1608 | argc = parse_options(argc, argv, options, bench_numa_usage, 0); | |
1609 | if (argc) | |
1610 | goto err; | |
1611 | ||
1612 | if (__bench_numa(name)) | |
1613 | goto err; | |
1614 | ||
1615 | return 0; | |
1616 | ||
1617 | err: | |
1c13f3c9 IM |
1618 | return -1; |
1619 | } | |
1620 | ||
1621 | #define OPT_BW_RAM "-s", "20", "-zZq", "--thp", " 1", "--no-data_rand_walk" | |
1622 | #define OPT_BW_RAM_NOTHP OPT_BW_RAM, "--thp", "-1" | |
1623 | ||
1624 | #define OPT_CONV "-s", "100", "-zZ0qcm", "--thp", " 1" | |
1625 | #define OPT_CONV_NOTHP OPT_CONV, "--thp", "-1" | |
1626 | ||
1627 | #define OPT_BW "-s", "20", "-zZ0q", "--thp", " 1" | |
1628 | #define OPT_BW_NOTHP OPT_BW, "--thp", "-1" | |
1629 | ||
1630 | /* | |
1631 | * The built-in test-suite executed by "perf bench numa -a". | |
1632 | * | |
1633 | * (A minimum of 4 nodes and 16 GB of RAM is recommended.) | |
1634 | */ | |
1635 | static const char *tests[][MAX_ARGS] = { | |
1636 | /* Basic single-stream NUMA bandwidth measurements: */ | |
1637 | { "RAM-bw-local,", "mem", "-p", "1", "-t", "1", "-P", "1024", | |
1638 | "-C" , "0", "-M", "0", OPT_BW_RAM }, | |
1639 | { "RAM-bw-local-NOTHP,", | |
1640 | "mem", "-p", "1", "-t", "1", "-P", "1024", | |
1641 | "-C" , "0", "-M", "0", OPT_BW_RAM_NOTHP }, | |
1642 | { "RAM-bw-remote,", "mem", "-p", "1", "-t", "1", "-P", "1024", | |
1643 | "-C" , "0", "-M", "1", OPT_BW_RAM }, | |
1644 | ||
1645 | /* 2-stream NUMA bandwidth measurements: */ | |
1646 | { "RAM-bw-local-2x,", "mem", "-p", "2", "-t", "1", "-P", "1024", | |
1647 | "-C", "0,2", "-M", "0x2", OPT_BW_RAM }, | |
1648 | { "RAM-bw-remote-2x,", "mem", "-p", "2", "-t", "1", "-P", "1024", | |
1649 | "-C", "0,2", "-M", "1x2", OPT_BW_RAM }, | |
1650 | ||
1651 | /* Cross-stream NUMA bandwidth measurement: */ | |
1652 | { "RAM-bw-cross,", "mem", "-p", "2", "-t", "1", "-P", "1024", | |
1653 | "-C", "0,8", "-M", "1,0", OPT_BW_RAM }, | |
1654 | ||
1655 | /* Convergence latency measurements: */ | |
1656 | { " 1x3-convergence,", "mem", "-p", "1", "-t", "3", "-P", "512", OPT_CONV }, | |
1657 | { " 1x4-convergence,", "mem", "-p", "1", "-t", "4", "-P", "512", OPT_CONV }, | |
1658 | { " 1x6-convergence,", "mem", "-p", "1", "-t", "6", "-P", "1020", OPT_CONV }, | |
1659 | { " 2x3-convergence,", "mem", "-p", "3", "-t", "3", "-P", "1020", OPT_CONV }, | |
1660 | { " 3x3-convergence,", "mem", "-p", "3", "-t", "3", "-P", "1020", OPT_CONV }, | |
1661 | { " 4x4-convergence,", "mem", "-p", "4", "-t", "4", "-P", "512", OPT_CONV }, | |
1662 | { " 4x4-convergence-NOTHP,", | |
1663 | "mem", "-p", "4", "-t", "4", "-P", "512", OPT_CONV_NOTHP }, | |
1664 | { " 4x6-convergence,", "mem", "-p", "4", "-t", "6", "-P", "1020", OPT_CONV }, | |
1665 | { " 4x8-convergence,", "mem", "-p", "4", "-t", "8", "-P", "512", OPT_CONV }, | |
1666 | { " 8x4-convergence,", "mem", "-p", "8", "-t", "4", "-P", "512", OPT_CONV }, | |
1667 | { " 8x4-convergence-NOTHP,", | |
1668 | "mem", "-p", "8", "-t", "4", "-P", "512", OPT_CONV_NOTHP }, | |
1669 | { " 3x1-convergence,", "mem", "-p", "3", "-t", "1", "-P", "512", OPT_CONV }, | |
1670 | { " 4x1-convergence,", "mem", "-p", "4", "-t", "1", "-P", "512", OPT_CONV }, | |
1671 | { " 8x1-convergence,", "mem", "-p", "8", "-t", "1", "-P", "512", OPT_CONV }, | |
1672 | { "16x1-convergence,", "mem", "-p", "16", "-t", "1", "-P", "256", OPT_CONV }, | |
1673 | { "32x1-convergence,", "mem", "-p", "32", "-t", "1", "-P", "128", OPT_CONV }, | |
1674 | ||
1675 | /* Various NUMA process/thread layout bandwidth measurements: */ | |
1676 | { " 2x1-bw-process,", "mem", "-p", "2", "-t", "1", "-P", "1024", OPT_BW }, | |
1677 | { " 3x1-bw-process,", "mem", "-p", "3", "-t", "1", "-P", "1024", OPT_BW }, | |
1678 | { " 4x1-bw-process,", "mem", "-p", "4", "-t", "1", "-P", "1024", OPT_BW }, | |
1679 | { " 8x1-bw-process,", "mem", "-p", "8", "-t", "1", "-P", " 512", OPT_BW }, | |
1680 | { " 8x1-bw-process-NOTHP,", | |
1681 | "mem", "-p", "8", "-t", "1", "-P", " 512", OPT_BW_NOTHP }, | |
1682 | { "16x1-bw-process,", "mem", "-p", "16", "-t", "1", "-P", "256", OPT_BW }, | |
1683 | ||
1684 | { " 4x1-bw-thread,", "mem", "-p", "1", "-t", "4", "-T", "256", OPT_BW }, | |
1685 | { " 8x1-bw-thread,", "mem", "-p", "1", "-t", "8", "-T", "256", OPT_BW }, | |
1686 | { "16x1-bw-thread,", "mem", "-p", "1", "-t", "16", "-T", "128", OPT_BW }, | |
1687 | { "32x1-bw-thread,", "mem", "-p", "1", "-t", "32", "-T", "64", OPT_BW }, | |
1688 | ||
1689 | { " 2x3-bw-thread,", "mem", "-p", "2", "-t", "3", "-P", "512", OPT_BW }, | |
1690 | { " 4x4-bw-thread,", "mem", "-p", "4", "-t", "4", "-P", "512", OPT_BW }, | |
1691 | { " 4x6-bw-thread,", "mem", "-p", "4", "-t", "6", "-P", "512", OPT_BW }, | |
1692 | { " 4x8-bw-thread,", "mem", "-p", "4", "-t", "8", "-P", "512", OPT_BW }, | |
1693 | { " 4x8-bw-thread-NOTHP,", | |
1694 | "mem", "-p", "4", "-t", "8", "-P", "512", OPT_BW_NOTHP }, | |
1695 | { " 3x3-bw-thread,", "mem", "-p", "3", "-t", "3", "-P", "512", OPT_BW }, | |
1696 | { " 5x5-bw-thread,", "mem", "-p", "5", "-t", "5", "-P", "512", OPT_BW }, | |
1697 | ||
1698 | { "2x16-bw-thread,", "mem", "-p", "2", "-t", "16", "-P", "512", OPT_BW }, | |
1699 | { "1x32-bw-thread,", "mem", "-p", "1", "-t", "32", "-P", "2048", OPT_BW }, | |
1700 | ||
1701 | { "numa02-bw,", "mem", "-p", "1", "-t", "32", "-T", "32", OPT_BW }, | |
1702 | { "numa02-bw-NOTHP,", "mem", "-p", "1", "-t", "32", "-T", "32", OPT_BW_NOTHP }, | |
1703 | { "numa01-bw-thread,", "mem", "-p", "2", "-t", "16", "-T", "192", OPT_BW }, | |
1704 | { "numa01-bw-thread-NOTHP,", | |
1705 | "mem", "-p", "2", "-t", "16", "-T", "192", OPT_BW_NOTHP }, | |
1706 | }; | |
1707 | ||
1708 | static int bench_all(void) | |
1709 | { | |
1710 | int nr = ARRAY_SIZE(tests); | |
1711 | int ret; | |
1712 | int i; | |
1713 | ||
1714 | ret = system("echo ' #'; echo ' # Running test on: '$(uname -a); echo ' #'"); | |
1715 | BUG_ON(ret < 0); | |
1716 | ||
1717 | for (i = 0; i < nr; i++) { | |
b81a48ea | 1718 | run_bench_numa(tests[i][0], tests[i] + 1); |
1c13f3c9 IM |
1719 | } |
1720 | ||
1721 | printf("\n"); | |
1722 | ||
1723 | return 0; | |
1724 | } | |
1725 | ||
1726 | int bench_numa(int argc, const char **argv, const char *prefix __maybe_unused) | |
1727 | { | |
1728 | init_params(&p0, "main,", argc, argv); | |
1729 | argc = parse_options(argc, argv, options, bench_numa_usage, 0); | |
1730 | if (argc) | |
1731 | goto err; | |
1732 | ||
1733 | if (p0.run_all) | |
1734 | return bench_all(); | |
1735 | ||
1736 | if (__bench_numa(NULL)) | |
1737 | goto err; | |
1738 | ||
1739 | return 0; | |
1740 | ||
1741 | err: | |
1742 | usage_with_options(numa_usage, options); | |
1743 | return -1; | |
1744 | } |