Fix typos in fprintf
[librseq.git] / src / rseq-percpu-alloc.c
// SPDX-License-Identifier: MIT
// SPDX-FileCopyrightText: 2024 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>

#include <rseq/percpu-alloc.h>
#include <sys/mman.h>
#include <assert.h>
#include <string.h>
#include <pthread.h>
#include <unistd.h>
#include <stdlib.h>
#include <rseq/compiler.h>
#include <errno.h>
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

#ifdef HAVE_LIBNUMA
# include <numa.h>
# include <numaif.h>
#endif

#include "rseq-alloc-utils.h"

/*
 * rseq-percpu-alloc.c: rseq CPU-Local Storage (CLS) memory allocator.
 *
 * The rseq per-CPU memory allocator allows the application to request
 * memory pools of CPU-Local memory, each pool containing objects of a
 * given size (rounded to the next power of 2), reserving a given
 * virtual address size per CPU, for a given maximum number of CPUs.
 *
 * The per-CPU memory allocator is analogous to TLS (Thread-Local
 * Storage) memory: TLS is Thread-Local Storage, whereas the per-CPU
 * memory allocator provides CPU-Local Storage.
 */
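
/*
 * Illustrative usage sketch (not part of the allocator): create a pool,
 * allocate a zeroed per-CPU counter, touch every CPU's copy through the
 * low-level accessor defined in this file, then release everything.
 * The sizes, the use of sysconf() to size the pool, and the direct call
 * to __rseq_percpu_ptr() are assumptions made for the example only.
 *
 *        struct rseq_percpu_pool *pool;
 *        int __rseq_percpu *counter;
 *        int nr_cpus = (int) sysconf(_SC_NPROCESSORS_CONF);
 *        int cpu;
 *
 *        pool = rseq_percpu_pool_create(sizeof(int), 4096 * 16, nr_cpus,
 *                        NULL, RSEQ_POOL_ROBUST);
 *        if (!pool)
 *                abort();
 *        counter = (int __rseq_percpu *) rseq_percpu_zmalloc(pool);
 *        for (cpu = 0; cpu < nr_cpus; cpu++)
 *                (*(int *) __rseq_percpu_ptr(counter, cpu))++;
 *        rseq_percpu_free(counter);
 *        if (rseq_percpu_pool_destroy(pool))
 *                abort();
 */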

/*
 * Use high bits of per-CPU addresses to index the pool.
 * This leaves the low bits available to the application for pointer
 * tagging (based on next power of 2 alignment of the allocations).
 */
#if RSEQ_BITS_PER_LONG == 64
# define POOL_INDEX_BITS 16
#else
# define POOL_INDEX_BITS 8
#endif
#define MAX_NR_POOLS (1UL << POOL_INDEX_BITS)
#define POOL_INDEX_SHIFT (RSEQ_BITS_PER_LONG - POOL_INDEX_BITS)
#define MAX_POOL_LEN (1UL << POOL_INDEX_SHIFT)
#define MAX_POOL_LEN_MASK (MAX_POOL_LEN - 1)
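
/*
 * Worked example of the encoding above, assuming a 64-bit build
 * (POOL_INDEX_BITS == 16, POOL_INDEX_SHIFT == 48): a __rseq_percpu
 * "pointer" is not a directly dereferenceable address but a token whose
 * top 16 bits select the pool in the global pool array and whose low 48
 * bits are the item offset within each CPU's range.  The offsets shown
 * are illustrative only; the computation matches __rseq_percpu_ptr()
 * and __rseq_percpu_malloc() below.
 *
 *        encoded = ((uintptr_t) pool->index << POOL_INDEX_SHIFT) | item_offset;
 *        pool_index  = encoded >> POOL_INDEX_SHIFT;        (e.g. 0x1)
 *        item_offset = encoded & MAX_POOL_LEN_MASK;        (e.g. 0x2a40)
 *        cpu_addr    = pool->base + (cpu * pool->percpu_len) + item_offset;
 */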

#define POOL_SET_NR_ENTRIES POOL_INDEX_SHIFT

/*
 * Smallest allocation should hold enough space for a free list pointer.
 */
#if RSEQ_BITS_PER_LONG == 64
# define POOL_SET_MIN_ENTRY 3 /* Smallest item_len=8 */
#else
# define POOL_SET_MIN_ENTRY 2 /* Smallest item_len=4 */
#endif

/*
 * Skip pool index 0 to ensure allocated entries at index 0 do not match
 * a NULL pointer.
 */
#define FIRST_POOL 1

#define RSEQ_POOL_FLAGS (RSEQ_POOL_ROBUST)

#define BIT_PER_ULONG (8 * sizeof(unsigned long))

struct free_list_node;

struct free_list_node {
        struct free_list_node *next;
};

/* This lock protects pool create/destroy. */
static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;

struct rseq_mmap_attr {
        void *(*mmap_func)(void *priv, size_t len);
        int (*munmap_func)(void *priv, void *ptr, size_t len);
        void *mmap_priv;
};

struct rseq_percpu_pool {
        void *base;
        unsigned int index;
        size_t item_len;
        size_t percpu_len;
        int item_order;
        int max_nr_cpus;

        /*
         * The free list chains freed items on the CPU 0 address range.
         * We should rethink this decision if false sharing between
         * malloc/free from other CPUs and data accesses from CPU 0
         * becomes an issue. This is a NULL-terminated singly-linked
         * list.
         */
        struct free_list_node *free_list_head;
        size_t next_unused;
        /* This lock protects allocation/free within the pool. */
        pthread_mutex_t lock;

        struct rseq_mmap_attr mmap_attr;

        /* Track alloc/free. */
        unsigned long *alloc_bitmap;
};
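
/*
 * Layout sketch (the figures are illustrative, not taken from the
 * code): a pool created with percpu_len = 64 kB and max_nr_cpus = 4
 * reserves one contiguous mapping of 4 * 64 kB starting at pool->base.
 * CPU c owns the byte range
 *
 *        [base + c * percpu_len, base + (c + 1) * percpu_len)
 *
 * and a given item lives at the same item_offset inside every CPU's
 * range, as computed by __rseq_pool_percpu_ptr() below.
 */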

//TODO: the array of pools should grow dynamically on create.
static struct rseq_percpu_pool rseq_percpu_pool[MAX_NR_POOLS];

/*
 * Pool set entries are indexed by item_len rounded to the next power of
 * 2. A pool set can contain NULL pool entries, in which case the next
 * large enough entry will be used for allocation.
 */
struct rseq_percpu_pool_set {
        /* This lock protects add vs malloc/zmalloc within the pool set. */
        pthread_mutex_t lock;
        struct rseq_percpu_pool *entries[POOL_SET_NR_ENTRIES];
};

static
void *__rseq_pool_percpu_ptr(struct rseq_percpu_pool *pool, int cpu, uintptr_t item_offset)
{
        return pool->base + (pool->percpu_len * cpu) + item_offset;
}

void *__rseq_percpu_ptr(void __rseq_percpu *_ptr, int cpu)
{
        uintptr_t ptr = (uintptr_t) _ptr;
        uintptr_t item_offset = ptr & MAX_POOL_LEN_MASK;
        uintptr_t pool_index = ptr >> POOL_INDEX_SHIFT;
        struct rseq_percpu_pool *pool = &rseq_percpu_pool[pool_index];

        assert(cpu >= 0);
        return __rseq_pool_percpu_ptr(pool, cpu, item_offset);
}

static
void rseq_percpu_zero_item(struct rseq_percpu_pool *pool, uintptr_t item_offset)
{
        int i;

        for (i = 0; i < pool->max_nr_cpus; i++) {
                char *p = __rseq_pool_percpu_ptr(pool, i, item_offset);
                memset(p, 0, pool->item_len);
        }
}

#ifdef HAVE_LIBNUMA
int rseq_percpu_pool_init_numa(struct rseq_percpu_pool *pool, int numa_flags)
{
        unsigned long nr_pages, page;
        long ret, page_len;
        int cpu;

        if (!numa_flags)
                return 0;
        page_len = rseq_get_page_len();
        nr_pages = pool->percpu_len >> rseq_get_count_order_ulong(page_len);
        for (cpu = 0; cpu < pool->max_nr_cpus; cpu++) {
                int node = numa_node_of_cpu(cpu);

                /* TODO: batch move_pages() call with an array of pages. */
                for (page = 0; page < nr_pages; page++) {
                        void *pageptr = __rseq_pool_percpu_ptr(pool, cpu, page * page_len);
                        int status = -EPERM;

                        ret = move_pages(0, 1, &pageptr, &node, &status, numa_flags);
                        if (ret)
                                return ret;
                }
        }
        return 0;
}
#else
int rseq_percpu_pool_init_numa(struct rseq_percpu_pool *pool __attribute__((unused)),
                int numa_flags __attribute__((unused)))
{
        return 0;
}
#endif
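
/*
 * Usage note (hedged): numa_flags is forwarded untouched to
 * move_pages(2), so a caller wanting each CPU's range migrated to that
 * CPU's NUMA node would typically pass MPOL_MF_MOVE; the error handling
 * below is illustrative only.
 *
 *        if (rseq_percpu_pool_init_numa(pool, MPOL_MF_MOVE))
 *                perror("rseq_percpu_pool_init_numa");
 */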

static
void *default_mmap_func(void *priv __attribute__((unused)), size_t len)
{
        void *base;

        base = mmap(NULL, len, PROT_READ | PROT_WRITE,
                        MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
        if (base == MAP_FAILED)
                return NULL;
        return base;
}

static
int default_munmap_func(void *priv __attribute__((unused)), void *ptr, size_t len)
{
        return munmap(ptr, len);
}

static
int create_alloc_bitmap(struct rseq_percpu_pool *pool)
{
        size_t count;

        count = ((pool->percpu_len >> pool->item_order) + BIT_PER_ULONG - 1) / BIT_PER_ULONG;

        /*
         * Not being able to create the validation bitmap is an error
         * that needs to be reported.
         */
        pool->alloc_bitmap = calloc(count, sizeof(unsigned long));
        if (!pool->alloc_bitmap)
                return -1;
        return 0;
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void check_free_list(const struct rseq_percpu_pool *pool)
{
        size_t total_item = pool->percpu_len >> pool->item_order;
        size_t total_never_allocated = (pool->percpu_len - pool->next_unused) >> pool->item_order;
        size_t total_freed = 0;
        size_t max_list_traversal = total_item - total_never_allocated;
        size_t traversal_iteration = 0;

        for (struct free_list_node *node = pool->free_list_head, *prev = NULL;
                        node;
                        prev = node,
                        node = node->next) {

                void *node_addr = node;

                if (traversal_iteration >= max_list_traversal) {
                        fprintf(stderr, "%s: Corrupted free-list; Possibly infinite loop in pool %p, caller %p.\n",
                                __func__, pool, __builtin_return_address(0));
                        abort();
                }

                /* Node is out of range. */
                if ((node_addr < pool->base) ||
                                (node_addr >= pool->base + pool->next_unused)) {
                        if (prev)
                                fprintf(stderr, "%s: Corrupted free-list node %p -> [out-of-range %p] in pool %p, caller %p.\n",
                                        __func__, prev, node, pool, __builtin_return_address(0));
                        else
                                fprintf(stderr, "%s: Corrupted free-list node [out-of-range %p] in pool %p, caller %p.\n",
                                        __func__, node, pool, __builtin_return_address(0));
                        abort();
                }

                traversal_iteration += 1;
                total_freed += 1;
        }

        if (total_never_allocated + total_freed != total_item) {
                fprintf(stderr, "%s: Corrupted free-list in pool %p; total-item: %zu total-never-used: %zu total-freed: %zu, caller %p.\n",
                        __func__, pool, total_item, total_never_allocated, total_freed, __builtin_return_address(0));
                abort();
        }

}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void destroy_alloc_bitmap(struct rseq_percpu_pool *pool)
{
        unsigned long *bitmap = pool->alloc_bitmap;
        size_t count, total_leaks = 0;

        if (!bitmap)
                return;

        count = ((pool->percpu_len >> pool->item_order) + BIT_PER_ULONG - 1) / BIT_PER_ULONG;

        /* Assert that all items in the pool were freed. */
        for (size_t k = 0; k < count; ++k)
                total_leaks += rseq_hweight_ulong(bitmap[k]);
        if (total_leaks) {
                fprintf(stderr, "%s: Pool has %zu leaked items on destroy, caller: %p.\n",
                        __func__, total_leaks, (void *) __builtin_return_address(0));
                abort();
        }

        check_free_list(pool);

        free(bitmap);
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
int __rseq_percpu_pool_destroy(struct rseq_percpu_pool *pool)
{
        int ret;

        if (!pool->base) {
                errno = ENOENT;
                ret = -1;
                goto end;
        }
        /*
         * This must be done before releasing pool->base for checking the
         * free-list.
         */
        destroy_alloc_bitmap(pool);
        ret = pool->mmap_attr.munmap_func(pool->mmap_attr.mmap_priv, pool->base,
                        pool->percpu_len * pool->max_nr_cpus);
        if (ret)
                goto end;
        pthread_mutex_destroy(&pool->lock);
        memset(pool, 0, sizeof(*pool));
end:
        return ret;
}

int rseq_percpu_pool_destroy(struct rseq_percpu_pool *pool)
{
        int ret;

        pthread_mutex_lock(&pool_lock);
        ret = __rseq_percpu_pool_destroy(pool);
        pthread_mutex_unlock(&pool_lock);
        return ret;
}

struct rseq_percpu_pool *rseq_percpu_pool_create(size_t item_len,
                size_t percpu_len, int max_nr_cpus,
                const struct rseq_mmap_attr *mmap_attr,
                int flags)
{
        void *(*mmap_func)(void *priv, size_t len);
        int (*munmap_func)(void *priv, void *ptr, size_t len);
        void *mmap_priv;
        struct rseq_percpu_pool *pool;
        void *base;
        unsigned int i;
        int order;

        if (flags & ~RSEQ_POOL_FLAGS) {
                errno = EINVAL;
                return NULL;
        }

        /* Make sure each item is large enough to contain free list pointers. */
        if (item_len < sizeof(void *))
                item_len = sizeof(void *);

        /* Align item_len on next power of two. */
        order = rseq_get_count_order_ulong(item_len);
        if (order < 0) {
                errno = EINVAL;
                return NULL;
        }
        item_len = 1UL << order;

        /* Align percpu_len on page size. */
        percpu_len = rseq_align(percpu_len, rseq_get_page_len());

        if (max_nr_cpus < 0 || item_len > percpu_len ||
                        percpu_len > (UINTPTR_MAX >> POOL_INDEX_BITS)) {
                errno = EINVAL;
                return NULL;
        }

        if (mmap_attr) {
                mmap_func = mmap_attr->mmap_func;
                munmap_func = mmap_attr->munmap_func;
                mmap_priv = mmap_attr->mmap_priv;
        } else {
                mmap_func = default_mmap_func;
                munmap_func = default_munmap_func;
                mmap_priv = NULL;
        }
        pthread_mutex_lock(&pool_lock);
        /* Linear scan in array of pools to find empty spot. */
        for (i = FIRST_POOL; i < MAX_NR_POOLS; i++) {
                pool = &rseq_percpu_pool[i];
                if (!pool->base)
                        goto found_empty;
        }
        errno = ENOMEM;
        pool = NULL;
        goto end;

found_empty:
        base = mmap_func(mmap_priv, percpu_len * max_nr_cpus);
        if (!base)
                goto error_alloc;
        pthread_mutex_init(&pool->lock, NULL);
        pool->base = base;
        pool->percpu_len = percpu_len;
        pool->max_nr_cpus = max_nr_cpus;
        pool->index = i;
        pool->item_len = item_len;
        pool->item_order = order;
        pool->mmap_attr.mmap_func = mmap_func;
        pool->mmap_attr.munmap_func = munmap_func;
        pool->mmap_attr.mmap_priv = mmap_priv;

        if (RSEQ_POOL_ROBUST & flags) {
                if (create_alloc_bitmap(pool))
                        goto error_alloc;
        }
end:
        pthread_mutex_unlock(&pool_lock);
        return pool;

error_alloc:
        __rseq_percpu_pool_destroy(pool);
        pthread_mutex_unlock(&pool_lock);
        errno = ENOMEM;
        return NULL;
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void set_alloc_slot(struct rseq_percpu_pool *pool, size_t item_offset)
{
        unsigned long *bitmap = pool->alloc_bitmap;
        size_t item_index = item_offset >> pool->item_order;
        unsigned long mask;
        size_t k;

        if (!bitmap)
                return;

        k = item_index / BIT_PER_ULONG;
        mask = 1ULL << (item_index % BIT_PER_ULONG);

        /* Print error if bit is already set. */
        if (bitmap[k] & mask) {
                fprintf(stderr, "%s: Allocator corruption detected for pool: %p, item offset: %zu, caller: %p.\n",
                        __func__, pool, item_offset, (void *) __builtin_return_address(0));
                abort();
        }
        bitmap[k] |= mask;
}

static
void __rseq_percpu *__rseq_percpu_malloc(struct rseq_percpu_pool *pool, bool zeroed)
{
        struct free_list_node *node;
        uintptr_t item_offset;
        void __rseq_percpu *addr;

        pthread_mutex_lock(&pool->lock);
        /* Get first entry from free list. */
        node = pool->free_list_head;
        if (node != NULL) {
                /* Remove node from free list (update head). */
                pool->free_list_head = node->next;
                item_offset = (uintptr_t) ((void *) node - pool->base);
                addr = (void *) (((uintptr_t) pool->index << POOL_INDEX_SHIFT) | item_offset);
                /* Mark the reused slot as allocated again for the robust checks. */
                set_alloc_slot(pool, item_offset);
                goto end;
        }
        if (pool->next_unused + pool->item_len > pool->percpu_len) {
                errno = ENOMEM;
                addr = NULL;
                goto end;
        }
        item_offset = pool->next_unused;
        addr = (void *) (((uintptr_t) pool->index << POOL_INDEX_SHIFT) | item_offset);
        pool->next_unused += pool->item_len;
        set_alloc_slot(pool, item_offset);
end:
        pthread_mutex_unlock(&pool->lock);
        if (zeroed && addr)
                rseq_percpu_zero_item(pool, item_offset);
        return addr;
}

void __rseq_percpu *rseq_percpu_malloc(struct rseq_percpu_pool *pool)
{
        return __rseq_percpu_malloc(pool, false);
}

void __rseq_percpu *rseq_percpu_zmalloc(struct rseq_percpu_pool *pool)
{
        return __rseq_percpu_malloc(pool, true);
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void clear_alloc_slot(struct rseq_percpu_pool *pool, size_t item_offset)
{
        unsigned long *bitmap = pool->alloc_bitmap;
        size_t item_index = item_offset >> pool->item_order;
        unsigned long mask;
        size_t k;

        if (!bitmap)
                return;

        k = item_index / BIT_PER_ULONG;
        mask = 1ULL << (item_index % BIT_PER_ULONG);

        /* Print error if bit is not set. */
        if (!(bitmap[k] & mask)) {
                fprintf(stderr, "%s: Double-free detected for pool: %p, item offset: %zu, caller: %p.\n",
                        __func__, pool, item_offset, (void *) __builtin_return_address(0));
                abort();
        }
        bitmap[k] &= ~mask;
}

void rseq_percpu_free(void __rseq_percpu *_ptr)
{
        uintptr_t ptr = (uintptr_t) _ptr;
        uintptr_t item_offset = ptr & MAX_POOL_LEN_MASK;
        uintptr_t pool_index = ptr >> POOL_INDEX_SHIFT;
        struct rseq_percpu_pool *pool = &rseq_percpu_pool[pool_index];
        struct free_list_node *head, *item;

        pthread_mutex_lock(&pool->lock);
        clear_alloc_slot(pool, item_offset);
        /* Add ptr to head of free list */
        head = pool->free_list_head;
        /* Free-list is in CPU 0 range. */
        item = (struct free_list_node *)__rseq_pool_percpu_ptr(pool, 0, item_offset);
        item->next = head;
        pool->free_list_head = item;
        pthread_mutex_unlock(&pool->lock);
}

struct rseq_percpu_pool_set *rseq_percpu_pool_set_create(void)
{
        struct rseq_percpu_pool_set *pool_set;

        pool_set = calloc(1, sizeof(struct rseq_percpu_pool_set));
        if (!pool_set)
                return NULL;
        pthread_mutex_init(&pool_set->lock, NULL);
        return pool_set;
}

int rseq_percpu_pool_set_destroy(struct rseq_percpu_pool_set *pool_set)
{
        int order, ret;

        for (order = POOL_SET_MIN_ENTRY; order < POOL_SET_NR_ENTRIES; order++) {
                struct rseq_percpu_pool *pool = pool_set->entries[order];

                if (!pool)
                        continue;
                ret = rseq_percpu_pool_destroy(pool);
                if (ret)
                        return ret;
                pool_set->entries[order] = NULL;
        }
        pthread_mutex_destroy(&pool_set->lock);
        free(pool_set);
        return 0;
}

/* Ownership of pool is handed over to pool set on success. */
int rseq_percpu_pool_set_add_pool(struct rseq_percpu_pool_set *pool_set, struct rseq_percpu_pool *pool)
{
        size_t item_order = pool->item_order;
        int ret = 0;

        pthread_mutex_lock(&pool_set->lock);
        if (pool_set->entries[item_order]) {
                errno = EBUSY;
                ret = -1;
                goto end;
        }
        pool_set->entries[pool->item_order] = pool;
end:
        pthread_mutex_unlock(&pool_set->lock);
        return ret;
}

static
void __rseq_percpu *__rseq_percpu_pool_set_malloc(struct rseq_percpu_pool_set *pool_set, size_t len, bool zeroed)
{
        int order, min_order = POOL_SET_MIN_ENTRY;
        struct rseq_percpu_pool *pool;
        void __rseq_percpu *addr;

        order = rseq_get_count_order_ulong(len);
        if (order > POOL_SET_MIN_ENTRY)
                min_order = order;
again:
        pthread_mutex_lock(&pool_set->lock);
        /* First smallest present pool where @len fits. */
        for (order = min_order; order < POOL_SET_NR_ENTRIES; order++) {
                pool = pool_set->entries[order];

                if (!pool)
                        continue;
                if (pool->item_len >= len)
                        goto found;
        }
        pool = NULL;
found:
        pthread_mutex_unlock(&pool_set->lock);
        if (pool) {
                addr = __rseq_percpu_malloc(pool, zeroed);
                if (addr == NULL && errno == ENOMEM) {
                        /*
                         * If the allocation failed, try again with a
                         * larger pool.
                         */
                        min_order = order + 1;
                        goto again;
                }
        } else {
                /* Not found. */
                errno = ENOMEM;
                addr = NULL;
        }
        return addr;
}

void __rseq_percpu *rseq_percpu_pool_set_malloc(struct rseq_percpu_pool_set *pool_set, size_t len)
{
        return __rseq_percpu_pool_set_malloc(pool_set, len, false);
}

void __rseq_percpu *rseq_percpu_pool_set_zmalloc(struct rseq_percpu_pool_set *pool_set, size_t len)
{
        return __rseq_percpu_pool_set_malloc(pool_set, len, true);
}
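
/*
 * Illustrative pool-set usage sketch (the sizes below are arbitrary):
 * a set lets callers issue variable-size per-CPU allocations that are
 * transparently routed to the smallest pool whose item_len fits.  Each
 * pool added to a set must have a distinct item_len order.
 *
 *        struct rseq_percpu_pool_set *set = rseq_percpu_pool_set_create();
 *
 *        rseq_percpu_pool_set_add_pool(set,
 *                rseq_percpu_pool_create(32, 4096 * 16, nr_cpus, NULL, 0));
 *        rseq_percpu_pool_set_add_pool(set,
 *                rseq_percpu_pool_create(128, 4096 * 64, nr_cpus, NULL, 0));
 *
 *        struct mystruct __rseq_percpu *obj = (struct mystruct __rseq_percpu *)
 *                rseq_percpu_pool_set_zmalloc(set, sizeof(struct mystruct));
 *        ...
 *        rseq_percpu_free(obj);
 *        rseq_percpu_pool_set_destroy(set);
 *
 * Error handling is omitted for brevity; "struct mystruct" and nr_cpus
 * are placeholders.
 */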

struct rseq_mmap_attr *rseq_mmap_attr_create(void *(*mmap_func)(void *priv, size_t len),
                int (*munmap_func)(void *priv, void *ptr, size_t len),
                void *mmap_priv)
{
        struct rseq_mmap_attr *attr = calloc(1, sizeof(struct rseq_mmap_attr));

        if (!attr)
                return NULL;
        attr->mmap_func = mmap_func;
        attr->munmap_func = munmap_func;
        attr->mmap_priv = mmap_priv;
        return attr;
}
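
/*
 * Illustrative sketch of a custom mapping hook (names and flags chosen
 * for the example, not mandated by the API): callers can route pool
 * mappings through their own allocator, e.g. to pre-fault the pages
 * with MAP_POPULATE, by passing an attr to rseq_percpu_pool_create().
 * The attr can be destroyed right after pool creation because the pool
 * copies its contents.
 *
 *        static void *prefault_mmap(void *priv __attribute__((unused)), size_t len)
 *        {
 *                void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *                                MAP_ANONYMOUS | MAP_PRIVATE | MAP_POPULATE, -1, 0);
 *                return p == MAP_FAILED ? NULL : p;
 *        }
 *
 *        static int prefault_munmap(void *priv __attribute__((unused)), void *ptr, size_t len)
 *        {
 *                return munmap(ptr, len);
 *        }
 *
 *        struct rseq_mmap_attr *attr =
 *                rseq_mmap_attr_create(prefault_mmap, prefault_munmap, NULL);
 *        struct rseq_percpu_pool *pool =
 *                rseq_percpu_pool_create(64, 4096 * 16, nr_cpus, attr, 0);
 *        rseq_mmap_attr_destroy(attr);
 */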

void rseq_mmap_attr_destroy(struct rseq_mmap_attr *attr)
{
        free(attr);
}