percpu pool: Introduce generic attributes
librseq.git: src/rseq-percpu-alloc.c
// SPDX-License-Identifier: MIT
// SPDX-FileCopyrightText: 2024 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>

#include <rseq/percpu-alloc.h>
#include <sys/mman.h>
#include <assert.h>
#include <string.h>
#include <pthread.h>
#include <unistd.h>
#include <stdlib.h>
#include <rseq/compiler.h>
#include <errno.h>
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

#ifdef HAVE_LIBNUMA
# include <numa.h>
# include <numaif.h>
#endif

#include "rseq-alloc-utils.h"

/*
 * rseq-percpu-alloc.c: rseq CPU-Local Storage (CLS) memory allocator.
 *
 * The rseq per-CPU memory allocator allows the application to request
 * pools of CPU-Local memory, each containing objects of a given size
 * (rounded up to the next power of 2), reserving a given amount of
 * virtual address space per CPU, for a given maximum number of CPUs.
 *
 * The per-CPU memory allocator is analogous to TLS (Thread-Local
 * Storage) memory: TLS is Thread-Local Storage, whereas the per-CPU
 * memory allocator provides CPU-Local Storage.
 */

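/*
 * Minimal usage sketch (not part of the allocator): the pool name,
 * sizes and CPU count below are arbitrary example values, and error
 * handling is elided. All functions used are defined in this file,
 * assuming their public declarations in <rseq/percpu-alloc.h>.
 *
 *	struct my_counter { uint64_t count; };
 *	struct rseq_percpu_pool *pool;
 *	struct my_counter __rseq_percpu *counter;
 *
 *	pool = rseq_percpu_pool_create("my-counters", sizeof(struct my_counter),
 *			16 * 4096, (int) sysconf(_SC_NPROCESSORS_CONF),
 *			NULL, RSEQ_POOL_ROBUST);
 *	counter = (struct my_counter __rseq_percpu *) rseq_percpu_zmalloc(pool);
 *
 *	// Access the CPU-local copy of the object for a given CPU.
 *	((struct my_counter *) __rseq_percpu_ptr(counter, 0))->count++;
 *
 *	rseq_percpu_free(counter);
 *	rseq_percpu_pool_destroy(pool);
 */
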
/*
 * Use high bits of per-CPU addresses to index the pool.
 * This leaves the low bits available to the application for pointer
 * tagging (based on next power of 2 alignment of the allocations).
 */
#if RSEQ_BITS_PER_LONG == 64
# define POOL_INDEX_BITS	16
#else
# define POOL_INDEX_BITS	8
#endif
#define MAX_NR_POOLS		(1UL << POOL_INDEX_BITS)
#define POOL_INDEX_SHIFT	(RSEQ_BITS_PER_LONG - POOL_INDEX_BITS)
#define MAX_POOL_LEN		(1UL << POOL_INDEX_SHIFT)
#define MAX_POOL_LEN_MASK	(MAX_POOL_LEN - 1)
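
/*
 * For example, on a 64-bit architecture POOL_INDEX_BITS is 16 and
 * POOL_INDEX_SHIFT is 48: a per-CPU pointer encodes
 * (pool_index << POOL_INDEX_SHIFT) | item_offset, so MAX_NR_POOLS is
 * 65536 and each pool can cover at most MAX_POOL_LEN = 2^48 bytes of
 * per-CPU offset range, while the bits below the allocation alignment
 * remain available for application-level pointer tagging.
 */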

#define POOL_SET_NR_ENTRIES	POOL_INDEX_SHIFT

/*
 * Smallest allocation should hold enough space for a free list pointer.
 */
#if RSEQ_BITS_PER_LONG == 64
# define POOL_SET_MIN_ENTRY	3	/* Smallest item_len=8 */
#else
# define POOL_SET_MIN_ENTRY	2	/* Smallest item_len=4 */
#endif

/*
 * Skip pool index 0 to ensure allocated entries at index 0 do not match
 * a NULL pointer.
 */
#define FIRST_POOL		1

#define RSEQ_POOL_FLAGS		(RSEQ_POOL_ROBUST)

#define BIT_PER_ULONG		(8 * sizeof(unsigned long))

struct free_list_node;

struct free_list_node {
	struct free_list_node *next;
};

/* This lock protects pool create/destroy. */
static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;

struct rseq_pool_attr {
	bool mmap_set;
	void *(*mmap_func)(void *priv, size_t len);
	int (*munmap_func)(void *priv, void *ptr, size_t len);
	void *mmap_priv;
};

struct rseq_percpu_pool {
	void *base;
	unsigned int index;
	size_t item_len;
	size_t percpu_len;
	int item_order;
	int max_nr_cpus;

	/*
	 * The free list chains freed items on the CPU 0 address range.
	 * We should rethink this decision if false sharing between
	 * malloc/free from other CPUs and data accesses from CPU 0
	 * becomes an issue. This is a NULL-terminated singly-linked
	 * list.
	 */
	struct free_list_node *free_list_head;
	size_t next_unused;
	/* This lock protects allocation/free within the pool. */
	pthread_mutex_t lock;

	struct rseq_pool_attr attr;

	char *name;
	/* Track alloc/free. */
	unsigned long *alloc_bitmap;
};

// TODO: the array of pools should grow dynamically on create.
static struct rseq_percpu_pool rseq_percpu_pool[MAX_NR_POOLS];

/*
 * Pool set entries are indexed by item_len rounded to the next power of
 * 2. A pool set can contain NULL pool entries, in which case the next
 * large enough entry will be used for allocation.
 */
struct rseq_percpu_pool_set {
	/* This lock protects add vs malloc/zmalloc within the pool set. */
	pthread_mutex_t lock;
	struct rseq_percpu_pool *entries[POOL_SET_NR_ENTRIES];
};
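
/*
 * Pool set usage sketch (not part of the allocator): the pool sizes and
 * CPU count below are arbitrary example values and error handling is
 * elided.
 *
 *	struct rseq_percpu_pool_set *set = rseq_percpu_pool_set_create();
 *
 *	rseq_percpu_pool_set_add_pool(set,
 *		rseq_percpu_pool_create("small", 32, 4 * 4096, 128, NULL, 0));
 *	rseq_percpu_pool_set_add_pool(set,
 *		rseq_percpu_pool_create("large", 256, 16 * 4096, 128, NULL, 0));
 *
 *	// A 24-byte request is served from the "small" pool (item_len 32),
 *	// a 100-byte request from the "large" pool (item_len 256).
 *	void __rseq_percpu *a = rseq_percpu_pool_set_zmalloc(set, 24);
 *	void __rseq_percpu *b = rseq_percpu_pool_set_zmalloc(set, 100);
 *
 *	rseq_percpu_free(a);
 *	rseq_percpu_free(b);
 *	rseq_percpu_pool_set_destroy(set);	// Also destroys the pools it owns.
 */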

static
void *__rseq_pool_percpu_ptr(struct rseq_percpu_pool *pool, int cpu, uintptr_t item_offset)
{
	return pool->base + (pool->percpu_len * cpu) + item_offset;
}

void *__rseq_percpu_ptr(void __rseq_percpu *_ptr, int cpu)
{
	uintptr_t ptr = (uintptr_t) _ptr;
	uintptr_t item_offset = ptr & MAX_POOL_LEN_MASK;
	uintptr_t pool_index = ptr >> POOL_INDEX_SHIFT;
	struct rseq_percpu_pool *pool = &rseq_percpu_pool[pool_index];

	assert(cpu >= 0);
	return __rseq_pool_percpu_ptr(pool, cpu, item_offset);
}

static
void rseq_percpu_zero_item(struct rseq_percpu_pool *pool, uintptr_t item_offset)
{
	int i;

	for (i = 0; i < pool->max_nr_cpus; i++) {
		char *p = __rseq_pool_percpu_ptr(pool, i, item_offset);
		memset(p, 0, pool->item_len);
	}
}

#ifdef HAVE_LIBNUMA
int rseq_percpu_pool_init_numa(struct rseq_percpu_pool *pool, int numa_flags)
{
	unsigned long nr_pages, page;
	long ret, page_len;
	int cpu;

	if (!numa_flags)
		return 0;
	page_len = rseq_get_page_len();
	nr_pages = pool->percpu_len >> rseq_get_count_order_ulong(page_len);
	for (cpu = 0; cpu < pool->max_nr_cpus; cpu++) {
		int node = numa_node_of_cpu(cpu);

		/* TODO: batch move_pages() call with an array of pages. */
		for (page = 0; page < nr_pages; page++) {
			void *pageptr = __rseq_pool_percpu_ptr(pool, cpu, page * page_len);
			int status = -EPERM;

			ret = move_pages(0, 1, &pageptr, &node, &status, numa_flags);
			if (ret)
				return ret;
		}
	}
	return 0;
}
#else
int rseq_percpu_pool_init_numa(struct rseq_percpu_pool *pool __attribute__((unused)),
		int numa_flags __attribute__((unused)))
{
	return 0;
}
#endif

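/*
 * Optional NUMA usage sketch (not part of the allocator): after
 * creating a pool, its pages can be migrated so that each CPU's range
 * resides on that CPU's NUMA node. MPOL_MF_MOVE is the move_pages(2)
 * flag from <numaif.h>; passing 0 as numa_flags leaves the pool pages
 * where they were allocated. Only effective when built with
 * HAVE_LIBNUMA.
 *
 *	if (rseq_percpu_pool_init_numa(pool, MPOL_MF_MOVE))
 *		abort();	// Or application-specific error handling.
 */
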
static
void *default_mmap_func(void *priv __attribute__((unused)), size_t len)
{
	void *base;

	base = mmap(NULL, len, PROT_READ | PROT_WRITE,
			MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	if (base == MAP_FAILED)
		return NULL;
	return base;
}

static
int default_munmap_func(void *priv __attribute__((unused)), void *ptr, size_t len)
{
	return munmap(ptr, len);
}

static
int create_alloc_bitmap(struct rseq_percpu_pool *pool)
{
	size_t count;

	count = ((pool->percpu_len >> pool->item_order) + BIT_PER_ULONG - 1) / BIT_PER_ULONG;

	/*
	 * Not being able to create the validation bitmap is an error
	 * that needs to be reported.
	 */
	pool->alloc_bitmap = calloc(count, sizeof(unsigned long));
	if (!pool->alloc_bitmap)
		return -1;
	return 0;
}

static
const char *get_pool_name(const struct rseq_percpu_pool *pool)
{
	return pool->name ? : "<anonymous>";
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void check_free_list(const struct rseq_percpu_pool *pool)
{
	size_t total_item = pool->percpu_len >> pool->item_order;
	size_t total_never_allocated = (pool->percpu_len - pool->next_unused) >> pool->item_order;
	size_t total_freed = 0;
	size_t max_list_traversal = total_item - total_never_allocated;
	size_t traversal_iteration = 0;

	for (struct free_list_node *node = pool->free_list_head, *prev = NULL;
			node;
			prev = node, node = node->next) {
		void *node_addr = node;

		if (traversal_iteration >= max_list_traversal) {
			fprintf(stderr, "%s: Corrupted free-list; Possibly infinite loop in pool \"%s\" (%p), caller %p.\n",
				__func__, get_pool_name(pool), pool, __builtin_return_address(0));
			abort();
		}

		/* Node is out of range. */
		if ((node_addr < pool->base) ||
		    (node_addr >= pool->base + pool->next_unused)) {
			if (prev)
				fprintf(stderr, "%s: Corrupted free-list node %p -> [out-of-range %p] in pool \"%s\" (%p), caller %p.\n",
					__func__, prev, node, get_pool_name(pool), pool, __builtin_return_address(0));
			else
				fprintf(stderr, "%s: Corrupted free-list node [out-of-range %p] in pool \"%s\" (%p), caller %p.\n",
					__func__, node, get_pool_name(pool), pool, __builtin_return_address(0));
			abort();
		}

		traversal_iteration += 1;
		total_freed += 1;
	}

	if (total_never_allocated + total_freed != total_item) {
		fprintf(stderr, "%s: Corrupted free-list in pool \"%s\" (%p); total-item: %zu total-never-used: %zu total-freed: %zu, caller %p.\n",
			__func__, get_pool_name(pool), pool, total_item, total_never_allocated, total_freed, __builtin_return_address(0));
		abort();
	}
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void destroy_alloc_bitmap(struct rseq_percpu_pool *pool)
{
	unsigned long *bitmap = pool->alloc_bitmap;
	size_t count, total_leaks = 0;

	if (!bitmap)
		return;

	count = ((pool->percpu_len >> pool->item_order) + BIT_PER_ULONG - 1) / BIT_PER_ULONG;

	/* Assert that all items in the pool were freed. */
	for (size_t k = 0; k < count; ++k)
		total_leaks += rseq_hweight_ulong(bitmap[k]);
	if (total_leaks) {
		fprintf(stderr, "%s: Pool \"%s\" (%p) has %zu leaked items on destroy, caller: %p.\n",
			__func__, get_pool_name(pool), pool, total_leaks, (void *) __builtin_return_address(0));
		abort();
	}

	check_free_list(pool);

	free(bitmap);
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
int __rseq_percpu_pool_destroy(struct rseq_percpu_pool *pool)
{
	int ret;

	if (!pool->base) {
		errno = ENOENT;
		ret = -1;
		goto end;
	}
	/*
	 * This must be done before releasing pool->base for checking the
	 * free-list.
	 */
	destroy_alloc_bitmap(pool);
	ret = pool->attr.munmap_func(pool->attr.mmap_priv, pool->base,
			pool->percpu_len * pool->max_nr_cpus);
	if (ret)
		goto end;
	pthread_mutex_destroy(&pool->lock);
	free(pool->name);
	memset(pool, 0, sizeof(*pool));
end:
	return ret;
}

int rseq_percpu_pool_destroy(struct rseq_percpu_pool *pool)
{
	int ret;

	pthread_mutex_lock(&pool_lock);
	ret = __rseq_percpu_pool_destroy(pool);
	pthread_mutex_unlock(&pool_lock);
	return ret;
}

struct rseq_percpu_pool *rseq_percpu_pool_create(const char *pool_name,
		size_t item_len, size_t percpu_len, int max_nr_cpus,
		const struct rseq_pool_attr *_attr,
		int flags)
{
	struct rseq_percpu_pool *pool;
	struct rseq_pool_attr attr = {};
	void *base;
	unsigned int i;
	int order;

	if (flags & ~RSEQ_POOL_FLAGS) {
		errno = EINVAL;
		return NULL;
	}

	/* Make sure each item is large enough to contain free list pointers. */
	if (item_len < sizeof(void *))
		item_len = sizeof(void *);

	/* Align item_len on next power of two. */
	order = rseq_get_count_order_ulong(item_len);
	if (order < 0) {
		errno = EINVAL;
		return NULL;
	}
	item_len = 1UL << order;

	/* Align percpu_len on page size. */
	percpu_len = rseq_align(percpu_len, rseq_get_page_len());

	if (max_nr_cpus < 0 || item_len > percpu_len ||
			percpu_len > (UINTPTR_MAX >> POOL_INDEX_BITS)) {
		errno = EINVAL;
		return NULL;
	}

	if (_attr)
		memcpy(&attr, _attr, sizeof(attr));
	if (!attr.mmap_set) {
		attr.mmap_func = default_mmap_func;
		attr.munmap_func = default_munmap_func;
		attr.mmap_priv = NULL;
	}

	pthread_mutex_lock(&pool_lock);
	/* Linear scan in array of pools to find empty spot. */
	for (i = FIRST_POOL; i < MAX_NR_POOLS; i++) {
		pool = &rseq_percpu_pool[i];
		if (!pool->base)
			goto found_empty;
	}
	errno = ENOMEM;
	pool = NULL;
	goto end;

found_empty:
	base = attr.mmap_func(attr.mmap_priv, percpu_len * max_nr_cpus);
	if (!base)
		goto error_alloc;
	pthread_mutex_init(&pool->lock, NULL);
	pool->base = base;
	pool->percpu_len = percpu_len;
	pool->max_nr_cpus = max_nr_cpus;
	pool->index = i;
	pool->item_len = item_len;
	pool->item_order = order;
	memcpy(&pool->attr, &attr, sizeof(attr));

	if (pool_name) {
		pool->name = strdup(pool_name);
		if (!pool->name)
			goto error_alloc;
	}

	if (RSEQ_POOL_ROBUST & flags) {
		if (create_alloc_bitmap(pool))
			goto error_alloc;
	}
end:
	pthread_mutex_unlock(&pool_lock);
	return pool;

error_alloc:
	__rseq_percpu_pool_destroy(pool);
	pthread_mutex_unlock(&pool_lock);
	errno = ENOMEM;
	return NULL;
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void set_alloc_slot(struct rseq_percpu_pool *pool, size_t item_offset)
{
	unsigned long *bitmap = pool->alloc_bitmap;
	size_t item_index = item_offset >> pool->item_order;
	unsigned long mask;
	size_t k;

	if (!bitmap)
		return;

	k = item_index / BIT_PER_ULONG;
	mask = 1ULL << (item_index % BIT_PER_ULONG);

	/* Print error if bit is already set. */
	if (bitmap[k] & mask) {
		fprintf(stderr, "%s: Allocator corruption detected for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
			__func__, get_pool_name(pool), pool, item_offset, (void *) __builtin_return_address(0));
		abort();
	}
	bitmap[k] |= mask;
}

static
void __rseq_percpu *__rseq_percpu_malloc(struct rseq_percpu_pool *pool, bool zeroed)
{
	struct free_list_node *node;
	uintptr_t item_offset;
	void __rseq_percpu *addr;

	pthread_mutex_lock(&pool->lock);
	/* Get first entry from free list. */
	node = pool->free_list_head;
	if (node != NULL) {
		/* Remove node from free list (update head). */
		pool->free_list_head = node->next;
		item_offset = (uintptr_t) ((void *) node - pool->base);
		addr = (void *) (((uintptr_t) pool->index << POOL_INDEX_SHIFT) | item_offset);
		/* Mark the reused slot as allocated again in the tracking bitmap. */
		set_alloc_slot(pool, item_offset);
		goto end;
	}
	if (pool->next_unused + pool->item_len > pool->percpu_len) {
		errno = ENOMEM;
		addr = NULL;
		goto end;
	}
	item_offset = pool->next_unused;
	addr = (void *) (((uintptr_t) pool->index << POOL_INDEX_SHIFT) | item_offset);
	pool->next_unused += pool->item_len;
	set_alloc_slot(pool, item_offset);
end:
	pthread_mutex_unlock(&pool->lock);
	if (zeroed && addr)
		rseq_percpu_zero_item(pool, item_offset);
	return addr;
}

void __rseq_percpu *rseq_percpu_malloc(struct rseq_percpu_pool *pool)
{
	return __rseq_percpu_malloc(pool, false);
}

void __rseq_percpu *rseq_percpu_zmalloc(struct rseq_percpu_pool *pool)
{
	return __rseq_percpu_malloc(pool, true);
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void clear_alloc_slot(struct rseq_percpu_pool *pool, size_t item_offset)
{
	unsigned long *bitmap = pool->alloc_bitmap;
	size_t item_index = item_offset >> pool->item_order;
	unsigned long mask;
	size_t k;

	if (!bitmap)
		return;

	k = item_index / BIT_PER_ULONG;
	mask = 1ULL << (item_index % BIT_PER_ULONG);

	/* Print error if bit is not set. */
	if (!(bitmap[k] & mask)) {
		fprintf(stderr, "%s: Double-free detected for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
			__func__, get_pool_name(pool), pool, item_offset,
			(void *) __builtin_return_address(0));
		abort();
	}
	bitmap[k] &= ~mask;
}

void rseq_percpu_free(void __rseq_percpu *_ptr)
{
	uintptr_t ptr = (uintptr_t) _ptr;
	uintptr_t item_offset = ptr & MAX_POOL_LEN_MASK;
	uintptr_t pool_index = ptr >> POOL_INDEX_SHIFT;
	struct rseq_percpu_pool *pool = &rseq_percpu_pool[pool_index];
	struct free_list_node *head, *item;

	pthread_mutex_lock(&pool->lock);
	clear_alloc_slot(pool, item_offset);
	/* Add ptr to head of free list. */
	head = pool->free_list_head;
	/* Free-list is in CPU 0 range. */
	item = (struct free_list_node *) __rseq_pool_percpu_ptr(pool, 0, item_offset);
	item->next = head;
	pool->free_list_head = item;
	pthread_mutex_unlock(&pool->lock);
}

struct rseq_percpu_pool_set *rseq_percpu_pool_set_create(void)
{
	struct rseq_percpu_pool_set *pool_set;

	pool_set = calloc(1, sizeof(struct rseq_percpu_pool_set));
	if (!pool_set)
		return NULL;
	pthread_mutex_init(&pool_set->lock, NULL);
	return pool_set;
}

int rseq_percpu_pool_set_destroy(struct rseq_percpu_pool_set *pool_set)
{
	int order, ret;

	for (order = POOL_SET_MIN_ENTRY; order < POOL_SET_NR_ENTRIES; order++) {
		struct rseq_percpu_pool *pool = pool_set->entries[order];

		if (!pool)
			continue;
		ret = rseq_percpu_pool_destroy(pool);
		if (ret)
			return ret;
		pool_set->entries[order] = NULL;
	}
	pthread_mutex_destroy(&pool_set->lock);
	free(pool_set);
	return 0;
}

/* Ownership of pool is handed over to pool set on success. */
int rseq_percpu_pool_set_add_pool(struct rseq_percpu_pool_set *pool_set, struct rseq_percpu_pool *pool)
{
	size_t item_order = pool->item_order;
	int ret = 0;

	pthread_mutex_lock(&pool_set->lock);
	if (pool_set->entries[item_order]) {
		errno = EBUSY;
		ret = -1;
		goto end;
	}
	pool_set->entries[pool->item_order] = pool;
end:
	pthread_mutex_unlock(&pool_set->lock);
	return ret;
}

static
void __rseq_percpu *__rseq_percpu_pool_set_malloc(struct rseq_percpu_pool_set *pool_set, size_t len, bool zeroed)
{
	int order, min_order = POOL_SET_MIN_ENTRY;
	struct rseq_percpu_pool *pool;
	void __rseq_percpu *addr;

	order = rseq_get_count_order_ulong(len);
	if (order > POOL_SET_MIN_ENTRY)
		min_order = order;
again:
	pthread_mutex_lock(&pool_set->lock);
	/* Find the smallest present pool where @len fits. */
	for (order = min_order; order < POOL_SET_NR_ENTRIES; order++) {
		pool = pool_set->entries[order];

		if (!pool)
			continue;
		if (pool->item_len >= len)
			goto found;
	}
	pool = NULL;
found:
	pthread_mutex_unlock(&pool_set->lock);
	if (pool) {
		addr = __rseq_percpu_malloc(pool, zeroed);
		if (addr == NULL && errno == ENOMEM) {
			/*
			 * If the allocation failed, try again with a
			 * larger pool.
			 */
			min_order = order + 1;
			goto again;
		}
	} else {
		/* Not found. */
		errno = ENOMEM;
		addr = NULL;
	}
	return addr;
}

void __rseq_percpu *rseq_percpu_pool_set_malloc(struct rseq_percpu_pool_set *pool_set, size_t len)
{
	return __rseq_percpu_pool_set_malloc(pool_set, len, false);
}

void __rseq_percpu *rseq_percpu_pool_set_zmalloc(struct rseq_percpu_pool_set *pool_set, size_t len)
{
	return __rseq_percpu_pool_set_malloc(pool_set, len, true);
}

struct rseq_pool_attr *rseq_pool_attr_create(void)
{
	return calloc(1, sizeof(struct rseq_pool_attr));
}

void rseq_pool_attr_destroy(struct rseq_pool_attr *attr)
{
	free(attr);
}

void rseq_pool_attr_set_mmap(struct rseq_pool_attr *attr,
		void *(*mmap_func)(void *priv, size_t len),
		int (*munmap_func)(void *priv, void *ptr, size_t len),
		void *mmap_priv)
{
	attr->mmap_set = true;
	attr->mmap_func = mmap_func;
	attr->munmap_func = munmap_func;
	attr->mmap_priv = mmap_priv;
}
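
/*
 * Attribute usage sketch (not part of the allocator): plugging custom
 * mapping functions into a pool, e.g. to back it with hugetlb memory.
 * The my_mmap()/my_munmap() hooks are hypothetical application code;
 * the application must ensure the requested lengths are compatible
 * with the chosen mapping. The attribute object may be destroyed right
 * after pool creation, since its content is copied into the pool.
 *
 *	static void *my_mmap(void *priv __attribute__((unused)), size_t len)
 *	{
 *		void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *				MAP_ANONYMOUS | MAP_PRIVATE | MAP_HUGETLB, -1, 0);
 *
 *		return p == MAP_FAILED ? NULL : p;
 *	}
 *
 *	static int my_munmap(void *priv __attribute__((unused)), void *ptr, size_t len)
 *	{
 *		return munmap(ptr, len);
 *	}
 *
 *	struct rseq_pool_attr *attr = rseq_pool_attr_create();
 *
 *	rseq_pool_attr_set_mmap(attr, my_mmap, my_munmap, NULL);
 *	pool = rseq_percpu_pool_create("hugetlb-pool", item_len, percpu_len,
 *			max_nr_cpus, attr, 0);
 *	rseq_pool_attr_destroy(attr);
 */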