Fix destroy_alloc_bitmap size
// SPDX-License-Identifier: MIT
// SPDX-FileCopyrightText: 2024 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>

#include <rseq/percpu-alloc.h>
#include <sys/mman.h>
#include <assert.h>
#include <string.h>
#include <pthread.h>
#include <unistd.h>
#include <stdlib.h>
#include <rseq/compiler.h>
#include <errno.h>
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

#ifdef HAVE_LIBNUMA
# include <numa.h>
# include <numaif.h>
#endif

#include "rseq-alloc-utils.h"

/*
 * rseq-percpu-alloc.c: rseq CPU-Local Storage (CLS) memory allocator.
 *
 * The rseq per-CPU memory allocator allows the application to request
 * memory pools of CPU-Local memory, each containing objects of a
 * given size (rounded to the next power of 2), reserving a given
 * virtual address size per CPU, for a given maximum number of CPUs.
 *
 * The per-CPU memory allocator is analogous to TLS (Thread-Local
 * Storage) memory: TLS is Thread-Local Storage, whereas the per-CPU
 * memory allocator provides CPU-Local Storage.
 */

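/*
 * Illustrative usage sketch (editor's addition, not part of the
 * upstream file). The pool geometry is arbitrary: 16 kB of virtual
 * address space reserved per CPU, for up to 64 CPUs, holding items of
 * sizeof(struct foo) rounded up to the next power of 2. Error handling
 * is kept minimal on purpose.
 *
 *	struct foo { int a; int b; };
 *
 *	struct rseq_percpu_pool *pool;
 *	struct foo __rseq_percpu *f;
 *	struct foo *f_cpu0;
 *
 *	pool = rseq_percpu_pool_create(sizeof(struct foo), 16384, 64,
 *			NULL, RSEQ_POOL_ROBUST);
 *	if (!pool)
 *		abort();
 *	f = (struct foo __rseq_percpu *) rseq_percpu_zmalloc(pool);
 *	if (!f)
 *		abort();
 *	f_cpu0 = (struct foo *) __rseq_percpu_ptr(f, 0);
 *	f_cpu0->a = 42;
 *	rseq_percpu_free(f);
 *	(void) rseq_percpu_pool_destroy(pool);
 */
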
/*
 * Use high bits of per-CPU addresses to index the pool.
 * This leaves the low bits available to the application for pointer
 * tagging (based on next power of 2 alignment of the allocations).
 */
#if RSEQ_BITS_PER_LONG == 64
# define POOL_INDEX_BITS	16
#else
# define POOL_INDEX_BITS	8
#endif
#define MAX_NR_POOLS		(1UL << POOL_INDEX_BITS)
#define POOL_INDEX_SHIFT	(RSEQ_BITS_PER_LONG - POOL_INDEX_BITS)
#define MAX_POOL_LEN		(1UL << POOL_INDEX_SHIFT)
#define MAX_POOL_LEN_MASK	(MAX_POOL_LEN - 1)
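
/*
 * Worked example of the encoding above (editor's addition): on a 64-bit
 * build, POOL_INDEX_BITS == 16, hence POOL_INDEX_SHIFT == 48, and a
 * per-CPU pointer returned by the allocator is laid out as:
 *
 *	 63           48 47                                    0
 *	+---------------+--------------------------------------+
 *	|  pool index   |  item offset within a CPU's range    |
 *	+---------------+--------------------------------------+
 *
 * __rseq_percpu_ptr() below recovers both fields with:
 *
 *	pool_index  = ptr >> POOL_INDEX_SHIFT;
 *	item_offset = ptr & MAX_POOL_LEN_MASK;
 */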

#define POOL_SET_NR_ENTRIES	POOL_INDEX_SHIFT

/*
 * Smallest allocation should hold enough space for a free list pointer.
 */
#if RSEQ_BITS_PER_LONG == 64
# define POOL_SET_MIN_ENTRY	3	/* Smallest item_len=8 */
#else
# define POOL_SET_MIN_ENTRY	2	/* Smallest item_len=4 */
#endif

/*
 * Skip pool index 0 to ensure allocated entries at index 0 do not match
 * a NULL pointer.
 */
#define FIRST_POOL		1

#define RSEQ_POOL_FLAGS		(RSEQ_POOL_ROBUST)

#define BIT_PER_ULONG		(8 * sizeof(unsigned long))

struct free_list_node;

struct free_list_node {
	struct free_list_node *next;
};

/* This lock protects pool create/destroy. */
static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;

struct rseq_mmap_attr {
	void *(*mmap_func)(void *priv, size_t len);
	int (*munmap_func)(void *priv, void *ptr, size_t len);
	void *mmap_priv;
};

struct rseq_percpu_pool {
	void *base;
	unsigned int index;
	size_t item_len;
	size_t percpu_len;
	int item_order;
	int max_nr_cpus;

	/*
	 * The free list chains freed items on the CPU 0 address range.
	 * We should rethink this decision if false sharing between
	 * malloc/free from other CPUs and data accesses from CPU 0
	 * becomes an issue. This is a NULL-terminated singly-linked
	 * list.
	 */
	struct free_list_node *free_list_head;
	size_t next_unused;
	/* This lock protects allocation/free within the pool. */
	pthread_mutex_t lock;

	struct rseq_mmap_attr mmap_attr;

	/* Track alloc/free. */
	unsigned long *alloc_bitmap;
};

//TODO: the array of pools should grow dynamically on create.
static struct rseq_percpu_pool rseq_percpu_pool[MAX_NR_POOLS];

/*
 * Pool set entries are indexed by item_len rounded to the next power of
 * 2. A pool set can contain NULL pool entries, in which case the next
 * large enough entry will be used for allocation.
 */
struct rseq_percpu_pool_set {
	/* This lock protects add vs malloc/zmalloc within the pool set. */
	pthread_mutex_t lock;
	struct rseq_percpu_pool *entries[POOL_SET_NR_ENTRIES];
};

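/*
 * Illustrative pool set usage sketch (editor's addition, not part of
 * the upstream file). The pool sizes are arbitrary. Ownership of each
 * pool is handed over to the set, so destroying the set also destroys
 * the pools it contains.
 *
 *	struct rseq_percpu_pool_set *set;
 *	struct rseq_percpu_pool *small, *large;
 *	void __rseq_percpu *p;
 *
 *	set = rseq_percpu_pool_set_create();
 *	small = rseq_percpu_pool_create(32, 16384, 64, NULL, 0);
 *	large = rseq_percpu_pool_create(1024, 65536, 64, NULL, 0);
 *	if (!set || !small || !large)
 *		abort();
 *	if (rseq_percpu_pool_set_add_pool(set, small) ||
 *			rseq_percpu_pool_set_add_pool(set, large))
 *		abort();
 *	p = rseq_percpu_pool_set_zmalloc(set, 100);
 *	(100 bytes do not fit in the 32-byte pool, so the request is
 *	serviced by the 1024-byte pool.)
 *	rseq_percpu_free(p);
 *	(void) rseq_percpu_pool_set_destroy(set);
 */
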
static
void *__rseq_pool_percpu_ptr(struct rseq_percpu_pool *pool, int cpu, uintptr_t item_offset)
{
	return pool->base + (pool->percpu_len * cpu) + item_offset;
}

void *__rseq_percpu_ptr(void __rseq_percpu *_ptr, int cpu)
{
	uintptr_t ptr = (uintptr_t) _ptr;
	uintptr_t item_offset = ptr & MAX_POOL_LEN_MASK;
	uintptr_t pool_index = ptr >> POOL_INDEX_SHIFT;
	struct rseq_percpu_pool *pool = &rseq_percpu_pool[pool_index];

	assert(cpu >= 0);
	return __rseq_pool_percpu_ptr(pool, cpu, item_offset);
}

static
void rseq_percpu_zero_item(struct rseq_percpu_pool *pool, uintptr_t item_offset)
{
	int i;

	for (i = 0; i < pool->max_nr_cpus; i++) {
		char *p = __rseq_pool_percpu_ptr(pool, i, item_offset);
		memset(p, 0, pool->item_len);
	}
}

#ifdef HAVE_LIBNUMA
int rseq_percpu_pool_init_numa(struct rseq_percpu_pool *pool, int numa_flags)
{
	unsigned long nr_pages, page;
	long ret, page_len;
	int cpu;

	if (!numa_flags)
		return 0;
	page_len = rseq_get_page_len();
	nr_pages = pool->percpu_len >> rseq_get_count_order_ulong(page_len);
	for (cpu = 0; cpu < pool->max_nr_cpus; cpu++) {
		int node = numa_node_of_cpu(cpu);

		/* TODO: batch move_pages() call with an array of pages. */
		for (page = 0; page < nr_pages; page++) {
			void *pageptr = __rseq_pool_percpu_ptr(pool, cpu, page * page_len);
			int status = -EPERM;

			ret = move_pages(0, 1, &pageptr, &node, &status, numa_flags);
			if (ret)
				return ret;
		}
	}
	return 0;
}
#else
int rseq_percpu_pool_init_numa(struct rseq_percpu_pool *pool __attribute__((unused)),
		int numa_flags __attribute__((unused)))
{
	return 0;
}
#endif

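/*
 * Editor's sketch (not part of the upstream file): numa_flags is passed
 * straight through to move_pages(2), so a typical caller requests page
 * migration with MPOL_MF_MOVE, while passing 0 leaves the pages where
 * they are:
 *
 *	if (rseq_percpu_pool_init_numa(pool, MPOL_MF_MOVE))
 *		abort();
 */
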
static
void *default_mmap_func(void *priv __attribute__((unused)), size_t len)
{
	void *base;

	base = mmap(NULL, len, PROT_READ | PROT_WRITE,
			MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	if (base == MAP_FAILED)
		return NULL;
	return base;
}

static
int default_munmap_func(void *priv __attribute__((unused)), void *ptr, size_t len)
{
	return munmap(ptr, len);
}

static
int create_alloc_bitmap(struct rseq_percpu_pool *pool)
{
	size_t count;

	count = ((pool->percpu_len >> pool->item_order) + BIT_PER_ULONG - 1) / BIT_PER_ULONG;

	/*
	 * Not being able to create the validation bitmap is an error
	 * that needs to be reported.
	 */
	pool->alloc_bitmap = calloc(count, sizeof(unsigned long));
	if (!pool->alloc_bitmap)
		return -1;
	return 0;
}

static
void destroy_alloc_bitmap(struct rseq_percpu_pool *pool)
{
	unsigned long *bitmap = pool->alloc_bitmap;
	size_t count, total_leaks = 0;

	if (!bitmap)
		return;

	count = ((pool->percpu_len >> pool->item_order) + BIT_PER_ULONG - 1) / BIT_PER_ULONG;

	/* Assert that all items in the pool were freed. */
	for (size_t k = 0; k < count; ++k)
		total_leaks += rseq_hweight_ulong(bitmap[k]);
	if (total_leaks) {
		fprintf(stderr, "%s: Pool has %zu leaked items on destroy.\n", __func__,
			total_leaks);
		abort();
	}

	free(bitmap);
}

static
int __rseq_percpu_pool_destroy(struct rseq_percpu_pool *pool)
{
	int ret;

	if (!pool->base) {
		errno = ENOENT;
		ret = -1;
		goto end;
	}
	ret = pool->mmap_attr.munmap_func(pool->mmap_attr.mmap_priv, pool->base,
			pool->percpu_len * pool->max_nr_cpus);
	if (ret)
		goto end;
	pthread_mutex_destroy(&pool->lock);
	destroy_alloc_bitmap(pool);
	memset(pool, 0, sizeof(*pool));
end:
	return ret;
}

int rseq_percpu_pool_destroy(struct rseq_percpu_pool *pool)
{
	int ret;

	pthread_mutex_lock(&pool_lock);
	ret = __rseq_percpu_pool_destroy(pool);
	pthread_mutex_unlock(&pool_lock);
	return ret;
}

struct rseq_percpu_pool *rseq_percpu_pool_create(size_t item_len,
		size_t percpu_len, int max_nr_cpus,
		const struct rseq_mmap_attr *mmap_attr,
		int flags)
{
	void *(*mmap_func)(void *priv, size_t len);
	int (*munmap_func)(void *priv, void *ptr, size_t len);
	void *mmap_priv;
	struct rseq_percpu_pool *pool;
	void *base;
	unsigned int i;
	int order;

	if (flags & ~RSEQ_POOL_FLAGS) {
		errno = EINVAL;
		return NULL;
	}

	/* Make sure each item is large enough to contain free list pointers. */
	if (item_len < sizeof(void *))
		item_len = sizeof(void *);

	/* Align item_len on next power of two. */
	order = rseq_get_count_order_ulong(item_len);
	if (order < 0) {
		errno = EINVAL;
		return NULL;
	}
	item_len = 1UL << order;

	/* Align percpu_len on page size. */
	percpu_len = rseq_align(percpu_len, rseq_get_page_len());

	if (max_nr_cpus < 0 || item_len > percpu_len ||
			percpu_len > (UINTPTR_MAX >> POOL_INDEX_BITS)) {
		errno = EINVAL;
		return NULL;
	}

	if (mmap_attr) {
		mmap_func = mmap_attr->mmap_func;
		munmap_func = mmap_attr->munmap_func;
		mmap_priv = mmap_attr->mmap_priv;
	} else {
		mmap_func = default_mmap_func;
		munmap_func = default_munmap_func;
		mmap_priv = NULL;
	}
	pthread_mutex_lock(&pool_lock);
	/* Linear scan in array of pools to find empty spot. */
	for (i = FIRST_POOL; i < MAX_NR_POOLS; i++) {
		pool = &rseq_percpu_pool[i];
		if (!pool->base)
			goto found_empty;
	}
	errno = ENOMEM;
	pool = NULL;
	goto end;

found_empty:
	base = mmap_func(mmap_priv, percpu_len * max_nr_cpus);
	if (!base)
		goto error_alloc;
	pthread_mutex_init(&pool->lock, NULL);
	pool->base = base;
	pool->percpu_len = percpu_len;
	pool->max_nr_cpus = max_nr_cpus;
	pool->index = i;
	pool->item_len = item_len;
	pool->item_order = order;
	pool->mmap_attr.mmap_func = mmap_func;
	pool->mmap_attr.munmap_func = munmap_func;
	pool->mmap_attr.mmap_priv = mmap_priv;

	if (RSEQ_POOL_ROBUST & flags) {
		if (create_alloc_bitmap(pool))
			goto error_alloc;
	}
end:
	pthread_mutex_unlock(&pool_lock);
	return pool;

error_alloc:
	__rseq_percpu_pool_destroy(pool);
	pthread_mutex_unlock(&pool_lock);
	errno = ENOMEM;
	return NULL;
}

static
void set_alloc_slot(struct rseq_percpu_pool *pool, size_t item_offset)
{
	unsigned long *bitmap = pool->alloc_bitmap;
	size_t item_index = item_offset >> pool->item_order;
	unsigned long mask;
	size_t k;

	if (!bitmap)
		return;

	k = item_index / BIT_PER_ULONG;
	mask = 1ULL << (item_index % BIT_PER_ULONG);

	/* Print error if bit is already set. */
	if (bitmap[k] & mask) {
		fprintf(stderr, "%s: Allocator corruption detected for pool %p, item offset %zu.\n",
			__func__, pool, item_offset);
		abort();
	}
	bitmap[k] |= mask;
}

static
void __rseq_percpu *__rseq_percpu_malloc(struct rseq_percpu_pool *pool, bool zeroed)
{
	struct free_list_node *node;
	uintptr_t item_offset;
	void __rseq_percpu *addr;

	pthread_mutex_lock(&pool->lock);
	/* Get first entry from free list. */
	node = pool->free_list_head;
	if (node != NULL) {
		/* Remove node from free list (update head). */
		pool->free_list_head = node->next;
		item_offset = (uintptr_t) ((void *) node - pool->base);
		addr = (void *) (((uintptr_t) pool->index << POOL_INDEX_SHIFT) | item_offset);
		/* Mark the reused item as allocated for robust pools. */
		set_alloc_slot(pool, item_offset);
		goto end;
	}
	if (pool->next_unused + pool->item_len > pool->percpu_len) {
		errno = ENOMEM;
		addr = NULL;
		goto end;
	}
	item_offset = pool->next_unused;
	addr = (void *) (((uintptr_t) pool->index << POOL_INDEX_SHIFT) | item_offset);
	pool->next_unused += pool->item_len;
	set_alloc_slot(pool, item_offset);
end:
	pthread_mutex_unlock(&pool->lock);
	if (zeroed && addr)
		rseq_percpu_zero_item(pool, item_offset);
	return addr;
}

void __rseq_percpu *rseq_percpu_malloc(struct rseq_percpu_pool *pool)
{
	return __rseq_percpu_malloc(pool, false);
}

void __rseq_percpu *rseq_percpu_zmalloc(struct rseq_percpu_pool *pool)
{
	return __rseq_percpu_malloc(pool, true);
}

static
void clear_alloc_slot(struct rseq_percpu_pool *pool, size_t item_offset)
{
	unsigned long *bitmap = pool->alloc_bitmap;
	size_t item_index = item_offset >> pool->item_order;
	unsigned long mask;
	size_t k;

	if (!bitmap)
		return;

	k = item_index / BIT_PER_ULONG;
	mask = 1ULL << (item_index % BIT_PER_ULONG);

	/* Print error if bit is not set. */
	if (!(bitmap[k] & mask)) {
		fprintf(stderr, "%s: Double-free detected for pool %p, item offset %zu.\n",
			__func__, pool, item_offset);
		abort();
	}
	bitmap[k] &= ~mask;
}

void rseq_percpu_free(void __rseq_percpu *_ptr)
{
	uintptr_t ptr = (uintptr_t) _ptr;
	uintptr_t item_offset = ptr & MAX_POOL_LEN_MASK;
	uintptr_t pool_index = ptr >> POOL_INDEX_SHIFT;
	struct rseq_percpu_pool *pool = &rseq_percpu_pool[pool_index];
	struct free_list_node *head, *item;

	pthread_mutex_lock(&pool->lock);
	clear_alloc_slot(pool, item_offset);
	/* Add ptr to head of free list */
	head = pool->free_list_head;
	/* Free-list is in CPU 0 range. */
	item = (struct free_list_node *) __rseq_pool_percpu_ptr(pool, 0, item_offset);
	item->next = head;
	pool->free_list_head = item;
	pthread_mutex_unlock(&pool->lock);
}

struct rseq_percpu_pool_set *rseq_percpu_pool_set_create(void)
{
	struct rseq_percpu_pool_set *pool_set;

	pool_set = calloc(1, sizeof(struct rseq_percpu_pool_set));
	if (!pool_set)
		return NULL;
	pthread_mutex_init(&pool_set->lock, NULL);
	return pool_set;
}

int rseq_percpu_pool_set_destroy(struct rseq_percpu_pool_set *pool_set)
{
	int order, ret;

	for (order = POOL_SET_MIN_ENTRY; order < POOL_SET_NR_ENTRIES; order++) {
		struct rseq_percpu_pool *pool = pool_set->entries[order];

		if (!pool)
			continue;
		ret = rseq_percpu_pool_destroy(pool);
		if (ret)
			return ret;
		pool_set->entries[order] = NULL;
	}
	pthread_mutex_destroy(&pool_set->lock);
	free(pool_set);
	return 0;
}

/* Ownership of pool is handed over to pool set on success. */
int rseq_percpu_pool_set_add_pool(struct rseq_percpu_pool_set *pool_set, struct rseq_percpu_pool *pool)
{
	size_t item_order = pool->item_order;
	int ret = 0;

	pthread_mutex_lock(&pool_set->lock);
	if (pool_set->entries[item_order]) {
		errno = EBUSY;
		ret = -1;
		goto end;
	}
	pool_set->entries[pool->item_order] = pool;
end:
	pthread_mutex_unlock(&pool_set->lock);
	return ret;
}

static
void __rseq_percpu *__rseq_percpu_pool_set_malloc(struct rseq_percpu_pool_set *pool_set, size_t len, bool zeroed)
{
	int order, min_order = POOL_SET_MIN_ENTRY;
	struct rseq_percpu_pool *pool;
	void __rseq_percpu *addr;

	order = rseq_get_count_order_ulong(len);
	if (order > POOL_SET_MIN_ENTRY)
		min_order = order;
again:
	pthread_mutex_lock(&pool_set->lock);
	/* First smallest present pool where @len fits. */
	for (order = min_order; order < POOL_SET_NR_ENTRIES; order++) {
		pool = pool_set->entries[order];

		if (!pool)
			continue;
		if (pool->item_len >= len)
			goto found;
	}
	pool = NULL;
found:
	pthread_mutex_unlock(&pool_set->lock);
	if (pool) {
		addr = __rseq_percpu_malloc(pool, zeroed);
		if (addr == NULL && errno == ENOMEM) {
			/*
			 * If the allocation failed, try again with a
			 * larger pool.
			 */
			min_order = order + 1;
			goto again;
		}
	} else {
		/* Not found. */
		errno = ENOMEM;
		addr = NULL;
	}
	return addr;
}

void __rseq_percpu *rseq_percpu_pool_set_malloc(struct rseq_percpu_pool_set *pool_set, size_t len)
{
	return __rseq_percpu_pool_set_malloc(pool_set, len, false);
}

void __rseq_percpu *rseq_percpu_pool_set_zmalloc(struct rseq_percpu_pool_set *pool_set, size_t len)
{
	return __rseq_percpu_pool_set_malloc(pool_set, len, true);
}

struct rseq_mmap_attr *rseq_mmap_attr_create(void *(*mmap_func)(void *priv, size_t len),
		int (*munmap_func)(void *priv, void *ptr, size_t len),
		void *mmap_priv)
{
	struct rseq_mmap_attr *attr = calloc(1, sizeof(struct rseq_mmap_attr));

	if (!attr)
		return NULL;
	attr->mmap_func = mmap_func;
	attr->munmap_func = munmap_func;
	attr->mmap_priv = mmap_priv;
	return attr;
}

void rseq_mmap_attr_destroy(struct rseq_mmap_attr *attr)
{
	free(attr);
}
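
/*
 * Illustrative sketch of a custom mapping hook (editor's addition, not
 * part of the upstream file): pre-faulting the pool pages with
 * MAP_POPULATE instead of the default lazy mapping. The attribute is
 * only read during rseq_percpu_pool_create(), so it can be destroyed
 * right after the pool is created.
 *
 *	static void *populate_mmap(void *priv __attribute__((unused)), size_t len)
 *	{
 *		void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *				MAP_ANONYMOUS | MAP_PRIVATE | MAP_POPULATE, -1, 0);
 *		return p == MAP_FAILED ? NULL : p;
 *	}
 *
 *	static int populate_munmap(void *priv __attribute__((unused)), void *ptr, size_t len)
 *	{
 *		return munmap(ptr, len);
 *	}
 *
 *	struct rseq_mmap_attr *attr;
 *	struct rseq_percpu_pool *pool;
 *
 *	attr = rseq_mmap_attr_create(populate_mmap, populate_munmap, NULL);
 *	pool = rseq_percpu_pool_create(64, 16384, 64, attr, 0);
 *	rseq_mmap_attr_destroy(attr);
 */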