1 // SPDX-License-Identifier: MIT
2 // SPDX-FileCopyrightText: 2024 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
4 #include <rseq/percpu-alloc.h>
11 #include <rseq/compiler.h>
22 #include "rseq-alloc-utils.h"
25 * rseq-percpu-alloc.c: rseq CPU-Local Storage (CLS) memory allocator.
27 * The rseq per-CPU memory allocator allows the application to request
28 * memory pools of CPU-Local memory, each containing objects of a
29 * given size (rounded to next power of 2), reserving a given virtual
30 * address size per CPU, for a given maximum number of CPUs.
32 * The per-CPU memory allocator is analogous to TLS (Thread-Local
33 * Storage) memory: TLS is Thread-Local Storage, whereas the per-CPU
34 * memory allocator provides CPU-Local Storage.
38 * Use high bits of per-CPU addresses to index the pool.
39 * This leaves the low bits available to the application for pointer
40 * tagging (based on next power of 2 alignment of the allocations).
42 #if RSEQ_BITS_PER_LONG == 64
43 # define POOL_INDEX_BITS 16
45 # define POOL_INDEX_BITS 8
47 #define MAX_NR_POOLS (1UL << POOL_INDEX_BITS)
48 #define POOL_INDEX_SHIFT (RSEQ_BITS_PER_LONG - POOL_INDEX_BITS)
49 #define MAX_POOL_LEN (1UL << POOL_INDEX_SHIFT)
50 #define MAX_POOL_LEN_MASK (MAX_POOL_LEN - 1)
52 #define POOL_SET_NR_ENTRIES POOL_INDEX_SHIFT
55 * Smallest allocation should hold enough space for a free list pointer.
57 #if RSEQ_BITS_PER_LONG == 64
58 # define POOL_SET_MIN_ENTRY 3 /* Smallest item_len=8 */
60 # define POOL_SET_MIN_ENTRY 2 /* Smallest item_len=4 */
64 * Skip pool index 0 to ensure allocated entries at index 0 do not match
69 #define RSEQ_POOL_FLAGS (RSEQ_POOL_ROBUST)
71 #define BIT_PER_ULONG (8 * sizeof(unsigned long))
/* Forward declaration so the node can refer to its own type. */
struct free_list_node;

/*
 * Free-list node, stored in-place inside a freed item (CPU 0 range).
 * NULL-terminated singly-linked list.
 */
struct free_list_node {
	struct free_list_node *next;
};
/* This lock protects pool create/destroy. */
static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
/*
 * Memory-mapping callbacks used to reserve and release pool memory.
 * mmap_priv is passed as the first argument to both callbacks
 * (see rseq_percpu_pool_create() and __rseq_percpu_pool_destroy()).
 */
struct rseq_mmap_attr {
	/* Map @len bytes; returns the base address (NULL on error — TODO confirm). */
	void *(*mmap_func)(void *priv, size_t len);
	/* Unmap @len bytes at @ptr; returns 0 on success, -1 on error. */
	int (*munmap_func)(void *priv, void *ptr, size_t len);
	/* Opaque private data handed to both callbacks. */
	void *mmap_priv;
};
88 struct rseq_percpu_pool
{
97 * The free list chains freed items on the CPU 0 address range.
98 * We should rethink this decision if false sharing between
99 * malloc/free from other CPUs and data accesses from CPU 0
100 * becomes an issue. This is a NULL-terminated singly-linked
103 struct free_list_node
*free_list_head
;
105 /* This lock protects allocation/free within the pool. */
106 pthread_mutex_t lock
;
108 struct rseq_mmap_attr mmap_attr
;
110 /* Track alloc/free. */
111 unsigned long *alloc_bitmap
;
114 //TODO: the array of pools should grow dynamically on create.
115 static struct rseq_percpu_pool rseq_percpu_pool
[MAX_NR_POOLS
];
118 * Pool set entries are indexed by item_len rounded to the next power of
119 * 2. A pool set can contain NULL pool entries, in which case the next
120 * large enough entry will be used for allocation.
122 struct rseq_percpu_pool_set
{
123 /* This lock protects add vs malloc/zmalloc within the pool set. */
124 pthread_mutex_t lock
;
125 struct rseq_percpu_pool
*entries
[POOL_SET_NR_ENTRIES
];
129 void *__rseq_pool_percpu_ptr(struct rseq_percpu_pool
*pool
, int cpu
, uintptr_t item_offset
)
131 return pool
->base
+ (pool
->percpu_len
* cpu
) + item_offset
;
134 void *__rseq_percpu_ptr(void __rseq_percpu
*_ptr
, int cpu
)
136 uintptr_t ptr
= (uintptr_t) _ptr
;
137 uintptr_t item_offset
= ptr
& MAX_POOL_LEN_MASK
;
138 uintptr_t pool_index
= ptr
>> POOL_INDEX_SHIFT
;
139 struct rseq_percpu_pool
*pool
= &rseq_percpu_pool
[pool_index
];
142 return __rseq_pool_percpu_ptr(pool
, cpu
, item_offset
);
146 void rseq_percpu_zero_item(struct rseq_percpu_pool
*pool
, uintptr_t item_offset
)
150 for (i
= 0; i
< pool
->max_nr_cpus
; i
++) {
151 char *p
= __rseq_pool_percpu_ptr(pool
, i
, item_offset
);
152 memset(p
, 0, pool
->item_len
);
157 int rseq_percpu_pool_init_numa(struct rseq_percpu_pool
*pool
, int numa_flags
)
159 unsigned long nr_pages
, page
;
165 page_len
= rseq_get_page_len();
166 nr_pages
= pool
->percpu_len
>> rseq_get_count_order_ulong(page_len
);
167 for (cpu
= 0; cpu
< pool
->max_nr_cpus
; cpu
++) {
168 int node
= numa_node_of_cpu(cpu
);
170 /* TODO: batch move_pages() call with an array of pages. */
171 for (page
= 0; page
< nr_pages
; page
++) {
172 void *pageptr
= __rseq_pool_percpu_ptr(pool
, cpu
, page
* page_len
);
175 ret
= move_pages(0, 1, &pageptr
, &node
, &status
, numa_flags
);
/*
 * No-op stub used when NUMA support is compiled out.
 *
 * NOTE(review): this variant is declared void while the NUMA-enabled
 * variant returns int — confirm the public header exposes a single
 * consistent prototype for both configurations.
 */
void rseq_percpu_pool_init_numa(struct rseq_percpu_pool *pool __attribute__((unused)),
		int numa_flags __attribute__((unused)))
{
}
191 void *default_mmap_func(void *priv
__attribute__((unused
)), size_t len
)
195 base
= mmap(NULL
, len
, PROT_READ
| PROT_WRITE
,
196 MAP_ANONYMOUS
| MAP_PRIVATE
, -1, 0);
197 if (base
== MAP_FAILED
)
/* Default pool unmapping: plain munmap(2); returns 0 on success, -1 on error. */
int default_munmap_func(void *priv __attribute__((unused)), void *ptr, size_t len)
{
	return munmap(ptr, len);
}
209 int create_alloc_bitmap(struct rseq_percpu_pool
*pool
)
213 count
= ((pool
->percpu_len
>> pool
->item_order
) + BIT_PER_ULONG
- 1) / BIT_PER_ULONG
;
216 * Not being able to create the validation bitmap is an error
217 * that needs to be reported.
219 pool
->alloc_bitmap
= calloc(count
, sizeof(unsigned long));
220 if (!pool
->alloc_bitmap
)
225 /* Always inline for __builtin_return_address(0). */
226 static inline __attribute__((always_inline
))
227 void check_free_list(const struct rseq_percpu_pool
*pool
)
229 size_t total_item
= pool
->percpu_len
>> pool
->item_order
;
230 size_t total_never_allocated
= (pool
->percpu_len
- pool
->next_unused
) >> pool
->item_order
;
231 size_t total_freed
= 0;
232 size_t max_list_traversal
= total_item
- total_never_allocated
;
233 size_t traversal_iteration
= 0;
235 for (struct free_list_node
*node
= pool
->free_list_head
, *prev
= NULL
;
240 void *node_addr
= node
;
242 if (traversal_iteration
>= max_list_traversal
) {
243 fprintf(stderr
, "%s: Corrupted free-list; Possibly infinite loop in pool %p, caller %p.\n",
244 __func__
, pool
, __builtin_return_address(0));
248 /* Node is out of range. */
249 if ((node_addr
< pool
->base
) ||
250 (node_addr
>= pool
->base
+ pool
->next_unused
)) {
252 fprintf(stderr
, "%s: Corrupted free-list node %p -> [out-of-range %p] in pool %p, caller %p.\n",
253 __func__
, prev
, node
, pool
, __builtin_return_address(0));
255 fprintf(stderr
, "%s: Corrupted free-list node [out-of-range %p] in pool %p, caller %p.\n",
256 __func__
, node
, pool
, __builtin_return_address(0));
260 traversal_iteration
+= 1;
264 if (total_never_allocated
+ total_freed
!= total_item
) {
265 fprintf(stderr
, "%s: Corrupted free-list in pool %p; total-item: %zu total-never-used: %zu total-freed: %zu, caller %p.\n",
266 __func__
, pool
, total_item
, total_never_allocated
, total_freed
, __builtin_return_address(0));
272 /* Always inline for __builtin_return_address(0). */
273 static inline __attribute__((always_inline
))
274 void destroy_alloc_bitmap(struct rseq_percpu_pool
*pool
)
276 unsigned long *bitmap
= pool
->alloc_bitmap
;
277 size_t count
, total_leaks
= 0;
282 count
= ((pool
->percpu_len
>> pool
->item_order
) + BIT_PER_ULONG
- 1) / BIT_PER_ULONG
;
284 /* Assert that all items in the pool were freed. */
285 for (size_t k
= 0; k
< count
; ++k
)
286 total_leaks
+= rseq_hweight_ulong(bitmap
[k
]);
288 fprintf(stderr
, "%s: Pool has %zu leaked items on destroy, caller: %p.\n",
289 __func__
, total_leaks
, (void *) __builtin_return_address(0));
293 check_free_list(pool
);
298 /* Always inline for __builtin_return_address(0). */
299 static inline __attribute__((always_inline
))
300 int __rseq_percpu_pool_destroy(struct rseq_percpu_pool
*pool
)
310 * This must be done before releasing pool->base for checking the
313 destroy_alloc_bitmap(pool
);
314 ret
= pool
->mmap_attr
.munmap_func(pool
->mmap_attr
.mmap_priv
, pool
->base
,
315 pool
->percpu_len
* pool
->max_nr_cpus
);
318 pthread_mutex_destroy(&pool
->lock
);
319 memset(pool
, 0, sizeof(*pool
));
324 int rseq_percpu_pool_destroy(struct rseq_percpu_pool
*pool
)
328 pthread_mutex_lock(&pool_lock
);
329 ret
= __rseq_percpu_pool_destroy(pool
);
330 pthread_mutex_unlock(&pool_lock
);
334 struct rseq_percpu_pool
*rseq_percpu_pool_create(size_t item_len
,
335 size_t percpu_len
, int max_nr_cpus
,
336 const struct rseq_mmap_attr
*mmap_attr
,
339 void *(*mmap_func
)(void *priv
, size_t len
);
340 int (*munmap_func
)(void *priv
, void *ptr
, size_t len
);
342 struct rseq_percpu_pool
*pool
;
347 if (flags
& ~RSEQ_POOL_FLAGS
) {
352 /* Make sure each item is large enough to contain free list pointers. */
353 if (item_len
< sizeof(void *))
354 item_len
= sizeof(void *);
356 /* Align item_len on next power of two. */
357 order
= rseq_get_count_order_ulong(item_len
);
362 item_len
= 1UL << order
;
364 /* Align percpu_len on page size. */
365 percpu_len
= rseq_align(percpu_len
, rseq_get_page_len());
367 if (max_nr_cpus
< 0 || item_len
> percpu_len
||
368 percpu_len
> (UINTPTR_MAX
>> POOL_INDEX_BITS
)) {
374 mmap_func
= mmap_attr
->mmap_func
;
375 munmap_func
= mmap_attr
->munmap_func
;
376 mmap_priv
= mmap_attr
->mmap_priv
;
378 mmap_func
= default_mmap_func
;
379 munmap_func
= default_munmap_func
;
382 pthread_mutex_lock(&pool_lock
);
383 /* Linear scan in array of pools to find empty spot. */
384 for (i
= FIRST_POOL
; i
< MAX_NR_POOLS
; i
++) {
385 pool
= &rseq_percpu_pool
[i
];
394 base
= mmap_func(mmap_priv
, percpu_len
* max_nr_cpus
);
397 pthread_mutex_init(&pool
->lock
, NULL
);
399 pool
->percpu_len
= percpu_len
;
400 pool
->max_nr_cpus
= max_nr_cpus
;
402 pool
->item_len
= item_len
;
403 pool
->item_order
= order
;
404 pool
->mmap_attr
.mmap_func
= mmap_func
;
405 pool
->mmap_attr
.munmap_func
= munmap_func
;
406 pool
->mmap_attr
.mmap_priv
= mmap_priv
;
408 if (RSEQ_POOL_ROBUST
& flags
) {
409 if (create_alloc_bitmap(pool
))
413 pthread_mutex_unlock(&pool_lock
);
417 __rseq_percpu_pool_destroy(pool
);
418 pthread_mutex_unlock(&pool_lock
);
423 /* Always inline for __builtin_return_address(0). */
424 static inline __attribute__((always_inline
))
425 void set_alloc_slot(struct rseq_percpu_pool
*pool
, size_t item_offset
)
427 unsigned long *bitmap
= pool
->alloc_bitmap
;
428 size_t item_index
= item_offset
>> pool
->item_order
;
435 k
= item_index
/ BIT_PER_ULONG
;
436 mask
= 1ULL << (item_index
% BIT_PER_ULONG
);
438 /* Print error if bit is already set. */
439 if (bitmap
[k
] & mask
) {
440 fprintf(stderr
, "%s: Allocator corruption detected for pool: %p, item offset: %zu, caller: %p.\n",
441 __func__
, pool
, item_offset
, (void *) __builtin_return_address(0));
448 void __rseq_percpu
*__rseq_percpu_malloc(struct rseq_percpu_pool
*pool
, bool zeroed
)
450 struct free_list_node
*node
;
451 uintptr_t item_offset
;
452 void __rseq_percpu
*addr
;
454 pthread_mutex_lock(&pool
->lock
);
455 /* Get first entry from free list. */
456 node
= pool
->free_list_head
;
458 /* Remove node from free list (update head). */
459 pool
->free_list_head
= node
->next
;
460 item_offset
= (uintptr_t) ((void *) node
- pool
->base
);
461 addr
= (void *) (((uintptr_t) pool
->index
<< POOL_INDEX_SHIFT
) | item_offset
);
464 if (pool
->next_unused
+ pool
->item_len
> pool
->percpu_len
) {
469 item_offset
= pool
->next_unused
;
470 addr
= (void *) (((uintptr_t) pool
->index
<< POOL_INDEX_SHIFT
) | item_offset
);
471 pool
->next_unused
+= pool
->item_len
;
472 set_alloc_slot(pool
, item_offset
);
474 pthread_mutex_unlock(&pool
->lock
);
476 rseq_percpu_zero_item(pool
, item_offset
);
480 void __rseq_percpu
*rseq_percpu_malloc(struct rseq_percpu_pool
*pool
)
482 return __rseq_percpu_malloc(pool
, false);
485 void __rseq_percpu
*rseq_percpu_zmalloc(struct rseq_percpu_pool
*pool
)
487 return __rseq_percpu_malloc(pool
, true);
490 /* Always inline for __builtin_return_address(0). */
491 static inline __attribute__((always_inline
))
492 void clear_alloc_slot(struct rseq_percpu_pool
*pool
, size_t item_offset
)
494 unsigned long *bitmap
= pool
->alloc_bitmap
;
495 size_t item_index
= item_offset
>> pool
->item_order
;
502 k
= item_index
/ BIT_PER_ULONG
;
503 mask
= 1ULL << (item_index
% BIT_PER_ULONG
);
505 /* Print error if bit is not set. */
506 if (!(bitmap
[k
] & mask
)) {
507 fprintf(stderr
, "%s: Double-free detected for pool: %p, item offset: %zu, caller: %p.\n",
508 __func__
, pool
, item_offset
, (void *) __builtin_return_address(0));
514 void rseq_percpu_free(void __rseq_percpu
*_ptr
)
516 uintptr_t ptr
= (uintptr_t) _ptr
;
517 uintptr_t item_offset
= ptr
& MAX_POOL_LEN_MASK
;
518 uintptr_t pool_index
= ptr
>> POOL_INDEX_SHIFT
;
519 struct rseq_percpu_pool
*pool
= &rseq_percpu_pool
[pool_index
];
520 struct free_list_node
*head
, *item
;
522 pthread_mutex_lock(&pool
->lock
);
523 clear_alloc_slot(pool
, item_offset
);
524 /* Add ptr to head of free list */
525 head
= pool
->free_list_head
;
526 /* Free-list is in CPU 0 range. */
527 item
= (struct free_list_node
*)__rseq_pool_percpu_ptr(pool
, 0, item_offset
);
529 pool
->free_list_head
= item
;
530 pthread_mutex_unlock(&pool
->lock
);
533 struct rseq_percpu_pool_set
*rseq_percpu_pool_set_create(void)
535 struct rseq_percpu_pool_set
*pool_set
;
537 pool_set
= calloc(1, sizeof(struct rseq_percpu_pool_set
));
540 pthread_mutex_init(&pool_set
->lock
, NULL
);
544 int rseq_percpu_pool_set_destroy(struct rseq_percpu_pool_set
*pool_set
)
548 for (order
= POOL_SET_MIN_ENTRY
; order
< POOL_SET_NR_ENTRIES
; order
++) {
549 struct rseq_percpu_pool
*pool
= pool_set
->entries
[order
];
553 ret
= rseq_percpu_pool_destroy(pool
);
556 pool_set
->entries
[order
] = NULL
;
558 pthread_mutex_destroy(&pool_set
->lock
);
563 /* Ownership of pool is handed over to pool set on success. */
564 int rseq_percpu_pool_set_add_pool(struct rseq_percpu_pool_set
*pool_set
, struct rseq_percpu_pool
*pool
)
566 size_t item_order
= pool
->item_order
;
569 pthread_mutex_lock(&pool_set
->lock
);
570 if (pool_set
->entries
[item_order
]) {
575 pool_set
->entries
[pool
->item_order
] = pool
;
577 pthread_mutex_unlock(&pool_set
->lock
);
582 void __rseq_percpu
*__rseq_percpu_pool_set_malloc(struct rseq_percpu_pool_set
*pool_set
, size_t len
, bool zeroed
)
584 int order
, min_order
= POOL_SET_MIN_ENTRY
;
585 struct rseq_percpu_pool
*pool
;
586 void __rseq_percpu
*addr
;
588 order
= rseq_get_count_order_ulong(len
);
589 if (order
> POOL_SET_MIN_ENTRY
)
592 pthread_mutex_lock(&pool_set
->lock
);
593 /* First smallest present pool where @len fits. */
594 for (order
= min_order
; order
< POOL_SET_NR_ENTRIES
; order
++) {
595 pool
= pool_set
->entries
[order
];
599 if (pool
->item_len
>= len
)
604 pthread_mutex_unlock(&pool_set
->lock
);
606 addr
= __rseq_percpu_malloc(pool
, zeroed
);
607 if (addr
== NULL
&& errno
== ENOMEM
) {
609 * If the allocation failed, try again with a
612 min_order
= order
+ 1;
623 void __rseq_percpu
*rseq_percpu_pool_set_malloc(struct rseq_percpu_pool_set
*pool_set
, size_t len
)
625 return __rseq_percpu_pool_set_malloc(pool_set
, len
, false);
628 void __rseq_percpu
*rseq_percpu_pool_set_zmalloc(struct rseq_percpu_pool_set
*pool_set
, size_t len
)
630 return __rseq_percpu_pool_set_malloc(pool_set
, len
, true);
633 struct rseq_mmap_attr
*rseq_mmap_attr_create(void *(*mmap_func
)(void *priv
, size_t len
),
634 int (*munmap_func
)(void *priv
, void *ptr
, size_t len
),
637 struct rseq_mmap_attr
*attr
= calloc(1, sizeof(struct rseq_mmap_attr
));
641 attr
->mmap_func
= mmap_func
;
642 attr
->munmap_func
= munmap_func
;
643 attr
->mmap_priv
= mmap_priv
;
647 void rseq_mmap_attr_destroy(struct rseq_mmap_attr
*attr
)