// SPDX-License-Identifier: MIT
// SPDX-FileCopyrightText: 2024 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>

#include <rseq/mempool.h>
#include <sys/mman.h>
#include <assert.h>
#include <string.h>
#include <pthread.h>
#include <unistd.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
#include <rseq/compiler.h>

#include "rseq-utils.h"
/*
 * rseq-mempool.c: rseq CPU-Local Storage (CLS) memory allocator.
 *
 * The rseq per-CPU memory allocator allows the application to request
 * memory pools of CPU-Local memory, each pool containing objects of a
 * given size (rounded up to the next power of 2), reserving a given
 * virtual address size per CPU, for a given maximum number of CPUs.
 *
 * The per-CPU memory allocator is analogous to TLS (Thread-Local
 * Storage) memory: TLS is Thread-Local Storage, whereas the per-CPU
 * memory allocator provides CPU-Local Storage.
 */
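/*
 * Illustrative usage sketch (not part of this file). It assumes the
 * public accessors rseq_percpu_ptr() and rseq_mempool_percpu_free()
 * declared in <rseq/mempool.h>, and a hypothetical "struct my_counter"
 * item type:
 *
 *	struct rseq_mempool *pool;
 *	struct my_counter __rseq_percpu *counters;
 *
 *	pool = rseq_mempool_create("counters", sizeof(struct my_counter), NULL);
 *	counters = (struct my_counter __rseq_percpu *)
 *			rseq_mempool_percpu_zmalloc(pool);
 *	// Access the copy belonging to a given CPU:
 *	rseq_percpu_ptr(counters, cpu)->value++;
 *	rseq_mempool_percpu_free(counters);
 *	rseq_mempool_destroy(pool);
 */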
#define POOL_SET_NR_ENTRIES	RSEQ_BITS_PER_LONG
/*
 * Smallest allocation should hold enough space for a free list pointer.
 */
#if RSEQ_BITS_PER_LONG == 64
# define POOL_SET_MIN_ENTRY	3	/* Smallest item_len=8 */
#else
# define POOL_SET_MIN_ENTRY	2	/* Smallest item_len=4 */
#endif
/*
 * Skip pool index 0 to ensure allocated entries at index 0 do not match
 * a NULL pointer.
 */
#define BIT_PER_ULONG		(8 * sizeof(unsigned long))

#define MOVE_PAGES_BATCH_SIZE	4096

#define RANGE_HEADER_OFFSET	sizeof(struct rseq_mempool_range)
struct free_list_node;

struct free_list_node {
	struct free_list_node *next;
};
enum mempool_type {
	MEMPOOL_TYPE_PERCPU = 0,	/* Default */
	MEMPOOL_TYPE_GLOBAL = 1,
};
struct rseq_mempool_attr {
	bool mmap_set;
	void *(*mmap_func)(void *priv, size_t len);
	int (*munmap_func)(void *priv, void *ptr, size_t len);
	void *mmap_priv;

	bool robust_set;

	enum mempool_type type;
	size_t stride;
	int max_nr_cpus;
};
struct rseq_mempool_range;
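/*
 * Per-range bookkeeping. The struct rseq_mempool_range header is stored
 * in the page that immediately precedes the stride-aligned per-CPU
 * mapping, RANGE_HEADER_OFFSET bytes before the range base, which is how
 * the free path recovers the range (and its pool) from an item pointer.
 */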
struct rseq_mempool_range {
	struct rseq_mempool_range *next;
	struct rseq_mempool *pool;	/* Backward ref. to container pool. */
	void *header;
	void *base;
	size_t next_unused;
	/* Track alloc/free. */
	unsigned long *alloc_bitmap;
};
struct rseq_mempool {
	/* Linked-list of ranges. */
	struct rseq_mempool_range *ranges;

	size_t item_len;
	int item_order;

	/*
	 * The free list chains freed items on the CPU 0 address range.
	 * We should rethink this decision if false sharing between
	 * malloc/free from other CPUs and data accesses from CPU 0
	 * becomes an issue. This is a NULL-terminated singly-linked
	 * list.
	 */
	struct free_list_node *free_list_head;

	/* This lock protects allocation/free within the pool. */
	pthread_mutex_t lock;

	struct rseq_mempool_attr attr;
	char *name;
};
/*
 * Pool set entries are indexed by item_len rounded to the next power of
 * 2. A pool set can contain NULL pool entries, in which case the next
 * large enough entry will be used for allocation.
 */
struct rseq_mempool_set {
	/* This lock protects add vs malloc/zmalloc within the pool set. */
	pthread_mutex_t lock;
	struct rseq_mempool *entries[POOL_SET_NR_ENTRIES];
};
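/*
 * Per-CPU address computation for an item within a pool range:
 *
 *	addr = range base + (cpu * stride) + item_offset
 *
 * e.g. with stride 0x100000, item offset 0x40 on CPU 2 resolves to
 * base + 0x200040.
 */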
static
void *__rseq_pool_percpu_ptr(struct rseq_mempool *pool, int cpu,
		uintptr_t item_offset, size_t stride)
{
	/* TODO: Implement multi-ranges support. */
	return pool->ranges->base + (stride * cpu) + item_offset;
}
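/*
 * Zero the item at @item_offset within every CPU's copy of the pool, so
 * zmalloc'd items start out cleared on all CPUs.
 */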
static
void rseq_percpu_zero_item(struct rseq_mempool *pool, uintptr_t item_offset)
{
	int i;

	for (i = 0; i < pool->attr.max_nr_cpus; i++) {
		char *p = __rseq_pool_percpu_ptr(pool, i,
				item_offset, pool->attr.stride);
		memset(p, 0, pool->item_len);
	}
}
//TODO: this will need to be reimplemented for ranges,
//which cannot use __rseq_pool_percpu_ptr.
#if 0	//#ifdef HAVE_LIBNUMA
static
int rseq_mempool_range_init_numa(struct rseq_mempool *pool, struct rseq_mempool_range *range, int numa_flags)
{
	unsigned long nr_pages, page_len;
	long ret;
	int cpu;

	if (!numa_flags)
		return 0;

	page_len = rseq_get_page_len();
	nr_pages = pool->attr.stride >> rseq_get_count_order_ulong(page_len);
	for (cpu = 0; cpu < pool->attr.max_nr_cpus; cpu++) {

		int status[MOVE_PAGES_BATCH_SIZE];
		int nodes[MOVE_PAGES_BATCH_SIZE];
		void *pages[MOVE_PAGES_BATCH_SIZE];

		nodes[0] = numa_node_of_cpu(cpu);
		for (size_t k = 1; k < RSEQ_ARRAY_SIZE(nodes); ++k) {
			nodes[k] = nodes[0];
		}

		for (unsigned long page = 0; page < nr_pages;) {

			size_t max_k = RSEQ_ARRAY_SIZE(pages);
			size_t left = nr_pages - page;

			if (left < max_k)
				max_k = left;

			for (size_t k = 0; k < max_k; ++k, ++page) {
				pages[k] = __rseq_pool_percpu_ptr(pool, cpu, page * page_len);
			}

			ret = move_pages(0, max_k, pages, nodes, status, numa_flags);
			if (ret < 0)
				return ret;

			if (ret > 0) {
				fprintf(stderr, "%lu pages were not migrated\n", ret);
				for (size_t k = 0; k < max_k; ++k) {
					if (status[k] < 0)
						fprintf(stderr,
							"Error while moving page %p to numa node %d: %u\n",
							pages[k], nodes[k], -status[k]);
				}
			}
		}
	}
	return 0;
}
int rseq_mempool_init_numa(struct rseq_mempool *pool, int numa_flags)
{
	struct rseq_mempool_range *range;
	int ret;

	for (range = pool->ranges; range; range = range->next) {
		ret = rseq_mempool_range_init_numa(pool, range, numa_flags);
		if (ret)
			return ret;
	}
	return 0;
}
#else
int rseq_mempool_init_numa(struct rseq_mempool *pool __attribute__((unused)),
		int numa_flags __attribute__((unused)))
{
	return 0;
}
#endif
static
void *default_mmap_func(void *priv __attribute__((unused)), size_t len)
{
	void *base;

	base = mmap(NULL, len, PROT_READ | PROT_WRITE,
			MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	if (base == MAP_FAILED)
		return NULL;
	return base;
}
static
int default_munmap_func(void *priv __attribute__((unused)), void *ptr, size_t len)
{
	return munmap(ptr, len);
}
static
int create_alloc_bitmap(struct rseq_mempool *pool, struct rseq_mempool_range *range)
{
	size_t count;

	count = ((pool->attr.stride >> pool->item_order) + BIT_PER_ULONG - 1) / BIT_PER_ULONG;

	/*
	 * Not being able to create the validation bitmap is an error
	 * that needs to be reported.
	 */
	range->alloc_bitmap = calloc(count, sizeof(unsigned long));
	if (!range->alloc_bitmap)
		return -1;
	return 0;
}
static
const char *get_pool_name(const struct rseq_mempool *pool)
{
	return pool->name ? : "<anonymous>";
}
static
bool addr_in_pool(const struct rseq_mempool *pool, void *addr)
{
	struct rseq_mempool_range *range;

	for (range = pool->ranges; range; range = range->next) {
		if (addr >= range->base && addr < range->base + range->next_unused)
			return true;
	}
	return false;
}
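/*
 * Robustness check run on pool destruction: walk the free list, bounding
 * the traversal by the number of items handed out so that a corrupted
 * (cyclic) list cannot loop forever, and verify that every item ever
 * allocated has been freed.
 */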
/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void check_free_list(const struct rseq_mempool *pool)
{
	size_t total_item = 0, total_never_allocated = 0, total_freed = 0,
		max_list_traversal = 0, traversal_iteration = 0;
	struct rseq_mempool_range *range;

	if (!pool->attr.robust_set)
		return;

	for (range = pool->ranges; range; range = range->next) {
		total_item += pool->attr.stride >> pool->item_order;
		total_never_allocated += (pool->attr.stride - range->next_unused) >> pool->item_order;
	}
	max_list_traversal = total_item - total_never_allocated;

	for (struct free_list_node *node = pool->free_list_head, *prev = NULL;
	     node;
	     prev = node, node = node->next) {
		void *node_addr = node;

		if (traversal_iteration >= max_list_traversal) {
			fprintf(stderr, "%s: Corrupted free-list; Possibly infinite loop in pool \"%s\" (%p), caller %p.\n",
				__func__, get_pool_name(pool), pool, __builtin_return_address(0));
			abort();
		}

		/* Node is out of range. */
		if (!addr_in_pool(pool, node_addr)) {
			if (prev)
				fprintf(stderr, "%s: Corrupted free-list node %p -> [out-of-range %p] in pool \"%s\" (%p), caller %p.\n",
					__func__, prev, node, get_pool_name(pool), pool, __builtin_return_address(0));
			else
				fprintf(stderr, "%s: Corrupted free-list node [out-of-range %p] in pool \"%s\" (%p), caller %p.\n",
					__func__, node, get_pool_name(pool), pool, __builtin_return_address(0));
			abort();
		}

		traversal_iteration++;
		total_freed++;
	}

	if (total_never_allocated + total_freed != total_item) {
		fprintf(stderr, "%s: Corrupted free-list in pool \"%s\" (%p); total-item: %zu total-never-used: %zu total-freed: %zu, caller %p.\n",
			__func__, get_pool_name(pool), pool, total_item, total_never_allocated, total_freed, __builtin_return_address(0));
		abort();
	}
}
/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void destroy_alloc_bitmap(struct rseq_mempool *pool, struct rseq_mempool_range *range)
{
	unsigned long *bitmap = range->alloc_bitmap;
	size_t count, total_leaks = 0;

	if (!bitmap)
		return;

	count = ((pool->attr.stride >> pool->item_order) + BIT_PER_ULONG - 1) / BIT_PER_ULONG;

	/* Assert that all items in the pool were freed. */
	for (size_t k = 0; k < count; ++k)
		total_leaks += rseq_hweight_ulong(bitmap[k]);
	if (total_leaks) {
		fprintf(stderr, "%s: Pool \"%s\" (%p) has %zu leaked items on destroy, caller: %p.\n",
			__func__, get_pool_name(pool), pool, total_leaks, (void *) __builtin_return_address(0));
		abort();
	}

	free(bitmap);
}
/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
int rseq_mempool_range_destroy(struct rseq_mempool *pool,
		struct rseq_mempool_range *range)
{
	destroy_alloc_bitmap(pool, range);
	/* range is a header located one page before the aligned mapping. */
	return pool->attr.munmap_func(pool->attr.mmap_priv, range->header,
			(pool->attr.stride * pool->attr.max_nr_cpus) + rseq_get_page_len());
}
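/*
 * Alignment is obtained by over-allocating: the anonymous mapping is
 * extended by (alignment - page_size) bytes, then the unaligned head and
 * the unused tail are unmapped. For example, requesting len = 2 pages
 * aligned on 4 pages with a 1-page pre-header maps 6 pages
 * (minimum_page_count = 3, page_count = 6) and trims back down to the
 * 3 pages actually needed.
 */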
/*
 * Allocate a memory mapping aligned on @alignment, with an optional
 * @pre_header before the mapping.
 */
static
void *aligned_mmap_anonymous(struct rseq_mempool *pool,
		size_t page_size, size_t len, size_t alignment,
		void **pre_header, size_t pre_header_len)
{
	size_t minimum_page_count, page_count, extra, total_allocate = 0;
	int page_order;
	void *ptr;

	if (len < page_size || alignment < page_size ||
			!is_pow2(len) || !is_pow2(alignment)) {
		errno = EINVAL;
		return NULL;
	}
	page_order = rseq_get_count_order_ulong(page_size);
	if (page_order < 0) {
		errno = EINVAL;
		return NULL;
	}
	if (pre_header_len && (pre_header_len & (page_size - 1))) {
		errno = EINVAL;
		return NULL;
	}

	minimum_page_count = (pre_header_len + len) >> page_order;
	page_count = (pre_header_len + len + alignment - page_size) >> page_order;

	assert(page_count >= minimum_page_count);

	ptr = pool->attr.mmap_func(pool->attr.mmap_priv, page_count << page_order);
	if (!ptr)
		return NULL;

	total_allocate = page_count << page_order;

	if (!(((uintptr_t) ptr + pre_header_len) & (alignment - 1))) {
		/* Pointer is already aligned. ptr points to pre_header. */
		goto out;
	}

	/* Unmap extra before. */
	extra = offset_align((uintptr_t) ptr + pre_header_len, alignment);
	assert(!(extra & (page_size - 1)));
	if (pool->attr.munmap_func(pool->attr.mmap_priv, ptr, extra)) {
		perror("munmap");
		abort();
	}
	total_allocate -= extra;
	ptr += extra;	/* ptr points to pre_header */
	page_count -= extra >> page_order;
out:
	assert(page_count >= minimum_page_count);

	if (page_count > minimum_page_count) {
		void *extra_ptr;

		/* Unmap extra after. */
		extra_ptr = ptr + (minimum_page_count << page_order);
		extra = (page_count - minimum_page_count) << page_order;
		if (pool->attr.munmap_func(pool->attr.mmap_priv, extra_ptr, extra)) {
			perror("munmap");
			abort();
		}
		total_allocate -= extra;
	}

	assert(!(((uintptr_t)ptr + pre_header_len) & (alignment - 1)));
	assert(total_allocate == len + pre_header_len);

	if (pre_header)
		*pre_header = ptr;
	ptr += pre_header_len;
	return ptr;
}
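/*
 * A pool range reserves max_nr_cpus contiguous copies of @stride bytes of
 * virtual address space, aligned on @stride, preceded by one extra page
 * that holds the struct rseq_mempool_range header.
 */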
static
struct rseq_mempool_range *rseq_mempool_range_create(struct rseq_mempool *pool)
{
	struct rseq_mempool_range *range;
	unsigned long page_size;
	void *header;
	void *base;

	page_size = rseq_get_page_len();

	base = aligned_mmap_anonymous(pool, page_size,
			pool->attr.stride * pool->attr.max_nr_cpus,
			pool->attr.stride,
			&header, page_size);
	if (!base)
		return NULL;
	range = (struct rseq_mempool_range *) (base - RANGE_HEADER_OFFSET);
	range->pool = pool;
	range->base = base;
	range->header = header;
	if (pool->attr.robust_set) {
		if (create_alloc_bitmap(pool, range))
			goto error_alloc;
	}
	return range;

error_alloc:
	(void) rseq_mempool_range_destroy(pool, range);
	return NULL;
}
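/*
 * Destroying a pool runs the robustness checks (free-list consistency and
 * leak detection through the allocation bitmap) before unmapping each
 * range and releasing the pool metadata.
 */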
int rseq_mempool_destroy(struct rseq_mempool *pool)
{
	struct rseq_mempool_range *range, *next_range;

	if (!pool)
		return 0;
	check_free_list(pool);
	/* Iteration safe against removal. */
	for (range = pool->ranges; range && (next_range = range->next, 1); range = next_range) {
		if (rseq_mempool_range_destroy(pool, range))
			return -1;
		/* Update list head to keep list coherent in case of partial failure. */
		pool->ranges = next_range;
	}
	pthread_mutex_destroy(&pool->lock);
	free(pool->name);
	memset(pool, 0, sizeof(*pool));
	free(pool);
	return 0;
}
struct rseq_mempool *rseq_mempool_create(const char *pool_name,
		size_t item_len, const struct rseq_mempool_attr *_attr)
{
	struct rseq_mempool *pool;
	struct rseq_mempool_attr attr = {};
	int order;

	/* Make sure each item is large enough to contain free list pointers. */
	if (item_len < sizeof(void *))
		item_len = sizeof(void *);

	/* Align item_len on next power of two. */
	order = rseq_get_count_order_ulong(item_len);
	if (order < 0) {
		errno = EINVAL;
		return NULL;
	}
	item_len = 1UL << order;

	if (_attr)
		memcpy(&attr, _attr, sizeof(attr));
	if (!attr.mmap_set) {
		attr.mmap_func = default_mmap_func;
		attr.munmap_func = default_munmap_func;
		attr.mmap_priv = NULL;
	}

	switch (attr.type) {
	case MEMPOOL_TYPE_PERCPU:
		if (attr.max_nr_cpus < 0) {
			errno = EINVAL;
			return NULL;
		}
		if (attr.max_nr_cpus == 0) {
			/* Auto-detect number of possible CPUs. */
			attr.max_nr_cpus = get_possible_cpus_array_len();
			if (attr.max_nr_cpus == 0) {
				errno = EINVAL;
				return NULL;
			}
		}
		break;
	case MEMPOOL_TYPE_GLOBAL:
		break;
	}
	if (!attr.stride)
		attr.stride = RSEQ_MEMPOOL_STRIDE;	/* Use default */
	if (item_len > attr.stride || attr.stride < (size_t) rseq_get_page_len() ||
			!is_pow2(attr.stride)) {
		errno = EINVAL;
		return NULL;
	}

	pool = calloc(1, sizeof(struct rseq_mempool));
	if (!pool)
		return NULL;

	memcpy(&pool->attr, &attr, sizeof(attr));
	pthread_mutex_init(&pool->lock, NULL);
	pool->item_len = item_len;
	pool->item_order = order;

	//TODO: implement multi-range support.
	pool->ranges = rseq_mempool_range_create(pool);
	if (!pool->ranges)
		goto error_pool;

	if (pool_name) {
		pool->name = strdup(pool_name);
		if (!pool->name)
			goto error_pool;
	}
	return pool;

error_pool:
	rseq_mempool_destroy(pool);
	return NULL;
}
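/*
 * Allocation tracking (robust pools only): each range carries a bitmap
 * with one bit per item (stride >> item_order bits). set_alloc_slot() and
 * clear_alloc_slot() use it to flag corruption such as double-free, and
 * destroy_alloc_bitmap() reports items leaked at pool destruction.
 */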
/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void set_alloc_slot(struct rseq_mempool *pool, size_t item_offset)
{
	unsigned long *bitmap = pool->ranges->alloc_bitmap;
	size_t item_index = item_offset >> pool->item_order;
	unsigned long mask;
	size_t k;

	if (!bitmap)
		return;

	k = item_index / BIT_PER_ULONG;
	mask = 1ULL << (item_index % BIT_PER_ULONG);

	/* Print error if bit is already set. */
	if (bitmap[k] & mask) {
		fprintf(stderr, "%s: Allocator corruption detected for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
			__func__, get_pool_name(pool), pool, item_offset, (void *) __builtin_return_address(0));
		abort();
	}
	bitmap[k] |= mask;
}
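/*
 * Allocation returns an offset-based per-CPU pointer: the same item
 * offset is valid within each CPU's copy of the range. Items are taken
 * from the free list first; otherwise the range's next_unused watermark
 * is bumped by item_len, and ENOMEM is reported once the watermark would
 * exceed the per-CPU stride.
 */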
static
void __rseq_percpu *__rseq_percpu_malloc(struct rseq_mempool *pool, bool zeroed)
{
	struct free_list_node *node;
	uintptr_t item_offset;
	void __rseq_percpu *addr;

	pthread_mutex_lock(&pool->lock);
	/* Get first entry from free list. */
	node = pool->free_list_head;
	if (node != NULL) {
		/* Remove node from free list (update head). */
		pool->free_list_head = node->next;
		item_offset = (uintptr_t) ((void *) node - pool->ranges->base);
		addr = (void __rseq_percpu *) (pool->ranges->base + item_offset);
		goto end;
	}
	if (pool->ranges->next_unused + pool->item_len > pool->attr.stride) {
		errno = ENOMEM;
		addr = NULL;
		goto end;
	}
	item_offset = pool->ranges->next_unused;
	addr = (void __rseq_percpu *) (pool->ranges->base + item_offset);
	pool->ranges->next_unused += pool->item_len;
end:
	if (addr)
		set_alloc_slot(pool, item_offset);
	pthread_mutex_unlock(&pool->lock);
	if (zeroed && addr)
		rseq_percpu_zero_item(pool, item_offset);
	return addr;
}
void __rseq_percpu *rseq_mempool_percpu_malloc(struct rseq_mempool *pool)
{
	return __rseq_percpu_malloc(pool, false);
}

void __rseq_percpu *rseq_mempool_percpu_zmalloc(struct rseq_mempool *pool)
{
	return __rseq_percpu_malloc(pool, true);
}
/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void clear_alloc_slot(struct rseq_mempool *pool, size_t item_offset)
{
	unsigned long *bitmap = pool->ranges->alloc_bitmap;
	size_t item_index = item_offset >> pool->item_order;
	unsigned long mask;
	size_t k;

	if (!bitmap)
		return;

	k = item_index / BIT_PER_ULONG;
	mask = 1ULL << (item_index % BIT_PER_ULONG);

	/* Print error if bit is not set. */
	if (!(bitmap[k] & mask)) {
		fprintf(stderr, "%s: Double-free detected for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
			__func__, get_pool_name(pool), pool, item_offset,
			(void *) __builtin_return_address(0));
		abort();
	}
	bitmap[k] &= ~mask;
}
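/*
 * Freeing does not require the caller to pass the owning pool: since each
 * range is @stride aligned and its header sits RANGE_HEADER_OFFSET bytes
 * before the range base, both the range and its pool are recovered from
 * the item pointer. E.g. with stride 0x100000, an item at 0x7f0000234560
 * yields range_base 0x7f0000200000 and item_offset 0x34560.
 */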
void librseq_mempool_percpu_free(void __rseq_percpu *_ptr, size_t stride)
{
	uintptr_t ptr = (uintptr_t) _ptr;
	void *range_base = (void *) (ptr & (~(stride - 1)));
	struct rseq_mempool_range *range = (struct rseq_mempool_range *) (range_base - RANGE_HEADER_OFFSET);
	struct rseq_mempool *pool = range->pool;
	uintptr_t item_offset = ptr & (stride - 1);
	struct free_list_node *head, *item;

	pthread_mutex_lock(&pool->lock);
	clear_alloc_slot(pool, item_offset);
	/* Add ptr to head of free list */
	head = pool->free_list_head;
	/* Free-list is in CPU 0 range. */
	item = (struct free_list_node *) ptr;
	item->next = head;
	pool->free_list_head = item;
	pthread_mutex_unlock(&pool->lock);
}
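/*
 * Pool sets group pools of different item sizes behind a single
 * allocation front-end: set allocation picks the smallest registered pool
 * whose item_len can hold the requested length (see
 * __rseq_mempool_set_malloc() below).
 */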
struct rseq_mempool_set *rseq_mempool_set_create(void)
{
	struct rseq_mempool_set *pool_set;

	pool_set = calloc(1, sizeof(struct rseq_mempool_set));
	if (!pool_set)
		return NULL;
	pthread_mutex_init(&pool_set->lock, NULL);
	return pool_set;
}
int rseq_mempool_set_destroy(struct rseq_mempool_set *pool_set)
{
	int order, ret;

	for (order = POOL_SET_MIN_ENTRY; order < POOL_SET_NR_ENTRIES; order++) {
		struct rseq_mempool *pool = pool_set->entries[order];

		if (!pool)
			continue;
		ret = rseq_mempool_destroy(pool);
		if (ret)
			return ret;
		pool_set->entries[order] = NULL;
	}
	pthread_mutex_destroy(&pool_set->lock);
	free(pool_set);
	return 0;
}
/* Ownership of pool is handed over to pool set on success. */
int rseq_mempool_set_add_pool(struct rseq_mempool_set *pool_set, struct rseq_mempool *pool)
{
	size_t item_order = pool->item_order;
	int ret = 0;

	pthread_mutex_lock(&pool_set->lock);
	if (pool_set->entries[item_order]) {
		errno = EBUSY;
		ret = -1;
		goto end;
	}
	pool_set->entries[pool->item_order] = pool;
end:
	pthread_mutex_unlock(&pool_set->lock);
	return ret;
}
static
void __rseq_percpu *__rseq_mempool_set_malloc(struct rseq_mempool_set *pool_set, size_t len, bool zeroed)
{
	int order, min_order = POOL_SET_MIN_ENTRY;
	struct rseq_mempool *pool;
	void __rseq_percpu *addr;

	order = rseq_get_count_order_ulong(len);
	if (order > POOL_SET_MIN_ENTRY)
		min_order = order;
again:
	pthread_mutex_lock(&pool_set->lock);
	/* First smallest present pool where @len fits. */
	for (order = min_order; order < POOL_SET_NR_ENTRIES; order++) {
		pool = pool_set->entries[order];

		if (!pool)
			continue;
		if (pool->item_len >= len)
			goto found;
	}
	pool = NULL;
found:
	pthread_mutex_unlock(&pool_set->lock);
	if (pool) {
		addr = __rseq_percpu_malloc(pool, zeroed);
		if (addr == NULL && errno == ENOMEM) {
			/*
			 * If the allocation failed, try again with a
			 * larger pool.
			 */
			min_order = order + 1;
			goto again;
		}
	} else {
		/* Not found. */
		errno = ENOMEM;
		addr = NULL;
	}
	return addr;
}
void __rseq_percpu *rseq_mempool_set_percpu_malloc(struct rseq_mempool_set *pool_set, size_t len)
{
	return __rseq_mempool_set_malloc(pool_set, len, false);
}

void __rseq_percpu *rseq_mempool_set_percpu_zmalloc(struct rseq_mempool_set *pool_set, size_t len)
{
	return __rseq_mempool_set_malloc(pool_set, len, true);
}
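/*
 * Memory pool attributes. A caller typically builds an attribute object,
 * passes it to rseq_mempool_create(), then destroys it: the pool keeps
 * its own copy of the attribute contents, so the attribute object need
 * not outlive pool creation. Illustrative sketch (error checking
 * omitted):
 *
 *	struct rseq_mempool_attr *attr = rseq_mempool_attr_create();
 *
 *	rseq_mempool_attr_set_robust(attr);
 *	rseq_mempool_attr_set_percpu(attr, RSEQ_MEMPOOL_STRIDE, 0);
 *	pool = rseq_mempool_create("robust-pool", item_len, attr);
 *	rseq_mempool_attr_destroy(attr);
 */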
struct rseq_mempool_attr *rseq_mempool_attr_create(void)
{
	return calloc(1, sizeof(struct rseq_mempool_attr));
}

void rseq_mempool_attr_destroy(struct rseq_mempool_attr *attr)
{
	free(attr);
}
int rseq_mempool_attr_set_mmap(struct rseq_mempool_attr *attr,
		void *(*mmap_func)(void *priv, size_t len),
		int (*munmap_func)(void *priv, void *ptr, size_t len),
		void *mmap_priv)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->mmap_set = true;
	attr->mmap_func = mmap_func;
	attr->munmap_func = munmap_func;
	attr->mmap_priv = mmap_priv;
	return 0;
}
int rseq_mempool_attr_set_robust(struct rseq_mempool_attr *attr)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->robust_set = true;
	return 0;
}
int rseq_mempool_attr_set_percpu(struct rseq_mempool_attr *attr,
		size_t stride, int max_nr_cpus)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->type = MEMPOOL_TYPE_PERCPU;
	attr->stride = stride;
	attr->max_nr_cpus = max_nr_cpus;
	return 0;
}
int rseq_mempool_attr_set_global(struct rseq_mempool_attr *attr,
		size_t stride)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->type = MEMPOOL_TYPE_GLOBAL;
	attr->stride = stride;
	attr->max_nr_cpus = 1;
	return 0;
}