1 // SPDX-License-Identifier: MIT
2 // SPDX-FileCopyrightText: 2024 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
4 #include <rseq/mempool.h>
11 #include <rseq/compiler.h>
23 #include "rseq-utils.h"
24 #include <rseq/rseq.h>
27 * rseq-mempool.c: rseq CPU-Local Storage (CLS) memory allocator.
29 * The rseq per-CPU memory allocator allows the application to request
30 * memory pools of CPU-Local memory, each containing objects of a
31 * given size (rounded to next power of 2), reserving a given virtual
32 * address size per CPU, for a given maximum number of CPUs.
34 * The per-CPU memory allocator is analogous to TLS (Thread-Local
35 * Storage) memory: TLS is Thread-Local Storage, whereas the per-CPU
36 * memory allocator provides CPU-Local Storage.
39 #define POOL_SET_NR_ENTRIES RSEQ_BITS_PER_LONG
42 * Smallest allocation should hold enough space for a free list pointer.
44 #if RSEQ_BITS_PER_LONG == 64
45 # define POOL_SET_MIN_ENTRY 3 /* Smallest item_len=8 */
47 # define POOL_SET_MIN_ENTRY 2 /* Smallest item_len=4 */
50 #define BIT_PER_ULONG (8 * sizeof(unsigned long))
52 #define MOVE_PAGES_BATCH_SIZE 4096
54 #define RANGE_HEADER_OFFSET sizeof(struct rseq_mempool_range)
56 #if RSEQ_BITS_PER_LONG == 64
57 # define DEFAULT_POISON_VALUE 0x5555555555555555ULL
59 # define DEFAULT_POISON_VALUE 0x55555555UL
62 struct free_list_node
;
64 struct free_list_node
{
65 struct free_list_node
*next
;
69 MEMPOOL_TYPE_GLOBAL
= 0, /* Default */
70 MEMPOOL_TYPE_PERCPU
= 1,
73 struct rseq_mempool_attr
{
75 void *(*mmap_func
)(void *priv
, size_t len
);
76 int (*munmap_func
)(void *priv
, void *ptr
, size_t len
);
80 int (*init_func
)(void *priv
, void *addr
, size_t len
, int cpu
);
85 enum mempool_type type
;
89 unsigned long max_nr_ranges
;
94 enum rseq_mempool_populate_policy populate_policy
;
97 struct rseq_mempool_range
;
99 struct rseq_mempool_range
{
100 struct rseq_mempool_range
*next
; /* Linked list of ranges. */
101 struct rseq_mempool
*pool
; /* Backward reference to container pool. */
104 * Memory layout of a mempool range:
105 * - Header page (contains struct rseq_mempool_range at the very end),
106 * - Base of the per-cpu data, starting with CPU 0,
109 * - CPU max_nr_cpus - 1
110 * - init values (unpopulated for RSEQ_MEMPOOL_POPULATE_ALL).
115 * The init values contain malloc_init/zmalloc values.
116 * Pointer is NULL for RSEQ_MEMPOOL_POPULATE_ALL.
121 /* Pool range mmap/munmap */
125 /* Track alloc/free. */
126 unsigned long *alloc_bitmap
;
129 struct rseq_mempool
{
130 /* Head of ranges linked-list. */
131 struct rseq_mempool_range
*range_list
;
132 unsigned long nr_ranges
;
138 * The free list chains freed items on the CPU 0 address range.
139 * We should rethink this decision if false sharing between
140 * malloc/free from other CPUs and data accesses from CPU 0
141 * becomes an issue. This is a NULL-terminated singly-linked
144 struct free_list_node
*free_list_head
;
146 /* This lock protects allocation/free within the pool. */
147 pthread_mutex_t lock
;
149 struct rseq_mempool_attr attr
;
154 * Pool set entries are indexed by item_len rounded to the next power of
155 * 2. A pool set can contain NULL pool entries, in which case the next
156 * large enough entry will be used for allocation.
158 struct rseq_mempool_set
{
159 /* This lock protects add vs malloc/zmalloc within the pool set. */
160 pthread_mutex_t lock
;
161 struct rseq_mempool
*entries
[POOL_SET_NR_ENTRIES
];
165 * This memfd is used to implement the user COW behavior for the page
166 * protection scheme. memfd is a sparse virtual file. Its layout (in
167 * offset from beginning of file) matches the process address space
168 * (pointers directly converted to file offsets).
171 pthread_mutex_t lock
;
172 size_t reserved_size
;
173 unsigned int refcount
;
177 static struct rseq_memfd memfd
= {
178 .lock
= PTHREAD_MUTEX_INITIALIZER
,
185 const char *get_pool_name(const struct rseq_mempool
*pool
)
187 return pool
->name
? : "<anonymous>";
191 void *__rseq_pool_range_percpu_ptr(const struct rseq_mempool_range
*range
, int cpu
,
192 uintptr_t item_offset
, size_t stride
)
194 return range
->base
+ (stride
* cpu
) + item_offset
;
198 void *__rseq_pool_range_init_ptr(const struct rseq_mempool_range
*range
,
199 uintptr_t item_offset
)
203 return range
->init
+ item_offset
;
207 void __rseq_percpu
*__rseq_free_list_to_percpu_ptr(const struct rseq_mempool
*pool
,
208 struct free_list_node
*node
)
210 void __rseq_percpu
*p
= (void __rseq_percpu
*) node
;
212 if (pool
->attr
.populate_policy
!= RSEQ_MEMPOOL_POPULATE_ALL
)
213 p
-= pool
->attr
.max_nr_cpus
* pool
->attr
.stride
;
218 struct free_list_node
*__rseq_percpu_to_free_list_ptr(const struct rseq_mempool
*pool
,
219 void __rseq_percpu
*p
)
221 if (pool
->attr
.populate_policy
!= RSEQ_MEMPOOL_POPULATE_ALL
)
222 p
+= pool
->attr
.max_nr_cpus
* pool
->attr
.stride
;
223 return (struct free_list_node
*) p
;
/*
 * Convert pointer @p to a file offset: the integer value of the pointer
 * is used as-is as the offset.
 */
off_t ptr_to_off_t(void *p)
{
	uintptr_t addr = (uintptr_t) p;

	return (off_t) addr;
}
233 int memcmpbyte(const char *s
, int c
, size_t n
)
238 if ((res
= *(s
++) - c
) != 0)
244 void rseq_percpu_zero_item(struct rseq_mempool
*pool
,
245 struct rseq_mempool_range
*range
, uintptr_t item_offset
)
250 init_p
= __rseq_pool_range_init_ptr(range
, item_offset
);
252 memset(init_p
, 0, pool
->item_len
);
253 for (i
= 0; i
< pool
->attr
.max_nr_cpus
; i
++) {
254 char *p
= __rseq_pool_range_percpu_ptr(range
, i
,
255 item_offset
, pool
->attr
.stride
);
257 /* Update propagated */
258 if (init_p
&& !memcmpbyte(p
, 0, pool
->item_len
))
260 memset(p
, 0, pool
->item_len
);
265 void rseq_percpu_init_item(struct rseq_mempool
*pool
,
266 struct rseq_mempool_range
*range
, uintptr_t item_offset
,
267 void *init_ptr
, size_t init_len
)
272 init_p
= __rseq_pool_range_init_ptr(range
, item_offset
);
274 memcpy(init_p
, init_ptr
, init_len
);
275 for (i
= 0; i
< pool
->attr
.max_nr_cpus
; i
++) {
276 char *p
= __rseq_pool_range_percpu_ptr(range
, i
,
277 item_offset
, pool
->attr
.stride
);
279 /* Update propagated */
280 if (init_p
&& !memcmp(init_p
, p
, init_len
))
282 memcpy(p
, init_ptr
, init_len
);
287 void rseq_poison_item(void *p
, size_t item_len
, uintptr_t poison
)
291 for (offset
= 0; offset
< item_len
; offset
+= sizeof(uintptr_t))
292 *((uintptr_t *) (p
+ offset
)) = poison
;
296 void rseq_percpu_poison_item(struct rseq_mempool
*pool
,
297 struct rseq_mempool_range
*range
, uintptr_t item_offset
)
299 uintptr_t poison
= pool
->attr
.poison
;
303 init_p
= __rseq_pool_range_init_ptr(range
, item_offset
);
305 rseq_poison_item(init_p
, pool
->item_len
, poison
);
306 for (i
= 0; i
< pool
->attr
.max_nr_cpus
; i
++) {
307 char *p
= __rseq_pool_range_percpu_ptr(range
, i
,
308 item_offset
, pool
->attr
.stride
);
310 /* Update propagated */
311 if (init_p
&& !memcmp(init_p
, p
, pool
->item_len
))
313 rseq_poison_item(p
, pool
->item_len
, poison
);
317 /* Always inline for __builtin_return_address(0). */
318 static inline __attribute__((always_inline
))
319 void rseq_check_poison_item(const struct rseq_mempool
*pool
, uintptr_t item_offset
,
320 void *p
, size_t item_len
, uintptr_t poison
, bool skip_freelist_ptr
)
324 for (offset
= 0; offset
< item_len
; offset
+= sizeof(uintptr_t)) {
327 /* Skip poison check for free-list pointer. */
328 if (skip_freelist_ptr
&& offset
== 0)
330 v
= *((uintptr_t *) (p
+ offset
));
332 fprintf(stderr
, "%s: Poison corruption detected (0x%lx) for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
333 __func__
, (unsigned long) v
, get_pool_name(pool
), pool
, item_offset
, (void *) __builtin_return_address(0));
339 /* Always inline for __builtin_return_address(0). */
340 static inline __attribute__((always_inline
))
341 void rseq_percpu_check_poison_item(const struct rseq_mempool
*pool
,
342 const struct rseq_mempool_range
*range
, uintptr_t item_offset
)
344 uintptr_t poison
= pool
->attr
.poison
;
348 if (!pool
->attr
.robust_set
)
350 init_p
= __rseq_pool_range_init_ptr(range
, item_offset
);
352 rseq_check_poison_item(pool
, item_offset
, init_p
, pool
->item_len
, poison
, true);
353 for (i
= 0; i
< pool
->attr
.max_nr_cpus
; i
++) {
354 char *p
= __rseq_pool_range_percpu_ptr(range
, i
,
355 item_offset
, pool
->attr
.stride
);
357 * When the free list is embedded in the init values
358 * memory (populate none), it is visible from the init
359 * values memory mapping as well as per-cpu private
360 * mappings before they COW.
362 * When the free list is embedded in CPU 0 mapping
363 * (populate all), only this CPU must skip the free list
364 * nodes when checking poison.
366 rseq_check_poison_item(pool
, item_offset
, p
, pool
->item_len
, poison
,
367 init_p
== NULL
? (i
== 0) : true);
372 int rseq_mempool_range_init_numa(void *addr
, size_t len
, int cpu
, int numa_flags
)
374 unsigned long nr_pages
, page_len
;
375 int status
[MOVE_PAGES_BATCH_SIZE
];
376 int nodes
[MOVE_PAGES_BATCH_SIZE
];
377 void *pages
[MOVE_PAGES_BATCH_SIZE
];
384 page_len
= rseq_get_page_len();
385 nr_pages
= len
>> rseq_get_count_order_ulong(page_len
);
387 nodes
[0] = numa_node_of_cpu(cpu
);
391 for (size_t k
= 1; k
< RSEQ_ARRAY_SIZE(nodes
); ++k
) {
395 for (unsigned long page
= 0; page
< nr_pages
;) {
397 size_t max_k
= RSEQ_ARRAY_SIZE(pages
);
398 size_t left
= nr_pages
- page
;
404 for (size_t k
= 0; k
< max_k
; ++k
, ++page
) {
405 pages
[k
] = addr
+ (page
* page_len
);
409 ret
= move_pages(0, max_k
, pages
, nodes
, status
, numa_flags
);
415 fprintf(stderr
, "%lu pages were not migrated\n", ret
);
416 for (size_t k
= 0; k
< max_k
; ++k
) {
419 "Error while moving page %p to numa node %d: %u\n",
420 pages
[k
], nodes
[k
], -status
[k
]);
427 int rseq_mempool_range_init_numa(void *addr
__attribute__((unused
)),
428 size_t len
__attribute__((unused
)),
429 int cpu
__attribute__((unused
)),
430 int numa_flags
__attribute__((unused
)))
438 void *default_mmap_func(void *priv
__attribute__((unused
)), size_t len
)
442 base
= mmap(NULL
, len
, PROT_READ
| PROT_WRITE
,
443 MAP_ANONYMOUS
| MAP_PRIVATE
, -1, 0);
444 if (base
== MAP_FAILED
)
/*
 * Default unmap callback, installed by pool creation when the pool
 * attribute does not provide its own mmap/munmap functions.
 *
 * Unmaps @len bytes at @ptr with munmap(2) and returns its result.
 * @priv is ignored.
 */
int default_munmap_func(void *priv __attribute__((unused)), void *ptr, size_t len)
{
	int ret;

	ret = munmap(ptr, len);
	return ret;
}
456 int create_alloc_bitmap(struct rseq_mempool
*pool
, struct rseq_mempool_range
*range
)
460 count
= ((pool
->attr
.stride
>> pool
->item_order
) + BIT_PER_ULONG
- 1) / BIT_PER_ULONG
;
463 * Not being able to create the validation bitmap is an error
464 * that needs to be reported.
466 range
->alloc_bitmap
= calloc(count
, sizeof(unsigned long));
467 if (!range
->alloc_bitmap
)
473 bool percpu_addr_in_pool(const struct rseq_mempool
*pool
, void __rseq_percpu
*_addr
)
475 struct rseq_mempool_range
*range
;
476 void *addr
= (void *) _addr
;
478 for (range
= pool
->range_list
; range
; range
= range
->next
) {
479 if (addr
>= range
->base
&& addr
< range
->base
+ range
->next_unused
)
485 /* Always inline for __builtin_return_address(0). */
486 static inline __attribute__((always_inline
))
487 void check_free_list(const struct rseq_mempool
*pool
)
489 size_t total_item
= 0, total_never_allocated
= 0, total_freed
= 0,
490 max_list_traversal
= 0, traversal_iteration
= 0;
491 struct rseq_mempool_range
*range
;
493 if (!pool
->attr
.robust_set
)
496 for (range
= pool
->range_list
; range
; range
= range
->next
) {
497 total_item
+= pool
->attr
.stride
>> pool
->item_order
;
498 total_never_allocated
+= (pool
->attr
.stride
- range
->next_unused
) >> pool
->item_order
;
500 max_list_traversal
= total_item
- total_never_allocated
;
502 for (struct free_list_node
*node
= pool
->free_list_head
, *prev
= NULL
;
507 if (traversal_iteration
>= max_list_traversal
) {
508 fprintf(stderr
, "%s: Corrupted free-list; Possibly infinite loop in pool \"%s\" (%p), caller %p.\n",
509 __func__
, get_pool_name(pool
), pool
, __builtin_return_address(0));
513 /* Node is out of range. */
514 if (!percpu_addr_in_pool(pool
, __rseq_free_list_to_percpu_ptr(pool
, node
))) {
516 fprintf(stderr
, "%s: Corrupted free-list node %p -> [out-of-range %p] in pool \"%s\" (%p), caller %p.\n",
517 __func__
, prev
, node
, get_pool_name(pool
), pool
, __builtin_return_address(0));
519 fprintf(stderr
, "%s: Corrupted free-list node [out-of-range %p] in pool \"%s\" (%p), caller %p.\n",
520 __func__
, node
, get_pool_name(pool
), pool
, __builtin_return_address(0));
524 traversal_iteration
++;
528 if (total_never_allocated
+ total_freed
!= total_item
) {
529 fprintf(stderr
, "%s: Corrupted free-list in pool \"%s\" (%p); total-item: %zu total-never-used: %zu total-freed: %zu, caller %p.\n",
530 __func__
, get_pool_name(pool
), pool
, total_item
, total_never_allocated
, total_freed
, __builtin_return_address(0));
535 /* Always inline for __builtin_return_address(0). */
536 static inline __attribute__((always_inline
))
537 void check_range_poison(const struct rseq_mempool
*pool
,
538 const struct rseq_mempool_range
*range
)
542 for (item_offset
= 0; item_offset
< range
->next_unused
;
543 item_offset
+= pool
->item_len
)
544 rseq_percpu_check_poison_item(pool
, range
, item_offset
);
547 /* Always inline for __builtin_return_address(0). */
548 static inline __attribute__((always_inline
))
549 void check_pool_poison(const struct rseq_mempool
*pool
)
551 struct rseq_mempool_range
*range
;
553 if (!pool
->attr
.robust_set
)
555 for (range
= pool
->range_list
; range
; range
= range
->next
)
556 check_range_poison(pool
, range
);
559 /* Always inline for __builtin_return_address(0). */
560 static inline __attribute__((always_inline
))
561 void destroy_alloc_bitmap(struct rseq_mempool
*pool
, struct rseq_mempool_range
*range
)
563 unsigned long *bitmap
= range
->alloc_bitmap
;
564 size_t count
, total_leaks
= 0;
569 count
= ((pool
->attr
.stride
>> pool
->item_order
) + BIT_PER_ULONG
- 1) / BIT_PER_ULONG
;
571 /* Assert that all items in the pool were freed. */
572 for (size_t k
= 0; k
< count
; ++k
)
573 total_leaks
+= rseq_hweight_ulong(bitmap
[k
]);
575 fprintf(stderr
, "%s: Pool \"%s\" (%p) has %zu leaked items on destroy, caller: %p.\n",
576 __func__
, get_pool_name(pool
), pool
, total_leaks
, (void *) __builtin_return_address(0));
581 range
->alloc_bitmap
= NULL
;
584 /* Always inline for __builtin_return_address(0). */
585 static inline __attribute__((always_inline
))
586 int rseq_mempool_range_destroy(struct rseq_mempool
*pool
,
587 struct rseq_mempool_range
*range
)
591 destroy_alloc_bitmap(pool
, range
);
594 * Punch a hole into memfd where the init values used to be.
597 ret
= fallocate(memfd
.fd
, FALLOC_FL_PUNCH_HOLE
| FALLOC_FL_KEEP_SIZE
,
598 ptr_to_off_t(range
->init
), pool
->attr
.stride
);
604 /* range is a header located one page before the aligned mapping. */
605 return pool
->attr
.munmap_func(pool
->attr
.mmap_priv
, range
->mmap_addr
, range
->mmap_len
);
609 * Allocate a memory mapping aligned on @alignment, with an optional
610 * @pre_header before the mapping.
613 void *aligned_mmap_anonymous(struct rseq_mempool
*pool
,
614 size_t page_size
, size_t len
, size_t alignment
,
615 void **pre_header
, size_t pre_header_len
)
617 size_t minimum_page_count
, page_count
, extra
, total_allocate
= 0;
621 if (len
< page_size
|| alignment
< page_size
||
622 !is_pow2(alignment
) || (len
& (alignment
- 1))) {
626 page_order
= rseq_get_count_order_ulong(page_size
);
627 if (page_order
< 0) {
631 if (pre_header_len
&& (pre_header_len
& (page_size
- 1))) {
636 minimum_page_count
= (pre_header_len
+ len
) >> page_order
;
637 page_count
= (pre_header_len
+ len
+ alignment
- page_size
) >> page_order
;
639 assert(page_count
>= minimum_page_count
);
641 ptr
= pool
->attr
.mmap_func(pool
->attr
.mmap_priv
, page_count
<< page_order
);
645 total_allocate
= page_count
<< page_order
;
647 if (!(((uintptr_t) ptr
+ pre_header_len
) & (alignment
- 1))) {
648 /* Pointer is already aligned. ptr points to pre_header. */
652 /* Unmap extra before. */
653 extra
= offset_align((uintptr_t) ptr
+ pre_header_len
, alignment
);
654 assert(!(extra
& (page_size
- 1)));
655 if (pool
->attr
.munmap_func(pool
->attr
.mmap_priv
, ptr
, extra
)) {
659 total_allocate
-= extra
;
660 ptr
+= extra
; /* ptr points to pre_header */
661 page_count
-= extra
>> page_order
;
663 assert(page_count
>= minimum_page_count
);
665 if (page_count
> minimum_page_count
) {
668 /* Unmap extra after. */
669 extra_ptr
= ptr
+ (minimum_page_count
<< page_order
);
670 extra
= (page_count
- minimum_page_count
) << page_order
;
671 if (pool
->attr
.munmap_func(pool
->attr
.mmap_priv
, extra_ptr
, extra
)) {
675 total_allocate
-= extra
;
678 assert(!(((uintptr_t)ptr
+ pre_header_len
) & (alignment
- 1)));
679 assert(total_allocate
== len
+ pre_header_len
);
685 ptr
+= pre_header_len
;
691 int rseq_memfd_reserve_init(void *init
, size_t init_len
)
696 pthread_mutex_lock(&memfd
.lock
);
697 reserve_len
= (size_t) ptr_to_off_t(init
) + init_len
;
698 if (reserve_len
> memfd
.reserved_size
) {
699 if (ftruncate(memfd
.fd
, (off_t
) reserve_len
)) {
703 memfd
.reserved_size
= reserve_len
;
706 pthread_mutex_unlock(&memfd
.lock
);
711 struct rseq_mempool_range
*rseq_mempool_range_create(struct rseq_mempool
*pool
)
713 struct rseq_mempool_range
*range
;
714 unsigned long page_size
;
717 size_t range_len
; /* Range len excludes header. */
719 if (pool
->attr
.max_nr_ranges
&&
720 pool
->nr_ranges
>= pool
->attr
.max_nr_ranges
) {
724 page_size
= rseq_get_page_len();
726 range_len
= pool
->attr
.stride
* pool
->attr
.max_nr_cpus
;
727 if (pool
->attr
.populate_policy
!= RSEQ_MEMPOOL_POPULATE_ALL
)
728 range_len
+= pool
->attr
.stride
; /* init values */
729 base
= aligned_mmap_anonymous(pool
, page_size
,
735 range
= (struct rseq_mempool_range
*) (base
- RANGE_HEADER_OFFSET
);
737 range
->header
= header
;
739 range
->mmap_addr
= header
;
740 range
->mmap_len
= page_size
+ range_len
;
742 if (pool
->attr
.populate_policy
!= RSEQ_MEMPOOL_POPULATE_ALL
) {
743 range
->init
= base
+ (pool
->attr
.stride
* pool
->attr
.max_nr_cpus
);
744 /* Populate init values pages from memfd */
745 if (rseq_memfd_reserve_init(range
->init
, pool
->attr
.stride
))
747 if (mmap(range
->init
, pool
->attr
.stride
, PROT_READ
| PROT_WRITE
,
748 MAP_SHARED
| MAP_FIXED
, memfd
.fd
,
749 ptr_to_off_t(range
->init
)) != (void *) range
->init
) {
752 assert(pool
->attr
.type
== MEMPOOL_TYPE_PERCPU
);
754 * Map per-cpu memory as private COW mappings of init values.
759 for (cpu
= 0; cpu
< pool
->attr
.max_nr_cpus
; cpu
++) {
760 void *p
= base
+ (pool
->attr
.stride
* cpu
);
761 size_t len
= pool
->attr
.stride
;
763 if (mmap(p
, len
, PROT_READ
| PROT_WRITE
, MAP_PRIVATE
| MAP_FIXED
,
764 memfd
.fd
, ptr_to_off_t(range
->init
)) != (void *) p
) {
771 if (pool
->attr
.robust_set
) {
772 if (create_alloc_bitmap(pool
, range
))
775 if (pool
->attr
.init_set
) {
776 switch (pool
->attr
.type
) {
777 case MEMPOOL_TYPE_GLOBAL
:
778 if (pool
->attr
.init_func(pool
->attr
.init_priv
,
779 base
, pool
->attr
.stride
, -1)) {
783 case MEMPOOL_TYPE_PERCPU
:
786 for (cpu
= 0; cpu
< pool
->attr
.max_nr_cpus
; cpu
++) {
787 if (pool
->attr
.init_func(pool
->attr
.init_priv
,
788 base
+ (pool
->attr
.stride
* cpu
),
789 pool
->attr
.stride
, cpu
)) {
803 (void) rseq_mempool_range_destroy(pool
, range
);
808 int rseq_mempool_memfd_ref(struct rseq_mempool
*pool
)
812 if (pool
->attr
.populate_policy
== RSEQ_MEMPOOL_POPULATE_ALL
)
815 pthread_mutex_lock(&memfd
.lock
);
816 if (memfd
.refcount
== 0) {
817 memfd
.fd
= memfd_create("mempool", MFD_CLOEXEC
);
819 perror("memfd_create");
826 pthread_mutex_unlock(&memfd
.lock
);
831 void rseq_mempool_memfd_unref(struct rseq_mempool
*pool
)
833 if (pool
->attr
.populate_policy
== RSEQ_MEMPOOL_POPULATE_ALL
)
836 pthread_mutex_lock(&memfd
.lock
);
837 if (memfd
.refcount
== 1) {
838 if (close(memfd
.fd
)) {
843 memfd
.reserved_size
= 0;
846 pthread_mutex_unlock(&memfd
.lock
);
849 int rseq_mempool_destroy(struct rseq_mempool
*pool
)
851 struct rseq_mempool_range
*range
, *next_range
;
856 check_free_list(pool
);
857 check_pool_poison(pool
);
858 /* Iteration safe against removal. */
859 for (range
= pool
->range_list
; range
&& (next_range
= range
->next
, 1); range
= next_range
) {
860 if (rseq_mempool_range_destroy(pool
, range
))
862 /* Update list head to keep list coherent in case of partial failure. */
863 pool
->range_list
= next_range
;
865 rseq_mempool_memfd_unref(pool
);
866 pthread_mutex_destroy(&pool
->lock
);
873 struct rseq_mempool
*rseq_mempool_create(const char *pool_name
,
874 size_t item_len
, const struct rseq_mempool_attr
*_attr
)
876 struct rseq_mempool
*pool
;
877 struct rseq_mempool_attr attr
= {};
880 /* Make sure each item is large enough to contain free list pointers. */
881 if (item_len
< sizeof(void *))
882 item_len
= sizeof(void *);
884 /* Align item_len on next power of two. */
885 order
= rseq_get_count_order_ulong(item_len
);
890 item_len
= 1UL << order
;
893 memcpy(&attr
, _attr
, sizeof(attr
));
894 if (!attr
.mmap_set
) {
895 attr
.mmap_func
= default_mmap_func
;
896 attr
.munmap_func
= default_munmap_func
;
897 attr
.mmap_priv
= NULL
;
901 case MEMPOOL_TYPE_PERCPU
:
902 if (attr
.max_nr_cpus
< 0) {
906 if (attr
.max_nr_cpus
== 0) {
908 attr
.max_nr_cpus
= rseq_get_max_nr_cpus();
909 if (attr
.max_nr_cpus
== 0) {
915 case MEMPOOL_TYPE_GLOBAL
:
916 /* Override populate policy for global type. */
917 attr
.populate_policy
= RSEQ_MEMPOOL_POPULATE_ALL
;
918 /* Use a 1-cpu pool for global mempool type. */
919 attr
.max_nr_cpus
= 1;
923 attr
.stride
= RSEQ_MEMPOOL_STRIDE
; /* Use default */
924 if (attr
.robust_set
&& !attr
.poison_set
) {
925 attr
.poison_set
= true;
926 attr
.poison
= DEFAULT_POISON_VALUE
;
928 if (item_len
> attr
.stride
|| attr
.stride
< (size_t) rseq_get_page_len() ||
929 !is_pow2(attr
.stride
)) {
934 pool
= calloc(1, sizeof(struct rseq_mempool
));
938 memcpy(&pool
->attr
, &attr
, sizeof(attr
));
939 pthread_mutex_init(&pool
->lock
, NULL
);
940 pool
->item_len
= item_len
;
941 pool
->item_order
= order
;
943 if (rseq_mempool_memfd_ref(pool
))
946 pool
->range_list
= rseq_mempool_range_create(pool
);
947 if (!pool
->range_list
)
951 pool
->name
= strdup(pool_name
);
958 rseq_mempool_destroy(pool
);
963 /* Always inline for __builtin_return_address(0). */
964 static inline __attribute__((always_inline
))
965 void set_alloc_slot(struct rseq_mempool
*pool
, struct rseq_mempool_range
*range
, size_t item_offset
)
967 unsigned long *bitmap
= range
->alloc_bitmap
;
968 size_t item_index
= item_offset
>> pool
->item_order
;
975 k
= item_index
/ BIT_PER_ULONG
;
976 mask
= 1ULL << (item_index
% BIT_PER_ULONG
);
978 /* Print error if bit is already set. */
979 if (bitmap
[k
] & mask
) {
980 fprintf(stderr
, "%s: Allocator corruption detected for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
981 __func__
, get_pool_name(pool
), pool
, item_offset
, (void *) __builtin_return_address(0));
988 void __rseq_percpu
*__rseq_percpu_malloc(struct rseq_mempool
*pool
,
989 bool zeroed
, void *init_ptr
, size_t init_len
)
991 struct rseq_mempool_range
*range
;
992 struct free_list_node
*node
;
993 uintptr_t item_offset
;
994 void __rseq_percpu
*addr
;
996 if (init_len
> pool
->item_len
) {
1000 pthread_mutex_lock(&pool
->lock
);
1001 /* Get first entry from free list. */
1002 node
= pool
->free_list_head
;
1004 void *range_base
, *ptr
;
1006 ptr
= __rseq_free_list_to_percpu_ptr(pool
, node
);
1007 range_base
= (void *) ((uintptr_t) ptr
& (~(pool
->attr
.stride
- 1)));
1008 range
= (struct rseq_mempool_range
*) (range_base
- RANGE_HEADER_OFFSET
);
1009 /* Remove node from free list (update head). */
1010 pool
->free_list_head
= node
->next
;
1011 item_offset
= (uintptr_t) (ptr
- range_base
);
1012 rseq_percpu_check_poison_item(pool
, range
, item_offset
);
1013 addr
= __rseq_free_list_to_percpu_ptr(pool
, node
);
1017 * If the most recent range (first in list) does not have any
1018 * room left, create a new range and prepend it to the list
1021 range
= pool
->range_list
;
1022 if (range
->next_unused
+ pool
->item_len
> pool
->attr
.stride
) {
1023 range
= rseq_mempool_range_create(pool
);
1029 /* Add range to head of list. */
1030 range
->next
= pool
->range_list
;
1031 pool
->range_list
= range
;
1033 /* First range in list has room left. */
1034 item_offset
= range
->next_unused
;
1035 addr
= (void __rseq_percpu
*) (range
->base
+ item_offset
);
1036 range
->next_unused
+= pool
->item_len
;
1039 set_alloc_slot(pool
, range
, item_offset
);
1040 pthread_mutex_unlock(&pool
->lock
);
1043 rseq_percpu_zero_item(pool
, range
, item_offset
);
1044 else if (init_ptr
) {
1045 rseq_percpu_init_item(pool
, range
, item_offset
,
1046 init_ptr
, init_len
);
1052 void __rseq_percpu
*rseq_mempool_percpu_malloc(struct rseq_mempool
*pool
)
1054 return __rseq_percpu_malloc(pool
, false, NULL
, 0);
1057 void __rseq_percpu
*rseq_mempool_percpu_zmalloc(struct rseq_mempool
*pool
)
1059 return __rseq_percpu_malloc(pool
, true, NULL
, 0);
1062 void __rseq_percpu
*rseq_mempool_percpu_malloc_init(struct rseq_mempool
*pool
,
1063 void *init_ptr
, size_t len
)
1065 return __rseq_percpu_malloc(pool
, false, init_ptr
, len
);
1068 /* Always inline for __builtin_return_address(0). */
1069 static inline __attribute__((always_inline
))
1070 void clear_alloc_slot(struct rseq_mempool
*pool
, struct rseq_mempool_range
*range
, size_t item_offset
)
1072 unsigned long *bitmap
= range
->alloc_bitmap
;
1073 size_t item_index
= item_offset
>> pool
->item_order
;
1080 k
= item_index
/ BIT_PER_ULONG
;
1081 mask
= 1ULL << (item_index
% BIT_PER_ULONG
);
1083 /* Print error if bit is not set. */
1084 if (!(bitmap
[k
] & mask
)) {
1085 fprintf(stderr
, "%s: Double-free detected for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
1086 __func__
, get_pool_name(pool
), pool
, item_offset
,
1087 (void *) __builtin_return_address(0));
1093 void librseq_mempool_percpu_free(void __rseq_percpu
*_ptr
, size_t stride
)
1095 uintptr_t ptr
= (uintptr_t) _ptr
;
1096 void *range_base
= (void *) (ptr
& (~(stride
- 1)));
1097 struct rseq_mempool_range
*range
= (struct rseq_mempool_range
*) (range_base
- RANGE_HEADER_OFFSET
);
1098 struct rseq_mempool
*pool
= range
->pool
;
1099 uintptr_t item_offset
= ptr
& (stride
- 1);
1100 struct free_list_node
*head
, *item
;
1102 pthread_mutex_lock(&pool
->lock
);
1103 clear_alloc_slot(pool
, range
, item_offset
);
1104 /* Add ptr to head of free list */
1105 head
= pool
->free_list_head
;
1106 if (pool
->attr
.poison_set
)
1107 rseq_percpu_poison_item(pool
, range
, item_offset
);
1108 item
= __rseq_percpu_to_free_list_ptr(pool
, _ptr
);
1110 * Setting the next pointer will overwrite the first uintptr_t
1111 * poison for either CPU 0 (populate all) or init data (populate
1115 pool
->free_list_head
= item
;
1116 pthread_mutex_unlock(&pool
->lock
);
1119 struct rseq_mempool_set
*rseq_mempool_set_create(void)
1121 struct rseq_mempool_set
*pool_set
;
1123 pool_set
= calloc(1, sizeof(struct rseq_mempool_set
));
1126 pthread_mutex_init(&pool_set
->lock
, NULL
);
1130 int rseq_mempool_set_destroy(struct rseq_mempool_set
*pool_set
)
1134 for (order
= POOL_SET_MIN_ENTRY
; order
< POOL_SET_NR_ENTRIES
; order
++) {
1135 struct rseq_mempool
*pool
= pool_set
->entries
[order
];
1139 ret
= rseq_mempool_destroy(pool
);
1142 pool_set
->entries
[order
] = NULL
;
1144 pthread_mutex_destroy(&pool_set
->lock
);
1149 /* Ownership of pool is handed over to pool set on success. */
1150 int rseq_mempool_set_add_pool(struct rseq_mempool_set
*pool_set
, struct rseq_mempool
*pool
)
1152 size_t item_order
= pool
->item_order
;
1155 pthread_mutex_lock(&pool_set
->lock
);
1156 if (pool_set
->entries
[item_order
]) {
1161 pool_set
->entries
[pool
->item_order
] = pool
;
1163 pthread_mutex_unlock(&pool_set
->lock
);
1168 void __rseq_percpu
*__rseq_mempool_set_malloc(struct rseq_mempool_set
*pool_set
,
1169 void *init_ptr
, size_t len
, bool zeroed
)
1171 int order
, min_order
= POOL_SET_MIN_ENTRY
;
1172 struct rseq_mempool
*pool
;
1173 void __rseq_percpu
*addr
;
1175 order
= rseq_get_count_order_ulong(len
);
1176 if (order
> POOL_SET_MIN_ENTRY
)
1179 pthread_mutex_lock(&pool_set
->lock
);
1180 /* First smallest present pool where @len fits. */
1181 for (order
= min_order
; order
< POOL_SET_NR_ENTRIES
; order
++) {
1182 pool
= pool_set
->entries
[order
];
1186 if (pool
->item_len
>= len
)
1191 pthread_mutex_unlock(&pool_set
->lock
);
1193 addr
= __rseq_percpu_malloc(pool
, zeroed
, init_ptr
, len
);
1194 if (addr
== NULL
&& errno
== ENOMEM
) {
1196 * If the allocation failed, try again with a
1199 min_order
= order
+ 1;
1210 void __rseq_percpu
*rseq_mempool_set_percpu_malloc(struct rseq_mempool_set
*pool_set
, size_t len
)
1212 return __rseq_mempool_set_malloc(pool_set
, NULL
, len
, false);
1215 void __rseq_percpu
*rseq_mempool_set_percpu_zmalloc(struct rseq_mempool_set
*pool_set
, size_t len
)
1217 return __rseq_mempool_set_malloc(pool_set
, NULL
, len
, true);
1220 void __rseq_percpu
*rseq_mempool_set_percpu_malloc_init(struct rseq_mempool_set
*pool_set
,
1221 void *init_ptr
, size_t len
)
1223 return __rseq_mempool_set_malloc(pool_set
, init_ptr
, len
, true);
1226 struct rseq_mempool_attr
*rseq_mempool_attr_create(void)
1228 return calloc(1, sizeof(struct rseq_mempool_attr
));
1231 void rseq_mempool_attr_destroy(struct rseq_mempool_attr
*attr
)
1236 int rseq_mempool_attr_set_mmap(struct rseq_mempool_attr
*attr
,
1237 void *(*mmap_func
)(void *priv
, size_t len
),
1238 int (*munmap_func
)(void *priv
, void *ptr
, size_t len
),
1245 attr
->mmap_set
= true;
1246 attr
->mmap_func
= mmap_func
;
1247 attr
->munmap_func
= munmap_func
;
1248 attr
->mmap_priv
= mmap_priv
;
1252 int rseq_mempool_attr_set_init(struct rseq_mempool_attr
*attr
,
1253 int (*init_func
)(void *priv
, void *addr
, size_t len
, int cpu
),
1260 attr
->init_set
= true;
1261 attr
->init_func
= init_func
;
1262 attr
->init_priv
= init_priv
;
1266 int rseq_mempool_attr_set_robust(struct rseq_mempool_attr
*attr
)
1272 attr
->robust_set
= true;
1276 int rseq_mempool_attr_set_percpu(struct rseq_mempool_attr
*attr
,
1277 size_t stride
, int max_nr_cpus
)
1283 attr
->type
= MEMPOOL_TYPE_PERCPU
;
1284 attr
->stride
= stride
;
1285 attr
->max_nr_cpus
= max_nr_cpus
;
1289 int rseq_mempool_attr_set_global(struct rseq_mempool_attr
*attr
,
1296 attr
->type
= MEMPOOL_TYPE_GLOBAL
;
1297 attr
->stride
= stride
;
1298 attr
->max_nr_cpus
= 0;
1302 int rseq_mempool_attr_set_max_nr_ranges(struct rseq_mempool_attr
*attr
,
1303 unsigned long max_nr_ranges
)
1309 attr
->max_nr_ranges
= max_nr_ranges
;
1313 int rseq_mempool_attr_set_poison(struct rseq_mempool_attr
*attr
,
1320 attr
->poison_set
= true;
1321 attr
->poison
= poison
;
1325 int rseq_mempool_attr_set_populate_policy(struct rseq_mempool_attr
*attr
,
1326 enum rseq_mempool_populate_policy policy
)
1332 attr
->populate_policy
= policy
;
1336 int rseq_mempool_get_max_nr_cpus(struct rseq_mempool
*mempool
)
1338 if (!mempool
|| mempool
->attr
.type
!= MEMPOOL_TYPE_PERCPU
) {
1342 return mempool
->attr
.max_nr_cpus
;