// SPDX-License-Identifier: MIT
// SPDX-FileCopyrightText: 2024 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>

#include <rseq/mempool.h>
#include <sys/mman.h>
#include <assert.h>
#include <string.h>
#include <strings.h>
#include <pthread.h>
#include <unistd.h>
#include <stdlib.h>
#include <rseq/compiler.h>
#include <errno.h>
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>
#include <fcntl.h>

#ifdef HAVE_LIBNUMA
# include <numa.h>
# include <numaif.h>
#endif

#include "rseq-utils.h"
#include <rseq/rseq.h>

/*
 * rseq-mempool.c: rseq CPU-Local Storage (CLS) memory allocator.
 *
 * The rseq per-CPU memory allocator allows the application to request
 * memory pools of CPU-Local memory, each pool containing objects of a
 * given size (rounded to the next power of 2), reserving a given
 * virtual address size per CPU, for a given maximum number of CPUs.
 *
 * The per-CPU memory allocator is analogous to TLS (Thread-Local
 * Storage) memory: where TLS provides Thread-Local Storage, the
 * per-CPU memory allocator provides CPU-Local Storage.
 */
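
/*
 * Example (sketch): typical lifetime of a per-CPU pool built on the API
 * implemented in this file. rseq_percpu_ptr() and
 * rseq_mempool_percpu_free() are assumed to be provided by
 * <rseq/mempool.h>; error handling is elided for brevity.
 *
 *	struct counter { intptr_t count; };
 *	struct rseq_mempool_attr *attr;
 *	struct rseq_mempool *pool;
 *	struct counter __rseq_percpu *c;
 *
 *	attr = rseq_mempool_attr_create();
 *	(void) rseq_mempool_attr_set_percpu(attr, RSEQ_MEMPOOL_STRIDE, 0);
 *	pool = rseq_mempool_create("counters", sizeof(struct counter), attr);
 *	rseq_mempool_attr_destroy(attr);
 *
 *	c = (struct counter __rseq_percpu *) rseq_mempool_percpu_zmalloc(pool);
 *	rseq_percpu_ptr(c, 0)->count++;		(CPU 0 instance)
 *	rseq_mempool_percpu_free(c);
 *	(void) rseq_mempool_destroy(pool);
 */
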
#define POOL_SET_NR_ENTRIES	RSEQ_BITS_PER_LONG

/*
 * Smallest allocation should hold enough space for a free list pointer.
 */
#if RSEQ_BITS_PER_LONG == 64
# define POOL_SET_MIN_ENTRY	3	/* Smallest item_len=8 */
#else
# define POOL_SET_MIN_ENTRY	2	/* Smallest item_len=4 */
#endif

#define BIT_PER_ULONG		(8 * sizeof(unsigned long))

#define MOVE_PAGES_BATCH_SIZE	4096

#define RANGE_HEADER_OFFSET	sizeof(struct rseq_mempool_range)

#if RSEQ_BITS_PER_LONG == 64
# define DEFAULT_PRIVATE_POISON_VALUE	0x5555555555555555ULL
#else
# define DEFAULT_PRIVATE_POISON_VALUE	0x55555555UL
#endif

struct free_list_node;

struct free_list_node {
	struct free_list_node *next;
};

enum mempool_type {
	MEMPOOL_TYPE_GLOBAL = 0,	/* Default */
	MEMPOOL_TYPE_PERCPU = 1,
};

struct rseq_mempool_attr {
	bool init_set;
	int (*init_func)(void *priv, void *addr, size_t len, int cpu);
	void *init_priv;

	bool robust_set;

	enum mempool_type type;
	size_t stride;
	int max_nr_cpus;

	unsigned long max_nr_ranges;

	bool poison_set;
	uintptr_t poison;

	enum rseq_mempool_populate_policy populate_policy;
};

struct rseq_mempool_range;

struct rseq_mempool_range {
	struct rseq_mempool_range *next;	/* Linked list of ranges. */
	struct rseq_mempool *pool;		/* Backward reference to container pool. */

	/*
	 * Memory layout of a mempool range:
	 * - Header page (contains struct rseq_mempool_range at the very end),
	 * - Base of the per-cpu data, starting with CPU 0.
	 *   Aliases with free-list for non-robust populate-all pool.
	 * - CPU 1,
	 * ...
	 * - CPU max_nr_cpus - 1,
	 * - init values (unpopulated for RSEQ_MEMPOOL_POPULATE_PRIVATE_ALL).
	 *   Aliases with free-list for non-robust populate-none pool.
	 * - free list (for robust pool).
	 *
	 * The free list aliases the CPU 0 memory area for non-robust
	 * populate-all pools. It aliases with init values for
	 * non-robust populate-none pools. It is located immediately
	 * after the init values for robust pools.
	 */
	void *header;
	void *base;
	/*
	 * The init values contain malloc_init/zmalloc values.
	 * Pointer is NULL for RSEQ_MEMPOOL_POPULATE_PRIVATE_ALL.
	 */
	void *init;
	size_t next_unused;

	/* Pool range mmap/munmap */
	void *mmap_addr;
	size_t mmap_len;

	/* Track alloc/free. */
	unsigned long *alloc_bitmap;
};

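/*
 * Worked example of the layout above (illustrative values only): with
 * stride = 0x100000 (1 MiB) and max_nr_cpus = 4, the item at
 * item_offset 0x40 has one instance per CPU:
 *
 *	base + 0x000040		(CPU 0)
 *	base + 0x100040		(CPU 1)
 *	base + 0x200040		(CPU 2)
 *	base + 0x300040		(CPU 3)
 *
 * and, for pools whose populate policy is not
 * RSEQ_MEMPOOL_POPULATE_PRIVATE_ALL, its init value sits at
 * init = base + 4 * 0x100000, i.e. init + 0x40.
 */
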
struct rseq_mempool {
	/* Head of ranges linked-list. */
	struct rseq_mempool_range *range_list;
	unsigned long nr_ranges;

	size_t item_len;
	int item_order;

	/*
	 * The free list chains freed items on the CPU 0 address range.
	 * We should rethink this decision if false sharing between
	 * malloc/free from other CPUs and data accesses from CPU 0
	 * becomes an issue. This is a NULL-terminated singly-linked
	 * list.
	 */
	struct free_list_node *free_list_head;

	/* This lock protects allocation/free within the pool. */
	pthread_mutex_t lock;

	struct rseq_mempool_attr attr;
	char *name;
};

/*
 * Pool set entries are indexed by item_len rounded to the next power of
 * 2. A pool set can contain NULL pool entries, in which case the next
 * large enough entry will be used for allocation.
 */
struct rseq_mempool_set {
	/* This lock protects add vs malloc/zmalloc within the pool set. */
	pthread_mutex_t lock;
	struct rseq_mempool *entries[POOL_SET_NR_ENTRIES];
};

static
const char *get_pool_name(const struct rseq_mempool *pool)
{
	return pool->name ? : "<anonymous>";
}

static
void *__rseq_pool_range_percpu_ptr(const struct rseq_mempool_range *range, int cpu,
		uintptr_t item_offset, size_t stride)
{
	return range->base + (stride * cpu) + item_offset;
}

static
void *__rseq_pool_range_init_ptr(const struct rseq_mempool_range *range,
		uintptr_t item_offset)
{
	if (!range->init)
		return NULL;
	return range->init + item_offset;
}

static
void __rseq_percpu *__rseq_free_list_to_percpu_ptr(const struct rseq_mempool *pool,
		struct free_list_node *node)
{
	void __rseq_percpu *p = (void __rseq_percpu *) node;

	if (pool->attr.robust_set) {
		/* Skip cpus. */
		p -= pool->attr.max_nr_cpus * pool->attr.stride;
		/* Skip init values */
		if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_PRIVATE_ALL)
			p -= pool->attr.stride;

	} else {
		/* Populate none free list is in init values */
		if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_PRIVATE_ALL)
			p -= pool->attr.max_nr_cpus * pool->attr.stride;
	}
	return p;
}

static
struct free_list_node *__rseq_percpu_to_free_list_ptr(const struct rseq_mempool *pool,
		void __rseq_percpu *p)
{
	if (pool->attr.robust_set) {
		/* Skip cpus. */
		p += pool->attr.max_nr_cpus * pool->attr.stride;
		/* Skip init values */
		if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_PRIVATE_ALL)
			p += pool->attr.stride;

	} else {
		/* Populate none free list is in init values */
		if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_PRIVATE_ALL)
			p += pool->attr.max_nr_cpus * pool->attr.stride;
	}
	return (struct free_list_node *) p;
}

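/*
 * Note: the two helpers above are exact inverses for a given pool: they
 * apply the same stride-based offset with opposite signs, so
 * __rseq_free_list_to_percpu_ptr(pool,
 * __rseq_percpu_to_free_list_ptr(pool, p)) == p. The offset selects the
 * area holding the free list described in the rseq_mempool_range layout
 * comment (CPU 0 area, init values, or dedicated free-list area).
 */
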
static
intptr_t rseq_cmp_item(void *p, size_t item_len, intptr_t cmp_value, intptr_t *unexpected_value)
{
	size_t offset;
	intptr_t res = 0;

	for (offset = 0; offset < item_len; offset += sizeof(uintptr_t)) {
		intptr_t v = *((intptr_t *) (p + offset));

		if ((res = v - cmp_value) != 0) {
			if (unexpected_value)
				*unexpected_value = v;
			break;
		}
	}
	return res;
}

static
void rseq_percpu_zero_item(struct rseq_mempool *pool,
		struct rseq_mempool_range *range, uintptr_t item_offset)
{
	char *init_p = NULL;
	int i;

	init_p = __rseq_pool_range_init_ptr(range, item_offset);
	if (init_p)
		bzero(init_p, pool->item_len);
	for (i = 0; i < pool->attr.max_nr_cpus; i++) {
		char *p = __rseq_pool_range_percpu_ptr(range, i,
				item_offset, pool->attr.stride);

		/*
		 * If item is already zeroed, either because the
		 * init range update has propagated or because the
		 * content is already zeroed (e.g. zero page), don't
		 * write to the page. This eliminates useless COW over
		 * the zero page just for overwriting it with zeroes.
		 *
		 * This means zmalloc() in a populate-all policy pool
		 * does not trigger COW for CPUs which are not actively
		 * writing to the pool. This is however not the case for
		 * malloc_init() in populate-all pools if it populates
		 * non-zero content.
		 */
		if (!rseq_cmp_item(p, pool->item_len, 0, NULL))
			continue;
		bzero(p, pool->item_len);
	}
}

static
void rseq_percpu_init_item(struct rseq_mempool *pool,
		struct rseq_mempool_range *range, uintptr_t item_offset,
		void *init_ptr, size_t init_len)
{
	char *init_p = NULL;
	int i;

	init_p = __rseq_pool_range_init_ptr(range, item_offset);
	if (init_p)
		memcpy(init_p, init_ptr, init_len);
	for (i = 0; i < pool->attr.max_nr_cpus; i++) {
		char *p = __rseq_pool_range_percpu_ptr(range, i,
				item_offset, pool->attr.stride);

		/*
		 * If the update propagated through a shared mapping,
		 * or the item already has the correct content, skip
		 * writing it into the cpu item to eliminate useless
		 * COW of the page.
		 */
		if (!memcmp(init_ptr, p, init_len))
			continue;
		memcpy(p, init_ptr, init_len);
	}
}

static
void rseq_poison_item(void *p, size_t item_len, uintptr_t poison)
{
	size_t offset;

	for (offset = 0; offset < item_len; offset += sizeof(uintptr_t))
		*((uintptr_t *) (p + offset)) = poison;
}

static
void rseq_percpu_poison_item(struct rseq_mempool *pool,
		struct rseq_mempool_range *range, uintptr_t item_offset)
{
	uintptr_t poison = pool->attr.poison;
	char *init_p = NULL;
	int i;

	init_p = __rseq_pool_range_init_ptr(range, item_offset);
	if (init_p)
		rseq_poison_item(init_p, pool->item_len, poison);
	for (i = 0; i < pool->attr.max_nr_cpus; i++) {
		char *p = __rseq_pool_range_percpu_ptr(range, i,
				item_offset, pool->attr.stride);

		/*
		 * If the update propagated through a shared mapping,
		 * or the item already has the correct content, skip
		 * writing it into the cpu item to eliminate useless
		 * COW of the page.
		 *
		 * It is recommended to use zero as poison value for
		 * populate-all pools to eliminate COW due to writing
		 * poison to unused CPU memory.
		 */
		if (rseq_cmp_item(p, pool->item_len, poison, NULL) == 0)
			continue;
		rseq_poison_item(p, pool->item_len, poison);
	}
}

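/*
 * Per the recommendation above, a populate-all pool that wants
 * poisoning without extra COW can explicitly select a zero poison value
 * (sketch, using the attribute setter implemented below):
 *
 *	(void) rseq_mempool_attr_set_poison(attr, 0);
 */
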
/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void rseq_check_poison_item(const struct rseq_mempool *pool, uintptr_t item_offset,
		void *p, size_t item_len, uintptr_t poison)
{
	intptr_t unexpected_value;

	if (rseq_cmp_item(p, item_len, poison, &unexpected_value) == 0)
		return;

	fprintf(stderr, "%s: Poison corruption detected (0x%lx) for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
		__func__, (unsigned long) unexpected_value, get_pool_name(pool), pool, item_offset, (void *) __builtin_return_address(0));
	abort();
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void rseq_percpu_check_poison_item(const struct rseq_mempool *pool,
		const struct rseq_mempool_range *range, uintptr_t item_offset)
{
	uintptr_t poison = pool->attr.poison;
	char *init_p;
	int i;

	if (!pool->attr.robust_set)
		return;
	init_p = __rseq_pool_range_init_ptr(range, item_offset);
	if (init_p)
		rseq_check_poison_item(pool, item_offset, init_p, pool->item_len, poison);
	for (i = 0; i < pool->attr.max_nr_cpus; i++) {
		char *p = __rseq_pool_range_percpu_ptr(range, i,
				item_offset, pool->attr.stride);
		rseq_check_poison_item(pool, item_offset, p, pool->item_len, poison);
	}
}

#ifdef HAVE_LIBNUMA
int rseq_mempool_range_init_numa(void *addr, size_t len, int cpu, int numa_flags)
{
	unsigned long nr_pages, page_len;
	int status[MOVE_PAGES_BATCH_SIZE];
	int nodes[MOVE_PAGES_BATCH_SIZE];
	void *pages[MOVE_PAGES_BATCH_SIZE];
	long ret;

	if (!numa_flags) {
		errno = EINVAL;
		return -1;
	}
	page_len = rseq_get_page_len();
	nr_pages = len >> rseq_get_count_order_ulong(page_len);

	nodes[0] = numa_node_of_cpu(cpu);
	if (nodes[0] < 0)
		return -1;

	for (size_t k = 1; k < RSEQ_ARRAY_SIZE(nodes); ++k) {
		nodes[k] = nodes[0];
	}

	for (unsigned long page = 0; page < nr_pages;) {

		size_t max_k = RSEQ_ARRAY_SIZE(pages);
		size_t left = nr_pages - page;

		if (left < max_k)
			max_k = left;

		for (size_t k = 0; k < max_k; ++k, ++page) {
			pages[k] = addr + (page * page_len);
			status[k] = -EPERM;
		}

		ret = move_pages(0, max_k, pages, nodes, status, numa_flags);
		if (ret < 0)
			return ret;

		if (ret > 0) {
			fprintf(stderr, "%lu pages were not migrated\n", ret);
			for (size_t k = 0; k < max_k; ++k) {
				if (status[k] < 0)
					fprintf(stderr,
						"Error while moving page %p to numa node %d: %u\n",
						pages[k], nodes[k], -status[k]);
			}
		}
	}
	return 0;
}
#else
int rseq_mempool_range_init_numa(void *addr __attribute__((unused)),
		size_t len __attribute__((unused)),
		int cpu __attribute__((unused)),
		int numa_flags __attribute__((unused)))
{
	errno = ENOSYS;
	return -1;
}
#endif

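/*
 * Sketch: rseq_mempool_range_init_numa() can be hooked into a pool
 * through the init callback attribute so that each per-CPU range is
 * moved to the NUMA node backing its CPU. The glue below is
 * hypothetical (the convention of passing numa_flags through @priv is
 * an assumption, not part of this file):
 *
 *	static int init_numa_cb(void *priv, void *addr, size_t len, int cpu)
 *	{
 *		return rseq_mempool_range_init_numa(addr, len, cpu,
 *				*(int *) priv);
 *	}
 *
 *	static int numa_flags = MPOL_MF_MOVE;
 *
 *	(void) rseq_mempool_attr_set_init(attr, init_numa_cb, &numa_flags);
 */
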
static
int create_alloc_bitmap(struct rseq_mempool *pool, struct rseq_mempool_range *range)
{
	size_t count;

	count = ((pool->attr.stride >> pool->item_order) + BIT_PER_ULONG - 1) / BIT_PER_ULONG;

	/*
	 * Not being able to create the validation bitmap is an error
	 * that needs to be reported.
	 */
	range->alloc_bitmap = calloc(count, sizeof(unsigned long));
	if (!range->alloc_bitmap)
		return -1;
	return 0;
}

static
bool percpu_addr_in_pool(const struct rseq_mempool *pool, void __rseq_percpu *_addr)
{
	struct rseq_mempool_range *range;
	void *addr = (void *) _addr;

	for (range = pool->range_list; range; range = range->next) {
		if (addr >= range->base && addr < range->base + range->next_unused)
			return true;
	}
	return false;
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void check_free_list(const struct rseq_mempool *pool)
{
	size_t total_item = 0, total_never_allocated = 0, total_freed = 0,
		max_list_traversal = 0, traversal_iteration = 0;
	struct rseq_mempool_range *range;

	if (!pool->attr.robust_set)
		return;

	for (range = pool->range_list; range; range = range->next) {
		total_item += pool->attr.stride >> pool->item_order;
		total_never_allocated += (pool->attr.stride - range->next_unused) >> pool->item_order;
	}
	max_list_traversal = total_item - total_never_allocated;

	for (struct free_list_node *node = pool->free_list_head, *prev = NULL;
			node;
			prev = node, node = node->next) {

		if (traversal_iteration >= max_list_traversal) {
			fprintf(stderr, "%s: Corrupted free-list; Possibly infinite loop in pool \"%s\" (%p), caller %p.\n",
				__func__, get_pool_name(pool), pool, __builtin_return_address(0));
			abort();
		}

		/* Node is out of range. */
		if (!percpu_addr_in_pool(pool, __rseq_free_list_to_percpu_ptr(pool, node))) {
			if (prev)
				fprintf(stderr, "%s: Corrupted free-list node %p -> [out-of-range %p] in pool \"%s\" (%p), caller %p.\n",
					__func__, prev, node, get_pool_name(pool), pool, __builtin_return_address(0));
			else
				fprintf(stderr, "%s: Corrupted free-list node [out-of-range %p] in pool \"%s\" (%p), caller %p.\n",
					__func__, node, get_pool_name(pool), pool, __builtin_return_address(0));
			abort();
		}

		traversal_iteration++;
		total_freed++;
	}

	if (total_never_allocated + total_freed != total_item) {
		fprintf(stderr, "%s: Corrupted free-list in pool \"%s\" (%p); total-item: %zu total-never-used: %zu total-freed: %zu, caller %p.\n",
			__func__, get_pool_name(pool), pool, total_item, total_never_allocated, total_freed, __builtin_return_address(0));
		abort();
	}
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void check_range_poison(const struct rseq_mempool *pool,
		const struct rseq_mempool_range *range)
{
	size_t item_offset;

	for (item_offset = 0; item_offset < range->next_unused;
			item_offset += pool->item_len)
		rseq_percpu_check_poison_item(pool, range, item_offset);
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void check_pool_poison(const struct rseq_mempool *pool)
{
	struct rseq_mempool_range *range;

	if (!pool->attr.robust_set)
		return;
	for (range = pool->range_list; range; range = range->next)
		check_range_poison(pool, range);
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void destroy_alloc_bitmap(struct rseq_mempool *pool, struct rseq_mempool_range *range)
{
	unsigned long *bitmap = range->alloc_bitmap;
	size_t count, total_leaks = 0;

	if (!bitmap)
		return;

	count = ((pool->attr.stride >> pool->item_order) + BIT_PER_ULONG - 1) / BIT_PER_ULONG;

	/* Assert that all items in the pool were freed. */
	for (size_t k = 0; k < count; ++k)
		total_leaks += rseq_hweight_ulong(bitmap[k]);
	if (total_leaks) {
		fprintf(stderr, "%s: Pool \"%s\" (%p) has %zu leaked items on destroy, caller: %p.\n",
			__func__, get_pool_name(pool), pool, total_leaks, (void *) __builtin_return_address(0));
		abort();
	}

	free(bitmap);
	range->alloc_bitmap = NULL;
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
int rseq_mempool_range_destroy(struct rseq_mempool *pool,
		struct rseq_mempool_range *range)
{
	destroy_alloc_bitmap(pool, range);

	/* range is a header located one page before the aligned mapping. */
	return munmap(range->mmap_addr, range->mmap_len);
}

/*
 * Allocate a memory mapping aligned on @alignment, with an optional
 * @pre_header before the mapping.
 */
static
void *aligned_mmap_anonymous(size_t page_size, size_t len, size_t alignment,
		void **pre_header, size_t pre_header_len)
{
	size_t minimum_page_count, page_count, extra, total_allocate = 0;
	int page_order;
	void *ptr;

	if (len < page_size || alignment < page_size ||
			!is_pow2(alignment) || (len & (alignment - 1))) {
		errno = EINVAL;
		ptr = NULL;
		goto alloc_error;
	}
	page_order = rseq_get_count_order_ulong(page_size);
	if (page_order < 0) {
		errno = EINVAL;
		ptr = NULL;
		goto alloc_error;
	}
	if (pre_header_len && (pre_header_len & (page_size - 1))) {
		errno = EINVAL;
		ptr = NULL;
		goto alloc_error;
	}

	minimum_page_count = (pre_header_len + len) >> page_order;
	page_count = (pre_header_len + len + alignment - page_size) >> page_order;

	assert(page_count >= minimum_page_count);

	ptr = mmap(NULL, page_count << page_order, PROT_READ | PROT_WRITE,
			MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	if (ptr == MAP_FAILED) {
		ptr = NULL;
		goto alloc_error;
	}

	total_allocate = page_count << page_order;

	if (!(((uintptr_t) ptr + pre_header_len) & (alignment - 1))) {
		/* Pointer is already aligned. ptr points to pre_header. */
		goto out;
	}

	/* Unmap extra before. */
	extra = offset_align((uintptr_t) ptr + pre_header_len, alignment);
	assert(!(extra & (page_size - 1)));
	if (munmap(ptr, extra)) {
		perror("munmap");
		abort();
	}
	total_allocate -= extra;
	ptr += extra;	/* ptr points to pre_header */
	page_count -= extra >> page_order;
out:
	assert(page_count >= minimum_page_count);

	if (page_count > minimum_page_count) {
		void *extra_ptr;

		/* Unmap extra after. */
		extra_ptr = ptr + (minimum_page_count << page_order);
		extra = (page_count - minimum_page_count) << page_order;
		if (munmap(extra_ptr, extra)) {
			perror("munmap");
			abort();
		}
		total_allocate -= extra;
	}

	assert(!(((uintptr_t) ptr + pre_header_len) & (alignment - 1)));
	assert(total_allocate == len + pre_header_len);

alloc_error:
	if (ptr) {
		if (pre_header)
			*pre_header = ptr;
		ptr += pre_header_len;
	}
	return ptr;
}

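/*
 * Worked example of the trimming above (illustrative values): with
 * page_size = 4 KiB, len = alignment = 64 KiB and no pre-header,
 * minimum_page_count = 16 and page_count = (64K + 64K - 4K) >> 12 = 31
 * pages are initially mapped. If mmap returns an address 8 KiB past a
 * 64 KiB boundary, the leading 56 KiB (14 pages) are unmapped to reach
 * alignment, then the single page beyond minimum_page_count is
 * unmapped, leaving exactly 64 KiB, aligned.
 */
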
static
int rseq_memfd_create_init(const char *poolname, size_t init_len)
{
	int fd;
	char buf[249];		/* Limit is 249 bytes. */
	const char *name;

	if (poolname) {
		snprintf(buf, sizeof(buf), "%s:rseq-mempool", poolname);
		name = buf;
	} else {
		name = "<anonymous>:rseq-mempool";
	}

	fd = memfd_create(name, MFD_CLOEXEC);
	if (fd < 0) {
		perror("memfd_create");
		goto error;
	}
	if (ftruncate(fd, (off_t) init_len)) {
		if (close(fd))
			perror("close");
		goto error;
	}
	return fd;

error:
	return -1;
}

static
void rseq_memfd_close(int fd)
{
	if (fd < 0)
		return;
	if (close(fd))
		perror("close");
}

static
struct rseq_mempool_range *rseq_mempool_range_create(struct rseq_mempool *pool)
{
	struct rseq_mempool_range *range;
	unsigned long page_size;
	void *header;
	void *base;
	size_t range_len;	/* Range len excludes header. */
	int memfd = -1;

	if (pool->attr.max_nr_ranges &&
			pool->nr_ranges >= pool->attr.max_nr_ranges) {
		errno = ENOMEM;
		return NULL;
	}
	page_size = rseq_get_page_len();

	range_len = pool->attr.stride * pool->attr.max_nr_cpus;
	if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_PRIVATE_ALL)
		range_len += pool->attr.stride;	/* init values */
	if (pool->attr.robust_set)
		range_len += pool->attr.stride;	/* free list */
	base = aligned_mmap_anonymous(page_size, range_len,
			pool->attr.stride, &header, page_size);
	if (!base)
		return NULL;
	range = (struct rseq_mempool_range *) (base - RANGE_HEADER_OFFSET);
	range->pool = pool;
	range->header = header;
	range->base = base;
	range->mmap_addr = header;
	range->mmap_len = page_size + range_len;

	if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_PRIVATE_ALL) {
		int cpu;

		range->init = base + (pool->attr.stride * pool->attr.max_nr_cpus);
		/* Populate init values pages from memfd */
		memfd = rseq_memfd_create_init(pool->name, pool->attr.stride);
		if (memfd < 0)
			goto error_alloc;
		if (mmap(range->init, pool->attr.stride, PROT_READ | PROT_WRITE,
				MAP_SHARED | MAP_FIXED, memfd, 0) != (void *) range->init) {
			goto error_alloc;
		}
		/*
		 * Make sure the init values shared mapping is not
		 * shared with the children processes across fork.
		 */
		if (madvise(range->init, pool->attr.stride, MADV_DONTFORK))
			goto error_alloc;
		assert(pool->attr.type == MEMPOOL_TYPE_PERCPU);
		/*
		 * Map per-cpu memory as private COW mappings of init values.
		 */
		for (cpu = 0; cpu < pool->attr.max_nr_cpus; cpu++) {
			void *p = base + (pool->attr.stride * cpu);
			size_t len = pool->attr.stride;

			if (mmap(p, len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FIXED,
					memfd, 0) != (void *) p) {
				goto error_alloc;
			}
		}
		rseq_memfd_close(memfd);
		memfd = -1;
	}

	if (pool->attr.robust_set) {
		if (create_alloc_bitmap(pool, range))
			goto error_alloc;
	}
	if (pool->attr.init_set) {
		switch (pool->attr.type) {
		case MEMPOOL_TYPE_GLOBAL:
			if (pool->attr.init_func(pool->attr.init_priv,
					base, pool->attr.stride, -1)) {
				goto error_alloc;
			}
			break;
		case MEMPOOL_TYPE_PERCPU:
		{
			int cpu;

			for (cpu = 0; cpu < pool->attr.max_nr_cpus; cpu++) {
				if (pool->attr.init_func(pool->attr.init_priv,
						base + (pool->attr.stride * cpu),
						pool->attr.stride, cpu)) {
					goto error_alloc;
				}
			}
			break;
		}
		default:
			abort();
		}
	}
	pool->nr_ranges++;
	return range;

error_alloc:
	rseq_memfd_close(memfd);
	(void) rseq_mempool_range_destroy(pool, range);
	return NULL;
}

int rseq_mempool_destroy(struct rseq_mempool *pool)
{
	struct rseq_mempool_range *range, *next_range;
	int ret = 0;

	if (!pool)
		return 0;
	check_free_list(pool);
	check_pool_poison(pool);
	/* Iteration safe against removal. */
	for (range = pool->range_list; range && (next_range = range->next, 1); range = next_range) {
		if (rseq_mempool_range_destroy(pool, range)) {
			ret = -1;
			goto end;
		}
		/* Update list head to keep list coherent in case of partial failure. */
		pool->range_list = next_range;
	}
	pthread_mutex_destroy(&pool->lock);
	free(pool->name);
	free(pool);
end:
	return ret;
}

struct rseq_mempool *rseq_mempool_create(const char *pool_name,
		size_t item_len, const struct rseq_mempool_attr *_attr)
{
	struct rseq_mempool *pool;
	struct rseq_mempool_attr attr = {};
	int order;

	/* Make sure each item is large enough to contain free list pointers. */
	if (item_len < sizeof(void *))
		item_len = sizeof(void *);

	/* Align item_len on next power of two. */
	order = rseq_get_count_order_ulong(item_len);
	if (order < 0) {
		errno = EINVAL;
		return NULL;
	}
	item_len = 1UL << order;

	if (_attr)
		memcpy(&attr, _attr, sizeof(attr));

	switch (attr.type) {
	case MEMPOOL_TYPE_PERCPU:
		if (attr.max_nr_cpus < 0) {
			errno = EINVAL;
			return NULL;
		}
		if (attr.max_nr_cpus == 0) {
			/* Auto-detect */
			attr.max_nr_cpus = rseq_get_max_nr_cpus();
			if (attr.max_nr_cpus == 0) {
				errno = EINVAL;
				return NULL;
			}
		}
		break;
	case MEMPOOL_TYPE_GLOBAL:
		/* Override populate policy for global type. */
		if (attr.populate_policy == RSEQ_MEMPOOL_POPULATE_PRIVATE_NONE)
			attr.populate_policy = RSEQ_MEMPOOL_POPULATE_PRIVATE_ALL;
		/* Use a 1-cpu pool for global mempool type. */
		attr.max_nr_cpus = 1;
		break;
	}
	if (!attr.stride)
		attr.stride = RSEQ_MEMPOOL_STRIDE;	/* Use default */
	if (attr.robust_set && !attr.poison_set) {
		attr.poison_set = true;
		attr.poison = DEFAULT_PRIVATE_POISON_VALUE;
	}
	if (item_len > attr.stride || attr.stride < (size_t) rseq_get_page_len() ||
			!is_pow2(attr.stride)) {
		errno = EINVAL;
		return NULL;
	}

	pool = calloc(1, sizeof(struct rseq_mempool));
	if (!pool)
		return NULL;

	memcpy(&pool->attr, &attr, sizeof(attr));
	pthread_mutex_init(&pool->lock, NULL);
	pool->item_len = item_len;
	pool->item_order = order;

	pool->range_list = rseq_mempool_range_create(pool);
	if (!pool->range_list)
		goto error_alloc;

	if (pool_name) {
		pool->name = strdup(pool_name);
		if (!pool->name)
			goto error_alloc;
	}
	return pool;

error_alloc:
	rseq_mempool_destroy(pool);
	errno = ENOMEM;
	return NULL;
}

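/*
 * Sketch: creating a robust per-CPU pool. With robustness enabled and
 * no explicit poison value, rseq_mempool_create() selects
 * DEFAULT_PRIVATE_POISON_VALUE (see above), enabling double-free, leak
 * and poison-corruption reporting:
 *
 *	struct rseq_mempool_attr *attr = rseq_mempool_attr_create();
 *	struct rseq_mempool *pool;
 *
 *	(void) rseq_mempool_attr_set_percpu(attr, RSEQ_MEMPOOL_STRIDE, 0);
 *	(void) rseq_mempool_attr_set_robust(attr);
 *	pool = rseq_mempool_create("robust-pool", item_len, attr);
 *	rseq_mempool_attr_destroy(attr);
 */
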
/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void set_alloc_slot(struct rseq_mempool *pool, struct rseq_mempool_range *range, size_t item_offset)
{
	unsigned long *bitmap = range->alloc_bitmap;
	size_t item_index = item_offset >> pool->item_order;
	unsigned long mask;
	size_t k;

	if (!bitmap)
		return;

	k = item_index / BIT_PER_ULONG;
	mask = 1ULL << (item_index % BIT_PER_ULONG);

	/* Print error if bit is already set. */
	if (bitmap[k] & mask) {
		fprintf(stderr, "%s: Allocator corruption detected for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
			__func__, get_pool_name(pool), pool, item_offset, (void *) __builtin_return_address(0));
		abort();
	}
	bitmap[k] |= mask;
}

static
void __rseq_percpu *__rseq_percpu_malloc(struct rseq_mempool *pool,
		bool zeroed, void *init_ptr, size_t init_len)
{
	struct rseq_mempool_range *range;
	struct free_list_node *node;
	uintptr_t item_offset;
	void __rseq_percpu *addr;

	if (init_len > pool->item_len) {
		errno = EINVAL;
		return NULL;
	}
	pthread_mutex_lock(&pool->lock);
	/* Get first entry from free list. */
	node = pool->free_list_head;
	if (node != NULL) {
		void *range_base, *ptr;

		ptr = __rseq_free_list_to_percpu_ptr(pool, node);
		range_base = (void *) ((uintptr_t) ptr & (~(pool->attr.stride - 1)));
		range = (struct rseq_mempool_range *) (range_base - RANGE_HEADER_OFFSET);
		/* Remove node from free list (update head). */
		pool->free_list_head = node->next;
		item_offset = (uintptr_t) (ptr - range_base);
		rseq_percpu_check_poison_item(pool, range, item_offset);
		addr = __rseq_free_list_to_percpu_ptr(pool, node);
		goto end;
	}
	/*
	 * If the most recent range (first in list) does not have any
	 * room left, create a new range and prepend it to the list
	 * head.
	 */
	range = pool->range_list;
	if (range->next_unused + pool->item_len > pool->attr.stride) {
		range = rseq_mempool_range_create(pool);
		if (!range) {
			pthread_mutex_unlock(&pool->lock);
			errno = ENOMEM;
			return NULL;
		}
		/* Add range to head of list. */
		range->next = pool->range_list;
		pool->range_list = range;
	}
	/* First range in list has room left. */
	item_offset = range->next_unused;
	addr = (void __rseq_percpu *) (range->base + item_offset);
	range->next_unused += pool->item_len;
end:
	set_alloc_slot(pool, range, item_offset);
	pthread_mutex_unlock(&pool->lock);
	if (zeroed)
		rseq_percpu_zero_item(pool, range, item_offset);
	else if (init_ptr) {
		rseq_percpu_init_item(pool, range, item_offset,
				init_ptr, init_len);
	}
	return addr;
}

void __rseq_percpu *rseq_mempool_percpu_malloc(struct rseq_mempool *pool)
{
	return __rseq_percpu_malloc(pool, false, NULL, 0);
}

void __rseq_percpu *rseq_mempool_percpu_zmalloc(struct rseq_mempool *pool)
{
	return __rseq_percpu_malloc(pool, true, NULL, 0);
}

void __rseq_percpu *rseq_mempool_percpu_malloc_init(struct rseq_mempool *pool,
		void *init_ptr, size_t len)
{
	return __rseq_percpu_malloc(pool, false, init_ptr, len);
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void clear_alloc_slot(struct rseq_mempool *pool, struct rseq_mempool_range *range, size_t item_offset)
{
	unsigned long *bitmap = range->alloc_bitmap;
	size_t item_index = item_offset >> pool->item_order;
	unsigned long mask;
	size_t k;

	if (!bitmap)
		return;

	k = item_index / BIT_PER_ULONG;
	mask = 1ULL << (item_index % BIT_PER_ULONG);

	/* Print error if bit is not set. */
	if (!(bitmap[k] & mask)) {
		fprintf(stderr, "%s: Double-free detected for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
			__func__, get_pool_name(pool), pool, item_offset,
			(void *) __builtin_return_address(0));
		abort();
	}
	bitmap[k] &= ~mask;
}

void librseq_mempool_percpu_free(void __rseq_percpu *_ptr, size_t stride)
{
	uintptr_t ptr = (uintptr_t) _ptr;
	void *range_base = (void *) (ptr & (~(stride - 1)));
	struct rseq_mempool_range *range = (struct rseq_mempool_range *) (range_base - RANGE_HEADER_OFFSET);
	struct rseq_mempool *pool = range->pool;
	uintptr_t item_offset = ptr & (stride - 1);
	struct free_list_node *head, *item;

	pthread_mutex_lock(&pool->lock);
	clear_alloc_slot(pool, range, item_offset);
	/* Add ptr to head of free list */
	head = pool->free_list_head;
	if (pool->attr.poison_set)
		rseq_percpu_poison_item(pool, range, item_offset);
	item = __rseq_percpu_to_free_list_ptr(pool, _ptr);
	/*
	 * Setting the next pointer will overwrite the first uintptr_t
	 * poison for either CPU 0 (populate all) or init data (populate
	 * none).
	 */
	item->next = head;
	pool->free_list_head = item;
	pthread_mutex_unlock(&pool->lock);
}

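/*
 * Note: @stride passed to librseq_mempool_percpu_free() must match the
 * stride the item was allocated with, since the range header is
 * recovered by masking the item address with ~(stride - 1). The public
 * rseq_mempool_percpu_free() wrapper in <rseq/mempool.h> is expected to
 * supply the pool's stride (RSEQ_MEMPOOL_STRIDE by default).
 */
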
struct rseq_mempool_set *rseq_mempool_set_create(void)
{
	struct rseq_mempool_set *pool_set;

	pool_set = calloc(1, sizeof(struct rseq_mempool_set));
	if (!pool_set)
		return NULL;
	pthread_mutex_init(&pool_set->lock, NULL);
	return pool_set;
}

int rseq_mempool_set_destroy(struct rseq_mempool_set *pool_set)
{
	int order, ret;

	for (order = POOL_SET_MIN_ENTRY; order < POOL_SET_NR_ENTRIES; order++) {
		struct rseq_mempool *pool = pool_set->entries[order];

		if (!pool)
			continue;
		ret = rseq_mempool_destroy(pool);
		if (ret)
			return ret;
		pool_set->entries[order] = NULL;
	}
	pthread_mutex_destroy(&pool_set->lock);
	free(pool_set);
	return 0;
}

/* Ownership of pool is handed over to pool set on success. */
int rseq_mempool_set_add_pool(struct rseq_mempool_set *pool_set, struct rseq_mempool *pool)
{
	size_t item_order = pool->item_order;
	int ret = 0;

	pthread_mutex_lock(&pool_set->lock);
	if (pool_set->entries[item_order]) {
		errno = EBUSY;
		ret = -1;
		goto end;
	}
	pool_set->entries[pool->item_order] = pool;
end:
	pthread_mutex_unlock(&pool_set->lock);
	return ret;
}

static
void __rseq_percpu *__rseq_mempool_set_malloc(struct rseq_mempool_set *pool_set,
		void *init_ptr, size_t len, bool zeroed)
{
	int order, min_order = POOL_SET_MIN_ENTRY;
	struct rseq_mempool *pool;
	void __rseq_percpu *addr;

	order = rseq_get_count_order_ulong(len);
	if (order > POOL_SET_MIN_ENTRY)
		min_order = order;
again:
	pthread_mutex_lock(&pool_set->lock);
	/* First smallest present pool where @len fits. */
	for (order = min_order; order < POOL_SET_NR_ENTRIES; order++) {
		pool = pool_set->entries[order];

		if (!pool)
			continue;
		if (pool->item_len >= len)
			goto found;
	}
	pool = NULL;
found:
	pthread_mutex_unlock(&pool_set->lock);
	if (pool) {
		addr = __rseq_percpu_malloc(pool, zeroed, init_ptr, len);
		if (addr == NULL && errno == ENOMEM) {
			/*
			 * If the allocation failed, try again with a
			 * larger pool.
			 */
			min_order = order + 1;
			goto again;
		}
	} else {
		/* Not found. */
		errno = ENOMEM;
		addr = NULL;
	}
	return addr;
}

void __rseq_percpu *rseq_mempool_set_percpu_malloc(struct rseq_mempool_set *pool_set, size_t len)
{
	return __rseq_mempool_set_malloc(pool_set, NULL, len, false);
}

void __rseq_percpu *rseq_mempool_set_percpu_zmalloc(struct rseq_mempool_set *pool_set, size_t len)
{
	return __rseq_mempool_set_malloc(pool_set, NULL, len, true);
}

void __rseq_percpu *rseq_mempool_set_percpu_malloc_init(struct rseq_mempool_set *pool_set,
		void *init_ptr, size_t len)
{
	/* Pass zeroed=false so the init content is applied rather than zeroes. */
	return __rseq_mempool_set_malloc(pool_set, init_ptr, len, false);
}

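/*
 * Example (sketch): a pool set used as a simple size-class allocator.
 * Each pool added to the set serves allocations up to its item_len, and
 * a request is routed to the smallest present pool where it fits. Error
 * handling is elided:
 *
 *	struct rseq_mempool_set *set = rseq_mempool_set_create();
 *	struct rseq_mempool_attr *attr = rseq_mempool_attr_create();
 *	void __rseq_percpu *p;
 *	size_t sz;
 *
 *	(void) rseq_mempool_attr_set_percpu(attr, RSEQ_MEMPOOL_STRIDE, 0);
 *	for (sz = 8; sz <= 4096; sz <<= 1)
 *		(void) rseq_mempool_set_add_pool(set,
 *			rseq_mempool_create("set-pool", sz, attr));
 *	rseq_mempool_attr_destroy(attr);
 *
 *	p = rseq_mempool_set_percpu_zmalloc(set, 24);	(lands in the 32-byte pool)
 */
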
struct rseq_mempool_attr *rseq_mempool_attr_create(void)
{
	return calloc(1, sizeof(struct rseq_mempool_attr));
}

void rseq_mempool_attr_destroy(struct rseq_mempool_attr *attr)
{
	free(attr);
}

int rseq_mempool_attr_set_init(struct rseq_mempool_attr *attr,
		int (*init_func)(void *priv, void *addr, size_t len, int cpu),
		void *init_priv)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->init_set = true;
	attr->init_func = init_func;
	attr->init_priv = init_priv;
	return 0;
}

int rseq_mempool_attr_set_robust(struct rseq_mempool_attr *attr)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->robust_set = true;
	return 0;
}

int rseq_mempool_attr_set_percpu(struct rseq_mempool_attr *attr,
		size_t stride, int max_nr_cpus)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->type = MEMPOOL_TYPE_PERCPU;
	attr->stride = stride;
	attr->max_nr_cpus = max_nr_cpus;
	return 0;
}

int rseq_mempool_attr_set_global(struct rseq_mempool_attr *attr,
		size_t stride)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->type = MEMPOOL_TYPE_GLOBAL;
	attr->stride = stride;
	attr->max_nr_cpus = 0;
	return 0;
}

int rseq_mempool_attr_set_max_nr_ranges(struct rseq_mempool_attr *attr,
		unsigned long max_nr_ranges)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->max_nr_ranges = max_nr_ranges;
	return 0;
}

int rseq_mempool_attr_set_poison(struct rseq_mempool_attr *attr,
		uintptr_t poison)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->poison_set = true;
	attr->poison = poison;
	return 0;
}

int rseq_mempool_attr_set_populate_policy(struct rseq_mempool_attr *attr,
		enum rseq_mempool_populate_policy policy)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->populate_policy = policy;
	return 0;
}

*mempool
)
1283 if (!mempool
|| mempool
->attr
.type
!= MEMPOOL_TYPE_PERCPU
) {
1287 return mempool
->attr
.max_nr_cpus
;