// SPDX-License-Identifier: MIT
// SPDX-FileCopyrightText: 2024 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
// SPDX-FileCopyrightText: 2024 Olivier Dion <odion@efficios.com>

#include <rseq/mempool.h>
#include <assert.h>
#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <sys/mman.h>
#include <unistd.h>
#include <rseq/compiler.h>

#ifdef HAVE_LIBNUMA
# include <numa.h>
# include <numaif.h>
#endif

#include "rseq-utils.h"
#include <rseq/rseq.h>
/*
 * rseq-mempool.c: rseq CPU-Local Storage (CLS) memory allocator.
 *
 * The rseq per-CPU memory allocator allows the application to request
 * memory pools of CPU-Local memory, each containing objects of a
 * given size (rounded up to the next power of 2), reserving a given
 * virtual address size per CPU, for a given maximum number of CPUs.
 *
 * The per-CPU memory allocator is analogous to TLS (Thread-Local
 * Storage) memory: TLS is Thread-Local Storage, whereas the per-CPU
 * memory allocator provides CPU-Local Storage.
 */
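/*
 * Minimal usage sketch of the public API declared in <rseq/mempool.h>
 * (illustrative only; "struct example_counter" and the error handling
 * are hypothetical):
 *
 *	struct example_counter { intptr_t count; };
 *	struct rseq_mempool *pool;
 *	struct example_counter __rseq_percpu *c;
 *
 *	pool = rseq_mempool_create("counters", sizeof(struct example_counter), NULL);
 *	if (!pool)
 *		abort();
 *	c = (struct example_counter __rseq_percpu *) rseq_mempool_percpu_zmalloc(pool);
 *	// ... access per-CPU instances with rseq_percpu_ptr(c, cpu) ...
 *	rseq_mempool_percpu_free(c);
 *	rseq_mempool_destroy(pool);
 */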
#define POOL_SET_NR_ENTRIES	RSEQ_BITS_PER_LONG

#define POOL_HEADER_NR_PAGES	2

/*
 * Smallest allocation should hold enough space for a free list pointer.
 */
#if RSEQ_BITS_PER_LONG == 64
# define POOL_SET_MIN_ENTRY	3	/* Smallest item_len=8 */
#else
# define POOL_SET_MIN_ENTRY	2	/* Smallest item_len=4 */
#endif

#define BIT_PER_ULONG		(8 * sizeof(unsigned long))

#define MOVE_PAGES_BATCH_SIZE	4096

#define RANGE_HEADER_OFFSET	sizeof(struct rseq_mempool_range)

#if RSEQ_BITS_PER_LONG == 64
# define DEFAULT_COW_INIT_POISON_VALUE	0x5555555555555555ULL
#else
# define DEFAULT_COW_INIT_POISON_VALUE	0x55555555UL
#endif

/*
 * Define the default COW_ZERO poison value as zero to prevent useless
 * COW page allocation when writing poison values when freeing items.
 */
#define DEFAULT_COW_ZERO_POISON_VALUE	0x0
struct free_list_node;

struct free_list_node {
	struct free_list_node *next;
};

enum mempool_type {
	MEMPOOL_TYPE_PERCPU = 0,	/* Default */
	MEMPOOL_TYPE_GLOBAL = 1,
};

struct rseq_mempool_attr {
	bool init_set;
	int (*init_func)(void *priv, void *addr, size_t len, int cpu);
	void *init_priv;

	bool robust_set;

	enum mempool_type type;
	size_t stride;
	int max_nr_cpus;

	unsigned long max_nr_ranges;

	bool poison_set;
	uintptr_t poison;

	enum rseq_mempool_populate_policy populate_policy;
};
struct rseq_mempool_range;

struct rseq_mempool_range {
	struct rseq_mempool_range *next;	/* Linked list of ranges. */
	struct rseq_mempool *pool;		/* Backward reference to container pool. */

	/*
	 * Memory layout of a mempool range:
	 * - Canary header page (for detection of destroy-after-fork of
	 *   COW_INIT pool),
	 * - Header page (contains struct rseq_mempool_range at the
	 *   very end),
	 * - Base of the per-cpu data, starting with CPU 0.
	 *   Aliases with free-list for non-robust COW_ZERO pool.
	 * - CPU 1,
	 * ...
	 * - CPU max_nr_cpus - 1,
	 * - init values (only allocated for COW_INIT pool).
	 *   Aliases with free-list for non-robust COW_INIT pool.
	 * - free list (for robust pool).
	 *
	 * The free list aliases the CPU 0 memory area for non-robust
	 * COW_ZERO pools. It aliases the init values for non-robust
	 * COW_INIT pools. It is located immediately after the init
	 * values for robust pools.
	 */
	void *header;
	void *base;
	/*
	 * The init values contain malloc_init/zmalloc values.
	 * Pointer is NULL for RSEQ_MEMPOOL_POPULATE_COW_ZERO.
	 */
	void *init;
	size_t next_unused;

	/* Pool range mmap/munmap */
	void *mmap_addr;
	size_t mmap_len;

	size_t allocated_items;

	/* Track alloc/free. */
	unsigned long *alloc_bitmap;
};
struct rseq_mempool {
	/* Head of ranges linked-list. */
	struct rseq_mempool_range *range_list;
	unsigned long nr_ranges;

	size_t item_len;
	int item_order;

	/*
	 * COW_INIT non-robust pools:
	 *	The free list chains freed items on the init
	 *	values address range.
	 *
	 * COW_ZERO non-robust pools:
	 *	The free list chains freed items on the CPU 0
	 *	address range. We should rethink this
	 *	decision if false sharing between malloc/free
	 *	from other CPUs and data accesses from CPU 0
	 *	becomes an issue.
	 *
	 * Robust pools: The free list chains freed items in the
	 *	address range dedicated for the free list.
	 *
	 * This is a NULL-terminated singly-linked list.
	 */
	struct free_list_node *free_list_head;

	/* This lock protects allocation/free within the pool. */
	pthread_mutex_t lock;

	struct rseq_mempool_attr attr;

	char *name;
};
/*
 * Pool set entries are indexed by item_len rounded to the next power of
 * 2. A pool set can contain NULL pool entries, in which case the next
 * large enough entry will be used for allocation.
 */
struct rseq_mempool_set {
	/* This lock protects add vs malloc/zmalloc within the pool set. */
	pthread_mutex_t lock;
	struct rseq_mempool *entries[POOL_SET_NR_ENTRIES];
};
static
const char *get_pool_name(const struct rseq_mempool *pool)
{
	return pool->name ? : "<anonymous>";
}

static
void *__rseq_pool_range_percpu_ptr(const struct rseq_mempool_range *range, int cpu,
		uintptr_t item_offset, size_t stride)
{
	return range->base + (stride * cpu) + item_offset;
}

static
void *__rseq_pool_range_init_ptr(const struct rseq_mempool_range *range,
		uintptr_t item_offset)
{
	if (!range->init)
		return NULL;
	return range->init + item_offset;
}

static
void __rseq_percpu *__rseq_free_list_to_percpu_ptr(const struct rseq_mempool *pool,
		struct free_list_node *node)
{
	void __rseq_percpu *p = (void __rseq_percpu *) node;

	if (pool->attr.robust_set) {
		p -= pool->attr.max_nr_cpus * pool->attr.stride;
		/* Skip init values */
		if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT)
			p -= pool->attr.stride;
	} else {
		/* COW_INIT free list is in init values */
		if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT)
			p -= pool->attr.max_nr_cpus * pool->attr.stride;
	}
	return p;
}

static
struct free_list_node *__rseq_percpu_to_free_list_ptr(const struct rseq_mempool *pool,
		void __rseq_percpu *p)
{
	if (pool->attr.robust_set) {
		p += pool->attr.max_nr_cpus * pool->attr.stride;
		/* Skip init values */
		if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT)
			p += pool->attr.stride;
	} else {
		/* COW_INIT free list is in init values */
		if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT)
			p += pool->attr.max_nr_cpus * pool->attr.stride;
	}
	return (struct free_list_node *) p;
}
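/*
 * Worked example for the two conversion helpers above (illustrative
 * numbers): in a robust COW_INIT pool with stride = 1 MB and
 * max_nr_cpus = 4, the dedicated free-list area starts at base + 5 MB
 * (after the 4 per-CPU areas and the init values area). A free-list
 * node at base + 5 MB + off therefore maps back to the CPU 0 item at
 * base + off by subtracting max_nr_cpus * stride (4 MB) and then one
 * extra stride (1 MB) for the init values.
 */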
static
intptr_t rseq_cmp_item(void *p, size_t item_len, intptr_t cmp_value, intptr_t *unexpected_value)
{
	size_t offset;
	intptr_t res = 0;

	for (offset = 0; offset < item_len; offset += sizeof(uintptr_t)) {
		intptr_t v = *((intptr_t *) (p + offset));

		if ((res = v - cmp_value) != 0) {
			if (unexpected_value)
				*unexpected_value = v;
			break;
		}
	}
	return res;
}

static
void rseq_percpu_zero_item(struct rseq_mempool *pool,
		struct rseq_mempool_range *range, uintptr_t item_offset)
{
	char *init_p = NULL;
	int i;

	init_p = __rseq_pool_range_init_ptr(range, item_offset);
	if (init_p)
		bzero(init_p, pool->item_len);
	for (i = 0; i < pool->attr.max_nr_cpus; i++) {
		char *p = __rseq_pool_range_percpu_ptr(range, i,
				item_offset, pool->attr.stride);

		/*
		 * If the item is already zeroed, either because the
		 * init range update has propagated or because the
		 * content is already zeroed (e.g. zero page), don't
		 * write to the page. This eliminates useless COW over
		 * the zero page just for overwriting it with zeroes.
		 *
		 * This means zmalloc() in a COW_ZERO policy pool does
		 * not trigger COW for CPUs which are not actively
		 * writing to the pool. This is however not the case for
		 * malloc_init() in populate-all pools if it populates
		 * non-zero content.
		 */
		if (!rseq_cmp_item(p, pool->item_len, 0, NULL))
			continue;
		bzero(p, pool->item_len);
	}
}
static
void rseq_percpu_init_item(struct rseq_mempool *pool,
		struct rseq_mempool_range *range, uintptr_t item_offset,
		void *init_ptr, size_t init_len)
{
	char *init_p = NULL;
	int i;

	init_p = __rseq_pool_range_init_ptr(range, item_offset);
	if (init_p)
		memcpy(init_p, init_ptr, init_len);
	for (i = 0; i < pool->attr.max_nr_cpus; i++) {
		char *p = __rseq_pool_range_percpu_ptr(range, i,
				item_offset, pool->attr.stride);

		/*
		 * If the update propagated through a shared mapping,
		 * or the item already has the correct content, skip
		 * writing it into the cpu item to eliminate useless
		 * COW of the page.
		 */
		if (!memcmp(init_ptr, p, init_len))
			continue;
		memcpy(p, init_ptr, init_len);
	}
}

static
void rseq_poison_item(void *p, size_t item_len, uintptr_t poison)
{
	size_t offset;

	for (offset = 0; offset < item_len; offset += sizeof(uintptr_t))
		*((uintptr_t *) (p + offset)) = poison;
}

static
void rseq_percpu_poison_item(struct rseq_mempool *pool,
		struct rseq_mempool_range *range, uintptr_t item_offset)
{
	uintptr_t poison = pool->attr.poison;
	char *init_p = NULL;
	int i;

	init_p = __rseq_pool_range_init_ptr(range, item_offset);
	if (init_p)
		rseq_poison_item(init_p, pool->item_len, poison);
	for (i = 0; i < pool->attr.max_nr_cpus; i++) {
		char *p = __rseq_pool_range_percpu_ptr(range, i,
				item_offset, pool->attr.stride);

		/*
		 * If the update propagated through a shared mapping,
		 * or the item already has the correct content, skip
		 * writing it into the cpu item to eliminate useless
		 * COW of the page.
		 *
		 * It is recommended to use zero as poison value for
		 * COW_ZERO pools to eliminate COW due to writing
		 * poison to CPU memory still backed by the zero page.
		 */
		if (rseq_cmp_item(p, pool->item_len, poison, NULL) == 0)
			continue;
		rseq_poison_item(p, pool->item_len, poison);
	}
}
/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void rseq_check_poison_item(const struct rseq_mempool *pool, uintptr_t item_offset,
		void *p, size_t item_len, uintptr_t poison)
{
	intptr_t unexpected_value;

	if (rseq_cmp_item(p, item_len, poison, &unexpected_value) == 0)
		return;

	fprintf(stderr, "%s: Poison corruption detected (0x%lx) for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
		__func__, (unsigned long) unexpected_value, get_pool_name(pool), pool, item_offset, (void *) __builtin_return_address(0));
	abort();
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void rseq_percpu_check_poison_item(const struct rseq_mempool *pool,
		const struct rseq_mempool_range *range, uintptr_t item_offset)
{
	uintptr_t poison = pool->attr.poison;
	char *init_p = NULL;
	int i;

	if (!pool->attr.robust_set)
		return;
	init_p = __rseq_pool_range_init_ptr(range, item_offset);
	if (init_p)
		rseq_check_poison_item(pool, item_offset, init_p, pool->item_len, poison);
	for (i = 0; i < pool->attr.max_nr_cpus; i++) {
		char *p = __rseq_pool_range_percpu_ptr(range, i,
				item_offset, pool->attr.stride);
		rseq_check_poison_item(pool, item_offset, p, pool->item_len, poison);
	}
}
#ifdef HAVE_LIBNUMA
int rseq_mempool_range_init_numa(void *addr, size_t len, int cpu, int numa_flags)
{
	unsigned long nr_pages, page_len;
	int status[MOVE_PAGES_BATCH_SIZE];
	int nodes[MOVE_PAGES_BATCH_SIZE];
	void *pages[MOVE_PAGES_BATCH_SIZE];
	long ret;

	page_len = rseq_get_page_len();
	nr_pages = len >> rseq_get_count_order_ulong(page_len);

	nodes[0] = numa_node_of_cpu(cpu);
	if (nodes[0] < 0)
		return -1;

	for (size_t k = 1; k < RSEQ_ARRAY_SIZE(nodes); ++k) {
		nodes[k] = nodes[0];
	}

	for (unsigned long page = 0; page < nr_pages;) {

		size_t max_k = RSEQ_ARRAY_SIZE(pages);
		size_t left = nr_pages - page;

		if (left < max_k)
			max_k = left;

		for (size_t k = 0; k < max_k; ++k, ++page) {
			pages[k] = addr + (page * page_len);
		}

		ret = move_pages(0, max_k, pages, nodes, status, numa_flags);
		if (ret < 0)
			return ret;

		if (ret > 0) {
			fprintf(stderr, "%lu pages were not migrated\n", ret);
			for (size_t k = 0; k < max_k; ++k) {
				if (status[k] < 0)
					fprintf(stderr,
						"Error while moving page %p to numa node %d: %u\n",
						pages[k], nodes[k], -status[k]);
			}
		}
	}
	return 0;
}
#else
int rseq_mempool_range_init_numa(void *addr __attribute__((unused)),
		size_t len __attribute__((unused)),
		int cpu __attribute__((unused)),
		int numa_flags __attribute__((unused)))
{
	errno = ENOSYS;
	return -1;
}
#endif
static
int create_alloc_bitmap(struct rseq_mempool *pool, struct rseq_mempool_range *range)
{
	size_t count;

	count = ((pool->attr.stride >> pool->item_order) + BIT_PER_ULONG - 1) / BIT_PER_ULONG;

	/*
	 * Not being able to create the validation bitmap is an error
	 * that needs to be reported.
	 */
	range->alloc_bitmap = calloc(count, sizeof(unsigned long));
	if (!range->alloc_bitmap)
		return -1;
	return 0;
}

static
bool percpu_addr_in_pool(const struct rseq_mempool *pool, void __rseq_percpu *_addr)
{
	struct rseq_mempool_range *range;
	void *addr = (void *) _addr;

	for (range = pool->range_list; range; range = range->next) {
		if (addr >= range->base && addr < range->base + range->next_unused)
			return true;
	}
	return false;
}
/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void check_free_list(const struct rseq_mempool *pool, bool mapping_accessible)
{
	size_t total_item = 0, total_never_allocated = 0, total_freed = 0,
		max_list_traversal = 0, traversal_iteration = 0;
	struct rseq_mempool_range *range;

	if (!pool->attr.robust_set || !mapping_accessible)
		return;

	for (range = pool->range_list; range; range = range->next) {
		total_item += pool->attr.stride >> pool->item_order;
		total_never_allocated += (pool->attr.stride - range->next_unused) >> pool->item_order;
	}
	max_list_traversal = total_item - total_never_allocated;

	for (struct free_list_node *node = pool->free_list_head, *prev = NULL;
	     node;
	     prev = node, node = node->next) {

		if (traversal_iteration >= max_list_traversal) {
			fprintf(stderr, "%s: Corrupted free-list; Possibly infinite loop in pool \"%s\" (%p), caller %p.\n",
				__func__, get_pool_name(pool), pool, __builtin_return_address(0));
			abort();
		}

		/* Node is out of range. */
		if (!percpu_addr_in_pool(pool, __rseq_free_list_to_percpu_ptr(pool, node))) {
			if (prev)
				fprintf(stderr, "%s: Corrupted free-list node %p -> [out-of-range %p] in pool \"%s\" (%p), caller %p.\n",
					__func__, prev, node, get_pool_name(pool), pool, __builtin_return_address(0));
			else
				fprintf(stderr, "%s: Corrupted free-list node [out-of-range %p] in pool \"%s\" (%p), caller %p.\n",
					__func__, node, get_pool_name(pool), pool, __builtin_return_address(0));
			abort();
		}

		traversal_iteration++;
		total_freed++;
	}

	if (total_never_allocated + total_freed != total_item) {
		fprintf(stderr, "%s: Corrupted free-list in pool \"%s\" (%p); total-item: %zu total-never-used: %zu total-freed: %zu, caller %p.\n",
			__func__, get_pool_name(pool), pool, total_item, total_never_allocated, total_freed, __builtin_return_address(0));
		abort();
	}
}
/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void check_range_poison(const struct rseq_mempool *pool,
		const struct rseq_mempool_range *range)
{
	size_t item_offset;

	for (item_offset = 0; item_offset < range->next_unused;
			item_offset += pool->item_len)
		rseq_percpu_check_poison_item(pool, range, item_offset);
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void check_pool_poison(const struct rseq_mempool *pool, bool mapping_accessible)
{
	struct rseq_mempool_range *range;

	if (!pool->attr.robust_set || !mapping_accessible)
		return;
	for (range = pool->range_list; range; range = range->next)
		check_range_poison(pool, range);
}
/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void destroy_alloc_bitmap(struct rseq_mempool *pool, struct rseq_mempool_range *range)
{
	unsigned long *bitmap = range->alloc_bitmap;
	size_t count, total_leaks = 0;

	if (!bitmap)
		return;

	count = ((pool->attr.stride >> pool->item_order) + BIT_PER_ULONG - 1) / BIT_PER_ULONG;

	/* Assert that all items in the pool were freed. */
	for (size_t k = 0; k < count; ++k)
		total_leaks += rseq_hweight_ulong(bitmap[k]);
	if (total_leaks) {
		fprintf(stderr, "%s: Pool \"%s\" (%p) has %zu leaked items on destroy, caller: %p.\n",
			__func__, get_pool_name(pool), pool, total_leaks, (void *) __builtin_return_address(0));
		abort();
	}

	free(bitmap);
	range->alloc_bitmap = NULL;
}
/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
int rseq_mempool_range_destroy(struct rseq_mempool *pool,
		struct rseq_mempool_range *range,
		bool mapping_accessible)
{
	destroy_alloc_bitmap(pool, range);
	if (!mapping_accessible) {
		/*
		 * Only the header pages are populated in the child
		 * process.
		 */
		return munmap(range->header, POOL_HEADER_NR_PAGES * rseq_get_page_len());
	}
	return munmap(range->mmap_addr, range->mmap_len);
}
/*
 * Allocate a memory mapping aligned on @alignment, with an optional
 * @pre_header before the mapping.
 */
static
void *aligned_mmap_anonymous(size_t page_size, size_t len, size_t alignment,
		void **pre_header, size_t pre_header_len)
{
	size_t minimum_page_count, page_count, extra, total_allocate = 0;
	int page_order;
	void *ptr;

	if (len < page_size || alignment < page_size ||
			!is_pow2(alignment) || (len & (alignment - 1))) {
		errno = EINVAL;
		ptr = NULL;
		goto error;
	}
	page_order = rseq_get_count_order_ulong(page_size);
	if (page_order < 0) {
		errno = EINVAL;
		ptr = NULL;
		goto error;
	}
	if (pre_header_len && (pre_header_len & (page_size - 1))) {
		errno = EINVAL;
		ptr = NULL;
		goto error;
	}

	minimum_page_count = (pre_header_len + len) >> page_order;
	page_count = (pre_header_len + len + alignment - page_size) >> page_order;

	assert(page_count >= minimum_page_count);

	ptr = mmap(NULL, page_count << page_order, PROT_READ | PROT_WRITE,
			MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	if (ptr == MAP_FAILED) {
		ptr = NULL;
		goto error;
	}

	total_allocate = page_count << page_order;

	if (!(((uintptr_t) ptr + pre_header_len) & (alignment - 1))) {
		/* Pointer is already aligned. ptr points to pre_header. */
		goto out;
	}

	/* Unmap extra before. */
	extra = offset_align((uintptr_t) ptr + pre_header_len, alignment);
	assert(!(extra & (page_size - 1)));
	if (munmap(ptr, extra)) {
		perror("munmap");
		abort();
	}
	total_allocate -= extra;
	ptr += extra;	/* ptr points to pre_header */
	page_count -= extra >> page_order;
out:
	assert(page_count >= minimum_page_count);

	if (page_count > minimum_page_count) {
		void *extra_ptr;

		/* Unmap extra after. */
		extra_ptr = ptr + (minimum_page_count << page_order);
		extra = (page_count - minimum_page_count) << page_order;
		if (munmap(extra_ptr, extra)) {
			perror("munmap");
			abort();
		}
		total_allocate -= extra;
	}

	assert(!(((uintptr_t)ptr + pre_header_len) & (alignment - 1)));
	assert(total_allocate == len + pre_header_len);

error:
	if (ptr) {
		if (pre_header)
			*pre_header = ptr;
		ptr += pre_header_len;
	}

	return ptr;
}
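/*
 * Worked example for aligned_mmap_anonymous() (illustrative numbers):
 * with page_size = 4 kB, len = alignment = 4 MB and a one-page
 * pre_header, minimum_page_count is 1025 pages while page_count
 * over-allocates 2048 pages. Whatever address mmap() returns, the
 * prefix up to the next 4 MB boundary (minus pre_header_len) is
 * unmapped, then the surplus tail pages are unmapped, leaving exactly
 * pre_header_len + len bytes mapped with ptr + pre_header_len aligned
 * on 4 MB.
 */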
static
int rseq_memfd_create_init(const char *poolname, size_t init_len)
{
	int fd;
	char buf[249];		/* Limit is 249 bytes. */
	const char *name;

	if (poolname) {
		snprintf(buf, sizeof(buf), "%s:rseq-mempool", poolname);
		name = buf;
	} else {
		name = "<anonymous>:rseq-mempool";
	}

	fd = memfd_create(name, MFD_CLOEXEC);
	if (fd < 0) {
		perror("memfd_create");
		goto error;
	}
	if (ftruncate(fd, (off_t) init_len)) {
		if (close(fd))
			perror("close");
		goto error;
	}
	return fd;
error:
	return -1;
}

static
void rseq_memfd_close(int fd)
{
	if (fd < 0)
		return;
	if (close(fd))
		perror("close");
}
static
struct rseq_mempool_range *rseq_mempool_range_create(struct rseq_mempool *pool)
{
	struct rseq_mempool_range *range;
	unsigned long page_size;
	void *header;
	void *base;
	size_t range_len;	/* Range len excludes header. */
	size_t header_len;
	int memfd = -1;

	if (pool->attr.max_nr_ranges &&
			pool->nr_ranges >= pool->attr.max_nr_ranges) {
		errno = ENOMEM;
		return NULL;
	}
	page_size = rseq_get_page_len();

	header_len = POOL_HEADER_NR_PAGES * page_size;
	range_len = pool->attr.stride * pool->attr.max_nr_cpus;
	if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT)
		range_len += pool->attr.stride;	/* init values */
	if (pool->attr.robust_set)
		range_len += pool->attr.stride;	/* dedicated free list */
	base = aligned_mmap_anonymous(page_size, range_len,
			pool->attr.stride, &header, header_len);
	if (!base)
		return NULL;
	range = (struct rseq_mempool_range *) (base - RANGE_HEADER_OFFSET);
	range->pool = pool;
	range->header = header;
	range->base = base;
	range->mmap_addr = header;
	range->mmap_len = header_len + range_len;

	if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT) {
		int cpu;

		range->init = base + (pool->attr.stride * pool->attr.max_nr_cpus);
		/* Populate init values pages from memfd */
		memfd = rseq_memfd_create_init(pool->name, pool->attr.stride);
		if (memfd < 0)
			goto error_alloc;
		if (mmap(range->init, pool->attr.stride, PROT_READ | PROT_WRITE,
				MAP_SHARED | MAP_FIXED, memfd, 0) != (void *) range->init)
			goto error_alloc;
		assert(pool->attr.type == MEMPOOL_TYPE_PERCPU);
		/*
		 * Map per-cpu memory as private COW mappings of init values.
		 */
		for (cpu = 0; cpu < pool->attr.max_nr_cpus; cpu++) {
			void *p = base + (pool->attr.stride * cpu);
			size_t len = pool->attr.stride;

			if (mmap(p, len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FIXED,
					memfd, 0) != (void *) p)
				goto error_alloc;
		}

		/*
		 * The init values shared mapping should not be shared
		 * with the children processes across fork. Prevent the
		 * whole mapping from being used across fork.
		 */
		if (madvise(base, range_len, MADV_DONTFORK))
			goto error_alloc;

		/*
		 * Write 0x1 in first byte of header first page, which
		 * will be WIPEONFORK (and thus cleared) in children
		 * processes. Used to find out if pool destroy is called
		 * from a child process after fork.
		 */
		*((char *) header) = 0x1;
		if (madvise(header, page_size, MADV_WIPEONFORK))
			goto error_alloc;

		/*
		 * The second header page contains the struct
		 * rseq_mempool_range, which is needed by pool destroy.
		 * Leave this anonymous page populated (COW) in child
		 * processes.
		 */
		rseq_memfd_close(memfd);
		memfd = -1;
	}

	if (pool->attr.robust_set) {
		if (create_alloc_bitmap(pool, range))
			goto error_alloc;
	}
	if (pool->attr.init_set) {
		switch (pool->attr.type) {
		case MEMPOOL_TYPE_GLOBAL:
			if (pool->attr.init_func(pool->attr.init_priv,
					base, pool->attr.stride, -1)) {
				goto error_alloc;
			}
			break;
		case MEMPOOL_TYPE_PERCPU:
		{
			int cpu;

			for (cpu = 0; cpu < pool->attr.max_nr_cpus; cpu++) {
				if (pool->attr.init_func(pool->attr.init_priv,
						base + (pool->attr.stride * cpu),
						pool->attr.stride, cpu)) {
					goto error_alloc;
				}
			}
			break;
		}
		}
	}
	pool->nr_ranges++;
	return range;

error_alloc:
	rseq_memfd_close(memfd);
	(void) rseq_mempool_range_destroy(pool, range, true);
	return NULL;
}
static
bool pool_mappings_accessible(struct rseq_mempool *pool)
{
	struct rseq_mempool_range *range;
	size_t page_size;
	char *addr;

	if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_COW_INIT)
		return true;
	range = pool->range_list;
	if (!range)
		return true;
	page_size = rseq_get_page_len();
	/*
	 * Header first page is one page before the page containing the
	 * range header.
	 */
	addr = (char *) ((uintptr_t) range & ~(page_size - 1)) - page_size;
	/*
	 * Look for 0x1 first byte marker in header first page.
	 */
	if (*addr != 0x1)
		return false;
	return true;
}
int rseq_mempool_destroy(struct rseq_mempool *pool)
{
	struct rseq_mempool_range *range, *next_range;
	bool mapping_accessible;
	int ret = 0;

	if (!pool)
		return 0;

	/*
	 * Validate that the pool mappings are accessible before doing
	 * free list/poison validation and unmapping ranges. This allows
	 * calling pool destroy in child process after a fork for COW_INIT
	 * pools to free pool resources.
	 */
	mapping_accessible = pool_mappings_accessible(pool);

	check_free_list(pool, mapping_accessible);
	check_pool_poison(pool, mapping_accessible);

	/* Iteration safe against removal. */
	for (range = pool->range_list; range && (next_range = range->next, 1); range = next_range) {
		if (rseq_mempool_range_destroy(pool, range, mapping_accessible)) {
			ret = -1;
			goto end;
		}
		/* Update list head to keep list coherent in case of partial failure. */
		pool->range_list = next_range;
	}
	pthread_mutex_destroy(&pool->lock);
	free(pool->name);
	free(pool);
end:
	return ret;
}
struct rseq_mempool *rseq_mempool_create(const char *pool_name,
		size_t item_len, const struct rseq_mempool_attr *_attr)
{
	struct rseq_mempool *pool;
	struct rseq_mempool_attr attr = {};
	int order;

	/* Make sure each item is large enough to contain free list pointers. */
	if (item_len < sizeof(void *))
		item_len = sizeof(void *);

	/* Align item_len on next power of two. */
	order = rseq_get_count_order_ulong(item_len);
	if (order < 0) {
		errno = EINVAL;
		return NULL;
	}
	item_len = 1UL << order;

	if (_attr)
		memcpy(&attr, _attr, sizeof(attr));

	/*
	 * Validate that the pool populate policy requested is known.
	 */
	switch (attr.populate_policy) {
	case RSEQ_MEMPOOL_POPULATE_COW_INIT:
		break;
	case RSEQ_MEMPOOL_POPULATE_COW_ZERO:
		break;
	default:
		errno = EINVAL;
		return NULL;
	}

	switch (attr.type) {
	case MEMPOOL_TYPE_PERCPU:
		if (attr.max_nr_cpus < 0) {
			errno = EINVAL;
			return NULL;
		}
		if (attr.max_nr_cpus == 0) {
			/* Auto-detect */
			attr.max_nr_cpus = rseq_get_max_nr_cpus();
			if (attr.max_nr_cpus == 0) {
				errno = EINVAL;
				return NULL;
			}
		}
		break;
	case MEMPOOL_TYPE_GLOBAL:
		/* Override populate policy for global type. */
		if (attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT)
			attr.populate_policy = RSEQ_MEMPOOL_POPULATE_COW_ZERO;
		/* Use a 1-cpu pool for global mempool type. */
		attr.max_nr_cpus = 1;
		break;
	}
	if (!attr.stride)
		attr.stride = RSEQ_MEMPOOL_STRIDE;	/* Use default */
	if (attr.robust_set && !attr.poison_set) {
		attr.poison_set = true;
		if (attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT)
			attr.poison = DEFAULT_COW_INIT_POISON_VALUE;
		else
			attr.poison = DEFAULT_COW_ZERO_POISON_VALUE;
	}
	if (item_len > attr.stride || attr.stride < (size_t) rseq_get_page_len() ||
			!is_pow2(attr.stride)) {
		errno = EINVAL;
		return NULL;
	}

	pool = calloc(1, sizeof(struct rseq_mempool));
	if (!pool)
		return NULL;

	memcpy(&pool->attr, &attr, sizeof(attr));
	pthread_mutex_init(&pool->lock, NULL);
	pool->item_len = item_len;
	pool->item_order = order;

	pool->range_list = rseq_mempool_range_create(pool);
	if (!pool->range_list)
		goto error_pool_destroy;

	if (pool_name) {
		pool->name = strdup(pool_name);
		if (!pool->name)
			goto error_pool_destroy;
	}
	return pool;

error_pool_destroy:
	rseq_mempool_destroy(pool);
	return NULL;
}
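/*
 * Creation sketch with explicit attributes (illustrative; "struct item",
 * the use of the default stride and the error handling are hypothetical):
 *
 *	struct rseq_mempool_attr *attr;
 *	struct rseq_mempool *pool;
 *
 *	attr = rseq_mempool_attr_create();
 *	if (!attr)
 *		abort();
 *	if (rseq_mempool_attr_set_percpu(attr, RSEQ_MEMPOOL_STRIDE, 0) ||
 *			rseq_mempool_attr_set_robust(attr))
 *		abort();
 *	pool = rseq_mempool_create("robust-pool", sizeof(struct item), attr);
 *	rseq_mempool_attr_destroy(attr);
 */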
/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void set_alloc_slot(struct rseq_mempool *pool, struct rseq_mempool_range *range, size_t item_offset)
{
	unsigned long *bitmap = range->alloc_bitmap;
	size_t item_index = item_offset >> pool->item_order;
	unsigned long mask;
	size_t k;

	if (!bitmap)
		return;

	k = item_index / BIT_PER_ULONG;
	mask = 1ULL << (item_index % BIT_PER_ULONG);

	/* Print error if bit is already set. */
	if (bitmap[k] & mask) {
		fprintf(stderr, "%s: Allocator corruption detected for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
			__func__, get_pool_name(pool), pool, item_offset, (void *) __builtin_return_address(0));
		abort();
	}

	bitmap[k] |= mask;
}
static
void __rseq_percpu *__rseq_percpu_malloc(struct rseq_mempool *pool,
		bool zeroed, void *init_ptr, size_t init_len)
{
	struct rseq_mempool_range *range;
	struct free_list_node *node;
	uintptr_t item_offset;
	void __rseq_percpu *addr;

	if (init_len > pool->item_len) {
		errno = EINVAL;
		return NULL;
	}
	pthread_mutex_lock(&pool->lock);
	/* Get first entry from free list. */
	node = pool->free_list_head;
	if (node != NULL) {
		void *range_base, *ptr;

		ptr = __rseq_free_list_to_percpu_ptr(pool, node);
		range_base = (void *) ((uintptr_t) ptr & (~(pool->attr.stride - 1)));
		range = (struct rseq_mempool_range *) (range_base - RANGE_HEADER_OFFSET);
		/* Remove node from free list (update head). */
		pool->free_list_head = node->next;
		item_offset = (uintptr_t) (ptr - range_base);
		rseq_percpu_check_poison_item(pool, range, item_offset);
		addr = __rseq_free_list_to_percpu_ptr(pool, node);
		goto end;
	}
	/*
	 * If the most recent range (first in list) does not have any
	 * room left, create a new range and prepend it to the list
	 * head.
	 */
	range = pool->range_list;
	if (range->next_unused + pool->item_len > pool->attr.stride) {
		range = rseq_mempool_range_create(pool);
		if (!range) {
			pthread_mutex_unlock(&pool->lock);
			return NULL;
		}
		/* Add range to head of list. */
		range->next = pool->range_list;
		pool->range_list = range;
	}
	/* First range in list has room left. */
	item_offset = range->next_unused;
	addr = (void __rseq_percpu *) (range->base + item_offset);
	range->next_unused += pool->item_len;
end:
	range->allocated_items++;
	set_alloc_slot(pool, range, item_offset);
	pthread_mutex_unlock(&pool->lock);

	if (zeroed)
		rseq_percpu_zero_item(pool, range, item_offset);
	else if (init_ptr) {
		rseq_percpu_init_item(pool, range, item_offset,
				init_ptr, init_len);
	}
	return addr;
}

void __rseq_percpu *rseq_mempool_percpu_malloc(struct rseq_mempool *pool)
{
	return __rseq_percpu_malloc(pool, false, NULL, 0);
}

void __rseq_percpu *rseq_mempool_percpu_zmalloc(struct rseq_mempool *pool)
{
	return __rseq_percpu_malloc(pool, true, NULL, 0);
}

void __rseq_percpu *rseq_mempool_percpu_malloc_init(struct rseq_mempool *pool,
		void *init_ptr, size_t len)
{
	return __rseq_percpu_malloc(pool, false, init_ptr, len);
}
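/*
 * Access sketch for an allocated item (illustrative; "struct item" and
 * the cpu index are hypothetical and come from the caller):
 *
 *	struct item __rseq_percpu *it;
 *	struct item *cpu_item;
 *
 *	it = (struct item __rseq_percpu *) rseq_mempool_percpu_zmalloc(pool);
 *	cpu_item = rseq_percpu_ptr(it, cpu);	// instance owned by @cpu
 *	// ... update *cpu_item, e.g. from an rseq critical section ...
 *	rseq_mempool_percpu_free(it);
 */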
/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void clear_alloc_slot(struct rseq_mempool *pool, struct rseq_mempool_range *range, size_t item_offset)
{
	unsigned long *bitmap = range->alloc_bitmap;
	size_t item_index = item_offset >> pool->item_order;
	unsigned long mask;
	size_t k;

	if (!bitmap)
		return;

	k = item_index / BIT_PER_ULONG;
	mask = 1ULL << (item_index % BIT_PER_ULONG);

	/* Print error if bit is not set. */
	if (!(bitmap[k] & mask)) {
		fprintf(stderr, "%s: Double-free detected for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
			__func__, get_pool_name(pool), pool, item_offset,
			(void *) __builtin_return_address(0));
		abort();
	}

	bitmap[k] &= ~mask;
}
void librseq_mempool_percpu_free(void __rseq_percpu *_ptr, size_t stride)
{
	uintptr_t ptr = (uintptr_t) _ptr;
	void *range_base = (void *) (ptr & (~(stride - 1)));
	struct rseq_mempool_range *range = (struct rseq_mempool_range *) (range_base - RANGE_HEADER_OFFSET);
	struct rseq_mempool *pool = range->pool;
	uintptr_t item_offset = ptr & (stride - 1);
	struct free_list_node *head, *item;

	pthread_mutex_lock(&pool->lock);
	clear_alloc_slot(pool, range, item_offset);
	if (!range->allocated_items) {
		fprintf(stderr, "%s: Trying to free an item from an empty pool range within pool \"%s\" (%p), item offset: %zu, caller: %p.\n",
			__func__, get_pool_name(pool), pool, item_offset,
			(void *) __builtin_return_address(0));
		abort();
	}
	range->allocated_items--;
	/* Add ptr to head of free list */
	head = pool->free_list_head;
	if (pool->attr.poison_set)
		rseq_percpu_poison_item(pool, range, item_offset);
	item = __rseq_percpu_to_free_list_ptr(pool, _ptr);
	/*
	 * Setting the next pointer will overwrite the first uintptr_t
	 * poison for either CPU 0 (COW_ZERO, non-robust), or init data
	 * (COW_INIT, non-robust).
	 */
	item->next = head;
	pool->free_list_head = item;
	pthread_mutex_unlock(&pool->lock);
}
struct rseq_mempool_set *rseq_mempool_set_create(void)
{
	struct rseq_mempool_set *pool_set;

	pool_set = calloc(1, sizeof(struct rseq_mempool_set));
	if (!pool_set)
		return NULL;
	pthread_mutex_init(&pool_set->lock, NULL);
	return pool_set;
}

int rseq_mempool_set_destroy(struct rseq_mempool_set *pool_set)
{
	int order, ret;

	for (order = POOL_SET_MIN_ENTRY; order < POOL_SET_NR_ENTRIES; order++) {
		struct rseq_mempool *pool = pool_set->entries[order];

		if (!pool)
			continue;
		ret = rseq_mempool_destroy(pool);
		if (ret)
			return ret;
		pool_set->entries[order] = NULL;
	}
	pthread_mutex_destroy(&pool_set->lock);
	free(pool_set);
	return 0;
}
/* Ownership of pool is handed over to pool set on success. */
int rseq_mempool_set_add_pool(struct rseq_mempool_set *pool_set, struct rseq_mempool *pool)
{
	size_t item_order = pool->item_order;
	int ret = 0;

	pthread_mutex_lock(&pool_set->lock);
	if (pool_set->entries[item_order]) {
		errno = EBUSY;
		ret = -1;
		goto end;
	}
	pool_set->entries[pool->item_order] = pool;
end:
	pthread_mutex_unlock(&pool_set->lock);
	return ret;
}
static
void __rseq_percpu *__rseq_mempool_set_malloc(struct rseq_mempool_set *pool_set,
		void *init_ptr, size_t len, bool zeroed)
{
	int order, min_order = POOL_SET_MIN_ENTRY;
	struct rseq_mempool *pool;
	void __rseq_percpu *addr;

	order = rseq_get_count_order_ulong(len);
	if (order > POOL_SET_MIN_ENTRY)
		min_order = order;
again:
	pthread_mutex_lock(&pool_set->lock);
	/* First smallest present pool where @len fits. */
	for (order = min_order; order < POOL_SET_NR_ENTRIES; order++) {
		pool = pool_set->entries[order];

		if (!pool)
			continue;
		if (pool->item_len >= len)
			goto found;
	}
	pool = NULL;
found:
	pthread_mutex_unlock(&pool_set->lock);
	if (pool) {
		addr = __rseq_percpu_malloc(pool, zeroed, init_ptr, len);
		if (addr == NULL && errno == ENOMEM) {
			/*
			 * If the allocation failed, try again with a
			 * larger pool.
			 */
			min_order = order + 1;
			goto again;
		}
	} else {
		errno = ENOMEM;
		addr = NULL;
	}
	return addr;
}

void __rseq_percpu *rseq_mempool_set_percpu_malloc(struct rseq_mempool_set *pool_set, size_t len)
{
	return __rseq_mempool_set_malloc(pool_set, NULL, len, false);
}

void __rseq_percpu *rseq_mempool_set_percpu_zmalloc(struct rseq_mempool_set *pool_set, size_t len)
{
	return __rseq_mempool_set_malloc(pool_set, NULL, len, true);
}

void __rseq_percpu *rseq_mempool_set_percpu_malloc_init(struct rseq_mempool_set *pool_set,
		void *init_ptr, size_t len)
{
	return __rseq_mempool_set_malloc(pool_set, init_ptr, len, false);
}
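/*
 * Pool set usage sketch (illustrative; pool names, item sizes and the
 * missing error handling are hypothetical). The set picks the smallest
 * registered pool whose item_len fits each request:
 *
 *	struct rseq_mempool_set *set = rseq_mempool_set_create();
 *	void __rseq_percpu *p;
 *
 *	rseq_mempool_set_add_pool(set, rseq_mempool_create("s8", 8, NULL));
 *	rseq_mempool_set_add_pool(set, rseq_mempool_create("s64", 64, NULL));
 *	p = rseq_mempool_set_percpu_zmalloc(set, 24);	// served by the 64-byte pool
 *	// ...
 *	rseq_mempool_set_destroy(set);
 */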
struct rseq_mempool_attr *rseq_mempool_attr_create(void)
{
	return calloc(1, sizeof(struct rseq_mempool_attr));
}

void rseq_mempool_attr_destroy(struct rseq_mempool_attr *attr)
{
	free(attr);
}

int rseq_mempool_attr_set_init(struct rseq_mempool_attr *attr,
		int (*init_func)(void *priv, void *addr, size_t len, int cpu),
		void *init_priv)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->init_set = true;
	attr->init_func = init_func;
	attr->init_priv = init_priv;
	attr->populate_policy = RSEQ_MEMPOOL_POPULATE_COW_INIT;
	return 0;
}

int rseq_mempool_attr_set_robust(struct rseq_mempool_attr *attr)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->robust_set = true;
	return 0;
}

int rseq_mempool_attr_set_percpu(struct rseq_mempool_attr *attr,
		size_t stride, int max_nr_cpus)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->type = MEMPOOL_TYPE_PERCPU;
	attr->stride = stride;
	attr->max_nr_cpus = max_nr_cpus;
	return 0;
}

int rseq_mempool_attr_set_global(struct rseq_mempool_attr *attr,
		size_t stride)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->type = MEMPOOL_TYPE_GLOBAL;
	attr->stride = stride;
	attr->max_nr_cpus = 0;
	return 0;
}

int rseq_mempool_attr_set_max_nr_ranges(struct rseq_mempool_attr *attr,
		unsigned long max_nr_ranges)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->max_nr_ranges = max_nr_ranges;
	return 0;
}

int rseq_mempool_attr_set_poison(struct rseq_mempool_attr *attr,
		uintptr_t poison)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->poison_set = true;
	attr->poison = poison;
	return 0;
}

int rseq_mempool_attr_set_populate_policy(struct rseq_mempool_attr *attr,
		enum rseq_mempool_populate_policy policy)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->populate_policy = policy;
	return 0;
}

int rseq_mempool_get_max_nr_cpus(struct rseq_mempool *mempool)
{
	if (!mempool || mempool->attr.type != MEMPOOL_TYPE_PERCPU) {
		errno = EINVAL;
		return -1;
	}
	return mempool->attr.max_nr_cpus;
}