1 // SPDX-License-Identifier: MIT
2 // SPDX-FileCopyrightText: 2024 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
3 // SPDX-FileCopyrightText: 2024 Olivier Dion <odion@efficios.com>
5 #include <rseq/mempool.h>
12 #include <rseq/compiler.h>
24 #include "rseq-utils.h"
26 #include <rseq/rseq.h>
29 * rseq-mempool.c: rseq CPU-Local Storage (CLS) memory allocator.
31 * The rseq per-CPU memory allocator allows the application the request
32 * memory pools of CPU-Local memory each of containing objects of a
33 * given size (rounded to next power of 2), reserving a given virtual
34 * address size per CPU, for a given maximum number of CPUs.
36 * The per-CPU memory allocator is analogous to TLS (Thread-Local
37 * Storage) memory: TLS is Thread-Local Storage, whereas the per-CPU
38 * memory allocator provides CPU-Local Storage.
41 #define POOL_SET_NR_ENTRIES RSEQ_BITS_PER_LONG
43 #define POOL_HEADER_NR_PAGES 2
46 * Smallest allocation should hold enough space for a free list pointer.
48 #if RSEQ_BITS_PER_LONG == 64
49 # define POOL_SET_MIN_ENTRY 3 /* Smallest item_len=8 */
51 # define POOL_SET_MIN_ENTRY 2 /* Smallest item_len=4 */
54 #define BIT_PER_ULONG (8 * sizeof(unsigned long))
56 #define MOVE_PAGES_BATCH_SIZE 4096
58 #define RANGE_HEADER_OFFSET sizeof(struct rseq_mempool_range)
60 #if RSEQ_BITS_PER_LONG == 64
61 # define DEFAULT_COW_INIT_POISON_VALUE 0x5555555555555555ULL
63 # define DEFAULT_COW_INIT_POISON_VALUE 0x55555555UL
67 * Define the default COW_ZERO poison value as zero to prevent useless
68 * COW page allocation when writing poison values when freeing items.
70 #define DEFAULT_COW_ZERO_POISON_VALUE 0x0
72 struct free_list_node
;
74 struct free_list_node
{
75 struct free_list_node
*next
;
79 MEMPOOL_TYPE_PERCPU
= 0, /* Default */
80 MEMPOOL_TYPE_GLOBAL
= 1,
83 struct rseq_mempool_attr
{
85 int (*init_func
)(void *priv
, void *addr
, size_t len
, int cpu
);
90 enum mempool_type type
;
94 unsigned long max_nr_ranges
;
99 enum rseq_mempool_populate_policy populate_policy
;
102 struct rseq_mempool_range
;
104 struct rseq_mempool_range
{
105 struct list_head node
; /* Linked list of ranges. */
106 struct rseq_mempool
*pool
; /* Backward reference to container pool. */
109 * Memory layout of a mempool range:
110 * - Canary header page (for detection of destroy-after-fork of
112 * - Header page (contains struct rseq_mempool_range at the
114 * - Base of the per-cpu data, starting with CPU 0.
115 * Aliases with free-list for non-robust COW_ZERO pool.
118 * - CPU max_nr_cpus - 1
119 * - init values (only allocated for COW_INIT pool).
120 * Aliases with free-list for non-robust COW_INIT pool.
121 * - free list (for robust pool).
123 * The free list aliases the CPU 0 memory area for non-robust
124 * COW_ZERO pools. It aliases with init values for non-robust
125 * COW_INIT pools. It is located immediately after the init
126 * values for robust pools.
131 * The init values contains malloc_init/zmalloc values.
132 * Pointer is NULL for RSEQ_MEMPOOL_POPULATE_COW_ZERO.
137 /* Pool range mmap/munmap */
141 size_t allocated_items
;
143 /* Track alloc/free. */
144 unsigned long *alloc_bitmap
;
147 struct rseq_mempool
{
148 struct list_head range_list
; /* Head of ranges linked-list. */
149 unsigned long nr_ranges
;
155 * COW_INIT non-robust pools:
156 * The free list chains freed items on the init
157 * values address range.
159 * COW_ZERO non-robust pools:
160 * The free list chains freed items on the CPU 0
161 * address range. We should rethink this
162 * decision if false sharing between malloc/free
163 * from other CPUs and data accesses from CPU 0
166 * Robust pools: The free list chains freed items in the
167 * address range dedicated for the free list.
169 * This is a NULL-terminated singly-linked list.
171 struct free_list_node
*free_list_head
;
173 /* This lock protects allocation/free within the pool. */
174 pthread_mutex_t lock
;
176 struct rseq_mempool_attr attr
;
181 * Pool set entries are indexed by item_len rounded to the next power of
182 * 2. A pool set can contain NULL pool entries, in which case the next
183 * large enough entry will be used for allocation.
185 struct rseq_mempool_set
{
186 /* This lock protects add vs malloc/zmalloc within the pool set. */
187 pthread_mutex_t lock
;
188 struct rseq_mempool
*entries
[POOL_SET_NR_ENTRIES
];
192 const char *get_pool_name(const struct rseq_mempool
*pool
)
194 return pool
->name
? : "<anonymous>";
198 void *__rseq_pool_range_percpu_ptr(const struct rseq_mempool_range
*range
, int cpu
,
199 uintptr_t item_offset
, size_t stride
)
201 return range
->base
+ (stride
* cpu
) + item_offset
;
205 void *__rseq_pool_range_init_ptr(const struct rseq_mempool_range
*range
,
206 uintptr_t item_offset
)
210 return range
->init
+ item_offset
;
214 void __rseq_percpu
*__rseq_free_list_to_percpu_ptr(const struct rseq_mempool
*pool
,
215 struct free_list_node
*node
)
217 void __rseq_percpu
*p
= (void __rseq_percpu
*) node
;
219 if (pool
->attr
.robust_set
) {
221 p
-= pool
->attr
.max_nr_cpus
* pool
->attr
.stride
;
222 /* Skip init values */
223 if (pool
->attr
.populate_policy
== RSEQ_MEMPOOL_POPULATE_COW_INIT
)
224 p
-= pool
->attr
.stride
;
227 /* COW_INIT free list is in init values */
228 if (pool
->attr
.populate_policy
== RSEQ_MEMPOOL_POPULATE_COW_INIT
)
229 p
-= pool
->attr
.max_nr_cpus
* pool
->attr
.stride
;
235 struct free_list_node
*__rseq_percpu_to_free_list_ptr(const struct rseq_mempool
*pool
,
236 void __rseq_percpu
*p
)
238 if (pool
->attr
.robust_set
) {
240 p
+= pool
->attr
.max_nr_cpus
* pool
->attr
.stride
;
241 /* Skip init values */
242 if (pool
->attr
.populate_policy
== RSEQ_MEMPOOL_POPULATE_COW_INIT
)
243 p
+= pool
->attr
.stride
;
246 /* COW_INIT free list is in init values */
247 if (pool
->attr
.populate_policy
== RSEQ_MEMPOOL_POPULATE_COW_INIT
)
248 p
+= pool
->attr
.max_nr_cpus
* pool
->attr
.stride
;
250 return (struct free_list_node
*) p
;
/*
 * Compare each uintptr_t-sized word of the @item_len bytes at @p
 * against @cmp_value.
 *
 * Returns 0 when all words match, else the (non-zero) difference
 * between the first mismatching word and @cmp_value. When
 * @unexpected_value is non-NULL, it receives the mismatching word.
 *
 * Improvement over the original: pointer arithmetic is done on a
 * char * rather than a void * (arithmetic on void * is a GNU
 * extension, not ISO C), and the first mismatch returns immediately.
 */
static
intptr_t rseq_cmp_item(void *p, size_t item_len, intptr_t cmp_value, intptr_t *unexpected_value)
{
	size_t offset;

	for (offset = 0; offset < item_len; offset += sizeof(uintptr_t)) {
		intptr_t v = *((intptr_t *) ((char *) p + offset));
		intptr_t diff = v - cmp_value;

		if (diff != 0) {
			if (unexpected_value)
				*unexpected_value = v;
			return diff;
		}
	}
	return 0;
}
272 void rseq_percpu_zero_item(struct rseq_mempool
*pool
,
273 struct rseq_mempool_range
*range
, uintptr_t item_offset
)
278 init_p
= __rseq_pool_range_init_ptr(range
, item_offset
);
280 bzero(init_p
, pool
->item_len
);
281 for (i
= 0; i
< pool
->attr
.max_nr_cpus
; i
++) {
282 char *p
= __rseq_pool_range_percpu_ptr(range
, i
,
283 item_offset
, pool
->attr
.stride
);
286 * If item is already zeroed, either because the
287 * init range update has propagated or because the
288 * content is already zeroed (e.g. zero page), don't
289 * write to the page. This eliminates useless COW over
290 * the zero page just for overwriting it with zeroes.
292 * This means zmalloc() in COW_ZERO policy pool do
293 * not trigger COW for CPUs which are not actively
294 * writing to the pool. This is however not the case for
295 * malloc_init() in populate-all pools if it populates
298 if (!rseq_cmp_item(p
, pool
->item_len
, 0, NULL
))
300 bzero(p
, pool
->item_len
);
305 void rseq_percpu_init_item(struct rseq_mempool
*pool
,
306 struct rseq_mempool_range
*range
, uintptr_t item_offset
,
307 void *init_ptr
, size_t init_len
)
312 init_p
= __rseq_pool_range_init_ptr(range
, item_offset
);
314 memcpy(init_p
, init_ptr
, init_len
);
315 for (i
= 0; i
< pool
->attr
.max_nr_cpus
; i
++) {
316 char *p
= __rseq_pool_range_percpu_ptr(range
, i
,
317 item_offset
, pool
->attr
.stride
);
320 * If the update propagated through a shared mapping,
321 * or the item already has the correct content, skip
322 * writing it into the cpu item to eliminate useless
325 if (!memcmp(init_ptr
, p
, init_len
))
327 memcpy(p
, init_ptr
, init_len
);
/*
 * Fill each uintptr_t-sized word of the @item_len bytes at @p with the
 * @poison value.
 *
 * Improvement: pointer arithmetic on char * instead of void *
 * (arithmetic on void * is a GNU extension, not ISO C).
 */
static
void rseq_poison_item(void *p, size_t item_len, uintptr_t poison)
{
	size_t offset;

	for (offset = 0; offset < item_len; offset += sizeof(uintptr_t))
		*((uintptr_t *) ((char *) p + offset)) = poison;
}
341 void rseq_percpu_poison_item(struct rseq_mempool
*pool
,
342 struct rseq_mempool_range
*range
, uintptr_t item_offset
)
344 uintptr_t poison
= pool
->attr
.poison
;
348 init_p
= __rseq_pool_range_init_ptr(range
, item_offset
);
350 rseq_poison_item(init_p
, pool
->item_len
, poison
);
351 for (i
= 0; i
< pool
->attr
.max_nr_cpus
; i
++) {
352 char *p
= __rseq_pool_range_percpu_ptr(range
, i
,
353 item_offset
, pool
->attr
.stride
);
356 * If the update propagated through a shared mapping,
357 * or the item already has the correct content, skip
358 * writing it into the cpu item to eliminate useless
361 * It is recommended to use zero as poison value for
362 * COW_ZERO pools to eliminate COW due to writing
363 * poison to CPU memory still backed by the zero page.
365 if (rseq_cmp_item(p
, pool
->item_len
, poison
, NULL
) == 0)
367 rseq_poison_item(p
, pool
->item_len
, poison
);
/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void rseq_check_poison_item(const struct rseq_mempool *pool, uintptr_t item_offset,
		void *p, size_t item_len, uintptr_t poison)
{
	intptr_t unexpected_value;

	if (rseq_cmp_item(p, item_len, poison, &unexpected_value) == 0)
		return;

	/* A freed item whose poison was overwritten indicates memory corruption. */
	fprintf(stderr, "%s: Poison corruption detected (0x%lx) for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
		__func__, (unsigned long) unexpected_value, get_pool_name(pool), pool, item_offset, (void *) __builtin_return_address(0));
	abort();
}
386 /* Always inline for __builtin_return_address(0). */
387 static inline __attribute__((always_inline
))
388 void rseq_percpu_check_poison_item(const struct rseq_mempool
*pool
,
389 const struct rseq_mempool_range
*range
, uintptr_t item_offset
)
391 uintptr_t poison
= pool
->attr
.poison
;
395 if (!pool
->attr
.robust_set
)
397 init_p
= __rseq_pool_range_init_ptr(range
, item_offset
);
399 rseq_check_poison_item(pool
, item_offset
, init_p
, pool
->item_len
, poison
);
400 for (i
= 0; i
< pool
->attr
.max_nr_cpus
; i
++) {
401 char *p
= __rseq_pool_range_percpu_ptr(range
, i
,
402 item_offset
, pool
->attr
.stride
);
403 rseq_check_poison_item(pool
, item_offset
, p
, pool
->item_len
, poison
);
408 int rseq_mempool_range_init_numa(void *addr
, size_t len
, int cpu
, int numa_flags
)
410 unsigned long nr_pages
, page_len
;
411 int status
[MOVE_PAGES_BATCH_SIZE
];
412 int nodes
[MOVE_PAGES_BATCH_SIZE
];
413 void *pages
[MOVE_PAGES_BATCH_SIZE
];
420 page_len
= rseq_get_page_len();
421 nr_pages
= len
>> rseq_get_count_order_ulong(page_len
);
423 nodes
[0] = numa_node_of_cpu(cpu
);
427 for (size_t k
= 1; k
< RSEQ_ARRAY_SIZE(nodes
); ++k
) {
431 for (unsigned long page
= 0; page
< nr_pages
;) {
433 size_t max_k
= RSEQ_ARRAY_SIZE(pages
);
434 size_t left
= nr_pages
- page
;
440 for (size_t k
= 0; k
< max_k
; ++k
, ++page
) {
441 pages
[k
] = addr
+ (page
* page_len
);
445 ret
= move_pages(0, max_k
, pages
, nodes
, status
, numa_flags
);
451 fprintf(stderr
, "%lu pages were not migrated\n", ret
);
452 for (size_t k
= 0; k
< max_k
; ++k
) {
455 "Error while moving page %p to numa node %d: %u\n",
456 pages
[k
], nodes
[k
], -status
[k
]);
463 int rseq_mempool_range_init_numa(void *addr
__attribute__((unused
)),
464 size_t len
__attribute__((unused
)),
465 int cpu
__attribute__((unused
)),
466 int numa_flags
__attribute__((unused
)))
474 int create_alloc_bitmap(struct rseq_mempool
*pool
, struct rseq_mempool_range
*range
)
478 count
= ((pool
->attr
.stride
>> pool
->item_order
) + BIT_PER_ULONG
- 1) / BIT_PER_ULONG
;
481 * Not being able to create the validation bitmap is an error
482 * that needs to be reported.
484 range
->alloc_bitmap
= calloc(count
, sizeof(unsigned long));
485 if (!range
->alloc_bitmap
)
491 bool percpu_addr_in_pool(const struct rseq_mempool
*pool
, void __rseq_percpu
*_addr
)
493 struct rseq_mempool_range
*range
;
494 void *addr
= (void *) _addr
;
496 list_for_each_entry(range
, &pool
->range_list
, node
) {
497 if (addr
>= range
->base
&& addr
< range
->base
+ range
->next_unused
)
503 /* Always inline for __builtin_return_address(0). */
504 static inline __attribute__((always_inline
))
505 void check_free_list(const struct rseq_mempool
*pool
, bool mapping_accessible
)
507 size_t total_item
= 0, total_never_allocated
= 0, total_freed
= 0,
508 max_list_traversal
= 0, traversal_iteration
= 0;
509 struct rseq_mempool_range
*range
;
511 if (!pool
->attr
.robust_set
|| !mapping_accessible
)
514 list_for_each_entry(range
, &pool
->range_list
, node
) {
515 total_item
+= pool
->attr
.stride
>> pool
->item_order
;
516 total_never_allocated
+= (pool
->attr
.stride
- range
->next_unused
) >> pool
->item_order
;
518 max_list_traversal
= total_item
- total_never_allocated
;
520 for (struct free_list_node
*node
= pool
->free_list_head
, *prev
= NULL
;
525 if (traversal_iteration
>= max_list_traversal
) {
526 fprintf(stderr
, "%s: Corrupted free-list; Possibly infinite loop in pool \"%s\" (%p), caller %p.\n",
527 __func__
, get_pool_name(pool
), pool
, __builtin_return_address(0));
531 /* Node is out of range. */
532 if (!percpu_addr_in_pool(pool
, __rseq_free_list_to_percpu_ptr(pool
, node
))) {
534 fprintf(stderr
, "%s: Corrupted free-list node %p -> [out-of-range %p] in pool \"%s\" (%p), caller %p.\n",
535 __func__
, prev
, node
, get_pool_name(pool
), pool
, __builtin_return_address(0));
537 fprintf(stderr
, "%s: Corrupted free-list node [out-of-range %p] in pool \"%s\" (%p), caller %p.\n",
538 __func__
, node
, get_pool_name(pool
), pool
, __builtin_return_address(0));
542 traversal_iteration
++;
546 if (total_never_allocated
+ total_freed
!= total_item
) {
547 fprintf(stderr
, "%s: Corrupted free-list in pool \"%s\" (%p); total-item: %zu total-never-used: %zu total-freed: %zu, caller %p.\n",
548 __func__
, get_pool_name(pool
), pool
, total_item
, total_never_allocated
, total_freed
, __builtin_return_address(0));
553 /* Always inline for __builtin_return_address(0). */
554 static inline __attribute__((always_inline
))
555 void check_range_poison(const struct rseq_mempool
*pool
,
556 const struct rseq_mempool_range
*range
)
560 for (item_offset
= 0; item_offset
< range
->next_unused
;
561 item_offset
+= pool
->item_len
)
562 rseq_percpu_check_poison_item(pool
, range
, item_offset
);
565 /* Always inline for __builtin_return_address(0). */
566 static inline __attribute__((always_inline
))
567 void check_pool_poison(const struct rseq_mempool
*pool
, bool mapping_accessible
)
569 struct rseq_mempool_range
*range
;
571 if (!pool
->attr
.robust_set
|| !mapping_accessible
)
573 list_for_each_entry(range
, &pool
->range_list
, node
)
574 check_range_poison(pool
, range
);
577 /* Always inline for __builtin_return_address(0). */
578 static inline __attribute__((always_inline
))
579 void destroy_alloc_bitmap(struct rseq_mempool
*pool
, struct rseq_mempool_range
*range
)
581 unsigned long *bitmap
= range
->alloc_bitmap
;
582 size_t count
, total_leaks
= 0;
587 count
= ((pool
->attr
.stride
>> pool
->item_order
) + BIT_PER_ULONG
- 1) / BIT_PER_ULONG
;
589 /* Assert that all items in the pool were freed. */
590 for (size_t k
= 0; k
< count
; ++k
)
591 total_leaks
+= rseq_hweight_ulong(bitmap
[k
]);
593 fprintf(stderr
, "%s: Pool \"%s\" (%p) has %zu leaked items on destroy, caller: %p.\n",
594 __func__
, get_pool_name(pool
), pool
, total_leaks
, (void *) __builtin_return_address(0));
599 range
->alloc_bitmap
= NULL
;
602 /* Always inline for __builtin_return_address(0). */
603 static inline __attribute__((always_inline
))
604 int rseq_mempool_range_destroy(struct rseq_mempool
*pool
,
605 struct rseq_mempool_range
*range
,
606 bool mapping_accessible
)
608 destroy_alloc_bitmap(pool
, range
);
609 if (!mapping_accessible
) {
611 * Only the header pages are populated in the child
614 return munmap(range
->header
, POOL_HEADER_NR_PAGES
* rseq_get_page_len());
616 return munmap(range
->mmap_addr
, range
->mmap_len
);
620 * Allocate a memory mapping aligned on @alignment, with an optional
621 * @pre_header before the mapping.
624 void *aligned_mmap_anonymous(size_t page_size
, size_t len
, size_t alignment
,
625 void **pre_header
, size_t pre_header_len
)
627 size_t minimum_page_count
, page_count
, extra
, total_allocate
= 0;
631 if (len
< page_size
|| alignment
< page_size
||
632 !is_pow2(alignment
) || (len
& (alignment
- 1))) {
636 page_order
= rseq_get_count_order_ulong(page_size
);
637 if (page_order
< 0) {
641 if (pre_header_len
&& (pre_header_len
& (page_size
- 1))) {
646 minimum_page_count
= (pre_header_len
+ len
) >> page_order
;
647 page_count
= (pre_header_len
+ len
+ alignment
- page_size
) >> page_order
;
649 assert(page_count
>= minimum_page_count
);
651 ptr
= mmap(NULL
, page_count
<< page_order
, PROT_READ
| PROT_WRITE
,
652 MAP_ANONYMOUS
| MAP_PRIVATE
, -1, 0);
653 if (ptr
== MAP_FAILED
) {
658 total_allocate
= page_count
<< page_order
;
660 if (!(((uintptr_t) ptr
+ pre_header_len
) & (alignment
- 1))) {
661 /* Pointer is already aligned. ptr points to pre_header. */
665 /* Unmap extra before. */
666 extra
= offset_align((uintptr_t) ptr
+ pre_header_len
, alignment
);
667 assert(!(extra
& (page_size
- 1)));
668 if (munmap(ptr
, extra
)) {
672 total_allocate
-= extra
;
673 ptr
+= extra
; /* ptr points to pre_header */
674 page_count
-= extra
>> page_order
;
676 assert(page_count
>= minimum_page_count
);
678 if (page_count
> minimum_page_count
) {
681 /* Unmap extra after. */
682 extra_ptr
= ptr
+ (minimum_page_count
<< page_order
);
683 extra
= (page_count
- minimum_page_count
) << page_order
;
684 if (munmap(extra_ptr
, extra
)) {
688 total_allocate
-= extra
;
691 assert(!(((uintptr_t)ptr
+ pre_header_len
) & (alignment
- 1)));
692 assert(total_allocate
== len
+ pre_header_len
);
698 ptr
+= pre_header_len
;
704 int rseq_memfd_create_init(const char *poolname
, size_t init_len
)
707 char buf
[249]; /* Limit is 249 bytes. */
711 snprintf(buf
, sizeof(buf
), "%s:rseq-mempool", poolname
);
714 name
= "<anonymous>:rseq-mempool";
717 fd
= memfd_create(name
, MFD_CLOEXEC
);
719 perror("memfd_create");
722 if (ftruncate(fd
, (off_t
) init_len
)) {
/*
 * Close a memfd file descriptor obtained from rseq_memfd_create_init().
 * A negative @fd is a no-op; a close failure is reported but not fatal.
 */
static
void rseq_memfd_close(int fd)
{
	if (fd < 0)
		return;

	if (close(fd))
		perror("close");
}
742 struct rseq_mempool_range
*rseq_mempool_range_create(struct rseq_mempool
*pool
)
744 struct rseq_mempool_range
*range
;
745 unsigned long page_size
;
748 size_t range_len
; /* Range len excludes header. */
752 if (pool
->attr
.max_nr_ranges
&&
753 pool
->nr_ranges
>= pool
->attr
.max_nr_ranges
) {
757 page_size
= rseq_get_page_len();
759 header_len
= POOL_HEADER_NR_PAGES
* page_size
;
760 range_len
= pool
->attr
.stride
* pool
->attr
.max_nr_cpus
;
761 if (pool
->attr
.populate_policy
== RSEQ_MEMPOOL_POPULATE_COW_INIT
)
762 range_len
+= pool
->attr
.stride
; /* init values */
763 if (pool
->attr
.robust_set
)
764 range_len
+= pool
->attr
.stride
; /* dedicated free list */
765 base
= aligned_mmap_anonymous(page_size
, range_len
,
766 pool
->attr
.stride
, &header
, header_len
);
769 range
= (struct rseq_mempool_range
*) (base
- RANGE_HEADER_OFFSET
);
771 range
->header
= header
;
773 range
->mmap_addr
= header
;
774 range
->mmap_len
= header_len
+ range_len
;
776 if (pool
->attr
.populate_policy
== RSEQ_MEMPOOL_POPULATE_COW_INIT
) {
777 range
->init
= base
+ (pool
->attr
.stride
* pool
->attr
.max_nr_cpus
);
778 /* Populate init values pages from memfd */
779 memfd
= rseq_memfd_create_init(pool
->name
, pool
->attr
.stride
);
782 if (mmap(range
->init
, pool
->attr
.stride
, PROT_READ
| PROT_WRITE
,
783 MAP_SHARED
| MAP_FIXED
, memfd
, 0) != (void *) range
->init
)
785 assert(pool
->attr
.type
== MEMPOOL_TYPE_PERCPU
);
787 * Map per-cpu memory as private COW mappings of init values.
792 for (cpu
= 0; cpu
< pool
->attr
.max_nr_cpus
; cpu
++) {
793 void *p
= base
+ (pool
->attr
.stride
* cpu
);
794 size_t len
= pool
->attr
.stride
;
796 if (mmap(p
, len
, PROT_READ
| PROT_WRITE
, MAP_PRIVATE
| MAP_FIXED
,
797 memfd
, 0) != (void *) p
)
802 * The init values shared mapping should not be shared
803 * with the children processes across fork. Prevent the
804 * whole mapping from being used across fork.
806 if (madvise(base
, range_len
, MADV_DONTFORK
))
810 * Write 0x1 in first byte of header first page, which
811 * will be WIPEONFORK (and thus cleared) in children
812 * processes. Used to find out if pool destroy is called
813 * from a child process after fork.
815 *((char *) header
) = 0x1;
816 if (madvise(header
, page_size
, MADV_WIPEONFORK
))
820 * The second header page contains the struct
821 * rseq_mempool_range, which is needed by pool destroy.
822 * Leave this anonymous page populated (COW) in child
825 rseq_memfd_close(memfd
);
829 if (pool
->attr
.robust_set
) {
830 if (create_alloc_bitmap(pool
, range
))
833 if (pool
->attr
.init_set
) {
834 switch (pool
->attr
.type
) {
835 case MEMPOOL_TYPE_GLOBAL
:
836 if (pool
->attr
.init_func(pool
->attr
.init_priv
,
837 base
, pool
->attr
.stride
, -1)) {
841 case MEMPOOL_TYPE_PERCPU
:
844 for (cpu
= 0; cpu
< pool
->attr
.max_nr_cpus
; cpu
++) {
845 if (pool
->attr
.init_func(pool
->attr
.init_priv
,
846 base
+ (pool
->attr
.stride
* cpu
),
847 pool
->attr
.stride
, cpu
)) {
861 rseq_memfd_close(memfd
);
862 (void) rseq_mempool_range_destroy(pool
, range
, true);
867 bool pool_mappings_accessible(struct rseq_mempool
*pool
)
869 struct rseq_mempool_range
*range
;
873 if (pool
->attr
.populate_policy
!= RSEQ_MEMPOOL_POPULATE_COW_INIT
)
875 if (list_empty(&pool
->range_list
))
877 range
= list_first_entry(&pool
->range_list
, struct rseq_mempool_range
, node
);
878 page_size
= rseq_get_page_len();
880 * Header first page is one page before the page containing the
883 addr
= (char *) ((uintptr_t) range
& ~(page_size
- 1)) - page_size
;
885 * Look for 0x1 first byte marker in header first page.
892 int rseq_mempool_destroy(struct rseq_mempool
*pool
)
894 struct rseq_mempool_range
*range
, *tmp_range
;
895 bool mapping_accessible
;
902 * Validate that the pool mappings are accessible before doing
903 * free list/poison validation and unmapping ranges. This allows
904 * calling pool destroy in child process after a fork for COW_INIT
905 * pools to free pool resources.
907 mapping_accessible
= pool_mappings_accessible(pool
);
909 check_free_list(pool
, mapping_accessible
);
910 check_pool_poison(pool
, mapping_accessible
);
912 /* Iteration safe against removal. */
913 list_for_each_entry_safe(range
, tmp_range
, &pool
->range_list
, node
) {
914 list_del(&range
->node
);
915 if (rseq_mempool_range_destroy(pool
, range
, mapping_accessible
)) {
916 /* Keep list coherent in case of partial failure. */
917 list_add(&range
->node
, &pool
->range_list
);
921 pthread_mutex_destroy(&pool
->lock
);
928 struct rseq_mempool
*rseq_mempool_create(const char *pool_name
,
929 size_t item_len
, const struct rseq_mempool_attr
*_attr
)
931 struct rseq_mempool_attr attr
= {};
932 struct rseq_mempool_range
*range
;
933 struct rseq_mempool
*pool
;
936 /* Make sure each item is large enough to contain free list pointers. */
937 if (item_len
< sizeof(void *))
938 item_len
= sizeof(void *);
940 /* Align item_len on next power of two. */
941 order
= rseq_get_count_order_ulong(item_len
);
946 item_len
= 1UL << order
;
949 memcpy(&attr
, _attr
, sizeof(attr
));
952 * Validate that the pool populate policy requested is known.
954 switch (attr
.populate_policy
) {
955 case RSEQ_MEMPOOL_POPULATE_COW_INIT
:
957 case RSEQ_MEMPOOL_POPULATE_COW_ZERO
:
965 case MEMPOOL_TYPE_PERCPU
:
966 if (attr
.max_nr_cpus
< 0) {
970 if (attr
.max_nr_cpus
== 0) {
972 attr
.max_nr_cpus
= rseq_get_max_nr_cpus();
973 if (attr
.max_nr_cpus
== 0) {
979 case MEMPOOL_TYPE_GLOBAL
:
980 /* Override populate policy for global type. */
981 if (attr
.populate_policy
== RSEQ_MEMPOOL_POPULATE_COW_INIT
)
982 attr
.populate_policy
= RSEQ_MEMPOOL_POPULATE_COW_ZERO
;
983 /* Use a 1-cpu pool for global mempool type. */
984 attr
.max_nr_cpus
= 1;
988 attr
.stride
= RSEQ_MEMPOOL_STRIDE
; /* Use default */
989 if (attr
.robust_set
&& !attr
.poison_set
) {
990 attr
.poison_set
= true;
991 if (attr
.populate_policy
== RSEQ_MEMPOOL_POPULATE_COW_INIT
)
992 attr
.poison
= DEFAULT_COW_INIT_POISON_VALUE
;
994 attr
.poison
= DEFAULT_COW_ZERO_POISON_VALUE
;
996 if (item_len
> attr
.stride
|| attr
.stride
< (size_t) rseq_get_page_len() ||
997 !is_pow2(attr
.stride
)) {
1002 pool
= calloc(1, sizeof(struct rseq_mempool
));
1006 memcpy(&pool
->attr
, &attr
, sizeof(attr
));
1007 pthread_mutex_init(&pool
->lock
, NULL
);
1008 pool
->item_len
= item_len
;
1009 pool
->item_order
= order
;
1010 INIT_LIST_HEAD(&pool
->range_list
);
1012 range
= rseq_mempool_range_create(pool
);
1015 list_add(&range
->node
, &pool
->range_list
);
1018 pool
->name
= strdup(pool_name
);
1025 rseq_mempool_destroy(pool
);
1030 /* Always inline for __builtin_return_address(0). */
1031 static inline __attribute__((always_inline
))
1032 void set_alloc_slot(struct rseq_mempool
*pool
, struct rseq_mempool_range
*range
, size_t item_offset
)
1034 unsigned long *bitmap
= range
->alloc_bitmap
;
1035 size_t item_index
= item_offset
>> pool
->item_order
;
1042 k
= item_index
/ BIT_PER_ULONG
;
1043 mask
= 1ULL << (item_index
% BIT_PER_ULONG
);
1045 /* Print error if bit is already set. */
1046 if (bitmap
[k
] & mask
) {
1047 fprintf(stderr
, "%s: Allocator corruption detected for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
1048 __func__
, get_pool_name(pool
), pool
, item_offset
, (void *) __builtin_return_address(0));
1055 void __rseq_percpu
*__rseq_percpu_malloc(struct rseq_mempool
*pool
,
1056 bool zeroed
, void *init_ptr
, size_t init_len
)
1058 struct rseq_mempool_range
*range
;
1059 struct free_list_node
*node
;
1060 uintptr_t item_offset
;
1061 void __rseq_percpu
*addr
;
1063 if (init_len
> pool
->item_len
) {
1067 pthread_mutex_lock(&pool
->lock
);
1068 /* Get first entry from free list. */
1069 node
= pool
->free_list_head
;
1071 void *range_base
, *ptr
;
1073 ptr
= __rseq_free_list_to_percpu_ptr(pool
, node
);
1074 range_base
= (void *) ((uintptr_t) ptr
& (~(pool
->attr
.stride
- 1)));
1075 range
= (struct rseq_mempool_range
*) (range_base
- RANGE_HEADER_OFFSET
);
1076 /* Remove node from free list (update head). */
1077 pool
->free_list_head
= node
->next
;
1078 item_offset
= (uintptr_t) (ptr
- range_base
);
1079 rseq_percpu_check_poison_item(pool
, range
, item_offset
);
1080 addr
= __rseq_free_list_to_percpu_ptr(pool
, node
);
1084 * If there are no ranges, or if the most recent range (first in
1085 * list) does not have any room left, create a new range and
1086 * prepend it to the list head.
1088 if (list_empty(&pool
->range_list
))
1090 range
= list_first_entry(&pool
->range_list
, struct rseq_mempool_range
, node
);
1091 if (range
->next_unused
+ pool
->item_len
> pool
->attr
.stride
)
1096 range
= rseq_mempool_range_create(pool
);
1102 /* Add range to head of list. */
1103 list_add(&range
->node
, &pool
->range_list
);
1105 /* First range in list has room left. */
1106 item_offset
= range
->next_unused
;
1107 addr
= (void __rseq_percpu
*) (range
->base
+ item_offset
);
1108 range
->next_unused
+= pool
->item_len
;
1111 range
->allocated_items
++;
1112 set_alloc_slot(pool
, range
, item_offset
);
1114 pthread_mutex_unlock(&pool
->lock
);
1117 rseq_percpu_zero_item(pool
, range
, item_offset
);
1118 else if (init_ptr
) {
1119 rseq_percpu_init_item(pool
, range
, item_offset
,
1120 init_ptr
, init_len
);
1126 void __rseq_percpu
*rseq_mempool_percpu_malloc(struct rseq_mempool
*pool
)
1128 return __rseq_percpu_malloc(pool
, false, NULL
, 0);
1131 void __rseq_percpu
*rseq_mempool_percpu_zmalloc(struct rseq_mempool
*pool
)
1133 return __rseq_percpu_malloc(pool
, true, NULL
, 0);
1136 void __rseq_percpu
*rseq_mempool_percpu_malloc_init(struct rseq_mempool
*pool
,
1137 void *init_ptr
, size_t len
)
1139 return __rseq_percpu_malloc(pool
, false, init_ptr
, len
);
1142 /* Always inline for __builtin_return_address(0). */
1143 static inline __attribute__((always_inline
))
1144 void clear_alloc_slot(struct rseq_mempool
*pool
, struct rseq_mempool_range
*range
, size_t item_offset
)
1146 unsigned long *bitmap
= range
->alloc_bitmap
;
1147 size_t item_index
= item_offset
>> pool
->item_order
;
1154 k
= item_index
/ BIT_PER_ULONG
;
1155 mask
= 1ULL << (item_index
% BIT_PER_ULONG
);
1157 /* Print error if bit is not set. */
1158 if (!(bitmap
[k
] & mask
)) {
1159 fprintf(stderr
, "%s: Double-free detected for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
1160 __func__
, get_pool_name(pool
), pool
, item_offset
,
1161 (void *) __builtin_return_address(0));
1167 void librseq_mempool_percpu_free(void __rseq_percpu
*_ptr
, size_t stride
)
1169 uintptr_t ptr
= (uintptr_t) _ptr
;
1170 void *range_base
= (void *) (ptr
& (~(stride
- 1)));
1171 struct rseq_mempool_range
*range
= (struct rseq_mempool_range
*) (range_base
- RANGE_HEADER_OFFSET
);
1172 struct rseq_mempool
*pool
= range
->pool
;
1173 uintptr_t item_offset
= ptr
& (stride
- 1);
1174 struct free_list_node
*head
, *item
;
1176 pthread_mutex_lock(&pool
->lock
);
1177 clear_alloc_slot(pool
, range
, item_offset
);
1178 if (!range
->allocated_items
) {
1179 fprintf(stderr
, "%s: Trying to free an item from an empty pool range within pool \"%s\" (%p), item offset: %zu, caller: %p.\n",
1180 __func__
, get_pool_name(pool
), pool
, item_offset
,
1181 (void *) __builtin_return_address(0));
1184 range
->allocated_items
--;
1185 /* Add ptr to head of free list */
1186 head
= pool
->free_list_head
;
1187 if (pool
->attr
.poison_set
)
1188 rseq_percpu_poison_item(pool
, range
, item_offset
);
1189 item
= __rseq_percpu_to_free_list_ptr(pool
, _ptr
);
1191 * Setting the next pointer will overwrite the first uintptr_t
1192 * poison for either CPU 0 (COW_ZERO, non-robust), or init data
1193 * (COW_INIT, non-robust).
1196 pool
->free_list_head
= item
;
1197 pthread_mutex_unlock(&pool
->lock
);
1200 struct rseq_mempool_set
*rseq_mempool_set_create(void)
1202 struct rseq_mempool_set
*pool_set
;
1204 pool_set
= calloc(1, sizeof(struct rseq_mempool_set
));
1207 pthread_mutex_init(&pool_set
->lock
, NULL
);
1211 int rseq_mempool_set_destroy(struct rseq_mempool_set
*pool_set
)
1215 for (order
= POOL_SET_MIN_ENTRY
; order
< POOL_SET_NR_ENTRIES
; order
++) {
1216 struct rseq_mempool
*pool
= pool_set
->entries
[order
];
1220 ret
= rseq_mempool_destroy(pool
);
1223 pool_set
->entries
[order
] = NULL
;
1225 pthread_mutex_destroy(&pool_set
->lock
);
1230 /* Ownership of pool is handed over to pool set on success. */
1231 int rseq_mempool_set_add_pool(struct rseq_mempool_set
*pool_set
, struct rseq_mempool
*pool
)
1233 size_t item_order
= pool
->item_order
;
1236 pthread_mutex_lock(&pool_set
->lock
);
1237 if (pool_set
->entries
[item_order
]) {
1242 pool_set
->entries
[pool
->item_order
] = pool
;
1244 pthread_mutex_unlock(&pool_set
->lock
);
1249 void __rseq_percpu
*__rseq_mempool_set_malloc(struct rseq_mempool_set
*pool_set
,
1250 void *init_ptr
, size_t len
, bool zeroed
)
1252 int order
, min_order
= POOL_SET_MIN_ENTRY
;
1253 struct rseq_mempool
*pool
;
1254 void __rseq_percpu
*addr
;
1256 order
= rseq_get_count_order_ulong(len
);
1257 if (order
> POOL_SET_MIN_ENTRY
)
1260 pthread_mutex_lock(&pool_set
->lock
);
1261 /* First smallest present pool where @len fits. */
1262 for (order
= min_order
; order
< POOL_SET_NR_ENTRIES
; order
++) {
1263 pool
= pool_set
->entries
[order
];
1267 if (pool
->item_len
>= len
)
1272 pthread_mutex_unlock(&pool_set
->lock
);
1274 addr
= __rseq_percpu_malloc(pool
, zeroed
, init_ptr
, len
);
1275 if (addr
== NULL
&& errno
== ENOMEM
) {
1277 * If the allocation failed, try again with a
1280 min_order
= order
+ 1;
1291 void __rseq_percpu
*rseq_mempool_set_percpu_malloc(struct rseq_mempool_set
*pool_set
, size_t len
)
1293 return __rseq_mempool_set_malloc(pool_set
, NULL
, len
, false);
1296 void __rseq_percpu
*rseq_mempool_set_percpu_zmalloc(struct rseq_mempool_set
*pool_set
, size_t len
)
1298 return __rseq_mempool_set_malloc(pool_set
, NULL
, len
, true);
1301 void __rseq_percpu
*rseq_mempool_set_percpu_malloc_init(struct rseq_mempool_set
*pool_set
,
1302 void *init_ptr
, size_t len
)
1304 return __rseq_mempool_set_malloc(pool_set
, init_ptr
, len
, true);
1307 struct rseq_mempool_attr
*rseq_mempool_attr_create(void)
1309 return calloc(1, sizeof(struct rseq_mempool_attr
));
/* Free a pool attribute object. free(NULL) is a no-op, so no guard needed. */
void rseq_mempool_attr_destroy(struct rseq_mempool_attr *attr)
{
	free(attr);
}
1317 int rseq_mempool_attr_set_init(struct rseq_mempool_attr
*attr
,
1318 int (*init_func
)(void *priv
, void *addr
, size_t len
, int cpu
),
1325 attr
->init_set
= true;
1326 attr
->init_func
= init_func
;
1327 attr
->init_priv
= init_priv
;
1328 attr
->populate_policy
= RSEQ_MEMPOOL_POPULATE_COW_INIT
;
1332 int rseq_mempool_attr_set_robust(struct rseq_mempool_attr
*attr
)
1338 attr
->robust_set
= true;
1342 int rseq_mempool_attr_set_percpu(struct rseq_mempool_attr
*attr
,
1343 size_t stride
, int max_nr_cpus
)
1349 attr
->type
= MEMPOOL_TYPE_PERCPU
;
1350 attr
->stride
= stride
;
1351 attr
->max_nr_cpus
= max_nr_cpus
;
1355 int rseq_mempool_attr_set_global(struct rseq_mempool_attr
*attr
,
1362 attr
->type
= MEMPOOL_TYPE_GLOBAL
;
1363 attr
->stride
= stride
;
1364 attr
->max_nr_cpus
= 0;
1368 int rseq_mempool_attr_set_max_nr_ranges(struct rseq_mempool_attr
*attr
,
1369 unsigned long max_nr_ranges
)
1375 attr
->max_nr_ranges
= max_nr_ranges
;
1379 int rseq_mempool_attr_set_poison(struct rseq_mempool_attr
*attr
,
1386 attr
->poison_set
= true;
1387 attr
->poison
= poison
;
1391 int rseq_mempool_attr_set_populate_policy(struct rseq_mempool_attr
*attr
,
1392 enum rseq_mempool_populate_policy policy
)
1398 attr
->populate_policy
= policy
;
1402 int rseq_mempool_get_max_nr_cpus(struct rseq_mempool
*mempool
)
1404 if (!mempool
|| mempool
->attr
.type
!= MEMPOOL_TYPE_PERCPU
) {
1408 return mempool
->attr
.max_nr_cpus
;