mempool: Track allocated items per range
src/rseq-mempool.c (librseq.git)
1 // SPDX-License-Identifier: MIT
2 // SPDX-FileCopyrightText: 2024 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
3 // SPDX-FileCopyrightText: 2024 Olivier Dion <odion@efficios.com>
4
5 #include <rseq/mempool.h>
6 #include <sys/mman.h>
7 #include <assert.h>
8 #include <string.h>
9 #include <pthread.h>
10 #include <unistd.h>
11 #include <stdlib.h>
12 #include <rseq/compiler.h>
13 #include <errno.h>
14 #include <stdint.h>
15 #include <stdbool.h>
16 #include <stdio.h>
17 #include <fcntl.h>
18
19 #ifdef HAVE_LIBNUMA
20 # include <numa.h>
21 # include <numaif.h>
22 #endif
23
24 #include "rseq-utils.h"
25 #include <rseq/rseq.h>
26
27 /*
28 * rseq-mempool.c: rseq CPU-Local Storage (CLS) memory allocator.
29 *
 30  * The rseq per-CPU memory allocator allows the application to request
 31  * memory pools of CPU-Local memory, each containing objects of a
 32  * given size (rounded up to the next power of 2), reserving a given
 33  * virtual address size per CPU, for a given maximum number of CPUs.
34 *
 35  * The per-CPU memory allocator is analogous to TLS (Thread-Local
 36  * Storage) memory: whereas TLS provides per-thread storage, the
 37  * per-CPU memory allocator provides CPU-Local Storage.
38 */
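/*
 * Example usage (informal sketch, error handling omitted). The accessor
 * names used below, e.g. rseq_percpu_ptr() and rseq_mempool_percpu_free(),
 * are the public wrappers declared in <rseq/mempool.h>; refer to that
 * header for the exact signatures.
 *
 *	struct counter { intptr_t count; };
 *
 *	struct rseq_mempool_attr *attr = rseq_mempool_attr_create();
 *	struct rseq_mempool *pool;
 *	struct counter __rseq_percpu *c;
 *	int cpu = 0;	// e.g. the current CPU
 *
 *	rseq_mempool_attr_set_percpu(attr, RSEQ_MEMPOOL_STRIDE, 0);
 *	pool = rseq_mempool_create("counters", sizeof(struct counter), attr);
 *	rseq_mempool_attr_destroy(attr);
 *
 *	c = (struct counter __rseq_percpu *) rseq_mempool_percpu_zmalloc(pool);
 *	rseq_percpu_ptr(c, cpu)->count++;	// access a given CPU's copy
 *	rseq_mempool_percpu_free(c);
 *	rseq_mempool_destroy(pool);
 */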
39
40 #define POOL_SET_NR_ENTRIES RSEQ_BITS_PER_LONG
41
42 #define POOL_HEADER_NR_PAGES 2
43
44 /*
45 * Smallest allocation should hold enough space for a free list pointer.
46 */
47 #if RSEQ_BITS_PER_LONG == 64
48 # define POOL_SET_MIN_ENTRY 3 /* Smallest item_len=8 */
49 #else
50 # define POOL_SET_MIN_ENTRY 2 /* Smallest item_len=4 */
51 #endif
52
53 #define BIT_PER_ULONG (8 * sizeof(unsigned long))
54
55 #define MOVE_PAGES_BATCH_SIZE 4096
56
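/*
 * The struct rseq_mempool_range describing a range is stored at the very
 * end of the range's header pages, immediately before the range base.
 */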
57 #define RANGE_HEADER_OFFSET sizeof(struct rseq_mempool_range)
58
59 #if RSEQ_BITS_PER_LONG == 64
60 # define DEFAULT_COW_INIT_POISON_VALUE 0x5555555555555555ULL
61 #else
62 # define DEFAULT_COW_INIT_POISON_VALUE 0x55555555UL
63 #endif
64
65 /*
66 * Define the default COW_ZERO poison value as zero to prevent useless
 67  * COW page allocation when poison values are written as items are freed.
68 */
69 #define DEFAULT_COW_ZERO_POISON_VALUE 0x0
70
71 struct free_list_node;
72
73 struct free_list_node {
74 struct free_list_node *next;
75 };
76
77 enum mempool_type {
78 MEMPOOL_TYPE_PERCPU = 0, /* Default */
79 MEMPOOL_TYPE_GLOBAL = 1,
80 };
81
82 struct rseq_mempool_attr {
83 bool init_set;
84 int (*init_func)(void *priv, void *addr, size_t len, int cpu);
85 void *init_priv;
86
87 bool robust_set;
88
89 enum mempool_type type;
90 size_t stride;
91 int max_nr_cpus;
92
93 unsigned long max_nr_ranges;
94
95 bool poison_set;
96 uintptr_t poison;
97
98 enum rseq_mempool_populate_policy populate_policy;
99 };
100
101 struct rseq_mempool_range;
102
103 struct rseq_mempool_range {
104 struct rseq_mempool_range *next; /* Linked list of ranges. */
105 struct rseq_mempool *pool; /* Backward reference to container pool. */
106
107 /*
108 * Memory layout of a mempool range:
109 * - Canary header page (for detection of destroy-after-fork of
110 * COW_INIT pool),
111 * - Header page (contains struct rseq_mempool_range at the
112 * very end),
113 * - Base of the per-cpu data, starting with CPU 0.
114 * Aliases with free-list for non-robust COW_ZERO pool.
115 * - CPU 1,
116 * ...
117 * - CPU max_nr_cpus - 1
118 * - init values (only allocated for COW_INIT pool).
119 * Aliases with free-list for non-robust COW_INIT pool.
120 * - free list (for robust pool).
121 *
122 * The free list aliases the CPU 0 memory area for non-robust
123 * COW_ZERO pools. It aliases with init values for non-robust
124 * COW_INIT pools. It is located immediately after the init
125 * values for robust pools.
126 */
127 void *header;
128 void *base;
129 /*
 130 	 * The init values area contains the malloc_init/zmalloc initial values.
131 * Pointer is NULL for RSEQ_MEMPOOL_POPULATE_COW_ZERO.
132 */
133 void *init;
134 size_t next_unused;
135
136 /* Pool range mmap/munmap */
137 void *mmap_addr;
138 size_t mmap_len;
139
140 size_t allocated_items;
141
142 /* Track alloc/free. */
143 unsigned long *alloc_bitmap;
144 };
145
146 struct rseq_mempool {
147 /* Head of ranges linked-list. */
148 struct rseq_mempool_range *range_list;
149 unsigned long nr_ranges;
150
151 size_t item_len;
152 int item_order;
153
154 /*
155 * COW_INIT non-robust pools:
156 * The free list chains freed items on the init
157 * values address range.
158 *
159 * COW_ZERO non-robust pools:
160 * The free list chains freed items on the CPU 0
161 * address range. We should rethink this
162 * decision if false sharing between malloc/free
163 * from other CPUs and data accesses from CPU 0
164 * becomes an issue.
165 *
166 * Robust pools: The free list chains freed items in the
167 * address range dedicated for the free list.
168 *
169 * This is a NULL-terminated singly-linked list.
170 */
171 struct free_list_node *free_list_head;
172
173 /* This lock protects allocation/free within the pool. */
174 pthread_mutex_t lock;
175
176 struct rseq_mempool_attr attr;
177 char *name;
178 };
179
180 /*
181 * Pool set entries are indexed by item_len rounded to the next power of
182 * 2. A pool set can contain NULL pool entries, in which case the next
183 * large enough entry will be used for allocation.
184 */
185 struct rseq_mempool_set {
186 /* This lock protects add vs malloc/zmalloc within the pool set. */
187 pthread_mutex_t lock;
188 struct rseq_mempool *entries[POOL_SET_NR_ENTRIES];
189 };
190
191 static
192 const char *get_pool_name(const struct rseq_mempool *pool)
193 {
194 return pool->name ? : "<anonymous>";
195 }
196
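/*
 * Address of the item at @item_offset within the per-CPU area of @cpu:
 * each CPU owns a contiguous area of @stride bytes starting at range->base.
 */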
197 static
198 void *__rseq_pool_range_percpu_ptr(const struct rseq_mempool_range *range, int cpu,
199 uintptr_t item_offset, size_t stride)
200 {
201 return range->base + (stride * cpu) + item_offset;
202 }
203
204 static
205 void *__rseq_pool_range_init_ptr(const struct rseq_mempool_range *range,
206 uintptr_t item_offset)
207 {
208 if (!range->init)
209 return NULL;
210 return range->init + item_offset;
211 }
212
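/*
 * Free-list nodes are not stored at the address handed out to the user:
 * depending on the populate policy and on whether the pool is robust,
 * the free list lives in the CPU 0 area (non-robust COW_ZERO), in the
 * init values area (non-robust COW_INIT), or in a dedicated area at the
 * end of the range (robust pools). The helpers below convert between a
 * free-list node address and the user-visible per-cpu pointer.
 */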
213 static
214 void __rseq_percpu *__rseq_free_list_to_percpu_ptr(const struct rseq_mempool *pool,
215 struct free_list_node *node)
216 {
217 void __rseq_percpu *p = (void __rseq_percpu *) node;
218
219 if (pool->attr.robust_set) {
220 /* Skip cpus. */
221 p -= pool->attr.max_nr_cpus * pool->attr.stride;
222 /* Skip init values */
223 if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT)
224 p -= pool->attr.stride;
225
226 } else {
227 /* COW_INIT free list is in init values */
228 if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT)
229 p -= pool->attr.max_nr_cpus * pool->attr.stride;
230 }
231 return p;
232 }
233
234 static
235 struct free_list_node *__rseq_percpu_to_free_list_ptr(const struct rseq_mempool *pool,
236 void __rseq_percpu *p)
237 {
238 if (pool->attr.robust_set) {
239 /* Skip cpus. */
240 p += pool->attr.max_nr_cpus * pool->attr.stride;
241 /* Skip init values */
242 if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT)
243 p += pool->attr.stride;
244
245 } else {
246 /* COW_INIT free list is in init values */
247 if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT)
248 p += pool->attr.max_nr_cpus * pool->attr.stride;
249 }
250 return (struct free_list_node *) p;
251 }
252
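/*
 * Compare each uintptr_t-sized word of the @item_len bytes at @p against
 * @cmp_value. Return 0 if they all match, else a non-zero difference and,
 * if @unexpected_value is non-NULL, the first mismatching word through it.
 */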
253 static
254 intptr_t rseq_cmp_item(void *p, size_t item_len, intptr_t cmp_value, intptr_t *unexpected_value)
255 {
256 size_t offset;
257 intptr_t res = 0;
258
259 for (offset = 0; offset < item_len; offset += sizeof(uintptr_t)) {
260 intptr_t v = *((intptr_t *) (p + offset));
261
262 if ((res = v - cmp_value) != 0) {
263 if (unexpected_value)
264 *unexpected_value = v;
265 break;
266 }
267 }
268 return res;
269 }
270
271 static
272 void rseq_percpu_zero_item(struct rseq_mempool *pool,
273 struct rseq_mempool_range *range, uintptr_t item_offset)
274 {
275 char *init_p = NULL;
276 int i;
277
278 init_p = __rseq_pool_range_init_ptr(range, item_offset);
279 if (init_p)
280 bzero(init_p, pool->item_len);
281 for (i = 0; i < pool->attr.max_nr_cpus; i++) {
282 char *p = __rseq_pool_range_percpu_ptr(range, i,
283 item_offset, pool->attr.stride);
284
285 /*
286 * If item is already zeroed, either because the
287 * init range update has propagated or because the
288 * content is already zeroed (e.g. zero page), don't
289 * write to the page. This eliminates useless COW over
290 * the zero page just for overwriting it with zeroes.
291 *
 292 		 * This means zmalloc() in COW_ZERO policy pools does
 293 		 * not trigger COW for CPUs which are not actively
294 * writing to the pool. This is however not the case for
295 * malloc_init() in populate-all pools if it populates
296 * non-zero content.
297 */
298 if (!rseq_cmp_item(p, pool->item_len, 0, NULL))
299 continue;
300 bzero(p, pool->item_len);
301 }
302 }
303
304 static
305 void rseq_percpu_init_item(struct rseq_mempool *pool,
306 struct rseq_mempool_range *range, uintptr_t item_offset,
307 void *init_ptr, size_t init_len)
308 {
309 char *init_p = NULL;
310 int i;
311
312 init_p = __rseq_pool_range_init_ptr(range, item_offset);
313 if (init_p)
314 memcpy(init_p, init_ptr, init_len);
315 for (i = 0; i < pool->attr.max_nr_cpus; i++) {
316 char *p = __rseq_pool_range_percpu_ptr(range, i,
317 item_offset, pool->attr.stride);
318
319 /*
320 * If the update propagated through a shared mapping,
321 * or the item already has the correct content, skip
322 * writing it into the cpu item to eliminate useless
323 * COW of the page.
324 */
325 if (!memcmp(init_ptr, p, init_len))
326 continue;
327 memcpy(p, init_ptr, init_len);
328 }
329 }
330
331 static
332 void rseq_poison_item(void *p, size_t item_len, uintptr_t poison)
333 {
334 size_t offset;
335
336 for (offset = 0; offset < item_len; offset += sizeof(uintptr_t))
337 *((uintptr_t *) (p + offset)) = poison;
338 }
339
340 static
341 void rseq_percpu_poison_item(struct rseq_mempool *pool,
342 struct rseq_mempool_range *range, uintptr_t item_offset)
343 {
344 uintptr_t poison = pool->attr.poison;
345 char *init_p = NULL;
346 int i;
347
348 init_p = __rseq_pool_range_init_ptr(range, item_offset);
349 if (init_p)
350 rseq_poison_item(init_p, pool->item_len, poison);
351 for (i = 0; i < pool->attr.max_nr_cpus; i++) {
352 char *p = __rseq_pool_range_percpu_ptr(range, i,
353 item_offset, pool->attr.stride);
354
355 /*
356 * If the update propagated through a shared mapping,
357 * or the item already has the correct content, skip
358 * writing it into the cpu item to eliminate useless
359 * COW of the page.
360 *
361 * It is recommended to use zero as poison value for
362 * COW_ZERO pools to eliminate COW due to writing
363 * poison to CPU memory still backed by the zero page.
364 */
365 if (rseq_cmp_item(p, pool->item_len, poison, NULL) == 0)
366 continue;
367 rseq_poison_item(p, pool->item_len, poison);
368 }
369 }
370
371 /* Always inline for __builtin_return_address(0). */
372 static inline __attribute__((always_inline))
373 void rseq_check_poison_item(const struct rseq_mempool *pool, uintptr_t item_offset,
374 void *p, size_t item_len, uintptr_t poison)
375 {
376 intptr_t unexpected_value;
377
378 if (rseq_cmp_item(p, item_len, poison, &unexpected_value) == 0)
379 return;
380
381 fprintf(stderr, "%s: Poison corruption detected (0x%lx) for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
382 __func__, (unsigned long) unexpected_value, get_pool_name(pool), pool, item_offset, (void *) __builtin_return_address(0));
383 abort();
384 }
385
386 /* Always inline for __builtin_return_address(0). */
387 static inline __attribute__((always_inline))
388 void rseq_percpu_check_poison_item(const struct rseq_mempool *pool,
389 const struct rseq_mempool_range *range, uintptr_t item_offset)
390 {
391 uintptr_t poison = pool->attr.poison;
392 char *init_p;
393 int i;
394
395 if (!pool->attr.robust_set)
396 return;
397 init_p = __rseq_pool_range_init_ptr(range, item_offset);
398 if (init_p)
399 rseq_check_poison_item(pool, item_offset, init_p, pool->item_len, poison);
400 for (i = 0; i < pool->attr.max_nr_cpus; i++) {
401 char *p = __rseq_pool_range_percpu_ptr(range, i,
402 item_offset, pool->attr.stride);
403 rseq_check_poison_item(pool, item_offset, p, pool->item_len, poison);
404 }
405 }
406
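/*
 * Move the pages backing [addr, addr + len) to the NUMA node of @cpu
 * using move_pages(2), in batches of MOVE_PAGES_BATCH_SIZE pages.
 */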
407 #ifdef HAVE_LIBNUMA
408 int rseq_mempool_range_init_numa(void *addr, size_t len, int cpu, int numa_flags)
409 {
410 unsigned long nr_pages, page_len;
411 int status[MOVE_PAGES_BATCH_SIZE];
412 int nodes[MOVE_PAGES_BATCH_SIZE];
413 void *pages[MOVE_PAGES_BATCH_SIZE];
414 long ret;
415
416 if (!numa_flags) {
417 errno = EINVAL;
418 return -1;
419 }
420 page_len = rseq_get_page_len();
421 nr_pages = len >> rseq_get_count_order_ulong(page_len);
422
423 nodes[0] = numa_node_of_cpu(cpu);
424 if (nodes[0] < 0)
425 return -1;
426
427 for (size_t k = 1; k < RSEQ_ARRAY_SIZE(nodes); ++k) {
428 nodes[k] = nodes[0];
429 }
430
431 for (unsigned long page = 0; page < nr_pages;) {
432
433 size_t max_k = RSEQ_ARRAY_SIZE(pages);
434 size_t left = nr_pages - page;
435
436 if (left < max_k) {
437 max_k = left;
438 }
439
440 for (size_t k = 0; k < max_k; ++k, ++page) {
441 pages[k] = addr + (page * page_len);
442 status[k] = -EPERM;
443 }
444
445 ret = move_pages(0, max_k, pages, nodes, status, numa_flags);
446
447 if (ret < 0)
448 return ret;
449
450 if (ret > 0) {
 451 			fprintf(stderr, "%ld pages were not migrated\n", ret);
452 for (size_t k = 0; k < max_k; ++k) {
453 if (status[k] < 0)
454 fprintf(stderr,
 455 					"Error while moving page %p to numa node %d: %d\n",
456 pages[k], nodes[k], -status[k]);
457 }
458 }
459 }
460 return 0;
461 }
462 #else
463 int rseq_mempool_range_init_numa(void *addr __attribute__((unused)),
464 size_t len __attribute__((unused)),
465 int cpu __attribute__((unused)),
466 int numa_flags __attribute__((unused)))
467 {
468 errno = ENOSYS;
469 return -1;
470 }
471 #endif
472
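/*
 * Allocate one bit per item of the range, used by robust pools to detect
 * double-free and to report leaked items on pool destroy.
 */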
473 static
474 int create_alloc_bitmap(struct rseq_mempool *pool, struct rseq_mempool_range *range)
475 {
476 size_t count;
477
478 count = ((pool->attr.stride >> pool->item_order) + BIT_PER_ULONG - 1) / BIT_PER_ULONG;
479
480 /*
481 * Not being able to create the validation bitmap is an error
482 * that needs to be reported.
483 */
484 range->alloc_bitmap = calloc(count, sizeof(unsigned long));
485 if (!range->alloc_bitmap)
486 return -1;
487 return 0;
488 }
489
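/*
 * Return true if @_addr falls within the already-allocated portion
 * (base .. base + next_unused) of one of the pool's ranges.
 */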
490 static
491 bool percpu_addr_in_pool(const struct rseq_mempool *pool, void __rseq_percpu *_addr)
492 {
493 struct rseq_mempool_range *range;
494 void *addr = (void *) _addr;
495
496 for (range = pool->range_list; range; range = range->next) {
497 if (addr >= range->base && addr < range->base + range->next_unused)
498 return true;
499 }
500 return false;
501 }
502
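/*
 * Validate the free list of a robust pool: the traversal is bounded by
 * the number of items ever carved out of the ranges, each node must map
 * back into a pool range, and the accounting must close:
 * never-allocated + freed == total items.
 */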
503 /* Always inline for __builtin_return_address(0). */
504 static inline __attribute__((always_inline))
505 void check_free_list(const struct rseq_mempool *pool, bool mapping_accessible)
506 {
507 size_t total_item = 0, total_never_allocated = 0, total_freed = 0,
508 max_list_traversal = 0, traversal_iteration = 0;
509 struct rseq_mempool_range *range;
510
511 if (!pool->attr.robust_set || !mapping_accessible)
512 return;
513
514 for (range = pool->range_list; range; range = range->next) {
515 total_item += pool->attr.stride >> pool->item_order;
516 total_never_allocated += (pool->attr.stride - range->next_unused) >> pool->item_order;
517 }
518 max_list_traversal = total_item - total_never_allocated;
519
520 for (struct free_list_node *node = pool->free_list_head, *prev = NULL;
521 node;
522 prev = node,
523 node = node->next) {
524
525 if (traversal_iteration >= max_list_traversal) {
526 fprintf(stderr, "%s: Corrupted free-list; Possibly infinite loop in pool \"%s\" (%p), caller %p.\n",
527 __func__, get_pool_name(pool), pool, __builtin_return_address(0));
528 abort();
529 }
530
531 /* Node is out of range. */
532 if (!percpu_addr_in_pool(pool, __rseq_free_list_to_percpu_ptr(pool, node))) {
533 if (prev)
534 fprintf(stderr, "%s: Corrupted free-list node %p -> [out-of-range %p] in pool \"%s\" (%p), caller %p.\n",
535 __func__, prev, node, get_pool_name(pool), pool, __builtin_return_address(0));
536 else
537 fprintf(stderr, "%s: Corrupted free-list node [out-of-range %p] in pool \"%s\" (%p), caller %p.\n",
538 __func__, node, get_pool_name(pool), pool, __builtin_return_address(0));
539 abort();
540 }
541
542 traversal_iteration++;
543 total_freed++;
544 }
545
546 if (total_never_allocated + total_freed != total_item) {
547 fprintf(stderr, "%s: Corrupted free-list in pool \"%s\" (%p); total-item: %zu total-never-used: %zu total-freed: %zu, caller %p.\n",
548 __func__, get_pool_name(pool), pool, total_item, total_never_allocated, total_freed, __builtin_return_address(0));
549 abort();
550 }
551 }
552
553 /* Always inline for __builtin_return_address(0). */
554 static inline __attribute__((always_inline))
555 void check_range_poison(const struct rseq_mempool *pool,
556 const struct rseq_mempool_range *range)
557 {
558 size_t item_offset;
559
560 for (item_offset = 0; item_offset < range->next_unused;
561 item_offset += pool->item_len)
562 rseq_percpu_check_poison_item(pool, range, item_offset);
563 }
564
565 /* Always inline for __builtin_return_address(0). */
566 static inline __attribute__((always_inline))
567 void check_pool_poison(const struct rseq_mempool *pool, bool mapping_accessible)
568 {
569 struct rseq_mempool_range *range;
570
571 if (!pool->attr.robust_set || !mapping_accessible)
572 return;
573 for (range = pool->range_list; range; range = range->next)
574 check_range_poison(pool, range);
575 }
576
577 /* Always inline for __builtin_return_address(0). */
578 static inline __attribute__((always_inline))
579 void destroy_alloc_bitmap(struct rseq_mempool *pool, struct rseq_mempool_range *range)
580 {
581 unsigned long *bitmap = range->alloc_bitmap;
582 size_t count, total_leaks = 0;
583
584 if (!bitmap)
585 return;
586
587 count = ((pool->attr.stride >> pool->item_order) + BIT_PER_ULONG - 1) / BIT_PER_ULONG;
588
589 /* Assert that all items in the pool were freed. */
590 for (size_t k = 0; k < count; ++k)
591 total_leaks += rseq_hweight_ulong(bitmap[k]);
592 if (total_leaks) {
593 fprintf(stderr, "%s: Pool \"%s\" (%p) has %zu leaked items on destroy, caller: %p.\n",
594 __func__, get_pool_name(pool), pool, total_leaks, (void *) __builtin_return_address(0));
595 abort();
596 }
597
598 free(bitmap);
599 range->alloc_bitmap = NULL;
600 }
601
602 /* Always inline for __builtin_return_address(0). */
603 static inline __attribute__((always_inline))
604 int rseq_mempool_range_destroy(struct rseq_mempool *pool,
605 struct rseq_mempool_range *range,
606 bool mapping_accessible)
607 {
608 destroy_alloc_bitmap(pool, range);
609 if (!mapping_accessible) {
610 /*
611 * Only the header pages are populated in the child
612 * process.
613 */
614 return munmap(range->header, POOL_HEADER_NR_PAGES * rseq_get_page_len());
615 }
616 return munmap(range->mmap_addr, range->mmap_len);
617 }
618
619 /*
620 * Allocate a memory mapping aligned on @alignment, with an optional
621 * @pre_header before the mapping.
622 */
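/*
 * Implementation note: the mapping is over-allocated by up to
 * (@alignment - page_size) bytes, then the excess pages before the
 * aligned address and after the end are unmapped, leaving exactly
 * @pre_header_len + @len bytes mapped.
 */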
623 static
624 void *aligned_mmap_anonymous(size_t page_size, size_t len, size_t alignment,
625 void **pre_header, size_t pre_header_len)
626 {
627 size_t minimum_page_count, page_count, extra, total_allocate = 0;
628 int page_order;
629 void *ptr;
630
631 if (len < page_size || alignment < page_size ||
632 !is_pow2(alignment) || (len & (alignment - 1))) {
633 errno = EINVAL;
634 return NULL;
635 }
636 page_order = rseq_get_count_order_ulong(page_size);
637 if (page_order < 0) {
638 errno = EINVAL;
639 return NULL;
640 }
641 if (pre_header_len && (pre_header_len & (page_size - 1))) {
642 errno = EINVAL;
643 return NULL;
644 }
645
646 minimum_page_count = (pre_header_len + len) >> page_order;
647 page_count = (pre_header_len + len + alignment - page_size) >> page_order;
648
649 assert(page_count >= minimum_page_count);
650
651 ptr = mmap(NULL, page_count << page_order, PROT_READ | PROT_WRITE,
652 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
653 if (ptr == MAP_FAILED) {
654 ptr = NULL;
655 goto alloc_error;
656 }
657
658 total_allocate = page_count << page_order;
659
660 if (!(((uintptr_t) ptr + pre_header_len) & (alignment - 1))) {
661 /* Pointer is already aligned. ptr points to pre_header. */
662 goto out;
663 }
664
665 /* Unmap extra before. */
666 extra = offset_align((uintptr_t) ptr + pre_header_len, alignment);
667 assert(!(extra & (page_size - 1)));
668 if (munmap(ptr, extra)) {
669 perror("munmap");
670 abort();
671 }
672 total_allocate -= extra;
673 ptr += extra; /* ptr points to pre_header */
674 page_count -= extra >> page_order;
675 out:
676 assert(page_count >= minimum_page_count);
677
678 if (page_count > minimum_page_count) {
679 void *extra_ptr;
680
681 /* Unmap extra after. */
682 extra_ptr = ptr + (minimum_page_count << page_order);
683 extra = (page_count - minimum_page_count) << page_order;
684 if (munmap(extra_ptr, extra)) {
685 perror("munmap");
686 abort();
687 }
688 total_allocate -= extra;
689 }
690
691 assert(!(((uintptr_t)ptr + pre_header_len) & (alignment - 1)));
692 assert(total_allocate == len + pre_header_len);
693
694 alloc_error:
695 if (ptr) {
696 if (pre_header)
697 *pre_header = ptr;
698 ptr += pre_header_len;
699 }
700 return ptr;
701 }
702
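/*
 * Create the memfd backing the init values area of a COW_INIT range.
 * The per-cpu areas are later mapped as private (COW) views of this file.
 */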
703 static
704 int rseq_memfd_create_init(const char *poolname, size_t init_len)
705 {
706 int fd;
707 char buf[249]; /* Limit is 249 bytes. */
708 const char *name;
709
710 if (poolname) {
711 snprintf(buf, sizeof(buf), "%s:rseq-mempool", poolname);
712 name = buf;
713 } else {
714 name = "<anonymous>:rseq-mempool";
715 }
716
717 fd = memfd_create(name, MFD_CLOEXEC);
718 if (fd < 0) {
719 perror("memfd_create");
720 goto end;
721 }
722 if (ftruncate(fd, (off_t) init_len)) {
723 if (close(fd))
724 perror("close");
725 fd = -1;
726 goto end;
727 }
728 end:
729 return fd;
730 }
731
732 static
733 void rseq_memfd_close(int fd)
734 {
735 if (fd < 0)
736 return;
737 if (close(fd))
738 perror("close");
739 }
740
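/*
 * Create a pool range: reserve the header pages followed by one stride
 * per CPU, plus an init values stride (COW_INIT) and a dedicated
 * free-list stride (robust pools) when needed. For COW_INIT, the init
 * area is a shared mapping of a memfd and each per-cpu area is a private
 * COW mapping of that same file.
 */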
741 static
742 struct rseq_mempool_range *rseq_mempool_range_create(struct rseq_mempool *pool)
743 {
744 struct rseq_mempool_range *range;
745 unsigned long page_size;
746 void *header;
747 void *base;
748 size_t range_len; /* Range len excludes header. */
749 size_t header_len;
750 int memfd = -1;
751
752 if (pool->attr.max_nr_ranges &&
753 pool->nr_ranges >= pool->attr.max_nr_ranges) {
754 errno = ENOMEM;
755 return NULL;
756 }
757 page_size = rseq_get_page_len();
758
759 header_len = POOL_HEADER_NR_PAGES * page_size;
760 range_len = pool->attr.stride * pool->attr.max_nr_cpus;
761 if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT)
762 range_len += pool->attr.stride; /* init values */
763 if (pool->attr.robust_set)
764 range_len += pool->attr.stride; /* dedicated free list */
765 base = aligned_mmap_anonymous(page_size, range_len,
766 pool->attr.stride, &header, header_len);
767 if (!base)
768 return NULL;
769 range = (struct rseq_mempool_range *) (base - RANGE_HEADER_OFFSET);
770 range->pool = pool;
771 range->header = header;
772 range->base = base;
773 range->mmap_addr = header;
774 range->mmap_len = header_len + range_len;
775
776 if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT) {
777 range->init = base + (pool->attr.stride * pool->attr.max_nr_cpus);
778 /* Populate init values pages from memfd */
779 memfd = rseq_memfd_create_init(pool->name, pool->attr.stride);
780 if (memfd < 0)
781 goto error_alloc;
782 if (mmap(range->init, pool->attr.stride, PROT_READ | PROT_WRITE,
783 MAP_SHARED | MAP_FIXED, memfd, 0) != (void *) range->init)
784 goto error_alloc;
785 assert(pool->attr.type == MEMPOOL_TYPE_PERCPU);
786 /*
787 * Map per-cpu memory as private COW mappings of init values.
788 */
789 {
790 int cpu;
791
792 for (cpu = 0; cpu < pool->attr.max_nr_cpus; cpu++) {
793 void *p = base + (pool->attr.stride * cpu);
794 size_t len = pool->attr.stride;
795
796 if (mmap(p, len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FIXED,
797 memfd, 0) != (void *) p)
798 goto error_alloc;
799 }
800 }
801 /*
802 * The init values shared mapping should not be shared
 803 		 * with child processes across fork. Prevent the
804 * whole mapping from being used across fork.
805 */
806 if (madvise(base, range_len, MADV_DONTFORK))
807 goto error_alloc;
808
809 /*
 810 		 * Write 0x1 in the first byte of the first header page,
 811 		 * which is marked WIPEONFORK (and thus cleared) in child
 812 		 * processes. Used to find out if pool destroy is called
813 * from a child process after fork.
814 */
815 *((char *) header) = 0x1;
816 if (madvise(header, page_size, MADV_WIPEONFORK))
817 goto error_alloc;
818
819 /*
820 * The second header page contains the struct
821 * rseq_mempool_range, which is needed by pool destroy.
822 * Leave this anonymous page populated (COW) in child
823 * processes.
824 */
825 rseq_memfd_close(memfd);
826 memfd = -1;
827 }
828
829 if (pool->attr.robust_set) {
830 if (create_alloc_bitmap(pool, range))
831 goto error_alloc;
832 }
833 if (pool->attr.init_set) {
834 switch (pool->attr.type) {
835 case MEMPOOL_TYPE_GLOBAL:
836 if (pool->attr.init_func(pool->attr.init_priv,
837 base, pool->attr.stride, -1)) {
838 goto error_alloc;
839 }
840 break;
841 case MEMPOOL_TYPE_PERCPU:
842 {
843 int cpu;
844 for (cpu = 0; cpu < pool->attr.max_nr_cpus; cpu++) {
845 if (pool->attr.init_func(pool->attr.init_priv,
846 base + (pool->attr.stride * cpu),
847 pool->attr.stride, cpu)) {
848 goto error_alloc;
849 }
850 }
851 break;
852 }
853 default:
854 abort();
855 }
856 }
857 pool->nr_ranges++;
858 return range;
859
860 error_alloc:
861 rseq_memfd_close(memfd);
862 (void) rseq_mempool_range_destroy(pool, range, true);
863 return NULL;
864 }
865
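/*
 * Detect whether the range mappings are still accessible. For COW_INIT
 * pools the data area is marked MADV_DONTFORK and the first header page
 * MADV_WIPEONFORK, so a child process created by fork() sees the 0x1
 * marker byte cleared and must not touch the (unmapped) range data.
 */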
866 static
867 bool pool_mappings_accessible(struct rseq_mempool *pool)
868 {
869 struct rseq_mempool_range *range;
870 size_t page_size;
871 char *addr;
872
873 if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_COW_INIT)
874 return true;
875 range = pool->range_list;
876 if (!range)
877 return true;
878 page_size = rseq_get_page_len();
879 /*
 880 	 * The first header page is located one page before the page
 881 	 * containing the range structure.
882 */
883 addr = (char *) ((uintptr_t) range & ~(page_size - 1)) - page_size;
884 /*
 885 	 * Look for the 0x1 first-byte marker in the first header page.
886 */
887 if (*addr != 0x1)
888 return false;
889 return true;
890 }
891
892 int rseq_mempool_destroy(struct rseq_mempool *pool)
893 {
894 struct rseq_mempool_range *range, *next_range;
895 bool mapping_accessible;
896 int ret = 0;
897
898 if (!pool)
899 return 0;
900
901 /*
902 * Validate that the pool mappings are accessible before doing
903 * free list/poison validation and unmapping ranges. This allows
904 * calling pool destroy in child process after a fork for COW_INIT
905 * pools to free pool resources.
906 */
907 mapping_accessible = pool_mappings_accessible(pool);
908
909 check_free_list(pool, mapping_accessible);
910 check_pool_poison(pool, mapping_accessible);
911
912 /* Iteration safe against removal. */
913 for (range = pool->range_list; range && (next_range = range->next, 1); range = next_range) {
 914 		if ((ret = rseq_mempool_range_destroy(pool, range, mapping_accessible)))
 915 			goto end;
916 /* Update list head to keep list coherent in case of partial failure. */
917 pool->range_list = next_range;
918 }
919 pthread_mutex_destroy(&pool->lock);
920 free(pool->name);
921 free(pool);
922 end:
923 return ret;
924 }
925
926 struct rseq_mempool *rseq_mempool_create(const char *pool_name,
927 size_t item_len, const struct rseq_mempool_attr *_attr)
928 {
929 struct rseq_mempool *pool;
930 struct rseq_mempool_attr attr = {};
931 int order;
932
933 /* Make sure each item is large enough to contain free list pointers. */
934 if (item_len < sizeof(void *))
935 item_len = sizeof(void *);
936
937 /* Align item_len on next power of two. */
938 order = rseq_get_count_order_ulong(item_len);
939 if (order < 0) {
940 errno = EINVAL;
941 return NULL;
942 }
943 item_len = 1UL << order;
944
945 if (_attr)
946 memcpy(&attr, _attr, sizeof(attr));
947
948 /*
949 * Validate that the pool populate policy requested is known.
950 */
951 switch (attr.populate_policy) {
952 case RSEQ_MEMPOOL_POPULATE_COW_INIT:
953 break;
954 case RSEQ_MEMPOOL_POPULATE_COW_ZERO:
955 break;
956 default:
957 errno = EINVAL;
958 return NULL;
959 }
960
961 switch (attr.type) {
962 case MEMPOOL_TYPE_PERCPU:
963 if (attr.max_nr_cpus < 0) {
964 errno = EINVAL;
965 return NULL;
966 }
967 if (attr.max_nr_cpus == 0) {
968 /* Auto-detect */
969 attr.max_nr_cpus = rseq_get_max_nr_cpus();
970 if (attr.max_nr_cpus == 0) {
971 errno = EINVAL;
972 return NULL;
973 }
974 }
975 break;
976 case MEMPOOL_TYPE_GLOBAL:
977 /* Override populate policy for global type. */
978 if (attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT)
979 attr.populate_policy = RSEQ_MEMPOOL_POPULATE_COW_ZERO;
980 /* Use a 1-cpu pool for global mempool type. */
981 attr.max_nr_cpus = 1;
982 break;
983 }
984 if (!attr.stride)
985 attr.stride = RSEQ_MEMPOOL_STRIDE; /* Use default */
986 if (attr.robust_set && !attr.poison_set) {
987 attr.poison_set = true;
988 if (attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT)
989 attr.poison = DEFAULT_COW_INIT_POISON_VALUE;
990 else
991 attr.poison = DEFAULT_COW_ZERO_POISON_VALUE;
992 }
993 if (item_len > attr.stride || attr.stride < (size_t) rseq_get_page_len() ||
994 !is_pow2(attr.stride)) {
995 errno = EINVAL;
996 return NULL;
997 }
998
999 pool = calloc(1, sizeof(struct rseq_mempool));
1000 if (!pool)
1001 return NULL;
1002
1003 memcpy(&pool->attr, &attr, sizeof(attr));
1004 pthread_mutex_init(&pool->lock, NULL);
1005 pool->item_len = item_len;
1006 pool->item_order = order;
1007
1008 pool->range_list = rseq_mempool_range_create(pool);
1009 if (!pool->range_list)
1010 goto error_alloc;
1011
1012 if (pool_name) {
1013 pool->name = strdup(pool_name);
1014 if (!pool->name)
1015 goto error_alloc;
1016 }
1017 return pool;
1018
1019 error_alloc:
1020 rseq_mempool_destroy(pool);
1021 errno = ENOMEM;
1022 return NULL;
1023 }
1024
1025 /* Always inline for __builtin_return_address(0). */
1026 static inline __attribute__((always_inline))
1027 void set_alloc_slot(struct rseq_mempool *pool, struct rseq_mempool_range *range, size_t item_offset)
1028 {
1029 unsigned long *bitmap = range->alloc_bitmap;
1030 size_t item_index = item_offset >> pool->item_order;
1031 unsigned long mask;
1032 size_t k;
1033
1034 if (!bitmap)
1035 return;
1036
1037 k = item_index / BIT_PER_ULONG;
1038 mask = 1ULL << (item_index % BIT_PER_ULONG);
1039
1040 /* Print error if bit is already set. */
1041 if (bitmap[k] & mask) {
1042 fprintf(stderr, "%s: Allocator corruption detected for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
1043 __func__, get_pool_name(pool), pool, item_offset, (void *) __builtin_return_address(0));
1044 abort();
1045 }
1046 bitmap[k] |= mask;
1047 }
1048
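/*
 * Allocation path: reuse the head of the free list if the pool has one,
 * otherwise carve a new item out of the first range's unused space,
 * creating a new range when the current one is exhausted.
 */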
1049 static
1050 void __rseq_percpu *__rseq_percpu_malloc(struct rseq_mempool *pool,
1051 bool zeroed, void *init_ptr, size_t init_len)
1052 {
1053 struct rseq_mempool_range *range;
1054 struct free_list_node *node;
1055 uintptr_t item_offset;
1056 void __rseq_percpu *addr;
1057
1058 if (init_len > pool->item_len) {
1059 errno = EINVAL;
1060 return NULL;
1061 }
1062 pthread_mutex_lock(&pool->lock);
1063 /* Get first entry from free list. */
1064 node = pool->free_list_head;
1065 if (node != NULL) {
1066 void *range_base, *ptr;
1067
1068 ptr = __rseq_free_list_to_percpu_ptr(pool, node);
1069 range_base = (void *) ((uintptr_t) ptr & (~(pool->attr.stride - 1)));
1070 range = (struct rseq_mempool_range *) (range_base - RANGE_HEADER_OFFSET);
1071 /* Remove node from free list (update head). */
1072 pool->free_list_head = node->next;
1073 item_offset = (uintptr_t) (ptr - range_base);
1074 rseq_percpu_check_poison_item(pool, range, item_offset);
1075 addr = __rseq_free_list_to_percpu_ptr(pool, node);
1076 goto end;
1077 }
1078 /*
1079 * If the most recent range (first in list) does not have any
1080 * room left, create a new range and prepend it to the list
1081 * head.
1082 */
1083 range = pool->range_list;
1084 if (range->next_unused + pool->item_len > pool->attr.stride) {
1085 range = rseq_mempool_range_create(pool);
1086 if (!range) {
1087 errno = ENOMEM;
1088 addr = NULL;
1089 goto end;
1090 }
1091 /* Add range to head of list. */
1092 range->next = pool->range_list;
1093 pool->range_list = range;
1094 }
1095 /* First range in list has room left. */
1096 item_offset = range->next_unused;
1097 addr = (void __rseq_percpu *) (range->base + item_offset);
1098 range->next_unused += pool->item_len;
1099 end:
1100 if (addr) {
1101 range->allocated_items++;
1102 set_alloc_slot(pool, range, item_offset);
1103 }
1104 pthread_mutex_unlock(&pool->lock);
1105 if (addr) {
1106 if (zeroed)
1107 rseq_percpu_zero_item(pool, range, item_offset);
1108 else if (init_ptr) {
1109 rseq_percpu_init_item(pool, range, item_offset,
1110 init_ptr, init_len);
1111 }
1112 }
1113 return addr;
1114 }
1115
1116 void __rseq_percpu *rseq_mempool_percpu_malloc(struct rseq_mempool *pool)
1117 {
1118 return __rseq_percpu_malloc(pool, false, NULL, 0);
1119 }
1120
1121 void __rseq_percpu *rseq_mempool_percpu_zmalloc(struct rseq_mempool *pool)
1122 {
1123 return __rseq_percpu_malloc(pool, true, NULL, 0);
1124 }
1125
1126 void __rseq_percpu *rseq_mempool_percpu_malloc_init(struct rseq_mempool *pool,
1127 void *init_ptr, size_t len)
1128 {
1129 return __rseq_percpu_malloc(pool, false, init_ptr, len);
1130 }
1131
1132 /* Always inline for __builtin_return_address(0). */
1133 static inline __attribute__((always_inline))
1134 void clear_alloc_slot(struct rseq_mempool *pool, struct rseq_mempool_range *range, size_t item_offset)
1135 {
1136 unsigned long *bitmap = range->alloc_bitmap;
1137 size_t item_index = item_offset >> pool->item_order;
1138 unsigned long mask;
1139 size_t k;
1140
1141 if (!bitmap)
1142 return;
1143
1144 k = item_index / BIT_PER_ULONG;
1145 mask = 1ULL << (item_index % BIT_PER_ULONG);
1146
1147 /* Print error if bit is not set. */
1148 if (!(bitmap[k] & mask)) {
1149 fprintf(stderr, "%s: Double-free detected for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
1150 __func__, get_pool_name(pool), pool, item_offset,
1151 (void *) __builtin_return_address(0));
1152 abort();
1153 }
1154 bitmap[k] &= ~mask;
1155 }
1156
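/*
 * Free path: validate the slot in the allocation bitmap (robust pools),
 * update the per-range accounting, poison the item if requested, and
 * push it onto the pool free list.
 */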
1157 void librseq_mempool_percpu_free(void __rseq_percpu *_ptr, size_t stride)
1158 {
1159 uintptr_t ptr = (uintptr_t) _ptr;
1160 void *range_base = (void *) (ptr & (~(stride - 1)));
1161 struct rseq_mempool_range *range = (struct rseq_mempool_range *) (range_base - RANGE_HEADER_OFFSET);
1162 struct rseq_mempool *pool = range->pool;
1163 uintptr_t item_offset = ptr & (stride - 1);
1164 struct free_list_node *head, *item;
1165
1166 pthread_mutex_lock(&pool->lock);
1167 clear_alloc_slot(pool, range, item_offset);
1168 if (!range->allocated_items) {
1169 fprintf(stderr, "%s: Trying to free an item from an empty pool range within pool \"%s\" (%p), item offset: %zu, caller: %p.\n",
1170 __func__, get_pool_name(pool), pool, item_offset,
1171 (void *) __builtin_return_address(0));
1172 abort();
1173 }
1174 range->allocated_items--;
1175 /* Add ptr to head of free list */
1176 head = pool->free_list_head;
1177 if (pool->attr.poison_set)
1178 rseq_percpu_poison_item(pool, range, item_offset);
1179 item = __rseq_percpu_to_free_list_ptr(pool, _ptr);
1180 /*
1181 * Setting the next pointer will overwrite the first uintptr_t
1182 * poison for either CPU 0 (COW_ZERO, non-robust), or init data
1183 * (COW_INIT, non-robust).
1184 */
1185 item->next = head;
1186 pool->free_list_head = item;
1187 pthread_mutex_unlock(&pool->lock);
1188 }
1189
1190 struct rseq_mempool_set *rseq_mempool_set_create(void)
1191 {
1192 struct rseq_mempool_set *pool_set;
1193
1194 pool_set = calloc(1, sizeof(struct rseq_mempool_set));
1195 if (!pool_set)
1196 return NULL;
1197 pthread_mutex_init(&pool_set->lock, NULL);
1198 return pool_set;
1199 }
1200
1201 int rseq_mempool_set_destroy(struct rseq_mempool_set *pool_set)
1202 {
1203 int order, ret;
1204
1205 for (order = POOL_SET_MIN_ENTRY; order < POOL_SET_NR_ENTRIES; order++) {
1206 struct rseq_mempool *pool = pool_set->entries[order];
1207
1208 if (!pool)
1209 continue;
1210 ret = rseq_mempool_destroy(pool);
1211 if (ret)
1212 return ret;
1213 pool_set->entries[order] = NULL;
1214 }
1215 pthread_mutex_destroy(&pool_set->lock);
1216 free(pool_set);
1217 return 0;
1218 }
1219
1220 /* Ownership of pool is handed over to pool set on success. */
1221 int rseq_mempool_set_add_pool(struct rseq_mempool_set *pool_set, struct rseq_mempool *pool)
1222 {
1223 size_t item_order = pool->item_order;
1224 int ret = 0;
1225
1226 pthread_mutex_lock(&pool_set->lock);
1227 if (pool_set->entries[item_order]) {
1228 errno = EBUSY;
1229 ret = -1;
1230 goto end;
1231 }
1232 pool_set->entries[pool->item_order] = pool;
1233 end:
1234 pthread_mutex_unlock(&pool_set->lock);
1235 return ret;
1236 }
1237
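/*
 * Allocate from the smallest pool of the set whose item_len can hold
 * @len, retrying with the next larger pool if the chosen pool is out
 * of memory.
 */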
1238 static
1239 void __rseq_percpu *__rseq_mempool_set_malloc(struct rseq_mempool_set *pool_set,
1240 void *init_ptr, size_t len, bool zeroed)
1241 {
1242 int order, min_order = POOL_SET_MIN_ENTRY;
1243 struct rseq_mempool *pool;
1244 void __rseq_percpu *addr;
1245
1246 order = rseq_get_count_order_ulong(len);
1247 if (order > POOL_SET_MIN_ENTRY)
1248 min_order = order;
1249 again:
1250 pthread_mutex_lock(&pool_set->lock);
 1251 	/* Use the smallest present pool where @len fits. */
1252 for (order = min_order; order < POOL_SET_NR_ENTRIES; order++) {
1253 pool = pool_set->entries[order];
1254
1255 if (!pool)
1256 continue;
1257 if (pool->item_len >= len)
1258 goto found;
1259 }
1260 pool = NULL;
1261 found:
1262 pthread_mutex_unlock(&pool_set->lock);
1263 if (pool) {
1264 addr = __rseq_percpu_malloc(pool, zeroed, init_ptr, len);
1265 if (addr == NULL && errno == ENOMEM) {
1266 /*
1267 * If the allocation failed, try again with a
1268 * larger pool.
1269 */
1270 min_order = order + 1;
1271 goto again;
1272 }
1273 } else {
1274 /* Not found. */
1275 errno = ENOMEM;
1276 addr = NULL;
1277 }
1278 return addr;
1279 }
1280
1281 void __rseq_percpu *rseq_mempool_set_percpu_malloc(struct rseq_mempool_set *pool_set, size_t len)
1282 {
1283 return __rseq_mempool_set_malloc(pool_set, NULL, len, false);
1284 }
1285
1286 void __rseq_percpu *rseq_mempool_set_percpu_zmalloc(struct rseq_mempool_set *pool_set, size_t len)
1287 {
1288 return __rseq_mempool_set_malloc(pool_set, NULL, len, true);
1289 }
1290
1291 void __rseq_percpu *rseq_mempool_set_percpu_malloc_init(struct rseq_mempool_set *pool_set,
1292 void *init_ptr, size_t len)
1293 {
 1294 	return __rseq_mempool_set_malloc(pool_set, init_ptr, len, false);
1295 }
1296
1297 struct rseq_mempool_attr *rseq_mempool_attr_create(void)
1298 {
1299 return calloc(1, sizeof(struct rseq_mempool_attr));
1300 }
1301
1302 void rseq_mempool_attr_destroy(struct rseq_mempool_attr *attr)
1303 {
1304 free(attr);
1305 }
1306
1307 int rseq_mempool_attr_set_init(struct rseq_mempool_attr *attr,
1308 int (*init_func)(void *priv, void *addr, size_t len, int cpu),
1309 void *init_priv)
1310 {
1311 if (!attr) {
1312 errno = EINVAL;
1313 return -1;
1314 }
1315 attr->init_set = true;
1316 attr->init_func = init_func;
1317 attr->init_priv = init_priv;
1318 attr->populate_policy = RSEQ_MEMPOOL_POPULATE_COW_INIT;
1319 return 0;
1320 }
1321
1322 int rseq_mempool_attr_set_robust(struct rseq_mempool_attr *attr)
1323 {
1324 if (!attr) {
1325 errno = EINVAL;
1326 return -1;
1327 }
1328 attr->robust_set = true;
1329 return 0;
1330 }
1331
1332 int rseq_mempool_attr_set_percpu(struct rseq_mempool_attr *attr,
1333 size_t stride, int max_nr_cpus)
1334 {
1335 if (!attr) {
1336 errno = EINVAL;
1337 return -1;
1338 }
1339 attr->type = MEMPOOL_TYPE_PERCPU;
1340 attr->stride = stride;
1341 attr->max_nr_cpus = max_nr_cpus;
1342 return 0;
1343 }
1344
1345 int rseq_mempool_attr_set_global(struct rseq_mempool_attr *attr,
1346 size_t stride)
1347 {
1348 if (!attr) {
1349 errno = EINVAL;
1350 return -1;
1351 }
1352 attr->type = MEMPOOL_TYPE_GLOBAL;
1353 attr->stride = stride;
1354 attr->max_nr_cpus = 0;
1355 return 0;
1356 }
1357
1358 int rseq_mempool_attr_set_max_nr_ranges(struct rseq_mempool_attr *attr,
1359 unsigned long max_nr_ranges)
1360 {
1361 if (!attr) {
1362 errno = EINVAL;
1363 return -1;
1364 }
1365 attr->max_nr_ranges = max_nr_ranges;
1366 return 0;
1367 }
1368
1369 int rseq_mempool_attr_set_poison(struct rseq_mempool_attr *attr,
1370 uintptr_t poison)
1371 {
1372 if (!attr) {
1373 errno = EINVAL;
1374 return -1;
1375 }
1376 attr->poison_set = true;
1377 attr->poison = poison;
1378 return 0;
1379 }
1380
1381 int rseq_mempool_attr_set_populate_policy(struct rseq_mempool_attr *attr,
1382 enum rseq_mempool_populate_policy policy)
1383 {
1384 if (!attr) {
1385 errno = EINVAL;
1386 return -1;
1387 }
1388 attr->populate_policy = policy;
1389 return 0;
1390 }
1391
1392 int rseq_mempool_get_max_nr_cpus(struct rseq_mempool *mempool)
1393 {
1394 if (!mempool || mempool->attr.type != MEMPOOL_TYPE_PERCPU) {
1395 errno = EINVAL;
1396 return -1;
1397 }
1398 return mempool->attr.max_nr_cpus;
1399 }