mempool: Implement populate none policy
librseq.git: src/rseq-mempool.c
1 // SPDX-License-Identifier: MIT
2 // SPDX-FileCopyrightText: 2024 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
3
4 #include <rseq/mempool.h>
5 #include <sys/mman.h>
6 #include <assert.h>
7 #include <string.h>
8 #include <pthread.h>
9 #include <unistd.h>
10 #include <stdlib.h>
11 #include <rseq/compiler.h>
12 #include <errno.h>
13 #include <stdint.h>
14 #include <stdbool.h>
15 #include <stdio.h>
16 #include <fcntl.h>
17
18 #ifdef HAVE_LIBNUMA
19 # include <numa.h>
20 # include <numaif.h>
21 #endif
22
23 #include "rseq-utils.h"
24 #include <rseq/rseq.h>
25
26 /*
27 * rseq-mempool.c: rseq CPU-Local Storage (CLS) memory allocator.
28 *
29  * The rseq per-CPU memory allocator allows the application to request
30  * memory pools of CPU-Local memory, each containing objects of a
31  * given size (rounded up to the next power of 2), reserving a given
32  * virtual address range size per CPU, for a given maximum number of CPUs.
33  *
34  * The per-CPU memory allocator is analogous to TLS (Thread-Local
35  * Storage) memory: where TLS provides Thread-Local Storage, the
36  * per-CPU memory allocator provides CPU-Local Storage.
37 */
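/*
 * Example usage (sketch): a per-CPU counter. The wrappers
 * rseq_percpu_ptr() and rseq_mempool_percpu_free() are assumed to be
 * provided by <rseq/mempool.h>; error handling is omitted.
 *
 *	struct rseq_mempool_attr *attr = rseq_mempool_attr_create();
 *	struct rseq_mempool *pool;
 *	intptr_t __rseq_percpu *counter;
 *	int cpu = 0;	// e.g. current cpu from rseq or sched_getcpu()
 *
 *	rseq_mempool_attr_set_percpu(attr, RSEQ_MEMPOOL_STRIDE, 0);
 *	pool = rseq_mempool_create("counters", sizeof(intptr_t), attr);
 *	rseq_mempool_attr_destroy(attr);
 *
 *	counter = (intptr_t __rseq_percpu *) rseq_mempool_percpu_zmalloc(pool);
 *	(*rseq_percpu_ptr(counter, cpu))++;	// access the CPU-local copy
 *
 *	rseq_mempool_percpu_free(counter);
 *	rseq_mempool_destroy(pool);
 */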
38
39 #define POOL_SET_NR_ENTRIES RSEQ_BITS_PER_LONG
40
41 /*
42 * Smallest allocation should hold enough space for a free list pointer.
43 */
44 #if RSEQ_BITS_PER_LONG == 64
45 # define POOL_SET_MIN_ENTRY 3 /* Smallest item_len=8 */
46 #else
47 # define POOL_SET_MIN_ENTRY 2 /* Smallest item_len=4 */
48 #endif
49
50 #define BIT_PER_ULONG (8 * sizeof(unsigned long))
51
52 #define MOVE_PAGES_BATCH_SIZE 4096
53
54 #define RANGE_HEADER_OFFSET sizeof(struct rseq_mempool_range)
55
56 #if RSEQ_BITS_PER_LONG == 64
57 # define DEFAULT_POISON_VALUE 0x5555555555555555ULL
58 #else
59 # define DEFAULT_POISON_VALUE 0x55555555UL
60 #endif
61
62 struct free_list_node;
63
64 struct free_list_node {
65 struct free_list_node *next;
66 };
67
68 enum mempool_type {
69 MEMPOOL_TYPE_GLOBAL = 0, /* Default */
70 MEMPOOL_TYPE_PERCPU = 1,
71 };
72
73 struct rseq_mempool_attr {
74 bool mmap_set;
75 void *(*mmap_func)(void *priv, size_t len);
76 int (*munmap_func)(void *priv, void *ptr, size_t len);
77 void *mmap_priv;
78
79 bool init_set;
80 int (*init_func)(void *priv, void *addr, size_t len, int cpu);
81 void *init_priv;
82
83 bool robust_set;
84
85 enum mempool_type type;
86 size_t stride;
87 int max_nr_cpus;
88
89 unsigned long max_nr_ranges;
90
91 bool poison_set;
92 uintptr_t poison;
93
94 enum rseq_mempool_populate_policy populate_policy;
95 };
96
97 struct rseq_mempool_range;
98
99 struct rseq_mempool_range {
100 struct rseq_mempool_range *next; /* Linked list of ranges. */
101 struct rseq_mempool *pool; /* Backward reference to container pool. */
102
103 /*
104 * Memory layout of a mempool range:
105 * - Header page (contains struct rseq_mempool_range at the very end),
106 * - Base of the per-cpu data, starting with CPU 0,
107 * - CPU 1,
108 * ...
109 * - CPU max_nr_cpus - 1
110  * - init values (absent for RSEQ_MEMPOOL_POPULATE_ALL).
111 */
112 void *header;
113 void *base;
114 /*
115  * The init values hold the initial content applied by malloc_init/zmalloc.
116  * This pointer is NULL for RSEQ_MEMPOOL_POPULATE_ALL.
117 */
118 void *init;
119 size_t next_unused;
120
121 /* Pool range mmap/munmap */
122 void *mmap_addr;
123 size_t mmap_len;
124
125 /* Track alloc/free. */
126 unsigned long *alloc_bitmap;
127 };
128
129 struct rseq_mempool {
130 /* Head of ranges linked-list. */
131 struct rseq_mempool_range *range_list;
132 unsigned long nr_ranges;
133
134 size_t item_len;
135 int item_order;
136
137 /*
138  * The free list chains freed items either on the CPU 0 address
139  * range (populate all) or on the init values range (populate
140  * none). We should rethink this decision if false sharing between
141  * malloc/free from other CPUs and data accesses from CPU 0 becomes
142  * an issue. This is a NULL-terminated singly-linked list.
143 */
144 struct free_list_node *free_list_head;
145
146 /* This lock protects allocation/free within the pool. */
147 pthread_mutex_t lock;
148
149 struct rseq_mempool_attr attr;
150 char *name;
151 };
152
153 /*
154 * Pool set entries are indexed by item_len rounded to the next power of
155 * 2. A pool set can contain NULL pool entries, in which case the next
156 * large enough entry will be used for allocation.
157 */
158 struct rseq_mempool_set {
159 /* This lock protects add vs malloc/zmalloc within the pool set. */
160 pthread_mutex_t lock;
161 struct rseq_mempool *entries[POOL_SET_NR_ENTRIES];
162 };
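/*
 * Example usage (sketch): the pool set API below serves each request
 * from the smallest registered pool whose item_len fits. Names and
 * sizes are illustrative, attr is a previously configured
 * struct rseq_mempool_attr *, and error handling is omitted.
 *
 *	struct rseq_mempool_set *set = rseq_mempool_set_create();
 *
 *	rseq_mempool_set_add_pool(set, rseq_mempool_create("p32", 32, attr));
 *	rseq_mempool_set_add_pool(set, rseq_mempool_create("p128", 128, attr));
 *
 *	// Served by "p32" (item_len 32 >= 24).
 *	void __rseq_percpu *a = rseq_mempool_set_percpu_zmalloc(set, 24);
 *	// No 64-byte pool registered: falls back to "p128".
 *	void __rseq_percpu *b = rseq_mempool_set_percpu_zmalloc(set, 64);
 */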
163
164 /*
165  * This memfd backs the user-space copy-on-write (COW) scheme used when
166  * the populate policy is not RSEQ_MEMPOOL_POPULATE_ALL. It is a sparse
167  * virtual file whose layout (offset from the beginning of the file)
168  * matches the process address space (pointers map directly to offsets).
169 */
170 struct rseq_memfd {
171 pthread_mutex_t lock;
172 size_t reserved_size;
173 unsigned int refcount;
174 int fd;
175 };
176
177 static struct rseq_memfd memfd = {
178 .lock = PTHREAD_MUTEX_INITIALIZER,
179 .reserved_size = 0,
180 .refcount = 0,
181 .fd = -1,
182 };
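/*
 * Sketch of the COW scheme (illustrative only, not part of the pool
 * code): the init values are a MAP_SHARED mapping of the memfd, and
 * each per-CPU range is a MAP_PRIVATE mapping of the same file offset.
 * A per-CPU page that has never been written still reads the init
 * values; the first write to it triggers a private copy (COW), so
 * untouched CPUs stay unpopulated. Error handling is omitted.
 *
 *	size_t len = 4096;
 *	int fd = memfd_create("example", MFD_CLOEXEC);
 *
 *	ftruncate(fd, (off_t) len);
 *	char *init = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			MAP_SHARED, fd, 0);
 *	char *cpu0 = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			MAP_PRIVATE, fd, 0);
 *	init[0] = 42;	// visible through cpu0 (page not COWed yet)
 *	cpu0[0] = 1;	// this write COWs cpu0's page; later init[]
 *			// updates no longer propagate to it
 */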
183
184 static
185 const char *get_pool_name(const struct rseq_mempool *pool)
186 {
187 return pool->name ? : "<anonymous>";
188 }
189
190 static
191 void *__rseq_pool_range_percpu_ptr(const struct rseq_mempool_range *range, int cpu,
192 uintptr_t item_offset, size_t stride)
193 {
194 return range->base + (stride * cpu) + item_offset;
195 }
196
197 static
198 void *__rseq_pool_range_init_ptr(const struct rseq_mempool_range *range,
199 uintptr_t item_offset)
200 {
201 if (!range->init)
202 return NULL;
203 return range->init + item_offset;
204 }
205
206 static
207 void __rseq_percpu *__rseq_free_list_to_percpu_ptr(const struct rseq_mempool *pool,
208 struct free_list_node *node)
209 {
210 void __rseq_percpu *p = (void __rseq_percpu *) node;
211
212 if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_ALL)
213 p -= pool->attr.max_nr_cpus * pool->attr.stride;
214 return p;
215 }
216
217 static
218 struct free_list_node *__rseq_percpu_to_free_list_ptr(const struct rseq_mempool *pool,
219 void __rseq_percpu *p)
220 {
221 if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_ALL)
222 p += pool->attr.max_nr_cpus * pool->attr.stride;
223 return (struct free_list_node *) p;
224 }
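/*
 * Worked example of the conversions above (populate none, illustrative
 * numbers): with stride = 1 MiB and max_nr_cpus = 8, a freed item at
 * item_offset 0x40 keeps its free_list_node in the init values area at
 * base + 8 MiB + 0x40, while the pointer handed back to users (and used
 * for per-CPU indexing) remains base + 0x40. With populate all, the
 * node is stored directly at the user pointer, i.e. within CPU 0's
 * range.
 */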
225
226 static
227 int memcmpbyte(const char *s, int c, size_t n)
228 {
229 int res = 0;
230
231 while (n-- > 0)
232 if ((res = *(s++) - c) != 0)
233 break;
234 return res;
235 }
236
237 static
238 void rseq_percpu_zero_item(struct rseq_mempool *pool,
239 struct rseq_mempool_range *range, uintptr_t item_offset)
240 {
241 char *init_p = NULL;
242 int i;
243
244 init_p = __rseq_pool_range_init_ptr(range, item_offset);
245 if (init_p)
246 memset(init_p, 0, pool->item_len);
247 for (i = 0; i < pool->attr.max_nr_cpus; i++) {
248 char *p = __rseq_pool_range_percpu_ptr(range, i,
249 item_offset, pool->attr.stride);
250
251 /* Skip if the zero content already propagated via the init values mapping (avoids needless COW). */
252 if (init_p && !memcmpbyte(p, 0, pool->item_len))
253 continue;
254 memset(p, 0, pool->item_len);
255 }
256 }
257
258 static
259 void rseq_percpu_init_item(struct rseq_mempool *pool,
260 struct rseq_mempool_range *range, uintptr_t item_offset,
261 void *init_ptr, size_t init_len)
262 {
263 char *init_p = NULL;
264 int i;
265
266 init_p = __rseq_pool_range_init_ptr(range, item_offset);
267 if (init_p)
268 memcpy(init_p, init_ptr, init_len);
269 for (i = 0; i < pool->attr.max_nr_cpus; i++) {
270 char *p = __rseq_pool_range_percpu_ptr(range, i,
271 item_offset, pool->attr.stride);
272
273 /* Skip if the init content already propagated via the init values mapping (avoids needless COW). */
274 if (init_p && !memcmp(init_p, p, init_len))
275 continue;
276 memcpy(p, init_ptr, init_len);
277 }
278 }
279
280 static
281 void rseq_poison_item(void *p, size_t item_len, uintptr_t poison)
282 {
283 size_t offset;
284
285 for (offset = 0; offset < item_len; offset += sizeof(uintptr_t))
286 *((uintptr_t *) (p + offset)) = poison;
287 }
288
289 static
290 void rseq_percpu_poison_item(struct rseq_mempool *pool,
291 struct rseq_mempool_range *range, uintptr_t item_offset)
292 {
293 uintptr_t poison = pool->attr.poison;
294 char *init_p = NULL;
295 int i;
296
297 init_p = __rseq_pool_range_init_ptr(range, item_offset);
298 if (init_p)
299 rseq_poison_item(init_p, pool->item_len, poison);
300 for (i = 0; i < pool->attr.max_nr_cpus; i++) {
301 char *p = __rseq_pool_range_percpu_ptr(range, i,
302 item_offset, pool->attr.stride);
303
304 /* Skip if the poison already propagated via the init values mapping (avoids needless COW). */
305 if (init_p && !memcmp(init_p, p, pool->item_len))
306 continue;
307 rseq_poison_item(p, pool->item_len, poison);
308 }
309 }
310
311 /* Always inline for __builtin_return_address(0). */
312 static inline __attribute__((always_inline))
313 void rseq_check_poison_item(const struct rseq_mempool *pool, uintptr_t item_offset,
314 void *p, size_t item_len, uintptr_t poison, bool skip_freelist_ptr)
315 {
316 size_t offset;
317
318 for (offset = 0; offset < item_len; offset += sizeof(uintptr_t)) {
319 uintptr_t v;
320
321 /* Skip poison check for free-list pointer. */
322 if (skip_freelist_ptr && offset == 0)
323 continue;
324 v = *((uintptr_t *) (p + offset));
325 if (v != poison) {
326 fprintf(stderr, "%s: Poison corruption detected (0x%lx) for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
327 __func__, (unsigned long) v, get_pool_name(pool), pool, item_offset, (void *) __builtin_return_address(0));
328 abort();
329 }
330 }
331 }
332
333 /* Always inline for __builtin_return_address(0). */
334 static inline __attribute__((always_inline))
335 void rseq_percpu_check_poison_item(const struct rseq_mempool *pool,
336 const struct rseq_mempool_range *range, uintptr_t item_offset)
337 {
338 uintptr_t poison = pool->attr.poison;
339 char *init_p;
340 int i;
341
342 if (!pool->attr.robust_set)
343 return;
344 init_p = __rseq_pool_range_init_ptr(range, item_offset);
345 if (init_p)
346 rseq_check_poison_item(pool, item_offset, init_p, pool->item_len, poison, true);
347 for (i = 0; i < pool->attr.max_nr_cpus; i++) {
348 char *p = __rseq_pool_range_percpu_ptr(range, i,
349 item_offset, pool->attr.stride);
350 /*
351 * When the free list is embedded in the init values
352 * memory (populate none), it is visible from the init
353 * values memory mapping as well as per-cpu private
354 * mappings before they COW.
355 *
356 * When the free list is embedded in CPU 0 mapping
357 * (populate all), only this CPU must skip the free list
358 * nodes when checking poison.
359 */
360 rseq_check_poison_item(pool, item_offset, p, pool->item_len, poison,
361 init_p == NULL ? (i == 0) : true);
362 }
363 }
364
365 #ifdef HAVE_LIBNUMA
366 int rseq_mempool_range_init_numa(void *addr, size_t len, int cpu, int numa_flags)
367 {
368 unsigned long nr_pages, page_len;
369 int status[MOVE_PAGES_BATCH_SIZE];
370 int nodes[MOVE_PAGES_BATCH_SIZE];
371 void *pages[MOVE_PAGES_BATCH_SIZE];
372 long ret;
373
374 if (!numa_flags) {
375 errno = EINVAL;
376 return -1;
377 }
378 page_len = rseq_get_page_len();
379 nr_pages = len >> rseq_get_count_order_ulong(page_len);
380
381 nodes[0] = numa_node_of_cpu(cpu);
382 if (nodes[0] < 0)
383 return -1;
384
385 for (size_t k = 1; k < RSEQ_ARRAY_SIZE(nodes); ++k) {
386 nodes[k] = nodes[0];
387 }
388
389 for (unsigned long page = 0; page < nr_pages;) {
390
391 size_t max_k = RSEQ_ARRAY_SIZE(pages);
392 size_t left = nr_pages - page;
393
394 if (left < max_k) {
395 max_k = left;
396 }
397
398 for (size_t k = 0; k < max_k; ++k, ++page) {
399 pages[k] = addr + (page * page_len);
400 status[k] = -EPERM;
401 }
402
403 ret = move_pages(0, max_k, pages, nodes, status, numa_flags);
404
405 if (ret < 0)
406 return ret;
407
408 if (ret > 0) {
409 fprintf(stderr, "%ld pages were not migrated\n", ret);
410 for (size_t k = 0; k < max_k; ++k) {
411 if (status[k] < 0)
412 fprintf(stderr,
413 "Error while moving page %p to numa node %d: %u\n",
414 pages[k], nodes[k], -status[k]);
415 }
416 }
417 }
418 return 0;
419 }
420 #else
421 int rseq_mempool_range_init_numa(void *addr __attribute__((unused)),
422 size_t len __attribute__((unused)),
423 int cpu __attribute__((unused)),
424 int numa_flags __attribute__((unused)))
425 {
426 errno = ENOSYS;
427 return -1;
428 }
429 #endif
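/*
 * Example usage (sketch): rseq_mempool_range_init_numa() is intended to
 * be called from a pool init callback so each CPU's range is migrated
 * to that CPU's NUMA node. MPOL_MF_MOVE is the standard move_pages(2)
 * flag; the wrapper below is illustrative.
 *
 *	static int init_numa(void *priv __attribute__((unused)),
 *			void *addr, size_t len, int cpu)
 *	{
 *		return rseq_mempool_range_init_numa(addr, len, cpu, MPOL_MF_MOVE);
 *	}
 *
 *	// ... when configuring the pool attribute:
 *	rseq_mempool_attr_set_init(attr, init_numa, NULL);
 */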
430
431 static
432 void *default_mmap_func(void *priv __attribute__((unused)), size_t len)
433 {
434 void *base;
435
436 base = mmap(NULL, len, PROT_READ | PROT_WRITE,
437 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
438 if (base == MAP_FAILED)
439 return NULL;
440 return base;
441 }
442
443 static
444 int default_munmap_func(void *priv __attribute__((unused)), void *ptr, size_t len)
445 {
446 return munmap(ptr, len);
447 }
448
449 static
450 int create_alloc_bitmap(struct rseq_mempool *pool, struct rseq_mempool_range *range)
451 {
452 size_t count;
453
454 count = ((pool->attr.stride >> pool->item_order) + BIT_PER_ULONG - 1) / BIT_PER_ULONG;
455
456 /*
457 * Not being able to create the validation bitmap is an error
458 * that needs to be reported.
459 */
460 range->alloc_bitmap = calloc(count, sizeof(unsigned long));
461 if (!range->alloc_bitmap)
462 return -1;
463 return 0;
464 }
465
466 static
467 bool percpu_addr_in_pool(const struct rseq_mempool *pool, void __rseq_percpu *_addr)
468 {
469 struct rseq_mempool_range *range;
470 void *addr = (void *) _addr;
471
472 for (range = pool->range_list; range; range = range->next) {
473 if (addr >= range->base && addr < range->base + range->next_unused)
474 return true;
475 }
476 return false;
477 }
478
479 /* Always inline for __builtin_return_address(0). */
480 static inline __attribute__((always_inline))
481 void check_free_list(const struct rseq_mempool *pool)
482 {
483 size_t total_item = 0, total_never_allocated = 0, total_freed = 0,
484 max_list_traversal = 0, traversal_iteration = 0;
485 struct rseq_mempool_range *range;
486
487 if (!pool->attr.robust_set)
488 return;
489
490 for (range = pool->range_list; range; range = range->next) {
491 total_item += pool->attr.stride >> pool->item_order;
492 total_never_allocated += (pool->attr.stride - range->next_unused) >> pool->item_order;
493 }
494 max_list_traversal = total_item - total_never_allocated;
495
496 for (struct free_list_node *node = pool->free_list_head, *prev = NULL;
497 node;
498 prev = node,
499 node = node->next) {
500
501 if (traversal_iteration >= max_list_traversal) {
502 fprintf(stderr, "%s: Corrupted free-list; Possibly infinite loop in pool \"%s\" (%p), caller %p.\n",
503 __func__, get_pool_name(pool), pool, __builtin_return_address(0));
504 abort();
505 }
506
507 /* Node is out of range. */
508 if (!percpu_addr_in_pool(pool, __rseq_free_list_to_percpu_ptr(pool, node))) {
509 if (prev)
510 fprintf(stderr, "%s: Corrupted free-list node %p -> [out-of-range %p] in pool \"%s\" (%p), caller %p.\n",
511 __func__, prev, node, get_pool_name(pool), pool, __builtin_return_address(0));
512 else
513 fprintf(stderr, "%s: Corrupted free-list node [out-of-range %p] in pool \"%s\" (%p), caller %p.\n",
514 __func__, node, get_pool_name(pool), pool, __builtin_return_address(0));
515 abort();
516 }
517
518 traversal_iteration++;
519 total_freed++;
520 }
521
522 if (total_never_allocated + total_freed != total_item) {
523 fprintf(stderr, "%s: Corrupted free-list in pool \"%s\" (%p); total-item: %zu total-never-used: %zu total-freed: %zu, caller %p.\n",
524 __func__, get_pool_name(pool), pool, total_item, total_never_allocated, total_freed, __builtin_return_address(0));
525 abort();
526 }
527 }
528
529 /* Always inline for __builtin_return_address(0). */
530 static inline __attribute__((always_inline))
531 void check_range_poison(const struct rseq_mempool *pool,
532 const struct rseq_mempool_range *range)
533 {
534 size_t item_offset;
535
536 for (item_offset = 0; item_offset < range->next_unused;
537 item_offset += pool->item_len)
538 rseq_percpu_check_poison_item(pool, range, item_offset);
539 }
540
541 /* Always inline for __builtin_return_address(0). */
542 static inline __attribute__((always_inline))
543 void check_pool_poison(const struct rseq_mempool *pool)
544 {
545 struct rseq_mempool_range *range;
546
547 if (!pool->attr.robust_set)
548 return;
549 for (range = pool->range_list; range; range = range->next)
550 check_range_poison(pool, range);
551 }
552
553 /* Always inline for __builtin_return_address(0). */
554 static inline __attribute__((always_inline))
555 void destroy_alloc_bitmap(struct rseq_mempool *pool, struct rseq_mempool_range *range)
556 {
557 unsigned long *bitmap = range->alloc_bitmap;
558 size_t count, total_leaks = 0;
559
560 if (!bitmap)
561 return;
562
563 count = ((pool->attr.stride >> pool->item_order) + BIT_PER_ULONG - 1) / BIT_PER_ULONG;
564
565 /* Assert that all items in the pool were freed. */
566 for (size_t k = 0; k < count; ++k)
567 total_leaks += rseq_hweight_ulong(bitmap[k]);
568 if (total_leaks) {
569 fprintf(stderr, "%s: Pool \"%s\" (%p) has %zu leaked items on destroy, caller: %p.\n",
570 __func__, get_pool_name(pool), pool, total_leaks, (void *) __builtin_return_address(0));
571 abort();
572 }
573
574 free(bitmap);
575 range->alloc_bitmap = NULL;
576 }
577
578 /* Always inline for __builtin_return_address(0). */
579 static inline __attribute__((always_inline))
580 int rseq_mempool_range_destroy(struct rseq_mempool *pool,
581 struct rseq_mempool_range *range)
582 {
583 int ret = 0;
584
585 destroy_alloc_bitmap(pool, range);
586
587 /*
588 * Punch a hole into memfd where the init values used to be.
589 */
590 if (range->init) {
591 ret = fallocate(memfd.fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
592 (off_t) range->init, pool->attr.stride);
593 if (ret)
594 return ret;
595 range->init = NULL;
596 }
597
598 /* The range struct lives in the header page, one page before the aligned mapping. */
599 return pool->attr.munmap_func(pool->attr.mmap_priv, range->mmap_addr, range->mmap_len);
600 }
601
602 /*
603 * Allocate a memory mapping aligned on @alignment, with an optional
604 * @pre_header before the mapping.
605 */
606 static
607 void *aligned_mmap_anonymous(struct rseq_mempool *pool,
608 size_t page_size, size_t len, size_t alignment,
609 void **pre_header, size_t pre_header_len)
610 {
611 size_t minimum_page_count, page_count, extra, total_allocate = 0;
612 int page_order;
613 void *ptr;
614
615 if (len < page_size || alignment < page_size ||
616 !is_pow2(alignment) || (len & (alignment - 1))) {
617 errno = EINVAL;
618 return NULL;
619 }
620 page_order = rseq_get_count_order_ulong(page_size);
621 if (page_order < 0) {
622 errno = EINVAL;
623 return NULL;
624 }
625 if (pre_header_len && (pre_header_len & (page_size - 1))) {
626 errno = EINVAL;
627 return NULL;
628 }
629
630 minimum_page_count = (pre_header_len + len) >> page_order;
631 page_count = (pre_header_len + len + alignment - page_size) >> page_order;
632
633 assert(page_count >= minimum_page_count);
634
635 ptr = pool->attr.mmap_func(pool->attr.mmap_priv, page_count << page_order);
636 if (!ptr)
637 goto alloc_error;
638
639 total_allocate = page_count << page_order;
640
641 if (!(((uintptr_t) ptr + pre_header_len) & (alignment - 1))) {
642 /* Pointer is already aligned. ptr points to pre_header. */
643 goto out;
644 }
645
646 /* Unmap extra before. */
647 extra = offset_align((uintptr_t) ptr + pre_header_len, alignment);
648 assert(!(extra & (page_size - 1)));
649 if (pool->attr.munmap_func(pool->attr.mmap_priv, ptr, extra)) {
650 perror("munmap");
651 abort();
652 }
653 total_allocate -= extra;
654 ptr += extra; /* ptr points to pre_header */
655 page_count -= extra >> page_order;
656 out:
657 assert(page_count >= minimum_page_count);
658
659 if (page_count > minimum_page_count) {
660 void *extra_ptr;
661
662 /* Unmap extra after. */
663 extra_ptr = ptr + (minimum_page_count << page_order);
664 extra = (page_count - minimum_page_count) << page_order;
665 if (pool->attr.munmap_func(pool->attr.mmap_priv, extra_ptr, extra)) {
666 perror("munmap");
667 abort();
668 }
669 total_allocate -= extra;
670 }
671
672 assert(!(((uintptr_t)ptr + pre_header_len) & (alignment - 1)));
673 assert(total_allocate == len + pre_header_len);
674
675 alloc_error:
676 if (ptr) {
677 if (pre_header)
678 *pre_header = ptr;
679 ptr += pre_header_len;
680 }
681 return ptr;
682 }
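/*
 * Worked example of the trimming above (illustrative numbers):
 * page_size = 4 KiB, pre_header_len = 4 KiB, len = alignment = 1 MiB.
 * minimum_page_count = (4 KiB + 1 MiB) / 4 KiB = 257 pages, and
 * page_count = (4 KiB + 1 MiB + 1 MiB - 4 KiB) / 4 KiB = 512 pages,
 * which guarantees the mapping contains a 1 MiB-aligned address
 * preceded by at least one page. The unaligned head is unmapped first,
 * then the unused tail, leaving exactly pre_header_len + len bytes with
 * (ptr + pre_header_len) aligned on 1 MiB.
 */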
683
684 static
685 int rseq_memfd_reserve_init(void *init, size_t init_len)
686 {
687 int ret = 0;
688 size_t reserve_len;
689
690 pthread_mutex_lock(&memfd.lock);
691 reserve_len = (size_t) init + init_len;
692 if (reserve_len > memfd.reserved_size) {
693 if (ftruncate(memfd.fd, (off_t) reserve_len)) {
694 ret = -1;
695 goto unlock;
696 }
697 memfd.reserved_size = reserve_len;
698 }
699 unlock:
700 pthread_mutex_unlock(&memfd.lock);
701 return ret;
702 }
703
704 static
705 struct rseq_mempool_range *rseq_mempool_range_create(struct rseq_mempool *pool)
706 {
707 struct rseq_mempool_range *range;
708 unsigned long page_size;
709 void *header;
710 void *base;
711 size_t range_len; /* Range len excludes header. */
712
713 if (pool->attr.max_nr_ranges &&
714 pool->nr_ranges >= pool->attr.max_nr_ranges) {
715 errno = ENOMEM;
716 return NULL;
717 }
718 page_size = rseq_get_page_len();
719
720 range_len = pool->attr.stride * pool->attr.max_nr_cpus;
721 if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_ALL)
722 range_len += pool->attr.stride; /* init values */
723 base = aligned_mmap_anonymous(pool, page_size,
724 range_len,
725 pool->attr.stride,
726 &header, page_size);
727 if (!base)
728 return NULL;
729 range = (struct rseq_mempool_range *) (base - RANGE_HEADER_OFFSET);
730 range->pool = pool;
731 range->header = header;
732 range->base = base;
733 range->mmap_addr = header;
734 range->mmap_len = page_size + range_len;
735
736 if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_ALL) {
737 range->init = base + (pool->attr.stride * pool->attr.max_nr_cpus);
738 /* Populate init values pages from memfd */
739 if (rseq_memfd_reserve_init(range->init, pool->attr.stride))
740 goto error_alloc;
741 if (mmap(range->init, pool->attr.stride, PROT_READ | PROT_WRITE,
742 MAP_SHARED | MAP_FIXED, memfd.fd,
743 (off_t) range->init) != (void *) range->init) {
744 goto error_alloc;
745 }
746 assert(pool->attr.type == MEMPOOL_TYPE_PERCPU);
747 /*
748 * Map per-cpu memory as private COW mappings of init values.
749 */
750 {
751 int cpu;
752
753 for (cpu = 0; cpu < pool->attr.max_nr_cpus; cpu++) {
754 void *p = base + (pool->attr.stride * cpu);
755 size_t len = pool->attr.stride;
756
757 if (mmap(p, len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FIXED,
758 memfd.fd, (off_t) range->init) != (void *) p) {
759 goto error_alloc;
760 }
761 }
762 }
763 }
764
765 if (pool->attr.robust_set) {
766 if (create_alloc_bitmap(pool, range))
767 goto error_alloc;
768 }
769 if (pool->attr.init_set) {
770 switch (pool->attr.type) {
771 case MEMPOOL_TYPE_GLOBAL:
772 if (pool->attr.init_func(pool->attr.init_priv,
773 base, pool->attr.stride, -1)) {
774 goto error_alloc;
775 }
776 break;
777 case MEMPOOL_TYPE_PERCPU:
778 {
779 int cpu;
780 for (cpu = 0; cpu < pool->attr.max_nr_cpus; cpu++) {
781 if (pool->attr.init_func(pool->attr.init_priv,
782 base + (pool->attr.stride * cpu),
783 pool->attr.stride, cpu)) {
784 goto error_alloc;
785 }
786 }
787 break;
788 }
789 default:
790 abort();
791 }
792 }
793 pool->nr_ranges++;
794 return range;
795
796 error_alloc:
797 (void) rseq_mempool_range_destroy(pool, range);
798 return NULL;
799 }
800
801 static
802 int rseq_mempool_memfd_ref(struct rseq_mempool *pool)
803 {
804 int ret = 0;
805
806 if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_ALL)
807 return 0;
808
809 pthread_mutex_lock(&memfd.lock);
810 if (memfd.refcount == 0) {
811 memfd.fd = memfd_create("mempool", MFD_CLOEXEC);
812 if (memfd.fd < 0) {
813 perror("memfd_create");
814 ret = -1;
815 goto unlock;
816 }
817 }
818 memfd.refcount++;
819 unlock:
820 pthread_mutex_unlock(&memfd.lock);
821 return ret;
822 }
823
824 static
825 void rseq_mempool_memfd_unref(struct rseq_mempool *pool)
826 {
827 if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_ALL)
828 return;
829
830 pthread_mutex_lock(&memfd.lock);
831 if (memfd.refcount == 1) {
832 if (close(memfd.fd)) {
833 perror("close");
834 abort();
835 }
836 memfd.fd = -1;
837 memfd.reserved_size = 0;
838 }
839 memfd.refcount--;
840 pthread_mutex_unlock(&memfd.lock);
841 }
842
843 int rseq_mempool_destroy(struct rseq_mempool *pool)
844 {
845 struct rseq_mempool_range *range, *next_range;
846 int ret = 0;
847
848 if (!pool)
849 return 0;
850 check_free_list(pool);
851 check_pool_poison(pool);
852 /* Iteration safe against removal. */
853 for (range = pool->range_list; range && (next_range = range->next, 1); range = next_range) {
854 if (rseq_mempool_range_destroy(pool, range))
855 goto end;
856 /* Update list head to keep list coherent in case of partial failure. */
857 pool->range_list = next_range;
858 }
859 rseq_mempool_memfd_unref(pool);
860 pthread_mutex_destroy(&pool->lock);
861 free(pool->name);
862 free(pool);
863 end:
864 return ret;
865 }
866
867 struct rseq_mempool *rseq_mempool_create(const char *pool_name,
868 size_t item_len, const struct rseq_mempool_attr *_attr)
869 {
870 struct rseq_mempool *pool;
871 struct rseq_mempool_attr attr = {};
872 int order;
873
874 /* Make sure each item is large enough to contain free list pointers. */
875 if (item_len < sizeof(void *))
876 item_len = sizeof(void *);
877
878 /* Align item_len on next power of two. */
879 order = rseq_get_count_order_ulong(item_len);
880 if (order < 0) {
881 errno = EINVAL;
882 return NULL;
883 }
884 item_len = 1UL << order;
885
886 if (_attr)
887 memcpy(&attr, _attr, sizeof(attr));
888 if (!attr.mmap_set) {
889 attr.mmap_func = default_mmap_func;
890 attr.munmap_func = default_munmap_func;
891 attr.mmap_priv = NULL;
892 }
893
894 switch (attr.type) {
895 case MEMPOOL_TYPE_PERCPU:
896 if (attr.max_nr_cpus < 0) {
897 errno = EINVAL;
898 return NULL;
899 }
900 if (attr.max_nr_cpus == 0) {
901 /* Auto-detect */
902 attr.max_nr_cpus = rseq_get_max_nr_cpus();
903 if (attr.max_nr_cpus == 0) {
904 errno = EINVAL;
905 return NULL;
906 }
907 }
908 break;
909 case MEMPOOL_TYPE_GLOBAL:
910 /* Override populate policy for global type. */
911 attr.populate_policy = RSEQ_MEMPOOL_POPULATE_ALL;
912 /* Use a 1-cpu pool for global mempool type. */
913 attr.max_nr_cpus = 1;
914 break;
915 }
916 if (!attr.stride)
917 attr.stride = RSEQ_MEMPOOL_STRIDE; /* Use default */
918 if (attr.robust_set && !attr.poison_set) {
919 attr.poison_set = true;
920 attr.poison = DEFAULT_POISON_VALUE;
921 }
922 if (item_len > attr.stride || attr.stride < (size_t) rseq_get_page_len() ||
923 !is_pow2(attr.stride)) {
924 errno = EINVAL;
925 return NULL;
926 }
927
928 pool = calloc(1, sizeof(struct rseq_mempool));
929 if (!pool)
930 return NULL;
931
932 memcpy(&pool->attr, &attr, sizeof(attr));
933 pthread_mutex_init(&pool->lock, NULL);
934 pool->item_len = item_len;
935 pool->item_order = order;
936
937 if (rseq_mempool_memfd_ref(pool))
938 goto error_alloc;
939
940 pool->range_list = rseq_mempool_range_create(pool);
941 if (!pool->range_list)
942 goto error_alloc;
943
944 if (pool_name) {
945 pool->name = strdup(pool_name);
946 if (!pool->name)
947 goto error_alloc;
948 }
949 return pool;
950
951 error_alloc:
952 rseq_mempool_destroy(pool);
953 errno = ENOMEM;
954 return NULL;
955 }
956
957 /* Always inline for __builtin_return_address(0). */
958 static inline __attribute__((always_inline))
959 void set_alloc_slot(struct rseq_mempool *pool, struct rseq_mempool_range *range, size_t item_offset)
960 {
961 unsigned long *bitmap = range->alloc_bitmap;
962 size_t item_index = item_offset >> pool->item_order;
963 unsigned long mask;
964 size_t k;
965
966 if (!bitmap)
967 return;
968
969 k = item_index / BIT_PER_ULONG;
970 mask = 1ULL << (item_index % BIT_PER_ULONG);
971
972 /* Print error if bit is already set. */
973 if (bitmap[k] & mask) {
974 fprintf(stderr, "%s: Allocator corruption detected for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
975 __func__, get_pool_name(pool), pool, item_offset, (void *) __builtin_return_address(0));
976 abort();
977 }
978 bitmap[k] |= mask;
979 }
980
981 static
982 void __rseq_percpu *__rseq_percpu_malloc(struct rseq_mempool *pool,
983 bool zeroed, void *init_ptr, size_t init_len)
984 {
985 struct rseq_mempool_range *range;
986 struct free_list_node *node;
987 uintptr_t item_offset;
988 void __rseq_percpu *addr;
989
990 if (init_len > pool->item_len) {
991 errno = EINVAL;
992 return NULL;
993 }
994 pthread_mutex_lock(&pool->lock);
995 /* Get first entry from free list. */
996 node = pool->free_list_head;
997 if (node != NULL) {
998 void *range_base, *ptr;
999
1000 ptr = __rseq_free_list_to_percpu_ptr(pool, node);
1001 range_base = (void *) ((uintptr_t) ptr & (~(pool->attr.stride - 1)));
1002 range = (struct rseq_mempool_range *) (range_base - RANGE_HEADER_OFFSET);
1003 /* Remove node from free list (update head). */
1004 pool->free_list_head = node->next;
1005 item_offset = (uintptr_t) (ptr - range_base);
1006 rseq_percpu_check_poison_item(pool, range, item_offset);
1007 addr = __rseq_free_list_to_percpu_ptr(pool, node);
1008 goto end;
1009 }
1010 /*
1011 * If the most recent range (first in list) does not have any
1012 * room left, create a new range and prepend it to the list
1013 * head.
1014 */
1015 range = pool->range_list;
1016 if (range->next_unused + pool->item_len > pool->attr.stride) {
1017 range = rseq_mempool_range_create(pool);
1018 if (!range) {
1019 errno = ENOMEM;
1020 addr = NULL;
1021 goto end;
1022 }
1023 /* Add range to head of list. */
1024 range->next = pool->range_list;
1025 pool->range_list = range;
1026 }
1027 /* First range in list has room left. */
1028 item_offset = range->next_unused;
1029 addr = (void __rseq_percpu *) (range->base + item_offset);
1030 range->next_unused += pool->item_len;
1031 end:
1032 if (addr)
1033 set_alloc_slot(pool, range, item_offset);
1034 pthread_mutex_unlock(&pool->lock);
1035 if (addr) {
1036 if (zeroed)
1037 rseq_percpu_zero_item(pool, range, item_offset);
1038 else if (init_ptr) {
1039 rseq_percpu_init_item(pool, range, item_offset,
1040 init_ptr, init_len);
1041 }
1042 }
1043 return addr;
1044 }
1045
1046 void __rseq_percpu *rseq_mempool_percpu_malloc(struct rseq_mempool *pool)
1047 {
1048 return __rseq_percpu_malloc(pool, false, NULL, 0);
1049 }
1050
1051 void __rseq_percpu *rseq_mempool_percpu_zmalloc(struct rseq_mempool *pool)
1052 {
1053 return __rseq_percpu_malloc(pool, true, NULL, 0);
1054 }
1055
1056 void __rseq_percpu *rseq_mempool_percpu_malloc_init(struct rseq_mempool *pool,
1057 void *init_ptr, size_t len)
1058 {
1059 return __rseq_percpu_malloc(pool, false, init_ptr, len);
1060 }
1061
1062 /* Always inline for __builtin_return_address(0). */
1063 static inline __attribute__((always_inline))
1064 void clear_alloc_slot(struct rseq_mempool *pool, struct rseq_mempool_range *range, size_t item_offset)
1065 {
1066 unsigned long *bitmap = range->alloc_bitmap;
1067 size_t item_index = item_offset >> pool->item_order;
1068 unsigned long mask;
1069 size_t k;
1070
1071 if (!bitmap)
1072 return;
1073
1074 k = item_index / BIT_PER_ULONG;
1075 mask = 1ULL << (item_index % BIT_PER_ULONG);
1076
1077 /* Print error if bit is not set. */
1078 if (!(bitmap[k] & mask)) {
1079 fprintf(stderr, "%s: Double-free detected for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
1080 __func__, get_pool_name(pool), pool, item_offset,
1081 (void *) __builtin_return_address(0));
1082 abort();
1083 }
1084 bitmap[k] &= ~mask;
1085 }
1086
1087 void librseq_mempool_percpu_free(void __rseq_percpu *_ptr, size_t stride)
1088 {
1089 uintptr_t ptr = (uintptr_t) _ptr;
1090 void *range_base = (void *) (ptr & (~(stride - 1)));
1091 struct rseq_mempool_range *range = (struct rseq_mempool_range *) (range_base - RANGE_HEADER_OFFSET);
1092 struct rseq_mempool *pool = range->pool;
1093 uintptr_t item_offset = ptr & (stride - 1);
1094 struct free_list_node *head, *item;
1095
1096 pthread_mutex_lock(&pool->lock);
1097 clear_alloc_slot(pool, range, item_offset);
1098 /* Add ptr to head of free list */
1099 head = pool->free_list_head;
1100 if (pool->attr.poison_set)
1101 rseq_percpu_poison_item(pool, range, item_offset);
1102 item = __rseq_percpu_to_free_list_ptr(pool, _ptr);
1103 /*
1104 * Setting the next pointer will overwrite the first uintptr_t
1105 * poison for either CPU 0 (populate all) or init data (populate
1106 * none).
1107 */
1108 item->next = head;
1109 pool->free_list_head = item;
1110 pthread_mutex_unlock(&pool->lock);
1111 }
1112
1113 struct rseq_mempool_set *rseq_mempool_set_create(void)
1114 {
1115 struct rseq_mempool_set *pool_set;
1116
1117 pool_set = calloc(1, sizeof(struct rseq_mempool_set));
1118 if (!pool_set)
1119 return NULL;
1120 pthread_mutex_init(&pool_set->lock, NULL);
1121 return pool_set;
1122 }
1123
1124 int rseq_mempool_set_destroy(struct rseq_mempool_set *pool_set)
1125 {
1126 int order, ret;
1127
1128 for (order = POOL_SET_MIN_ENTRY; order < POOL_SET_NR_ENTRIES; order++) {
1129 struct rseq_mempool *pool = pool_set->entries[order];
1130
1131 if (!pool)
1132 continue;
1133 ret = rseq_mempool_destroy(pool);
1134 if (ret)
1135 return ret;
1136 pool_set->entries[order] = NULL;
1137 }
1138 pthread_mutex_destroy(&pool_set->lock);
1139 free(pool_set);
1140 return 0;
1141 }
1142
1143 /* Ownership of pool is handed over to pool set on success. */
1144 int rseq_mempool_set_add_pool(struct rseq_mempool_set *pool_set, struct rseq_mempool *pool)
1145 {
1146 size_t item_order = pool->item_order;
1147 int ret = 0;
1148
1149 pthread_mutex_lock(&pool_set->lock);
1150 if (pool_set->entries[item_order]) {
1151 errno = EBUSY;
1152 ret = -1;
1153 goto end;
1154 }
1155 pool_set->entries[pool->item_order] = pool;
1156 end:
1157 pthread_mutex_unlock(&pool_set->lock);
1158 return ret;
1159 }
1160
1161 static
1162 void __rseq_percpu *__rseq_mempool_set_malloc(struct rseq_mempool_set *pool_set,
1163 void *init_ptr, size_t len, bool zeroed)
1164 {
1165 int order, min_order = POOL_SET_MIN_ENTRY;
1166 struct rseq_mempool *pool;
1167 void __rseq_percpu *addr;
1168
1169 order = rseq_get_count_order_ulong(len);
1170 if (order > POOL_SET_MIN_ENTRY)
1171 min_order = order;
1172 again:
1173 pthread_mutex_lock(&pool_set->lock);
1174 /* First smallest present pool where @len fits. */
1175 for (order = min_order; order < POOL_SET_NR_ENTRIES; order++) {
1176 pool = pool_set->entries[order];
1177
1178 if (!pool)
1179 continue;
1180 if (pool->item_len >= len)
1181 goto found;
1182 }
1183 pool = NULL;
1184 found:
1185 pthread_mutex_unlock(&pool_set->lock);
1186 if (pool) {
1187 addr = __rseq_percpu_malloc(pool, zeroed, init_ptr, len);
1188 if (addr == NULL && errno == ENOMEM) {
1189 /*
1190 * If the allocation failed, try again with a
1191 * larger pool.
1192 */
1193 min_order = order + 1;
1194 goto again;
1195 }
1196 } else {
1197 /* Not found. */
1198 errno = ENOMEM;
1199 addr = NULL;
1200 }
1201 return addr;
1202 }
1203
1204 void __rseq_percpu *rseq_mempool_set_percpu_malloc(struct rseq_mempool_set *pool_set, size_t len)
1205 {
1206 return __rseq_mempool_set_malloc(pool_set, NULL, len, false);
1207 }
1208
1209 void __rseq_percpu *rseq_mempool_set_percpu_zmalloc(struct rseq_mempool_set *pool_set, size_t len)
1210 {
1211 return __rseq_mempool_set_malloc(pool_set, NULL, len, true);
1212 }
1213
1214 void __rseq_percpu *rseq_mempool_set_percpu_malloc_init(struct rseq_mempool_set *pool_set,
1215 void *init_ptr, size_t len)
1216 {
1217 return __rseq_mempool_set_malloc(pool_set, init_ptr, len, false);
1218 }
1219
1220 struct rseq_mempool_attr *rseq_mempool_attr_create(void)
1221 {
1222 return calloc(1, sizeof(struct rseq_mempool_attr));
1223 }
1224
1225 void rseq_mempool_attr_destroy(struct rseq_mempool_attr *attr)
1226 {
1227 free(attr);
1228 }
1229
1230 int rseq_mempool_attr_set_mmap(struct rseq_mempool_attr *attr,
1231 void *(*mmap_func)(void *priv, size_t len),
1232 int (*munmap_func)(void *priv, void *ptr, size_t len),
1233 void *mmap_priv)
1234 {
1235 if (!attr) {
1236 errno = EINVAL;
1237 return -1;
1238 }
1239 attr->mmap_set = true;
1240 attr->mmap_func = mmap_func;
1241 attr->munmap_func = munmap_func;
1242 attr->mmap_priv = mmap_priv;
1243 return 0;
1244 }
1245
1246 int rseq_mempool_attr_set_init(struct rseq_mempool_attr *attr,
1247 int (*init_func)(void *priv, void *addr, size_t len, int cpu),
1248 void *init_priv)
1249 {
1250 if (!attr) {
1251 errno = EINVAL;
1252 return -1;
1253 }
1254 attr->init_set = true;
1255 attr->init_func = init_func;
1256 attr->init_priv = init_priv;
1257 return 0;
1258 }
1259
1260 int rseq_mempool_attr_set_robust(struct rseq_mempool_attr *attr)
1261 {
1262 if (!attr) {
1263 errno = EINVAL;
1264 return -1;
1265 }
1266 attr->robust_set = true;
1267 return 0;
1268 }
1269
1270 int rseq_mempool_attr_set_percpu(struct rseq_mempool_attr *attr,
1271 size_t stride, int max_nr_cpus)
1272 {
1273 if (!attr) {
1274 errno = EINVAL;
1275 return -1;
1276 }
1277 attr->type = MEMPOOL_TYPE_PERCPU;
1278 attr->stride = stride;
1279 attr->max_nr_cpus = max_nr_cpus;
1280 return 0;
1281 }
1282
1283 int rseq_mempool_attr_set_global(struct rseq_mempool_attr *attr,
1284 size_t stride)
1285 {
1286 if (!attr) {
1287 errno = EINVAL;
1288 return -1;
1289 }
1290 attr->type = MEMPOOL_TYPE_GLOBAL;
1291 attr->stride = stride;
1292 attr->max_nr_cpus = 0;
1293 return 0;
1294 }
1295
1296 int rseq_mempool_attr_set_max_nr_ranges(struct rseq_mempool_attr *attr,
1297 unsigned long max_nr_ranges)
1298 {
1299 if (!attr) {
1300 errno = EINVAL;
1301 return -1;
1302 }
1303 attr->max_nr_ranges = max_nr_ranges;
1304 return 0;
1305 }
1306
1307 int rseq_mempool_attr_set_poison(struct rseq_mempool_attr *attr,
1308 uintptr_t poison)
1309 {
1310 if (!attr) {
1311 errno = EINVAL;
1312 return -1;
1313 }
1314 attr->poison_set = true;
1315 attr->poison = poison;
1316 return 0;
1317 }
1318
1319 int rseq_mempool_attr_set_populate_policy(struct rseq_mempool_attr *attr,
1320 enum rseq_mempool_populate_policy policy)
1321 {
1322 if (!attr) {
1323 errno = EINVAL;
1324 return -1;
1325 }
1326 attr->populate_policy = policy;
1327 return 0;
1328 }
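/*
 * Example usage (sketch): selecting the populate none policy. The
 * RSEQ_MEMPOOL_POPULATE_NONE constant is assumed to be declared in
 * <rseq/mempool.h> next to RSEQ_MEMPOOL_POPULATE_ALL; check the header
 * for the exact name. Error handling is omitted.
 *
 *	struct rseq_mempool_attr *attr = rseq_mempool_attr_create();
 *	struct rseq_mempool *pool;
 *
 *	rseq_mempool_attr_set_percpu(attr, RSEQ_MEMPOOL_STRIDE, 0);
 *	rseq_mempool_attr_set_populate_policy(attr, RSEQ_MEMPOOL_POPULATE_NONE);
 *	pool = rseq_mempool_create("lazy", 64, attr);
 *	rseq_mempool_attr_destroy(attr);
 */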
1329
1330 int rseq_mempool_get_max_nr_cpus(struct rseq_mempool *mempool)
1331 {
1332 if (!mempool || mempool->attr.type != MEMPOOL_TYPE_PERCPU) {
1333 errno = EINVAL;
1334 return -1;
1335 }
1336 return mempool->attr.max_nr_cpus;
1337 }