librseq.git / src/rseq-mempool.c
1 // SPDX-License-Identifier: MIT
2 // SPDX-FileCopyrightText: 2024 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
3
4 #include <rseq/mempool.h>
5 #include <sys/mman.h>
6 #include <assert.h>
7 #include <string.h>
8 #include <pthread.h>
9 #include <unistd.h>
10 #include <stdlib.h>
11 #include <rseq/compiler.h>
12 #include <errno.h>
13 #include <stdint.h>
14 #include <stdbool.h>
15 #include <stdio.h>
16 #include <fcntl.h>
17
18 #ifdef HAVE_LIBNUMA
19 # include <numa.h>
20 # include <numaif.h>
21 #endif
22
23 #include "rseq-utils.h"
24 #include <rseq/rseq.h>
25
26 /*
27 * rseq-mempool.c: rseq CPU-Local Storage (CLS) memory allocator.
28 *
29 * The rseq per-CPU memory allocator allows the application to request
30 * memory pools of CPU-Local memory, each containing objects of a given
31 * size (rounded up to the next power of 2), reserving a given virtual
32 * address space size per CPU, for a given maximum number of CPUs.
33 *
34 * The per-CPU memory allocator is analogous to TLS (Thread-Local
35 * Storage) memory: whereas TLS provides Thread-Local Storage, the
36 * per-CPU memory allocator provides CPU-Local Storage.
37 */
38
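/*
 * Example usage (illustrative sketch, not part of this file): it assumes
 * the public helpers declared in <rseq/mempool.h> and <rseq/rseq.h>,
 * notably rseq_percpu_ptr(), rseq_mempool_percpu_free() and
 * rseq_current_cpu_raw(); error handling is omitted.
 *
 *	struct counter { intptr_t count; };
 *
 *	struct rseq_mempool *pool;
 *	struct counter __rseq_percpu *c;
 *	int cpu;
 *
 *	pool = rseq_mempool_create("counters", sizeof(struct counter), NULL);
 *	c = (struct counter __rseq_percpu *) rseq_mempool_percpu_zmalloc(pool);
 *
 *	cpu = rseq_current_cpu_raw();
 *	rseq_percpu_ptr(c, cpu)->count++;	(CPU-local access)
 *
 *	rseq_mempool_percpu_free(c);
 *	rseq_mempool_destroy(pool);
 */
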
39 #define POOL_SET_NR_ENTRIES RSEQ_BITS_PER_LONG
40
41 /*
42 * Smallest allocation should hold enough space for a free list pointer.
43 */
44 #if RSEQ_BITS_PER_LONG == 64
45 # define POOL_SET_MIN_ENTRY 3 /* Smallest item_len=8 */
46 #else
47 # define POOL_SET_MIN_ENTRY 2 /* Smallest item_len=4 */
48 #endif
49
50 #define BIT_PER_ULONG (8 * sizeof(unsigned long))
51
52 #define MOVE_PAGES_BATCH_SIZE 4096
53
54 #define RANGE_HEADER_OFFSET sizeof(struct rseq_mempool_range)
55
56 #if RSEQ_BITS_PER_LONG == 64
57 # define DEFAULT_POISON_VALUE 0x5555555555555555ULL
58 #else
59 # define DEFAULT_POISON_VALUE 0x55555555UL
60 #endif
61
62 struct free_list_node;
63
64 struct free_list_node {
65 struct free_list_node *next;
66 };
67
68 enum mempool_type {
69 MEMPOOL_TYPE_GLOBAL = 0, /* Default */
70 MEMPOOL_TYPE_PERCPU = 1,
71 };
72
73 struct rseq_mempool_attr {
74 bool mmap_set;
75 void *(*mmap_func)(void *priv, size_t len);
76 int (*munmap_func)(void *priv, void *ptr, size_t len);
77 void *mmap_priv;
78
79 bool init_set;
80 int (*init_func)(void *priv, void *addr, size_t len, int cpu);
81 void *init_priv;
82
83 bool robust_set;
84
85 enum mempool_type type;
86 size_t stride;
87 int max_nr_cpus;
88
89 unsigned long max_nr_ranges;
90
91 bool poison_set;
92 uintptr_t poison;
93
94 enum rseq_mempool_populate_policy populate_policy;
95 };
96
97 struct rseq_mempool_range;
98
99 struct rseq_mempool_range {
100 struct rseq_mempool_range *next; /* Linked list of ranges. */
101 struct rseq_mempool *pool; /* Backward reference to container pool. */
102
103 /*
104 * Memory layout of a mempool range:
105 * - Header page (contains struct rseq_mempool_range at the very end),
106 * - Base of the per-cpu data, starting with CPU 0,
107 * - CPU 1,
108 * ...
109 * - CPU max_nr_cpus - 1
110 * - init values (only present when the populate policy is not RSEQ_MEMPOOL_POPULATE_ALL).
111 */
112 void *header;
113 void *base;
114 /*
115 * The init values area contains the initial values used by malloc_init/zmalloc.
116 * The pointer is NULL for RSEQ_MEMPOOL_POPULATE_ALL.
117 */
118 void *init;
119 size_t next_unused;
120
121 /* Pool range mmap/munmap */
122 void *mmap_addr;
123 size_t mmap_len;
124
125 /* Track alloc/free. */
126 unsigned long *alloc_bitmap;
127 };
128
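/*
 * Address arithmetic sketch for the range layout above (illustrative
 * only): an item at @item_offset within a range whose per-CPU area
 * starts at @base, with a per-CPU @stride, has one copy per CPU at:
 *
 *	item_addr(cpu) = base + stride * cpu + item_offset
 *
 * and, when the init values area is present, an extra copy at:
 *
 *	init_addr = base + stride * max_nr_cpus + item_offset
 */
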
129 struct rseq_mempool {
130 /* Head of ranges linked-list. */
131 struct rseq_mempool_range *range_list;
132 unsigned long nr_ranges;
133
134 size_t item_len;
135 int item_order;
136
137 /*
138 * The free list chains freed items on the CPU 0 address range.
139 * We should rethink this decision if false sharing between
140 * malloc/free from other CPUs and data accesses from CPU 0
141 * becomes an issue. This is a NULL-terminated singly-linked
142 * list.
143 */
144 struct free_list_node *free_list_head;
145
146 /* This lock protects allocation/free within the pool. */
147 pthread_mutex_t lock;
148
149 struct rseq_mempool_attr attr;
150 char *name;
151 };
152
153 /*
154 * Pool set entries are indexed by item_len rounded to the next power of
155 * 2. A pool set can contain NULL pool entries, in which case the next
156 * large enough entry will be used for allocation.
157 */
158 struct rseq_mempool_set {
159 /* This lock protects add vs malloc/zmalloc within the pool set. */
160 pthread_mutex_t lock;
161 struct rseq_mempool *entries[POOL_SET_NR_ENTRIES];
162 };
163
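/*
 * Pool set usage sketch (illustrative, error handling omitted): the set
 * routes each allocation to the smallest registered pool whose item_len
 * is large enough.
 *
 *	struct rseq_mempool_set *set = rseq_mempool_set_create();
 *	size_t len;
 *
 *	for (len = 16; len <= 4096; len <<= 1)
 *		(void) rseq_mempool_set_add_pool(set,
 *			rseq_mempool_create("set-pool", len, NULL));
 *
 *	// A 40-byte request is served from the 64-byte pool.
 *	void __rseq_percpu *p = rseq_mempool_set_percpu_zmalloc(set, 40);
 *	...
 *	rseq_mempool_set_destroy(set);
 */
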
164 /*
165 * This memfd is used to implement the user-space COW behavior for the
166 * page protection scheme. The memfd is a sparse virtual file. Its layout
167 * (in offsets from the beginning of the file) matches the process
168 * address space (pointers are directly converted to file offsets).
169 */
170 struct rseq_memfd {
171 pthread_mutex_t lock;
172 size_t reserved_size;
173 unsigned int refcount;
174 int fd;
175 };
176
177 static struct rseq_memfd memfd = {
178 .lock = PTHREAD_MUTEX_INITIALIZER,
179 .reserved_size = 0,
180 .refcount = 0,
181 .fd = -1,
182 };
183
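/*
 * Mapping sketch (illustrative, simplified from rseq_mempool_range_create()
 * below) for ranges whose populate policy is not RSEQ_MEMPOOL_POPULATE_ALL;
 * @off is the memfd offset obtained from ptr_to_off_t(range->init):
 *
 *	// Shared mapping of the init values, backed by the memfd.
 *	mmap(init, stride, PROT_READ | PROT_WRITE,
 *	     MAP_SHARED | MAP_FIXED, memfd.fd, off);
 *
 *	// Each CPU's slice is a private mapping of the same file offset:
 *	// it reads the init values until its first write, which COWs the
 *	// touched page into a CPU-private copy.
 *	mmap(base + stride * cpu, stride, PROT_READ | PROT_WRITE,
 *	     MAP_PRIVATE | MAP_FIXED, memfd.fd, off);
 */
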
184 static
185 const char *get_pool_name(const struct rseq_mempool *pool)
186 {
187 return pool->name ? : "<anonymous>";
188 }
189
190 static
191 void *__rseq_pool_range_percpu_ptr(const struct rseq_mempool_range *range, int cpu,
192 uintptr_t item_offset, size_t stride)
193 {
194 return range->base + (stride * cpu) + item_offset;
195 }
196
197 static
198 void *__rseq_pool_range_init_ptr(const struct rseq_mempool_range *range,
199 uintptr_t item_offset)
200 {
201 if (!range->init)
202 return NULL;
203 return range->init + item_offset;
204 }
205
206 static
207 void __rseq_percpu *__rseq_free_list_to_percpu_ptr(const struct rseq_mempool *pool,
208 struct free_list_node *node)
209 {
210 void __rseq_percpu *p = (void __rseq_percpu *) node;
211
212 if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_ALL)
213 p -= pool->attr.max_nr_cpus * pool->attr.stride;
214 return p;
215 }
216
217 static
218 struct free_list_node *__rseq_percpu_to_free_list_ptr(const struct rseq_mempool *pool,
219 void __rseq_percpu *p)
220 {
221 if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_ALL)
222 p += pool->attr.max_nr_cpus * pool->attr.stride;
223 return (struct free_list_node *) p;
224 }
225
226 static
227 off_t ptr_to_off_t(void *p)
228 {
229 return (off_t) (uintptr_t) p;
230 }
231
232 static
233 int memcmpbyte(const char *s, int c, size_t n)
234 {
235 int res = 0;
236
237 while (n-- > 0)
238 if ((res = *(s++) - c) != 0)
239 break;
240 return res;
241 }
242
243 static
244 void rseq_percpu_zero_item(struct rseq_mempool *pool,
245 struct rseq_mempool_range *range, uintptr_t item_offset)
246 {
247 char *init_p = NULL;
248 int i;
249
250 init_p = __rseq_pool_range_init_ptr(range, item_offset);
251 if (init_p)
252 memset(init_p, 0, pool->item_len);
253 for (i = 0; i < pool->attr.max_nr_cpus; i++) {
254 char *p = __rseq_pool_range_percpu_ptr(range, i,
255 item_offset, pool->attr.stride);
256
257 /* Skip if the update already propagated through the shared init mapping (page not yet COWed). */
258 if (init_p && !memcmpbyte(p, 0, pool->item_len))
259 continue;
260 memset(p, 0, pool->item_len);
261 }
262 }
263
264 static
265 void rseq_percpu_init_item(struct rseq_mempool *pool,
266 struct rseq_mempool_range *range, uintptr_t item_offset,
267 void *init_ptr, size_t init_len)
268 {
269 char *init_p = NULL;
270 int i;
271
272 init_p = __rseq_pool_range_init_ptr(range, item_offset);
273 if (init_p)
274 memcpy(init_p, init_ptr, init_len);
275 for (i = 0; i < pool->attr.max_nr_cpus; i++) {
276 char *p = __rseq_pool_range_percpu_ptr(range, i,
277 item_offset, pool->attr.stride);
278
279 /* Skip if the update already propagated through the shared init mapping (page not yet COWed). */
280 if (init_p && !memcmp(init_p, p, init_len))
281 continue;
282 memcpy(p, init_ptr, init_len);
283 }
284 }
285
286 static
287 void rseq_poison_item(void *p, size_t item_len, uintptr_t poison)
288 {
289 size_t offset;
290
291 for (offset = 0; offset < item_len; offset += sizeof(uintptr_t))
292 *((uintptr_t *) (p + offset)) = poison;
293 }
294
295 static
296 void rseq_percpu_poison_item(struct rseq_mempool *pool,
297 struct rseq_mempool_range *range, uintptr_t item_offset)
298 {
299 uintptr_t poison = pool->attr.poison;
300 char *init_p = NULL;
301 int i;
302
303 init_p = __rseq_pool_range_init_ptr(range, item_offset);
304 if (init_p)
305 rseq_poison_item(init_p, pool->item_len, poison);
306 for (i = 0; i < pool->attr.max_nr_cpus; i++) {
307 char *p = __rseq_pool_range_percpu_ptr(range, i,
308 item_offset, pool->attr.stride);
309
310 /* Skip if the update already propagated through the shared init mapping (page not yet COWed). */
311 if (init_p && !memcmp(init_p, p, pool->item_len))
312 continue;
313 rseq_poison_item(p, pool->item_len, poison);
314 }
315 }
316
317 /* Always inline for __builtin_return_address(0). */
318 static inline __attribute__((always_inline))
319 void rseq_check_poison_item(const struct rseq_mempool *pool, uintptr_t item_offset,
320 void *p, size_t item_len, uintptr_t poison, bool skip_freelist_ptr)
321 {
322 size_t offset;
323
324 for (offset = 0; offset < item_len; offset += sizeof(uintptr_t)) {
325 uintptr_t v;
326
327 /* Skip poison check for free-list pointer. */
328 if (skip_freelist_ptr && offset == 0)
329 continue;
330 v = *((uintptr_t *) (p + offset));
331 if (v != poison) {
332 fprintf(stderr, "%s: Poison corruption detected (0x%lx) for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
333 __func__, (unsigned long) v, get_pool_name(pool), pool, item_offset, (void *) __builtin_return_address(0));
334 abort();
335 }
336 }
337 }
338
339 /* Always inline for __builtin_return_address(0). */
340 static inline __attribute__((always_inline))
341 void rseq_percpu_check_poison_item(const struct rseq_mempool *pool,
342 const struct rseq_mempool_range *range, uintptr_t item_offset)
343 {
344 uintptr_t poison = pool->attr.poison;
345 char *init_p;
346 int i;
347
348 if (!pool->attr.robust_set)
349 return;
350 init_p = __rseq_pool_range_init_ptr(range, item_offset);
351 if (init_p)
352 rseq_check_poison_item(pool, item_offset, init_p, pool->item_len, poison, true);
353 for (i = 0; i < pool->attr.max_nr_cpus; i++) {
354 char *p = __rseq_pool_range_percpu_ptr(range, i,
355 item_offset, pool->attr.stride);
356 /*
357 * When the free list is embedded in the init values
358 * memory (populate none), it is visible from the init
359 * values memory mapping as well as per-cpu private
360 * mappings before they COW.
361 *
362 * When the free list is embedded in CPU 0 mapping
363 * (populate all), only this CPU must skip the free list
364 * nodes when checking poison.
365 */
366 rseq_check_poison_item(pool, item_offset, p, pool->item_len, poison,
367 init_p == NULL ? (i == 0) : true);
368 }
369 }
370
371 #ifdef HAVE_LIBNUMA
372 int rseq_mempool_range_init_numa(void *addr, size_t len, int cpu, int numa_flags)
373 {
374 unsigned long nr_pages, page_len;
375 int status[MOVE_PAGES_BATCH_SIZE];
376 int nodes[MOVE_PAGES_BATCH_SIZE];
377 void *pages[MOVE_PAGES_BATCH_SIZE];
378 long ret;
379
380 if (!numa_flags) {
381 errno = EINVAL;
382 return -1;
383 }
384 page_len = rseq_get_page_len();
385 nr_pages = len >> rseq_get_count_order_ulong(page_len);
386
387 nodes[0] = numa_node_of_cpu(cpu);
388 if (nodes[0] < 0)
389 return -1;
390
391 for (size_t k = 1; k < RSEQ_ARRAY_SIZE(nodes); ++k) {
392 nodes[k] = nodes[0];
393 }
394
395 for (unsigned long page = 0; page < nr_pages;) {
396
397 size_t max_k = RSEQ_ARRAY_SIZE(pages);
398 size_t left = nr_pages - page;
399
400 if (left < max_k) {
401 max_k = left;
402 }
403
404 for (size_t k = 0; k < max_k; ++k, ++page) {
405 pages[k] = addr + (page * page_len);
406 status[k] = -EPERM;
407 }
408
409 ret = move_pages(0, max_k, pages, nodes, status, numa_flags);
410
411 if (ret < 0)
412 return ret;
413
414 if (ret > 0) {
415 fprintf(stderr, "%ld pages were not migrated\n", ret);
416 for (size_t k = 0; k < max_k; ++k) {
417 if (status[k] < 0)
418 fprintf(stderr,
419 "Error while moving page %p to numa node %d: %d\n",
420 pages[k], nodes[k], -status[k]);
421 }
422 }
423 }
424 return 0;
425 }
426 #else
427 int rseq_mempool_range_init_numa(void *addr __attribute__((unused)),
428 size_t len __attribute__((unused)),
429 int cpu __attribute__((unused)),
430 int numa_flags __attribute__((unused)))
431 {
432 errno = ENOSYS;
433 return -1;
434 }
435 #endif
436
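/*
 * Usage sketch (illustrative, requires libnuma/HAVE_LIBNUMA; error handling
 * omitted): the NUMA helper above is typically invoked from an init
 * callback registered with rseq_mempool_attr_set_init(), so that each
 * CPU's range is moved to that CPU's NUMA node when the pool is created.
 *
 *	static int init_numa(void *priv __attribute__((unused)),
 *			void *addr, size_t len, int cpu)
 *	{
 *		if (cpu < 0)	// Global pools pass cpu == -1: nothing to do.
 *			return 0;
 *		return rseq_mempool_range_init_numa(addr, len, cpu, MPOL_MF_MOVE);
 *	}
 *
 *	struct rseq_mempool_attr *attr = rseq_mempool_attr_create();
 *	rseq_mempool_attr_set_init(attr, init_numa, NULL);
 *	pool = rseq_mempool_create("numa-pool", item_len, attr);
 *	rseq_mempool_attr_destroy(attr);
 */
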
437 static
438 void *default_mmap_func(void *priv __attribute__((unused)), size_t len)
439 {
440 void *base;
441
442 base = mmap(NULL, len, PROT_READ | PROT_WRITE,
443 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
444 if (base == MAP_FAILED)
445 return NULL;
446 return base;
447 }
448
449 static
450 int default_munmap_func(void *priv __attribute__((unused)), void *ptr, size_t len)
451 {
452 return munmap(ptr, len);
453 }
454
455 static
456 int create_alloc_bitmap(struct rseq_mempool *pool, struct rseq_mempool_range *range)
457 {
458 size_t count;
459
460 count = ((pool->attr.stride >> pool->item_order) + BIT_PER_ULONG - 1) / BIT_PER_ULONG;
461
462 /*
463 * Not being able to create the validation bitmap is an error
464 * that needs to be reported.
465 */
466 range->alloc_bitmap = calloc(count, sizeof(unsigned long));
467 if (!range->alloc_bitmap)
468 return -1;
469 return 0;
470 }
471
472 static
473 bool percpu_addr_in_pool(const struct rseq_mempool *pool, void __rseq_percpu *_addr)
474 {
475 struct rseq_mempool_range *range;
476 void *addr = (void *) _addr;
477
478 for (range = pool->range_list; range; range = range->next) {
479 if (addr >= range->base && addr < range->base + range->next_unused)
480 return true;
481 }
482 return false;
483 }
484
485 /* Always inline for __builtin_return_address(0). */
486 static inline __attribute__((always_inline))
487 void check_free_list(const struct rseq_mempool *pool)
488 {
489 size_t total_item = 0, total_never_allocated = 0, total_freed = 0,
490 max_list_traversal = 0, traversal_iteration = 0;
491 struct rseq_mempool_range *range;
492
493 if (!pool->attr.robust_set)
494 return;
495
496 for (range = pool->range_list; range; range = range->next) {
497 total_item += pool->attr.stride >> pool->item_order;
498 total_never_allocated += (pool->attr.stride - range->next_unused) >> pool->item_order;
499 }
500 max_list_traversal = total_item - total_never_allocated;
501
502 for (struct free_list_node *node = pool->free_list_head, *prev = NULL;
503 node;
504 prev = node,
505 node = node->next) {
506
507 if (traversal_iteration >= max_list_traversal) {
508 fprintf(stderr, "%s: Corrupted free-list; Possibly infinite loop in pool \"%s\" (%p), caller %p.\n",
509 __func__, get_pool_name(pool), pool, __builtin_return_address(0));
510 abort();
511 }
512
513 /* Node is out of range. */
514 if (!percpu_addr_in_pool(pool, __rseq_free_list_to_percpu_ptr(pool, node))) {
515 if (prev)
516 fprintf(stderr, "%s: Corrupted free-list node %p -> [out-of-range %p] in pool \"%s\" (%p), caller %p.\n",
517 __func__, prev, node, get_pool_name(pool), pool, __builtin_return_address(0));
518 else
519 fprintf(stderr, "%s: Corrupted free-list node [out-of-range %p] in pool \"%s\" (%p), caller %p.\n",
520 __func__, node, get_pool_name(pool), pool, __builtin_return_address(0));
521 abort();
522 }
523
524 traversal_iteration++;
525 total_freed++;
526 }
527
528 if (total_never_allocated + total_freed != total_item) {
529 fprintf(stderr, "%s: Corrupted free-list in pool \"%s\" (%p); total-item: %zu total-never-used: %zu total-freed: %zu, caller %p.\n",
530 __func__, get_pool_name(pool), pool, total_item, total_never_allocated, total_freed, __builtin_return_address(0));
531 abort();
532 }
533 }
534
535 /* Always inline for __builtin_return_address(0). */
536 static inline __attribute__((always_inline))
537 void check_range_poison(const struct rseq_mempool *pool,
538 const struct rseq_mempool_range *range)
539 {
540 size_t item_offset;
541
542 for (item_offset = 0; item_offset < range->next_unused;
543 item_offset += pool->item_len)
544 rseq_percpu_check_poison_item(pool, range, item_offset);
545 }
546
547 /* Always inline for __builtin_return_address(0). */
548 static inline __attribute__((always_inline))
549 void check_pool_poison(const struct rseq_mempool *pool)
550 {
551 struct rseq_mempool_range *range;
552
553 if (!pool->attr.robust_set)
554 return;
555 for (range = pool->range_list; range; range = range->next)
556 check_range_poison(pool, range);
557 }
558
559 /* Always inline for __builtin_return_address(0). */
560 static inline __attribute__((always_inline))
561 void destroy_alloc_bitmap(struct rseq_mempool *pool, struct rseq_mempool_range *range)
562 {
563 unsigned long *bitmap = range->alloc_bitmap;
564 size_t count, total_leaks = 0;
565
566 if (!bitmap)
567 return;
568
569 count = ((pool->attr.stride >> pool->item_order) + BIT_PER_ULONG - 1) / BIT_PER_ULONG;
570
571 /* Assert that all items in the pool were freed. */
572 for (size_t k = 0; k < count; ++k)
573 total_leaks += rseq_hweight_ulong(bitmap[k]);
574 if (total_leaks) {
575 fprintf(stderr, "%s: Pool \"%s\" (%p) has %zu leaked items on destroy, caller: %p.\n",
576 __func__, get_pool_name(pool), pool, total_leaks, (void *) __builtin_return_address(0));
577 abort();
578 }
579
580 free(bitmap);
581 range->alloc_bitmap = NULL;
582 }
583
584 /* Always inline for __builtin_return_address(0). */
585 static inline __attribute__((always_inline))
586 int rseq_mempool_range_destroy(struct rseq_mempool *pool,
587 struct rseq_mempool_range *range)
588 {
589 int ret = 0;
590
591 destroy_alloc_bitmap(pool, range);
592
593 /*
594 * Punch a hole into memfd where the init values used to be.
595 */
596 if (range->init) {
597 ret = fallocate(memfd.fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
598 ptr_to_off_t(range->init), pool->attr.stride);
599 if (ret)
600 return ret;
601 range->init = NULL;
602 }
603
604 /* The range header is stored in the page located just before the aligned per-CPU mapping; unmapping mmap_addr/mmap_len releases both. */
605 return pool->attr.munmap_func(pool->attr.mmap_priv, range->mmap_addr, range->mmap_len);
606 }
607
608 /*
609 * Allocate a memory mapping aligned on @alignment, with an optional
610 * @pre_header before the mapping.
611 */
612 static
613 void *aligned_mmap_anonymous(struct rseq_mempool *pool,
614 size_t page_size, size_t len, size_t alignment,
615 void **pre_header, size_t pre_header_len)
616 {
617 size_t minimum_page_count, page_count, extra, total_allocate = 0;
618 int page_order;
619 void *ptr;
620
621 if (len < page_size || alignment < page_size ||
622 !is_pow2(alignment) || (len & (alignment - 1))) {
623 errno = EINVAL;
624 return NULL;
625 }
626 page_order = rseq_get_count_order_ulong(page_size);
627 if (page_order < 0) {
628 errno = EINVAL;
629 return NULL;
630 }
631 if (pre_header_len && (pre_header_len & (page_size - 1))) {
632 errno = EINVAL;
633 return NULL;
634 }
635
636 minimum_page_count = (pre_header_len + len) >> page_order;
637 page_count = (pre_header_len + len + alignment - page_size) >> page_order;
638
639 assert(page_count >= minimum_page_count);
640
641 ptr = pool->attr.mmap_func(pool->attr.mmap_priv, page_count << page_order);
642 if (!ptr)
643 goto alloc_error;
644
645 total_allocate = page_count << page_order;
646
647 if (!(((uintptr_t) ptr + pre_header_len) & (alignment - 1))) {
648 /* Pointer is already aligned. ptr points to pre_header. */
649 goto out;
650 }
651
652 /* Unmap extra before. */
653 extra = offset_align((uintptr_t) ptr + pre_header_len, alignment);
654 assert(!(extra & (page_size - 1)));
655 if (pool->attr.munmap_func(pool->attr.mmap_priv, ptr, extra)) {
656 perror("munmap");
657 abort();
658 }
659 total_allocate -= extra;
660 ptr += extra; /* ptr points to pre_header */
661 page_count -= extra >> page_order;
662 out:
663 assert(page_count >= minimum_page_count);
664
665 if (page_count > minimum_page_count) {
666 void *extra_ptr;
667
668 /* Unmap extra after. */
669 extra_ptr = ptr + (minimum_page_count << page_order);
670 extra = (page_count - minimum_page_count) << page_order;
671 if (pool->attr.munmap_func(pool->attr.mmap_priv, extra_ptr, extra)) {
672 perror("munmap");
673 abort();
674 }
675 total_allocate -= extra;
676 }
677
678 assert(!(((uintptr_t)ptr + pre_header_len) & (alignment - 1)));
679 assert(total_allocate == len + pre_header_len);
680
681 alloc_error:
682 if (ptr) {
683 if (pre_header)
684 *pre_header = ptr;
685 ptr += pre_header_len;
686 }
687 return ptr;
688 }
689
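/*
 * Worked example (illustrative numbers) for the trimming above: with
 * page_size = 4 KiB, pre_header_len = 4 KiB, len = 1 MiB and
 * alignment = 1 MiB, minimum_page_count = 257 and page_count = 512.
 * If mmap() happens to return 0x7f0000003000, the first address past
 * ptr + pre_header_len that is 1 MiB aligned is 0x7f0000100000, so
 * 0xfc000 bytes (252 pages) are unmapped before the pre-header, and the
 * 3 pages beyond the 257 needed ones are unmapped after, leaving exactly
 * pre_header_len + len bytes mapped, with the post-header area aligned.
 */
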
690 static
691 int rseq_memfd_reserve_init(void *init, size_t init_len)
692 {
693 int ret = 0;
694 size_t reserve_len;
695
696 pthread_mutex_lock(&memfd.lock);
697 reserve_len = (size_t) ptr_to_off_t(init) + init_len;
698 if (reserve_len > memfd.reserved_size) {
699 if (ftruncate(memfd.fd, (off_t) reserve_len)) {
700 ret = -1;
701 goto unlock;
702 }
703 memfd.reserved_size = reserve_len;
704 }
705 unlock:
706 pthread_mutex_unlock(&memfd.lock);
707 return ret;
708 }
709
710 static
711 struct rseq_mempool_range *rseq_mempool_range_create(struct rseq_mempool *pool)
712 {
713 struct rseq_mempool_range *range;
714 unsigned long page_size;
715 void *header;
716 void *base;
717 size_t range_len; /* Range len excludes header. */
718
719 if (pool->attr.max_nr_ranges &&
720 pool->nr_ranges >= pool->attr.max_nr_ranges) {
721 errno = ENOMEM;
722 return NULL;
723 }
724 page_size = rseq_get_page_len();
725
726 range_len = pool->attr.stride * pool->attr.max_nr_cpus;
727 if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_ALL)
728 range_len += pool->attr.stride; /* init values */
729 base = aligned_mmap_anonymous(pool, page_size,
730 range_len,
731 pool->attr.stride,
732 &header, page_size);
733 if (!base)
734 return NULL;
735 range = (struct rseq_mempool_range *) (base - RANGE_HEADER_OFFSET);
736 range->pool = pool;
737 range->header = header;
738 range->base = base;
739 range->mmap_addr = header;
740 range->mmap_len = page_size + range_len;
741
742 if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_ALL) {
743 range->init = base + (pool->attr.stride * pool->attr.max_nr_cpus);
744 /* Populate init values pages from memfd */
745 if (rseq_memfd_reserve_init(range->init, pool->attr.stride))
746 goto error_alloc;
747 if (mmap(range->init, pool->attr.stride, PROT_READ | PROT_WRITE,
748 MAP_SHARED | MAP_FIXED, memfd.fd,
749 ptr_to_off_t(range->init)) != (void *) range->init) {
750 goto error_alloc;
751 }
752 assert(pool->attr.type == MEMPOOL_TYPE_PERCPU);
753 /*
754 * Map per-cpu memory as private COW mappings of init values.
755 */
756 {
757 int cpu;
758
759 for (cpu = 0; cpu < pool->attr.max_nr_cpus; cpu++) {
760 void *p = base + (pool->attr.stride * cpu);
761 size_t len = pool->attr.stride;
762
763 if (mmap(p, len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FIXED,
764 memfd.fd, ptr_to_off_t(range->init)) != (void *) p) {
765 goto error_alloc;
766 }
767 }
768 }
769 }
770
771 if (pool->attr.robust_set) {
772 if (create_alloc_bitmap(pool, range))
773 goto error_alloc;
774 }
775 if (pool->attr.init_set) {
776 switch (pool->attr.type) {
777 case MEMPOOL_TYPE_GLOBAL:
778 if (pool->attr.init_func(pool->attr.init_priv,
779 base, pool->attr.stride, -1)) {
780 goto error_alloc;
781 }
782 break;
783 case MEMPOOL_TYPE_PERCPU:
784 {
785 int cpu;
786 for (cpu = 0; cpu < pool->attr.max_nr_cpus; cpu++) {
787 if (pool->attr.init_func(pool->attr.init_priv,
788 base + (pool->attr.stride * cpu),
789 pool->attr.stride, cpu)) {
790 goto error_alloc;
791 }
792 }
793 break;
794 }
795 default:
796 abort();
797 }
798 }
799 pool->nr_ranges++;
800 return range;
801
802 error_alloc:
803 (void) rseq_mempool_range_destroy(pool, range);
804 return NULL;
805 }
806
807 static
808 int rseq_mempool_memfd_ref(struct rseq_mempool *pool)
809 {
810 int ret = 0;
811
812 if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_ALL)
813 return 0;
814
815 pthread_mutex_lock(&memfd.lock);
816 if (memfd.refcount == 0) {
817 memfd.fd = memfd_create("mempool", MFD_CLOEXEC);
818 if (memfd.fd < 0) {
819 perror("memfd_create");
820 ret = -1;
821 goto unlock;
822 }
823 }
824 memfd.refcount++;
825 unlock:
826 pthread_mutex_unlock(&memfd.lock);
827 return ret;
828 }
829
830 static
831 void rseq_mempool_memfd_unref(struct rseq_mempool *pool)
832 {
833 if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_ALL)
834 return;
835
836 pthread_mutex_lock(&memfd.lock);
837 if (memfd.refcount == 1) {
838 if (close(memfd.fd)) {
839 perror("close");
840 abort();
841 }
842 memfd.fd = -1;
843 memfd.reserved_size = 0;
844 }
845 memfd.refcount--;
846 pthread_mutex_unlock(&memfd.lock);
847 }
848
849 int rseq_mempool_destroy(struct rseq_mempool *pool)
850 {
851 struct rseq_mempool_range *range, *next_range;
852 int ret = 0;
853
854 if (!pool)
855 return 0;
856 check_free_list(pool);
857 check_pool_poison(pool);
858 /* Iteration safe against removal. */
859 for (range = pool->range_list; range && (next_range = range->next, 1); range = next_range) {
860 if (rseq_mempool_range_destroy(pool, range))
861 goto end;
862 /* Update list head to keep list coherent in case of partial failure. */
863 pool->range_list = next_range;
864 }
865 rseq_mempool_memfd_unref(pool);
866 pthread_mutex_destroy(&pool->lock);
867 free(pool->name);
868 free(pool);
869 end:
870 return ret;
871 }
872
873 struct rseq_mempool *rseq_mempool_create(const char *pool_name,
874 size_t item_len, const struct rseq_mempool_attr *_attr)
875 {
876 struct rseq_mempool *pool;
877 struct rseq_mempool_attr attr = {};
878 int order;
879
880 /* Make sure each item is large enough to contain free list pointers. */
881 if (item_len < sizeof(void *))
882 item_len = sizeof(void *);
883
884 /* Align item_len on next power of two. */
885 order = rseq_get_count_order_ulong(item_len);
886 if (order < 0) {
887 errno = EINVAL;
888 return NULL;
889 }
890 item_len = 1UL << order;
891
892 if (_attr)
893 memcpy(&attr, _attr, sizeof(attr));
894 if (!attr.mmap_set) {
895 attr.mmap_func = default_mmap_func;
896 attr.munmap_func = default_munmap_func;
897 attr.mmap_priv = NULL;
898 }
899
900 switch (attr.type) {
901 case MEMPOOL_TYPE_PERCPU:
902 if (attr.max_nr_cpus < 0) {
903 errno = EINVAL;
904 return NULL;
905 }
906 if (attr.max_nr_cpus == 0) {
907 /* Auto-detect */
908 attr.max_nr_cpus = rseq_get_max_nr_cpus();
909 if (attr.max_nr_cpus == 0) {
910 errno = EINVAL;
911 return NULL;
912 }
913 }
914 break;
915 case MEMPOOL_TYPE_GLOBAL:
916 /* Override populate policy for global type. */
917 attr.populate_policy = RSEQ_MEMPOOL_POPULATE_ALL;
918 /* Use a 1-cpu pool for global mempool type. */
919 attr.max_nr_cpus = 1;
920 break;
921 }
922 if (!attr.stride)
923 attr.stride = RSEQ_MEMPOOL_STRIDE; /* Use default */
924 if (attr.robust_set && !attr.poison_set) {
925 attr.poison_set = true;
926 attr.poison = DEFAULT_POISON_VALUE;
927 }
928 if (item_len > attr.stride || attr.stride < (size_t) rseq_get_page_len() ||
929 !is_pow2(attr.stride)) {
930 errno = EINVAL;
931 return NULL;
932 }
933
934 pool = calloc(1, sizeof(struct rseq_mempool));
935 if (!pool)
936 return NULL;
937
938 memcpy(&pool->attr, &attr, sizeof(attr));
939 pthread_mutex_init(&pool->lock, NULL);
940 pool->item_len = item_len;
941 pool->item_order = order;
942
943 if (rseq_mempool_memfd_ref(pool))
944 goto error_alloc;
945
946 pool->range_list = rseq_mempool_range_create(pool);
947 if (!pool->range_list)
948 goto error_alloc;
949
950 if (pool_name) {
951 pool->name = strdup(pool_name);
952 if (!pool->name)
953 goto error_alloc;
954 }
955 return pool;
956
957 error_alloc:
958 rseq_mempool_destroy(pool);
959 errno = ENOMEM;
960 return NULL;
961 }
962
963 /* Always inline for __builtin_return_address(0). */
964 static inline __attribute__((always_inline))
965 void set_alloc_slot(struct rseq_mempool *pool, struct rseq_mempool_range *range, size_t item_offset)
966 {
967 unsigned long *bitmap = range->alloc_bitmap;
968 size_t item_index = item_offset >> pool->item_order;
969 unsigned long mask;
970 size_t k;
971
972 if (!bitmap)
973 return;
974
975 k = item_index / BIT_PER_ULONG;
976 mask = 1ULL << (item_index % BIT_PER_ULONG);
977
978 /* Print error if bit is already set. */
979 if (bitmap[k] & mask) {
980 fprintf(stderr, "%s: Allocator corruption detected for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
981 __func__, get_pool_name(pool), pool, item_offset, (void *) __builtin_return_address(0));
982 abort();
983 }
984 bitmap[k] |= mask;
985 }
986
987 static
988 void __rseq_percpu *__rseq_percpu_malloc(struct rseq_mempool *pool,
989 bool zeroed, void *init_ptr, size_t init_len)
990 {
991 struct rseq_mempool_range *range;
992 struct free_list_node *node;
993 uintptr_t item_offset;
994 void __rseq_percpu *addr;
995
996 if (init_len > pool->item_len) {
997 errno = EINVAL;
998 return NULL;
999 }
1000 pthread_mutex_lock(&pool->lock);
1001 /* Get first entry from free list. */
1002 node = pool->free_list_head;
1003 if (node != NULL) {
1004 void *range_base, *ptr;
1005
1006 ptr = __rseq_free_list_to_percpu_ptr(pool, node);
1007 range_base = (void *) ((uintptr_t) ptr & (~(pool->attr.stride - 1)));
1008 range = (struct rseq_mempool_range *) (range_base - RANGE_HEADER_OFFSET);
1009 /* Remove node from free list (update head). */
1010 pool->free_list_head = node->next;
1011 item_offset = (uintptr_t) (ptr - range_base);
1012 rseq_percpu_check_poison_item(pool, range, item_offset);
1013 addr = __rseq_free_list_to_percpu_ptr(pool, node);
1014 goto end;
1015 }
1016 /*
1017 * If the most recent range (first in list) does not have any
1018 * room left, create a new range and prepend it to the list
1019 * head.
1020 */
1021 range = pool->range_list;
1022 if (range->next_unused + pool->item_len > pool->attr.stride) {
1023 range = rseq_mempool_range_create(pool);
1024 if (!range) {
1025 errno = ENOMEM;
1026 addr = NULL;
1027 goto end;
1028 }
1029 /* Add range to head of list. */
1030 range->next = pool->range_list;
1031 pool->range_list = range;
1032 }
1033 /* First range in list has room left. */
1034 item_offset = range->next_unused;
1035 addr = (void __rseq_percpu *) (range->base + item_offset);
1036 range->next_unused += pool->item_len;
1037 end:
1038 if (addr)
1039 set_alloc_slot(pool, range, item_offset);
1040 pthread_mutex_unlock(&pool->lock);
1041 if (addr) {
1042 if (zeroed)
1043 rseq_percpu_zero_item(pool, range, item_offset);
1044 else if (init_ptr) {
1045 rseq_percpu_init_item(pool, range, item_offset,
1046 init_ptr, init_len);
1047 }
1048 }
1049 return addr;
1050 }
1051
1052 void __rseq_percpu *rseq_mempool_percpu_malloc(struct rseq_mempool *pool)
1053 {
1054 return __rseq_percpu_malloc(pool, false, NULL, 0);
1055 }
1056
1057 void __rseq_percpu *rseq_mempool_percpu_zmalloc(struct rseq_mempool *pool)
1058 {
1059 return __rseq_percpu_malloc(pool, true, NULL, 0);
1060 }
1061
1062 void __rseq_percpu *rseq_mempool_percpu_malloc_init(struct rseq_mempool *pool,
1063 void *init_ptr, size_t len)
1064 {
1065 return __rseq_percpu_malloc(pool, false, init_ptr, len);
1066 }
1067
1068 /* Always inline for __builtin_return_address(0). */
1069 static inline __attribute__((always_inline))
1070 void clear_alloc_slot(struct rseq_mempool *pool, struct rseq_mempool_range *range, size_t item_offset)
1071 {
1072 unsigned long *bitmap = range->alloc_bitmap;
1073 size_t item_index = item_offset >> pool->item_order;
1074 unsigned long mask;
1075 size_t k;
1076
1077 if (!bitmap)
1078 return;
1079
1080 k = item_index / BIT_PER_ULONG;
1081 mask = 1ULL << (item_index % BIT_PER_ULONG);
1082
1083 /* Print error if bit is not set. */
1084 if (!(bitmap[k] & mask)) {
1085 fprintf(stderr, "%s: Double-free detected for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
1086 __func__, get_pool_name(pool), pool, item_offset,
1087 (void *) __builtin_return_address(0));
1088 abort();
1089 }
1090 bitmap[k] &= ~mask;
1091 }
1092
1093 void librseq_mempool_percpu_free(void __rseq_percpu *_ptr, size_t stride)
1094 {
1095 uintptr_t ptr = (uintptr_t) _ptr;
1096 void *range_base = (void *) (ptr & (~(stride - 1)));
1097 struct rseq_mempool_range *range = (struct rseq_mempool_range *) (range_base - RANGE_HEADER_OFFSET);
1098 struct rseq_mempool *pool = range->pool;
1099 uintptr_t item_offset = ptr & (stride - 1);
1100 struct free_list_node *head, *item;
1101
1102 pthread_mutex_lock(&pool->lock);
1103 clear_alloc_slot(pool, range, item_offset);
1104 /* Add ptr to head of free list */
1105 head = pool->free_list_head;
1106 if (pool->attr.poison_set)
1107 rseq_percpu_poison_item(pool, range, item_offset);
1108 item = __rseq_percpu_to_free_list_ptr(pool, _ptr);
1109 /*
1110 * Setting the next pointer will overwrite the first uintptr_t
1111 * poison for either CPU 0 (populate all) or init data (populate
1112 * none).
1113 */
1114 item->next = head;
1115 pool->free_list_head = item;
1116 pthread_mutex_unlock(&pool->lock);
1117 }
1118
1119 struct rseq_mempool_set *rseq_mempool_set_create(void)
1120 {
1121 struct rseq_mempool_set *pool_set;
1122
1123 pool_set = calloc(1, sizeof(struct rseq_mempool_set));
1124 if (!pool_set)
1125 return NULL;
1126 pthread_mutex_init(&pool_set->lock, NULL);
1127 return pool_set;
1128 }
1129
1130 int rseq_mempool_set_destroy(struct rseq_mempool_set *pool_set)
1131 {
1132 int order, ret;
1133
1134 for (order = POOL_SET_MIN_ENTRY; order < POOL_SET_NR_ENTRIES; order++) {
1135 struct rseq_mempool *pool = pool_set->entries[order];
1136
1137 if (!pool)
1138 continue;
1139 ret = rseq_mempool_destroy(pool);
1140 if (ret)
1141 return ret;
1142 pool_set->entries[order] = NULL;
1143 }
1144 pthread_mutex_destroy(&pool_set->lock);
1145 free(pool_set);
1146 return 0;
1147 }
1148
1149 /* Ownership of pool is handed over to pool set on success. */
1150 int rseq_mempool_set_add_pool(struct rseq_mempool_set *pool_set, struct rseq_mempool *pool)
1151 {
1152 size_t item_order = pool->item_order;
1153 int ret = 0;
1154
1155 pthread_mutex_lock(&pool_set->lock);
1156 if (pool_set->entries[item_order]) {
1157 errno = EBUSY;
1158 ret = -1;
1159 goto end;
1160 }
1161 pool_set->entries[pool->item_order] = pool;
1162 end:
1163 pthread_mutex_unlock(&pool_set->lock);
1164 return ret;
1165 }
1166
1167 static
1168 void __rseq_percpu *__rseq_mempool_set_malloc(struct rseq_mempool_set *pool_set,
1169 void *init_ptr, size_t len, bool zeroed)
1170 {
1171 int order, min_order = POOL_SET_MIN_ENTRY;
1172 struct rseq_mempool *pool;
1173 void __rseq_percpu *addr;
1174
1175 order = rseq_get_count_order_ulong(len);
1176 if (order > POOL_SET_MIN_ENTRY)
1177 min_order = order;
1178 again:
1179 pthread_mutex_lock(&pool_set->lock);
1180 /* First smallest present pool where @len fits. */
1181 for (order = min_order; order < POOL_SET_NR_ENTRIES; order++) {
1182 pool = pool_set->entries[order];
1183
1184 if (!pool)
1185 continue;
1186 if (pool->item_len >= len)
1187 goto found;
1188 }
1189 pool = NULL;
1190 found:
1191 pthread_mutex_unlock(&pool_set->lock);
1192 if (pool) {
1193 addr = __rseq_percpu_malloc(pool, zeroed, init_ptr, len);
1194 if (addr == NULL && errno == ENOMEM) {
1195 /*
1196 * If the allocation failed, try again with a
1197 * larger pool.
1198 */
1199 min_order = order + 1;
1200 goto again;
1201 }
1202 } else {
1203 /* Not found. */
1204 errno = ENOMEM;
1205 addr = NULL;
1206 }
1207 return addr;
1208 }
1209
1210 void __rseq_percpu *rseq_mempool_set_percpu_malloc(struct rseq_mempool_set *pool_set, size_t len)
1211 {
1212 return __rseq_mempool_set_malloc(pool_set, NULL, len, false);
1213 }
1214
1215 void __rseq_percpu *rseq_mempool_set_percpu_zmalloc(struct rseq_mempool_set *pool_set, size_t len)
1216 {
1217 return __rseq_mempool_set_malloc(pool_set, NULL, len, true);
1218 }
1219
1220 void __rseq_percpu *rseq_mempool_set_percpu_malloc_init(struct rseq_mempool_set *pool_set,
1221 void *init_ptr, size_t len)
1222 {
1223 return __rseq_mempool_set_malloc(pool_set, init_ptr, len, false);
1224 }
1225
1226 struct rseq_mempool_attr *rseq_mempool_attr_create(void)
1227 {
1228 return calloc(1, sizeof(struct rseq_mempool_attr));
1229 }
1230
1231 void rseq_mempool_attr_destroy(struct rseq_mempool_attr *attr)
1232 {
1233 free(attr);
1234 }
1235
1236 int rseq_mempool_attr_set_mmap(struct rseq_mempool_attr *attr,
1237 void *(*mmap_func)(void *priv, size_t len),
1238 int (*munmap_func)(void *priv, void *ptr, size_t len),
1239 void *mmap_priv)
1240 {
1241 if (!attr) {
1242 errno = EINVAL;
1243 return -1;
1244 }
1245 attr->mmap_set = true;
1246 attr->mmap_func = mmap_func;
1247 attr->munmap_func = munmap_func;
1248 attr->mmap_priv = mmap_priv;
1249 return 0;
1250 }
1251
1252 int rseq_mempool_attr_set_init(struct rseq_mempool_attr *attr,
1253 int (*init_func)(void *priv, void *addr, size_t len, int cpu),
1254 void *init_priv)
1255 {
1256 if (!attr) {
1257 errno = EINVAL;
1258 return -1;
1259 }
1260 attr->init_set = true;
1261 attr->init_func = init_func;
1262 attr->init_priv = init_priv;
1263 return 0;
1264 }
1265
1266 int rseq_mempool_attr_set_robust(struct rseq_mempool_attr *attr)
1267 {
1268 if (!attr) {
1269 errno = EINVAL;
1270 return -1;
1271 }
1272 attr->robust_set = true;
1273 return 0;
1274 }
1275
1276 int rseq_mempool_attr_set_percpu(struct rseq_mempool_attr *attr,
1277 size_t stride, int max_nr_cpus)
1278 {
1279 if (!attr) {
1280 errno = EINVAL;
1281 return -1;
1282 }
1283 attr->type = MEMPOOL_TYPE_PERCPU;
1284 attr->stride = stride;
1285 attr->max_nr_cpus = max_nr_cpus;
1286 return 0;
1287 }
1288
1289 int rseq_mempool_attr_set_global(struct rseq_mempool_attr *attr,
1290 size_t stride)
1291 {
1292 if (!attr) {
1293 errno = EINVAL;
1294 return -1;
1295 }
1296 attr->type = MEMPOOL_TYPE_GLOBAL;
1297 attr->stride = stride;
1298 attr->max_nr_cpus = 0;
1299 return 0;
1300 }
1301
1302 int rseq_mempool_attr_set_max_nr_ranges(struct rseq_mempool_attr *attr,
1303 unsigned long max_nr_ranges)
1304 {
1305 if (!attr) {
1306 errno = EINVAL;
1307 return -1;
1308 }
1309 attr->max_nr_ranges = max_nr_ranges;
1310 return 0;
1311 }
1312
1313 int rseq_mempool_attr_set_poison(struct rseq_mempool_attr *attr,
1314 uintptr_t poison)
1315 {
1316 if (!attr) {
1317 errno = EINVAL;
1318 return -1;
1319 }
1320 attr->poison_set = true;
1321 attr->poison = poison;
1322 return 0;
1323 }
1324
1325 int rseq_mempool_attr_set_populate_policy(struct rseq_mempool_attr *attr,
1326 enum rseq_mempool_populate_policy policy)
1327 {
1328 if (!attr) {
1329 errno = EINVAL;
1330 return -1;
1331 }
1332 attr->populate_policy = policy;
1333 return 0;
1334 }
1335
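/*
 * Attribute composition sketch (illustrative, error handling omitted):
 * building a robust per-CPU pool with a custom poison value, a bounded
 * number of ranges and an explicit populate policy.
 *
 *	struct rseq_mempool_attr *attr = rseq_mempool_attr_create();
 *	struct rseq_mempool *pool;
 *
 *	rseq_mempool_attr_set_percpu(attr, RSEQ_MEMPOOL_STRIDE, 0);	// 0: auto-detect max_nr_cpus
 *	rseq_mempool_attr_set_robust(attr);
 *	rseq_mempool_attr_set_poison(attr, 0xdeadbeef);
 *	rseq_mempool_attr_set_max_nr_ranges(attr, 8);
 *	rseq_mempool_attr_set_populate_policy(attr, RSEQ_MEMPOOL_POPULATE_ALL);
 *
 *	pool = rseq_mempool_create("robust-pool", item_len, attr);
 *	rseq_mempool_attr_destroy(attr);
 */
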
1336 int rseq_mempool_get_max_nr_cpus(struct rseq_mempool *mempool)
1337 {
1338 if (!mempool || mempool->attr.type != MEMPOOL_TYPE_PERCPU) {
1339 errno = EINVAL;
1340 return -1;
1341 }
1342 return mempool->attr.max_nr_cpus;
1343 }