mempool: Introduce "private" populate policy
[librseq.git] / src / rseq-mempool.c
// SPDX-License-Identifier: MIT
// SPDX-FileCopyrightText: 2024 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>

#include <rseq/mempool.h>
#include <sys/mman.h>
#include <assert.h>
#include <string.h>
#include <pthread.h>
#include <unistd.h>
#include <stdlib.h>
#include <rseq/compiler.h>
#include <errno.h>
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>
#include <fcntl.h>

#ifdef HAVE_LIBNUMA
# include <numa.h>
# include <numaif.h>
#endif

#include "rseq-utils.h"
#include <rseq/rseq.h>

/*
 * rseq-mempool.c: rseq CPU-Local Storage (CLS) memory allocator.
 *
 * The rseq per-CPU memory allocator allows the application to request
 * memory pools of CPU-Local memory, each containing objects of a
 * given size (rounded to the next power of 2), reserving a given
 * virtual address space size per CPU, for a given maximum number of
 * CPUs.
 *
 * The per-CPU memory allocator is analogous to TLS (Thread-Local
 * Storage) memory: TLS is Thread-Local Storage, whereas the per-CPU
 * memory allocator provides CPU-Local Storage.
 */

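/*
 * Example (sketch): minimal per-CPU pool usage. This is an
 * illustrative sketch, not part of the allocator implementation:
 * rseq_percpu_ptr(), rseq_mempool_percpu_free() and
 * rseq_current_cpu_raw() are assumed to be provided by
 * <rseq/mempool.h> and <rseq/rseq.h>; error handling is omitted.
 *
 *	struct counter { intptr_t count; };
 *	struct rseq_mempool_attr *attr = rseq_mempool_attr_create();
 *	struct rseq_mempool *pool;
 *	struct counter __rseq_percpu *c;
 *
 *	// Per-CPU pool, default stride, auto-detected max_nr_cpus.
 *	rseq_mempool_attr_set_percpu(attr, RSEQ_MEMPOOL_STRIDE, 0);
 *	pool = rseq_mempool_create("counters", sizeof(struct counter), attr);
 *	rseq_mempool_attr_destroy(attr);
 *
 *	c = (struct counter __rseq_percpu *) rseq_mempool_percpu_zmalloc(pool);
 *	rseq_percpu_ptr(c, rseq_current_cpu_raw())->count++;
 *	rseq_mempool_percpu_free(c);
 *	rseq_mempool_destroy(pool);
 */
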
#define POOL_SET_NR_ENTRIES	RSEQ_BITS_PER_LONG

/*
 * Smallest allocation should hold enough space for a free list pointer.
 */
#if RSEQ_BITS_PER_LONG == 64
# define POOL_SET_MIN_ENTRY	3	/* Smallest item_len=8 */
#else
# define POOL_SET_MIN_ENTRY	2	/* Smallest item_len=4 */
#endif

#define BIT_PER_ULONG		(8 * sizeof(unsigned long))

#define MOVE_PAGES_BATCH_SIZE	4096

#define RANGE_HEADER_OFFSET	sizeof(struct rseq_mempool_range)

#if RSEQ_BITS_PER_LONG == 64
# define DEFAULT_POISON_VALUE	0x5555555555555555ULL
#else
# define DEFAULT_POISON_VALUE	0x55555555UL
#endif

struct free_list_node;

struct free_list_node {
	struct free_list_node *next;
};

enum mempool_type {
	MEMPOOL_TYPE_GLOBAL = 0,	/* Default */
	MEMPOOL_TYPE_PERCPU = 1,
};

struct rseq_mempool_attr {
	bool init_set;
	int (*init_func)(void *priv, void *addr, size_t len, int cpu);
	void *init_priv;

	bool robust_set;

	enum mempool_type type;
	size_t stride;
	int max_nr_cpus;

	unsigned long max_nr_ranges;

	bool poison_set;
	uintptr_t poison;

	enum rseq_mempool_populate_policy populate_policy;
};

struct rseq_mempool_range;

struct rseq_mempool_range {
	struct rseq_mempool_range *next;	/* Linked list of ranges. */
	struct rseq_mempool *pool;		/* Backward reference to container pool. */

	/*
	 * Memory layout of a mempool range:
	 * - Header page (contains struct rseq_mempool_range at the very end),
	 * - Base of the per-cpu data, starting with CPU 0.
	 *   Aliases with free-list for non-robust populate-all pools.
	 * - CPU 1,
	 * ...
	 * - CPU max_nr_cpus - 1
	 * - init values (unpopulated for RSEQ_MEMPOOL_POPULATE_PRIVATE_ALL).
	 *   Aliases with free-list for non-robust populate-none pools.
	 * - free list (for robust pools).
	 *
	 * The free list aliases the CPU 0 memory area for non-robust
	 * populate-all pools. It aliases with the init values for
	 * non-robust populate-none pools. It is located immediately
	 * after the init values for robust pools.
	 */
	void *header;
	void *base;
	/*
	 * The init values area contains the malloc_init/zmalloc initial
	 * values. The pointer is NULL for RSEQ_MEMPOOL_POPULATE_PRIVATE_ALL.
	 */
	void *init;
	size_t next_unused;

	/* Pool range mmap/munmap */
	void *mmap_addr;
	size_t mmap_len;

	/* Track alloc/free. */
	unsigned long *alloc_bitmap;
};

struct rseq_mempool {
	/* Head of ranges linked-list. */
	struct rseq_mempool_range *range_list;
	unsigned long nr_ranges;

	size_t item_len;
	int item_order;

	/*
	 * The free list chains freed items on the CPU 0 address range.
	 * We should rethink this decision if false sharing between
	 * malloc/free from other CPUs and data accesses from CPU 0
	 * becomes an issue. This is a NULL-terminated singly-linked
	 * list.
	 */
	struct free_list_node *free_list_head;

	/* This lock protects allocation/free within the pool. */
	pthread_mutex_t lock;

	struct rseq_mempool_attr attr;
	char *name;
};

/*
 * Pool set entries are indexed by item_len rounded to the next power of
 * 2. A pool set can contain NULL pool entries, in which case the next
 * large enough entry will be used for allocation.
 */
struct rseq_mempool_set {
	/* This lock protects add vs malloc/zmalloc within the pool set. */
	pthread_mutex_t lock;
	struct rseq_mempool *entries[POOL_SET_NR_ENTRIES];
};

/*
 * This memfd is used to implement the user COW behavior for the page
 * protection scheme. memfd is a sparse virtual file. Its layout (in
 * offset from beginning of file) matches the process address space
 * (pointers directly converted to file offsets).
 */
struct rseq_memfd {
	pthread_mutex_t lock;
	size_t reserved_size;
	unsigned int refcount;
	int fd;
};

static struct rseq_memfd memfd = {
	.lock = PTHREAD_MUTEX_INITIALIZER,
	.reserved_size = 0,
	.refcount = 0,
	.fd = -1,
};

static
const char *get_pool_name(const struct rseq_mempool *pool)
{
	return pool->name ? : "<anonymous>";
}

static
void *__rseq_pool_range_percpu_ptr(const struct rseq_mempool_range *range, int cpu,
		uintptr_t item_offset, size_t stride)
{
	return range->base + (stride * cpu) + item_offset;
}

static
void *__rseq_pool_range_init_ptr(const struct rseq_mempool_range *range,
		uintptr_t item_offset)
{
	if (!range->init)
		return NULL;
	return range->init + item_offset;
}

static
void __rseq_percpu *__rseq_free_list_to_percpu_ptr(const struct rseq_mempool *pool,
		struct free_list_node *node)
{
	void __rseq_percpu *p = (void __rseq_percpu *) node;

	if (pool->attr.robust_set) {
		/* Skip cpus. */
		p -= pool->attr.max_nr_cpus * pool->attr.stride;
		/* Skip init values */
		if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_PRIVATE_ALL)
			p -= pool->attr.stride;

	} else {
		/* Populate-none free list is in init values */
		if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_PRIVATE_ALL)
			p -= pool->attr.max_nr_cpus * pool->attr.stride;
	}
	return p;
}

static
struct free_list_node *__rseq_percpu_to_free_list_ptr(const struct rseq_mempool *pool,
		void __rseq_percpu *p)
{
	if (pool->attr.robust_set) {
		/* Skip cpus. */
		p += pool->attr.max_nr_cpus * pool->attr.stride;
		/* Skip init values */
		if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_PRIVATE_ALL)
			p += pool->attr.stride;

	} else {
		/* Populate-none free list is in init values */
		if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_PRIVATE_ALL)
			p += pool->attr.max_nr_cpus * pool->attr.stride;
	}
	return (struct free_list_node *) p;
}

static
off_t ptr_to_off_t(void *p)
{
	return (off_t) (uintptr_t) p;
}

static
intptr_t rseq_cmp_item(void *p, size_t item_len, intptr_t cmp_value, intptr_t *unexpected_value)
{
	size_t offset;
	intptr_t res = 0;

	for (offset = 0; offset < item_len; offset += sizeof(uintptr_t)) {
		intptr_t v = *((intptr_t *) (p + offset));

		if ((res = v - cmp_value) != 0) {
			if (unexpected_value)
				*unexpected_value = v;
			break;
		}
	}
	return res;
}

static
void rseq_percpu_zero_item(struct rseq_mempool *pool,
		struct rseq_mempool_range *range, uintptr_t item_offset)
{
	char *init_p = NULL;
	int i;

	init_p = __rseq_pool_range_init_ptr(range, item_offset);
	if (init_p)
		bzero(init_p, pool->item_len);
	for (i = 0; i < pool->attr.max_nr_cpus; i++) {
		char *p = __rseq_pool_range_percpu_ptr(range, i,
				item_offset, pool->attr.stride);

		/*
		 * If the item is already zeroed, either because the
		 * init range update has propagated or because the
		 * content is already zeroed (e.g. zero page), don't
		 * write to the page. This eliminates useless COW over
		 * the zero page just for overwriting it with zeroes.
		 *
		 * This means zmalloc() in a populate-all policy pool
		 * does not trigger COW for CPUs which are not actively
		 * writing to the pool. This is however not the case for
		 * malloc_init() in populate-all pools if it populates
		 * non-zero content.
		 */
		if (!rseq_cmp_item(p, pool->item_len, 0, NULL))
			continue;
		bzero(p, pool->item_len);
	}
}

static
void rseq_percpu_init_item(struct rseq_mempool *pool,
		struct rseq_mempool_range *range, uintptr_t item_offset,
		void *init_ptr, size_t init_len)
{
	char *init_p = NULL;
	int i;

	init_p = __rseq_pool_range_init_ptr(range, item_offset);
	if (init_p)
		memcpy(init_p, init_ptr, init_len);
	for (i = 0; i < pool->attr.max_nr_cpus; i++) {
		char *p = __rseq_pool_range_percpu_ptr(range, i,
				item_offset, pool->attr.stride);

		/*
		 * If the update propagated through a shared mapping,
		 * or the item already has the correct content, skip
		 * writing it into the cpu item to eliminate useless
		 * COW of the page.
		 */
		if (!memcmp(init_ptr, p, init_len))
			continue;
		memcpy(p, init_ptr, init_len);
	}
}

static
void rseq_poison_item(void *p, size_t item_len, uintptr_t poison)
{
	size_t offset;

	for (offset = 0; offset < item_len; offset += sizeof(uintptr_t))
		*((uintptr_t *) (p + offset)) = poison;
}

static
void rseq_percpu_poison_item(struct rseq_mempool *pool,
		struct rseq_mempool_range *range, uintptr_t item_offset)
{
	uintptr_t poison = pool->attr.poison;
	char *init_p = NULL;
	int i;

	init_p = __rseq_pool_range_init_ptr(range, item_offset);
	if (init_p)
		rseq_poison_item(init_p, pool->item_len, poison);
	for (i = 0; i < pool->attr.max_nr_cpus; i++) {
		char *p = __rseq_pool_range_percpu_ptr(range, i,
				item_offset, pool->attr.stride);

		/*
		 * If the update propagated through a shared mapping,
		 * or the item already has the correct content, skip
		 * writing it into the cpu item to eliminate useless
		 * COW of the page.
		 *
		 * It is recommended to use zero as the poison value for
		 * populate-all pools to eliminate COW due to writing
		 * poison to unused CPU memory.
		 */
		if (rseq_cmp_item(p, pool->item_len, poison, NULL) == 0)
			continue;
		rseq_poison_item(p, pool->item_len, poison);
	}
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void rseq_check_poison_item(const struct rseq_mempool *pool, uintptr_t item_offset,
		void *p, size_t item_len, uintptr_t poison)
{
	intptr_t unexpected_value;

	if (rseq_cmp_item(p, item_len, poison, &unexpected_value) == 0)
		return;

	fprintf(stderr, "%s: Poison corruption detected (0x%lx) for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
		__func__, (unsigned long) unexpected_value, get_pool_name(pool), pool, item_offset, (void *) __builtin_return_address(0));
	abort();
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void rseq_percpu_check_poison_item(const struct rseq_mempool *pool,
		const struct rseq_mempool_range *range, uintptr_t item_offset)
{
	uintptr_t poison = pool->attr.poison;
	char *init_p;
	int i;

	if (!pool->attr.robust_set)
		return;
	init_p = __rseq_pool_range_init_ptr(range, item_offset);
	if (init_p)
		rseq_check_poison_item(pool, item_offset, init_p, pool->item_len, poison);
	for (i = 0; i < pool->attr.max_nr_cpus; i++) {
		char *p = __rseq_pool_range_percpu_ptr(range, i,
				item_offset, pool->attr.stride);
		rseq_check_poison_item(pool, item_offset, p, pool->item_len, poison);
	}
}

#ifdef HAVE_LIBNUMA
int rseq_mempool_range_init_numa(void *addr, size_t len, int cpu, int numa_flags)
{
	unsigned long nr_pages, page_len;
	int status[MOVE_PAGES_BATCH_SIZE];
	int nodes[MOVE_PAGES_BATCH_SIZE];
	void *pages[MOVE_PAGES_BATCH_SIZE];
	long ret;

	if (!numa_flags) {
		errno = EINVAL;
		return -1;
	}
	page_len = rseq_get_page_len();
	nr_pages = len >> rseq_get_count_order_ulong(page_len);

	nodes[0] = numa_node_of_cpu(cpu);
	if (nodes[0] < 0)
		return -1;

	for (size_t k = 1; k < RSEQ_ARRAY_SIZE(nodes); ++k) {
		nodes[k] = nodes[0];
	}

	for (unsigned long page = 0; page < nr_pages;) {

		size_t max_k = RSEQ_ARRAY_SIZE(pages);
		size_t left = nr_pages - page;

		if (left < max_k) {
			max_k = left;
		}

		for (size_t k = 0; k < max_k; ++k, ++page) {
			pages[k] = addr + (page * page_len);
			status[k] = -EPERM;
		}

		ret = move_pages(0, max_k, pages, nodes, status, numa_flags);

		if (ret < 0)
			return ret;

		if (ret > 0) {
			fprintf(stderr, "%ld pages were not migrated\n", ret);
			for (size_t k = 0; k < max_k; ++k) {
				if (status[k] < 0)
					fprintf(stderr,
						"Error while moving page %p to numa node %d: %d\n",
						pages[k], nodes[k], -status[k]);
			}
		}
	}
	return 0;
}
#else
int rseq_mempool_range_init_numa(void *addr __attribute__((unused)),
		size_t len __attribute__((unused)),
		int cpu __attribute__((unused)),
		int numa_flags __attribute__((unused)))
{
	errno = ENOSYS;
	return -1;
}
#endif

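/*
 * Example (sketch): wiring the NUMA helper above as a pool init
 * callback so each per-CPU range is migrated to the NUMA node of its
 * CPU. This is illustrative only: the callback name is made up, and
 * MPOL_MF_MOVE is the usual move_pages(2) flag from <numaif.h>.
 *
 *	static int init_numa_cb(void *priv __attribute__((unused)),
 *			void *addr, size_t len, int cpu)
 *	{
 *		if (cpu < 0)	// Global pool range: nothing to do.
 *			return 0;
 *		return rseq_mempool_range_init_numa(addr, len, cpu, MPOL_MF_MOVE);
 *	}
 *
 *	// ... rseq_mempool_attr_set_init(attr, init_numa_cb, NULL);
 */
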
static
int create_alloc_bitmap(struct rseq_mempool *pool, struct rseq_mempool_range *range)
{
	size_t count;

	count = ((pool->attr.stride >> pool->item_order) + BIT_PER_ULONG - 1) / BIT_PER_ULONG;

	/*
	 * Not being able to create the validation bitmap is an error
	 * that needs to be reported.
	 */
	range->alloc_bitmap = calloc(count, sizeof(unsigned long));
	if (!range->alloc_bitmap)
		return -1;
	return 0;
}

static
bool percpu_addr_in_pool(const struct rseq_mempool *pool, void __rseq_percpu *_addr)
{
	struct rseq_mempool_range *range;
	void *addr = (void *) _addr;

	for (range = pool->range_list; range; range = range->next) {
		if (addr >= range->base && addr < range->base + range->next_unused)
			return true;
	}
	return false;
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void check_free_list(const struct rseq_mempool *pool)
{
	size_t total_item = 0, total_never_allocated = 0, total_freed = 0,
		max_list_traversal = 0, traversal_iteration = 0;
	struct rseq_mempool_range *range;

	if (!pool->attr.robust_set)
		return;

	for (range = pool->range_list; range; range = range->next) {
		total_item += pool->attr.stride >> pool->item_order;
		total_never_allocated += (pool->attr.stride - range->next_unused) >> pool->item_order;
	}
	max_list_traversal = total_item - total_never_allocated;

	for (struct free_list_node *node = pool->free_list_head, *prev = NULL;
			node;
			prev = node,
			node = node->next) {

		if (traversal_iteration >= max_list_traversal) {
			fprintf(stderr, "%s: Corrupted free-list; Possibly infinite loop in pool \"%s\" (%p), caller %p.\n",
				__func__, get_pool_name(pool), pool, __builtin_return_address(0));
			abort();
		}

		/* Node is out of range. */
		if (!percpu_addr_in_pool(pool, __rseq_free_list_to_percpu_ptr(pool, node))) {
			if (prev)
				fprintf(stderr, "%s: Corrupted free-list node %p -> [out-of-range %p] in pool \"%s\" (%p), caller %p.\n",
					__func__, prev, node, get_pool_name(pool), pool, __builtin_return_address(0));
			else
				fprintf(stderr, "%s: Corrupted free-list node [out-of-range %p] in pool \"%s\" (%p), caller %p.\n",
					__func__, node, get_pool_name(pool), pool, __builtin_return_address(0));
			abort();
		}

		traversal_iteration++;
		total_freed++;
	}

	if (total_never_allocated + total_freed != total_item) {
		fprintf(stderr, "%s: Corrupted free-list in pool \"%s\" (%p); total-item: %zu total-never-used: %zu total-freed: %zu, caller %p.\n",
			__func__, get_pool_name(pool), pool, total_item, total_never_allocated, total_freed, __builtin_return_address(0));
		abort();
	}
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void check_range_poison(const struct rseq_mempool *pool,
		const struct rseq_mempool_range *range)
{
	size_t item_offset;

	for (item_offset = 0; item_offset < range->next_unused;
			item_offset += pool->item_len)
		rseq_percpu_check_poison_item(pool, range, item_offset);
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void check_pool_poison(const struct rseq_mempool *pool)
{
	struct rseq_mempool_range *range;

	if (!pool->attr.robust_set)
		return;
	for (range = pool->range_list; range; range = range->next)
		check_range_poison(pool, range);
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void destroy_alloc_bitmap(struct rseq_mempool *pool, struct rseq_mempool_range *range)
{
	unsigned long *bitmap = range->alloc_bitmap;
	size_t count, total_leaks = 0;

	if (!bitmap)
		return;

	count = ((pool->attr.stride >> pool->item_order) + BIT_PER_ULONG - 1) / BIT_PER_ULONG;

	/* Assert that all items in the pool were freed. */
	for (size_t k = 0; k < count; ++k)
		total_leaks += rseq_hweight_ulong(bitmap[k]);
	if (total_leaks) {
		fprintf(stderr, "%s: Pool \"%s\" (%p) has %zu leaked items on destroy, caller: %p.\n",
			__func__, get_pool_name(pool), pool, total_leaks, (void *) __builtin_return_address(0));
		abort();
	}

	free(bitmap);
	range->alloc_bitmap = NULL;
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
int rseq_mempool_range_destroy(struct rseq_mempool *pool,
		struct rseq_mempool_range *range)
{
	int ret = 0;

	destroy_alloc_bitmap(pool, range);

	/*
	 * Punch a hole into memfd where the init values used to be.
	 */
	if (range->init) {
		ret = fallocate(memfd.fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			ptr_to_off_t(range->init), pool->attr.stride);
		if (ret)
			return ret;
		range->init = NULL;
	}

	/* range is a header located one page before the aligned mapping. */
	return munmap(range->mmap_addr, range->mmap_len);
}

/*
 * Allocate a memory mapping aligned on @alignment, with an optional
 * @pre_header before the mapping.
 */
static
void *aligned_mmap_anonymous(size_t page_size, size_t len, size_t alignment,
		void **pre_header, size_t pre_header_len)
{
	size_t minimum_page_count, page_count, extra, total_allocate = 0;
	int page_order;
	void *ptr;

	if (len < page_size || alignment < page_size ||
			!is_pow2(alignment) || (len & (alignment - 1))) {
		errno = EINVAL;
		return NULL;
	}
	page_order = rseq_get_count_order_ulong(page_size);
	if (page_order < 0) {
		errno = EINVAL;
		return NULL;
	}
	if (pre_header_len && (pre_header_len & (page_size - 1))) {
		errno = EINVAL;
		return NULL;
	}

	minimum_page_count = (pre_header_len + len) >> page_order;
	page_count = (pre_header_len + len + alignment - page_size) >> page_order;

	assert(page_count >= minimum_page_count);

	ptr = mmap(NULL, page_count << page_order, PROT_READ | PROT_WRITE,
			MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	if (ptr == MAP_FAILED) {
		ptr = NULL;
		goto alloc_error;
	}

	total_allocate = page_count << page_order;

	if (!(((uintptr_t) ptr + pre_header_len) & (alignment - 1))) {
		/* Pointer is already aligned. ptr points to pre_header. */
		goto out;
	}

	/* Unmap extra before. */
	extra = offset_align((uintptr_t) ptr + pre_header_len, alignment);
	assert(!(extra & (page_size - 1)));
	if (munmap(ptr, extra)) {
		perror("munmap");
		abort();
	}
	total_allocate -= extra;
	ptr += extra;	/* ptr points to pre_header */
	page_count -= extra >> page_order;
out:
	assert(page_count >= minimum_page_count);

	if (page_count > minimum_page_count) {
		void *extra_ptr;

		/* Unmap extra after. */
		extra_ptr = ptr + (minimum_page_count << page_order);
		extra = (page_count - minimum_page_count) << page_order;
		if (munmap(extra_ptr, extra)) {
			perror("munmap");
			abort();
		}
		total_allocate -= extra;
	}

	assert(!(((uintptr_t) ptr + pre_header_len) & (alignment - 1)));
	assert(total_allocate == len + pre_header_len);

alloc_error:
	if (ptr) {
		if (pre_header)
			*pre_header = ptr;
		ptr += pre_header_len;
	}
	return ptr;
}

static
int rseq_memfd_reserve_init(void *init, size_t init_len)
{
	int ret = 0;
	size_t reserve_len;

	pthread_mutex_lock(&memfd.lock);
	reserve_len = (size_t) ptr_to_off_t(init) + init_len;
	if (reserve_len > memfd.reserved_size) {
		if (ftruncate(memfd.fd, (off_t) reserve_len)) {
			ret = -1;
			goto unlock;
		}
		memfd.reserved_size = reserve_len;
	}
unlock:
	pthread_mutex_unlock(&memfd.lock);
	return ret;
}

static
struct rseq_mempool_range *rseq_mempool_range_create(struct rseq_mempool *pool)
{
	struct rseq_mempool_range *range;
	unsigned long page_size;
	void *header;
	void *base;
	size_t range_len;	/* Range len excludes header. */

	if (pool->attr.max_nr_ranges &&
			pool->nr_ranges >= pool->attr.max_nr_ranges) {
		errno = ENOMEM;
		return NULL;
	}
	page_size = rseq_get_page_len();

	range_len = pool->attr.stride * pool->attr.max_nr_cpus;
	if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_PRIVATE_ALL)
		range_len += pool->attr.stride;	/* init values */
	if (pool->attr.robust_set)
		range_len += pool->attr.stride;	/* free list */
	base = aligned_mmap_anonymous(page_size, range_len,
			pool->attr.stride, &header, page_size);
	if (!base)
		return NULL;
	range = (struct rseq_mempool_range *) (base - RANGE_HEADER_OFFSET);
	range->pool = pool;
	range->header = header;
	range->base = base;
	range->mmap_addr = header;
	range->mmap_len = page_size + range_len;

	if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_PRIVATE_ALL) {
		range->init = base + (pool->attr.stride * pool->attr.max_nr_cpus);
		/* Populate init values pages from memfd */
		if (rseq_memfd_reserve_init(range->init, pool->attr.stride))
			goto error_alloc;
		if (mmap(range->init, pool->attr.stride, PROT_READ | PROT_WRITE,
				MAP_SHARED | MAP_FIXED, memfd.fd,
				ptr_to_off_t(range->init)) != (void *) range->init) {
			goto error_alloc;
		}
		assert(pool->attr.type == MEMPOOL_TYPE_PERCPU);
		/*
		 * Map per-cpu memory as private COW mappings of init values.
		 */
		{
			int cpu;

			for (cpu = 0; cpu < pool->attr.max_nr_cpus; cpu++) {
				void *p = base + (pool->attr.stride * cpu);
				size_t len = pool->attr.stride;

				if (mmap(p, len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FIXED,
						memfd.fd, ptr_to_off_t(range->init)) != (void *) p) {
					goto error_alloc;
				}
			}
		}
	}

	if (pool->attr.robust_set) {
		if (create_alloc_bitmap(pool, range))
			goto error_alloc;
	}
	if (pool->attr.init_set) {
		switch (pool->attr.type) {
		case MEMPOOL_TYPE_GLOBAL:
			if (pool->attr.init_func(pool->attr.init_priv,
					base, pool->attr.stride, -1)) {
				goto error_alloc;
			}
			break;
		case MEMPOOL_TYPE_PERCPU:
		{
			int cpu;
			for (cpu = 0; cpu < pool->attr.max_nr_cpus; cpu++) {
				if (pool->attr.init_func(pool->attr.init_priv,
						base + (pool->attr.stride * cpu),
						pool->attr.stride, cpu)) {
					goto error_alloc;
				}
			}
			break;
		}
		default:
			abort();
		}
	}
	pool->nr_ranges++;
	return range;

error_alloc:
	(void) rseq_mempool_range_destroy(pool, range);
	return NULL;
}

static
int rseq_mempool_memfd_ref(struct rseq_mempool *pool)
{
	int ret = 0;

	if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_PRIVATE_ALL)
		return 0;

	pthread_mutex_lock(&memfd.lock);
	if (memfd.refcount == 0) {
		memfd.fd = memfd_create("mempool", MFD_CLOEXEC);
		if (memfd.fd < 0) {
			perror("memfd_create");
			ret = -1;
			goto unlock;
		}
	}
	memfd.refcount++;
unlock:
	pthread_mutex_unlock(&memfd.lock);
	return ret;
}

static
void rseq_mempool_memfd_unref(struct rseq_mempool *pool)
{
	if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_PRIVATE_ALL)
		return;

	pthread_mutex_lock(&memfd.lock);
	if (memfd.refcount == 1) {
		if (close(memfd.fd)) {
			perror("close");
			abort();
		}
		memfd.fd = -1;
		memfd.reserved_size = 0;
	}
	memfd.refcount--;
	pthread_mutex_unlock(&memfd.lock);
}

int rseq_mempool_destroy(struct rseq_mempool *pool)
{
	struct rseq_mempool_range *range, *next_range;
	int ret = 0;

	if (!pool)
		return 0;
	check_free_list(pool);
	check_pool_poison(pool);
	/* Iteration safe against removal. */
	for (range = pool->range_list; range && (next_range = range->next, 1); range = next_range) {
		if (rseq_mempool_range_destroy(pool, range))
			goto end;
		/* Update list head to keep list coherent in case of partial failure. */
		pool->range_list = next_range;
	}
	rseq_mempool_memfd_unref(pool);
	pthread_mutex_destroy(&pool->lock);
	free(pool->name);
	free(pool);
end:
	return ret;
}

struct rseq_mempool *rseq_mempool_create(const char *pool_name,
		size_t item_len, const struct rseq_mempool_attr *_attr)
{
	struct rseq_mempool *pool;
	struct rseq_mempool_attr attr = {};
	int order;

	/* Make sure each item is large enough to contain free list pointers. */
	if (item_len < sizeof(void *))
		item_len = sizeof(void *);

	/* Align item_len on next power of two. */
	order = rseq_get_count_order_ulong(item_len);
	if (order < 0) {
		errno = EINVAL;
		return NULL;
	}
	item_len = 1UL << order;

	if (_attr)
		memcpy(&attr, _attr, sizeof(attr));

	switch (attr.type) {
	case MEMPOOL_TYPE_PERCPU:
		if (attr.max_nr_cpus < 0) {
			errno = EINVAL;
			return NULL;
		}
		if (attr.max_nr_cpus == 0) {
			/* Auto-detect */
			attr.max_nr_cpus = rseq_get_max_nr_cpus();
			if (attr.max_nr_cpus == 0) {
				errno = EINVAL;
				return NULL;
			}
		}
		break;
	case MEMPOOL_TYPE_GLOBAL:
		/* Override populate policy for global type. */
		if (attr.populate_policy == RSEQ_MEMPOOL_POPULATE_PRIVATE_NONE)
			attr.populate_policy = RSEQ_MEMPOOL_POPULATE_PRIVATE_ALL;
		/* Use a 1-cpu pool for global mempool type. */
		attr.max_nr_cpus = 1;
		break;
	}
	if (!attr.stride)
		attr.stride = RSEQ_MEMPOOL_STRIDE;	/* Use default */
	if (attr.robust_set && !attr.poison_set) {
		attr.poison_set = true;
		attr.poison = DEFAULT_POISON_VALUE;
	}
	if (item_len > attr.stride || attr.stride < (size_t) rseq_get_page_len() ||
			!is_pow2(attr.stride)) {
		errno = EINVAL;
		return NULL;
	}

	pool = calloc(1, sizeof(struct rseq_mempool));
	if (!pool)
		return NULL;

	memcpy(&pool->attr, &attr, sizeof(attr));
	pthread_mutex_init(&pool->lock, NULL);
	pool->item_len = item_len;
	pool->item_order = order;

	if (rseq_mempool_memfd_ref(pool))
		goto error_alloc;

	pool->range_list = rseq_mempool_range_create(pool);
	if (!pool->range_list)
		goto error_alloc;

	if (pool_name) {
		pool->name = strdup(pool_name);
		if (!pool->name)
			goto error_alloc;
	}
	return pool;

error_alloc:
	rseq_mempool_destroy(pool);
	errno = ENOMEM;
	return NULL;
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void set_alloc_slot(struct rseq_mempool *pool, struct rseq_mempool_range *range, size_t item_offset)
{
	unsigned long *bitmap = range->alloc_bitmap;
	size_t item_index = item_offset >> pool->item_order;
	unsigned long mask;
	size_t k;

	if (!bitmap)
		return;

	k = item_index / BIT_PER_ULONG;
	mask = 1ULL << (item_index % BIT_PER_ULONG);

	/* Print error if bit is already set. */
	if (bitmap[k] & mask) {
		fprintf(stderr, "%s: Allocator corruption detected for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
			__func__, get_pool_name(pool), pool, item_offset, (void *) __builtin_return_address(0));
		abort();
	}
	bitmap[k] |= mask;
}

static
void __rseq_percpu *__rseq_percpu_malloc(struct rseq_mempool *pool,
		bool zeroed, void *init_ptr, size_t init_len)
{
	struct rseq_mempool_range *range;
	struct free_list_node *node;
	uintptr_t item_offset;
	void __rseq_percpu *addr;

	if (init_len > pool->item_len) {
		errno = EINVAL;
		return NULL;
	}
	pthread_mutex_lock(&pool->lock);
	/* Get first entry from free list. */
	node = pool->free_list_head;
	if (node != NULL) {
		void *range_base, *ptr;

		ptr = __rseq_free_list_to_percpu_ptr(pool, node);
		range_base = (void *) ((uintptr_t) ptr & (~(pool->attr.stride - 1)));
		range = (struct rseq_mempool_range *) (range_base - RANGE_HEADER_OFFSET);
		/* Remove node from free list (update head). */
		pool->free_list_head = node->next;
		item_offset = (uintptr_t) (ptr - range_base);
		rseq_percpu_check_poison_item(pool, range, item_offset);
		addr = __rseq_free_list_to_percpu_ptr(pool, node);
		goto end;
	}
	/*
	 * If the most recent range (first in list) does not have any
	 * room left, create a new range and prepend it to the list
	 * head.
	 */
	range = pool->range_list;
	if (range->next_unused + pool->item_len > pool->attr.stride) {
		range = rseq_mempool_range_create(pool);
		if (!range) {
			errno = ENOMEM;
			addr = NULL;
			goto end;
		}
		/* Add range to head of list. */
		range->next = pool->range_list;
		pool->range_list = range;
	}
	/* First range in list has room left. */
	item_offset = range->next_unused;
	addr = (void __rseq_percpu *) (range->base + item_offset);
	range->next_unused += pool->item_len;
end:
	if (addr)
		set_alloc_slot(pool, range, item_offset);
	pthread_mutex_unlock(&pool->lock);
	if (addr) {
		if (zeroed)
			rseq_percpu_zero_item(pool, range, item_offset);
		else if (init_ptr) {
			rseq_percpu_init_item(pool, range, item_offset,
					init_ptr, init_len);
		}
	}
	return addr;
}

void __rseq_percpu *rseq_mempool_percpu_malloc(struct rseq_mempool *pool)
{
	return __rseq_percpu_malloc(pool, false, NULL, 0);
}

void __rseq_percpu *rseq_mempool_percpu_zmalloc(struct rseq_mempool *pool)
{
	return __rseq_percpu_malloc(pool, true, NULL, 0);
}

void __rseq_percpu *rseq_mempool_percpu_malloc_init(struct rseq_mempool *pool,
		void *init_ptr, size_t len)
{
	return __rseq_percpu_malloc(pool, false, init_ptr, len);
}

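/*
 * Example (sketch): rseq_mempool_percpu_malloc_init() copies an
 * initial value into every CPU's instance of the item (and into the
 * init values area for populate-none pools). The structure, variable
 * and pool names below are illustrative only:
 *
 *	struct config { intptr_t threshold; } tmpl = { .threshold = 128 };
 *	struct config __rseq_percpu *cfg;
 *
 *	cfg = (struct config __rseq_percpu *)
 *		rseq_mempool_percpu_malloc_init(pool, &tmpl, sizeof(tmpl));
 *	// Each CPU starts from the same template. For populate-none
 *	// pools, later per-CPU writes diverge through the private COW
 *	// mappings of the init values.
 */
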
/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void clear_alloc_slot(struct rseq_mempool *pool, struct rseq_mempool_range *range, size_t item_offset)
{
	unsigned long *bitmap = range->alloc_bitmap;
	size_t item_index = item_offset >> pool->item_order;
	unsigned long mask;
	size_t k;

	if (!bitmap)
		return;

	k = item_index / BIT_PER_ULONG;
	mask = 1ULL << (item_index % BIT_PER_ULONG);

	/* Print error if bit is not set. */
	if (!(bitmap[k] & mask)) {
		fprintf(stderr, "%s: Double-free detected for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
			__func__, get_pool_name(pool), pool, item_offset,
			(void *) __builtin_return_address(0));
		abort();
	}
	bitmap[k] &= ~mask;
}

void librseq_mempool_percpu_free(void __rseq_percpu *_ptr, size_t stride)
{
	uintptr_t ptr = (uintptr_t) _ptr;
	void *range_base = (void *) (ptr & (~(stride - 1)));
	struct rseq_mempool_range *range = (struct rseq_mempool_range *) (range_base - RANGE_HEADER_OFFSET);
	struct rseq_mempool *pool = range->pool;
	uintptr_t item_offset = ptr & (stride - 1);
	struct free_list_node *head, *item;

	pthread_mutex_lock(&pool->lock);
	clear_alloc_slot(pool, range, item_offset);
	/* Add ptr to head of free list */
	head = pool->free_list_head;
	if (pool->attr.poison_set)
		rseq_percpu_poison_item(pool, range, item_offset);
	item = __rseq_percpu_to_free_list_ptr(pool, _ptr);
	/*
	 * Setting the next pointer will overwrite the first uintptr_t
	 * of poison for either CPU 0 (populate all) or the init data
	 * (populate none).
	 */
	item->next = head;
	pool->free_list_head = item;
	pthread_mutex_unlock(&pool->lock);
}

struct rseq_mempool_set *rseq_mempool_set_create(void)
{
	struct rseq_mempool_set *pool_set;

	pool_set = calloc(1, sizeof(struct rseq_mempool_set));
	if (!pool_set)
		return NULL;
	pthread_mutex_init(&pool_set->lock, NULL);
	return pool_set;
}

int rseq_mempool_set_destroy(struct rseq_mempool_set *pool_set)
{
	int order, ret;

	for (order = POOL_SET_MIN_ENTRY; order < POOL_SET_NR_ENTRIES; order++) {
		struct rseq_mempool *pool = pool_set->entries[order];

		if (!pool)
			continue;
		ret = rseq_mempool_destroy(pool);
		if (ret)
			return ret;
		pool_set->entries[order] = NULL;
	}
	pthread_mutex_destroy(&pool_set->lock);
	free(pool_set);
	return 0;
}

/* Ownership of pool is handed over to pool set on success. */
int rseq_mempool_set_add_pool(struct rseq_mempool_set *pool_set, struct rseq_mempool *pool)
{
	size_t item_order = pool->item_order;
	int ret = 0;

	pthread_mutex_lock(&pool_set->lock);
	if (pool_set->entries[item_order]) {
		errno = EBUSY;
		ret = -1;
		goto end;
	}
	pool_set->entries[pool->item_order] = pool;
end:
	pthread_mutex_unlock(&pool_set->lock);
	return ret;
}
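
/*
 * Example (sketch): a pool set serving variable-sized allocations by
 * chaining pools of increasing item sizes. Pool names and sizes are
 * illustrative and error handling is omitted. With a NULL attribute
 * the pools are global-type; pass a per-CPU attribute for CPU-local
 * pools.
 *
 *	struct rseq_mempool_set *set = rseq_mempool_set_create();
 *	size_t sz;
 *
 *	for (sz = 8; sz <= 1024; sz <<= 1)
 *		rseq_mempool_set_add_pool(set,
 *			rseq_mempool_create("set-pool", sz, NULL));
 *
 *	// rseq_mempool_set_percpu_zmalloc(set, 100) picks the 128-byte
 *	// pool (smallest entry with item_len >= 100).
 */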

static
void __rseq_percpu *__rseq_mempool_set_malloc(struct rseq_mempool_set *pool_set,
		void *init_ptr, size_t len, bool zeroed)
{
	int order, min_order = POOL_SET_MIN_ENTRY;
	struct rseq_mempool *pool;
	void __rseq_percpu *addr;

	order = rseq_get_count_order_ulong(len);
	if (order > POOL_SET_MIN_ENTRY)
		min_order = order;
again:
	pthread_mutex_lock(&pool_set->lock);
	/* First smallest present pool where @len fits. */
	for (order = min_order; order < POOL_SET_NR_ENTRIES; order++) {
		pool = pool_set->entries[order];

		if (!pool)
			continue;
		if (pool->item_len >= len)
			goto found;
	}
	pool = NULL;
found:
	pthread_mutex_unlock(&pool_set->lock);
	if (pool) {
		addr = __rseq_percpu_malloc(pool, zeroed, init_ptr, len);
		if (addr == NULL && errno == ENOMEM) {
			/*
			 * If the allocation failed, try again with a
			 * larger pool.
			 */
			min_order = order + 1;
			goto again;
		}
	} else {
		/* Not found. */
		errno = ENOMEM;
		addr = NULL;
	}
	return addr;
}

void __rseq_percpu *rseq_mempool_set_percpu_malloc(struct rseq_mempool_set *pool_set, size_t len)
{
	return __rseq_mempool_set_malloc(pool_set, NULL, len, false);
}

void __rseq_percpu *rseq_mempool_set_percpu_zmalloc(struct rseq_mempool_set *pool_set, size_t len)
{
	return __rseq_mempool_set_malloc(pool_set, NULL, len, true);
}

void __rseq_percpu *rseq_mempool_set_percpu_malloc_init(struct rseq_mempool_set *pool_set,
		void *init_ptr, size_t len)
{
	/* Pass zeroed=false so the initial value is actually copied. */
	return __rseq_mempool_set_malloc(pool_set, init_ptr, len, false);
}

struct rseq_mempool_attr *rseq_mempool_attr_create(void)
{
	return calloc(1, sizeof(struct rseq_mempool_attr));
}

void rseq_mempool_attr_destroy(struct rseq_mempool_attr *attr)
{
	free(attr);
}

int rseq_mempool_attr_set_init(struct rseq_mempool_attr *attr,
		int (*init_func)(void *priv, void *addr, size_t len, int cpu),
		void *init_priv)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->init_set = true;
	attr->init_func = init_func;
	attr->init_priv = init_priv;
	return 0;
}

int rseq_mempool_attr_set_robust(struct rseq_mempool_attr *attr)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->robust_set = true;
	return 0;
}

int rseq_mempool_attr_set_percpu(struct rseq_mempool_attr *attr,
		size_t stride, int max_nr_cpus)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->type = MEMPOOL_TYPE_PERCPU;
	attr->stride = stride;
	attr->max_nr_cpus = max_nr_cpus;
	return 0;
}

int rseq_mempool_attr_set_global(struct rseq_mempool_attr *attr,
		size_t stride)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->type = MEMPOOL_TYPE_GLOBAL;
	attr->stride = stride;
	attr->max_nr_cpus = 0;
	return 0;
}

int rseq_mempool_attr_set_max_nr_ranges(struct rseq_mempool_attr *attr,
		unsigned long max_nr_ranges)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->max_nr_ranges = max_nr_ranges;
	return 0;
}

int rseq_mempool_attr_set_poison(struct rseq_mempool_attr *attr,
		uintptr_t poison)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->poison_set = true;
	attr->poison = poison;
	return 0;
}

int rseq_mempool_attr_set_populate_policy(struct rseq_mempool_attr *attr,
		enum rseq_mempool_populate_policy policy)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->populate_policy = policy;
	return 0;
}

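/*
 * Example (sketch): requesting the "private populate none" policy so
 * per-CPU pages are only materialized (through COW of the shared init
 * values) when a CPU actually writes to them. The enum values are
 * declared in <rseq/mempool.h>; the pool name, item type and error
 * handling below are illustrative only:
 *
 *	struct rseq_mempool_attr *attr = rseq_mempool_attr_create();
 *	struct rseq_mempool *pool;
 *
 *	rseq_mempool_attr_set_percpu(attr, RSEQ_MEMPOOL_STRIDE, 0);
 *	rseq_mempool_attr_set_populate_policy(attr,
 *		RSEQ_MEMPOOL_POPULATE_PRIVATE_NONE);
 *	pool = rseq_mempool_create("lazy-pool", sizeof(struct item), attr);
 *	rseq_mempool_attr_destroy(attr);
 */
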
int rseq_mempool_get_max_nr_cpus(struct rseq_mempool *mempool)
{
	if (!mempool || mempool->attr.type != MEMPOOL_TYPE_PERCPU) {
		errno = EINVAL;
		return -1;
	}
	return mempool->attr.max_nr_cpus;
}