// SPDX-License-Identifier: MIT
// SPDX-FileCopyrightText: 2024 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>

#include <rseq/mempool.h>
#include <sys/mman.h>
#include <assert.h>
#include <string.h>
#include <pthread.h>
#include <unistd.h>
#include <stdlib.h>
#include <rseq/compiler.h>
#include <errno.h>
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

#ifdef HAVE_LIBNUMA
# include <numa.h>
# include <numaif.h>
#endif

#include "rseq-utils.h"
#include "smp.h"

/*
 * rseq-mempool.c: rseq CPU-Local Storage (CLS) memory allocator.
 *
 * The rseq per-CPU memory allocator allows the application to request
 * memory pools of CPU-Local memory, each containing objects of a
 * given size (rounded up to the next power of 2), reserving a given
 * virtual address size per CPU, for a given maximum number of CPUs.
 *
 * The per-CPU memory allocator is analogous to TLS (Thread-Local
 * Storage) memory: TLS is Thread-Local Storage, whereas the per-CPU
 * memory allocator provides CPU-Local Storage.
 */

#define POOL_SET_NR_ENTRIES	RSEQ_BITS_PER_LONG

/*
 * Smallest allocation should hold enough space for a free list pointer.
 */
#if RSEQ_BITS_PER_LONG == 64
# define POOL_SET_MIN_ENTRY	3	/* Smallest item_len=8 */
#else
# define POOL_SET_MIN_ENTRY	2	/* Smallest item_len=4 */
#endif

/*
 * Skip pool index 0 to ensure allocated entries at index 0 do not match
 * a NULL pointer.
 */
#define FIRST_POOL		1

#define BIT_PER_ULONG		(8 * sizeof(unsigned long))

#define MOVE_PAGES_BATCH_SIZE	4096

#define RANGE_HEADER_OFFSET	sizeof(struct rseq_mempool_range)

struct free_list_node;

struct free_list_node {
	struct free_list_node *next;
};

enum mempool_type {
	MEMPOOL_TYPE_GLOBAL = 0,	/* Default */
	MEMPOOL_TYPE_PERCPU = 1,
};

struct rseq_mempool_attr {
	bool mmap_set;
	void *(*mmap_func)(void *priv, size_t len);
	int (*munmap_func)(void *priv, void *ptr, size_t len);
	void *mmap_priv;

	bool init_set;
	void (*init_func)(void *priv, void *addr, size_t len, int cpu);
	void *init_priv;

	bool robust_set;

	enum mempool_type type;
	size_t stride;
	int max_nr_cpus;
};

struct rseq_mempool_range;

struct rseq_mempool_range {
	struct rseq_mempool_range *next;
	struct rseq_mempool *pool;	/* Backward ref. to container pool. */
	void *header;
	void *base;
	size_t next_unused;
	/* Track alloc/free. */
	unsigned long *alloc_bitmap;
};

struct rseq_mempool {
	/* Linked-list of ranges. */
	struct rseq_mempool_range *ranges;

	size_t item_len;
	int item_order;

	/*
	 * The free list chains freed items on the CPU 0 address range.
	 * We should rethink this decision if false sharing between
	 * malloc/free from other CPUs and data accesses from CPU 0
	 * becomes an issue. This is a NULL-terminated singly-linked
	 * list.
	 */
	struct free_list_node *free_list_head;

	/* This lock protects allocation/free within the pool. */
	pthread_mutex_t lock;

	struct rseq_mempool_attr attr;
	char *name;
};

/*
 * Pool set entries are indexed by item_len rounded to the next power of
 * 2. A pool set can contain NULL pool entries, in which case the next
 * large enough entry will be used for allocation.
 */
struct rseq_mempool_set {
	/* This lock protects add vs malloc/zmalloc within the pool set. */
	pthread_mutex_t lock;
	struct rseq_mempool *entries[POOL_SET_NR_ENTRIES];
};

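/*
 * Worked example of pool set indexing: a pool created with item_len 32
 * (item_order 5) is stored in entries[5]. A request for len = 24 is
 * rounded up to the next power of 2 (order 5), so it is served from
 * entries[5]; if entries[5] is NULL, the scan continues with the next
 * populated entry (entries[6], ...) whose item_len fits the request.
 */
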
static
void *__rseq_pool_range_percpu_ptr(struct rseq_mempool_range *range, int cpu,
		uintptr_t item_offset, size_t stride)
{
	return range->base + (stride * cpu) + item_offset;
}

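/*
 * Example of the per-CPU address computation above (illustrative
 * numbers): with stride = 0x100000 (1 MiB), the item at offset 64 for
 * CPU 2 lives at:
 *
 *	range->base + (0x100000 * 2) + 64 = range->base + 0x200040
 *
 * Each CPU's copy of a given item thus sits at the same offset within
 * its own stride-sized range.
 */
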
static
void rseq_percpu_zero_item(struct rseq_mempool *pool,
		struct rseq_mempool_range *range, uintptr_t item_offset)
{
	int i;

	for (i = 0; i < pool->attr.max_nr_cpus; i++) {
		char *p = __rseq_pool_range_percpu_ptr(range, i,
				item_offset, pool->attr.stride);
		memset(p, 0, pool->item_len);
	}
}

#ifdef HAVE_LIBNUMA
int rseq_mempool_range_init_numa(void *addr, size_t len, int cpu, int numa_flags)
{
	unsigned long nr_pages, page_len;
	int status[MOVE_PAGES_BATCH_SIZE];
	int nodes[MOVE_PAGES_BATCH_SIZE];
	void *pages[MOVE_PAGES_BATCH_SIZE];
	long ret;

	if (!numa_flags) {
		errno = EINVAL;
		return -1;
	}
	page_len = rseq_get_page_len();
	nr_pages = len >> rseq_get_count_order_ulong(page_len);

	nodes[0] = numa_node_of_cpu(cpu);
	if (nodes[0] < 0)
		return -1;

	for (size_t k = 1; k < RSEQ_ARRAY_SIZE(nodes); ++k) {
		nodes[k] = nodes[0];
	}

	for (unsigned long page = 0; page < nr_pages;) {

		size_t max_k = RSEQ_ARRAY_SIZE(pages);
		size_t left = nr_pages - page;

		if (left < max_k) {
			max_k = left;
		}

		for (size_t k = 0; k < max_k; ++k, ++page) {
			pages[k] = addr + (page * page_len);
			status[k] = -EPERM;
		}

		ret = move_pages(0, max_k, pages, nodes, status, numa_flags);

		if (ret < 0)
			return ret;

		if (ret > 0) {
			fprintf(stderr, "%ld pages were not migrated\n", ret);
			for (size_t k = 0; k < max_k; ++k) {
				if (status[k] < 0)
					fprintf(stderr,
						"Error while moving page %p to numa node %d: %d\n",
						pages[k], nodes[k], -status[k]);
			}
		}
	}
	return 0;
}
#else
int rseq_mempool_range_init_numa(void *addr __attribute__((unused)),
		size_t len __attribute__((unused)),
		int cpu __attribute__((unused)),
		int numa_flags __attribute__((unused)))
{
	errno = ENOSYS;
	return -1;
}
#endif

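/*
 * Example use of rseq_mempool_range_init_numa() (a sketch; numa_flags
 * follows the move_pages(2) flags, e.g. MPOL_MF_MOVE, and init_numa()
 * is a hypothetical application callback): migrate the pages of each
 * CPU's range to the NUMA node backing that CPU, e.g. from an init
 * callback registered with rseq_mempool_attr_set_init() below.
 *
 *	static void init_numa(void *priv, void *addr, size_t len, int cpu)
 *	{
 *		if (rseq_mempool_range_init_numa(addr, len, cpu, MPOL_MF_MOVE))
 *			perror("rseq_mempool_range_init_numa");
 *	}
 */
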
static
void *default_mmap_func(void *priv __attribute__((unused)), size_t len)
{
	void *base;

	base = mmap(NULL, len, PROT_READ | PROT_WRITE,
			MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	if (base == MAP_FAILED)
		return NULL;
	return base;
}

static
int default_munmap_func(void *priv __attribute__((unused)), void *ptr, size_t len)
{
	return munmap(ptr, len);
}

static
int create_alloc_bitmap(struct rseq_mempool *pool, struct rseq_mempool_range *range)
{
	size_t count;

	count = ((pool->attr.stride >> pool->item_order) + BIT_PER_ULONG - 1) / BIT_PER_ULONG;

	/*
	 * Not being able to create the validation bitmap is an error
	 * that needs to be reported.
	 */
	range->alloc_bitmap = calloc(count, sizeof(unsigned long));
	if (!range->alloc_bitmap)
		return -1;
	return 0;
}

static
const char *get_pool_name(const struct rseq_mempool *pool)
{
	return pool->name ? : "<anonymous>";
}

static
bool addr_in_pool(const struct rseq_mempool *pool, void *addr)
{
	struct rseq_mempool_range *range;

	for (range = pool->ranges; range; range = range->next) {
		if (addr >= range->base && addr < range->base + range->next_unused)
			return true;
	}
	return false;
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void check_free_list(const struct rseq_mempool *pool)
{
	size_t total_item = 0, total_never_allocated = 0, total_freed = 0,
		max_list_traversal = 0, traversal_iteration = 0;
	struct rseq_mempool_range *range;

	if (!pool->attr.robust_set)
		return;

	for (range = pool->ranges; range; range = range->next) {
		total_item += pool->attr.stride >> pool->item_order;
		total_never_allocated += (pool->attr.stride - range->next_unused) >> pool->item_order;
	}
	max_list_traversal = total_item - total_never_allocated;

	for (struct free_list_node *node = pool->free_list_head, *prev = NULL;
	     node;
	     prev = node,
	     node = node->next) {

		void *node_addr = node;

		if (traversal_iteration >= max_list_traversal) {
			fprintf(stderr, "%s: Corrupted free-list; Possibly infinite loop in pool \"%s\" (%p), caller %p.\n",
				__func__, get_pool_name(pool), pool, __builtin_return_address(0));
			abort();
		}

		/* Node is out of range. */
		if (!addr_in_pool(pool, node_addr)) {
			if (prev)
				fprintf(stderr, "%s: Corrupted free-list node %p -> [out-of-range %p] in pool \"%s\" (%p), caller %p.\n",
					__func__, prev, node, get_pool_name(pool), pool, __builtin_return_address(0));
			else
				fprintf(stderr, "%s: Corrupted free-list node [out-of-range %p] in pool \"%s\" (%p), caller %p.\n",
					__func__, node, get_pool_name(pool), pool, __builtin_return_address(0));
			abort();
		}

		traversal_iteration++;
		total_freed++;
	}

	if (total_never_allocated + total_freed != total_item) {
		fprintf(stderr, "%s: Corrupted free-list in pool \"%s\" (%p); total-item: %zu total-never-used: %zu total-freed: %zu, caller %p.\n",
			__func__, get_pool_name(pool), pool, total_item, total_never_allocated, total_freed, __builtin_return_address(0));
		abort();
	}
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void destroy_alloc_bitmap(struct rseq_mempool *pool, struct rseq_mempool_range *range)
{
	unsigned long *bitmap = range->alloc_bitmap;
	size_t count, total_leaks = 0;

	if (!bitmap)
		return;

	count = ((pool->attr.stride >> pool->item_order) + BIT_PER_ULONG - 1) / BIT_PER_ULONG;

	/* Assert that all items in the pool were freed. */
	for (size_t k = 0; k < count; ++k)
		total_leaks += rseq_hweight_ulong(bitmap[k]);
	if (total_leaks) {
		fprintf(stderr, "%s: Pool \"%s\" (%p) has %zu leaked items on destroy, caller: %p.\n",
			__func__, get_pool_name(pool), pool, total_leaks, (void *) __builtin_return_address(0));
		abort();
	}

	free(bitmap);
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
int rseq_mempool_range_destroy(struct rseq_mempool *pool,
		struct rseq_mempool_range *range)
{
	destroy_alloc_bitmap(pool, range);
	/* range is a header located one page before the aligned mapping. */
	return pool->attr.munmap_func(pool->attr.mmap_priv, range->header,
			(pool->attr.stride * pool->attr.max_nr_cpus) + rseq_get_page_len());
}

/*
 * Allocate a memory mapping aligned on @alignment, with an optional
 * @pre_header before the mapping. Alignment is obtained by
 * over-allocating (@pre_header_len + @len + @alignment - page size),
 * then unmapping the extra pages before and after the aligned area.
 */
static
void *aligned_mmap_anonymous(struct rseq_mempool *pool,
		size_t page_size, size_t len, size_t alignment,
		void **pre_header, size_t pre_header_len)
{
	size_t minimum_page_count, page_count, extra, total_allocate = 0;
	int page_order;
	void *ptr;

	if (len < page_size || alignment < page_size ||
			!is_pow2(len) || !is_pow2(alignment)) {
		errno = EINVAL;
		return NULL;
	}
	page_order = rseq_get_count_order_ulong(page_size);
	if (page_order < 0) {
		errno = EINVAL;
		return NULL;
	}
	if (pre_header_len && (pre_header_len & (page_size - 1))) {
		errno = EINVAL;
		return NULL;
	}

	minimum_page_count = (pre_header_len + len) >> page_order;
	page_count = (pre_header_len + len + alignment - page_size) >> page_order;

	assert(page_count >= minimum_page_count);

	ptr = pool->attr.mmap_func(pool->attr.mmap_priv, page_count << page_order);
	if (!ptr)
		goto alloc_error;

	total_allocate = page_count << page_order;

	if (!(((uintptr_t) ptr + pre_header_len) & (alignment - 1))) {
		/* Pointer is already aligned. ptr points to pre_header. */
		goto out;
	}

	/* Unmap extra before. */
	extra = offset_align((uintptr_t) ptr + pre_header_len, alignment);
	assert(!(extra & (page_size - 1)));
	if (pool->attr.munmap_func(pool->attr.mmap_priv, ptr, extra)) {
		perror("munmap");
		abort();
	}
	total_allocate -= extra;
	ptr += extra;	/* ptr points to pre_header */
	page_count -= extra >> page_order;
out:
	assert(page_count >= minimum_page_count);

	if (page_count > minimum_page_count) {
		void *extra_ptr;

		/* Unmap extra after. */
		extra_ptr = ptr + (minimum_page_count << page_order);
		extra = (page_count - minimum_page_count) << page_order;
		if (pool->attr.munmap_func(pool->attr.mmap_priv, extra_ptr, extra)) {
			perror("munmap");
			abort();
		}
		total_allocate -= extra;
	}

	assert(!(((uintptr_t) ptr + pre_header_len) & (alignment - 1)));
	assert(total_allocate == len + pre_header_len);

alloc_error:
	if (ptr) {
		if (pre_header)
			*pre_header = ptr;
		ptr += pre_header_len;
	}
	return ptr;
}

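/*
 * Worked example for aligned_mmap_anonymous() above (illustrative
 * numbers): with page_size = 4 KiB, len = alignment = 1 MiB and a
 * one-page pre_header (pre_header_len = 4 KiB):
 *
 *	minimum_page_count = (4 KiB + 1 MiB) / 4 KiB                 = 257 pages
 *	page_count         = (4 KiB + 1 MiB + 1 MiB - 4 KiB) / 4 KiB = 512 pages
 *
 * 2 MiB are mapped, then the extra pages before and after the aligned
 * area are unmapped so that exactly 257 pages remain, with the address
 * immediately following the pre_header aligned on 1 MiB.
 */
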
static
struct rseq_mempool_range *rseq_mempool_range_create(struct rseq_mempool *pool)
{
	struct rseq_mempool_range *range;
	unsigned long page_size;
	void *header;
	void *base;

	page_size = rseq_get_page_len();

	base = aligned_mmap_anonymous(pool, page_size,
			pool->attr.stride * pool->attr.max_nr_cpus,
			pool->attr.stride,
			&header, page_size);
	if (!base)
		return NULL;
	range = (struct rseq_mempool_range *) (base - RANGE_HEADER_OFFSET);
	range->pool = pool;
	range->base = base;
	range->header = header;
	if (pool->attr.robust_set) {
		if (create_alloc_bitmap(pool, range))
			goto error_alloc;
	}
	if (pool->attr.init_set) {
		int cpu;

		for (cpu = 0; cpu < pool->attr.max_nr_cpus; cpu++) {
			pool->attr.init_func(pool->attr.init_priv,
				base + (pool->attr.stride * cpu),
				pool->attr.stride, cpu);
		}
	}
	return range;

error_alloc:
	(void) rseq_mempool_range_destroy(pool, range);
	return NULL;
}

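/*
 * Memory layout of a range created above: one header page (holding
 * struct rseq_mempool_range in its last RANGE_HEADER_OFFSET bytes)
 * immediately precedes the stride-aligned base, which is followed by
 * one stride-sized region per CPU:
 *
 *	header          base (aligned on stride)
 *	v               v
 *	+---------------+------------+------------+- - -+------------+
 *	|  ...  | range | CPU 0      | CPU 1      | ... | CPU N-1    |
 *	+---------------+------------+------------+- - -+------------+
 *	  header page     stride       stride            stride
 */
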
int rseq_mempool_destroy(struct rseq_mempool *pool)
{
	struct rseq_mempool_range *range, *next_range;
	int ret = 0;

	if (!pool)
		return 0;
	check_free_list(pool);
	/* Iteration safe against removal. */
	for (range = pool->ranges; range && (next_range = range->next, 1); range = next_range) {
		ret = rseq_mempool_range_destroy(pool, range);
		if (ret)
			goto end;
		/* Update list head to keep list coherent in case of partial failure. */
		pool->ranges = next_range;
	}
	pthread_mutex_destroy(&pool->lock);
	free(pool->name);
	memset(pool, 0, sizeof(*pool));
end:
	return ret;
}

struct rseq_mempool *rseq_mempool_create(const char *pool_name,
		size_t item_len, const struct rseq_mempool_attr *_attr)
{
	struct rseq_mempool *pool;
	struct rseq_mempool_attr attr = {};
	int order;

	/* Make sure each item is large enough to contain free list pointers. */
	if (item_len < sizeof(void *))
		item_len = sizeof(void *);

	/* Align item_len on next power of two. */
	order = rseq_get_count_order_ulong(item_len);
	if (order < 0) {
		errno = EINVAL;
		return NULL;
	}
	item_len = 1UL << order;

	if (_attr)
		memcpy(&attr, _attr, sizeof(attr));
	if (!attr.mmap_set) {
		attr.mmap_func = default_mmap_func;
		attr.munmap_func = default_munmap_func;
		attr.mmap_priv = NULL;
	}

	switch (attr.type) {
	case MEMPOOL_TYPE_PERCPU:
		if (attr.max_nr_cpus < 0) {
			errno = EINVAL;
			return NULL;
		}
		if (attr.max_nr_cpus == 0) {
			/* Auto-detect */
			attr.max_nr_cpus = get_possible_cpus_array_len();
			if (attr.max_nr_cpus == 0) {
				errno = EINVAL;
				return NULL;
			}
		}
		break;
	case MEMPOOL_TYPE_GLOBAL:
		/* Use a 1-cpu pool for global mempool type. */
		attr.max_nr_cpus = 1;
		break;
	}
	if (!attr.stride)
		attr.stride = RSEQ_MEMPOOL_STRIDE;	/* Use default */
	if (item_len > attr.stride || attr.stride < (size_t) rseq_get_page_len() ||
			!is_pow2(attr.stride)) {
		errno = EINVAL;
		return NULL;
	}

	pool = calloc(1, sizeof(struct rseq_mempool));
	if (!pool)
		return NULL;

	memcpy(&pool->attr, &attr, sizeof(attr));
	pthread_mutex_init(&pool->lock, NULL);
	pool->item_len = item_len;
	pool->item_order = order;

	//TODO: implement multi-range support.
	pool->ranges = rseq_mempool_range_create(pool);
	if (!pool->ranges)
		goto error_alloc;

	if (pool_name) {
		pool->name = strdup(pool_name);
		if (!pool->name)
			goto error_alloc;
	}
	return pool;

error_alloc:
	rseq_mempool_destroy(pool);
	errno = ENOMEM;
	return NULL;
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void set_alloc_slot(struct rseq_mempool *pool, size_t item_offset)
{
	unsigned long *bitmap = pool->ranges->alloc_bitmap;
	size_t item_index = item_offset >> pool->item_order;
	unsigned long mask;
	size_t k;

	if (!bitmap)
		return;

	k = item_index / BIT_PER_ULONG;
	mask = 1ULL << (item_index % BIT_PER_ULONG);

	/* Print error if bit is already set. */
	if (bitmap[k] & mask) {
		fprintf(stderr, "%s: Allocator corruption detected for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
			__func__, get_pool_name(pool), pool, item_offset, (void *) __builtin_return_address(0));
		abort();
	}
	bitmap[k] |= mask;
}

static
void __rseq_percpu *__rseq_percpu_malloc(struct rseq_mempool *pool, bool zeroed)
{
	struct free_list_node *node;
	uintptr_t item_offset;
	void __rseq_percpu *addr;

	pthread_mutex_lock(&pool->lock);
	/* Get first entry from free list. */
	node = pool->free_list_head;
	if (node != NULL) {
		/* Remove node from free list (update head). */
		pool->free_list_head = node->next;
		item_offset = (uintptr_t) ((void *) node - pool->ranges->base);
		addr = (void __rseq_percpu *) (pool->ranges->base + item_offset);
		goto end;
	}
	if (pool->ranges->next_unused + pool->item_len > pool->attr.stride) {
		errno = ENOMEM;
		addr = NULL;
		goto end;
	}
	item_offset = pool->ranges->next_unused;
	addr = (void __rseq_percpu *) (pool->ranges->base + item_offset);
	pool->ranges->next_unused += pool->item_len;
end:
	if (addr)
		set_alloc_slot(pool, item_offset);
	pthread_mutex_unlock(&pool->lock);
	if (zeroed && addr)
		rseq_percpu_zero_item(pool, pool->ranges, item_offset);
	return addr;
}

void __rseq_percpu *rseq_mempool_percpu_malloc(struct rseq_mempool *pool)
{
	return __rseq_percpu_malloc(pool, false);
}

void __rseq_percpu *rseq_mempool_percpu_zmalloc(struct rseq_mempool *pool)
{
	return __rseq_percpu_malloc(pool, true);
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void clear_alloc_slot(struct rseq_mempool *pool, size_t item_offset)
{
	unsigned long *bitmap = pool->ranges->alloc_bitmap;
	size_t item_index = item_offset >> pool->item_order;
	unsigned long mask;
	size_t k;

	if (!bitmap)
		return;

	k = item_index / BIT_PER_ULONG;
	mask = 1ULL << (item_index % BIT_PER_ULONG);

	/* Print error if bit is not set. */
	if (!(bitmap[k] & mask)) {
		fprintf(stderr, "%s: Double-free detected for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
			__func__, get_pool_name(pool), pool, item_offset,
			(void *) __builtin_return_address(0));
		abort();
	}
	bitmap[k] &= ~mask;
}

void librseq_mempool_percpu_free(void __rseq_percpu *_ptr, size_t stride)
{
	uintptr_t ptr = (uintptr_t) _ptr;
	/*
	 * The range base is aligned on @stride and items fit within a
	 * single stride, so masking out the low-order bits of the item
	 * address recovers the range base (see the worked example below).
	 */
	void *range_base = (void *) (ptr & (~(stride - 1)));
	struct rseq_mempool_range *range = (struct rseq_mempool_range *) (range_base - RANGE_HEADER_OFFSET);
	struct rseq_mempool *pool = range->pool;
	uintptr_t item_offset = ptr & (stride - 1);
	struct free_list_node *head, *item;

	pthread_mutex_lock(&pool->lock);
	clear_alloc_slot(pool, item_offset);
	/* Add ptr to head of free list. */
	head = pool->free_list_head;
	/* Free-list is in CPU 0 range. */
	item = (struct free_list_node *) ptr;
	item->next = head;
	pool->free_list_head = item;
	pthread_mutex_unlock(&pool->lock);
}

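/*
 * Worked example of the pointer decoding in
 * librseq_mempool_percpu_free() above (illustrative addresses): with
 * stride = 0x100000 and _ptr = 0x7f0000200040:
 *
 *	range_base  = 0x7f0000200040 & ~0xfffff = 0x7f0000200000
 *	item_offset = 0x7f0000200040 &  0xfffff = 0x40
 *
 * range_base is the stride-aligned CPU 0 base of the range, and the
 * struct rseq_mempool_range header is found RANGE_HEADER_OFFSET bytes
 * before it.
 */
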
struct rseq_mempool_set *rseq_mempool_set_create(void)
{
	struct rseq_mempool_set *pool_set;

	pool_set = calloc(1, sizeof(struct rseq_mempool_set));
	if (!pool_set)
		return NULL;
	pthread_mutex_init(&pool_set->lock, NULL);
	return pool_set;
}

int rseq_mempool_set_destroy(struct rseq_mempool_set *pool_set)
{
	int order, ret;

	for (order = POOL_SET_MIN_ENTRY; order < POOL_SET_NR_ENTRIES; order++) {
		struct rseq_mempool *pool = pool_set->entries[order];

		if (!pool)
			continue;
		ret = rseq_mempool_destroy(pool);
		if (ret)
			return ret;
		pool_set->entries[order] = NULL;
	}
	pthread_mutex_destroy(&pool_set->lock);
	free(pool_set);
	return 0;
}

/* Ownership of pool is handed over to pool set on success. */
int rseq_mempool_set_add_pool(struct rseq_mempool_set *pool_set, struct rseq_mempool *pool)
{
	size_t item_order = pool->item_order;
	int ret = 0;

	pthread_mutex_lock(&pool_set->lock);
	if (pool_set->entries[item_order]) {
		errno = EBUSY;
		ret = -1;
		goto end;
	}
	pool_set->entries[pool->item_order] = pool;
end:
	pthread_mutex_unlock(&pool_set->lock);
	return ret;
}

static
void __rseq_percpu *__rseq_mempool_set_malloc(struct rseq_mempool_set *pool_set, size_t len, bool zeroed)
{
	int order, min_order = POOL_SET_MIN_ENTRY;
	struct rseq_mempool *pool;
	void __rseq_percpu *addr;

	order = rseq_get_count_order_ulong(len);
	if (order > POOL_SET_MIN_ENTRY)
		min_order = order;
again:
	pthread_mutex_lock(&pool_set->lock);
	/* First smallest present pool where @len fits. */
	for (order = min_order; order < POOL_SET_NR_ENTRIES; order++) {
		pool = pool_set->entries[order];

		if (!pool)
			continue;
		if (pool->item_len >= len)
			goto found;
	}
	pool = NULL;
found:
	pthread_mutex_unlock(&pool_set->lock);
	if (pool) {
		addr = __rseq_percpu_malloc(pool, zeroed);
		if (addr == NULL && errno == ENOMEM) {
			/*
			 * If the allocation failed, try again with a
			 * larger pool.
			 */
			min_order = order + 1;
			goto again;
		}
	} else {
		/* Not found. */
		errno = ENOMEM;
		addr = NULL;
	}
	return addr;
}

void __rseq_percpu *rseq_mempool_set_percpu_malloc(struct rseq_mempool_set *pool_set, size_t len)
{
	return __rseq_mempool_set_malloc(pool_set, len, false);
}

void __rseq_percpu *rseq_mempool_set_percpu_zmalloc(struct rseq_mempool_set *pool_set, size_t len)
{
	return __rseq_mempool_set_malloc(pool_set, len, true);
}

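/*
 * Example pool set usage (an illustrative sketch): populate a set with
 * pools of increasing item sizes, then let the set route each
 * allocation to the smallest pool that fits it, e.g. a 24-byte request
 * is served by the 32-byte pool.
 *
 *	struct rseq_mempool_set *set;
 *	void __rseq_percpu *p;
 *	size_t len;
 *
 *	set = rseq_mempool_set_create();
 *	for (len = 8; len <= 128; len <<= 1) {
 *		struct rseq_mempool *pool = rseq_mempool_create(NULL, len, NULL);
 *
 *		if (!pool || rseq_mempool_set_add_pool(set, pool))
 *			abort();
 *	}
 *	p = rseq_mempool_set_percpu_zmalloc(set, 24);
 *	...
 *	rseq_mempool_set_destroy(set);
 */
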
struct rseq_mempool_attr *rseq_mempool_attr_create(void)
{
	return calloc(1, sizeof(struct rseq_mempool_attr));
}

void rseq_mempool_attr_destroy(struct rseq_mempool_attr *attr)
{
	free(attr);
}

int rseq_mempool_attr_set_mmap(struct rseq_mempool_attr *attr,
		void *(*mmap_func)(void *priv, size_t len),
		int (*munmap_func)(void *priv, void *ptr, size_t len),
		void *mmap_priv)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->mmap_set = true;
	attr->mmap_func = mmap_func;
	attr->munmap_func = munmap_func;
	attr->mmap_priv = mmap_priv;
	return 0;
}

int rseq_mempool_attr_set_init(struct rseq_mempool_attr *attr,
		void (*init_func)(void *priv, void *addr, size_t len, int cpu),
		void *init_priv)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->init_set = true;
	attr->init_func = init_func;
	attr->init_priv = init_priv;
	return 0;
}

int rseq_mempool_attr_set_robust(struct rseq_mempool_attr *attr)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->robust_set = true;
	return 0;
}

int rseq_mempool_attr_set_percpu(struct rseq_mempool_attr *attr,
		size_t stride, int max_nr_cpus)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->type = MEMPOOL_TYPE_PERCPU;
	attr->stride = stride;
	attr->max_nr_cpus = max_nr_cpus;
	return 0;
}

int rseq_mempool_attr_set_global(struct rseq_mempool_attr *attr,
		size_t stride)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->type = MEMPOOL_TYPE_GLOBAL;
	attr->stride = stride;
	attr->max_nr_cpus = 0;
	return 0;
}

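/*
 * Example attribute usage (a sketch): configure a robust per-CPU pool
 * with the default stride, auto-detected CPU count, and an init
 * callback invoked once per CPU range at pool creation. my_init() and
 * struct counter are hypothetical application code; the attr object
 * may be destroyed once the pool is created, since its content is
 * copied into the pool.
 *
 *	struct rseq_mempool_attr *attr;
 *	struct rseq_mempool *pool;
 *
 *	attr = rseq_mempool_attr_create();
 *	if (!attr)
 *		abort();
 *	if (rseq_mempool_attr_set_percpu(attr, 0, 0) ||
 *			rseq_mempool_attr_set_init(attr, my_init, NULL) ||
 *			rseq_mempool_attr_set_robust(attr))
 *		abort();
 *	pool = rseq_mempool_create("my-pool", sizeof(struct counter), attr);
 *	rseq_mempool_attr_destroy(attr);
 */
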
int rseq_mempool_get_max_nr_cpus(struct rseq_mempool *mempool)
{
	if (!mempool || mempool->attr.type != MEMPOOL_TYPE_PERCPU) {
		errno = EINVAL;
		return -1;
	}
	return mempool->attr.max_nr_cpus;
}