Commit | Line | Data |
---|---|---|
ef6695f1 MD |
1 | // SPDX-License-Identifier: MIT |
2 | // SPDX-FileCopyrightText: 2024 Mathieu Desnoyers <mathieu.desnoyers@efficios.com> | |
3 | ||
34337fec | 4 | #include <rseq/mempool.h> |
ef6695f1 MD |
5 | #include <sys/mman.h> |
6 | #include <assert.h> | |
7 | #include <string.h> | |
8 | #include <pthread.h> | |
9 | #include <unistd.h> | |
10 | #include <stdlib.h> | |
11 | #include <rseq/compiler.h> | |
12 | #include <errno.h> | |
13 | #include <stdint.h> | |
14 | #include <stdbool.h> | |
367e559c | 15 | #include <stdio.h> |
a5694a4d | 16 | #include <fcntl.h> |
367e559c MD |
17 | |
18 | #ifdef HAVE_LIBNUMA | |
19 | # include <numa.h> | |
20 | # include <numaif.h> | |
21 | #endif | |
ef6695f1 | 22 | |
34337fec | 23 | #include "rseq-utils.h" |
47c725dd | 24 | #include <rseq/rseq.h> |
19be9217 | 25 | |
ef6695f1 | 26 | /* |
b73b0c25 | 27 | * rseq-mempool.c: rseq CPU-Local Storage (CLS) memory allocator. |
ef6695f1 | 28 | * |
8ab16a24 MD |
29 | * The rseq per-CPU memory allocator allows the application the request |
30 | * memory pools of CPU-Local memory each of containing objects of a | |
8aa1462d MD |
31 | * given size (rounded to next power of 2), reserving a given virtual |
32 | * address size per CPU, for a given maximum number of CPUs. | |
8ab16a24 MD |
33 | * |
34 | * The per-CPU memory allocator is analogous to TLS (Thread-Local | |
35 | * Storage) memory: TLS is Thread-Local Storage, whereas the per-CPU | |
36 | * memory allocator provides CPU-Local Storage. | |
ef6695f1 MD |
37 | */ |
38 | ||
3236da62 | 39 | #define POOL_SET_NR_ENTRIES RSEQ_BITS_PER_LONG |
ef6695f1 | 40 | |
72b100a1 MD |
41 | /* |
42 | * Smallest allocation should hold enough space for a free list pointer. | |
43 | */ | |
ef6695f1 MD |
44 | #if RSEQ_BITS_PER_LONG == 64 |
45 | # define POOL_SET_MIN_ENTRY 3 /* Smallest item_len=8 */ | |
46 | #else | |
47 | # define POOL_SET_MIN_ENTRY 2 /* Smallest item_len=4 */ | |
48 | #endif | |
49 | ||
0fdf7a4c OD |
50 | #define BIT_PER_ULONG (8 * sizeof(unsigned long)) |
51 | ||
57d8b586 OD |
52 | #define MOVE_PAGES_BATCH_SIZE 4096 |
53 | ||
0ba2a93e | 54 | #define RANGE_HEADER_OFFSET sizeof(struct rseq_mempool_range) |
4aa3220c | 55 | |
3975084e MD |
56 | #if RSEQ_BITS_PER_LONG == 64 |
57 | # define DEFAULT_POISON_VALUE 0x5555555555555555ULL | |
58 | #else | |
59 | # define DEFAULT_POISON_VALUE 0x55555555UL | |
60 | #endif | |
61 | ||
ef6695f1 MD |
/* Forward declaration: the node type refers to itself. */
struct free_list_node;

/* Node of the NULL-terminated singly-linked list of freed items. */
struct free_list_node {
	struct free_list_node *next;
};
67 | ||
/* Pool flavor: a single global range, or one range slice per CPU. */
enum mempool_type {
	MEMPOOL_TYPE_GLOBAL = 0,	/* Default */
	MEMPOOL_TYPE_PERCPU = 1,
};
72 | ||
0ba2a93e | 73 | struct rseq_mempool_attr { |
a82006d0 | 74 | bool mmap_set; |
9bd07c29 MD |
75 | void *(*mmap_func)(void *priv, size_t len); |
76 | int (*munmap_func)(void *priv, void *ptr, size_t len); | |
77 | void *mmap_priv; | |
d6acc8aa | 78 | |
135811f2 | 79 | bool init_set; |
6e329183 | 80 | int (*init_func)(void *priv, void *addr, size_t len, int cpu); |
135811f2 MD |
81 | void *init_priv; |
82 | ||
d6acc8aa | 83 | bool robust_set; |
cb475906 MD |
84 | |
85 | enum mempool_type type; | |
86 | size_t stride; | |
87 | int max_nr_cpus; | |
e11a02d7 MD |
88 | |
89 | unsigned long max_nr_ranges; | |
455e090e MD |
90 | |
91 | bool poison_set; | |
92 | uintptr_t poison; | |
a5694a4d MD |
93 | |
94 | enum rseq_mempool_populate_policy populate_policy; | |
9bd07c29 MD |
95 | }; |
96 | ||
struct rseq_mempool_range;

struct rseq_mempool_range {
	struct rseq_mempool_range *next;	/* Linked list of ranges. */
	struct rseq_mempool *pool;		/* Backward reference to container pool. */

	/*
	 * Memory layout of a mempool range:
	 * - Header page (contains struct rseq_mempool_range at the very end),
	 * - Base of the per-cpu data, starting with CPU 0.
	 *   Aliases with free-list for non-robust populate all pool.
	 * - CPU 1,
	 * ...
	 * - CPU max_nr_cpus - 1
	 * - init values (unpopulated for RSEQ_MEMPOOL_POPULATE_ALL).
	 *   Aliases with free-list for non-robust populate none pool.
	 * - free list (for robust pool).
	 *
	 * The free list aliases the CPU 0 memory area for non-robust
	 * populate all pools. It aliases with init values for
	 * non-robust populate none pools. It is located immediately
	 * after the init values for robust pools.
	 */
	void *header;
	void *base;
	/*
	 * The init values contains malloc_init/zmalloc values.
	 * Pointer is NULL for RSEQ_MEMPOOL_POPULATE_ALL.
	 */
	void *init;
	size_t next_unused;	/* Offset of the first never-allocated item. */

	/* Pool range mmap/munmap */
	void *mmap_addr;
	size_t mmap_len;

	/* Track alloc/free. One bit per item; NULL unless robust. */
	unsigned long *alloc_bitmap;
};
136 | ||
0ba2a93e | 137 | struct rseq_mempool { |
9d986353 MD |
138 | /* Head of ranges linked-list. */ |
139 | struct rseq_mempool_range *range_list; | |
140 | unsigned long nr_ranges; | |
b73b0c25 | 141 | |
ef6695f1 | 142 | size_t item_len; |
ef6695f1 | 143 | int item_order; |
ef6695f1 MD |
144 | |
145 | /* | |
8ab16a24 | 146 | * The free list chains freed items on the CPU 0 address range. |
ef6695f1 | 147 | * We should rethink this decision if false sharing between |
8ab16a24 | 148 | * malloc/free from other CPUs and data accesses from CPU 0 |
ef6695f1 MD |
149 | * becomes an issue. This is a NULL-terminated singly-linked |
150 | * list. | |
151 | */ | |
152 | struct free_list_node *free_list_head; | |
b73b0c25 | 153 | |
ef6695f1 MD |
154 | /* This lock protects allocation/free within the pool. */ |
155 | pthread_mutex_t lock; | |
9bd07c29 | 156 | |
0ba2a93e | 157 | struct rseq_mempool_attr attr; |
ca452fee | 158 | char *name; |
ef6695f1 MD |
159 | }; |
160 | ||
ef6695f1 MD |
161 | /* |
162 | * Pool set entries are indexed by item_len rounded to the next power of | |
163 | * 2. A pool set can contain NULL pool entries, in which case the next | |
164 | * large enough entry will be used for allocation. | |
165 | */ | |
0ba2a93e | 166 | struct rseq_mempool_set { |
ef6695f1 MD |
167 | /* This lock protects add vs malloc/zmalloc within the pool set. */ |
168 | pthread_mutex_t lock; | |
0ba2a93e | 169 | struct rseq_mempool *entries[POOL_SET_NR_ENTRIES]; |
ef6695f1 MD |
170 | }; |
171 | ||
a5694a4d MD |
/*
 * This memfd is used to implement the user COW behavior for the page
 * protection scheme. memfd is a sparse virtual file. Its layout (in
 * offset from beginning of file) matches the process address space
 * (pointers directly converted to file offsets).
 */
struct rseq_memfd {
	pthread_mutex_t lock;	/* Protects reserved_size, refcount, fd. */
	size_t reserved_size;	/* Size reserved so far via ftruncate(). */
	unsigned int refcount;	/* Number of pools sharing the memfd. */
	int fd;			/* -1 until first use. */
};

/* Process-wide singleton, lazily created on first populate-none pool. */
static struct rseq_memfd memfd = {
	.lock = PTHREAD_MUTEX_INITIALIZER,
	.reserved_size = 0,
	.refcount = 0,
	.fd = -1,
};
191 | ||
86617384 MD |
192 | static |
193 | const char *get_pool_name(const struct rseq_mempool *pool) | |
194 | { | |
195 | return pool->name ? : "<anonymous>"; | |
196 | } | |
197 | ||
367e559c | 198 | static |
6fbf1fb6 | 199 | void *__rseq_pool_range_percpu_ptr(const struct rseq_mempool_range *range, int cpu, |
f2981623 | 200 | uintptr_t item_offset, size_t stride) |
367e559c | 201 | { |
15b63c9f | 202 | return range->base + (stride * cpu) + item_offset; |
367e559c MD |
203 | } |
204 | ||
a5694a4d MD |
205 | static |
206 | void *__rseq_pool_range_init_ptr(const struct rseq_mempool_range *range, | |
207 | uintptr_t item_offset) | |
208 | { | |
209 | if (!range->init) | |
210 | return NULL; | |
211 | return range->init + item_offset; | |
212 | } | |
213 | ||
214 | static | |
215 | void __rseq_percpu *__rseq_free_list_to_percpu_ptr(const struct rseq_mempool *pool, | |
216 | struct free_list_node *node) | |
217 | { | |
218 | void __rseq_percpu *p = (void __rseq_percpu *) node; | |
219 | ||
c0de0012 MD |
220 | if (pool->attr.robust_set) { |
221 | /* Skip cpus. */ | |
a5694a4d | 222 | p -= pool->attr.max_nr_cpus * pool->attr.stride; |
c0de0012 MD |
223 | /* Skip init values */ |
224 | if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_ALL) | |
225 | p -= pool->attr.stride; | |
226 | ||
227 | } else { | |
228 | /* Populate none free list is in init values */ | |
229 | if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_ALL) | |
230 | p -= pool->attr.max_nr_cpus * pool->attr.stride; | |
231 | } | |
a5694a4d MD |
232 | return p; |
233 | } | |
234 | ||
235 | static | |
236 | struct free_list_node *__rseq_percpu_to_free_list_ptr(const struct rseq_mempool *pool, | |
237 | void __rseq_percpu *p) | |
238 | { | |
c0de0012 MD |
239 | if (pool->attr.robust_set) { |
240 | /* Skip cpus. */ | |
a5694a4d | 241 | p += pool->attr.max_nr_cpus * pool->attr.stride; |
c0de0012 MD |
242 | /* Skip init values */ |
243 | if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_ALL) | |
244 | p += pool->attr.stride; | |
245 | ||
246 | } else { | |
247 | /* Populate none free list is in init values */ | |
248 | if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_ALL) | |
249 | p += pool->attr.max_nr_cpus * pool->attr.stride; | |
250 | } | |
a5694a4d MD |
251 | return (struct free_list_node *) p; |
252 | } | |
253 | ||
49f96dc5 MD |
/*
 * Convert a pointer to a memfd file offset: the sparse memfd layout
 * mirrors the process address space, so pointers are used directly.
 */
static
off_t ptr_to_off_t(void *p)
{
	return (off_t) (uintptr_t) p;
}
259 | ||
a5694a4d MD |
/*
 * Compare the first @n bytes of @s against the byte value @c.
 *
 * Returns 0 when every byte equals @c, otherwise the difference between
 * the first mismatching byte and @c.
 *
 * Fix: compare as unsigned char, matching memcmp() semantics. The
 * previous signed `char` comparison mis-reported equality for byte
 * values >= 0x80 on signed-char ABIs (e.g. c == 0x80 never matched a
 * stored 0x80 byte, which compares as -128).
 */
static
int memcmpbyte(const char *s, int c, size_t n)
{
	const unsigned char *p = (const unsigned char *) s;
	unsigned char byte = (unsigned char) c;
	int res = 0;

	while (n-- > 0)
		if ((res = *(p++) - byte) != 0)
			break;
	return res;
}
270 | ||
367e559c | 271 | static |
15b63c9f MD |
272 | void rseq_percpu_zero_item(struct rseq_mempool *pool, |
273 | struct rseq_mempool_range *range, uintptr_t item_offset) | |
367e559c | 274 | { |
a5694a4d | 275 | char *init_p = NULL; |
367e559c MD |
276 | int i; |
277 | ||
a5694a4d MD |
278 | init_p = __rseq_pool_range_init_ptr(range, item_offset); |
279 | if (init_p) | |
280 | memset(init_p, 0, pool->item_len); | |
cb475906 | 281 | for (i = 0; i < pool->attr.max_nr_cpus; i++) { |
15b63c9f | 282 | char *p = __rseq_pool_range_percpu_ptr(range, i, |
cb475906 | 283 | item_offset, pool->attr.stride); |
a5694a4d | 284 | |
1b658191 MD |
285 | /* |
286 | * If item is already zeroed, either because the | |
287 | * init range update has propagated or because the | |
288 | * content is already zeroed (e.g. zero page), don't | |
289 | * write to the page. This eliminates useless COW over | |
290 | * the zero page just for overwriting it with zeroes. | |
291 | * | |
292 | * This means zmalloc() in populate all policy pool do | |
293 | * not trigger COW for CPUs which are not actively | |
294 | * writing to the pool. This is however not the case for | |
295 | * malloc_init() in populate-all pools if it populates | |
296 | * non-zero content. | |
297 | */ | |
298 | if (!memcmpbyte(p, 0, pool->item_len)) | |
a5694a4d | 299 | continue; |
367e559c MD |
300 | memset(p, 0, pool->item_len); |
301 | } | |
302 | } | |
303 | ||
6ff43d9a MD |
304 | static |
305 | void rseq_percpu_init_item(struct rseq_mempool *pool, | |
306 | struct rseq_mempool_range *range, uintptr_t item_offset, | |
307 | void *init_ptr, size_t init_len) | |
308 | { | |
a5694a4d | 309 | char *init_p = NULL; |
6ff43d9a MD |
310 | int i; |
311 | ||
a5694a4d MD |
312 | init_p = __rseq_pool_range_init_ptr(range, item_offset); |
313 | if (init_p) | |
314 | memcpy(init_p, init_ptr, init_len); | |
6ff43d9a MD |
315 | for (i = 0; i < pool->attr.max_nr_cpus; i++) { |
316 | char *p = __rseq_pool_range_percpu_ptr(range, i, | |
317 | item_offset, pool->attr.stride); | |
a5694a4d | 318 | |
1b658191 MD |
319 | /* |
320 | * If the update propagated through a shared mapping, | |
321 | * or the item already has the correct content, skip | |
322 | * writing it into the cpu item to eliminate useless | |
323 | * COW of the page. | |
324 | */ | |
325 | if (!memcmp(init_ptr, p, init_len)) | |
a5694a4d | 326 | continue; |
6ff43d9a MD |
327 | memcpy(p, init_ptr, init_len); |
328 | } | |
329 | } | |
330 | ||
a5694a4d MD |
/* Fill an item with the poison word, one uintptr_t at a time. */
static
void rseq_poison_item(void *p, size_t item_len, uintptr_t poison)
{
	size_t offset;

	for (offset = 0; offset < item_len; offset += sizeof(uintptr_t))
		*((uintptr_t *) ((char *) p + offset)) = poison;
}
339 | ||
1b658191 MD |
/*
 * Scan an item word-by-word against @poison. Returns 0 when the whole
 * item matches; otherwise returns the (non-zero) difference for the
 * first mismatching word and, if @unexpected_value is non-NULL, stores
 * that word through it.
 */
static
intptr_t rseq_cmp_poison_item(void *p, size_t item_len, uintptr_t poison, intptr_t *unexpected_value)
{
	size_t offset;
	intptr_t res = 0;

	for (offset = 0; offset < item_len; offset += sizeof(uintptr_t)) {
		intptr_t v = *((intptr_t *) ((char *) p + offset));

		if ((res = v - (intptr_t) poison) != 0) {
			if (unexpected_value)
				*unexpected_value = v;
			break;
		}
	}
	return res;
}
357 | ||
455e090e MD |
358 | static |
359 | void rseq_percpu_poison_item(struct rseq_mempool *pool, | |
360 | struct rseq_mempool_range *range, uintptr_t item_offset) | |
361 | { | |
362 | uintptr_t poison = pool->attr.poison; | |
a5694a4d | 363 | char *init_p = NULL; |
455e090e MD |
364 | int i; |
365 | ||
a5694a4d MD |
366 | init_p = __rseq_pool_range_init_ptr(range, item_offset); |
367 | if (init_p) | |
368 | rseq_poison_item(init_p, pool->item_len, poison); | |
455e090e MD |
369 | for (i = 0; i < pool->attr.max_nr_cpus; i++) { |
370 | char *p = __rseq_pool_range_percpu_ptr(range, i, | |
371 | item_offset, pool->attr.stride); | |
455e090e | 372 | |
1b658191 MD |
373 | /* |
374 | * If the update propagated through a shared mapping, | |
375 | * or the item already has the correct content, skip | |
376 | * writing it into the cpu item to eliminate useless | |
377 | * COW of the page. | |
378 | * | |
379 | * It is recommended to use zero as poison value for | |
380 | * populate-all pools to eliminate COW due to writing | |
381 | * poison to unused CPU memory. | |
382 | */ | |
383 | if (rseq_cmp_poison_item(p, pool->item_len, poison, NULL) == 0) | |
a5694a4d MD |
384 | continue; |
385 | rseq_poison_item(p, pool->item_len, poison); | |
386 | } | |
387 | } | |
388 | ||
/*
 * Abort with a diagnostic if @p does not contain the poison pattern.
 * Detects use-after-free writes into freed items of robust pools.
 *
 * Fix: item_offset is uintptr_t, but %zu expects size_t; cast explicitly
 * so the format specifier matches on every ABI (mismatched fprintf
 * arguments are undefined behavior per C11).
 */
/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void rseq_check_poison_item(const struct rseq_mempool *pool, uintptr_t item_offset,
		void *p, size_t item_len, uintptr_t poison)
{
	intptr_t unexpected_value;

	if (rseq_cmp_poison_item(p, item_len, poison, &unexpected_value) == 0)
		return;

	fprintf(stderr, "%s: Poison corruption detected (0x%lx) for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
		__func__, (unsigned long) unexpected_value, get_pool_name(pool), pool, (size_t) item_offset, (void *) __builtin_return_address(0));
	abort();
}
403 | ||
404 | /* Always inline for __builtin_return_address(0). */ | |
405 | static inline __attribute__((always_inline)) | |
6fbf1fb6 MD |
406 | void rseq_percpu_check_poison_item(const struct rseq_mempool *pool, |
407 | const struct rseq_mempool_range *range, uintptr_t item_offset) | |
86617384 MD |
408 | { |
409 | uintptr_t poison = pool->attr.poison; | |
a5694a4d | 410 | char *init_p; |
86617384 MD |
411 | int i; |
412 | ||
3975084e | 413 | if (!pool->attr.robust_set) |
86617384 | 414 | return; |
a5694a4d MD |
415 | init_p = __rseq_pool_range_init_ptr(range, item_offset); |
416 | if (init_p) | |
c0de0012 | 417 | rseq_check_poison_item(pool, item_offset, init_p, pool->item_len, poison); |
86617384 MD |
418 | for (i = 0; i < pool->attr.max_nr_cpus; i++) { |
419 | char *p = __rseq_pool_range_percpu_ptr(range, i, | |
420 | item_offset, pool->attr.stride); | |
c0de0012 | 421 | rseq_check_poison_item(pool, item_offset, p, pool->item_len, poison); |
455e090e MD |
422 | } |
423 | } | |
424 | ||
/*
 * Migrate the pages of one CPU slice to the NUMA node backing @cpu,
 * in batches of MOVE_PAGES_BATCH_SIZE. Returns 0 on success, -1 with
 * errno set on error. Without libnuma support, fails with ENOSYS.
 *
 * Fixes: fprintf format specifiers — @ret is a signed long (was printed
 * with %lu) and -status[k] is a signed int (was printed with %u); both
 * mismatches are undefined behavior per C11.
 */
#ifdef HAVE_LIBNUMA
int rseq_mempool_range_init_numa(void *addr, size_t len, int cpu, int numa_flags)
{
	unsigned long nr_pages, page_len;
	int status[MOVE_PAGES_BATCH_SIZE];
	int nodes[MOVE_PAGES_BATCH_SIZE];
	void *pages[MOVE_PAGES_BATCH_SIZE];
	long ret;

	if (!numa_flags) {
		errno = EINVAL;
		return -1;
	}
	page_len = rseq_get_page_len();
	nr_pages = len >> rseq_get_count_order_ulong(page_len);

	/* All pages of the slice go to the node owning @cpu. */
	nodes[0] = numa_node_of_cpu(cpu);
	if (nodes[0] < 0)
		return -1;

	for (size_t k = 1; k < RSEQ_ARRAY_SIZE(nodes); ++k) {
		nodes[k] = nodes[0];
	}

	for (unsigned long page = 0; page < nr_pages;) {

		size_t max_k = RSEQ_ARRAY_SIZE(pages);
		size_t left = nr_pages - page;

		if (left < max_k) {
			max_k = left;
		}

		for (size_t k = 0; k < max_k; ++k, ++page) {
			pages[k] = addr + (page * page_len);
			status[k] = -EPERM;
		}

		ret = move_pages(0, max_k, pages, nodes, status, numa_flags);

		if (ret < 0)
			return ret;

		if (ret > 0) {
			/* move_pages() returns the count of non-migrated pages. */
			fprintf(stderr, "%ld pages were not migrated\n", ret);
			for (size_t k = 0; k < max_k; ++k) {
				if (status[k] < 0)
					fprintf(stderr,
						"Error while moving page %p to numa node %d: %d\n",
						pages[k], nodes[k], -status[k]);
			}
		}
	}
	return 0;
}
#else
int rseq_mempool_range_init_numa(void *addr __attribute__((unused)),
		size_t len __attribute__((unused)),
		int cpu __attribute__((unused)),
		int numa_flags __attribute__((unused)))
{
	errno = ENOSYS;
	return -1;
}
#endif
490 | ||
9bd07c29 MD |
491 | static |
492 | void *default_mmap_func(void *priv __attribute__((unused)), size_t len) | |
493 | { | |
494 | void *base; | |
495 | ||
496 | base = mmap(NULL, len, PROT_READ | PROT_WRITE, | |
497 | MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); | |
498 | if (base == MAP_FAILED) | |
499 | return NULL; | |
500 | return base; | |
501 | } | |
502 | ||
503 | static | |
504 | int default_munmap_func(void *priv __attribute__((unused)), void *ptr, size_t len) | |
505 | { | |
506 | return munmap(ptr, len); | |
507 | } | |
508 | ||
0fdf7a4c | 509 | static |
0ba2a93e | 510 | int create_alloc_bitmap(struct rseq_mempool *pool, struct rseq_mempool_range *range) |
0fdf7a4c OD |
511 | { |
512 | size_t count; | |
513 | ||
cb475906 | 514 | count = ((pool->attr.stride >> pool->item_order) + BIT_PER_ULONG - 1) / BIT_PER_ULONG; |
0fdf7a4c OD |
515 | |
516 | /* | |
9649c7ee MD |
517 | * Not being able to create the validation bitmap is an error |
518 | * that needs to be reported. | |
0fdf7a4c | 519 | */ |
b73b0c25 MD |
520 | range->alloc_bitmap = calloc(count, sizeof(unsigned long)); |
521 | if (!range->alloc_bitmap) | |
9649c7ee MD |
522 | return -1; |
523 | return 0; | |
0fdf7a4c OD |
524 | } |
525 | ||
b73b0c25 | 526 | static |
a5694a4d | 527 | bool percpu_addr_in_pool(const struct rseq_mempool *pool, void __rseq_percpu *_addr) |
b73b0c25 | 528 | { |
0ba2a93e | 529 | struct rseq_mempool_range *range; |
a5694a4d | 530 | void *addr = (void *) _addr; |
b73b0c25 | 531 | |
9d986353 | 532 | for (range = pool->range_list; range; range = range->next) { |
b73b0c25 MD |
533 | if (addr >= range->base && addr < range->base + range->next_unused) |
534 | return true; | |
535 | } | |
536 | return false; | |
537 | } | |
538 | ||
a9ec6111 OD |
539 | /* Always inline for __builtin_return_address(0). */ |
540 | static inline __attribute__((always_inline)) | |
0ba2a93e | 541 | void check_free_list(const struct rseq_mempool *pool) |
a9ec6111 | 542 | { |
b73b0c25 MD |
543 | size_t total_item = 0, total_never_allocated = 0, total_freed = 0, |
544 | max_list_traversal = 0, traversal_iteration = 0; | |
0ba2a93e | 545 | struct rseq_mempool_range *range; |
b73b0c25 MD |
546 | |
547 | if (!pool->attr.robust_set) | |
548 | return; | |
549 | ||
9d986353 | 550 | for (range = pool->range_list; range; range = range->next) { |
cb475906 MD |
551 | total_item += pool->attr.stride >> pool->item_order; |
552 | total_never_allocated += (pool->attr.stride - range->next_unused) >> pool->item_order; | |
b73b0c25 MD |
553 | } |
554 | max_list_traversal = total_item - total_never_allocated; | |
a9ec6111 OD |
555 | |
556 | for (struct free_list_node *node = pool->free_list_head, *prev = NULL; | |
557 | node; | |
558 | prev = node, | |
559 | node = node->next) { | |
560 | ||
a9ec6111 | 561 | if (traversal_iteration >= max_list_traversal) { |
ca452fee MD |
562 | fprintf(stderr, "%s: Corrupted free-list; Possibly infinite loop in pool \"%s\" (%p), caller %p.\n", |
563 | __func__, get_pool_name(pool), pool, __builtin_return_address(0)); | |
a9ec6111 OD |
564 | abort(); |
565 | } | |
566 | ||
567 | /* Node is out of range. */ | |
a5694a4d | 568 | if (!percpu_addr_in_pool(pool, __rseq_free_list_to_percpu_ptr(pool, node))) { |
a9ec6111 | 569 | if (prev) |
ca452fee MD |
570 | fprintf(stderr, "%s: Corrupted free-list node %p -> [out-of-range %p] in pool \"%s\" (%p), caller %p.\n", |
571 | __func__, prev, node, get_pool_name(pool), pool, __builtin_return_address(0)); | |
a9ec6111 | 572 | else |
ca452fee MD |
573 | fprintf(stderr, "%s: Corrupted free-list node [out-of-range %p] in pool \"%s\" (%p), caller %p.\n", |
574 | __func__, node, get_pool_name(pool), pool, __builtin_return_address(0)); | |
a9ec6111 OD |
575 | abort(); |
576 | } | |
577 | ||
b73b0c25 MD |
578 | traversal_iteration++; |
579 | total_freed++; | |
a9ec6111 OD |
580 | } |
581 | ||
582 | if (total_never_allocated + total_freed != total_item) { | |
ca452fee MD |
583 | fprintf(stderr, "%s: Corrupted free-list in pool \"%s\" (%p); total-item: %zu total-never-used: %zu total-freed: %zu, caller %p.\n", |
584 | __func__, get_pool_name(pool), pool, total_item, total_never_allocated, total_freed, __builtin_return_address(0)); | |
a9ec6111 OD |
585 | abort(); |
586 | } | |
a9ec6111 OD |
587 | } |
588 | ||
6fbf1fb6 MD |
589 | /* Always inline for __builtin_return_address(0). */ |
590 | static inline __attribute__((always_inline)) | |
591 | void check_range_poison(const struct rseq_mempool *pool, | |
592 | const struct rseq_mempool_range *range) | |
593 | { | |
594 | size_t item_offset; | |
595 | ||
596 | for (item_offset = 0; item_offset < range->next_unused; | |
597 | item_offset += pool->item_len) | |
598 | rseq_percpu_check_poison_item(pool, range, item_offset); | |
599 | } | |
600 | ||
601 | /* Always inline for __builtin_return_address(0). */ | |
602 | static inline __attribute__((always_inline)) | |
603 | void check_pool_poison(const struct rseq_mempool *pool) | |
604 | { | |
605 | struct rseq_mempool_range *range; | |
606 | ||
3975084e | 607 | if (!pool->attr.robust_set) |
6fbf1fb6 MD |
608 | return; |
609 | for (range = pool->range_list; range; range = range->next) | |
610 | check_range_poison(pool, range); | |
611 | } | |
612 | ||
e7cbbc10 MD |
613 | /* Always inline for __builtin_return_address(0). */ |
614 | static inline __attribute__((always_inline)) | |
0ba2a93e | 615 | void destroy_alloc_bitmap(struct rseq_mempool *pool, struct rseq_mempool_range *range) |
0fdf7a4c | 616 | { |
b73b0c25 | 617 | unsigned long *bitmap = range->alloc_bitmap; |
9649c7ee | 618 | size_t count, total_leaks = 0; |
0fdf7a4c | 619 | |
9649c7ee | 620 | if (!bitmap) |
0fdf7a4c | 621 | return; |
0fdf7a4c | 622 | |
cb475906 | 623 | count = ((pool->attr.stride >> pool->item_order) + BIT_PER_ULONG - 1) / BIT_PER_ULONG; |
0fdf7a4c OD |
624 | |
625 | /* Assert that all items in the pool were freed. */ | |
9649c7ee MD |
626 | for (size_t k = 0; k < count; ++k) |
627 | total_leaks += rseq_hweight_ulong(bitmap[k]); | |
628 | if (total_leaks) { | |
ca452fee MD |
629 | fprintf(stderr, "%s: Pool \"%s\" (%p) has %zu leaked items on destroy, caller: %p.\n", |
630 | __func__, get_pool_name(pool), pool, total_leaks, (void *) __builtin_return_address(0)); | |
9649c7ee | 631 | abort(); |
0fdf7a4c OD |
632 | } |
633 | ||
634 | free(bitmap); | |
a5694a4d | 635 | range->alloc_bitmap = NULL; |
0fdf7a4c OD |
636 | } |
637 | ||
b73b0c25 MD |
638 | /* Always inline for __builtin_return_address(0). */ |
639 | static inline __attribute__((always_inline)) | |
0ba2a93e MD |
640 | int rseq_mempool_range_destroy(struct rseq_mempool *pool, |
641 | struct rseq_mempool_range *range) | |
b73b0c25 | 642 | { |
a5694a4d MD |
643 | int ret = 0; |
644 | ||
b73b0c25 | 645 | destroy_alloc_bitmap(pool, range); |
a5694a4d MD |
646 | |
647 | /* | |
648 | * Punch a hole into memfd where the init values used to be. | |
649 | */ | |
650 | if (range->init) { | |
651 | ret = fallocate(memfd.fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, | |
49f96dc5 | 652 | ptr_to_off_t(range->init), pool->attr.stride); |
a5694a4d MD |
653 | if (ret) |
654 | return ret; | |
655 | range->init = NULL; | |
656 | } | |
657 | ||
5c99f3d6 | 658 | /* range is a header located one page before the aligned mapping. */ |
fa6a0fb3 | 659 | return pool->attr.munmap_func(pool->attr.mmap_priv, range->mmap_addr, range->mmap_len); |
5c99f3d6 MD |
660 | } |
661 | ||
662 | /* | |
663 | * Allocate a memory mapping aligned on @alignment, with an optional | |
664 | * @pre_header before the mapping. | |
665 | */ | |
666 | static | |
0ba2a93e | 667 | void *aligned_mmap_anonymous(struct rseq_mempool *pool, |
5c99f3d6 MD |
668 | size_t page_size, size_t len, size_t alignment, |
669 | void **pre_header, size_t pre_header_len) | |
670 | { | |
671 | size_t minimum_page_count, page_count, extra, total_allocate = 0; | |
672 | int page_order; | |
673 | void *ptr; | |
674 | ||
675 | if (len < page_size || alignment < page_size || | |
b72b2d9e | 676 | !is_pow2(alignment) || (len & (alignment - 1))) { |
5c99f3d6 MD |
677 | errno = EINVAL; |
678 | return NULL; | |
679 | } | |
680 | page_order = rseq_get_count_order_ulong(page_size); | |
681 | if (page_order < 0) { | |
682 | errno = EINVAL; | |
683 | return NULL; | |
684 | } | |
685 | if (pre_header_len && (pre_header_len & (page_size - 1))) { | |
686 | errno = EINVAL; | |
687 | return NULL; | |
688 | } | |
689 | ||
690 | minimum_page_count = (pre_header_len + len) >> page_order; | |
691 | page_count = (pre_header_len + len + alignment - page_size) >> page_order; | |
692 | ||
693 | assert(page_count >= minimum_page_count); | |
694 | ||
695 | ptr = pool->attr.mmap_func(pool->attr.mmap_priv, page_count << page_order); | |
696 | if (!ptr) | |
697 | goto alloc_error; | |
698 | ||
699 | total_allocate = page_count << page_order; | |
700 | ||
701 | if (!(((uintptr_t) ptr + pre_header_len) & (alignment - 1))) { | |
702 | /* Pointer is already aligned. ptr points to pre_header. */ | |
703 | goto out; | |
704 | } | |
705 | ||
706 | /* Unmap extra before. */ | |
707 | extra = offset_align((uintptr_t) ptr + pre_header_len, alignment); | |
708 | assert(!(extra & (page_size - 1))); | |
709 | if (pool->attr.munmap_func(pool->attr.mmap_priv, ptr, extra)) { | |
710 | perror("munmap"); | |
711 | abort(); | |
712 | } | |
713 | total_allocate -= extra; | |
714 | ptr += extra; /* ptr points to pre_header */ | |
715 | page_count -= extra >> page_order; | |
716 | out: | |
717 | assert(page_count >= minimum_page_count); | |
718 | ||
719 | if (page_count > minimum_page_count) { | |
720 | void *extra_ptr; | |
721 | ||
722 | /* Unmap extra after. */ | |
723 | extra_ptr = ptr + (minimum_page_count << page_order); | |
724 | extra = (page_count - minimum_page_count) << page_order; | |
725 | if (pool->attr.munmap_func(pool->attr.mmap_priv, extra_ptr, extra)) { | |
726 | perror("munmap"); | |
727 | abort(); | |
728 | } | |
729 | total_allocate -= extra; | |
730 | } | |
731 | ||
732 | assert(!(((uintptr_t)ptr + pre_header_len) & (alignment - 1))); | |
733 | assert(total_allocate == len + pre_header_len); | |
734 | ||
735 | alloc_error: | |
736 | if (ptr) { | |
737 | if (pre_header) | |
738 | *pre_header = ptr; | |
739 | ptr += pre_header_len; | |
740 | } | |
741 | return ptr; | |
b73b0c25 MD |
742 | } |
743 | ||
a5694a4d MD |
744 | static |
745 | int rseq_memfd_reserve_init(void *init, size_t init_len) | |
746 | { | |
747 | int ret = 0; | |
748 | size_t reserve_len; | |
749 | ||
750 | pthread_mutex_lock(&memfd.lock); | |
49f96dc5 | 751 | reserve_len = (size_t) ptr_to_off_t(init) + init_len; |
a5694a4d MD |
752 | if (reserve_len > memfd.reserved_size) { |
753 | if (ftruncate(memfd.fd, (off_t) reserve_len)) { | |
754 | ret = -1; | |
755 | goto unlock; | |
756 | } | |
757 | memfd.reserved_size = reserve_len; | |
758 | } | |
759 | unlock: | |
760 | pthread_mutex_unlock(&memfd.lock); | |
761 | return ret; | |
762 | } | |
763 | ||
b73b0c25 | 764 | static |
0ba2a93e | 765 | struct rseq_mempool_range *rseq_mempool_range_create(struct rseq_mempool *pool) |
b73b0c25 | 766 | { |
0ba2a93e | 767 | struct rseq_mempool_range *range; |
5c99f3d6 | 768 | unsigned long page_size; |
4aa3220c | 769 | void *header; |
b73b0c25 | 770 | void *base; |
a5694a4d | 771 | size_t range_len; /* Range len excludes header. */ |
b73b0c25 | 772 | |
e11a02d7 MD |
773 | if (pool->attr.max_nr_ranges && |
774 | pool->nr_ranges >= pool->attr.max_nr_ranges) { | |
9d986353 MD |
775 | errno = ENOMEM; |
776 | return NULL; | |
777 | } | |
5c99f3d6 | 778 | page_size = rseq_get_page_len(); |
b73b0c25 | 779 | |
a5694a4d MD |
780 | range_len = pool->attr.stride * pool->attr.max_nr_cpus; |
781 | if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_ALL) | |
782 | range_len += pool->attr.stride; /* init values */ | |
c0de0012 MD |
783 | if (pool->attr.robust_set) |
784 | range_len += pool->attr.stride; /* free list */ | |
5c99f3d6 | 785 | base = aligned_mmap_anonymous(pool, page_size, |
a5694a4d | 786 | range_len, |
cb475906 | 787 | pool->attr.stride, |
4aa3220c | 788 | &header, page_size); |
b73b0c25 | 789 | if (!base) |
5c99f3d6 | 790 | return NULL; |
0ba2a93e | 791 | range = (struct rseq_mempool_range *) (base - RANGE_HEADER_OFFSET); |
5c99f3d6 | 792 | range->pool = pool; |
4aa3220c | 793 | range->header = header; |
a5694a4d | 794 | range->base = base; |
fa6a0fb3 | 795 | range->mmap_addr = header; |
a5694a4d MD |
796 | range->mmap_len = page_size + range_len; |
797 | ||
798 | if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_ALL) { | |
799 | range->init = base + (pool->attr.stride * pool->attr.max_nr_cpus); | |
800 | /* Populate init values pages from memfd */ | |
801 | if (rseq_memfd_reserve_init(range->init, pool->attr.stride)) | |
802 | goto error_alloc; | |
803 | if (mmap(range->init, pool->attr.stride, PROT_READ | PROT_WRITE, | |
804 | MAP_SHARED | MAP_FIXED, memfd.fd, | |
49f96dc5 | 805 | ptr_to_off_t(range->init)) != (void *) range->init) { |
a5694a4d MD |
806 | goto error_alloc; |
807 | } | |
808 | assert(pool->attr.type == MEMPOOL_TYPE_PERCPU); | |
809 | /* | |
810 | * Map per-cpu memory as private COW mappings of init values. | |
811 | */ | |
812 | { | |
813 | int cpu; | |
814 | ||
815 | for (cpu = 0; cpu < pool->attr.max_nr_cpus; cpu++) { | |
816 | void *p = base + (pool->attr.stride * cpu); | |
817 | size_t len = pool->attr.stride; | |
818 | ||
819 | if (mmap(p, len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FIXED, | |
49f96dc5 | 820 | memfd.fd, ptr_to_off_t(range->init)) != (void *) p) { |
a5694a4d MD |
821 | goto error_alloc; |
822 | } | |
823 | } | |
824 | } | |
825 | } | |
826 | ||
b73b0c25 MD |
827 | if (pool->attr.robust_set) { |
828 | if (create_alloc_bitmap(pool, range)) | |
829 | goto error_alloc; | |
830 | } | |
135811f2 | 831 | if (pool->attr.init_set) { |
374c2773 MD |
832 | switch (pool->attr.type) { |
833 | case MEMPOOL_TYPE_GLOBAL: | |
6e329183 | 834 | if (pool->attr.init_func(pool->attr.init_priv, |
374c2773 | 835 | base, pool->attr.stride, -1)) { |
6e329183 MD |
836 | goto error_alloc; |
837 | } | |
374c2773 MD |
838 | break; |
839 | case MEMPOOL_TYPE_PERCPU: | |
840 | { | |
841 | int cpu; | |
842 | for (cpu = 0; cpu < pool->attr.max_nr_cpus; cpu++) { | |
843 | if (pool->attr.init_func(pool->attr.init_priv, | |
844 | base + (pool->attr.stride * cpu), | |
845 | pool->attr.stride, cpu)) { | |
846 | goto error_alloc; | |
847 | } | |
848 | } | |
849 | break; | |
850 | } | |
851 | default: | |
852 | abort(); | |
135811f2 MD |
853 | } |
854 | } | |
9d986353 | 855 | pool->nr_ranges++; |
b73b0c25 MD |
856 | return range; |
857 | ||
858 | error_alloc: | |
0ba2a93e | 859 | (void) rseq_mempool_range_destroy(pool, range); |
b73b0c25 MD |
860 | return NULL; |
861 | } | |
862 | ||
/*
 * Take a reference on the process-wide memfd backing store (the global
 * "memfd" state declared earlier in this file), creating the memfd
 * lazily on the first reference.
 *
 * Pools using the RSEQ_MEMPOOL_POPULATE_ALL policy do not use the
 * memfd, so no reference is taken for them.
 *
 * Returns 0 on success, -1 on memfd_create() failure (the refcount is
 * left unchanged in that case).
 */
static
int rseq_mempool_memfd_ref(struct rseq_mempool *pool)
{
	int ret = 0;

	if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_ALL)
		return 0;

	pthread_mutex_lock(&memfd.lock);
	if (memfd.refcount == 0) {
		/* First user: create the shared memfd. */
		memfd.fd = memfd_create("mempool", MFD_CLOEXEC);
		if (memfd.fd < 0) {
			perror("memfd_create");
			ret = -1;
			goto unlock;
		}
	}
	memfd.refcount++;
unlock:
	pthread_mutex_unlock(&memfd.lock);
	return ret;
}
885 | ||
/*
 * Drop a reference on the process-wide memfd backing store.  When the
 * last reference is dropped, the fd is closed and the reserved size is
 * reset; a close() failure aborts the process.
 *
 * No-op for RSEQ_MEMPOOL_POPULATE_ALL pools, for which no reference
 * was taken by rseq_mempool_memfd_ref().
 *
 * NOTE(review): refcount underflow is not checked here; callers must
 * pair each unref with a prior successful ref — confirm all call sites
 * uphold this.
 */
static
void rseq_mempool_memfd_unref(struct rseq_mempool *pool)
{
	if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_ALL)
		return;

	pthread_mutex_lock(&memfd.lock);
	if (memfd.refcount == 1) {
		/* Last reference: tear down the shared memfd. */
		if (close(memfd.fd)) {
			perror("close");
			abort();
		}
		memfd.fd = -1;
		memfd.reserved_size = 0;
	}
	memfd.refcount--;
	pthread_mutex_unlock(&memfd.lock);
}
904 | ||
0ba2a93e | 905 | int rseq_mempool_destroy(struct rseq_mempool *pool) |
9649c7ee | 906 | { |
0ba2a93e | 907 | struct rseq_mempool_range *range, *next_range; |
b73b0c25 | 908 | int ret = 0; |
9649c7ee | 909 | |
f510ddc5 MD |
910 | if (!pool) |
911 | return 0; | |
b73b0c25 | 912 | check_free_list(pool); |
6fbf1fb6 | 913 | check_pool_poison(pool); |
b73b0c25 | 914 | /* Iteration safe against removal. */ |
9d986353 | 915 | for (range = pool->range_list; range && (next_range = range->next, 1); range = next_range) { |
0ba2a93e | 916 | if (rseq_mempool_range_destroy(pool, range)) |
b73b0c25 MD |
917 | goto end; |
918 | /* Update list head to keep list coherent in case of partial failure. */ | |
9d986353 | 919 | pool->range_list = next_range; |
b73b0c25 | 920 | } |
a5694a4d | 921 | rseq_mempool_memfd_unref(pool); |
9649c7ee | 922 | pthread_mutex_destroy(&pool->lock); |
ca452fee | 923 | free(pool->name); |
eb8db04d | 924 | free(pool); |
9649c7ee | 925 | end: |
b73b0c25 | 926 | return ret; |
9649c7ee MD |
927 | } |
928 | ||
/*
 * Create a memory pool of items of @item_len bytes.  The item length
 * is rounded up to the next power of two, and to at least the size of
 * a free-list pointer.
 *
 * @pool_name: optional name (duplicated) used in diagnostics; may be NULL.
 * @item_len: requested item size.
 * @_attr: optional attributes (copied into the pool); NULL selects
 *         all defaults.
 *
 * Returns the new pool, or NULL with errno set: EINVAL for invalid
 * max_nr_cpus, item length or stride, ENOMEM on allocation failure.
 *
 * NOTE(review): the error_alloc path forces errno to ENOMEM regardless
 * of the underlying failure cause — confirm this is intended.
 */
struct rseq_mempool *rseq_mempool_create(const char *pool_name,
		size_t item_len, const struct rseq_mempool_attr *_attr)
{
	struct rseq_mempool *pool;
	struct rseq_mempool_attr attr = {};
	int order;

	/* Make sure each item is large enough to contain free list pointers. */
	if (item_len < sizeof(void *))
		item_len = sizeof(void *);

	/* Align item_len on next power of two. */
	order = rseq_get_count_order_ulong(item_len);
	if (order < 0) {
		errno = EINVAL;
		return NULL;
	}
	item_len = 1UL << order;

	/* Work on a local copy of the attributes; fill in defaults. */
	if (_attr)
		memcpy(&attr, _attr, sizeof(attr));
	if (!attr.mmap_set) {
		attr.mmap_func = default_mmap_func;
		attr.munmap_func = default_munmap_func;
		attr.mmap_priv = NULL;
	}

	switch (attr.type) {
	case MEMPOOL_TYPE_PERCPU:
		if (attr.max_nr_cpus < 0) {
			errno = EINVAL;
			return NULL;
		}
		if (attr.max_nr_cpus == 0) {
			/* Auto-detect */
			attr.max_nr_cpus = rseq_get_max_nr_cpus();
			if (attr.max_nr_cpus == 0) {
				errno = EINVAL;
				return NULL;
			}
		}
		break;
	case MEMPOOL_TYPE_GLOBAL:
		/* Override populate policy for global type. */
		attr.populate_policy = RSEQ_MEMPOOL_POPULATE_ALL;
		/* Use a 1-cpu pool for global mempool type. */
		attr.max_nr_cpus = 1;
		break;
	}
	if (!attr.stride)
		attr.stride = RSEQ_MEMPOOL_STRIDE;	/* Use default */
	/* Robust pools get poisoning by default so corruption is detectable. */
	if (attr.robust_set && !attr.poison_set) {
		attr.poison_set = true;
		attr.poison = DEFAULT_POISON_VALUE;
	}
	/* The stride must be a page-aligned power of two large enough for one item. */
	if (item_len > attr.stride || attr.stride < (size_t) rseq_get_page_len() ||
			!is_pow2(attr.stride)) {
		errno = EINVAL;
		return NULL;
	}

	pool = calloc(1, sizeof(struct rseq_mempool));
	if (!pool)
		return NULL;

	memcpy(&pool->attr, &attr, sizeof(attr));
	pthread_mutex_init(&pool->lock, NULL);
	pool->item_len = item_len;
	pool->item_order = order;

	/* Populate-none pools need the shared memfd backing store. */
	if (rseq_mempool_memfd_ref(pool))
		goto error_alloc;

	/* Create the first range; more are added on demand by the allocator. */
	pool->range_list = rseq_mempool_range_create(pool);
	if (!pool->range_list)
		goto error_alloc;

	if (pool_name) {
		pool->name = strdup(pool_name);
		if (!pool->name)
			goto error_alloc;
	}
	return pool;

error_alloc:
	/* Safe on a partially initialized pool. */
	rseq_mempool_destroy(pool);
	errno = ENOMEM;
	return NULL;
}
1018 | ||
e7cbbc10 MD |
1019 | /* Always inline for __builtin_return_address(0). */ |
1020 | static inline __attribute__((always_inline)) | |
9d986353 | 1021 | void set_alloc_slot(struct rseq_mempool *pool, struct rseq_mempool_range *range, size_t item_offset) |
0fdf7a4c | 1022 | { |
9d986353 | 1023 | unsigned long *bitmap = range->alloc_bitmap; |
9649c7ee | 1024 | size_t item_index = item_offset >> pool->item_order; |
0fdf7a4c OD |
1025 | unsigned long mask; |
1026 | size_t k; | |
1027 | ||
9649c7ee | 1028 | if (!bitmap) |
0fdf7a4c | 1029 | return; |
0fdf7a4c | 1030 | |
9649c7ee | 1031 | k = item_index / BIT_PER_ULONG; |
0fdf7a4c OD |
1032 | mask = 1ULL << (item_index % BIT_PER_ULONG); |
1033 | ||
9649c7ee MD |
1034 | /* Print error if bit is already set. */ |
1035 | if (bitmap[k] & mask) { | |
ca452fee MD |
1036 | fprintf(stderr, "%s: Allocator corruption detected for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n", |
1037 | __func__, get_pool_name(pool), pool, item_offset, (void *) __builtin_return_address(0)); | |
9649c7ee MD |
1038 | abort(); |
1039 | } | |
0fdf7a4c OD |
1040 | bitmap[k] |= mask; |
1041 | } | |
1042 | ||
/*
 * Common allocation path for all per-CPU malloc variants.
 *
 * Reuses the most recently freed item when the pool free list is
 * non-empty; otherwise carves a new item from the first range in the
 * list, creating and prepending a new range when the current one is
 * full.
 *
 * @zeroed: when true, zero the item after allocation.
 * @init_ptr/@init_len: when @init_ptr is non-NULL and @zeroed is
 * false, copy @init_len bytes into the new item.  @init_len larger
 * than the pool item length fails with EINVAL.
 *
 * Returns the allocated per-CPU pointer, or NULL with errno set
 * (ENOMEM when a new range cannot be created).
 */
static
void __rseq_percpu *__rseq_percpu_malloc(struct rseq_mempool *pool,
		bool zeroed, void *init_ptr, size_t init_len)
{
	struct rseq_mempool_range *range;
	struct free_list_node *node;
	uintptr_t item_offset;
	void __rseq_percpu *addr;

	if (init_len > pool->item_len) {
		errno = EINVAL;
		return NULL;
	}
	pthread_mutex_lock(&pool->lock);
	/* Get first entry from free list. */
	node = pool->free_list_head;
	if (node != NULL) {
		void *range_base, *ptr;

		/*
		 * Translate the free-list node (stored inside the freed
		 * item) back to its per-CPU pointer, then recover the
		 * owning range from the stride-aligned range base.
		 */
		ptr = __rseq_free_list_to_percpu_ptr(pool, node);
		range_base = (void *) ((uintptr_t) ptr & (~(pool->attr.stride - 1)));
		range = (struct rseq_mempool_range *) (range_base - RANGE_HEADER_OFFSET);
		/* Remove node from free list (update head). */
		pool->free_list_head = node->next;
		item_offset = (uintptr_t) (ptr - range_base);
		/* Verify the poison pattern survived while the item sat on the free list. */
		rseq_percpu_check_poison_item(pool, range, item_offset);
		addr = __rseq_free_list_to_percpu_ptr(pool, node);
		goto end;
	}
	/*
	 * If the most recent range (first in list) does not have any
	 * room left, create a new range and prepend it to the list
	 * head.
	 */
	range = pool->range_list;
	if (range->next_unused + pool->item_len > pool->attr.stride) {
		range = rseq_mempool_range_create(pool);
		if (!range) {
			errno = ENOMEM;
			addr = NULL;
			goto end;
		}
		/* Add range to head of list. */
		range->next = pool->range_list;
		pool->range_list = range;
	}
	/* First range in list has room left. */
	item_offset = range->next_unused;
	addr = (void __rseq_percpu *) (range->base + item_offset);
	range->next_unused += pool->item_len;
end:
	/* Robust-pool bookkeeping, still under the pool lock. */
	if (addr)
		set_alloc_slot(pool, range, item_offset);
	pthread_mutex_unlock(&pool->lock);
	/* Zero/init outside the lock: the item is already owned by the caller. */
	if (addr) {
		if (zeroed)
			rseq_percpu_zero_item(pool, range, item_offset);
		else if (init_ptr) {
			rseq_percpu_init_item(pool, range, item_offset,
					init_ptr, init_len);
		}
	}
	return addr;
}
1107 | ||
/*
 * Allocate one item from @pool.  The per-CPU instances are neither
 * zeroed nor initialized (zeroed=false, no init data).
 */
void __rseq_percpu *rseq_mempool_percpu_malloc(struct rseq_mempool *pool)
{
	return __rseq_percpu_malloc(pool, false, NULL, 0);
}
1112 | ||
/*
 * Allocate one item from @pool and zero it (zeroed=true path of
 * __rseq_percpu_malloc()).
 */
void __rseq_percpu *rseq_mempool_percpu_zmalloc(struct rseq_mempool *pool)
{
	return __rseq_percpu_malloc(pool, true, NULL, 0);
}
1117 | ||
/*
 * Allocate one item from @pool and copy @len bytes from @init_ptr into
 * it.  Passing zeroed=false lets __rseq_percpu_malloc() apply the init
 * data; @len larger than the pool item length fails with EINVAL.
 */
void __rseq_percpu *rseq_mempool_percpu_malloc_init(struct rseq_mempool *pool,
		void *init_ptr, size_t len)
{
	return __rseq_percpu_malloc(pool, false, init_ptr, len);
}
1123 | ||
e7cbbc10 MD |
1124 | /* Always inline for __builtin_return_address(0). */ |
1125 | static inline __attribute__((always_inline)) | |
9d986353 | 1126 | void clear_alloc_slot(struct rseq_mempool *pool, struct rseq_mempool_range *range, size_t item_offset) |
0fdf7a4c | 1127 | { |
9d986353 | 1128 | unsigned long *bitmap = range->alloc_bitmap; |
9649c7ee | 1129 | size_t item_index = item_offset >> pool->item_order; |
0fdf7a4c OD |
1130 | unsigned long mask; |
1131 | size_t k; | |
1132 | ||
9649c7ee | 1133 | if (!bitmap) |
0fdf7a4c | 1134 | return; |
0fdf7a4c | 1135 | |
9649c7ee MD |
1136 | k = item_index / BIT_PER_ULONG; |
1137 | mask = 1ULL << (item_index % BIT_PER_ULONG); | |
0fdf7a4c | 1138 | |
9649c7ee MD |
1139 | /* Print error if bit is not set. */ |
1140 | if (!(bitmap[k] & mask)) { | |
ca452fee MD |
1141 | fprintf(stderr, "%s: Double-free detected for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n", |
1142 | __func__, get_pool_name(pool), pool, item_offset, | |
1143 | (void *) __builtin_return_address(0)); | |
9649c7ee MD |
1144 | abort(); |
1145 | } | |
0fdf7a4c OD |
1146 | bitmap[k] &= ~mask; |
1147 | } | |
1148 | ||
/*
 * Return a per-CPU item to its pool's free list.
 *
 * The owning range is recovered arithmetically: masking the pointer
 * with ~(stride - 1) yields the stride-aligned range base, and the
 * range header sits at a fixed negative offset from it.  @stride must
 * therefore be the same stride the pool was created with.
 */
void librseq_mempool_percpu_free(void __rseq_percpu *_ptr, size_t stride)
{
	uintptr_t ptr = (uintptr_t) _ptr;
	void *range_base = (void *) (ptr & (~(stride - 1)));
	struct rseq_mempool_range *range = (struct rseq_mempool_range *) (range_base - RANGE_HEADER_OFFSET);
	struct rseq_mempool *pool = range->pool;
	uintptr_t item_offset = ptr & (stride - 1);
	struct free_list_node *head, *item;

	pthread_mutex_lock(&pool->lock);
	/* Robust pools: detect double-free before touching the free list. */
	clear_alloc_slot(pool, range, item_offset);
	/* Add ptr to head of free list */
	head = pool->free_list_head;
	if (pool->attr.poison_set)
		rseq_percpu_poison_item(pool, range, item_offset);
	item = __rseq_percpu_to_free_list_ptr(pool, _ptr);
	/*
	 * Setting the next pointer will overwrite the first uintptr_t
	 * poison for either CPU 0 (populate all) or init data (populate
	 * none).
	 */
	item->next = head;
	pool->free_list_head = item;
	pthread_mutex_unlock(&pool->lock);
}
1174 | ||
0ba2a93e | 1175 | struct rseq_mempool_set *rseq_mempool_set_create(void) |
ef6695f1 | 1176 | { |
0ba2a93e | 1177 | struct rseq_mempool_set *pool_set; |
ef6695f1 | 1178 | |
0ba2a93e | 1179 | pool_set = calloc(1, sizeof(struct rseq_mempool_set)); |
ef6695f1 MD |
1180 | if (!pool_set) |
1181 | return NULL; | |
1182 | pthread_mutex_init(&pool_set->lock, NULL); | |
1183 | return pool_set; | |
1184 | } | |
1185 | ||
/*
 * Destroy every pool owned by the set, then the set itself.
 *
 * Stops at the first rseq_mempool_destroy() failure and returns its
 * error; already-destroyed entries are NULLed so a retry skips them,
 * while the remaining pools and the set stay intact.
 * Returns 0 on success.
 */
int rseq_mempool_set_destroy(struct rseq_mempool_set *pool_set)
{
	int order, ret;

	for (order = POOL_SET_MIN_ENTRY; order < POOL_SET_NR_ENTRIES; order++) {
		struct rseq_mempool *pool = pool_set->entries[order];

		if (!pool)
			continue;
		ret = rseq_mempool_destroy(pool);
		if (ret)
			return ret;
		pool_set->entries[order] = NULL;
	}
	pthread_mutex_destroy(&pool_set->lock);
	free(pool_set);
	return 0;
}
1204 | ||
1205 | /* Ownership of pool is handed over to pool set on success. */ | |
0ba2a93e | 1206 | int rseq_mempool_set_add_pool(struct rseq_mempool_set *pool_set, struct rseq_mempool *pool) |
ef6695f1 MD |
1207 | { |
1208 | size_t item_order = pool->item_order; | |
1209 | int ret = 0; | |
1210 | ||
1211 | pthread_mutex_lock(&pool_set->lock); | |
1212 | if (pool_set->entries[item_order]) { | |
1213 | errno = EBUSY; | |
1214 | ret = -1; | |
1215 | goto end; | |
1216 | } | |
1217 | pool_set->entries[pool->item_order] = pool; | |
1218 | end: | |
1219 | pthread_mutex_unlock(&pool_set->lock); | |
1220 | return ret; | |
1221 | } | |
1222 | ||
/*
 * Allocate @len bytes from the smallest pool in the set whose item
 * length fits.  If that pool is exhausted (allocation fails with
 * ENOMEM), retry from the next larger size class.
 *
 * The set lock only protects the entries table; the chosen pool has
 * its own lock, so the set lock is dropped before allocating.
 *
 * Returns NULL with errno = ENOMEM when no (remaining) pool fits @len.
 */
static
void __rseq_percpu *__rseq_mempool_set_malloc(struct rseq_mempool_set *pool_set,
		void *init_ptr, size_t len, bool zeroed)
{
	int order, min_order = POOL_SET_MIN_ENTRY;
	struct rseq_mempool *pool;
	void __rseq_percpu *addr;

	order = rseq_get_count_order_ulong(len);
	if (order > POOL_SET_MIN_ENTRY)
		min_order = order;
again:
	pthread_mutex_lock(&pool_set->lock);
	/* First smallest present pool where @len fits. */
	for (order = min_order; order < POOL_SET_NR_ENTRIES; order++) {
		pool = pool_set->entries[order];

		if (!pool)
			continue;
		if (pool->item_len >= len)
			goto found;
	}
	pool = NULL;
found:
	pthread_mutex_unlock(&pool_set->lock);
	if (pool) {
		addr = __rseq_percpu_malloc(pool, zeroed, init_ptr, len);
		if (addr == NULL && errno == ENOMEM) {
			/*
			 * If the allocation failed, try again with a
			 * larger pool.
			 */
			min_order = order + 1;
			goto again;
		}
	} else {
		/* Not found. */
		errno = ENOMEM;
		addr = NULL;
	}
	return addr;
}
1265 | ||
/*
 * Allocate @len bytes from the pool set without zeroing or init data.
 */
void __rseq_percpu *rseq_mempool_set_percpu_malloc(struct rseq_mempool_set *pool_set, size_t len)
{
	return __rseq_mempool_set_malloc(pool_set, NULL, len, false);
}
1270 | ||
/*
 * Allocate @len bytes from the pool set and zero the item
 * (zeroed=true path of __rseq_percpu_malloc()).
 */
void __rseq_percpu *rseq_mempool_set_percpu_zmalloc(struct rseq_mempool_set *pool_set, size_t len)
{
	return __rseq_mempool_set_malloc(pool_set, NULL, len, true);
}
1275 | ||
1276 | void __rseq_percpu *rseq_mempool_set_percpu_malloc_init(struct rseq_mempool_set *pool_set, | |
1277 | void *init_ptr, size_t len) | |
1278 | { | |
1279 | return __rseq_mempool_set_malloc(pool_set, init_ptr, len, true); | |
ef6695f1 | 1280 | } |
9bd07c29 | 1281 | |
0ba2a93e | 1282 | struct rseq_mempool_attr *rseq_mempool_attr_create(void) |
a82006d0 | 1283 | { |
0ba2a93e | 1284 | return calloc(1, sizeof(struct rseq_mempool_attr)); |
a82006d0 MD |
1285 | } |
1286 | ||
/*
 * Free an attribute object created by rseq_mempool_attr_create().
 * Accepts NULL (free(NULL) is a no-op).
 */
void rseq_mempool_attr_destroy(struct rseq_mempool_attr *attr)
{
	free(attr);
}
1291 | ||
/*
 * Select custom mmap/munmap callbacks (with @mmap_priv passed through)
 * used to reserve and release pool memory, overriding the defaults.
 * Returns 0 on success, -1 with errno = EINVAL if @attr is NULL.
 */
int rseq_mempool_attr_set_mmap(struct rseq_mempool_attr *attr,
		void *(*mmap_func)(void *priv, size_t len),
		int (*munmap_func)(void *priv, void *ptr, size_t len),
		void *mmap_priv)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->mmap_set = true;
	attr->mmap_func = mmap_func;
	attr->munmap_func = munmap_func;
	attr->mmap_priv = mmap_priv;
	return 0;
}
d6acc8aa | 1307 | |
/*
 * Register an initialization callback invoked for each range at
 * creation (once per CPU for per-CPU pools, with cpu = -1 for global
 * pools; see rseq_mempool_range_create()).
 * Returns 0 on success, -1 with errno = EINVAL if @attr is NULL.
 */
int rseq_mempool_attr_set_init(struct rseq_mempool_attr *attr,
		int (*init_func)(void *priv, void *addr, size_t len, int cpu),
		void *init_priv)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->init_set = true;
	attr->init_func = init_func;
	attr->init_priv = init_priv;
	return 0;
}
1321 | ||
0ba2a93e | 1322 | int rseq_mempool_attr_set_robust(struct rseq_mempool_attr *attr) |
d6acc8aa MD |
1323 | { |
1324 | if (!attr) { | |
1325 | errno = EINVAL; | |
1326 | return -1; | |
1327 | } | |
1328 | attr->robust_set = true; | |
1329 | return 0; | |
1330 | } | |
cb475906 MD |
1331 | |
/*
 * Configure a per-CPU pool: @stride bytes of virtual address space per
 * CPU, for up to @max_nr_cpus CPUs (0 means auto-detect at pool
 * creation; see rseq_mempool_create()).
 * Returns 0 on success, -1 with errno = EINVAL if @attr is NULL.
 */
int rseq_mempool_attr_set_percpu(struct rseq_mempool_attr *attr,
		size_t stride, int max_nr_cpus)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->type = MEMPOOL_TYPE_PERCPU;
	attr->stride = stride;
	attr->max_nr_cpus = max_nr_cpus;
	return 0;
}
1344 | ||
/*
 * Configure a global (non-per-CPU) pool with the given @stride.
 * max_nr_cpus is reset here; rseq_mempool_create() later forces a
 * 1-cpu layout for this type.
 * Returns 0 on success, -1 with errno = EINVAL if @attr is NULL.
 */
int rseq_mempool_attr_set_global(struct rseq_mempool_attr *attr,
		size_t stride)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->type = MEMPOOL_TYPE_GLOBAL;
	attr->stride = stride;
	attr->max_nr_cpus = 0;
	return 0;
}
6037d364 | 1357 | |
/*
 * Cap the number of ranges a pool may grow to (0 means unlimited;
 * enforced in rseq_mempool_range_create() with ENOMEM).
 * Returns 0 on success, -1 with errno = EINVAL if @attr is NULL.
 */
int rseq_mempool_attr_set_max_nr_ranges(struct rseq_mempool_attr *attr,
		unsigned long max_nr_ranges)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->max_nr_ranges = max_nr_ranges;
	return 0;
}
1368 | ||
/*
 * Set the poison word written over freed items and validated on
 * reallocation (see rseq_percpu_poison_item() /
 * rseq_percpu_check_poison_item() usage in this file).
 * Returns 0 on success, -1 with errno = EINVAL if @attr is NULL.
 */
int rseq_mempool_attr_set_poison(struct rseq_mempool_attr *attr,
		uintptr_t poison)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->poison_set = true;
	attr->poison = poison;
	return 0;
}
1380 | ||
/*
 * Select the page populate policy.  Policies other than
 * RSEQ_MEMPOOL_POPULATE_ALL use the shared memfd-backed init pages
 * (see rseq_mempool_memfd_ref()); global pools override this to
 * POPULATE_ALL at creation.
 * Returns 0 on success, -1 with errno = EINVAL if @attr is NULL.
 */
int rseq_mempool_attr_set_populate_policy(struct rseq_mempool_attr *attr,
		enum rseq_mempool_populate_policy policy)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->populate_policy = policy;
	return 0;
}
1391 | ||
/*
 * Query the number of CPUs a per-CPU pool was created for.
 * Returns -1 with errno = EINVAL for NULL or non-per-CPU pools.
 */
int rseq_mempool_get_max_nr_cpus(struct rseq_mempool *mempool)
{
	if (!mempool || mempool->attr.type != MEMPOOL_TYPE_PERCPU) {
		errno = EINVAL;
		return -1;
	}
	return mempool->attr.max_nr_cpus;
}