// SPDX-License-Identifier: MIT
// SPDX-FileCopyrightText: 2024 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>

#include <rseq/percpu-alloc.h>
#include <sys/mman.h>
#include <assert.h>
#include <string.h>
#include <pthread.h>
#include <unistd.h>
#include <stdlib.h>
#include <rseq/compiler.h>
#include <errno.h>
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

#ifdef HAVE_LIBNUMA
# include <numa.h>
# include <numaif.h>
#endif

#include "rseq-alloc-utils.h"

/*
 * rseq-percpu-alloc.c: rseq CPU-Local Storage (CLS) memory allocator.
 *
 * The rseq per-CPU memory allocator allows the application to request
 * memory pools of CPU-Local memory, each containing objects of a given
 * size (rounded up to the next power of 2), reserving a given virtual
 * address size per CPU, for a given maximum number of CPUs.
 *
 * The per-CPU memory allocator is analogous to TLS (Thread-Local
 * Storage) memory: TLS provides Thread-Local Storage, whereas the
 * per-CPU memory allocator provides CPU-Local Storage.
 */
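
/*
 * Illustrative usage sketch (not part of this file): allocate one
 * per-CPU counter, increment the copy belonging to the current CPU,
 * then release everything. The struct name, the pool parameters and
 * the use of sched_getcpu() are assumptions made for this example;
 * any CPU index below max_nr_cpus is valid for __rseq_percpu_ptr().
 *
 *	struct counter { intptr_t count; };
 *
 *	struct rseq_percpu_pool *pool;
 *	struct counter __rseq_percpu *c;
 *	struct counter *cpu_c;
 *
 *	pool = rseq_percpu_pool_create(sizeof(struct counter),
 *			rseq_get_page_len(), sysconf(_SC_NPROCESSORS_CONF),
 *			PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS,
 *			-1, 0, 0);
 *	c = (struct counter __rseq_percpu *) rseq_percpu_zmalloc(pool);
 *	cpu_c = (struct counter *) __rseq_percpu_ptr(c, sched_getcpu());
 *	cpu_c->count++;
 *	rseq_percpu_free(c);
 *	rseq_percpu_pool_destroy(pool);
 */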

/*
 * Use high bits of per-CPU addresses to index the pool.
 * This leaves the low bits available to the application for pointer
 * tagging (based on next power of 2 alignment of the allocations).
 */
#if RSEQ_BITS_PER_LONG == 64
# define POOL_INDEX_BITS	16
#else
# define POOL_INDEX_BITS	8
#endif
#define MAX_NR_POOLS		(1UL << POOL_INDEX_BITS)
#define POOL_INDEX_SHIFT	(RSEQ_BITS_PER_LONG - POOL_INDEX_BITS)
#define MAX_POOL_LEN		(1UL << POOL_INDEX_SHIFT)
#define MAX_POOL_LEN_MASK	(MAX_POOL_LEN - 1)
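
/*
 * Layout sketch of a tagged __rseq_percpu pointer, assuming a 64-bit
 * build (POOL_INDEX_BITS = 16, hence POOL_INDEX_SHIFT = 48). The pool
 * index 3 and item offset 0x40 below are arbitrary example values:
 *
 *	ptr = ((uintptr_t) 3 << POOL_INDEX_SHIFT) | 0x40;
 *	pool index  = ptr >> POOL_INDEX_SHIFT;		// -> 3
 *	item offset = ptr & MAX_POOL_LEN_MASK;		// -> 0x40
 *
 * __rseq_percpu_ptr() performs this decoding, then adds the per-CPU
 * stride (pool->percpu_len * cpu) to reach the CPU-local copy.
 */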

#define POOL_SET_NR_ENTRIES	POOL_INDEX_SHIFT

/*
 * Smallest allocation should hold enough space for a free list pointer.
 */
#if RSEQ_BITS_PER_LONG == 64
# define POOL_SET_MIN_ENTRY	3	/* Smallest item_len=8 */
#else
# define POOL_SET_MIN_ENTRY	2	/* Smallest item_len=4 */
#endif

/*
 * Skip pool index 0 to ensure allocated entries at index 0 do not match
 * a NULL pointer.
 */
#define FIRST_POOL		1

struct free_list_node;

struct free_list_node {
        struct free_list_node *next;
};

/* This lock protects pool create/destroy. */
static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;

struct rseq_percpu_pool {
        void *base;
        unsigned int index;
        size_t item_len;
        size_t percpu_len;
        int item_order;
        int max_nr_cpus;

        /*
         * The free list chains freed items on the CPU 0 address range.
         * We should rethink this decision if false sharing between
         * malloc/free from other CPUs and data accesses from CPU 0
         * becomes an issue. This is a NULL-terminated singly-linked
         * list.
         */
        struct free_list_node *free_list_head;
        size_t next_unused;
        /* This lock protects allocation/free within the pool. */
        pthread_mutex_t lock;
};

//TODO: the array of pools should grow dynamically on create.
static struct rseq_percpu_pool rseq_percpu_pool[MAX_NR_POOLS];

/*
 * Pool set entries are indexed by item_len rounded to the next power of
 * 2. A pool set can contain NULL pool entries, in which case the next
 * entry that is large enough will be used for allocation.
 */
struct rseq_percpu_pool_set {
        /* This lock protects add vs malloc/zmalloc within the pool set. */
        pthread_mutex_t lock;
        struct rseq_percpu_pool *entries[POOL_SET_NR_ENTRIES];
};

static
void *__rseq_pool_percpu_ptr(struct rseq_percpu_pool *pool, int cpu, uintptr_t item_offset)
{
        return pool->base + (pool->percpu_len * cpu) + item_offset;
}

ptrdiff_t rseq_percpu_pool_ptr_offset(struct rseq_percpu_pool *pool, int cpu)
{
        uintptr_t rseq_percpu_base = (uintptr_t) pool->index << POOL_INDEX_SHIFT;
        uintptr_t refptr = (uintptr_t) __rseq_pool_percpu_ptr(pool, cpu, 0);

        return (ptrdiff_t) (refptr - rseq_percpu_base);
}

void *__rseq_percpu_ptr(void __rseq_percpu *_ptr, int cpu)
{
        uintptr_t ptr = (uintptr_t) _ptr;
        uintptr_t item_offset = ptr & MAX_POOL_LEN_MASK;
        uintptr_t pool_index = ptr >> POOL_INDEX_SHIFT;
        struct rseq_percpu_pool *pool = &rseq_percpu_pool[pool_index];

        assert(cpu >= 0);
        return __rseq_pool_percpu_ptr(pool, cpu, item_offset);
}

static
void rseq_percpu_zero_item(struct rseq_percpu_pool *pool, uintptr_t item_offset)
{
        int i;

        for (i = 0; i < pool->max_nr_cpus; i++) {
                char *p = __rseq_pool_percpu_ptr(pool, i, item_offset);
                memset(p, 0, pool->item_len);
        }
}

#ifdef HAVE_LIBNUMA
static
void rseq_percpu_pool_init_numa(struct rseq_percpu_pool *pool, int numa_flags)
{
        unsigned long nr_pages, page;
        long ret, page_len;
        int cpu;

        if (!numa_flags)
                return;
        page_len = rseq_get_page_len();
        nr_pages = pool->percpu_len >> rseq_get_count_order_ulong(page_len);
        for (cpu = 0; cpu < pool->max_nr_cpus; cpu++) {
                int node = numa_node_of_cpu(cpu);

                /* TODO: batch move_pages() call with an array of pages. */
                for (page = 0; page < nr_pages; page++) {
                        void *pageptr = __rseq_pool_percpu_ptr(pool, cpu, page * page_len);
                        int status = -EPERM;

                        ret = move_pages(0, 1, &pageptr, &node, &status, numa_flags);
                        if (ret) {
                                perror("move_pages");
                                abort();
                        }
                }
        }
}
#else
static
void rseq_percpu_pool_init_numa(struct rseq_percpu_pool *pool __attribute__((unused)),
                int numa_flags __attribute__((unused)))
{
}
#endif

/*
 * Expected numa_flags:
 * 0: do not move pages to specific numa nodes (use for e.g. mm_cid indexing).
 * MPOL_MF_MOVE: move process-private pages to cpu-specific numa nodes.
 * MPOL_MF_MOVE_ALL: move shared pages to cpu-specific numa nodes (requires CAP_SYS_NICE).
 */
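/*
 * Creation sketch (illustrative only): a pool of one page per CPU,
 * backed by private anonymous memory, with pages migrated next to each
 * CPU when the library is built with libnuma support. The percpu_len
 * of one page and the CPU count query are example assumptions:
 *
 *	pool = rseq_percpu_pool_create(item_len, rseq_get_page_len(),
 *			sysconf(_SC_NPROCESSORS_CONF),
 *			PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS,
 *			-1, 0, MPOL_MF_MOVE);
 */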
struct rseq_percpu_pool *rseq_percpu_pool_create(size_t item_len,
                size_t percpu_len, int max_nr_cpus,
                int mmap_prot, int mmap_flags, int mmap_fd,
                off_t mmap_offset, int numa_flags)
{
        struct rseq_percpu_pool *pool;
        void *base;
        unsigned int i;
        int order;

        /* Make sure each item is large enough to contain free list pointers. */
        if (item_len < sizeof(void *))
                item_len = sizeof(void *);

        /* Align item_len on next power of two. */
        order = rseq_get_count_order_ulong(item_len);
        if (order < 0) {
                errno = EINVAL;
                return NULL;
        }
        item_len = 1UL << order;

        /* Align percpu_len on page size. */
        percpu_len = rseq_align(percpu_len, rseq_get_page_len());

        if (max_nr_cpus < 0 || item_len > percpu_len ||
                        percpu_len > (UINTPTR_MAX >> POOL_INDEX_BITS)) {
                errno = EINVAL;
                return NULL;
        }

        pthread_mutex_lock(&pool_lock);
        /* Linear scan in array of pools to find empty spot. */
        for (i = FIRST_POOL; i < MAX_NR_POOLS; i++) {
                pool = &rseq_percpu_pool[i];
                if (!pool->base)
                        goto found_empty;
        }
        errno = ENOMEM;
        pool = NULL;
        goto end;

found_empty:
        base = mmap(NULL, percpu_len * max_nr_cpus, mmap_prot,
                        mmap_flags, mmap_fd, mmap_offset);
        if (base == MAP_FAILED) {
                pool = NULL;
                goto end;
        }
        pthread_mutex_init(&pool->lock, NULL);
        pool->base = base;
        pool->percpu_len = percpu_len;
        pool->max_nr_cpus = max_nr_cpus;
        pool->index = i;
        pool->item_len = item_len;
        pool->item_order = order;
        /*
         * Initialize NUMA page placement only after the pool layout
         * fields (base, percpu_len, max_nr_cpus) are populated, since
         * rseq_percpu_pool_init_numa() relies on them.
         */
        rseq_percpu_pool_init_numa(pool, numa_flags);
end:
        pthread_mutex_unlock(&pool_lock);
        return pool;
}

int rseq_percpu_pool_destroy(struct rseq_percpu_pool *pool)
{
        int ret;

        pthread_mutex_lock(&pool_lock);
        if (!pool->base) {
                errno = ENOENT;
                ret = -1;
                goto end;
        }
        ret = munmap(pool->base, pool->percpu_len * pool->max_nr_cpus);
        if (ret)
                goto end;
        pthread_mutex_destroy(&pool->lock);
        memset(pool, 0, sizeof(*pool));
end:
        pthread_mutex_unlock(&pool_lock);
        return ret;
}

static
void __rseq_percpu *__rseq_percpu_malloc(struct rseq_percpu_pool *pool, bool zeroed)
{
        struct free_list_node *node;
        uintptr_t item_offset;
        void __rseq_percpu *addr;

        pthread_mutex_lock(&pool->lock);
        /* Get first entry from free list. */
        node = pool->free_list_head;
        if (node != NULL) {
                /* Remove node from free list (update head). */
                pool->free_list_head = node->next;
                item_offset = (uintptr_t) ((void *) node - pool->base);
                addr = (void *) (((uintptr_t) pool->index << POOL_INDEX_SHIFT) | item_offset);
                goto end;
        }
        if (pool->next_unused + pool->item_len > pool->percpu_len) {
                errno = ENOMEM;
                addr = NULL;
                goto end;
        }
        item_offset = pool->next_unused;
        addr = (void *) (((uintptr_t) pool->index << POOL_INDEX_SHIFT) | item_offset);
        pool->next_unused += pool->item_len;
end:
        pthread_mutex_unlock(&pool->lock);
        if (zeroed && addr)
                rseq_percpu_zero_item(pool, item_offset);
        return addr;
}

void __rseq_percpu *rseq_percpu_malloc(struct rseq_percpu_pool *pool)
{
        return __rseq_percpu_malloc(pool, false);
}

void __rseq_percpu *rseq_percpu_zmalloc(struct rseq_percpu_pool *pool)
{
        return __rseq_percpu_malloc(pool, true);
}

void rseq_percpu_free(void __rseq_percpu *_ptr)
{
        uintptr_t ptr = (uintptr_t) _ptr;
        uintptr_t item_offset = ptr & MAX_POOL_LEN_MASK;
        uintptr_t pool_index = ptr >> POOL_INDEX_SHIFT;
        struct rseq_percpu_pool *pool = &rseq_percpu_pool[pool_index];
        struct free_list_node *head, *item;

        pthread_mutex_lock(&pool->lock);
        /* Add ptr to head of free list. */
        head = pool->free_list_head;
        /* Free-list is in CPU 0 range. */
        item = (struct free_list_node *) __rseq_pool_percpu_ptr(pool, 0, item_offset);
        item->next = head;
        pool->free_list_head = item;
        pthread_mutex_unlock(&pool->lock);
}

struct rseq_percpu_pool_set *rseq_percpu_pool_set_create(void)
{
        struct rseq_percpu_pool_set *pool_set;

        pool_set = calloc(1, sizeof(struct rseq_percpu_pool_set));
        if (!pool_set)
                return NULL;
        pthread_mutex_init(&pool_set->lock, NULL);
        return pool_set;
}

int rseq_percpu_pool_set_destroy(struct rseq_percpu_pool_set *pool_set)
{
        int order, ret;

        for (order = POOL_SET_MIN_ENTRY; order < POOL_SET_NR_ENTRIES; order++) {
                struct rseq_percpu_pool *pool = pool_set->entries[order];

                if (!pool)
                        continue;
                ret = rseq_percpu_pool_destroy(pool);
                if (ret)
                        return ret;
                pool_set->entries[order] = NULL;
        }
        pthread_mutex_destroy(&pool_set->lock);
        free(pool_set);
        return 0;
}

/* Ownership of pool is handed over to pool set on success. */
int rseq_percpu_pool_set_add_pool(struct rseq_percpu_pool_set *pool_set, struct rseq_percpu_pool *pool)
{
        size_t item_order = pool->item_order;
        int ret = 0;

        pthread_mutex_lock(&pool_set->lock);
        if (pool_set->entries[item_order]) {
                errno = EBUSY;
                ret = -1;
                goto end;
        }
        pool_set->entries[item_order] = pool;
end:
        pthread_mutex_unlock(&pool_set->lock);
        return ret;
}

static
void __rseq_percpu *__rseq_percpu_pool_set_malloc(struct rseq_percpu_pool_set *pool_set, size_t len, bool zeroed)
{
        int order, min_order = POOL_SET_MIN_ENTRY;
        struct rseq_percpu_pool *pool;
        void __rseq_percpu *addr;

        order = rseq_get_count_order_ulong(len);
        if (order > POOL_SET_MIN_ENTRY)
                min_order = order;
again:
        pthread_mutex_lock(&pool_set->lock);
        /* Find the smallest present pool where @len fits. */
        for (order = min_order; order < POOL_SET_NR_ENTRIES; order++) {
                pool = pool_set->entries[order];

                if (!pool)
                        continue;
                if (pool->item_len >= len)
                        goto found;
        }
        pool = NULL;
found:
        pthread_mutex_unlock(&pool_set->lock);
        if (pool) {
                addr = __rseq_percpu_malloc(pool, zeroed);
                if (addr == NULL && errno == ENOMEM) {
                        /*
                         * If the allocation failed, try again with a
                         * larger pool.
                         */
                        min_order = order + 1;
                        goto again;
                }
        } else {
                /* Not found. */
                errno = ENOMEM;
                addr = NULL;
        }
        return addr;
}

void __rseq_percpu *rseq_percpu_pool_set_malloc(struct rseq_percpu_pool_set *pool_set, size_t len)
{
        return __rseq_percpu_pool_set_malloc(pool_set, len, false);
}

void __rseq_percpu *rseq_percpu_pool_set_zmalloc(struct rseq_percpu_pool_set *pool_set, size_t len)
{
        return __rseq_percpu_pool_set_malloc(pool_set, len, true);
}
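
/*
 * Pool set usage sketch (illustrative only): pools of different item
 * sizes are grouped in a set so an allocation is served by the smallest
 * pool that fits the requested length. The pool parameters and the
 * max_nr_cpus variable below are assumptions made for this example, and
 * error handling is omitted:
 *
 *	struct rseq_percpu_pool_set *set = rseq_percpu_pool_set_create();
 *
 *	rseq_percpu_pool_set_add_pool(set,
 *			rseq_percpu_pool_create(16, rseq_get_page_len(),
 *					max_nr_cpus, PROT_READ | PROT_WRITE,
 *					MAP_PRIVATE | MAP_ANONYMOUS, -1, 0, 0));
 *	rseq_percpu_pool_set_add_pool(set,
 *			rseq_percpu_pool_create(128, rseq_get_page_len(),
 *					max_nr_cpus, PROT_READ | PROT_WRITE,
 *					MAP_PRIVATE | MAP_ANONYMOUS, -1, 0, 0));
 *
 *	// Served from the 128-byte pool (next power of 2 >= 100).
 *	void __rseq_percpu *p = rseq_percpu_pool_set_zmalloc(set, 100);
 *
 *	rseq_percpu_free(p);
 *	rseq_percpu_pool_set_destroy(set);
 */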