// SPDX-License-Identifier: MIT
// SPDX-FileCopyrightText: 2024 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>

#include <rseq/percpu-alloc.h>
#include <sys/mman.h>
#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <rseq/compiler.h>

#ifdef HAVE_LIBNUMA
# include <numa.h>
# include <numaif.h>
#endif

#include "rseq-alloc-utils.h"
/*
 * rseq-percpu-alloc.c: rseq CPU-Local Storage (CLS) memory allocator.
 *
 * The rseq per-CPU memory allocator allows the application to request
 * memory pools of CPU-Local memory, each containing objects of a
 * given size (rounded up to the next power of 2), reserving a given
 * virtual address size per CPU, for a given maximum number of CPUs.
 *
 * The per-CPU memory allocator is analogous to TLS (Thread-Local
 * Storage) memory: TLS is Thread-Local Storage, whereas the per-CPU
 * memory allocator provides CPU-Local Storage.
 */
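/*
 * Example usage (illustrative sketch only: struct test_data, the pool
 * length, the sysconf() CPU count query and the error handling are
 * assumptions made for this example; rseq_percpu_ptr() is assumed to
 * be the typed wrapper around __rseq_percpu_ptr() declared in
 * <rseq/percpu-alloc.h>):
 *
 *	struct test_data { int value; };
 *
 *	struct rseq_percpu_pool *pool;
 *	struct test_data __rseq_percpu *data;
 *
 *	pool = rseq_percpu_pool_create(sizeof(struct test_data),
 *			16 * 4096, sysconf(_SC_NPROCESSORS_CONF),
 *			PROT_READ | PROT_WRITE,
 *			MAP_ANONYMOUS | MAP_PRIVATE, -1, 0, 0);
 *	if (!pool)
 *		abort();
 *	data = (struct test_data __rseq_percpu *) rseq_percpu_zmalloc(pool);
 *	if (!data)
 *		abort();
 *	// Increment the CPU-local copy of "value" for a given cpu:
 *	rseq_percpu_ptr(data, cpu)->value++;
 *	rseq_percpu_free(data);
 *	(void) rseq_percpu_pool_destroy(pool);
 */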
/*
 * Use high bits of per-CPU addresses to index the pool.
 * This leaves the low bits available to the application for pointer
 * tagging (based on next power of 2 alignment of the allocations).
 */
#if RSEQ_BITS_PER_LONG == 64
# define POOL_INDEX_BITS	16
#else
# define POOL_INDEX_BITS	8
#endif
#define MAX_NR_POOLS		(1UL << POOL_INDEX_BITS)
#define POOL_INDEX_SHIFT	(RSEQ_BITS_PER_LONG - POOL_INDEX_BITS)
#define MAX_POOL_LEN		(1UL << POOL_INDEX_SHIFT)
#define MAX_POOL_LEN_MASK	(MAX_POOL_LEN - 1)
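/*
 * Worked example of the address encoding on a 64-bit target (values
 * chosen purely for illustration): with POOL_INDEX_BITS = 16 and
 * POOL_INDEX_SHIFT = 48, a per-CPU pointer for pool index 2 and item
 * offset 0x40 is encoded and decoded as:
 *
 *	encoded     = (2UL << POOL_INDEX_SHIFT) | 0x40;
 *	pool index  = encoded >> POOL_INDEX_SHIFT;	(2)
 *	item offset = encoded & MAX_POOL_LEN_MASK;	(0x40)
 *
 * Because allocations are aligned on their power-of-2 item_len, the
 * low log2(item_len) bits of the offset are zero and may be used by
 * the application for tagging, provided they are cleared before the
 * pointer is passed back to __rseq_percpu_ptr() or rseq_percpu_free().
 */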
#define POOL_SET_NR_ENTRIES	POOL_INDEX_SHIFT
/*
 * Smallest allocation should hold enough space for a free list pointer.
 */
#if RSEQ_BITS_PER_LONG == 64
# define POOL_SET_MIN_ENTRY	3	/* Smallest item_len=8 */
#else
# define POOL_SET_MIN_ENTRY	2	/* Smallest item_len=4 */
#endif
/*
 * Skip pool index 0 to ensure allocated entries at index 0 do not match
 * a NULL pointer.
 */
#define FIRST_POOL		1
struct free_list_node;

struct free_list_node {
	struct free_list_node *next;
};
/* This lock protects pool create/destroy. */
static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
struct rseq_percpu_pool {
	void *base;
	unsigned int index;
	size_t item_len;
	size_t percpu_len;
	int item_order;
	int max_nr_cpus;

	/*
	 * The free list chains freed items on the CPU 0 address range.
	 * We should rethink this decision if false sharing between
	 * malloc/free from other CPUs and data accesses from CPU 0
	 * becomes an issue. This is a NULL-terminated singly-linked
	 * list.
	 */
	struct free_list_node *free_list_head;
	size_t next_unused;
	/* This lock protects allocation/free within the pool. */
	pthread_mutex_t lock;
};
// TODO: the array of pools should grow dynamically on create.
static struct rseq_percpu_pool rseq_percpu_pool[MAX_NR_POOLS];
/*
 * Pool set entries are indexed by item_len rounded to the next power of
 * 2. A pool set can contain NULL pool entries, in which case the next
 * large enough entry will be used for allocation.
 */
struct rseq_percpu_pool_set {
	/* This lock protects add vs malloc/zmalloc within the pool set. */
	pthread_mutex_t lock;
	struct rseq_percpu_pool *entries[POOL_SET_NR_ENTRIES];
};
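/*
 * Example of the pool set index mapping (illustrative): entries[] is
 * indexed by the order (log2) of the rounded-up item length, so on a
 * 64-bit target entries[3] holds the pool serving 8-byte items,
 * entries[4] the pool serving 16-byte items, and so on. A request for
 * len = 24 rounds up to order 5 (32 bytes); if entries[5] is NULL, the
 * next populated entry whose item_len >= 24 is used instead.
 */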
void *__rseq_pool_percpu_ptr(struct rseq_percpu_pool *pool, int cpu,
		uintptr_t item_offset)
{
	return pool->base + (pool->percpu_len * cpu) + item_offset;
}
ptrdiff_t rseq_percpu_pool_ptr_offset(struct rseq_percpu_pool *pool, int cpu)
{
	uintptr_t rseq_percpu_base = (uintptr_t) pool->index << POOL_INDEX_SHIFT;
	uintptr_t refptr = (uintptr_t) __rseq_pool_percpu_ptr(pool, cpu, 0);

	return (ptrdiff_t) (refptr - rseq_percpu_base);
}
void *__rseq_percpu_ptr(void __rseq_percpu *_ptr, int cpu)
{
	uintptr_t ptr = (uintptr_t) _ptr;
	uintptr_t item_offset = ptr & MAX_POOL_LEN_MASK;
	uintptr_t pool_index = ptr >> POOL_INDEX_SHIFT;
	struct rseq_percpu_pool *pool = &rseq_percpu_pool[pool_index];

	return __rseq_pool_percpu_ptr(pool, cpu, item_offset);
}
void rseq_percpu_zero_item(struct rseq_percpu_pool *pool, uintptr_t item_offset)
{
	int i;

	for (i = 0; i < pool->max_nr_cpus; i++) {
		char *p = __rseq_pool_percpu_ptr(pool, i, item_offset);

		memset(p, 0, pool->item_len);
	}
}
#ifdef HAVE_LIBNUMA
void rseq_percpu_pool_init_numa(struct rseq_percpu_pool *pool, int numa_flags)
{
	unsigned long nr_pages, page;
	long ret, page_len;
	int cpu;

	if (!numa_flags)
		return;
	page_len = rseq_get_page_len();
	nr_pages = pool->percpu_len >> rseq_get_count_order_ulong(page_len);
	for (cpu = 0; cpu < pool->max_nr_cpus; cpu++) {
		int node = numa_node_of_cpu(cpu);
		int status = -1;

		/* TODO: batch move_pages() call with an array of pages. */
		for (page = 0; page < nr_pages; page++) {
			void *pageptr = __rseq_pool_percpu_ptr(pool, cpu, page * page_len);

			ret = move_pages(0, 1, &pageptr, &node, &status, numa_flags);
			if (ret)
				perror("move_pages");
		}
	}
}
#else
void rseq_percpu_pool_init_numa(struct rseq_percpu_pool *pool __attribute__((unused)),
		int numa_flags __attribute__((unused)))
{
}
#endif
/*
 * Expected numa_flags:
 * 0: do not move pages to specific numa nodes (use for e.g. mm_cid indexing).
 * MPOL_MF_MOVE: move process-private pages to cpu-specific numa nodes.
 * MPOL_MF_MOVE_ALL: move shared pages to cpu-specific numa nodes (requires CAP_SYS_NICE).
 */
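/*
 * Illustrative example (assumptions for this example: struct test_data,
 * the pool length and the CPU count query are placeholders, and the
 * program is built with HAVE_LIBNUMA): passing MPOL_MF_MOVE as
 * numa_flags asks rseq_percpu_pool_init_numa() to migrate each CPU's
 * slice of the mapping to that CPU's NUMA node:
 *
 *	pool = rseq_percpu_pool_create(sizeof(struct test_data),
 *			16 * 4096, sysconf(_SC_NPROCESSORS_CONF),
 *			PROT_READ | PROT_WRITE,
 *			MAP_ANONYMOUS | MAP_PRIVATE, -1, 0, MPOL_MF_MOVE);
 */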
struct rseq_percpu_pool *rseq_percpu_pool_create(size_t item_len,
		size_t percpu_len, int max_nr_cpus,
		int mmap_prot, int mmap_flags, int mmap_fd,
		off_t mmap_offset, int numa_flags)
{
	struct rseq_percpu_pool *pool;
	void *base;
	unsigned int i;
	int order;

	/* Make sure each item is large enough to contain free list pointers. */
	if (item_len < sizeof(void *))
		item_len = sizeof(void *);

	/* Align item_len on next power of two. */
	order = rseq_get_count_order_ulong(item_len);
	item_len = 1UL << order;

	/* Align percpu_len on page size. */
	percpu_len = rseq_align(percpu_len, rseq_get_page_len());

	if (max_nr_cpus < 0 || item_len > percpu_len ||
			percpu_len > (UINTPTR_MAX >> POOL_INDEX_BITS)) {
		errno = EINVAL;
		return NULL;
	}
	pthread_mutex_lock(&pool_lock);
	/* Linear scan in array of pools to find empty spot. */
	for (i = FIRST_POOL; i < MAX_NR_POOLS; i++) {
		pool = &rseq_percpu_pool[i];
		if (!pool->base)
			goto found_empty;
	}
	errno = ENOMEM;
	pool = NULL;
	goto end;

found_empty:
	base = mmap(NULL, percpu_len * max_nr_cpus, mmap_prot,
			mmap_flags, mmap_fd, mmap_offset);
	if (base == MAP_FAILED) {
		pool = NULL;
		goto end;
	}
	pthread_mutex_init(&pool->lock, NULL);
	pool->base = base;
	pool->percpu_len = percpu_len;
	pool->max_nr_cpus = max_nr_cpus;
	pool->index = i;
	pool->item_len = item_len;
	pool->item_order = order;
	/* NUMA placement needs the pool geometry above to be set first. */
	rseq_percpu_pool_init_numa(pool, numa_flags);
end:
	pthread_mutex_unlock(&pool_lock);
	return pool;
}
int rseq_percpu_pool_destroy(struct rseq_percpu_pool *pool)
{
	int ret;

	pthread_mutex_lock(&pool_lock);
	ret = munmap(pool->base, pool->percpu_len * pool->max_nr_cpus);
	if (ret)
		goto end;
	pthread_mutex_destroy(&pool->lock);
	memset(pool, 0, sizeof(*pool));
end:
	pthread_mutex_unlock(&pool_lock);
	return ret;
}
void __rseq_percpu *__rseq_percpu_malloc(struct rseq_percpu_pool *pool, bool zeroed)
{
	struct free_list_node *node;
	uintptr_t item_offset;
	void __rseq_percpu *addr;

	pthread_mutex_lock(&pool->lock);
	/* Get first entry from free list. */
	node = pool->free_list_head;
	if (node != NULL) {
		/* Remove node from free list (update head). */
		pool->free_list_head = node->next;
		item_offset = (uintptr_t) ((void *) node - pool->base);
		addr = (void *) (((uintptr_t) pool->index << POOL_INDEX_SHIFT) | item_offset);
		goto end;
	}
	if (pool->next_unused + pool->item_len > pool->percpu_len) {
		errno = ENOMEM;
		addr = NULL;
		goto end;
	}
	item_offset = pool->next_unused;
	addr = (void *) (((uintptr_t) pool->index << POOL_INDEX_SHIFT) | item_offset);
	pool->next_unused += pool->item_len;
end:
	pthread_mutex_unlock(&pool->lock);
	if (zeroed && addr)
		rseq_percpu_zero_item(pool, item_offset);
	return addr;
}
void __rseq_percpu *rseq_percpu_malloc(struct rseq_percpu_pool *pool)
{
	return __rseq_percpu_malloc(pool, false);
}
void __rseq_percpu *rseq_percpu_zmalloc(struct rseq_percpu_pool *pool)
{
	return __rseq_percpu_malloc(pool, true);
}
void rseq_percpu_free(void __rseq_percpu *_ptr)
{
	uintptr_t ptr = (uintptr_t) _ptr;
	uintptr_t item_offset = ptr & MAX_POOL_LEN_MASK;
	uintptr_t pool_index = ptr >> POOL_INDEX_SHIFT;
	struct rseq_percpu_pool *pool = &rseq_percpu_pool[pool_index];
	struct free_list_node *head, *item;

	pthread_mutex_lock(&pool->lock);
	/* Add ptr to head of free list */
	head = pool->free_list_head;
	/* Free-list is in CPU 0 range. */
	item = (struct free_list_node *) __rseq_pool_percpu_ptr(pool, 0, item_offset);
	item->next = head;
	pool->free_list_head = item;
	pthread_mutex_unlock(&pool->lock);
}
struct rseq_percpu_pool_set *rseq_percpu_pool_set_create(void)
{
	struct rseq_percpu_pool_set *pool_set;

	pool_set = calloc(1, sizeof(struct rseq_percpu_pool_set));
	if (!pool_set)
		return NULL;
	pthread_mutex_init(&pool_set->lock, NULL);
	return pool_set;
}
int rseq_percpu_pool_set_destroy(struct rseq_percpu_pool_set *pool_set)
{
	int order, ret;

	for (order = POOL_SET_MIN_ENTRY; order < POOL_SET_NR_ENTRIES; order++) {
		struct rseq_percpu_pool *pool = pool_set->entries[order];

		if (!pool)
			continue;
		ret = rseq_percpu_pool_destroy(pool);
		if (ret)
			return ret;
		pool_set->entries[order] = NULL;
	}
	pthread_mutex_destroy(&pool_set->lock);
	free(pool_set);
	return 0;
}
/* Ownership of pool is handed over to pool set on success. */
int rseq_percpu_pool_set_add_pool(struct rseq_percpu_pool_set *pool_set, struct rseq_percpu_pool *pool)
{
	size_t item_order = pool->item_order;
	int ret = 0;

	pthread_mutex_lock(&pool_set->lock);
	if (pool_set->entries[item_order]) {
		errno = EBUSY;
		ret = -1;
		goto end;
	}
	pool_set->entries[pool->item_order] = pool;
end:
	pthread_mutex_unlock(&pool_set->lock);
	return ret;
}
void __rseq_percpu *__rseq_percpu_pool_set_malloc(struct rseq_percpu_pool_set *pool_set, size_t len, bool zeroed)
{
	int order, min_order = POOL_SET_MIN_ENTRY;
	struct rseq_percpu_pool *pool;
	void __rseq_percpu *addr;

	order = rseq_get_count_order_ulong(len);
	if (order > POOL_SET_MIN_ENTRY)
		min_order = order;
again:
	pthread_mutex_lock(&pool_set->lock);
	/* First smallest present pool where @len fits. */
	for (order = min_order; order < POOL_SET_NR_ENTRIES; order++) {
		pool = pool_set->entries[order];

		if (!pool)
			continue;
		if (pool->item_len >= len)
			goto found;
	}
	pool = NULL;
found:
	pthread_mutex_unlock(&pool_set->lock);
	if (!pool) {
		errno = ENOMEM;
		return NULL;
	}
	addr = __rseq_percpu_malloc(pool, zeroed);
	if (addr == NULL && errno == ENOMEM) {
		/*
		 * If the allocation failed, try again with a
		 * larger pool.
		 */
		min_order = order + 1;
		goto again;
	}
	return addr;
}
void __rseq_percpu *rseq_percpu_pool_set_malloc(struct rseq_percpu_pool_set *pool_set, size_t len)
{
	return __rseq_percpu_pool_set_malloc(pool_set, len, false);
}
void __rseq_percpu *rseq_percpu_pool_set_zmalloc(struct rseq_percpu_pool_set *pool_set, size_t len)
{
	return __rseq_percpu_pool_set_malloc(pool_set, len, true);
}
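/*
 * Example pool set usage (illustrative sketch only; the pool lengths,
 * CPU count query and error handling are assumptions made for this
 * example):
 *
 *	struct rseq_percpu_pool_set *pool_set;
 *	struct rseq_percpu_pool *pool;
 *	void __rseq_percpu *p;
 *
 *	pool_set = rseq_percpu_pool_set_create();
 *	pool = rseq_percpu_pool_create(32, 16 * 4096,
 *			sysconf(_SC_NPROCESSORS_CONF),
 *			PROT_READ | PROT_WRITE,
 *			MAP_ANONYMOUS | MAP_PRIVATE, -1, 0, 0);
 *	if (!pool_set || !pool || rseq_percpu_pool_set_add_pool(pool_set, pool))
 *		abort();
 *	// Served by the smallest pool whose item_len (here 32) fits 24 bytes.
 *	p = rseq_percpu_pool_set_zmalloc(pool_set, 24);
 *	rseq_percpu_free(p);
 *	(void) rseq_percpu_pool_set_destroy(pool_set);
 */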