// SPDX-License-Identifier: MIT
/*
 * Copyright 2022 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 */

#include <sched.h>
#include <string.h>
#include <stdint.h>
#include <pthread.h>
#include <stdbool.h>
#include <poll.h>
#include <stdlib.h>
#include <unistd.h>
#include <stdio.h>
#include <errno.h>
#include <sys/syscall.h>
#include <linux/membarrier.h>
#include <linux/futex.h>

#include "rcu.h"
#include "smp.h"

/*
 * If both rseq (with glibc support) and membarrier system calls are
 * available, use them to replace barriers and atomics on the fast-path.
 */
unsigned int side_rcu_rseq_membarrier_available;

static int
membarrier(int cmd, unsigned int flags, int cpu_id)
{
	return syscall(__NR_membarrier, cmd, flags, cpu_id);
}

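/*
 * MEMBARRIER_CMD_PRIVATE_EXPEDITED requires a prior registration via
 * MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, which side_rcu_gp_init()
 * performs below. As an illustrative (hypothetical) sketch, support for
 * both commands could also be probed up front with MEMBARRIER_CMD_QUERY,
 * which returns a bitmask of the commands supported by the kernel:
 *
 *	static bool membarrier_expedited_supported(void)
 *	{
 *		int mask = membarrier(MEMBARRIER_CMD_QUERY, 0, 0);
 *
 *		if (mask < 0)
 *			return false;
 *		return (mask & MEMBARRIER_CMD_PRIVATE_EXPEDITED) &&
 *			(mask & MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED);
 *	}
 */
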
/*
 * Wait/wakeup scheme with single waiter/many wakers.
 */
static
void wait_gp_prepare(struct side_rcu_gp_state *gp_state)
{
	__atomic_store_n(&gp_state->futex, -1, __ATOMIC_RELAXED);
	/*
	 * This memory barrier (H) pairs with memory barrier (F). It
	 * orders store to futex before load of RCU reader's counter
	 * state, thus ensuring that load of RCU reader's counters does
	 * not leak outside of futex state=-1.
	 */
	if (side_rcu_rseq_membarrier_available) {
		if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0)) {
			perror("membarrier");
			abort();
		}
	} else {
		__atomic_thread_fence(__ATOMIC_SEQ_CST);
	}
}

static
void wait_gp_end(struct side_rcu_gp_state *gp_state)
{
	/*
	 * This memory barrier (G) pairs with memory barrier (F). It
	 * orders load of RCU reader's counter state before storing the
	 * futex value, thus ensuring that load of RCU reader's counters
	 * does not leak outside of futex state=-1.
	 */
	if (side_rcu_rseq_membarrier_available) {
		if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0)) {
			perror("membarrier");
			abort();
		}
	} else {
		__atomic_thread_fence(__ATOMIC_SEQ_CST);
	}
	__atomic_store_n(&gp_state->futex, 0, __ATOMIC_RELAXED);
}

static
void wait_gp(struct side_rcu_gp_state *gp_state)
{
	/*
	 * This memory barrier (G) pairs with memory barrier (F). It
	 * orders load of RCU reader's counter state before loading the
	 * futex value.
	 */
	if (side_rcu_rseq_membarrier_available) {
		if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0)) {
			perror("membarrier");
			abort();
		}
	} else {
		__atomic_thread_fence(__ATOMIC_SEQ_CST);
	}
	while (__atomic_load_n(&gp_state->futex, __ATOMIC_RELAXED) == -1) {
		if (!futex(&gp_state->futex, FUTEX_WAIT, -1, NULL, NULL, 0)) {
			/*
			 * May be awakened by either spurious wake up or
			 * because the state is now as expected.
			 */
			continue;
		}
		switch (errno) {
		case EWOULDBLOCK:
			/* Value already changed. */
			return;
		case EINTR:
			/* Retry if interrupted by signal. */
			break;	/* Get out of switch. */
		default:
			/* Unexpected error. */
			abort();
		}
	}
	return;
}
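
/*
 * The waker side of this scheme runs in the read-side exit path (in
 * rcu.h, not in this file). A minimal sketch of what it is expected to
 * look like, for illustration only (the function name is hypothetical):
 *
 *	static inline void example_rcu_wake_up_gp(struct side_rcu_gp_state *gp_state)
 *	{
 *		if (__atomic_load_n(&gp_state->futex, __ATOMIC_RELAXED) == -1) {
 *			__atomic_store_n(&gp_state->futex, 0, __ATOMIC_RELAXED);
 *			(void) futex(&gp_state->futex, FUTEX_WAKE, 1, NULL, NULL, 0);
 *		}
 *	}
 *
 * Relaxed futex accesses suffice here because ordering between the
 * reader's counter updates and the futex state is provided by the
 * barrier pairing described above (F with G/H).
 */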

/* active_readers is an input/output parameter. */
static
void check_active_readers(struct side_rcu_gp_state *gp_state, bool *active_readers)
{
	uintptr_t sum[2] = { 0, 0 };	/* begin - end */
	int i;

	for (i = 0; i < gp_state->nr_cpus; i++) {
		struct side_rcu_cpu_gp_state *cpu_state = &gp_state->percpu_state[i];

		if (active_readers[0]) {
			sum[0] -= __atomic_load_n(&cpu_state->count[0].end, __ATOMIC_RELAXED);
			sum[0] -= __atomic_load_n(&cpu_state->count[0].rseq_end, __ATOMIC_RELAXED);
		}
		if (active_readers[1]) {
			sum[1] -= __atomic_load_n(&cpu_state->count[1].end, __ATOMIC_RELAXED);
			sum[1] -= __atomic_load_n(&cpu_state->count[1].rseq_end, __ATOMIC_RELAXED);
		}
	}

	/*
	 * This memory barrier (C) pairs with either of memory barriers
	 * (A) or (B) (one is sufficient).
	 *
	 * Read end counts before begin counts. Reading "end" before
	 * "begin" counts ensures we never see an "end" without having
	 * seen its associated "begin", because "begin" is always
	 * incremented before "end", as guaranteed by memory barriers
	 * (A) or (B).
	 */
	if (side_rcu_rseq_membarrier_available) {
		if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0)) {
			perror("membarrier");
			abort();
		}
	} else {
		__atomic_thread_fence(__ATOMIC_SEQ_CST);
	}

	for (i = 0; i < gp_state->nr_cpus; i++) {
		struct side_rcu_cpu_gp_state *cpu_state = &gp_state->percpu_state[i];

		if (active_readers[0]) {
			sum[0] += __atomic_load_n(&cpu_state->count[0].begin, __ATOMIC_RELAXED);
			sum[0] += __atomic_load_n(&cpu_state->count[0].rseq_begin, __ATOMIC_RELAXED);
		}
		if (active_readers[1]) {
			sum[1] += __atomic_load_n(&cpu_state->count[1].begin, __ATOMIC_RELAXED);
			sum[1] += __atomic_load_n(&cpu_state->count[1].rseq_begin, __ATOMIC_RELAXED);
		}
	}
	if (active_readers[0])
		active_readers[0] = sum[0];
	if (active_readers[1])
		active_readers[1] = sum[1];
}
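
/*
 * For context, the read-side counterpart (in rcu.h, not in this file)
 * increments the per-cpu "begin" count of the current period when
 * entering a critical section, and the matching "end" count when
 * leaving it. A simplified, hypothetical sketch, ignoring the rseq
 * fast-path counters:
 *
 *	static inline unsigned int example_rcu_read_begin(struct side_rcu_gp_state *gp_state)
 *	{
 *		unsigned int period = __atomic_load_n(&gp_state->period, __ATOMIC_RELAXED);
 *
 *		__atomic_add_fetch(&gp_state->percpu_state[sched_getcpu()].count[period].begin,
 *				1, __ATOMIC_RELAXED);
 *		__atomic_thread_fence(__ATOMIC_SEQ_CST);	-> plays the role of (A)/(B)
 *		return period;
 *	}
 *
 * check_active_readers() then reports quiescence for a period once the
 * begin and end sums match across all cpus, e.g. 3 begins and 3 ends
 * yield sum == 0.
 */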

/*
 * Wait for previous period to have no active readers.
 *
 * active_readers is an input/output parameter.
 */
static
void wait_for_prev_period_readers(struct side_rcu_gp_state *gp_state, bool *active_readers)
{
	unsigned int prev_period = gp_state->period ^ 1;

	/*
	 * If a prior active readers scan already observed that no
	 * readers are present for the previous period, there is no need
	 * to scan again.
	 */
	if (!active_readers[prev_period])
		return;
	/*
	 * Wait for the sum of CPU begin/end counts to match for the
	 * previous period.
	 */
	for (;;) {
		wait_gp_prepare(gp_state);
		check_active_readers(gp_state, active_readers);
		if (!active_readers[prev_period]) {
			wait_gp_end(gp_state);
			break;
		}
		wait_gp(gp_state);
	}
}

/*
 * The grace period completes when it observes that there are no active
 * readers within each of the periods.
 *
 * The active_readers state is initially true for each period, until the
 * grace period observes that no readers are present for each given
 * period, at which point the active_readers state becomes false.
 */
void side_rcu_wait_grace_period(struct side_rcu_gp_state *gp_state)
{
	bool active_readers[2] = { true, true };

	/*
	 * This memory barrier (D) pairs with memory barriers (A) and
	 * (B) on the read-side.
	 *
	 * It orders prior loads and stores before the "end"/"begin"
	 * reader state loads. In other words, it orders prior loads and
	 * stores before observation of active readers quiescence,
	 * effectively ensuring that read-side critical sections which
	 * exist after the grace period completes are ordered after
	 * loads and stores performed before the grace period.
	 */
	if (side_rcu_rseq_membarrier_available) {
		if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0)) {
			perror("membarrier");
			abort();
		}
	} else {
		__atomic_thread_fence(__ATOMIC_SEQ_CST);
	}

	/*
	 * First scan through all cpus, for both periods. If no readers
	 * are accounted for, we have observed quiescence and can
	 * complete the grace period immediately.
	 */
	check_active_readers(gp_state, active_readers);
	if (!active_readers[0] && !active_readers[1])
		goto end;

	pthread_mutex_lock(&gp_state->gp_lock);

	wait_for_prev_period_readers(gp_state, active_readers);
	/*
	 * If the reader scan detected that there are no readers in the
	 * current period as well, we can complete the grace period
	 * immediately.
	 */
	if (!active_readers[gp_state->period])
		goto unlock;

	/* Flip period: 0 -> 1, 1 -> 0. */
	(void) __atomic_xor_fetch(&gp_state->period, 1, __ATOMIC_RELAXED);

	wait_for_prev_period_readers(gp_state, active_readers);
unlock:
	pthread_mutex_unlock(&gp_state->gp_lock);
end:
	/*
	 * This memory barrier (E) pairs with memory barriers (A) and
	 * (B) on the read-side.
	 *
	 * It orders the "end"/"begin" reader state loads before
	 * following loads and stores. In other words, it orders
	 * observation of active readers quiescence before following
	 * loads and stores, effectively ensuring that read-side
	 * critical sections which existed prior to the grace period
	 * are ordered before loads and stores performed after the grace
	 * period.
	 */
	if (side_rcu_rseq_membarrier_available) {
		if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0)) {
			perror("membarrier");
			abort();
		}
	} else {
		__atomic_thread_fence(__ATOMIC_SEQ_CST);
	}
}
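
/*
 * Illustrative use of the grace period by an updater. The data
 * structure and shared_ptr variable below are hypothetical; the point
 * is that pre-existing readers must be done before reclaim:
 *
 *	struct mydata *new_data = ...;
 *	struct mydata *old_data;
 *
 *	old_data = __atomic_exchange_n(&shared_ptr, new_data, __ATOMIC_SEQ_CST);
 *	side_rcu_wait_grace_period(&rcu_gp);	-> pre-existing readers are done
 *	free(old_data);				-> now safe to reclaim
 */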

void side_rcu_gp_init(struct side_rcu_gp_state *rcu_gp)
{
	bool has_membarrier = false, has_rseq = false;

	memset(rcu_gp, 0, sizeof(*rcu_gp));
	rcu_gp->nr_cpus = get_possible_cpus_array_len();
	if (!rcu_gp->nr_cpus)
		abort();
	pthread_mutex_init(&rcu_gp->gp_lock, NULL);
	rcu_gp->percpu_state = (struct side_rcu_cpu_gp_state *)
		calloc(rcu_gp->nr_cpus, sizeof(struct side_rcu_cpu_gp_state));
	if (!rcu_gp->percpu_state)
		abort();
	if (!membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0, 0))
		has_membarrier = true;
	if (rseq_available(RSEQ_AVAILABLE_QUERY_LIBC))
		has_rseq = true;
	if (has_membarrier && has_rseq)
		side_rcu_rseq_membarrier_available = 1;
}

void side_rcu_gp_exit(struct side_rcu_gp_state *rcu_gp)
{
	rseq_prepare_unload();
	pthread_mutex_destroy(&rcu_gp->gp_lock);
	free(rcu_gp->percpu_state);
}
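
/*
 * Typical lifecycle of the grace period state, sketched for
 * illustration (errors are handled via abort() in the functions above;
 * exit is presumably only safe once no readers or grace periods remain
 * in flight):
 *
 *	struct side_rcu_gp_state rcu_gp;
 *
 *	side_rcu_gp_init(&rcu_gp);	-> probes rseq + membarrier support
 *	... read-side critical sections and side_rcu_wait_grace_period() ...
 *	side_rcu_gp_exit(&rcu_gp);
 */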