Implement RCU wait/wakeup scheme based on futex
[libside.git] / src/rcu.c
// SPDX-License-Identifier: MIT
/*
 * Copyright 2022 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 */

#include <sched.h>
#include <string.h>
#include <stdint.h>
#include <pthread.h>
#include <stdbool.h>
#include <errno.h>
#include <poll.h>
#include <stdlib.h>
#include <unistd.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <linux/membarrier.h>

#include "rcu.h"
#include "smp.h"

/*
 * If both rseq (with glibc support) and membarrier system calls are
 * available, use them to replace barriers and atomics on the fast-path.
 */
unsigned int side_rcu_rseq_membarrier_available;

static int
membarrier(int cmd, unsigned int flags, int cpu_id)
{
	return syscall(__NR_membarrier, cmd, flags, cpu_id);
}
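
/*
 * On the grace period (slow) path, full memory barriers are needed to pair
 * with the read-side. When both rseq and membarrier are usable,
 * MEMBARRIER_CMD_PRIVATE_EXPEDITED makes the kernel execute a memory barrier
 * on every running thread of this process, which is what allows the
 * read-side fast path to avoid issuing fences and atomics of its own.
 * Otherwise, each barrier site below falls back to a SEQ_CST thread fence.
 * This is the purpose of the if/else pattern repeated throughout this file.
 */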

/*
 * Wait/wakeup scheme with single waiter/many wakers.
 */
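/*
 * Futex protocol: the futex word is 0 when no waiter needs a wake-up, and -1
 * while the grace period thread is waiting (or about to wait) for readers.
 * The wake-up side lives outside this file (presumably in the read-side
 * helpers of rcu.h); once it observes futex == -1, it is expected to do
 * something along the lines of this sketch (illustrative only):
 *
 *	if (__atomic_load_n(&gp_state->futex, __ATOMIC_RELAXED) == -1) {
 *		__atomic_store_n(&gp_state->futex, 0, __ATOMIC_RELAXED);
 *		(void) futex(&gp_state->futex, FUTEX_WAKE, 1, NULL, NULL, 0);
 *	}
 *
 * Setting the futex back to 0 before FUTEX_WAKE ensures a waiter which has
 * not yet called FUTEX_WAIT observes the value change and skips the sleep.
 */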
static
void wait_gp_prepare(struct side_rcu_gp_state *gp_state)
{
	__atomic_store_n(&gp_state->futex, -1, __ATOMIC_RELAXED);
	/*
	 * This memory barrier (H) pairs with memory barrier (F). It
	 * orders store to futex before load of RCU reader's counter
	 * state, thus ensuring that load of RCU reader's counters does
	 * not leak outside of futex state=-1.
	 */
	if (side_rcu_rseq_membarrier_available) {
		if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0)) {
			perror("membarrier");
			abort();
		}
	} else {
		__atomic_thread_fence(__ATOMIC_SEQ_CST);
	}
}

static
void wait_gp_end(struct side_rcu_gp_state *gp_state)
{
	/*
	 * This memory barrier (G) pairs with memory barrier (F). It
	 * orders load of RCU reader's counter state before storing the
	 * futex value, thus ensuring that load of RCU reader's counters
	 * does not leak outside of futex state=-1.
	 */
	if (side_rcu_rseq_membarrier_available) {
		if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0)) {
			perror("membarrier");
			abort();
		}
	} else {
		__atomic_thread_fence(__ATOMIC_SEQ_CST);
	}
	__atomic_store_n(&gp_state->futex, 0, __ATOMIC_RELAXED);
}

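/*
 * wait_gp() only sleeps while the futex word is still -1. The re-check of
 * the futex value below is an optimization; the actual race protection
 * comes from FUTEX_WAIT itself, which atomically compares *uaddr with the
 * expected value (-1) in the kernel and returns EWOULDBLOCK instead of
 * sleeping if a waker already changed it, so a wake-up racing with the
 * check cannot be lost.
 */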
static
void wait_gp(struct side_rcu_gp_state *gp_state)
{
	/*
	 * This memory barrier (G) pairs with memory barrier (F). It
	 * orders load of RCU reader's counter state before loading the
	 * futex value.
	 */
	if (side_rcu_rseq_membarrier_available) {
		if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0)) {
			perror("membarrier");
			abort();
		}
	} else {
		__atomic_thread_fence(__ATOMIC_SEQ_CST);
	}
	if (__atomic_load_n(&gp_state->futex, __ATOMIC_RELAXED) != -1)
		return;
	while (futex(&gp_state->futex, FUTEX_WAIT, -1, NULL, NULL, 0)) {
		switch (errno) {
		case EWOULDBLOCK:
			/* Value already changed. */
			return;
		case EINTR:
			/* Retry if interrupted by signal. */
			break;	/* Get out of switch. */
		default:
			/* Unexpected error. */
			abort();
		}
	}
	return;
}

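/*
 * Reader accounting: each read-side critical section increments a per-CPU
 * "begin" counter for the period it observes on entry and the matching
 * "end" counter on exit (the rseq_-prefixed fields presumably receive the
 * rseq fast-path increments, the plain fields the atomic fallback). For a
 * given period:
 *
 *	sum = (sum of begin + rseq_begin over all CPUs)
 *	    - (sum of end + rseq_end over all CPUs)
 *
 * A sum of zero means no reader is still active in that period. Counters of
 * a single CPU need not balance: a reader migrated from CPU 0 to CPU 2
 * leaves begin=1 on CPU 0 and end=1 on CPU 2, yet the global sum is 0.
 */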
/* active_readers is an input/output parameter. */
static
void check_active_readers(struct side_rcu_gp_state *gp_state, bool *active_readers)
{
	uintptr_t sum[2] = { 0, 0 };	/* begin - end */
	int i;

	for (i = 0; i < gp_state->nr_cpus; i++) {
		struct side_rcu_cpu_gp_state *cpu_state = &gp_state->percpu_state[i];

		if (active_readers[0]) {
			sum[0] -= __atomic_load_n(&cpu_state->count[0].end, __ATOMIC_RELAXED);
			sum[0] -= __atomic_load_n(&cpu_state->count[0].rseq_end, __ATOMIC_RELAXED);
		}
		if (active_readers[1]) {
			sum[1] -= __atomic_load_n(&cpu_state->count[1].end, __ATOMIC_RELAXED);
			sum[1] -= __atomic_load_n(&cpu_state->count[1].rseq_end, __ATOMIC_RELAXED);
		}
	}

	/*
	 * This memory barrier (C) pairs with either of memory barriers
	 * (A) or (B) (one is sufficient).
	 *
	 * Read end counts before begin counts. Reading "end" before
	 * "begin" counts ensures we never see an "end" without having
	 * seen its associated "begin", because "begin" is always
	 * incremented before "end", as guaranteed by memory barriers
	 * (A) or (B).
	 */
	if (side_rcu_rseq_membarrier_available) {
		if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0)) {
			perror("membarrier");
			abort();
		}
	} else {
		__atomic_thread_fence(__ATOMIC_SEQ_CST);
	}

	for (i = 0; i < gp_state->nr_cpus; i++) {
		struct side_rcu_cpu_gp_state *cpu_state = &gp_state->percpu_state[i];

		if (active_readers[0]) {
			sum[0] += __atomic_load_n(&cpu_state->count[0].begin, __ATOMIC_RELAXED);
			sum[0] += __atomic_load_n(&cpu_state->count[0].rseq_begin, __ATOMIC_RELAXED);
		}
		if (active_readers[1]) {
			sum[1] += __atomic_load_n(&cpu_state->count[1].begin, __ATOMIC_RELAXED);
			sum[1] += __atomic_load_n(&cpu_state->count[1].rseq_begin, __ATOMIC_RELAXED);
		}
	}
	if (active_readers[0])
		active_readers[0] = sum[0];
	if (active_readers[1])
		active_readers[1] = sum[1];
}

/*
 * Wait for previous period to have no active readers.
 *
 * active_readers is an input/output parameter.
 */
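/*
 * Handshake with the wake-up side, for each scan iteration:
 *
 *   1) wait_gp_prepare() publishes futex = -1, then issues barrier (H).
 *   2) check_active_readers() re-reads the per-CPU counters.
 *   3) If quiescent, wait_gp_end() issues barrier (G) and clears the futex;
 *      otherwise wait_gp() blocks until a reader wakes us.
 *
 * Because the futex is set to -1 before the counters are read, a reader
 * exiting after our scan observes futex == -1 and issues a wake-up, which
 * changes the futex value; FUTEX_WAIT then returns immediately instead of
 * sleeping, so the wake-up cannot be missed.
 */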
static
void wait_for_prev_period_readers(struct side_rcu_gp_state *gp_state, bool *active_readers)
{
	unsigned int prev_period = gp_state->period ^ 1;

	/*
	 * If a prior active readers scan already observed that no
	 * readers are present for the previous period, there is no need
	 * to scan again.
	 */
	if (!active_readers[prev_period])
		return;
	/*
	 * Wait for the sum of CPU begin/end counts to match for the
	 * previous period.
	 */
	for (;;) {
		wait_gp_prepare(gp_state);
		check_active_readers(gp_state, active_readers);
		if (!active_readers[prev_period]) {
			wait_gp_end(gp_state);
			break;
		}
		wait_gp(gp_state);
	}
}

/*
 * The grace period completes when it observes that there are no active
 * readers within each of the periods.
 *
 * The active_readers state is initially true for each period, until the
 * grace period observes that no readers are present for each given
 * period, at which point the active_readers state becomes false.
 */
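/*
 * Algorithm: gp_state->period selects the current parity. The grace period
 * first waits until the previous parity has no active readers, then, if
 * readers remain in the current parity, flips the parity and waits for the
 * other side to drain as well. Each of the two parities is thus observed
 * quiescent at some point after the grace period started, which, together
 * with memory barriers (A)-(E), guarantees that all pre-existing readers
 * have completed by the time side_rcu_wait_grace_period() returns.
 */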
void side_rcu_wait_grace_period(struct side_rcu_gp_state *gp_state)
{
	bool active_readers[2] = { true, true };

	/*
	 * This memory barrier (D) pairs with memory barriers (A) and
	 * (B) on the read-side.
	 *
	 * It orders prior loads and stores before the "end"/"begin"
	 * reader state loads. In other words, it orders prior loads and
	 * stores before observation of active readers quiescence,
	 * effectively ensuring that read-side critical sections which
	 * exist after the grace period completes are ordered after
	 * loads and stores performed before the grace period.
	 */
	if (side_rcu_rseq_membarrier_available) {
		if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0)) {
			perror("membarrier");
			abort();
		}
	} else {
		__atomic_thread_fence(__ATOMIC_SEQ_CST);
	}

	/*
	 * First scan through all CPUs, for both periods. If no readers
	 * are accounted for, we have observed quiescence and can
	 * complete the grace period immediately.
	 */
	check_active_readers(gp_state, active_readers);
	if (!active_readers[0] && !active_readers[1])
		goto end;

	pthread_mutex_lock(&gp_state->gp_lock);

	wait_for_prev_period_readers(gp_state, active_readers);
	/*
	 * If the reader scan detected that there are no readers in the
	 * current period as well, we can complete the grace period
	 * immediately.
	 */
	if (!active_readers[gp_state->period])
		goto unlock;

	/* Flip period: 0 -> 1, 1 -> 0. */
	(void) __atomic_xor_fetch(&gp_state->period, 1, __ATOMIC_RELAXED);

	wait_for_prev_period_readers(gp_state, active_readers);
unlock:
	pthread_mutex_unlock(&gp_state->gp_lock);
end:
	/*
	 * This memory barrier (E) pairs with memory barriers (A) and
	 * (B) on the read-side.
	 *
	 * It orders the "end"/"begin" reader state loads before
	 * following loads and stores. In other words, it orders
	 * observation of active readers quiescence before following
	 * loads and stores, effectively ensuring that read-side
	 * critical sections which existed prior to the grace period
	 * are ordered before loads and stores performed after the grace
	 * period.
	 */
	if (side_rcu_rseq_membarrier_available) {
		if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0)) {
			perror("membarrier");
			abort();
		}
	} else {
		__atomic_thread_fence(__ATOMIC_SEQ_CST);
	}
}
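
/*
 * Hypothetical updater-side usage (a minimal sketch; the reader helpers
 * side_rcu_read_begin()/side_rcu_read_end() are assumed to be provided by
 * rcu.h, and "published"/"create_new_version()" are placeholders, not part
 * of this file):
 *
 *	struct data *old, *new_version = create_new_version();
 *
 *	old = __atomic_exchange_n(&published, new_version, __ATOMIC_SEQ_CST);
 *	side_rcu_wait_grace_period(&rcu_gp);
 *	free(old);	// No pre-existing reader can still hold a reference.
 */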

void side_rcu_gp_init(struct side_rcu_gp_state *rcu_gp)
{
	bool has_membarrier = false, has_rseq = false;

	memset(rcu_gp, 0, sizeof(*rcu_gp));
	rcu_gp->nr_cpus = get_possible_cpus_array_len();
	if (!rcu_gp->nr_cpus)
		abort();
	pthread_mutex_init(&rcu_gp->gp_lock, NULL);
	rcu_gp->percpu_state = calloc(rcu_gp->nr_cpus, sizeof(struct side_rcu_cpu_gp_state));
	if (!rcu_gp->percpu_state)
		abort();
	if (!membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0, 0))
		has_membarrier = true;
	if (rseq_available(RSEQ_AVAILABLE_QUERY_LIBC))
		has_rseq = true;
	if (has_membarrier && has_rseq)
		side_rcu_rseq_membarrier_available = 1;
}
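
/*
 * Note: the kernel only accepts MEMBARRIER_CMD_PRIVATE_EXPEDITED from a
 * process which has previously registered with
 * MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, hence the registration at
 * initialization time. If registration or glibc rseq support is missing,
 * side_rcu_rseq_membarrier_available stays 0 and the explicit fences and
 * atomics are used instead.
 */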

void side_rcu_gp_exit(struct side_rcu_gp_state *rcu_gp)
{
	rseq_prepare_unload();
	pthread_mutex_destroy(&rcu_gp->gp_lock);
	free(rcu_gp->percpu_state);
}
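
/*
 * rseq_prepare_unload() is a librseq helper meant to be called before code
 * containing rseq critical sections can go away (e.g. library unload); its
 * exact per-thread semantics are defined by librseq rather than by this
 * file.
 */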