Commit | Line | Data |
---|---|---|
d7e09d03 PT |
1 | /* |
2 | * GPL HEADER START | |
3 | * | |
4 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU General Public License version 2 only, | |
8 | * as published by the Free Software Foundation. | |
9 | * | |
10 | * This program is distributed in the hope that it will be useful, but | |
11 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | * General Public License version 2 for more details (a copy is included | |
14 | * in the LICENSE file that accompanied this code). | |
15 | * | |
16 | * You should have received a copy of the GNU General Public License | |
17 | * version 2 along with this program; If not, see | |
6a5b99a4 | 18 | * http://www.gnu.org/licenses/gpl-2.0.html |
d7e09d03 | 19 | * |
d7e09d03 PT |
20 | * GPL HEADER END |
21 | */ | |
22 | /* | |
23 | * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. | |
24 | * Use is subject to license terms. | |
25 | * | |
1dc563a6 | 26 | * Copyright (c) 2010, 2015, Intel Corporation. |
d7e09d03 PT |
27 | */ |
28 | /* | |
29 | * This file is part of Lustre, http://www.lustre.org/ | |
30 | * Lustre is a trademark of Sun Microsystems, Inc. | |
31 | */ | |
32 | ||
33 | #define DEBUG_SUBSYSTEM S_RPC | |
e27db149 GKH |
34 | #include "../include/obd_support.h" |
35 | #include "../include/obd_class.h" | |
36 | #include "../include/lustre_net.h" | |
37 | #include "../include/lu_object.h" | |
9fdaf8c0 | 38 | #include "../../include/linux/lnet/types.h" |
d7e09d03 PT |
39 | #include "ptlrpc_internal.h" |
40 | ||
/* The following are visible and mutable through /sys/module/ptlrpc */
int test_req_buffer_pressure;
module_param(test_req_buffer_pressure, int, 0444);
MODULE_PARM_DESC(test_req_buffer_pressure, "set non-zero to put pressure on request buffer pools");
/* NOTE(review): the at_* adaptive-timeout variables below are defined
 * elsewhere (not visible in this chunk); only the module-parameter
 * hookup lives here — confirm their definitions before changing types. */
module_param(at_min, int, 0644);
MODULE_PARM_DESC(at_min, "Adaptive timeout minimum (sec)");
module_param(at_max, int, 0644);
MODULE_PARM_DESC(at_max, "Adaptive timeout maximum (sec)");
module_param(at_history, int, 0644);
MODULE_PARM_DESC(at_history,
		 "Adaptive timeouts remember the slowest event that took place within this period (sec)");
module_param(at_early_margin, int, 0644);
MODULE_PARM_DESC(at_early_margin, "How soon before an RPC deadline to send an early reply");
module_param(at_extra, int, 0644);
MODULE_PARM_DESC(at_extra, "How much extra time to give with each early reply");

/* forward ref */
static int ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt);
static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req);
static void ptlrpc_at_remove_timed(struct ptlrpc_request *req);

/** Holds a list of all PTLRPC services */
LIST_HEAD(ptlrpc_all_services);
/** Used to protect the \e ptlrpc_all_services list */
struct mutex ptlrpc_all_services_mutex;
66 | ||
a96389d9 | 67 | static struct ptlrpc_request_buffer_desc * |
d7e09d03 PT |
68 | ptlrpc_alloc_rqbd(struct ptlrpc_service_part *svcpt) |
69 | { | |
d0bfef31 | 70 | struct ptlrpc_service *svc = svcpt->scp_service; |
d7e09d03 PT |
71 | struct ptlrpc_request_buffer_desc *rqbd; |
72 | ||
bae97e81 JL |
73 | rqbd = kzalloc_node(sizeof(*rqbd), GFP_NOFS, |
74 | cfs_cpt_spread_node(svc->srv_cptable, | |
75 | svcpt->scp_cpt)); | |
8b382089 | 76 | if (!rqbd) |
d7e09d03 PT |
77 | return NULL; |
78 | ||
79 | rqbd->rqbd_svcpt = svcpt; | |
80 | rqbd->rqbd_refcount = 0; | |
81 | rqbd->rqbd_cbid.cbid_fn = request_in_callback; | |
82 | rqbd->rqbd_cbid.cbid_arg = rqbd; | |
83 | INIT_LIST_HEAD(&rqbd->rqbd_reqs); | |
6fd57333 OD |
84 | rqbd->rqbd_buffer = libcfs_kvzalloc_cpt(svc->srv_cptable, |
85 | svcpt->scp_cpt, | |
86 | svc->srv_buf_size, | |
87 | GFP_KERNEL); | |
8b382089 | 88 | if (!rqbd->rqbd_buffer) { |
9ae10597 | 89 | kfree(rqbd); |
d7e09d03 PT |
90 | return NULL; |
91 | } | |
92 | ||
93 | spin_lock(&svcpt->scp_lock); | |
94 | list_add(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle); | |
95 | svcpt->scp_nrqbds_total++; | |
96 | spin_unlock(&svcpt->scp_lock); | |
97 | ||
98 | return rqbd; | |
99 | } | |
100 | ||
a96389d9 | 101 | static void |
d7e09d03 PT |
102 | ptlrpc_free_rqbd(struct ptlrpc_request_buffer_desc *rqbd) |
103 | { | |
104 | struct ptlrpc_service_part *svcpt = rqbd->rqbd_svcpt; | |
105 | ||
106 | LASSERT(rqbd->rqbd_refcount == 0); | |
107 | LASSERT(list_empty(&rqbd->rqbd_reqs)); | |
108 | ||
109 | spin_lock(&svcpt->scp_lock); | |
110 | list_del(&rqbd->rqbd_list); | |
111 | svcpt->scp_nrqbds_total--; | |
112 | spin_unlock(&svcpt->scp_lock); | |
113 | ||
ee0ec194 | 114 | kvfree(rqbd->rqbd_buffer); |
9ae10597 | 115 | kfree(rqbd); |
d7e09d03 PT |
116 | } |
117 | ||
/**
 * Grow the request buffer pool of \a svcpt up to srv_nbuf_per_group
 * descriptors and, if \a post is non-zero, post idle buffers to the net.
 *
 * Only one thread may be in the allocation phase at a time: the
 * scp_rqbd_allocating flag is checked unlocked first, then rechecked
 * under scp_lock before being claimed, so racing callers skip straight
 * to the posting step instead of over-allocating.
 *
 * Returns -ENOMEM if a descriptor allocation failed, otherwise 0 or the
 * result of ptlrpc_server_post_idle_rqbds() when \a post is set.
 */
static int
ptlrpc_grow_req_bufs(struct ptlrpc_service_part *svcpt, int post)
{
	struct ptlrpc_service *svc = svcpt->scp_service;
	struct ptlrpc_request_buffer_desc *rqbd;
	int rc = 0;
	int i;

	/* cheap unlocked peek: someone else is already allocating */
	if (svcpt->scp_rqbd_allocating)
		goto try_post;

	spin_lock(&svcpt->scp_lock);
	/* check again with lock */
	if (svcpt->scp_rqbd_allocating) {
		/* NB: we might allow more than one thread in the future */
		LASSERT(svcpt->scp_rqbd_allocating == 1);
		spin_unlock(&svcpt->scp_lock);
		goto try_post;
	}

	/* claim the allocator role */
	svcpt->scp_rqbd_allocating++;
	spin_unlock(&svcpt->scp_lock);

	for (i = 0; i < svc->srv_nbuf_per_group; i++) {
		/* NB: another thread might have recycled enough rqbds, we
		 * need to make sure it wouldn't over-allocate, see LU-1212.
		 */
		if (svcpt->scp_nrqbds_posted >= svc->srv_nbuf_per_group)
			break;

		rqbd = ptlrpc_alloc_rqbd(svcpt);

		if (!rqbd) {
			CERROR("%s: Can't allocate request buffer\n",
			       svc->srv_name);
			rc = -ENOMEM;
			break;
		}
	}

	spin_lock(&svcpt->scp_lock);

	/* release the allocator role */
	LASSERT(svcpt->scp_rqbd_allocating == 1);
	svcpt->scp_rqbd_allocating--;

	spin_unlock(&svcpt->scp_lock);

	CDEBUG(D_RPCTRACE,
	       "%s: allocate %d new %d-byte reqbufs (%d/%d left), rc = %d\n",
	       svc->srv_name, i, svc->srv_buf_size, svcpt->scp_nrqbds_posted,
	       svcpt->scp_nrqbds_total, rc);

try_post:
	if (post && rc == 0)
		rc = ptlrpc_server_post_idle_rqbds(svcpt);

	return rc;
}
176 | ||
d7e09d03 PT |
struct ptlrpc_hr_partition;

/* One reply-handling (HR) thread. */
struct ptlrpc_hr_thread {
	int hrt_id; /* thread ID */
	spinlock_t hrt_lock; /* protects hrt_queue */
	wait_queue_head_t hrt_waitq; /* thread sleeps here awaiting work */
	struct list_head hrt_queue; /* RS queue */
	struct ptlrpc_hr_partition *hrt_partition;
};

/* Per-CPT group of reply-handling threads. */
struct ptlrpc_hr_partition {
	/* # of started threads */
	atomic_t hrp_nstarted;
	/* # of stopped threads */
	atomic_t hrp_nstopped;
	/* cpu partition id */
	int hrp_cpt;
	/* round-robin rotor for choosing thread */
	int hrp_rotor;
	/* total number of threads on this partition */
	int hrp_nthrs;
	/* threads table */
	struct ptlrpc_hr_thread *hrp_thrs;
};

#define HRT_RUNNING 0
#define HRT_STOPPING 1

/* Top-level state of the reply-handling machinery. */
struct ptlrpc_hr_service {
	/* CPU partition table, it's just cfs_cpt_table for now */
	struct cfs_cpt_table *hr_cpt_table;
	/** controller sleep waitq */
	wait_queue_head_t hr_waitq;
	unsigned int hr_stopping;
	/** roundrobin rotor for non-affinity service */
	unsigned int hr_rotor;
	/* partition data */
	struct ptlrpc_hr_partition **hr_partitions;
};

/** reply handling service. */
static struct ptlrpc_hr_service ptlrpc_hr;
219 | ||
d7e09d03 PT |
220 | /** |
221 | * Choose an hr thread to dispatch requests to. | |
222 | */ | |
223 | static struct ptlrpc_hr_thread * | |
224 | ptlrpc_hr_select(struct ptlrpc_service_part *svcpt) | |
225 | { | |
d0bfef31 CH |
226 | struct ptlrpc_hr_partition *hrp; |
227 | unsigned int rotor; | |
d7e09d03 PT |
228 | |
229 | if (svcpt->scp_cpt >= 0 && | |
230 | svcpt->scp_service->srv_cptable == ptlrpc_hr.hr_cpt_table) { | |
231 | /* directly match partition */ | |
232 | hrp = ptlrpc_hr.hr_partitions[svcpt->scp_cpt]; | |
233 | ||
234 | } else { | |
235 | rotor = ptlrpc_hr.hr_rotor++; | |
236 | rotor %= cfs_cpt_number(ptlrpc_hr.hr_cpt_table); | |
237 | ||
238 | hrp = ptlrpc_hr.hr_partitions[rotor]; | |
239 | } | |
240 | ||
241 | rotor = hrp->hrp_rotor++; | |
242 | return &hrp->hrp_thrs[rotor % hrp->hrp_nthrs]; | |
243 | } | |
244 | ||
d7e09d03 PT |
/**
 * Put reply state into a queue for processing because we received
 * ACK from the client
 *
 * Selects a reply-handling thread, appends \a rs to that thread's queue
 * under hrt_lock, and wakes the thread.  \a rs must not already be on a
 * list (asserted below).
 */
void ptlrpc_dispatch_difficult_reply(struct ptlrpc_reply_state *rs)
{
	struct ptlrpc_hr_thread *hrt;

	LASSERT(list_empty(&rs->rs_list));

	hrt = ptlrpc_hr_select(rs->rs_svcpt);

	spin_lock(&hrt->hrt_lock);
	list_add_tail(&rs->rs_list, &hrt->hrt_queue);
	spin_unlock(&hrt->hrt_lock);

	/* wake after unlocking; the HR thread re-takes hrt_lock itself */
	wake_up(&hrt->hrt_waitq);
}
263 | ||
/**
 * Schedule a difficult reply state for handling by an HR thread.
 *
 * Caller must hold both rs_svcpt->scp_rep_lock and rs->rs_lock
 * (asserted below), and \a rs must be marked difficult.  If the reply
 * is already scheduled this is a no-op; otherwise it is unlinked from
 * whatever list it is on and dispatched.
 */
void
ptlrpc_schedule_difficult_reply(struct ptlrpc_reply_state *rs)
{
	assert_spin_locked(&rs->rs_svcpt->scp_rep_lock);
	assert_spin_locked(&rs->rs_lock);
	LASSERT(rs->rs_difficult);
	rs->rs_scheduled_ever = 1; /* flag any notification attempt */

	if (rs->rs_scheduled) { /* being set up or already notified */
		return;
	}

	rs->rs_scheduled = 1;
	list_del_init(&rs->rs_list);
	ptlrpc_dispatch_difficult_reply(rs);
}
EXPORT_SYMBOL(ptlrpc_schedule_difficult_reply);
281 | ||
d7e09d03 PT |
/**
 * Post all idle request buffers of \a svcpt to the network.
 *
 * Each buffer is optimistically moved to the posted list (and the
 * posted count bumped) before ptlrpc_register_rqbd() is attempted; on
 * failure the move is rolled back below the loop.
 *
 * Returns 1 if at least one buffer was posted, 0 if the idle list was
 * empty, -1 if a post attempt failed.
 */
static int
ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt)
{
	struct ptlrpc_request_buffer_desc *rqbd;
	int rc;
	int posted = 0;

	for (;;) {
		spin_lock(&svcpt->scp_lock);

		if (list_empty(&svcpt->scp_rqbd_idle)) {
			spin_unlock(&svcpt->scp_lock);
			return posted;
		}

		rqbd = list_entry(svcpt->scp_rqbd_idle.next,
				  struct ptlrpc_request_buffer_desc,
				  rqbd_list);
		list_del(&rqbd->rqbd_list);

		/* assume we will post successfully */
		svcpt->scp_nrqbds_posted++;
		list_add(&rqbd->rqbd_list, &svcpt->scp_rqbd_posted);

		spin_unlock(&svcpt->scp_lock);

		rc = ptlrpc_register_rqbd(rqbd);
		if (rc != 0)
			break;

		posted = 1;
	}

	/* roll back the optimistic bookkeeping for the buffer that failed */
	spin_lock(&svcpt->scp_lock);

	svcpt->scp_nrqbds_posted--;
	list_del(&rqbd->rqbd_list);
	list_add_tail(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle);

	/* Don't complain if no request buffers are posted right now; LNET
	 * won't drop requests because we set the portal lazy!
	 */

	spin_unlock(&svcpt->scp_lock);

	return -1;
}
329 | ||
330 | static void ptlrpc_at_timer(unsigned long castmeharder) | |
331 | { | |
332 | struct ptlrpc_service_part *svcpt; | |
333 | ||
334 | svcpt = (struct ptlrpc_service_part *)castmeharder; | |
335 | ||
336 | svcpt->scp_at_check = 1; | |
337 | svcpt->scp_at_checktime = cfs_time_current(); | |
338 | wake_up(&svcpt->scp_waitq); | |
339 | } | |
340 | ||
/**
 * Estimate and validate the per-CPT thread counts for \a svc, storing
 * the results in svc->srv_nthrs_cpt_limit (max threads per partition)
 * and svc->srv_nthrs_cpt_init (threads started initially).
 */
static void
ptlrpc_server_nthreads_check(struct ptlrpc_service *svc,
			     struct ptlrpc_service_conf *conf)
{
	struct ptlrpc_service_thr_conf *tc = &conf->psc_thr;
	unsigned init;
	unsigned total;
	unsigned nthrs;
	int weight;

	/*
	 * Common code for estimating & validating threads number.
	 * CPT affinity service could have percpt thread-pool instead
	 * of a global thread-pool, which means user might not always
	 * get the threads number they give it in conf::tc_nthrs_user
	 * even they did set. It's because we need to validate threads
	 * number for each CPT to guarantee each pool will have enough
	 * threads to keep the service healthy.
	 */
	init = PTLRPC_NTHRS_INIT + (svc->srv_ops.so_hpreq_handler != NULL);
	init = max_t(int, init, tc->tc_nthrs_init);

	/* NB: please see comments in lustre_lnet.h for definition
	 * details of these members
	 */
	LASSERT(tc->tc_nthrs_max != 0);

	if (tc->tc_nthrs_user != 0) {
		/* In case there is a reason to test a service with many
		 * threads, we give a less strict check here, it can
		 * be up to 8 * nthrs_max
		 */
		total = min(tc->tc_nthrs_max * 8, tc->tc_nthrs_user);
		nthrs = total / svc->srv_ncpts;
		init = max(init, nthrs);
		goto out;
	}

	total = tc->tc_nthrs_max;
	if (tc->tc_nthrs_base == 0) {
		/* don't care about base threads number per partition,
		 * this is most for non-affinity service
		 */
		nthrs = total / svc->srv_ncpts;
		goto out;
	}

	nthrs = tc->tc_nthrs_base;
	if (svc->srv_ncpts == 1) {
		int i;

		/* NB: Increase the base number if it's single partition
		 * and total number of cores/HTs is larger or equal to 4.
		 * result will always < 2 * nthrs_base
		 */
		weight = cfs_cpt_weight(svc->srv_cptable, CFS_CPT_ANY);
		for (i = 1; (weight >> (i + 1)) != 0 && /* >= 4 cores/HTs */
			    (tc->tc_nthrs_base >> i) != 0; i++)
			nthrs += tc->tc_nthrs_base >> i;
	}

	if (tc->tc_thr_factor != 0) {
		int factor = tc->tc_thr_factor;
		const int fade = 4;

		/*
		 * User wants to increase number of threads with for
		 * each CPU core/HT, most likely the factor is larger then
		 * one thread/core because service threads are supposed to
		 * be blocked by lock or wait for IO.
		 */
		/*
		 * Amdahl's law says that adding processors wouldn't give
		 * a linear increasing of parallelism, so it's nonsense to
		 * have too many threads no matter how many cores/HTs
		 * there are.
		 */
		/* weight is # of HTs */
		if (cpumask_weight(topology_sibling_cpumask(0)) > 1) {
			/* depress thread factor for hyper-thread */
			factor = factor - (factor >> 1) + (factor >> 3);
		}

		weight = cfs_cpt_weight(svc->srv_cptable, 0);
		LASSERT(weight > 0);

		/* each 'fade' cores contribute at a decreasing factor */
		for (; factor > 0 && weight > 0; factor--, weight -= fade)
			nthrs += min(weight, fade) * factor;
	}

	/* clamp so the sum over all partitions respects the soft max */
	if (nthrs * svc->srv_ncpts > tc->tc_nthrs_max) {
		nthrs = max(tc->tc_nthrs_base,
			    tc->tc_nthrs_max / svc->srv_ncpts);
	}
out:
	nthrs = max(nthrs, tc->tc_nthrs_init);
	svc->srv_nthrs_cpt_limit = nthrs;
	svc->srv_nthrs_cpt_init = init;

	if (nthrs * svc->srv_ncpts > tc->tc_nthrs_max) {
		CDEBUG(D_OTHER, "%s: This service may have more threads (%d) than the given soft limit (%d)\n",
		       svc->srv_name, nthrs * svc->srv_ncpts,
		       tc->tc_nthrs_max);
	}
}
446 | ||
/**
 * Initialize percpt data for a service
 *
 * Sets up the locks, lists and wait queues of \a svcpt, allocates the
 * adaptive-timeout (AT) arrays sized by at_est2timeout(at_max), arms
 * the AT timer, and pre-allocates the partition's request buffers.
 *
 * Returns 0 on success, -ENOMEM if any allocation fails (partially
 * allocated AT arrays are unwound via the goto labels below).
 */
static int
ptlrpc_service_part_init(struct ptlrpc_service *svc,
			 struct ptlrpc_service_part *svcpt, int cpt)
{
	struct ptlrpc_at_array *array;
	int size;
	int index;
	int rc;

	svcpt->scp_cpt = cpt;
	INIT_LIST_HEAD(&svcpt->scp_threads);

	/* rqbd and incoming request queue */
	spin_lock_init(&svcpt->scp_lock);
	INIT_LIST_HEAD(&svcpt->scp_rqbd_idle);
	INIT_LIST_HEAD(&svcpt->scp_rqbd_posted);
	INIT_LIST_HEAD(&svcpt->scp_req_incoming);
	init_waitqueue_head(&svcpt->scp_waitq);
	/* history request & rqbd list */
	INIT_LIST_HEAD(&svcpt->scp_hist_reqs);
	INIT_LIST_HEAD(&svcpt->scp_hist_rqbds);

	/* active requests and hp requests */
	spin_lock_init(&svcpt->scp_req_lock);

	/* reply states */
	spin_lock_init(&svcpt->scp_rep_lock);
	INIT_LIST_HEAD(&svcpt->scp_rep_active);
	INIT_LIST_HEAD(&svcpt->scp_rep_idle);
	init_waitqueue_head(&svcpt->scp_rep_waitq);
	atomic_set(&svcpt->scp_nreps_difficult, 0);

	/* adaptive timeout */
	spin_lock_init(&svcpt->scp_at_lock);
	array = &svcpt->scp_at_array;

	size = at_est2timeout(at_max);
	array->paa_size = size;
	array->paa_count = 0;
	array->paa_deadline = -1;

	/* allocate memory for scp_at_array (ptlrpc_at_array) */
	array->paa_reqs_array =
		kzalloc_node(sizeof(struct list_head) * size, GFP_NOFS,
			     cfs_cpt_spread_node(svc->srv_cptable, cpt));
	if (!array->paa_reqs_array)
		return -ENOMEM;

	for (index = 0; index < size; index++)
		INIT_LIST_HEAD(&array->paa_reqs_array[index]);

	array->paa_reqs_count =
		kzalloc_node(sizeof(__u32) * size, GFP_NOFS,
			     cfs_cpt_spread_node(svc->srv_cptable, cpt));
	if (!array->paa_reqs_count)
		goto free_reqs_array;

	/* timer fires ptlrpc_at_timer() with this partition as argument */
	setup_timer(&svcpt->scp_at_timer, ptlrpc_at_timer,
		    (unsigned long)svcpt);

	/* At SOW, service time should be quick; 10s seems generous. If client
	 * timeout is less than this, we'll be sending an early reply.
	 */
	at_init(&svcpt->scp_at_estimate, 10, 0);

	/* assign this before call ptlrpc_grow_req_bufs */
	svcpt->scp_service = svc;
	/* Now allocate the request buffers, but don't post them now */
	rc = ptlrpc_grow_req_bufs(svcpt, 0);
	/* We shouldn't be under memory pressure at startup, so
	 * fail if we can't allocate all our buffers at this time.
	 */
	if (rc != 0)
		goto free_reqs_count;

	return 0;

free_reqs_count:
	kfree(array->paa_reqs_count);
	array->paa_reqs_count = NULL;
free_reqs_array:
	kfree(array->paa_reqs_array);
	array->paa_reqs_array = NULL;

	return -ENOMEM;
}
536 | ||
/**
 * Initialize service on a given portal.
 * This includes starting serving threads , allocating and posting rqbds and
 * so on.
 *
 * \param conf		service configuration (buffers, threads, CPTs, ops)
 * \param parent	optional sysfs kset to register the service under
 * \param debugfs_entry	optional debugfs parent for service stats
 *
 * \retval the new service on success, ERR_PTR(-errno) on failure.  On
 * any failure after the service struct is allocated, cleanup is done by
 * ptlrpc_unregister_service(), which must cope with partial init.
 */
struct ptlrpc_service *
ptlrpc_register_service(struct ptlrpc_service_conf *conf,
			struct kset *parent,
			struct dentry *debugfs_entry)
{
	struct ptlrpc_service_cpt_conf *cconf = &conf->psc_cpt;
	struct ptlrpc_service *service;
	struct ptlrpc_service_part *svcpt;
	struct cfs_cpt_table *cptable;
	__u32 *cpts = NULL;
	int ncpts;
	int cpt;
	int rc;
	int i;

	LASSERT(conf->psc_buf.bc_nbufs > 0);
	LASSERT(conf->psc_buf.bc_buf_size >=
		conf->psc_buf.bc_req_max_size + SPTLRPC_MAX_PAYLOAD);
	LASSERT(conf->psc_thr.tc_ctx_tags != 0);

	cptable = cconf->cc_cptable;
	if (!cptable)
		cptable = cfs_cpt_table;

	/* decide how many partitions the service spans */
	if (!conf->psc_thr.tc_cpu_affinity) {
		ncpts = 1;
	} else {
		ncpts = cfs_cpt_number(cptable);
		if (cconf->cc_pattern) {
			struct cfs_expr_list *el;

			rc = cfs_expr_list_parse(cconf->cc_pattern,
						 strlen(cconf->cc_pattern),
						 0, ncpts - 1, &el);
			if (rc != 0) {
				CERROR("%s: invalid CPT pattern string: %s",
				       conf->psc_name, cconf->cc_pattern);
				return ERR_PTR(-EINVAL);
			}

			rc = cfs_expr_list_values(el, ncpts, &cpts);
			cfs_expr_list_free(el);
			if (rc <= 0) {
				CERROR("%s: failed to parse CPT array %s: %d\n",
				       conf->psc_name, cconf->cc_pattern, rc);
				kfree(cpts);
				return ERR_PTR(rc < 0 ? rc : -EINVAL);
			}
			ncpts = rc;
		}
	}

	/* service struct has a flexible srv_parts[] tail, one per CPT */
	service = kzalloc(offsetof(struct ptlrpc_service, srv_parts[ncpts]),
			  GFP_NOFS);
	if (!service) {
		kfree(cpts);
		return ERR_PTR(-ENOMEM);
	}

	service->srv_cptable = cptable;
	service->srv_cpts = cpts;
	service->srv_ncpts = ncpts;

	service->srv_cpt_bits = 0; /* it's zero already, easy to read... */
	while ((1 << service->srv_cpt_bits) < cfs_cpt_number(cptable))
		service->srv_cpt_bits++;

	/* public members */
	spin_lock_init(&service->srv_lock);
	service->srv_name = conf->psc_name;
	service->srv_watchdog_factor = conf->psc_watchdog_factor;
	INIT_LIST_HEAD(&service->srv_list); /* for safety of cleanup */

	/* buffer configuration */
	service->srv_nbuf_per_group = test_req_buffer_pressure ?
				      1 : conf->psc_buf.bc_nbufs;
	service->srv_max_req_size = conf->psc_buf.bc_req_max_size +
				    SPTLRPC_MAX_PAYLOAD;
	service->srv_buf_size = conf->psc_buf.bc_buf_size;
	service->srv_rep_portal = conf->psc_buf.bc_rep_portal;
	service->srv_req_portal = conf->psc_buf.bc_req_portal;

	/* Increase max reply size to next power of two */
	service->srv_max_reply_size = 1;
	while (service->srv_max_reply_size <
	       conf->psc_buf.bc_rep_max_size + SPTLRPC_MAX_PAYLOAD)
		service->srv_max_reply_size <<= 1;

	service->srv_thread_name = conf->psc_thr.tc_thr_name;
	service->srv_ctx_tags = conf->psc_thr.tc_ctx_tags;
	service->srv_hpreq_ratio = PTLRPC_SVC_HP_RATIO;
	service->srv_ops = conf->psc_ops;

	for (i = 0; i < ncpts; i++) {
		if (!conf->psc_thr.tc_cpu_affinity)
			cpt = CFS_CPT_ANY;
		else
			cpt = cpts ? cpts[i] : i;

		svcpt = kzalloc_node(sizeof(*svcpt), GFP_NOFS,
				     cfs_cpt_spread_node(cptable, cpt));
		if (!svcpt) {
			rc = -ENOMEM;
			goto failed;
		}

		service->srv_parts[i] = svcpt;
		rc = ptlrpc_service_part_init(service, svcpt, cpt);
		if (rc != 0)
			goto failed;
	}

	ptlrpc_server_nthreads_check(service, conf);

	rc = LNetSetLazyPortal(service->srv_req_portal);
	LASSERT(rc == 0);

	mutex_lock(&ptlrpc_all_services_mutex);
	list_add(&service->srv_list, &ptlrpc_all_services);
	mutex_unlock(&ptlrpc_all_services_mutex);

	if (parent) {
		rc = ptlrpc_sysfs_register_service(parent, service);
		if (rc)
			goto failed;
	}

	if (!IS_ERR_OR_NULL(debugfs_entry))
		ptlrpc_ldebugfs_register_service(debugfs_entry, service);

	rc = ptlrpc_service_nrs_setup(service);
	if (rc != 0)
		goto failed;

	CDEBUG(D_NET, "%s: Started, listening on portal %d\n",
	       service->srv_name, service->srv_req_portal);

	rc = ptlrpc_start_threads(service);
	if (rc != 0) {
		CERROR("Failed to start threads for service %s: %d\n",
		       service->srv_name, rc);
		goto failed;
	}

	return service;
failed:
	ptlrpc_unregister_service(service);
	return ERR_PTR(rc);
}
EXPORT_SYMBOL(ptlrpc_register_service);
692 | ||
693 | /** | |
694 | * to actually free the request, must be called without holding svc_lock. | |
695 | * note it's caller's responsibility to unlink req->rq_list. | |
696 | */ | |
697 | static void ptlrpc_server_free_request(struct ptlrpc_request *req) | |
698 | { | |
699 | LASSERT(atomic_read(&req->rq_refcount) == 0); | |
700 | LASSERT(list_empty(&req->rq_timed_list)); | |
701 | ||
702 | /* DEBUG_REQ() assumes the reply state of a request with a valid | |
dadfcdab OD |
703 | * ref will not be destroyed until that reference is dropped. |
704 | */ | |
d7e09d03 PT |
705 | ptlrpc_req_drop_rs(req); |
706 | ||
707 | sptlrpc_svc_ctx_decref(req); | |
708 | ||
709 | if (req != &req->rq_rqbd->rqbd_req) { | |
710 | /* NB request buffers use an embedded | |
711 | * req if the incoming req unlinked the | |
dadfcdab OD |
712 | * MD; this isn't one of them! |
713 | */ | |
35b2e1b7 | 714 | ptlrpc_request_cache_free(req); |
d7e09d03 PT |
715 | } |
716 | } | |
717 | ||
/**
 * drop a reference count of the request. if it reaches 0, we either
 * put it into history list, or free it immediately.
 *
 * When the last reference on the request's buffer descriptor is also
 * dropped, the descriptor moves to the history list and old history
 * buffers beyond srv_hist_nrqbds_cpt_max are culled: their requests are
 * unlinked from the history under scp_lock, freed with the lock dropped
 * (ptlrpc_server_free_request must not hold it), and the buffer is then
 * requeued on the idle list for re-use.
 */
static void ptlrpc_server_drop_request(struct ptlrpc_request *req)
{
	struct ptlrpc_request_buffer_desc *rqbd = req->rq_rqbd;
	struct ptlrpc_service_part *svcpt = rqbd->rqbd_svcpt;
	struct ptlrpc_service *svc = svcpt->scp_service;
	int refcount;
	struct list_head *tmp;
	struct list_head *nxt;

	if (!atomic_dec_and_test(&req->rq_refcount))
		return;

	if (req->rq_at_linked) {
		spin_lock(&svcpt->scp_at_lock);
		/* recheck with lock, in case it's unlinked by
		 * ptlrpc_at_check_timed()
		 */
		if (likely(req->rq_at_linked))
			ptlrpc_at_remove_timed(req);
		spin_unlock(&svcpt->scp_at_lock);
	}

	LASSERT(list_empty(&req->rq_timed_list));

	/* finalize request */
	if (req->rq_export) {
		class_export_put(req->rq_export);
		req->rq_export = NULL;
	}

	spin_lock(&svcpt->scp_lock);

	list_add(&req->rq_list, &rqbd->rqbd_reqs);

	refcount = --(rqbd->rqbd_refcount);
	if (refcount == 0) {
		/* request buffer is now idle: add to history */
		list_del(&rqbd->rqbd_list);

		list_add_tail(&rqbd->rqbd_list, &svcpt->scp_hist_rqbds);
		svcpt->scp_hist_nrqbds++;

		/* cull some history?
		 * I expect only about 1 or 2 rqbds need to be recycled here
		 */
		while (svcpt->scp_hist_nrqbds > svc->srv_hist_nrqbds_cpt_max) {
			rqbd = list_entry(svcpt->scp_hist_rqbds.next,
					  struct ptlrpc_request_buffer_desc,
					  rqbd_list);

			list_del(&rqbd->rqbd_list);
			svcpt->scp_hist_nrqbds--;

			/* remove rqbd's reqs from svc's req history while
			 * I've got the service lock
			 */
			list_for_each(tmp, &rqbd->rqbd_reqs) {
				req = list_entry(tmp, struct ptlrpc_request,
						 rq_list);
				/* Track the highest culled req seq */
				if (req->rq_history_seq >
				    svcpt->scp_hist_seq_culled) {
					svcpt->scp_hist_seq_culled =
						req->rq_history_seq;
				}
				list_del(&req->rq_history_list);
			}

			/* drop the lock to free requests; nothing else can
			 * touch this rqbd now it is off all shared lists */
			spin_unlock(&svcpt->scp_lock);

			list_for_each_safe(tmp, nxt, &rqbd->rqbd_reqs) {
				req = list_entry(rqbd->rqbd_reqs.next,
						 struct ptlrpc_request,
						 rq_list);
				list_del(&req->rq_list);
				ptlrpc_server_free_request(req);
			}

			spin_lock(&svcpt->scp_lock);
			/*
			 * now all reqs including the embedded req has been
			 * disposed, schedule request buffer for re-use.
			 */
			LASSERT(atomic_read(&rqbd->rqbd_req.rq_refcount) ==
				0);
			list_add_tail(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle);
		}

		spin_unlock(&svcpt->scp_lock);
	} else if (req->rq_reply_state && req->rq_reply_state->rs_prealloc) {
		/* If we are low on memory, we are not interested in history */
		list_del(&req->rq_list);
		list_del_init(&req->rq_history_list);

		/* Track the highest culled req seq */
		if (req->rq_history_seq > svcpt->scp_hist_seq_culled)
			svcpt->scp_hist_seq_culled = req->rq_history_seq;

		spin_unlock(&svcpt->scp_lock);

		ptlrpc_server_free_request(req);
	} else {
		spin_unlock(&svcpt->scp_lock);
	}
}
827 | ||
d7e09d03 PT |
828 | /** |
829 | * to finish a request: stop sending more early replies, and release | |
830 | * the request. | |
831 | */ | |
832 | static void ptlrpc_server_finish_request(struct ptlrpc_service_part *svcpt, | |
833 | struct ptlrpc_request *req) | |
834 | { | |
835 | ptlrpc_server_hpreq_fini(req); | |
836 | ||
70187506 MP |
837 | if (req->rq_session.lc_thread) { |
838 | lu_context_exit(&req->rq_session); | |
839 | lu_context_fini(&req->rq_session); | |
840 | } | |
841 | ||
d7e09d03 PT |
842 | ptlrpc_server_drop_request(req); |
843 | } | |
844 | ||
/**
 * to finish a active request: stop sending more early replies, and release
 * the request. should be called after we finished handling the request.
 *
 * Active (and high-priority) counters are updated under scp_req_lock;
 * the export RPC count is dropped before the request itself is released.
 */
static void ptlrpc_server_finish_active_request(
					struct ptlrpc_service_part *svcpt,
					struct ptlrpc_request *req)
{
	spin_lock(&svcpt->scp_req_lock);
	ptlrpc_nrs_req_stop_nolock(req);
	svcpt->scp_nreqs_active--;
	if (req->rq_hp)
		svcpt->scp_nhreqs_active--;
	spin_unlock(&svcpt->scp_req_lock);

	ptlrpc_nrs_req_finalize(req);

	if (req->rq_export)
		class_export_rpc_dec(req->rq_export);

	ptlrpc_server_finish_request(svcpt, req);
}
867 | ||
d7e09d03 PT |
868 | /** |
869 | * Sanity check request \a req. | |
870 | * Return 0 if all is ok, error code otherwise. | |
871 | */ | |
872 | static int ptlrpc_check_req(struct ptlrpc_request *req) | |
873 | { | |
f60d7c39 | 874 | struct obd_device *obd = req->rq_export->exp_obd; |
d7e09d03 PT |
875 | int rc = 0; |
876 | ||
877 | if (unlikely(lustre_msg_get_conn_cnt(req->rq_reqmsg) < | |
878 | req->rq_export->exp_conn_cnt)) { | |
879 | DEBUG_REQ(D_RPCTRACE, req, | |
880 | "DROPPING req from old connection %d < %d", | |
881 | lustre_msg_get_conn_cnt(req->rq_reqmsg), | |
882 | req->rq_export->exp_conn_cnt); | |
883 | return -EEXIST; | |
884 | } | |
8b382089 | 885 | if (unlikely(!obd || obd->obd_fail)) { |
532118c0 KM |
886 | /* |
887 | * Failing over, don't handle any more reqs, send | |
888 | * error response instead. | |
889 | */ | |
d7e09d03 | 890 | CDEBUG(D_RPCTRACE, "Dropping req %p for failed obd %s\n", |
8b382089 | 891 | req, obd ? obd->obd_name : "unknown"); |
d7e09d03 PT |
892 | rc = -ENODEV; |
893 | } else if (lustre_msg_get_flags(req->rq_reqmsg) & | |
af3ec53b OD |
894 | (MSG_REPLAY | MSG_REQ_REPLAY_DONE)) { |
895 | DEBUG_REQ(D_ERROR, req, "Invalid replay without recovery"); | |
896 | class_fail_export(req->rq_export); | |
897 | rc = -ENODEV; | |
898 | } else if (lustre_msg_get_transno(req->rq_reqmsg) != 0) { | |
899 | DEBUG_REQ(D_ERROR, req, | |
900 | "Invalid req with transno %llu without recovery", | |
901 | lustre_msg_get_transno(req->rq_reqmsg)); | |
902 | class_fail_export(req->rq_export); | |
903 | rc = -ENODEV; | |
d7e09d03 PT |
904 | } |
905 | ||
906 | if (unlikely(rc < 0)) { | |
907 | req->rq_status = rc; | |
908 | ptlrpc_error(req); | |
909 | } | |
910 | return rc; | |
911 | } | |
912 | ||
913 | static void ptlrpc_at_set_timer(struct ptlrpc_service_part *svcpt) | |
914 | { | |
915 | struct ptlrpc_at_array *array = &svcpt->scp_at_array; | |
916 | __s32 next; | |
917 | ||
918 | if (array->paa_count == 0) { | |
922da0c5 | 919 | del_timer(&svcpt->scp_at_timer); |
d7e09d03 PT |
920 | return; |
921 | } | |
922 | ||
923 | /* Set timer for closest deadline */ | |
219e6de6 | 924 | next = (__s32)(array->paa_deadline - ktime_get_real_seconds() - |
d7e09d03 PT |
925 | at_early_margin); |
926 | if (next <= 0) { | |
927 | ptlrpc_at_timer((unsigned long)svcpt); | |
928 | } else { | |
922da0c5 | 929 | mod_timer(&svcpt->scp_at_timer, cfs_time_shift(next)); |
d7e09d03 PT |
930 | CDEBUG(D_INFO, "armed %s at %+ds\n", |
931 | svcpt->scp_service->srv_name, next); | |
932 | } | |
933 | } | |
934 | ||
935 | /* Add rpc to early reply check list */ | |
936 | static int ptlrpc_at_add_timed(struct ptlrpc_request *req) | |
937 | { | |
938 | struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; | |
939 | struct ptlrpc_at_array *array = &svcpt->scp_at_array; | |
940 | struct ptlrpc_request *rq = NULL; | |
941 | __u32 index; | |
942 | ||
943 | if (AT_OFF) | |
fbe7c6c7 | 944 | return 0; |
d7e09d03 PT |
945 | |
946 | if (req->rq_no_reply) | |
947 | return 0; | |
948 | ||
949 | if ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) == 0) | |
fbe7c6c7 | 950 | return -ENOSYS; |
d7e09d03 PT |
951 | |
952 | spin_lock(&svcpt->scp_at_lock); | |
953 | LASSERT(list_empty(&req->rq_timed_list)); | |
954 | ||
219e6de6 | 955 | div_u64_rem(req->rq_deadline, array->paa_size, &index); |
d7e09d03 PT |
956 | if (array->paa_reqs_count[index] > 0) { |
957 | /* latest rpcs will have the latest deadlines in the list, | |
dadfcdab OD |
958 | * so search backward. |
959 | */ | |
30c0aa39 OD |
960 | list_for_each_entry_reverse(rq, &array->paa_reqs_array[index], |
961 | rq_timed_list) { | |
d7e09d03 PT |
962 | if (req->rq_deadline >= rq->rq_deadline) { |
963 | list_add(&req->rq_timed_list, | |
30c0aa39 | 964 | &rq->rq_timed_list); |
d7e09d03 PT |
965 | break; |
966 | } | |
967 | } | |
968 | } | |
969 | ||
970 | /* Add the request at the head of the list */ | |
971 | if (list_empty(&req->rq_timed_list)) | |
30c0aa39 | 972 | list_add(&req->rq_timed_list, &array->paa_reqs_array[index]); |
d7e09d03 PT |
973 | |
974 | spin_lock(&req->rq_lock); | |
975 | req->rq_at_linked = 1; | |
976 | spin_unlock(&req->rq_lock); | |
977 | req->rq_at_index = index; | |
978 | array->paa_reqs_count[index]++; | |
979 | array->paa_count++; | |
980 | if (array->paa_count == 1 || array->paa_deadline > req->rq_deadline) { | |
981 | array->paa_deadline = req->rq_deadline; | |
982 | ptlrpc_at_set_timer(svcpt); | |
983 | } | |
984 | spin_unlock(&svcpt->scp_at_lock); | |
985 | ||
986 | return 0; | |
987 | } | |
988 | ||
989 | static void | |
990 | ptlrpc_at_remove_timed(struct ptlrpc_request *req) | |
991 | { | |
992 | struct ptlrpc_at_array *array; | |
993 | ||
994 | array = &req->rq_rqbd->rqbd_svcpt->scp_at_array; | |
995 | ||
996 | /* NB: must call with hold svcpt::scp_at_lock */ | |
997 | LASSERT(!list_empty(&req->rq_timed_list)); | |
998 | list_del_init(&req->rq_timed_list); | |
999 | ||
1000 | spin_lock(&req->rq_lock); | |
1001 | req->rq_at_linked = 0; | |
1002 | spin_unlock(&req->rq_lock); | |
1003 | ||
1004 | array->paa_reqs_count[req->rq_at_index]--; | |
1005 | array->paa_count--; | |
1006 | } | |
1007 | ||
31c5e95e CH |
1008 | /* |
1009 | * Attempt to extend the request deadline by sending an early reply to the | |
1010 | * client. | |
1011 | */ | |
d7e09d03 PT |
1012 | static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req) |
1013 | { | |
1014 | struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; | |
1015 | struct ptlrpc_request *reqcopy; | |
1016 | struct lustre_msg *reqmsg; | |
219e6de6 | 1017 | long olddl = req->rq_deadline - ktime_get_real_seconds(); |
d7e09d03 | 1018 | int rc; |
d7e09d03 PT |
1019 | |
1020 | /* deadline is when the client expects us to reply, margin is the | |
dadfcdab OD |
1021 | * difference between clients' and servers' expectations |
1022 | */ | |
d7e09d03 | 1023 | DEBUG_REQ(D_ADAPTTO, req, |
2d00bd17 JP |
1024 | "%ssending early reply (deadline %+lds, margin %+lds) for %d+%d", |
1025 | AT_OFF ? "AT off - not " : "", | |
d7e09d03 PT |
1026 | olddl, olddl - at_get(&svcpt->scp_at_estimate), |
1027 | at_get(&svcpt->scp_at_estimate), at_extra); | |
1028 | ||
1029 | if (AT_OFF) | |
0a3bdb00 | 1030 | return 0; |
d7e09d03 PT |
1031 | |
1032 | if (olddl < 0) { | |
2d00bd17 JP |
1033 | DEBUG_REQ(D_WARNING, req, "Already past deadline (%+lds), not sending early reply. Consider increasing at_early_margin (%d)?", |
1034 | olddl, at_early_margin); | |
d7e09d03 PT |
1035 | |
1036 | /* Return an error so we're not re-added to the timed list. */ | |
0a3bdb00 | 1037 | return -ETIMEDOUT; |
d7e09d03 PT |
1038 | } |
1039 | ||
cb68dd2d | 1040 | if (!(lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) { |
2d00bd17 | 1041 | DEBUG_REQ(D_INFO, req, "Wanted to ask client for more time, but no AT support"); |
0a3bdb00 | 1042 | return -ENOSYS; |
d7e09d03 PT |
1043 | } |
1044 | ||
31c5e95e CH |
1045 | /* |
1046 | * We want to extend the request deadline by at_extra seconds, | |
1047 | * so we set our service estimate to reflect how much time has | |
1048 | * passed since this request arrived plus an additional | |
1049 | * at_extra seconds. The client will calculate the new deadline | |
1050 | * based on this service estimate (plus some additional time to | |
1051 | * account for network latency). See ptlrpc_at_recv_early_reply | |
dadfcdab | 1052 | */ |
af3ec53b OD |
1053 | at_measured(&svcpt->scp_at_estimate, at_extra + |
1054 | ktime_get_real_seconds() - req->rq_arrival_time.tv_sec); | |
1055 | ||
1056 | /* Check to see if we've actually increased the deadline - | |
dadfcdab OD |
1057 | * we may be past adaptive_max |
1058 | */ | |
af3ec53b OD |
1059 | if (req->rq_deadline >= req->rq_arrival_time.tv_sec + |
1060 | at_get(&svcpt->scp_at_estimate)) { | |
1061 | DEBUG_REQ(D_WARNING, req, "Couldn't add any time (%ld/%lld), not sending early reply\n", | |
1062 | olddl, req->rq_arrival_time.tv_sec + | |
1063 | at_get(&svcpt->scp_at_estimate) - | |
1064 | ktime_get_real_seconds()); | |
1065 | return -ETIMEDOUT; | |
d7e09d03 | 1066 | } |
d7e09d03 | 1067 | |
0be19afa | 1068 | reqcopy = ptlrpc_request_cache_alloc(GFP_NOFS); |
8b382089 | 1069 | if (!reqcopy) |
0a3bdb00 | 1070 | return -ENOMEM; |
ee0ec194 | 1071 | reqmsg = libcfs_kvzalloc(req->rq_reqlen, GFP_NOFS); |
a9b3e8f3 JL |
1072 | if (!reqmsg) { |
1073 | rc = -ENOMEM; | |
1074 | goto out_free; | |
1075 | } | |
d7e09d03 PT |
1076 | |
1077 | *reqcopy = *req; | |
1078 | reqcopy->rq_reply_state = NULL; | |
1079 | reqcopy->rq_rep_swab_mask = 0; | |
1080 | reqcopy->rq_pack_bulk = 0; | |
1081 | reqcopy->rq_pack_udesc = 0; | |
1082 | reqcopy->rq_packed_final = 0; | |
1083 | sptlrpc_svc_ctx_addref(reqcopy); | |
1084 | /* We only need the reqmsg for the magic */ | |
1085 | reqcopy->rq_reqmsg = reqmsg; | |
1086 | memcpy(reqmsg, req->rq_reqmsg, req->rq_reqlen); | |
1087 | ||
1088 | LASSERT(atomic_read(&req->rq_refcount)); | |
1089 | /** if it is last refcount then early reply isn't needed */ | |
1090 | if (atomic_read(&req->rq_refcount) == 1) { | |
2d00bd17 | 1091 | DEBUG_REQ(D_ADAPTTO, reqcopy, "Normal reply already sent out, abort sending early reply\n"); |
a9b3e8f3 JL |
1092 | rc = -EINVAL; |
1093 | goto out; | |
d7e09d03 PT |
1094 | } |
1095 | ||
1096 | /* Connection ref */ | |
1097 | reqcopy->rq_export = class_conn2export( | |
1098 | lustre_msg_get_handle(reqcopy->rq_reqmsg)); | |
8b382089 | 1099 | if (!reqcopy->rq_export) { |
a9b3e8f3 JL |
1100 | rc = -ENODEV; |
1101 | goto out; | |
1102 | } | |
d7e09d03 PT |
1103 | |
1104 | /* RPC ref */ | |
1105 | class_export_rpc_inc(reqcopy->rq_export); | |
1106 | if (reqcopy->rq_export->exp_obd && | |
a9b3e8f3 JL |
1107 | reqcopy->rq_export->exp_obd->obd_fail) { |
1108 | rc = -ENODEV; | |
1109 | goto out_put; | |
1110 | } | |
d7e09d03 PT |
1111 | |
1112 | rc = lustre_pack_reply_flags(reqcopy, 1, NULL, NULL, LPRFL_EARLY_REPLY); | |
1113 | if (rc) | |
a9b3e8f3 | 1114 | goto out_put; |
d7e09d03 PT |
1115 | |
1116 | rc = ptlrpc_send_reply(reqcopy, PTLRPC_REPLY_EARLY); | |
1117 | ||
1118 | if (!rc) { | |
1119 | /* Adjust our own deadline to what we told the client */ | |
31c5e95e CH |
1120 | req->rq_deadline = req->rq_arrival_time.tv_sec + |
1121 | at_get(&svcpt->scp_at_estimate); | |
d7e09d03 PT |
1122 | req->rq_early_count++; /* number sent, server side */ |
1123 | } else { | |
1124 | DEBUG_REQ(D_ERROR, req, "Early reply send failed %d", rc); | |
1125 | } | |
1126 | ||
1127 | /* Free the (early) reply state from lustre_pack_reply. | |
dadfcdab OD |
1128 | * (ptlrpc_send_reply takes it's own rs ref, so this is safe here) |
1129 | */ | |
d7e09d03 PT |
1130 | ptlrpc_req_drop_rs(reqcopy); |
1131 | ||
1132 | out_put: | |
1133 | class_export_rpc_dec(reqcopy->rq_export); | |
1134 | class_export_put(reqcopy->rq_export); | |
1135 | out: | |
1136 | sptlrpc_svc_ctx_decref(reqcopy); | |
ee0ec194 | 1137 | kvfree(reqmsg); |
35b2e1b7 AS |
1138 | out_free: |
1139 | ptlrpc_request_cache_free(reqcopy); | |
0a3bdb00 | 1140 | return rc; |
d7e09d03 PT |
1141 | } |
1142 | ||
/* Send early replies to everybody expiring within at_early_margin
 * asking for at_extra time
 *
 * Runs from the AT timer (via ptlrpc_at_timer) or a service thread.
 * Phase 1 (under scp_at_lock): scan the deadline hash, unlink every
 * request expiring within at_early_margin onto a private work_list.
 * Phase 2 (lockless): send early replies from the work_list; each
 * entry is pinned by an extra refcount taken in phase 1.
 */
static void ptlrpc_at_check_timed(struct ptlrpc_service_part *svcpt)
{
	struct ptlrpc_at_array *array = &svcpt->scp_at_array;
	struct ptlrpc_request *rq, *n;
	struct list_head work_list;
	__u32 index, count;
	time64_t deadline;
	time64_t now = ktime_get_real_seconds();
	long delay;
	int first, counter = 0;

	spin_lock(&svcpt->scp_at_lock);
	/* nothing signalled the timer check since last run */
	if (svcpt->scp_at_check == 0) {
		spin_unlock(&svcpt->scp_at_lock);
		return;
	}
	/* how long after the intended check time we actually got here */
	delay = cfs_time_sub(cfs_time_current(), svcpt->scp_at_checktime);
	svcpt->scp_at_check = 0;

	if (array->paa_count == 0) {
		spin_unlock(&svcpt->scp_at_lock);
		return;
	}

	/* The timer went off, but maybe the nearest rpc already completed. */
	first = array->paa_deadline - now;
	if (first > at_early_margin) {
		/* We've still got plenty of time. Reset the timer. */
		ptlrpc_at_set_timer(svcpt);
		spin_unlock(&svcpt->scp_at_lock);
		return;
	}

	/* We're close to a timeout, and we don't know how much longer the
	 * server will take. Send early replies to everyone expiring soon.
	 */
	INIT_LIST_HEAD(&work_list);
	deadline = -1;
	div_u64_rem(array->paa_deadline, array->paa_size, &index);
	count = array->paa_count;
	while (count > 0) {
		count -= array->paa_reqs_count[index];
		list_for_each_entry_safe(rq, n, &array->paa_reqs_array[index],
					 rq_timed_list) {
			if (rq->rq_deadline > now + at_early_margin) {
				/* update the earliest deadline */
				if (deadline == -1 ||
				    rq->rq_deadline < deadline)
					deadline = rq->rq_deadline;
				/* per-slot lists are deadline-ordered, so
				 * the rest of this slot is even later
				 */
				break;
			}

			ptlrpc_at_remove_timed(rq);
			/**
			 * ptlrpc_server_drop_request() may drop
			 * refcount to 0 already. Let's check this and
			 * don't add entry to work_list
			 */
			if (likely(atomic_inc_not_zero(&rq->rq_refcount)))
				list_add(&rq->rq_timed_list, &work_list);
			counter++;
		}

		if (++index >= array->paa_size)
			index = 0;
	}
	array->paa_deadline = deadline;
	/* we have a new earliest deadline, restart the timer */
	ptlrpc_at_set_timer(svcpt);

	spin_unlock(&svcpt->scp_at_lock);

	CDEBUG(D_ADAPTTO, "timeout in %+ds, asking for %d secs on %d early replies\n",
	       first, at_extra, counter);
	if (first < 0) {
		/* We're already past request deadlines before we even get a
		 * chance to send early replies
		 */
		LCONSOLE_WARN("%s: This server is not able to keep up with request traffic (cpu-bound).\n",
			      svcpt->scp_service->srv_name);
		CWARN("earlyQ=%d reqQ=%d recA=%d, svcEst=%d, delay=%ld(jiff)\n",
		      counter, svcpt->scp_nreqs_incoming,
		      svcpt->scp_nreqs_active,
		      at_get(&svcpt->scp_at_estimate), delay);
	}

	/* we took additional refcount so entries can't be deleted from list, no
	 * locking is needed
	 */
	while (!list_empty(&work_list)) {
		rq = list_entry(work_list.next, struct ptlrpc_request,
				rq_timed_list);
		list_del_init(&rq->rq_timed_list);

		/* a successful early reply re-queues the request for the
		 * next (extended) deadline
		 */
		if (ptlrpc_at_send_early_reply(rq) == 0)
			ptlrpc_at_add_timed(rq);

		/* drop the refcount taken in the scan above */
		ptlrpc_server_drop_request(rq);
	}
}
1246 | ||
1247 | /** | |
1248 | * Put the request to the export list if the request may become | |
1249 | * a high priority one. | |
1250 | */ | |
1251 | static int ptlrpc_server_hpreq_init(struct ptlrpc_service_part *svcpt, | |
1252 | struct ptlrpc_request *req) | |
1253 | { | |
1254 | int rc = 0; | |
d7e09d03 PT |
1255 | |
1256 | if (svcpt->scp_service->srv_ops.so_hpreq_handler) { | |
1257 | rc = svcpt->scp_service->srv_ops.so_hpreq_handler(req); | |
1258 | if (rc < 0) | |
0a3bdb00 | 1259 | return rc; |
d7e09d03 PT |
1260 | LASSERT(rc == 0); |
1261 | } | |
1262 | if (req->rq_export && req->rq_ops) { | |
1263 | /* Perform request specific check. We should do this check | |
1264 | * before the request is added into exp_hp_rpcs list otherwise | |
dadfcdab OD |
1265 | * it may hit swab race at LU-1044. |
1266 | */ | |
d7e09d03 PT |
1267 | if (req->rq_ops->hpreq_check) { |
1268 | rc = req->rq_ops->hpreq_check(req); | |
1269 | /** | |
1270 | * XXX: Out of all current | |
1271 | * ptlrpc_hpreq_ops::hpreq_check(), only | |
1272 | * ldlm_cancel_hpreq_check() can return an error code; | |
1273 | * other functions assert in similar places, which seems | |
1274 | * odd. What also does not seem right is that handlers | |
1275 | * for those RPCs do not assert on the same checks, but | |
1276 | * rather handle the error cases. e.g. see | |
1277 | * ost_rw_hpreq_check(), and ost_brw_read(), | |
1278 | * ost_brw_write(). | |
1279 | */ | |
1280 | if (rc < 0) | |
0a3bdb00 | 1281 | return rc; |
d7e09d03 PT |
1282 | LASSERT(rc == 0 || rc == 1); |
1283 | } | |
1284 | ||
1285 | spin_lock_bh(&req->rq_export->exp_rpc_lock); | |
30c0aa39 | 1286 | list_add(&req->rq_exp_list, &req->rq_export->exp_hp_rpcs); |
d7e09d03 PT |
1287 | spin_unlock_bh(&req->rq_export->exp_rpc_lock); |
1288 | } | |
1289 | ||
1290 | ptlrpc_nrs_req_initialize(svcpt, req, rc); | |
1291 | ||
0a3bdb00 | 1292 | return rc; |
d7e09d03 PT |
1293 | } |
1294 | ||
1295 | /** Remove the request from the export list. */ | |
1296 | static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req) | |
1297 | { | |
d7e09d03 PT |
1298 | if (req->rq_export && req->rq_ops) { |
1299 | /* refresh lock timeout again so that client has more | |
dadfcdab OD |
1300 | * room to send lock cancel RPC. |
1301 | */ | |
d7e09d03 PT |
1302 | if (req->rq_ops->hpreq_fini) |
1303 | req->rq_ops->hpreq_fini(req); | |
1304 | ||
1305 | spin_lock_bh(&req->rq_export->exp_rpc_lock); | |
1306 | list_del_init(&req->rq_exp_list); | |
1307 | spin_unlock_bh(&req->rq_export->exp_rpc_lock); | |
1308 | } | |
d7e09d03 PT |
1309 | } |
1310 | ||
d7e09d03 PT |
/* Classify an incoming request (normal vs high priority) and hand it to
 * the NRS queues. Returns 0 on success, negative errno on failure.
 */
static int ptlrpc_server_request_add(struct ptlrpc_service_part *svcpt,
				     struct ptlrpc_request *req)
{
	int hp;

	hp = ptlrpc_server_hpreq_init(svcpt, req);
	if (hp < 0)
		return hp;

	ptlrpc_nrs_req_add(svcpt, req, !!hp);

	return 0;
}
1324 | ||
1325 | /** | |
1326 | * Allow to handle high priority request | |
1327 | * User can call it w/o any lock but need to hold | |
1328 | * ptlrpc_service_part::scp_req_lock to get reliable result | |
1329 | */ | |
1330 | static bool ptlrpc_server_allow_high(struct ptlrpc_service_part *svcpt, | |
1331 | bool force) | |
1332 | { | |
1333 | int running = svcpt->scp_nthrs_running; | |
1334 | ||
1335 | if (!nrs_svcpt_has_hp(svcpt)) | |
1336 | return false; | |
1337 | ||
1338 | if (force) | |
1339 | return true; | |
1340 | ||
1341 | if (unlikely(svcpt->scp_service->srv_req_portal == MDS_REQUEST_PORTAL && | |
1342 | CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND))) { | |
1343 | /* leave just 1 thread for normal RPCs */ | |
1344 | running = PTLRPC_NTHRS_INIT; | |
8b382089 | 1345 | if (svcpt->scp_service->srv_ops.so_hpreq_handler) |
d7e09d03 PT |
1346 | running += 1; |
1347 | } | |
1348 | ||
1349 | if (svcpt->scp_nreqs_active >= running - 1) | |
1350 | return false; | |
1351 | ||
1352 | if (svcpt->scp_nhreqs_active == 0) | |
1353 | return true; | |
1354 | ||
1355 | return !ptlrpc_nrs_req_pending_nolock(svcpt, false) || | |
1356 | svcpt->scp_hreq_count < svcpt->scp_service->srv_hpreq_ratio; | |
1357 | } | |
1358 | ||
1359 | static bool ptlrpc_server_high_pending(struct ptlrpc_service_part *svcpt, | |
1360 | bool force) | |
1361 | { | |
1362 | return ptlrpc_server_allow_high(svcpt, force) && | |
1363 | ptlrpc_nrs_req_pending_nolock(svcpt, true); | |
1364 | } | |
1365 | ||
1366 | /** | |
1367 | * Only allow normal priority requests on a service that has a high-priority | |
1368 | * queue if forced (i.e. cleanup), if there are other high priority requests | |
1369 | * already being processed (i.e. those threads can service more high-priority | |
1370 | * requests), or if there are enough idle threads that a later thread can do | |
1371 | * a high priority request. | |
1372 | * User can call it w/o any lock but need to hold | |
1373 | * ptlrpc_service_part::scp_req_lock to get reliable result | |
1374 | */ | |
1375 | static bool ptlrpc_server_allow_normal(struct ptlrpc_service_part *svcpt, | |
1376 | bool force) | |
1377 | { | |
1378 | int running = svcpt->scp_nthrs_running; | |
50ffcb7e | 1379 | |
d7e09d03 PT |
1380 | if (unlikely(svcpt->scp_service->srv_req_portal == MDS_REQUEST_PORTAL && |
1381 | CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND))) { | |
1382 | /* leave just 1 thread for normal RPCs */ | |
1383 | running = PTLRPC_NTHRS_INIT; | |
8b382089 | 1384 | if (svcpt->scp_service->srv_ops.so_hpreq_handler) |
d7e09d03 PT |
1385 | running += 1; |
1386 | } | |
1387 | ||
1388 | if (force || | |
1389 | svcpt->scp_nreqs_active < running - 2) | |
1390 | return true; | |
1391 | ||
1392 | if (svcpt->scp_nreqs_active >= running - 1) | |
1393 | return false; | |
1394 | ||
1395 | return svcpt->scp_nhreqs_active > 0 || !nrs_svcpt_has_hp(svcpt); | |
1396 | } | |
1397 | ||
1398 | static bool ptlrpc_server_normal_pending(struct ptlrpc_service_part *svcpt, | |
1399 | bool force) | |
1400 | { | |
1401 | return ptlrpc_server_allow_normal(svcpt, force) && | |
1402 | ptlrpc_nrs_req_pending_nolock(svcpt, false); | |
1403 | } | |
1404 | ||
1405 | /** | |
1406 | * Returns true if there are requests available in incoming | |
1407 | * request queue for processing and it is allowed to fetch them. | |
1408 | * User can call it w/o any lock but need to hold ptlrpc_service::scp_req_lock | |
1409 | * to get reliable result | |
1410 | * \see ptlrpc_server_allow_normal | |
1411 | * \see ptlrpc_server_allow high | |
1412 | */ | |
1413 | static inline bool | |
1414 | ptlrpc_server_request_pending(struct ptlrpc_service_part *svcpt, bool force) | |
1415 | { | |
1416 | return ptlrpc_server_high_pending(svcpt, force) || | |
1417 | ptlrpc_server_normal_pending(svcpt, force); | |
1418 | } | |
1419 | ||
1420 | /** | |
1421 | * Fetch a request for processing from queue of unprocessed requests. | |
1422 | * Favors high-priority requests. | |
1423 | * Returns a pointer to fetched request. | |
1424 | */ | |
1425 | static struct ptlrpc_request * | |
1426 | ptlrpc_server_request_get(struct ptlrpc_service_part *svcpt, bool force) | |
1427 | { | |
1428 | struct ptlrpc_request *req = NULL; | |
d7e09d03 PT |
1429 | |
1430 | spin_lock(&svcpt->scp_req_lock); | |
1431 | ||
1432 | if (ptlrpc_server_high_pending(svcpt, force)) { | |
1433 | req = ptlrpc_nrs_req_get_nolock(svcpt, true, force); | |
8b382089 | 1434 | if (req) { |
d7e09d03 PT |
1435 | svcpt->scp_hreq_count++; |
1436 | goto got_request; | |
1437 | } | |
1438 | } | |
1439 | ||
1440 | if (ptlrpc_server_normal_pending(svcpt, force)) { | |
1441 | req = ptlrpc_nrs_req_get_nolock(svcpt, false, force); | |
8b382089 | 1442 | if (req) { |
d7e09d03 PT |
1443 | svcpt->scp_hreq_count = 0; |
1444 | goto got_request; | |
1445 | } | |
1446 | } | |
1447 | ||
1448 | spin_unlock(&svcpt->scp_req_lock); | |
0a3bdb00 | 1449 | return NULL; |
d7e09d03 PT |
1450 | |
1451 | got_request: | |
1452 | svcpt->scp_nreqs_active++; | |
1453 | if (req->rq_hp) | |
1454 | svcpt->scp_nhreqs_active++; | |
1455 | ||
1456 | spin_unlock(&svcpt->scp_req_lock); | |
1457 | ||
1458 | if (likely(req->rq_export)) | |
1459 | class_export_rpc_inc(req->rq_export); | |
1460 | ||
0a3bdb00 | 1461 | return req; |
d7e09d03 PT |
1462 | } |
1463 | ||
/**
 * Handle freshly incoming reqs, add to timed early reply list,
 * pass on to regular request queue.
 * All incoming requests pass through here before getting into
 * ptlrpc_server_handle_req later on.
 *
 * Returns 0 when the incoming queue was empty, 1 when one request was
 * consumed (even if it was dropped on an error path - err_req releases it).
 */
static int
ptlrpc_server_handle_req_in(struct ptlrpc_service_part *svcpt,
			    struct ptlrpc_thread *thread)
{
	struct ptlrpc_service *svc = svcpt->scp_service;
	struct ptlrpc_request *req;
	__u32 deadline;
	int rc;

	spin_lock(&svcpt->scp_lock);
	if (list_empty(&svcpt->scp_req_incoming)) {
		spin_unlock(&svcpt->scp_lock);
		return 0;
	}

	/* take the oldest incoming request */
	req = list_entry(svcpt->scp_req_incoming.next,
			 struct ptlrpc_request, rq_list);
	list_del_init(&req->rq_list);
	svcpt->scp_nreqs_incoming--;
	/* Consider this still a "queued" request as far as stats are
	 * concerned
	 */
	spin_unlock(&svcpt->scp_lock);

	/* go through security check/transform */
	rc = sptlrpc_svc_unwrap_request(req);
	switch (rc) {
	case SECSVC_OK:
		break;
	case SECSVC_COMPLETE:
		/* sptlrpc already produced the full reply; just send it */
		target_send_reply(req, 0, OBD_FAIL_MDS_ALL_REPLY_NET);
		goto err_req;
	case SECSVC_DROP:
		goto err_req;
	default:
		LBUG();
	}

	/*
	 * for null-flavored rpc, msg has been unpacked by sptlrpc, although
	 * redo it wouldn't be harmful.
	 */
	if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) {
		rc = ptlrpc_unpack_req_msg(req, req->rq_reqlen);
		if (rc != 0) {
			CERROR("error unpacking request: ptl %d from %s x%llu\n",
			       svc->srv_req_portal, libcfs_id2str(req->rq_peer),
			       req->rq_xid);
			goto err_req;
		}
	}

	rc = lustre_unpack_req_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF);
	if (rc) {
		CERROR("error unpacking ptlrpc body: ptl %d from %s x%llu\n",
		       svc->srv_req_portal, libcfs_id2str(req->rq_peer),
		       req->rq_xid);
		goto err_req;
	}

	/* fault-injection hook: drop requests of a configured opcode */
	if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_REQ_OPC) &&
	    lustre_msg_get_opc(req->rq_reqmsg) == cfs_fail_val) {
		CERROR("drop incoming rpc opc %u, x%llu\n",
		       cfs_fail_val, req->rq_xid);
		goto err_req;
	}

	rc = -EINVAL;
	if (lustre_msg_get_type(req->rq_reqmsg) != PTL_RPC_MSG_REQUEST) {
		CERROR("wrong packet type received (type=%u) from %s\n",
		       lustre_msg_get_type(req->rq_reqmsg),
		       libcfs_id2str(req->rq_peer));
		goto err_req;
	}

	/* flag bulk direction so the request handler sets up transfers */
	switch (lustre_msg_get_opc(req->rq_reqmsg)) {
	case MDS_WRITEPAGE:
	case OST_WRITE:
		req->rq_bulk_write = 1;
		break;
	case MDS_READPAGE:
	case OST_READ:
	case MGS_CONFIG_READ:
		req->rq_bulk_read = 1;
		break;
	}

	CDEBUG(D_RPCTRACE, "got req x%llu\n", req->rq_xid);

	req->rq_export = class_conn2export(
		lustre_msg_get_handle(req->rq_reqmsg));
	if (req->rq_export) {
		rc = ptlrpc_check_req(req);
		if (rc == 0) {
			rc = sptlrpc_target_export_check(req->rq_export, req);
			if (rc)
				DEBUG_REQ(D_ERROR, req, "DROPPING req with illegal security flavor,");
		}

		if (rc)
			goto err_req;
	}

	/* req_in handling should/must be fast */
	if (ktime_get_real_seconds() - req->rq_arrival_time.tv_sec > 5)
		DEBUG_REQ(D_WARNING, req, "Slow req_in handling "CFS_DURATION_T"s",
			  (long)(ktime_get_real_seconds() -
				 req->rq_arrival_time.tv_sec));

	/* Set rpc server deadline and add it to the timed list */
	deadline = (lustre_msghdr_get_flags(req->rq_reqmsg) &
		    MSGHDR_AT_SUPPORT) ?
		   /* The max time the client expects us to take */
		   lustre_msg_get_timeout(req->rq_reqmsg) : obd_timeout;
	req->rq_deadline = req->rq_arrival_time.tv_sec + deadline;
	if (unlikely(deadline == 0)) {
		DEBUG_REQ(D_ERROR, req, "Dropping request with 0 timeout");
		goto err_req;
	}

	req->rq_svc_thread = thread;
	if (thread) {
		/* initialize request session, it is needed for request
		 * processing by target
		 */
		rc = lu_context_init(&req->rq_session,
				     LCT_SERVER_SESSION | LCT_NOREF);
		if (rc) {
			CERROR("%s: failure to initialize session: rc = %d\n",
			       thread->t_name, rc);
			goto err_req;
		}
		req->rq_session.lc_thread = thread;
		lu_context_enter(&req->rq_session);
		req->rq_svc_thread->t_env->le_ses = &req->rq_session;
	}

	ptlrpc_at_add_timed(req);

	/* Move it over to the request processing queue */
	rc = ptlrpc_server_request_add(svcpt, req);
	if (rc)
		goto err_req;

	wake_up(&svcpt->scp_waitq);
	return 1;

err_req:
	/* releases the request (and exits the session when one was entered) */
	ptlrpc_server_finish_request(svcpt, req);

	return 1;
}
1622 | ||
/**
 * Main incoming request handling logic.
 * Calls handler function from service to do actual processing.
 *
 * Pulls one request from \a svcpt's queue, runs the service's
 * so_req_handler() on it inside \a thread's session context, and records
 * per-request timing statistics around the call.  Requests that sat in the
 * queue past their deadline are dropped without being handled.
 *
 * \retval 0 no request was available on this service partition
 * \retval 1 one request was consumed (handled or discarded)
 */
static int
ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt,
			     struct ptlrpc_thread *thread)
{
	struct ptlrpc_service *svc = svcpt->scp_service;
	struct ptlrpc_request *request;
	struct timespec64 work_start;
	struct timespec64 work_end;
	struct timespec64 timediff;
	struct timespec64 arrived;
	unsigned long timediff_usecs;
	unsigned long arrived_usecs;
	int fail_opc = 0;

	request = ptlrpc_server_request_get(svcpt, false);
	if (!request)
		return 0;

	/* fault injection: if one of the HPREQ fail_locs is armed, delay
	 * handling (via OBD_FAIL_TIMEOUT below) for exported HP requests
	 */
	if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT))
		fail_opc = OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT;
	else if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT))
		fail_opc = OBD_FAIL_PTLRPC_HPREQ_TIMEOUT;

	if (unlikely(fail_opc)) {
		if (request->rq_export && request->rq_ops)
			OBD_FAIL_TIMEOUT(fail_opc, 4);
	}

	ptlrpc_rqphase_move(request, RQ_PHASE_INTERPRET);

	if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DUMP_LOG))
		libcfs_debug_dumplog();

	/* account how long the request waited in the queue */
	ktime_get_real_ts64(&work_start);
	timediff = timespec64_sub(work_start, request->rq_arrival_time);
	timediff_usecs = timediff.tv_sec * USEC_PER_SEC +
			 timediff.tv_nsec / NSEC_PER_USEC;
	if (likely(svc->srv_stats)) {
		lprocfs_counter_add(svc->srv_stats, PTLRPC_REQWAIT_CNTR,
				    timediff_usecs);
		lprocfs_counter_add(svc->srv_stats, PTLRPC_REQQDEPTH_CNTR,
				    svcpt->scp_nreqs_incoming);
		lprocfs_counter_add(svc->srv_stats, PTLRPC_REQACTIVE_CNTR,
				    svcpt->scp_nreqs_active);
		lprocfs_counter_add(svc->srv_stats, PTLRPC_TIMEOUT,
				    at_get(&svcpt->scp_at_estimate));
	}

	if (likely(request->rq_export)) {
		if (unlikely(ptlrpc_check_req(request)))
			goto put_conn;
	}

	/* Discard requests queued for longer than the deadline.
	 * The deadline is increased if we send an early reply.
	 */
	if (ktime_get_real_seconds() > request->rq_deadline) {
		DEBUG_REQ(D_ERROR, request, "Dropping timed-out request from %s: deadline " CFS_DURATION_T ":" CFS_DURATION_T "s ago\n",
			  libcfs_id2str(request->rq_peer),
			  (long)(request->rq_deadline -
				 request->rq_arrival_time.tv_sec),
			  (long)(ktime_get_real_seconds() -
				 request->rq_deadline));
		goto put_conn;
	}

	CDEBUG(D_RPCTRACE, "Handling RPC pname:cluuid+ref:pid:xid:nid:opc %s:%s+%d:%d:x%llu:%s:%d\n",
	       current_comm(),
	       (request->rq_export ?
		(char *)request->rq_export->exp_client_uuid.uuid : "0"),
	       (request->rq_export ?
		atomic_read(&request->rq_export->exp_refcount) : -99),
	       lustre_msg_get_status(request->rq_reqmsg), request->rq_xid,
	       libcfs_id2str(request->rq_peer),
	       lustre_msg_get_opc(request->rq_reqmsg));

	if (lustre_msg_get_opc(request->rq_reqmsg) != OBD_PING)
		CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_PAUSE_REQ, cfs_fail_val);

	CDEBUG(D_NET, "got req %llu\n", request->rq_xid);

	/* re-assign request and session thread to the current one */
	request->rq_svc_thread = thread;
	if (thread) {
		LASSERT(request->rq_session.lc_thread);
		request->rq_session.lc_thread = thread;
		request->rq_session.lc_cookie = 0x55;
		thread->t_env->le_ses = &request->rq_session;
	}
	svc->srv_ops.so_req_handler(request);

	ptlrpc_rqphase_move(request, RQ_PHASE_COMPLETE);

put_conn:
	/* warn if handling overran the (possibly early-reply-extended)
	 * deadline; the client may already have given up on us
	 */
	if (unlikely(ktime_get_real_seconds() > request->rq_deadline)) {
		DEBUG_REQ(D_WARNING, request,
			  "Request took longer than estimated (%lld:%llds); "
			  "client may timeout.",
			  (s64)request->rq_deadline -
			       request->rq_arrival_time.tv_sec,
			  (s64)ktime_get_real_seconds() -
			       request->rq_deadline);
	}

	/* handling time vs total time since arrival, for the trace log and
	 * the per-opcode stats below
	 */
	ktime_get_real_ts64(&work_end);
	timediff = timespec64_sub(work_end, work_start);
	timediff_usecs = timediff.tv_sec * USEC_PER_SEC +
			 timediff.tv_nsec / NSEC_PER_USEC;
	arrived = timespec64_sub(work_end, request->rq_arrival_time);
	arrived_usecs = arrived.tv_sec * USEC_PER_SEC +
			arrived.tv_nsec / NSEC_PER_USEC;
	CDEBUG(D_RPCTRACE, "Handled RPC pname:cluuid+ref:pid:xid:nid:opc %s:%s+%d:%d:x%llu:%s:%d Request processed in %ldus (%ldus total) trans %llu rc %d/%d\n",
	       current_comm(),
	       (request->rq_export ?
		(char *)request->rq_export->exp_client_uuid.uuid : "0"),
	       (request->rq_export ?
		atomic_read(&request->rq_export->exp_refcount) : -99),
	       lustre_msg_get_status(request->rq_reqmsg),
	       request->rq_xid,
	       libcfs_id2str(request->rq_peer),
	       lustre_msg_get_opc(request->rq_reqmsg),
	       timediff_usecs,
	       arrived_usecs,
	       (request->rq_repmsg ?
		lustre_msg_get_transno(request->rq_repmsg) :
		request->rq_transno),
	       request->rq_status,
	       (request->rq_repmsg ?
		lustre_msg_get_status(request->rq_repmsg) : -999));
	if (likely(svc->srv_stats && request->rq_reqmsg)) {
		__u32 op = lustre_msg_get_opc(request->rq_reqmsg);
		int opc = opcode_offset(op);

		/* LDLM_ENQUEUE and MDS_REINT are accounted elsewhere */
		if (opc > 0 && !(op == LDLM_ENQUEUE || op == MDS_REINT)) {
			LASSERT(opc < LUSTRE_MAX_OPCODES);
			lprocfs_counter_add(svc->srv_stats,
					    opc + EXTRA_MAX_OPCODES,
					    timediff_usecs);
		}
	}
	if (unlikely(request->rq_early_count)) {
		DEBUG_REQ(D_ADAPTTO, request,
			  "sent %d early replies before finishing in %llds",
			  request->rq_early_count,
			  (s64)work_end.tv_sec -
			       request->rq_arrival_time.tv_sec);
	}

	ptlrpc_server_finish_active_request(svcpt, request);

	return 1;
}
1778 | ||
/**
 * An internal function to process a single reply state object.
 *
 * Handles one "difficult" (rs_difficult) reply: removes it from the
 * export's uncommitted list unless the commit callback already did,
 * releases the LDLM locks pinned for the reply, and unlinks the reply
 * MD from the network if the reply is still on the net and was never
 * handled before.  Always returns 1.
 */
static int
ptlrpc_handle_rs(struct ptlrpc_reply_state *rs)
{
	struct ptlrpc_service_part *svcpt = rs->rs_svcpt;
	struct ptlrpc_service *svc = svcpt->scp_service;
	struct obd_export *exp;
	int nlocks;
	int been_handled;

	exp = rs->rs_export;

	LASSERT(rs->rs_difficult);
	LASSERT(rs->rs_scheduled);
	LASSERT(list_empty(&rs->rs_list));

	spin_lock(&exp->exp_lock);
	/* Noop if removed already */
	list_del_init(&rs->rs_exp_list);
	spin_unlock(&exp->exp_lock);

	/* The disk commit callback holds exp_uncommitted_replies_lock while it
	 * iterates over newly committed replies, removing them from
	 * exp_uncommitted_replies. It then drops this lock and schedules the
	 * replies it found for handling here.
	 *
	 * We can avoid contention for exp_uncommitted_replies_lock between the
	 * HRT threads and further commit callbacks by checking rs_committed
	 * which is set in the commit callback while it holds both
	 * rs_lock and exp_uncommitted_replies_lock.
	 *
	 * If we see rs_committed clear, the commit callback _may_ not have
	 * handled this reply yet and we race with it to grab
	 * exp_uncommitted_replies_lock before removing the reply from
	 * exp_uncommitted_replies. Note that if we lose the race and the
	 * reply has already been removed, list_del_init() is a noop.
	 *
	 * If we see rs_committed set, we know the commit callback is handling,
	 * or has handled this reply since store reordering might allow us to
	 * see rs_committed set out of sequence. But since this is done
	 * holding rs_lock, we can be sure it has all completed once we hold
	 * rs_lock, which we do right next.
	 */
	if (!rs->rs_committed) {
		spin_lock(&exp->exp_uncommitted_replies_lock);
		list_del_init(&rs->rs_obd_list);
		spin_unlock(&exp->exp_uncommitted_replies_lock);
	}

	spin_lock(&rs->rs_lock);

	been_handled = rs->rs_handled;
	rs->rs_handled = 1;

	nlocks = rs->rs_nlocks; /* atomic "steal", but */
	rs->rs_nlocks = 0; /* locks still on rs_locks! */

	if (nlocks == 0 && !been_handled) {
		/* If we see this, we should already have seen the warning
		 * in mds_steal_ack_locks()
		 */
		CDEBUG(D_HA, "All locks stolen from rs %p x%lld.t%lld o%d NID %s\n",
		       rs,
		       rs->rs_xid, rs->rs_transno, rs->rs_opc,
		       libcfs_nid2str(exp->exp_connection->c_peer.nid));
	}

	/* drop rs_lock while touching the network and the LDLM; both the
	 * unlink and the decrefs can block
	 */
	if ((!been_handled && rs->rs_on_net) || nlocks > 0) {
		spin_unlock(&rs->rs_lock);

		if (!been_handled && rs->rs_on_net) {
			LNetMDUnlink(rs->rs_md_h);
			/* Ignore return code; we're racing with completion */
		}

		while (nlocks-- > 0)
			ldlm_lock_decref(&rs->rs_locks[nlocks],
					 rs->rs_modes[nlocks]);

		spin_lock(&rs->rs_lock);
	}

	rs->rs_scheduled = 0;

	if (!rs->rs_on_net) {
		/* Off the net */
		spin_unlock(&rs->rs_lock);

		class_export_put(exp);
		rs->rs_export = NULL;
		ptlrpc_rs_decref(rs);
		if (atomic_dec_and_test(&svcpt->scp_nreps_difficult) &&
		    svc->srv_is_stopping)
			wake_up_all(&svcpt->scp_waitq);
		return 1;
	}

	/* still on the net; callback will schedule */
	spin_unlock(&rs->rs_lock);
	return 1;
}
1882 | ||
d7e09d03 PT |
1883 | static void |
1884 | ptlrpc_check_rqbd_pool(struct ptlrpc_service_part *svcpt) | |
1885 | { | |
1886 | int avail = svcpt->scp_nrqbds_posted; | |
1887 | int low_water = test_req_buffer_pressure ? 0 : | |
1888 | svcpt->scp_service->srv_nbuf_per_group / 2; | |
1889 | ||
1890 | /* NB I'm not locking; just looking. */ | |
1891 | ||
1892 | /* CAVEAT EMPTOR: We might be allocating buffers here because we've | |
1893 | * allowed the request history to grow out of control. We could put a | |
1894 | * sanity check on that here and cull some history if we need the | |
dadfcdab OD |
1895 | * space. |
1896 | */ | |
d7e09d03 PT |
1897 | |
1898 | if (avail <= low_water) | |
1899 | ptlrpc_grow_req_bufs(svcpt, 1); | |
1900 | ||
1901 | if (svcpt->scp_service->srv_stats) { | |
1902 | lprocfs_counter_add(svcpt->scp_service->srv_stats, | |
1903 | PTLRPC_REQBUF_AVAIL_CNTR, avail); | |
1904 | } | |
1905 | } | |
1906 | ||
/*
 * l_wait_event timeout callback (see LWI_TIMEOUT use in ptlrpc_wait_event()):
 * clear the rqbd back-off timer so buffer reposting is retried immediately,
 * and return -ETIMEDOUT to abort the wait.
 */
static int
ptlrpc_retry_rqbds(void *arg)
{
	struct ptlrpc_service_part *svcpt = arg;

	svcpt->scp_rqbd_timeout = 0;
	return -ETIMEDOUT;
}
1915 | ||
1916 | static inline int | |
1917 | ptlrpc_threads_enough(struct ptlrpc_service_part *svcpt) | |
1918 | { | |
1919 | return svcpt->scp_nreqs_active < | |
1920 | svcpt->scp_nthrs_running - 1 - | |
1921 | (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL); | |
1922 | } | |
1923 | ||
1924 | /** | |
1925 | * allowed to create more threads | |
1926 | * user can call it w/o any lock but need to hold | |
1927 | * ptlrpc_service_part::scp_lock to get reliable result | |
1928 | */ | |
1929 | static inline int | |
1930 | ptlrpc_threads_increasable(struct ptlrpc_service_part *svcpt) | |
1931 | { | |
1932 | return svcpt->scp_nthrs_running + | |
1933 | svcpt->scp_nthrs_starting < | |
1934 | svcpt->scp_service->srv_nthrs_cpt_limit; | |
1935 | } | |
1936 | ||
/**
 * too many requests and allowed to create more threads
 *
 * NB: both operands are pure reads, so the evaluation order does not
 * matter; the cheaper limit check is done first.
 */
static inline int
ptlrpc_threads_need_create(struct ptlrpc_service_part *svcpt)
{
	return ptlrpc_threads_increasable(svcpt) &&
	       !ptlrpc_threads_enough(svcpt);
}
1946 | ||
1947 | static inline int | |
1948 | ptlrpc_thread_stopping(struct ptlrpc_thread *thread) | |
1949 | { | |
1950 | return thread_is_stopping(thread) || | |
1951 | thread->t_svcpt->scp_service->srv_is_stopping; | |
1952 | } | |
1953 | ||
1954 | static inline int | |
1955 | ptlrpc_rqbd_pending(struct ptlrpc_service_part *svcpt) | |
1956 | { | |
1957 | return !list_empty(&svcpt->scp_rqbd_idle) && | |
1958 | svcpt->scp_rqbd_timeout == 0; | |
1959 | } | |
1960 | ||
/* returns the raw scp_at_check flag: adaptive-timeout work is due */
static inline int
ptlrpc_at_check(struct ptlrpc_service_part *svcpt)
{
	return svcpt->scp_at_check;
}
1966 | ||
1967 | /** | |
1968 | * requests wait on preprocessing | |
1969 | * user can call it w/o any lock but need to hold | |
1970 | * ptlrpc_service_part::scp_lock to get reliable result | |
1971 | */ | |
1972 | static inline int | |
1973 | ptlrpc_server_request_incoming(struct ptlrpc_service_part *svcpt) | |
1974 | { | |
1975 | return !list_empty(&svcpt->scp_req_incoming); | |
1976 | } | |
1977 | ||
/*
 * Block the service thread until there is work to do or it is told to
 * stop.  Wakes on: thread stop, incoming requests, pending requests,
 * repostable request buffers, or due adaptive-timeout work.  The wait
 * times out after scp_rqbd_timeout jiffies (see ptlrpc_retry_rqbds()),
 * so failed buffer reposts are retried.
 *
 * \retval 0      there is (or may be) work to do
 * \retval -EINTR the thread should exit
 */
static __attribute__((__noinline__)) int
ptlrpc_wait_event(struct ptlrpc_service_part *svcpt,
		  struct ptlrpc_thread *thread)
{
	/* Don't exit while there are replies to be handled */
	struct l_wait_info lwi = LWI_TIMEOUT(svcpt->scp_rqbd_timeout,
					     ptlrpc_retry_rqbds, svcpt);

	/* XXX: Add this back when libcfs watchdog is merged upstream
	lc_watchdog_disable(thread->t_watchdog);
	*/

	cond_resched();

	l_wait_event_exclusive_head(svcpt->scp_waitq,
				    ptlrpc_thread_stopping(thread) ||
				    ptlrpc_server_request_incoming(svcpt) ||
				    ptlrpc_server_request_pending(svcpt,
								  false) ||
				    ptlrpc_rqbd_pending(svcpt) ||
				    ptlrpc_at_check(svcpt), &lwi);

	if (ptlrpc_thread_stopping(thread))
		return -EINTR;

	/*
	lc_watchdog_touch(thread->t_watchdog,
			  ptlrpc_server_get_timeout(svcpt));
	*/
	return 0;
}
2009 | ||
/**
 * Main thread body for service threads.
 * Waits in a loop waiting for new requests to process to appear.
 * Every time an incoming requests is added to its queue, a waitq
 * is woken up and one of the threads will handle it.
 *
 * \a arg is the ptlrpc_thread this kthread embodies.  The return value
 * (also stored into thread->t_id on exit) is 0 or the setup error code.
 */
static int ptlrpc_main(void *arg)
{
	struct ptlrpc_thread *thread = arg;
	struct ptlrpc_service_part *svcpt = thread->t_svcpt;
	struct ptlrpc_service *svc = svcpt->scp_service;
	struct ptlrpc_reply_state *rs;
	struct group_info *ginfo = NULL;
	struct lu_env *env;
	int counter = 0, rc = 0;

	thread->t_pid = current_pid();
	unshare_fs_struct();

	/* NB: we will call cfs_cpt_bind() for all threads, because we
	 * might want to run lustre server only on a subset of system CPUs,
	 * in that case ->scp_cpt is CFS_CPT_ANY
	 */
	rc = cfs_cpt_bind(svc->srv_cptable, svcpt->scp_cpt);
	if (rc != 0) {
		CWARN("%s: failed to bind %s on CPT %d\n",
		      svc->srv_name, thread->t_name, svcpt->scp_cpt);
	}

	/* drop supplementary groups inherited from the parent */
	ginfo = groups_alloc(0);
	if (!ginfo) {
		rc = -ENOMEM;
		goto out;
	}

	set_current_groups(ginfo);
	put_group_info(ginfo);

	/* service-specific per-thread setup, if the service defines one */
	if (svc->srv_ops.so_thr_init) {
		rc = svc->srv_ops.so_thr_init(thread);
		if (rc)
			goto out;
	}

	env = kzalloc(sizeof(*env), GFP_NOFS);
	if (!env) {
		rc = -ENOMEM;
		goto out_srv_fini;
	}

	rc = lu_context_init(&env->le_ctx,
			     svc->srv_ctx_tags | LCT_REMEMBER | LCT_NOREF);
	if (rc)
		goto out_srv_fini;

	thread->t_env = env;
	env->le_ctx.lc_thread = thread;
	env->le_ctx.lc_cookie = 0x6;

	/* repost any idle request buffers before declaring ourselves up */
	while (!list_empty(&svcpt->scp_rqbd_idle)) {
		rc = ptlrpc_server_post_idle_rqbds(svcpt);
		if (rc >= 0)
			continue;

		CERROR("Failed to post rqbd for %s on CPT %d: %d\n",
		       svc->srv_name, svcpt->scp_cpt, rc);
		goto out_srv_fini;
	}

	/* Alloc reply state structure for this one */
	rs = libcfs_kvzalloc(svc->srv_max_reply_size, GFP_NOFS);
	if (!rs) {
		rc = -ENOMEM;
		goto out_srv_fini;
	}

	spin_lock(&svcpt->scp_lock);

	LASSERT(thread_is_starting(thread));
	thread_clear_flags(thread, SVC_STARTING);

	LASSERT(svcpt->scp_nthrs_starting == 1);
	svcpt->scp_nthrs_starting--;

	/* SVC_STOPPING may already be set here if someone else is trying
	 * to stop the service while this new thread has been dynamically
	 * forked. We still set SVC_RUNNING to let our creator know that
	 * we are now running, however we will exit as soon as possible
	 */
	thread_add_flags(thread, SVC_RUNNING);
	svcpt->scp_nthrs_running++;
	spin_unlock(&svcpt->scp_lock);

	/* wake up our creator in case he's still waiting. */
	wake_up(&thread->t_ctl_waitq);

	/*
	thread->t_watchdog = lc_watchdog_add(ptlrpc_server_get_timeout(svcpt),
					     NULL, NULL);
	*/

	/* donate the pre-allocated reply state to the partition's idle pool */
	spin_lock(&svcpt->scp_rep_lock);
	list_add(&rs->rs_list, &svcpt->scp_rep_idle);
	wake_up(&svcpt->scp_rep_waitq);
	spin_unlock(&svcpt->scp_rep_lock);

	CDEBUG(D_NET, "service thread %d (#%d) started\n", thread->t_id,
	       svcpt->scp_nthrs_running);

	/* XXX maintain a list of all managed devices: insert here */
	while (!ptlrpc_thread_stopping(thread)) {
		if (ptlrpc_wait_event(svcpt, thread))
			break;

		ptlrpc_check_rqbd_pool(svcpt);

		if (ptlrpc_threads_need_create(svcpt)) {
			/* Ignore return code - we tried... */
			ptlrpc_start_thread(svcpt, 0);
		}

		/* Process all incoming reqs before handling any */
		if (ptlrpc_server_request_incoming(svcpt)) {
			lu_context_enter(&env->le_ctx);
			env->le_ses = NULL;
			ptlrpc_server_handle_req_in(svcpt, thread);
			lu_context_exit(&env->le_ctx);

			/* but limit ourselves in case of flood */
			if (counter++ < 100)
				continue;
			counter = 0;
		}

		if (ptlrpc_at_check(svcpt))
			ptlrpc_at_check_timed(svcpt);

		if (ptlrpc_server_request_pending(svcpt, false)) {
			lu_context_enter(&env->le_ctx);
			ptlrpc_server_handle_request(svcpt, thread);
			lu_context_exit(&env->le_ctx);
		}

		if (ptlrpc_rqbd_pending(svcpt) &&
		    ptlrpc_server_post_idle_rqbds(svcpt) < 0) {
			/* I just failed to repost request buffers.
			 * Wait for a timeout (unless something else
			 * happens) before I try again
			 */
			svcpt->scp_rqbd_timeout = cfs_time_seconds(1) / 10;
			CDEBUG(D_RPCTRACE, "Posted buffers: %d\n",
			       svcpt->scp_nrqbds_posted);
		}
	}

	/*
	lc_watchdog_delete(thread->t_watchdog);
	thread->t_watchdog = NULL;
	*/

out_srv_fini:
	/*
	 * deconstruct service specific state created by ptlrpc_start_thread()
	 */
	if (svc->srv_ops.so_thr_done)
		svc->srv_ops.so_thr_done(thread);

	if (env) {
		lu_context_fini(&env->le_ctx);
		kfree(env);
	}
out:
	CDEBUG(D_RPCTRACE, "service thread [ %p : %u ] %d exiting: rc %d\n",
	       thread, thread->t_pid, thread->t_id, rc);

	spin_lock(&svcpt->scp_lock);
	if (thread_test_and_clear_flags(thread, SVC_STARTING))
		svcpt->scp_nthrs_starting--;

	if (thread_test_and_clear_flags(thread, SVC_RUNNING)) {
		/* must know immediately */
		svcpt->scp_nthrs_running--;
	}

	/* report the exit code to ptlrpc_start_thread() via t_id */
	thread->t_id = rc;
	thread_add_flags(thread, SVC_STOPPED);

	wake_up(&thread->t_ctl_waitq);
	spin_unlock(&svcpt->scp_lock);

	return rc;
}
2202 | ||
2203 | static int hrt_dont_sleep(struct ptlrpc_hr_thread *hrt, | |
2204 | struct list_head *replies) | |
2205 | { | |
2206 | int result; | |
2207 | ||
2208 | spin_lock(&hrt->hrt_lock); | |
2209 | ||
2210 | list_splice_init(&hrt->hrt_queue, replies); | |
2211 | result = ptlrpc_hr.hr_stopping || !list_empty(replies); | |
2212 | ||
2213 | spin_unlock(&hrt->hrt_lock); | |
2214 | return result; | |
2215 | } | |
2216 | ||
/**
 * Main body of "handle reply" function.
 * It processes acked reply states
 *
 * Binds itself to its partition's CPT, then loops draining its reply
 * queue through ptlrpc_handle_rs() until ptlrpc_hr.hr_stopping is set.
 * Start/stop are signalled to the controller via hrp_nstarted /
 * hrp_nstopped and ptlrpc_hr.hr_waitq.
 */
static int ptlrpc_hr_main(void *arg)
{
	struct ptlrpc_hr_thread *hrt = arg;
	struct ptlrpc_hr_partition *hrp = hrt->hrt_partition;
	LIST_HEAD(replies);
	char threadname[20];
	int rc;

	snprintf(threadname, sizeof(threadname), "ptlrpc_hr%02d_%03d",
		 hrp->hrp_cpt, hrt->hrt_id);
	unshare_fs_struct();

	rc = cfs_cpt_bind(ptlrpc_hr.hr_cpt_table, hrp->hrp_cpt);
	if (rc != 0) {
		CWARN("Failed to bind %s on CPT %d of CPT table %p: rc = %d\n",
		      threadname, hrp->hrp_cpt, ptlrpc_hr.hr_cpt_table, rc);
	}

	atomic_inc(&hrp->hrp_nstarted);
	wake_up(&ptlrpc_hr.hr_waitq);

	while (!ptlrpc_hr.hr_stopping) {
		l_wait_condition(hrt->hrt_waitq, hrt_dont_sleep(hrt, &replies));

		/* process oldest first: take from the tail of the list */
		while (!list_empty(&replies)) {
			struct ptlrpc_reply_state *rs;

			rs = list_entry(replies.prev, struct ptlrpc_reply_state,
					rs_list);
			list_del_init(&rs->rs_list);
			ptlrpc_handle_rs(rs);
		}
	}

	atomic_inc(&hrp->hrp_nstopped);
	wake_up(&ptlrpc_hr.hr_waitq);

	return 0;
}
2260 | ||
/*
 * Stop all reply-handling threads: set the global stop flag, wake every
 * thread, then wait (per partition) until as many threads have stopped
 * as ever started.  The wake and wait loops are separate so all threads
 * shut down in parallel.
 */
static void ptlrpc_stop_hr_threads(void)
{
	struct ptlrpc_hr_partition *hrp;
	int i;
	int j;

	ptlrpc_hr.hr_stopping = 1;

	cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) {
		if (!hrp->hrp_thrs)
			continue; /* uninitialized */
		for (j = 0; j < hrp->hrp_nthrs; j++)
			wake_up_all(&hrp->hrp_thrs[j].hrt_waitq);
	}

	cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) {
		if (!hrp->hrp_thrs)
			continue; /* uninitialized */
		wait_event(ptlrpc_hr.hr_waitq,
			   atomic_read(&hrp->hrp_nstopped) ==
			   atomic_read(&hrp->hrp_nstarted));
	}
}
2284 | ||
/*
 * Start hrp_nthrs reply-handling kthreads on every CPT partition.
 * On any kthread_run() failure the loop stops, already-started threads
 * are torn down via ptlrpc_stop_hr_threads(), and the error is returned.
 *
 * \retval 0 on success, negative errno on failure
 */
static int ptlrpc_start_hr_threads(void)
{
	struct ptlrpc_hr_partition *hrp;
	int i;
	int j;

	cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) {
		int rc = 0;

		for (j = 0; j < hrp->hrp_nthrs; j++) {
			struct ptlrpc_hr_thread *hrt = &hrp->hrp_thrs[j];
			struct task_struct *task;

			task = kthread_run(ptlrpc_hr_main,
					   &hrp->hrp_thrs[j],
					   "ptlrpc_hr%02d_%03d",
					   hrp->hrp_cpt, hrt->hrt_id);
			if (IS_ERR(task)) {
				rc = PTR_ERR(task);
				break;
			}
		}
		/* j threads were launched; wait for all of them to check in */
		wait_event(ptlrpc_hr.hr_waitq,
			   atomic_read(&hrp->hrp_nstarted) == j);

		if (rc < 0) {
			CERROR("cannot start reply handler thread %d:%d: rc = %d\n",
			       i, j, rc);
			ptlrpc_stop_hr_threads();
			return rc;
		}
	}
	return 0;
}
2319 | ||
2320 | static void ptlrpc_svcpt_stop_threads(struct ptlrpc_service_part *svcpt) | |
2321 | { | |
d0bfef31 CH |
2322 | struct l_wait_info lwi = { 0 }; |
2323 | struct ptlrpc_thread *thread; | |
24721b29 | 2324 | LIST_HEAD(zombie); |
d7e09d03 | 2325 | |
d7e09d03 PT |
2326 | CDEBUG(D_INFO, "Stopping threads for service %s\n", |
2327 | svcpt->scp_service->srv_name); | |
2328 | ||
2329 | spin_lock(&svcpt->scp_lock); | |
2330 | /* let the thread know that we would like it to stop asap */ | |
2331 | list_for_each_entry(thread, &svcpt->scp_threads, t_link) { | |
2332 | CDEBUG(D_INFO, "Stopping thread %s #%u\n", | |
2333 | svcpt->scp_service->srv_thread_name, thread->t_id); | |
2334 | thread_add_flags(thread, SVC_STOPPING); | |
2335 | } | |
2336 | ||
2337 | wake_up_all(&svcpt->scp_waitq); | |
2338 | ||
2339 | while (!list_empty(&svcpt->scp_threads)) { | |
2340 | thread = list_entry(svcpt->scp_threads.next, | |
30c0aa39 | 2341 | struct ptlrpc_thread, t_link); |
d7e09d03 PT |
2342 | if (thread_is_stopped(thread)) { |
2343 | list_del(&thread->t_link); | |
2344 | list_add(&thread->t_link, &zombie); | |
2345 | continue; | |
2346 | } | |
2347 | spin_unlock(&svcpt->scp_lock); | |
2348 | ||
2349 | CDEBUG(D_INFO, "waiting for stopping-thread %s #%u\n", | |
2350 | svcpt->scp_service->srv_thread_name, thread->t_id); | |
2351 | l_wait_event(thread->t_ctl_waitq, | |
2352 | thread_is_stopped(thread), &lwi); | |
2353 | ||
2354 | spin_lock(&svcpt->scp_lock); | |
2355 | } | |
2356 | ||
2357 | spin_unlock(&svcpt->scp_lock); | |
2358 | ||
2359 | while (!list_empty(&zombie)) { | |
2360 | thread = list_entry(zombie.next, | |
24c198e9 | 2361 | struct ptlrpc_thread, t_link); |
d7e09d03 | 2362 | list_del(&thread->t_link); |
9ae10597 | 2363 | kfree(thread); |
d7e09d03 | 2364 | } |
d7e09d03 PT |
2365 | } |
2366 | ||
2367 | /** | |
2368 | * Stops all threads of a particular service \a svc | |
2369 | */ | |
230a8da1 | 2370 | static void ptlrpc_stop_all_threads(struct ptlrpc_service *svc) |
d7e09d03 PT |
2371 | { |
2372 | struct ptlrpc_service_part *svcpt; | |
d0bfef31 | 2373 | int i; |
d7e09d03 PT |
2374 | |
2375 | ptlrpc_service_for_each_part(svcpt, i, svc) { | |
8b382089 | 2376 | if (svcpt->scp_service) |
d7e09d03 PT |
2377 | ptlrpc_svcpt_stop_threads(svcpt); |
2378 | } | |
d7e09d03 | 2379 | } |
d7e09d03 PT |
2380 | |
/*
 * Start the initial complement of threads (srv_nthrs_cpt_init per CPT)
 * for service \a svc.  -EMFILE from ptlrpc_start_thread() means the
 * partition is already at its limit and is not an error; any other
 * failure stops all threads of the service and is returned.
 *
 * \retval 0 on success, negative errno on failure
 */
int ptlrpc_start_threads(struct ptlrpc_service *svc)
{
	int rc = 0;
	int i;
	int j;

	/* We require 2 threads min, see note in ptlrpc_server_handle_request */
	LASSERT(svc->srv_nthrs_cpt_init >= PTLRPC_NTHRS_INIT);

	for (i = 0; i < svc->srv_ncpts; i++) {
		for (j = 0; j < svc->srv_nthrs_cpt_init; j++) {
			rc = ptlrpc_start_thread(svc->srv_parts[i], 1);
			if (rc == 0)
				continue;

			if (rc != -EMFILE)
				goto failed;
			/* We have enough threads, don't start more. b=15759 */
			break;
		}
	}

	return 0;
failed:
	CERROR("cannot start %s thread #%d_%d: rc %d\n",
	       svc->srv_thread_name, i, j, rc);
	ptlrpc_stop_all_threads(svc);
	return rc;
}
EXPORT_SYMBOL(ptlrpc_start_threads);
2411 | ||
/*
 * Start one more service thread on partition \a svcpt.
 *
 * Thread creation is serialized (only one thread may be in SVC_STARTING
 * at a time) so that t_id values stay unique and contiguous.  When
 * \a wait is non-zero, a creation race is retried and the call blocks
 * until the new thread is running (or has stopped); with \a wait == 0
 * a race returns -EAGAIN immediately.
 *
 * \retval 0        thread started (or is starting, when !wait)
 * \retval -ESRCH   the service is stopping
 * \retval -EMFILE  the partition is already at its thread limit
 * \retval -EAGAIN  lost a creation race (only when !wait)
 * \retval other    negative errno from allocation or kthread_run()
 */
int ptlrpc_start_thread(struct ptlrpc_service_part *svcpt, int wait)
{
	struct l_wait_info lwi = { 0 };
	struct ptlrpc_thread *thread;
	struct ptlrpc_service *svc;
	struct task_struct *task;
	int rc;

	svc = svcpt->scp_service;

	CDEBUG(D_RPCTRACE, "%s[%d] started %d min %d max %d\n",
	       svc->srv_name, svcpt->scp_cpt, svcpt->scp_nthrs_running,
	       svc->srv_nthrs_cpt_init, svc->srv_nthrs_cpt_limit);

again:
	if (unlikely(svc->srv_is_stopping))
		return -ESRCH;

	if (!ptlrpc_threads_increasable(svcpt) ||
	    (OBD_FAIL_CHECK(OBD_FAIL_TGT_TOOMANY_THREADS) &&
	     svcpt->scp_nthrs_running == svc->srv_nthrs_cpt_init - 1))
		return -EMFILE;

	/* allocate on a node local to the partition's CPT */
	thread = kzalloc_node(sizeof(*thread), GFP_NOFS,
			      cfs_cpt_spread_node(svc->srv_cptable,
						  svcpt->scp_cpt));
	if (!thread)
		return -ENOMEM;
	init_waitqueue_head(&thread->t_ctl_waitq);

	spin_lock(&svcpt->scp_lock);
	/* re-check under the lock: another starter may have won the race */
	if (!ptlrpc_threads_increasable(svcpt)) {
		spin_unlock(&svcpt->scp_lock);
		kfree(thread);
		return -EMFILE;
	}

	if (svcpt->scp_nthrs_starting != 0) {
		/* serialize starting because some modules (obdfilter)
		 * might require unique and contiguous t_id
		 */
		LASSERT(svcpt->scp_nthrs_starting == 1);
		spin_unlock(&svcpt->scp_lock);
		kfree(thread);
		if (wait) {
			CDEBUG(D_INFO, "Waiting for creating thread %s #%d\n",
			       svc->srv_thread_name, svcpt->scp_thr_nextid);
			schedule();
			goto again;
		}

		CDEBUG(D_INFO, "Creating thread %s #%d race, retry later\n",
		       svc->srv_thread_name, svcpt->scp_thr_nextid);
		return -EAGAIN;
	}

	svcpt->scp_nthrs_starting++;
	thread->t_id = svcpt->scp_thr_nextid++;
	thread_add_flags(thread, SVC_STARTING);
	thread->t_svcpt = svcpt;

	list_add(&thread->t_link, &svcpt->scp_threads);
	spin_unlock(&svcpt->scp_lock);

	if (svcpt->scp_cpt >= 0) {
		snprintf(thread->t_name, sizeof(thread->t_name), "%s%02d_%03d",
			 svc->srv_thread_name, svcpt->scp_cpt, thread->t_id);
	} else {
		snprintf(thread->t_name, sizeof(thread->t_name), "%s_%04d",
			 svc->srv_thread_name, thread->t_id);
	}

	CDEBUG(D_RPCTRACE, "starting thread '%s'\n", thread->t_name);
	task = kthread_run(ptlrpc_main, thread, "%s", thread->t_name);
	if (IS_ERR(task)) {
		rc = PTR_ERR(task);
		CERROR("cannot start thread '%s': rc = %d\n",
		       thread->t_name, rc);
		spin_lock(&svcpt->scp_lock);
		--svcpt->scp_nthrs_starting;
		if (thread_is_stopping(thread)) {
			/* this ptlrpc_thread is being handled
			 * by ptlrpc_svcpt_stop_threads now
			 */
			thread_add_flags(thread, SVC_STOPPED);
			wake_up(&thread->t_ctl_waitq);
			spin_unlock(&svcpt->scp_lock);
		} else {
			list_del(&thread->t_link);
			spin_unlock(&svcpt->scp_lock);
			kfree(thread);
		}
		return rc;
	}

	if (!wait)
		return 0;

	l_wait_event(thread->t_ctl_waitq,
		     thread_is_running(thread) || thread_is_stopped(thread),
		     &lwi);

	/* a stopped thread reports its exit code via t_id (see ptlrpc_main) */
	rc = thread_is_stopped(thread) ? thread->t_id : 0;
	return rc;
}
2517 | ||
2518 | int ptlrpc_hr_init(void) | |
2519 | { | |
d0bfef31 CH |
2520 | struct ptlrpc_hr_partition *hrp; |
2521 | struct ptlrpc_hr_thread *hrt; | |
2522 | int rc; | |
2523 | int i; | |
2524 | int j; | |
2525 | int weight; | |
d7e09d03 PT |
2526 | |
2527 | memset(&ptlrpc_hr, 0, sizeof(ptlrpc_hr)); | |
2528 | ptlrpc_hr.hr_cpt_table = cfs_cpt_table; | |
2529 | ||
2530 | ptlrpc_hr.hr_partitions = cfs_percpt_alloc(ptlrpc_hr.hr_cpt_table, | |
2531 | sizeof(*hrp)); | |
8b382089 | 2532 | if (!ptlrpc_hr.hr_partitions) |
0a3bdb00 | 2533 | return -ENOMEM; |
d7e09d03 PT |
2534 | |
2535 | init_waitqueue_head(&ptlrpc_hr.hr_waitq); | |
2536 | ||
06931e62 | 2537 | weight = cpumask_weight(topology_sibling_cpumask(0)); |
3867ea5a | 2538 | |
d7e09d03 PT |
2539 | cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { |
2540 | hrp->hrp_cpt = i; | |
2541 | ||
2542 | atomic_set(&hrp->hrp_nstarted, 0); | |
2543 | atomic_set(&hrp->hrp_nstopped, 0); | |
2544 | ||
2545 | hrp->hrp_nthrs = cfs_cpt_weight(ptlrpc_hr.hr_cpt_table, i); | |
3867ea5a | 2546 | hrp->hrp_nthrs /= weight; |
d7e09d03 PT |
2547 | |
2548 | LASSERT(hrp->hrp_nthrs > 0); | |
bae97e81 JL |
2549 | hrp->hrp_thrs = |
2550 | kzalloc_node(hrp->hrp_nthrs * sizeof(*hrt), GFP_NOFS, | |
24c198e9 OD |
2551 | cfs_cpt_spread_node(ptlrpc_hr.hr_cpt_table, |
2552 | i)); | |
8b382089 | 2553 | if (!hrp->hrp_thrs) { |
a9b3e8f3 JL |
2554 | rc = -ENOMEM; |
2555 | goto out; | |
2556 | } | |
d7e09d03 PT |
2557 | |
2558 | for (j = 0; j < hrp->hrp_nthrs; j++) { | |
2559 | hrt = &hrp->hrp_thrs[j]; | |
2560 | ||
2561 | hrt->hrt_id = j; | |
2562 | hrt->hrt_partition = hrp; | |
2563 | init_waitqueue_head(&hrt->hrt_waitq); | |
2564 | spin_lock_init(&hrt->hrt_lock); | |
2565 | INIT_LIST_HEAD(&hrt->hrt_queue); | |
2566 | } | |
2567 | } | |
2568 | ||
2569 | rc = ptlrpc_start_hr_threads(); | |
2570 | out: | |
2571 | if (rc != 0) | |
2572 | ptlrpc_hr_fini(); | |
0a3bdb00 | 2573 | return rc; |
d7e09d03 PT |
2574 | } |
2575 | ||
2576 | void ptlrpc_hr_fini(void) | |
2577 | { | |
d0bfef31 CH |
2578 | struct ptlrpc_hr_partition *hrp; |
2579 | int i; | |
d7e09d03 | 2580 | |
8b382089 | 2581 | if (!ptlrpc_hr.hr_partitions) |
d7e09d03 PT |
2582 | return; |
2583 | ||
2584 | ptlrpc_stop_hr_threads(); | |
2585 | ||
2586 | cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { | |
a5cb8880 | 2587 | kfree(hrp->hrp_thrs); |
d7e09d03 PT |
2588 | } |
2589 | ||
2590 | cfs_percpt_free(ptlrpc_hr.hr_partitions); | |
2591 | ptlrpc_hr.hr_partitions = NULL; | |
2592 | } | |
2593 | ||
d7e09d03 PT |
2594 | /** |
2595 | * Wait until all already scheduled replies are processed. | |
2596 | */ | |
2597 | static void ptlrpc_wait_replies(struct ptlrpc_service_part *svcpt) | |
2598 | { | |
2599 | while (1) { | |
2600 | int rc; | |
2601 | struct l_wait_info lwi = LWI_TIMEOUT(cfs_time_seconds(10), | |
2602 | NULL, NULL); | |
2603 | ||
2604 | rc = l_wait_event(svcpt->scp_waitq, | |
24c198e9 OD |
2605 | atomic_read(&svcpt->scp_nreps_difficult) == 0, |
2606 | &lwi); | |
d7e09d03 PT |
2607 | if (rc == 0) |
2608 | break; | |
2609 | CWARN("Unexpectedly long timeout %s %p\n", | |
2610 | svcpt->scp_service->srv_name, svcpt->scp_service); | |
2611 | } | |
2612 | } | |
2613 | ||
2614 | static void | |
2615 | ptlrpc_service_del_atimer(struct ptlrpc_service *svc) | |
2616 | { | |
d0bfef31 CH |
2617 | struct ptlrpc_service_part *svcpt; |
2618 | int i; | |
d7e09d03 PT |
2619 | |
2620 | /* early disarm AT timer... */ | |
2621 | ptlrpc_service_for_each_part(svcpt, i, svc) { | |
8b382089 | 2622 | if (svcpt->scp_service) |
922da0c5 | 2623 | del_timer(&svcpt->scp_at_timer); |
d7e09d03 PT |
2624 | } |
2625 | } | |
2626 | ||
/*
 * Detach every request buffer descriptor (rqbd) of \a svc from the
 * network: stop history retention and lazy portal processing, unlink
 * the memory descriptor of each posted buffer, then wait for LNet to
 * deliver the final event on all of them.
 */
static void
ptlrpc_service_unlink_rqbd(struct ptlrpc_service *svc)
{
	struct ptlrpc_service_part *svcpt;
	struct ptlrpc_request_buffer_desc *rqbd;
	struct l_wait_info lwi;
	int rc;
	int i;

	/* All history will be culled when the next request buffer is
	 * freed in ptlrpc_service_purge_all()
	 */
	svc->srv_hist_nrqbds_cpt_max = 0;

	rc = LNetClearLazyPortal(svc->srv_req_portal);
	LASSERT(rc == 0);

	ptlrpc_service_for_each_part(svcpt, i, svc) {
		/* partitions are set up in order; the first one without a
		 * back-pointer to the service ends the walk
		 */
		if (!svcpt->scp_service)
			break;

		/* Unlink all the request buffers. This forces a 'final'
		 * event with its 'unlink' flag set for each posted rqbd
		 */
		list_for_each_entry(rqbd, &svcpt->scp_rqbd_posted,
				    rqbd_list) {
			/* -ENOENT means the MD completed on its own already */
			rc = LNetMDUnlink(rqbd->rqbd_md_h);
			LASSERT(rc == 0 || rc == -ENOENT);
		}
	}

	ptlrpc_service_for_each_part(svcpt, i, svc) {
		if (!svcpt->scp_service)
			break;

		/* Wait for the network to release any buffers
		 * it's currently filling
		 */
		spin_lock(&svcpt->scp_lock);
		while (svcpt->scp_nrqbds_posted != 0) {
			/* drop the lock while sleeping; the event handler
			 * needs it to retire buffers and wake us
			 */
			spin_unlock(&svcpt->scp_lock);
			/* Network access will complete in finite time but
			 * the HUGE timeout lets us CWARN for visibility
			 * of sluggish LNDs
			 */
			lwi = LWI_TIMEOUT_INTERVAL(
					cfs_time_seconds(LONG_UNLINK),
					cfs_time_seconds(1), NULL, NULL);
			rc = l_wait_event(svcpt->scp_waitq,
					  svcpt->scp_nrqbds_posted == 0, &lwi);
			if (rc == -ETIMEDOUT) {
				CWARN("Service %s waiting for request buffers\n",
				      svcpt->scp_service->srv_name);
			}
			/* retake the lock before re-testing the counter */
			spin_lock(&svcpt->scp_lock);
		}
		spin_unlock(&svcpt->scp_lock);
	}
}
2686 | ||
/*
 * Drain and free every request and reply still held by \a svc.  Runs
 * during shutdown after the rqbds were unlinked from the network and
 * the service threads have stopped, so no new work can arrive while
 * the queues are emptied.
 */
static void
ptlrpc_service_purge_all(struct ptlrpc_service *svc)
{
	struct ptlrpc_service_part *svcpt;
	struct ptlrpc_request_buffer_desc *rqbd;
	struct ptlrpc_request *req;
	struct ptlrpc_reply_state *rs;
	int i;

	ptlrpc_service_for_each_part(svcpt, i, svc) {
		if (!svcpt->scp_service)
			break;

		/* hand every active "difficult" reply back to the reply
		 * handler threads so they can complete and release it
		 */
		spin_lock(&svcpt->scp_rep_lock);
		while (!list_empty(&svcpt->scp_rep_active)) {
			rs = list_entry(svcpt->scp_rep_active.next,
					struct ptlrpc_reply_state, rs_list);
			spin_lock(&rs->rs_lock);
			ptlrpc_schedule_difficult_reply(rs);
			spin_unlock(&rs->rs_lock);
		}
		spin_unlock(&svcpt->scp_rep_lock);

		/* purge the request queue. NB No new replies (rqbds
		 * all unlinked) and no service threads, so I'm the only
		 * thread noodling the request queue now
		 */
		while (!list_empty(&svcpt->scp_req_incoming)) {
			req = list_entry(svcpt->scp_req_incoming.next,
					 struct ptlrpc_request, rq_list);

			list_del(&req->rq_list);
			svcpt->scp_nreqs_incoming--;
			ptlrpc_server_finish_request(svcpt, req);
		}

		/* drain requests that had already been accepted for
		 * service but never ran
		 */
		while (ptlrpc_server_request_pending(svcpt, true)) {
			req = ptlrpc_server_request_get(svcpt, true);
			ptlrpc_server_finish_active_request(svcpt, req);
		}

		LASSERT(list_empty(&svcpt->scp_rqbd_posted));
		LASSERT(svcpt->scp_nreqs_incoming == 0);
		LASSERT(svcpt->scp_nreqs_active == 0);
		/* history should have been culled by
		 * ptlrpc_server_finish_request
		 */
		LASSERT(svcpt->scp_hist_nrqbds == 0);

		/* Now free all the request buffers since nothing
		 * references them any more...
		 */

		while (!list_empty(&svcpt->scp_rqbd_idle)) {
			rqbd = list_entry(svcpt->scp_rqbd_idle.next,
					  struct ptlrpc_request_buffer_desc,
					  rqbd_list);
			ptlrpc_free_rqbd(rqbd);
		}
		/* let the scheduled difficult replies finish before the
		 * idle reply states are released below
		 */
		ptlrpc_wait_replies(svcpt);

		while (!list_empty(&svcpt->scp_rep_idle)) {
			rs = list_entry(svcpt->scp_rep_idle.next,
					struct ptlrpc_reply_state,
					rs_list);
			list_del(&rs->rs_list);
			kvfree(rs);
		}
	}
}
2757 | ||
2758 | static void | |
2759 | ptlrpc_service_free(struct ptlrpc_service *svc) | |
2760 | { | |
d0bfef31 CH |
2761 | struct ptlrpc_service_part *svcpt; |
2762 | struct ptlrpc_at_array *array; | |
2763 | int i; | |
d7e09d03 PT |
2764 | |
2765 | ptlrpc_service_for_each_part(svcpt, i, svc) { | |
8b382089 | 2766 | if (!svcpt->scp_service) |
d7e09d03 PT |
2767 | break; |
2768 | ||
2769 | /* In case somebody rearmed this in the meantime */ | |
922da0c5 | 2770 | del_timer(&svcpt->scp_at_timer); |
d7e09d03 PT |
2771 | array = &svcpt->scp_at_array; |
2772 | ||
207e99c2 JL |
2773 | kfree(array->paa_reqs_array); |
2774 | array->paa_reqs_array = NULL; | |
2775 | kfree(array->paa_reqs_count); | |
2776 | array->paa_reqs_count = NULL; | |
d7e09d03 PT |
2777 | } |
2778 | ||
2779 | ptlrpc_service_for_each_part(svcpt, i, svc) | |
9ae10597 | 2780 | kfree(svcpt); |
d7e09d03 | 2781 | |
8b382089 | 2782 | if (svc->srv_cpts) |
d7e09d03 PT |
2783 | cfs_expr_list_values_free(svc->srv_cpts, svc->srv_ncpts); |
2784 | ||
9ae10597 | 2785 | kfree(svc); |
d7e09d03 PT |
2786 | } |
2787 | ||
/**
 * Stop and destroy \a service.  Tears it down in dependency order:
 * unlink from the global service list, disarm AT timers, stop worker
 * threads, unlink network buffers, purge queued requests and replies,
 * clean up NRS policy state, remove the procfs/sysfs entries, then
 * free all memory.  Always returns 0.
 */
int ptlrpc_unregister_service(struct ptlrpc_service *service)
{
	CDEBUG(D_NET, "%s: tearing down\n", service->srv_name);

	service->srv_is_stopping = 1;

	/* remove from the global list first so new lookups miss it */
	mutex_lock(&ptlrpc_all_services_mutex);
	list_del_init(&service->srv_list);
	mutex_unlock(&ptlrpc_all_services_mutex);

	ptlrpc_service_del_atimer(service);
	ptlrpc_stop_all_threads(service);

	ptlrpc_service_unlink_rqbd(service);
	ptlrpc_service_purge_all(service);
	ptlrpc_service_nrs_cleanup(service);

	ptlrpc_lprocfs_unregister_service(service);
	ptlrpc_sysfs_unregister_service(service);

	ptlrpc_service_free(service);

	return 0;
}
EXPORT_SYMBOL(ptlrpc_unregister_service);