staging: lustre: balance braces properly in LNet layer
[deliverable/linux.git] / drivers / staging / lustre / lnet / lnet / router.c
CommitLineData
d7e09d03
PT
1/*
2 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
3 *
1dc563a6 4 * Copyright (c) 2011, 2015, Intel Corporation.
d7e09d03
PT
5 *
6 * This file is part of Portals
7 * http://sourceforge.net/projects/sandiaportals/
8 *
9 * Portals is free software; you can redistribute it and/or
10 * modify it under the terms of version 2 of the GNU General Public
11 * License as published by the Free Software Foundation.
12 *
13 * Portals is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with Portals; if not, write to the Free Software
20 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 *
22 */
23
24#define DEBUG_SUBSYSTEM S_LNET
9fdaf8c0 25#include "../../include/linux/lnet/lib-lnet.h"
d7e09d03 26
d7e09d03
PT
27#define LNET_NRB_TINY_MIN 512 /* min value for each CPT */
28#define LNET_NRB_TINY (LNET_NRB_TINY_MIN * 4)
29#define LNET_NRB_SMALL_MIN 4096 /* min value for each CPT */
30#define LNET_NRB_SMALL (LNET_NRB_SMALL_MIN * 4)
31#define LNET_NRB_LARGE_MIN 256 /* min value for each CPT */
32#define LNET_NRB_LARGE (LNET_NRB_LARGE_MIN * 4)
33
34static char *forwarding = "";
8cc7b4b9
PT
35module_param(forwarding, charp, 0444);
36MODULE_PARM_DESC(forwarding, "Explicitly enable/disable forwarding between networks");
d7e09d03
PT
37
38static int tiny_router_buffers;
8cc7b4b9
PT
39module_param(tiny_router_buffers, int, 0444);
40MODULE_PARM_DESC(tiny_router_buffers, "# of 0 payload messages to buffer in the router");
d7e09d03 41static int small_router_buffers;
8cc7b4b9
PT
42module_param(small_router_buffers, int, 0444);
43MODULE_PARM_DESC(small_router_buffers, "# of small (1 page) messages to buffer in the router");
d7e09d03 44static int large_router_buffers;
8cc7b4b9
PT
45module_param(large_router_buffers, int, 0444);
46MODULE_PARM_DESC(large_router_buffers, "# of large messages to buffer in the router");
f96d1d7e 47static int peer_buffer_credits;
8cc7b4b9
PT
48module_param(peer_buffer_credits, int, 0444);
49MODULE_PARM_DESC(peer_buffer_credits, "# router buffer credits per peer");
d7e09d03
PT
50
51static int auto_down = 1;
8cc7b4b9
PT
52module_param(auto_down, int, 0444);
53MODULE_PARM_DESC(auto_down, "Automatically mark peers down on comms error");
d7e09d03
PT
54
55int
56lnet_peer_buffer_credits(lnet_ni_t *ni)
57{
58 /* NI option overrides LNet default */
59 if (ni->ni_peerrtrcredits > 0)
60 return ni->ni_peerrtrcredits;
61 if (peer_buffer_credits > 0)
62 return peer_buffer_credits;
63
4420cfd3
JS
64 /*
65 * As an approximation, allow this peer the same number of router
66 * buffers as it is allowed outstanding sends
67 */
d7e09d03
PT
68 return ni->ni_peertxcredits;
69}
70
71/* forward ref's */
72static int lnet_router_checker(void *);
d7e09d03 73
f96d1d7e 74static int check_routers_before_use;
8cc7b4b9
PT
75module_param(check_routers_before_use, int, 0444);
76MODULE_PARM_DESC(check_routers_before_use, "Assume routers are down and ping them before use");
d7e09d03 77
af3fa7c7 78int avoid_asym_router_failure = 1;
8cc7b4b9
PT
79module_param(avoid_asym_router_failure, int, 0644);
80MODULE_PARM_DESC(avoid_asym_router_failure, "Avoid asymmetrical router failures (0 to disable)");
d7e09d03
PT
81
82static int dead_router_check_interval = 60;
8cc7b4b9
PT
83module_param(dead_router_check_interval, int, 0644);
84MODULE_PARM_DESC(dead_router_check_interval, "Seconds between dead router health checks (<= 0 to disable)");
d7e09d03
PT
85
86static int live_router_check_interval = 60;
8cc7b4b9
PT
87module_param(live_router_check_interval, int, 0644);
88MODULE_PARM_DESC(live_router_check_interval, "Seconds between live router health checks (<= 0 to disable)");
d7e09d03
PT
89
90static int router_ping_timeout = 50;
8cc7b4b9
PT
91module_param(router_ping_timeout, int, 0644);
92MODULE_PARM_DESC(router_ping_timeout, "Seconds to wait for the reply to a router health query");
d7e09d03
PT
93
94int
95lnet_peers_start_down(void)
96{
97 return check_routers_before_use;
98}
99
100void
e6157b1b
RG
101lnet_notify_locked(lnet_peer_t *lp, int notifylnd, int alive,
102 unsigned long when)
d7e09d03 103{
699503bc 104 if (time_before(when, lp->lp_timestamp)) { /* out of date information */
d7e09d03
PT
105 CDEBUG(D_NET, "Out of date\n");
106 return;
107 }
108
109 lp->lp_timestamp = when; /* update timestamp */
110 lp->lp_ping_deadline = 0; /* disable ping timeout */
111
112 if (lp->lp_alive_count != 0 && /* got old news */
113 (!lp->lp_alive) == (!alive)) { /* new date for old news */
114 CDEBUG(D_NET, "Old news\n");
115 return;
116 }
117
118 /* Flag that notification is outstanding */
119
120 lp->lp_alive_count++;
121 lp->lp_alive = !(!alive); /* 1 bit! */
122 lp->lp_notify = 1;
123 lp->lp_notifylnd |= notifylnd;
124 if (lp->lp_alive)
125 lp->lp_ping_feats = LNET_PING_FEAT_INVAL; /* reset */
126
127 CDEBUG(D_NET, "set %s %d\n", libcfs_nid2str(lp->lp_nid), alive);
128}
129
24008203 130static void
d7e09d03
PT
131lnet_ni_notify_locked(lnet_ni_t *ni, lnet_peer_t *lp)
132{
7e7ab095
MS
133 int alive;
134 int notifylnd;
d7e09d03 135
4420cfd3
JS
136 /*
137 * Notify only in 1 thread at any time to ensure ordered notification.
d7e09d03 138 * NB individual events can be missed; the only guarantee is that you
4420cfd3
JS
139 * always get the most recent news
140 */
d36175e9 141 if (lp->lp_notifying || ni == NULL)
d7e09d03
PT
142 return;
143
144 lp->lp_notifying = 1;
145
146 while (lp->lp_notify) {
7e7ab095 147 alive = lp->lp_alive;
d7e09d03
PT
148 notifylnd = lp->lp_notifylnd;
149
150 lp->lp_notifylnd = 0;
151 lp->lp_notify = 0;
152
153 if (notifylnd && ni->ni_lnd->lnd_notify != NULL) {
154 lnet_net_unlock(lp->lp_cpt);
155
4420cfd3
JS
156 /*
157 * A new notification could happen now; I'll handle it
158 * when control returns to me
159 */
0eee6778 160 ni->ni_lnd->lnd_notify(ni, lp->lp_nid, alive);
d7e09d03
PT
161
162 lnet_net_lock(lp->lp_cpt);
163 }
164 }
165
166 lp->lp_notifying = 0;
167}
168
d7e09d03
PT
169static void
170lnet_rtr_addref_locked(lnet_peer_t *lp)
171{
172 LASSERT(lp->lp_refcount > 0);
173 LASSERT(lp->lp_rtr_refcount >= 0);
174
175 /* lnet_net_lock must be exclusively locked */
176 lp->lp_rtr_refcount++;
177 if (lp->lp_rtr_refcount == 1) {
178 struct list_head *pos;
179
180 /* a simple insertion sort */
181 list_for_each_prev(pos, &the_lnet.ln_routers) {
182 lnet_peer_t *rtr = list_entry(pos, lnet_peer_t,
c314c319 183 lp_rtr_list);
d7e09d03
PT
184
185 if (rtr->lp_nid < lp->lp_nid)
186 break;
187 }
188
189 list_add(&lp->lp_rtr_list, pos);
190 /* addref for the_lnet.ln_routers */
191 lnet_peer_addref_locked(lp);
192 the_lnet.ln_routers_version++;
193 }
194}
195
196static void
197lnet_rtr_decref_locked(lnet_peer_t *lp)
198{
199 LASSERT(lp->lp_refcount > 0);
200 LASSERT(lp->lp_rtr_refcount > 0);
201
202 /* lnet_net_lock must be exclusively locked */
203 lp->lp_rtr_refcount--;
204 if (lp->lp_rtr_refcount == 0) {
205 LASSERT(list_empty(&lp->lp_routes));
206
207 if (lp->lp_rcd != NULL) {
208 list_add(&lp->lp_rcd->rcd_list,
c314c319 209 &the_lnet.ln_rcd_deathrow);
d7e09d03
PT
210 lp->lp_rcd = NULL;
211 }
212
213 list_del(&lp->lp_rtr_list);
214 /* decref for the_lnet.ln_routers */
215 lnet_peer_decref_locked(lp);
216 the_lnet.ln_routers_version++;
217 }
218}
219
220lnet_remotenet_t *
939af333 221lnet_find_net_locked(__u32 net)
d7e09d03 222{
7e7ab095
MS
223 lnet_remotenet_t *rnet;
224 struct list_head *tmp;
225 struct list_head *rn_list;
d7e09d03
PT
226
227 LASSERT(!the_lnet.ln_shutdown);
228
229 rn_list = lnet_net2rnethash(net);
230 list_for_each(tmp, rn_list) {
231 rnet = list_entry(tmp, lnet_remotenet_t, lrn_list);
232
233 if (rnet->lrn_net == net)
234 return rnet;
235 }
236 return NULL;
237}
238
239static void lnet_shuffle_seed(void)
240{
f96d1d7e 241 static int seeded;
80feb1ef 242 __u32 lnd_type, seed[2];
1f4fc343 243 struct timespec64 ts;
d7e09d03
PT
244 lnet_ni_t *ni;
245 struct list_head *tmp;
246
247 if (seeded)
248 return;
249
250 cfs_get_random_bytes(seed, sizeof(seed));
251
4420cfd3
JS
252 /*
253 * Nodes with small feet have little entropy
254 * the NID for this node gives the most entropy in the low bits
255 */
d7e09d03
PT
256 list_for_each(tmp, &the_lnet.ln_nis) {
257 ni = list_entry(tmp, lnet_ni_t, ni_list);
258 lnd_type = LNET_NETTYP(LNET_NIDNET(ni->ni_nid));
259
260 if (lnd_type != LOLND)
261 seed[0] ^= (LNET_NIDADDR(ni->ni_nid) | lnd_type);
262 }
263
1f4fc343
AB
264 ktime_get_ts64(&ts);
265 cfs_srand(ts.tv_sec ^ seed[0], ts.tv_nsec ^ seed[1]);
d7e09d03 266 seeded = 1;
d7e09d03
PT
267}
268
269/* NB expects LNET_LOCK held */
24008203 270static void
939af333 271lnet_add_route_to_rnet(lnet_remotenet_t *rnet, lnet_route_t *route)
d7e09d03 272{
7e7ab095
MS
273 unsigned int len = 0;
274 unsigned int offset = 0;
275 struct list_head *e;
d7e09d03
PT
276
277 lnet_shuffle_seed();
278
939af333 279 list_for_each(e, &rnet->lrn_routes) {
d7e09d03
PT
280 len++;
281 }
282
283 /* len+1 positions to add a new entry, also prevents division by 0 */
284 offset = cfs_rand() % (len + 1);
939af333 285 list_for_each(e, &rnet->lrn_routes) {
d7e09d03
PT
286 if (offset == 0)
287 break;
288 offset--;
289 }
290 list_add(&route->lr_list, e);
291 list_add(&route->lr_gwlist, &route->lr_gateway->lp_routes);
292
293 the_lnet.ln_remote_nets_version++;
294 lnet_rtr_addref_locked(route->lr_gateway);
295}
296
297int
e75fb87f
DO
298lnet_add_route(__u32 net, unsigned int hops, lnet_nid_t gateway,
299 unsigned int priority)
d7e09d03 300{
7e7ab095
MS
301 struct list_head *e;
302 lnet_remotenet_t *rnet;
303 lnet_remotenet_t *rnet2;
304 lnet_route_t *route;
305 lnet_ni_t *ni;
306 int add_route;
307 int rc;
d7e09d03 308
e75fb87f
DO
309 CDEBUG(D_NET, "Add route: net %s hops %u priority %u gw %s\n",
310 libcfs_net2str(net), hops, priority, libcfs_nid2str(gateway));
d7e09d03
PT
311
312 if (gateway == LNET_NID_ANY ||
313 LNET_NETTYP(LNET_NIDNET(gateway)) == LOLND ||
314 net == LNET_NIDNET(LNET_NID_ANY) ||
315 LNET_NETTYP(net) == LOLND ||
316 LNET_NIDNET(gateway) == net ||
317 hops < 1 || hops > 255)
fbe7c6c7 318 return -EINVAL;
d7e09d03
PT
319
320 if (lnet_islocalnet(net)) /* it's a local network */
321 return 0; /* ignore the route entry */
322
323 /* Assume net, route, all new */
324 LIBCFS_ALLOC(route, sizeof(*route));
325 LIBCFS_ALLOC(rnet, sizeof(*rnet));
326 if (route == NULL || rnet == NULL) {
327 CERROR("Out of memory creating route %s %d %s\n",
328 libcfs_net2str(net), hops, libcfs_nid2str(gateway));
329 if (route != NULL)
330 LIBCFS_FREE(route, sizeof(*route));
331 if (rnet != NULL)
332 LIBCFS_FREE(rnet, sizeof(*rnet));
333 return -ENOMEM;
334 }
335
336 INIT_LIST_HEAD(&rnet->lrn_routes);
337 rnet->lrn_net = net;
338 route->lr_hops = hops;
339 route->lr_net = net;
e75fb87f 340 route->lr_priority = priority;
d7e09d03
PT
341
342 lnet_net_lock(LNET_LOCK_EX);
343
344 rc = lnet_nid2peer_locked(&route->lr_gateway, gateway, LNET_LOCK_EX);
345 if (rc != 0) {
346 lnet_net_unlock(LNET_LOCK_EX);
347
348 LIBCFS_FREE(route, sizeof(*route));
349 LIBCFS_FREE(rnet, sizeof(*rnet));
350
ec523735 351 if (rc == -EHOSTUNREACH) /* gateway is not on a local net */
d7e09d03 352 return 0; /* ignore the route entry */
ec523735
RG
353 CERROR("Error %d creating route %s %d %s\n", rc,
354 libcfs_net2str(net), hops,
355 libcfs_nid2str(gateway));
d7e09d03
PT
356 return rc;
357 }
358
939af333 359 LASSERT(!the_lnet.ln_shutdown);
d7e09d03
PT
360
361 rnet2 = lnet_find_net_locked(net);
362 if (rnet2 == NULL) {
363 /* new network */
364 list_add_tail(&rnet->lrn_list, lnet_net2rnethash(net));
365 rnet2 = rnet;
366 }
367
368 /* Search for a duplicate route (it's a NOOP if it is) */
369 add_route = 1;
939af333 370 list_for_each(e, &rnet2->lrn_routes) {
d7e09d03
PT
371 lnet_route_t *route2 = list_entry(e, lnet_route_t, lr_list);
372
373 if (route2->lr_gateway == route->lr_gateway) {
374 add_route = 0;
375 break;
376 }
377
378 /* our lookups must be true */
939af333 379 LASSERT(route2->lr_gateway->lp_nid != gateway);
d7e09d03
PT
380 }
381
382 if (add_route) {
383 lnet_peer_addref_locked(route->lr_gateway); /* +1 for notify */
384 lnet_add_route_to_rnet(rnet2, route);
385
386 ni = route->lr_gateway->lp_ni;
387 lnet_net_unlock(LNET_LOCK_EX);
388
389 /* XXX Assume alive */
390 if (ni->ni_lnd->lnd_notify != NULL)
0eee6778 391 ni->ni_lnd->lnd_notify(ni, gateway, 1);
d7e09d03
PT
392
393 lnet_net_lock(LNET_LOCK_EX);
394 }
395
396 /* -1 for notify or !add_route */
397 lnet_peer_decref_locked(route->lr_gateway);
398 lnet_net_unlock(LNET_LOCK_EX);
399
400 if (!add_route)
401 LIBCFS_FREE(route, sizeof(*route));
402
403 if (rnet != rnet2)
404 LIBCFS_FREE(rnet, sizeof(*rnet));
405
406 return 0;
407}
408
409int
410lnet_check_routes(void)
411{
7e7ab095
MS
412 lnet_remotenet_t *rnet;
413 lnet_route_t *route;
414 lnet_route_t *route2;
415 struct list_head *e1;
416 struct list_head *e2;
417 int cpt;
418 struct list_head *rn_list;
419 int i;
d7e09d03
PT
420
421 cpt = lnet_net_lock_current();
422
423 for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) {
424 rn_list = &the_lnet.ln_remote_nets_hash[i];
425 list_for_each(e1, rn_list) {
426 rnet = list_entry(e1, lnet_remotenet_t, lrn_list);
427
428 route2 = NULL;
429 list_for_each(e2, &rnet->lrn_routes) {
7e7ab095
MS
430 lnet_nid_t nid1;
431 lnet_nid_t nid2;
432 int net;
d7e09d03 433
c314c319 434 route = list_entry(e2, lnet_route_t, lr_list);
d7e09d03
PT
435
436 if (route2 == NULL) {
437 route2 = route;
438 continue;
439 }
440
441 if (route->lr_gateway->lp_ni ==
442 route2->lr_gateway->lp_ni)
443 continue;
444
445 nid1 = route->lr_gateway->lp_nid;
446 nid2 = route2->lr_gateway->lp_nid;
447 net = rnet->lrn_net;
448
449 lnet_net_unlock(cpt);
450
2d00bd17 451 CERROR("Routes to %s via %s and %s not supported\n",
d7e09d03
PT
452 libcfs_net2str(net),
453 libcfs_nid2str(nid1),
454 libcfs_nid2str(nid2));
455 return -EINVAL;
456 }
457 }
458 }
459
460 lnet_net_unlock(cpt);
461 return 0;
462}
463
464int
465lnet_del_route(__u32 net, lnet_nid_t gw_nid)
466{
7e7ab095
MS
467 struct lnet_peer *gateway;
468 lnet_remotenet_t *rnet;
469 lnet_route_t *route;
470 struct list_head *e1;
471 struct list_head *e2;
472 int rc = -ENOENT;
473 struct list_head *rn_list;
474 int idx = 0;
d7e09d03
PT
475
476 CDEBUG(D_NET, "Del route: net %s : gw %s\n",
477 libcfs_net2str(net), libcfs_nid2str(gw_nid));
478
4420cfd3
JS
479 /*
480 * NB Caller may specify either all routes via the given gateway
481 * or a specific route entry actual NIDs)
482 */
d7e09d03
PT
483 lnet_net_lock(LNET_LOCK_EX);
484 if (net == LNET_NIDNET(LNET_NID_ANY))
485 rn_list = &the_lnet.ln_remote_nets_hash[0];
486 else
487 rn_list = lnet_net2rnethash(net);
488
489 again:
490 list_for_each(e1, rn_list) {
491 rnet = list_entry(e1, lnet_remotenet_t, lrn_list);
492
493 if (!(net == LNET_NIDNET(LNET_NID_ANY) ||
c314c319 494 net == rnet->lrn_net))
d7e09d03
PT
495 continue;
496
497 list_for_each(e2, &rnet->lrn_routes) {
498 route = list_entry(e2, lnet_route_t, lr_list);
499
500 gateway = route->lr_gateway;
501 if (!(gw_nid == LNET_NID_ANY ||
502 gw_nid == gateway->lp_nid))
503 continue;
504
505 list_del(&route->lr_list);
506 list_del(&route->lr_gwlist);
507 the_lnet.ln_remote_nets_version++;
508
509 if (list_empty(&rnet->lrn_routes))
510 list_del(&rnet->lrn_list);
511 else
512 rnet = NULL;
513
514 lnet_rtr_decref_locked(gateway);
515 lnet_peer_decref_locked(gateway);
516
517 lnet_net_unlock(LNET_LOCK_EX);
518
519 LIBCFS_FREE(route, sizeof(*route));
520
521 if (rnet != NULL)
522 LIBCFS_FREE(rnet, sizeof(*rnet));
523
524 rc = 0;
525 lnet_net_lock(LNET_LOCK_EX);
526 goto again;
527 }
528 }
529
530 if (net == LNET_NIDNET(LNET_NID_ANY) &&
531 ++idx < LNET_REMOTE_NETS_HASH_SIZE) {
532 rn_list = &the_lnet.ln_remote_nets_hash[idx];
533 goto again;
534 }
535 lnet_net_unlock(LNET_LOCK_EX);
536
537 return rc;
538}
539
540void
939af333 541lnet_destroy_routes(void)
d7e09d03
PT
542{
543 lnet_del_route(LNET_NIDNET(LNET_NID_ANY), LNET_NID_ANY);
544}
545
546int
547lnet_get_route(int idx, __u32 *net, __u32 *hops,
e75fb87f 548 lnet_nid_t *gateway, __u32 *alive, __u32 *priority)
d7e09d03 549{
7e7ab095
MS
550 struct list_head *e1;
551 struct list_head *e2;
552 lnet_remotenet_t *rnet;
553 lnet_route_t *route;
554 int cpt;
555 int i;
556 struct list_head *rn_list;
d7e09d03
PT
557
558 cpt = lnet_net_lock_current();
559
560 for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) {
561 rn_list = &the_lnet.ln_remote_nets_hash[i];
562 list_for_each(e1, rn_list) {
563 rnet = list_entry(e1, lnet_remotenet_t, lrn_list);
564
565 list_for_each(e2, &rnet->lrn_routes) {
c314c319 566 route = list_entry(e2, lnet_route_t, lr_list);
d7e09d03
PT
567
568 if (idx-- == 0) {
7e7ab095
MS
569 *net = rnet->lrn_net;
570 *hops = route->lr_hops;
e75fb87f
DO
571 *priority = route->lr_priority;
572 *gateway = route->lr_gateway->lp_nid;
7e7ab095 573 *alive = route->lr_gateway->lp_alive;
d7e09d03
PT
574 lnet_net_unlock(cpt);
575 return 0;
576 }
577 }
578 }
579 }
580
581 lnet_net_unlock(cpt);
582 return -ENOENT;
583}
584
585void
586lnet_swap_pinginfo(lnet_ping_info_t *info)
587{
7e7ab095 588 int i;
d7e09d03
PT
589 lnet_ni_status_t *stat;
590
591 __swab32s(&info->pi_magic);
592 __swab32s(&info->pi_features);
593 __swab32s(&info->pi_pid);
594 __swab32s(&info->pi_nnis);
595 for (i = 0; i < info->pi_nnis && i < LNET_MAX_RTR_NIS; i++) {
596 stat = &info->pi_ni[i];
597 __swab64s(&stat->ns_nid);
598 __swab32s(&stat->ns_status);
599 }
d7e09d03
PT
600}
601
602/**
603 * parse router-checker pinginfo, record number of down NIs for remote
604 * networks on that router.
605 */
606static void
607lnet_parse_rc_info(lnet_rc_data_t *rcd)
608{
7e7ab095
MS
609 lnet_ping_info_t *info = rcd->rcd_pinginfo;
610 struct lnet_peer *gw = rcd->rcd_gateway;
611 lnet_route_t *rtr;
d7e09d03
PT
612
613 if (!gw->lp_alive)
614 return;
615
616 if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC))
617 lnet_swap_pinginfo(info);
618
619 /* NB always racing with network! */
620 if (info->pi_magic != LNET_PROTO_PING_MAGIC) {
621 CDEBUG(D_NET, "%s: Unexpected magic %08x\n",
622 libcfs_nid2str(gw->lp_nid), info->pi_magic);
623 gw->lp_ping_feats = LNET_PING_FEAT_INVAL;
624 return;
625 }
626
627 gw->lp_ping_feats = info->pi_features;
628 if ((gw->lp_ping_feats & LNET_PING_FEAT_MASK) == 0) {
629 CDEBUG(D_NET, "%s: Unexpected features 0x%x\n",
630 libcfs_nid2str(gw->lp_nid), gw->lp_ping_feats);
631 return; /* nothing I can understand */
632 }
633
634 if ((gw->lp_ping_feats & LNET_PING_FEAT_NI_STATUS) == 0)
635 return; /* can't carry NI status info */
636
637 list_for_each_entry(rtr, &gw->lp_routes, lr_gwlist) {
7e7ab095
MS
638 int ptl_status = LNET_NI_STATUS_INVALID;
639 int down = 0;
640 int up = 0;
641 int i;
d7e09d03
PT
642
643 for (i = 0; i < info->pi_nnis && i < LNET_MAX_RTR_NIS; i++) {
644 lnet_ni_status_t *stat = &info->pi_ni[i];
7e7ab095 645 lnet_nid_t nid = stat->ns_nid;
d7e09d03
PT
646
647 if (nid == LNET_NID_ANY) {
648 CDEBUG(D_NET, "%s: unexpected LNET_NID_ANY\n",
649 libcfs_nid2str(gw->lp_nid));
650 gw->lp_ping_feats = LNET_PING_FEAT_INVAL;
651 return;
652 }
653
654 if (LNET_NETTYP(LNET_NIDNET(nid)) == LOLND)
655 continue;
656
657 if (stat->ns_status == LNET_NI_STATUS_DOWN) {
658 if (LNET_NETTYP(LNET_NIDNET(nid)) != PTLLND)
659 down++;
660 else if (ptl_status != LNET_NI_STATUS_UP)
661 ptl_status = LNET_NI_STATUS_DOWN;
662 continue;
663 }
664
665 if (stat->ns_status == LNET_NI_STATUS_UP) {
666 if (LNET_NIDNET(nid) == rtr->lr_net) {
667 up = 1;
668 break;
669 }
4420cfd3
JS
670 /*
671 * ptl NIs are considered down only when
672 * they're all down
673 */
d7e09d03
PT
674 if (LNET_NETTYP(LNET_NIDNET(nid)) == PTLLND)
675 ptl_status = LNET_NI_STATUS_UP;
676 continue;
677 }
678
679 CDEBUG(D_NET, "%s: Unexpected status 0x%x\n",
680 libcfs_nid2str(gw->lp_nid), stat->ns_status);
681 gw->lp_ping_feats = LNET_PING_FEAT_INVAL;
682 return;
683 }
684
685 if (up) { /* ignore downed NIs if NI for dest network is up */
686 rtr->lr_downis = 0;
687 continue;
688 }
689 rtr->lr_downis = down + (ptl_status == LNET_NI_STATUS_DOWN);
690 }
691}
692
693static void
694lnet_router_checker_event(lnet_event_t *event)
695{
7e7ab095
MS
696 lnet_rc_data_t *rcd = event->md.user_ptr;
697 struct lnet_peer *lp;
d7e09d03
PT
698
699 LASSERT(rcd != NULL);
700
701 if (event->unlinked) {
702 LNetInvalidateHandle(&rcd->rcd_mdh);
703 return;
704 }
705
706 LASSERT(event->type == LNET_EVENT_SEND ||
707 event->type == LNET_EVENT_REPLY);
708
709 lp = rcd->rcd_gateway;
710 LASSERT(lp != NULL);
711
4420cfd3
JS
712 /*
713 * NB: it's called with holding lnet_res_lock, we have a few
714 * places need to hold both locks at the same time, please take
715 * care of lock ordering
716 */
d7e09d03
PT
717 lnet_net_lock(lp->lp_cpt);
718 if (!lnet_isrouter(lp) || lp->lp_rcd != rcd) {
719 /* ignore if no longer a router or rcd is replaced */
720 goto out;
721 }
722
723 if (event->type == LNET_EVENT_SEND) {
724 lp->lp_ping_notsent = 0;
725 if (event->status == 0)
726 goto out;
727 }
728
729 /* LNET_EVENT_REPLY */
4420cfd3
JS
730 /*
731 * A successful REPLY means the router is up. If _any_ comms
d7e09d03
PT
732 * to the router fail I assume it's down (this will happen if
733 * we ping alive routers to try to detect router death before
4420cfd3
JS
734 * apps get burned).
735 */
d7e09d03 736 lnet_notify_locked(lp, 1, (event->status == 0), cfs_time_current());
4420cfd3
JS
737
738 /*
739 * The router checker will wake up very shortly and do the
d7e09d03
PT
740 * actual notification.
741 * XXX If 'lp' stops being a router before then, it will still
4420cfd3
JS
742 * have the notification pending!!!
743 */
d7e09d03
PT
744 if (avoid_asym_router_failure && event->status == 0)
745 lnet_parse_rc_info(rcd);
746
747 out:
748 lnet_net_unlock(lp->lp_cpt);
749}
750
2595fa36 751static void
d7e09d03
PT
752lnet_wait_known_routerstate(void)
753{
7e7ab095
MS
754 lnet_peer_t *rtr;
755 struct list_head *entry;
756 int all_known;
d7e09d03 757
939af333 758 LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING);
d7e09d03
PT
759
760 for (;;) {
7e7ab095 761 int cpt = lnet_net_lock_current();
d7e09d03
PT
762
763 all_known = 1;
939af333 764 list_for_each(entry, &the_lnet.ln_routers) {
d7e09d03
PT
765 rtr = list_entry(entry, lnet_peer_t, lp_rtr_list);
766
767 if (rtr->lp_alive_count == 0) {
768 all_known = 0;
769 break;
770 }
771 }
772
773 lnet_net_unlock(cpt);
774
775 if (all_known)
776 return;
777
d3caf4d5
PT
778 set_current_state(TASK_UNINTERRUPTIBLE);
779 schedule_timeout(cfs_time_seconds(1));
d7e09d03
PT
780 }
781}
782
af3fa7c7
LZ
783void
784lnet_router_ni_update_locked(lnet_peer_t *gw, __u32 net)
785{
786 lnet_route_t *rte;
787
788 if ((gw->lp_ping_feats & LNET_PING_FEAT_NI_STATUS) != 0) {
789 list_for_each_entry(rte, &gw->lp_routes, lr_gwlist) {
790 if (rte->lr_net == net) {
791 rte->lr_downis = 0;
792 break;
793 }
794 }
795 }
796}
797
2595fa36 798static void
d7e09d03
PT
799lnet_update_ni_status_locked(void)
800{
7e7ab095 801 lnet_ni_t *ni;
ec0067d1 802 time64_t now;
7e7ab095 803 int timeout;
d7e09d03
PT
804
805 LASSERT(the_lnet.ln_routing);
806
807 timeout = router_ping_timeout +
0c575417 808 max(live_router_check_interval, dead_router_check_interval);
d7e09d03 809
ec0067d1 810 now = ktime_get_real_seconds();
d7e09d03
PT
811 list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) {
812 if (ni->ni_lnd->lnd_type == LOLND)
813 continue;
814
815 if (now < ni->ni_last_alive + timeout)
816 continue;
817
818 lnet_ni_lock(ni);
819 /* re-check with lock */
820 if (now < ni->ni_last_alive + timeout) {
821 lnet_ni_unlock(ni);
822 continue;
823 }
824
825 LASSERT(ni->ni_status != NULL);
826
827 if (ni->ni_status->ns_status != LNET_NI_STATUS_DOWN) {
828 CDEBUG(D_NET, "NI(%s:%d) status changed to down\n",
829 libcfs_nid2str(ni->ni_nid), timeout);
4420cfd3
JS
830 /*
831 * NB: so far, this is the only place to set
832 * NI status to "down"
833 */
d7e09d03
PT
834 ni->ni_status->ns_status = LNET_NI_STATUS_DOWN;
835 }
836 lnet_ni_unlock(ni);
837 }
838}
839
2595fa36 840static void
d7e09d03
PT
841lnet_destroy_rc_data(lnet_rc_data_t *rcd)
842{
843 LASSERT(list_empty(&rcd->rcd_list));
844 /* detached from network */
845 LASSERT(LNetHandleIsInvalid(rcd->rcd_mdh));
846
847 if (rcd->rcd_gateway != NULL) {
848 int cpt = rcd->rcd_gateway->lp_cpt;
849
850 lnet_net_lock(cpt);
851 lnet_peer_decref_locked(rcd->rcd_gateway);
852 lnet_net_unlock(cpt);
853 }
854
855 if (rcd->rcd_pinginfo != NULL)
856 LIBCFS_FREE(rcd->rcd_pinginfo, LNET_PINGINFO_SIZE);
857
858 LIBCFS_FREE(rcd, sizeof(*rcd));
859}
860
2595fa36 861static lnet_rc_data_t *
d7e09d03
PT
862lnet_create_rc_data_locked(lnet_peer_t *gateway)
863{
7e7ab095
MS
864 lnet_rc_data_t *rcd = NULL;
865 lnet_ping_info_t *pi;
866 int rc;
867 int i;
d7e09d03
PT
868
869 lnet_net_unlock(gateway->lp_cpt);
870
871 LIBCFS_ALLOC(rcd, sizeof(*rcd));
872 if (rcd == NULL)
873 goto out;
874
875 LNetInvalidateHandle(&rcd->rcd_mdh);
876 INIT_LIST_HEAD(&rcd->rcd_list);
877
878 LIBCFS_ALLOC(pi, LNET_PINGINFO_SIZE);
879 if (pi == NULL)
880 goto out;
881
d7e09d03
PT
882 for (i = 0; i < LNET_MAX_RTR_NIS; i++) {
883 pi->pi_ni[i].ns_nid = LNET_NID_ANY;
884 pi->pi_ni[i].ns_status = LNET_NI_STATUS_INVALID;
885 }
886 rcd->rcd_pinginfo = pi;
887
939af333 888 LASSERT(!LNetHandleIsInvalid(the_lnet.ln_rc_eqh));
d7e09d03
PT
889 rc = LNetMDBind((lnet_md_t){.start = pi,
890 .user_ptr = rcd,
891 .length = LNET_PINGINFO_SIZE,
892 .threshold = LNET_MD_THRESH_INF,
893 .options = LNET_MD_TRUNCATE,
894 .eq_handle = the_lnet.ln_rc_eqh},
895 LNET_UNLINK,
896 &rcd->rcd_mdh);
897 if (rc < 0) {
898 CERROR("Can't bind MD: %d\n", rc);
899 goto out;
900 }
901 LASSERT(rc == 0);
902
903 lnet_net_lock(gateway->lp_cpt);
904 /* router table changed or someone has created rcd for this gateway */
905 if (!lnet_isrouter(gateway) || gateway->lp_rcd != NULL) {
906 lnet_net_unlock(gateway->lp_cpt);
907 goto out;
908 }
909
910 lnet_peer_addref_locked(gateway);
911 rcd->rcd_gateway = gateway;
912 gateway->lp_rcd = rcd;
913 gateway->lp_ping_notsent = 0;
914
915 return rcd;
916
917 out:
918 if (rcd != NULL) {
919 if (!LNetHandleIsInvalid(rcd->rcd_mdh)) {
920 rc = LNetMDUnlink(rcd->rcd_mdh);
921 LASSERT(rc == 0);
922 }
923 lnet_destroy_rc_data(rcd);
924 }
925
926 lnet_net_lock(gateway->lp_cpt);
927 return gateway->lp_rcd;
928}
929
930static int
939af333 931lnet_router_check_interval(lnet_peer_t *rtr)
d7e09d03
PT
932{
933 int secs;
934
935 secs = rtr->lp_alive ? live_router_check_interval :
936 dead_router_check_interval;
937 if (secs < 0)
938 secs = 0;
939
940 return secs;
941}
942
943static void
939af333 944lnet_ping_router_locked(lnet_peer_t *rtr)
d7e09d03
PT
945{
946 lnet_rc_data_t *rcd = NULL;
7e7ab095
MS
947 unsigned long now = cfs_time_current();
948 int secs;
d7e09d03
PT
949
950 lnet_peer_addref_locked(rtr);
951
952 if (rtr->lp_ping_deadline != 0 && /* ping timed out? */
953 cfs_time_after(now, rtr->lp_ping_deadline))
954 lnet_notify_locked(rtr, 1, 0, now);
955
956 /* Run any outstanding notifications */
957 lnet_ni_notify_locked(rtr->lp_ni, rtr);
958
959 if (!lnet_isrouter(rtr) ||
960 the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) {
961 /* router table changed or router checker is shutting down */
962 lnet_peer_decref_locked(rtr);
963 return;
964 }
965
966 rcd = rtr->lp_rcd != NULL ?
967 rtr->lp_rcd : lnet_create_rc_data_locked(rtr);
968
969 if (rcd == NULL)
970 return;
971
972 secs = lnet_router_check_interval(rtr);
973
974 CDEBUG(D_NET,
2d00bd17 975 "rtr %s %d: deadline %lu ping_notsent %d alive %d alive_count %d lp_ping_timestamp %lu\n",
d7e09d03
PT
976 libcfs_nid2str(rtr->lp_nid), secs,
977 rtr->lp_ping_deadline, rtr->lp_ping_notsent,
978 rtr->lp_alive, rtr->lp_alive_count, rtr->lp_ping_timestamp);
979
980 if (secs != 0 && !rtr->lp_ping_notsent &&
981 cfs_time_after(now, cfs_time_add(rtr->lp_ping_timestamp,
982 cfs_time_seconds(secs)))) {
7e7ab095 983 int rc;
d7e09d03 984 lnet_process_id_t id;
7e7ab095 985 lnet_handle_md_t mdh;
d7e09d03
PT
986
987 id.nid = rtr->lp_nid;
988 id.pid = LUSTRE_SRV_LNET_PID;
989 CDEBUG(D_NET, "Check: %s\n", libcfs_id2str(id));
990
991 rtr->lp_ping_notsent = 1;
992 rtr->lp_ping_timestamp = now;
993
994 mdh = rcd->rcd_mdh;
995
996 if (rtr->lp_ping_deadline == 0) {
997 rtr->lp_ping_deadline =
998 cfs_time_shift(router_ping_timeout);
999 }
1000
1001 lnet_net_unlock(rtr->lp_cpt);
1002
1003 rc = LNetGet(LNET_NID_ANY, mdh, id, LNET_RESERVED_PORTAL,
1004 LNET_PROTO_PING_MATCHBITS, 0);
1005
1006 lnet_net_lock(rtr->lp_cpt);
1007 if (rc != 0)
1008 rtr->lp_ping_notsent = 0; /* no event pending */
1009 }
1010
1011 lnet_peer_decref_locked(rtr);
d7e09d03
PT
1012}
1013
1014int
1015lnet_router_checker_start(void)
1016{
7e7ab095
MS
1017 int rc;
1018 int eqsz;
d7e09d03 1019
939af333 1020 LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN);
d7e09d03
PT
1021
1022 if (check_routers_before_use &&
1023 dead_router_check_interval <= 0) {
2d00bd17 1024 LCONSOLE_ERROR_MSG(0x10a, "'dead_router_check_interval' must be set if 'check_routers_before_use' is set\n");
d7e09d03
PT
1025 return -EINVAL;
1026 }
1027
1028 if (!the_lnet.ln_routing &&
1029 live_router_check_interval <= 0 &&
1030 dead_router_check_interval <= 0)
1031 return 0;
1032
1033 sema_init(&the_lnet.ln_rc_signal, 0);
4420cfd3
JS
1034 /*
1035 * EQ size doesn't matter; the callback is guaranteed to get every
1036 * event
1037 */
d7e09d03
PT
1038 eqsz = 0;
1039 rc = LNetEQAlloc(eqsz, lnet_router_checker_event,
1040 &the_lnet.ln_rc_eqh);
1041 if (rc != 0) {
1042 CERROR("Can't allocate EQ(%d): %d\n", eqsz, rc);
1043 return -ENOMEM;
1044 }
1045
1046 the_lnet.ln_rc_state = LNET_RC_STATE_RUNNING;
1047 rc = PTR_ERR(kthread_run(lnet_router_checker,
1048 NULL, "router_checker"));
1049 if (IS_ERR_VALUE(rc)) {
1050 CERROR("Can't start router checker thread: %d\n", rc);
1051 /* block until event callback signals exit */
1052 down(&the_lnet.ln_rc_signal);
1053 rc = LNetEQFree(the_lnet.ln_rc_eqh);
1054 LASSERT(rc == 0);
1055 the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN;
1056 return -ENOMEM;
1057 }
1058
1059 if (check_routers_before_use) {
4420cfd3
JS
1060 /*
1061 * Note that a helpful side-effect of pinging all known routers
d7e09d03 1062 * at startup is that it makes them drop stale connections they
4420cfd3
JS
1063 * may have to a previous instance of me.
1064 */
d7e09d03
PT
1065 lnet_wait_known_routerstate();
1066 }
1067
1068 return 0;
1069}
1070
1071void
939af333 1072lnet_router_checker_stop(void)
d7e09d03
PT
1073{
1074 int rc;
1075
1076 if (the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN)
1077 return;
1078
939af333 1079 LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING);
d7e09d03
PT
1080 the_lnet.ln_rc_state = LNET_RC_STATE_STOPPING;
1081
1082 /* block until event callback signals exit */
1083 down(&the_lnet.ln_rc_signal);
1084 LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN);
1085
1086 rc = LNetEQFree(the_lnet.ln_rc_eqh);
939af333 1087 LASSERT(rc == 0);
d7e09d03
PT
1088}
1089
1090static void
1091lnet_prune_rc_data(int wait_unlink)
1092{
7e7ab095
MS
1093 lnet_rc_data_t *rcd;
1094 lnet_rc_data_t *tmp;
1095 lnet_peer_t *lp;
1096 struct list_head head;
1097 int i = 2;
d7e09d03
PT
1098
1099 if (likely(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING &&
1100 list_empty(&the_lnet.ln_rcd_deathrow) &&
1101 list_empty(&the_lnet.ln_rcd_zombie)))
1102 return;
1103
1104 INIT_LIST_HEAD(&head);
1105
1106 lnet_net_lock(LNET_LOCK_EX);
1107
1108 if (the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) {
1109 /* router checker is stopping, prune all */
1110 list_for_each_entry(lp, &the_lnet.ln_routers,
c314c319 1111 lp_rtr_list) {
d7e09d03
PT
1112 if (lp->lp_rcd == NULL)
1113 continue;
1114
1115 LASSERT(list_empty(&lp->lp_rcd->rcd_list));
1116 list_add(&lp->lp_rcd->rcd_list,
c314c319 1117 &the_lnet.ln_rcd_deathrow);
d7e09d03
PT
1118 lp->lp_rcd = NULL;
1119 }
1120 }
1121
1122 /* unlink all RCDs on deathrow list */
1123 list_splice_init(&the_lnet.ln_rcd_deathrow, &head);
1124
1125 if (!list_empty(&head)) {
1126 lnet_net_unlock(LNET_LOCK_EX);
1127
1128 list_for_each_entry(rcd, &head, rcd_list)
1129 LNetMDUnlink(rcd->rcd_mdh);
1130
1131 lnet_net_lock(LNET_LOCK_EX);
1132 }
1133
1134 list_splice_init(&head, &the_lnet.ln_rcd_zombie);
1135
1136 /* release all zombie RCDs */
1137 while (!list_empty(&the_lnet.ln_rcd_zombie)) {
1138 list_for_each_entry_safe(rcd, tmp, &the_lnet.ln_rcd_zombie,
c314c319 1139 rcd_list) {
d7e09d03
PT
1140 if (LNetHandleIsInvalid(rcd->rcd_mdh))
1141 list_move(&rcd->rcd_list, &head);
1142 }
1143
1144 wait_unlink = wait_unlink &&
1145 !list_empty(&the_lnet.ln_rcd_zombie);
1146
1147 lnet_net_unlock(LNET_LOCK_EX);
1148
1149 while (!list_empty(&head)) {
1150 rcd = list_entry(head.next,
c314c319 1151 lnet_rc_data_t, rcd_list);
d7e09d03
PT
1152 list_del_init(&rcd->rcd_list);
1153 lnet_destroy_rc_data(rcd);
1154 }
1155
1156 if (!wait_unlink)
1157 return;
1158
1159 i++;
1160 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
1161 "Waiting for rc buffers to unlink\n");
d3caf4d5
PT
1162 set_current_state(TASK_UNINTERRUPTIBLE);
1163 schedule_timeout(cfs_time_seconds(1) / 4);
d7e09d03
PT
1164
1165 lnet_net_lock(LNET_LOCK_EX);
1166 }
1167
1168 lnet_net_unlock(LNET_LOCK_EX);
1169}
1170
d7e09d03
PT
1171static int
1172lnet_router_checker(void *arg)
1173{
7e7ab095
MS
1174 lnet_peer_t *rtr;
1175 struct list_head *entry;
d7e09d03
PT
1176
1177 cfs_block_allsigs();
1178
939af333 1179 LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING);
d7e09d03
PT
1180
1181 while (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING) {
7e7ab095
MS
1182 __u64 version;
1183 int cpt;
1184 int cpt2;
d7e09d03
PT
1185
1186 cpt = lnet_net_lock_current();
1187rescan:
1188 version = the_lnet.ln_routers_version;
1189
1190 list_for_each(entry, &the_lnet.ln_routers) {
1191 rtr = list_entry(entry, lnet_peer_t, lp_rtr_list);
1192
1193 cpt2 = lnet_cpt_of_nid_locked(rtr->lp_nid);
1194 if (cpt != cpt2) {
1195 lnet_net_unlock(cpt);
1196 cpt = cpt2;
1197 lnet_net_lock(cpt);
1198 /* the routers list has changed */
1199 if (version != the_lnet.ln_routers_version)
1200 goto rescan;
1201 }
1202
1203 lnet_ping_router_locked(rtr);
1204
1205 /* NB dropped lock */
1206 if (version != the_lnet.ln_routers_version) {
1207 /* the routers list has changed */
1208 goto rescan;
1209 }
1210 }
1211
1212 if (the_lnet.ln_routing)
1213 lnet_update_ni_status_locked();
1214
1215 lnet_net_unlock(cpt);
1216
1217 lnet_prune_rc_data(0); /* don't wait for UNLINK */
1218
4420cfd3
JS
1219 /*
1220 * Call schedule_timeout() here always adds 1 to load average
d7e09d03 1221 * because kernel counts # active tasks as nr_running
4420cfd3
JS
1222 * + nr_uninterruptible.
1223 */
18fd5baa
PT
1224 set_current_state(TASK_INTERRUPTIBLE);
1225 schedule_timeout(cfs_time_seconds(1));
d7e09d03
PT
1226 }
1227
1228 LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING);
1229
1230 lnet_prune_rc_data(1); /* wait for UNLINK */
1231
1232 the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN;
1233 up(&the_lnet.ln_rc_signal);
1234 /* The unlink event callback will signal final completion */
1235 return 0;
1236}
1237
2595fa36 1238static void
d7e09d03
PT
1239lnet_destroy_rtrbuf(lnet_rtrbuf_t *rb, int npages)
1240{
1241 int sz = offsetof(lnet_rtrbuf_t, rb_kiov[npages]);
1242
1243 while (--npages >= 0)
1244 __free_page(rb->rb_kiov[npages].kiov_page);
1245
1246 LIBCFS_FREE(rb, sz);
1247}
1248
2595fa36 1249static lnet_rtrbuf_t *
d7e09d03
PT
1250lnet_new_rtrbuf(lnet_rtrbufpool_t *rbp, int cpt)
1251{
7e7ab095
MS
1252 int npages = rbp->rbp_npages;
1253 int sz = offsetof(lnet_rtrbuf_t, rb_kiov[npages]);
1254 struct page *page;
d7e09d03 1255 lnet_rtrbuf_t *rb;
7e7ab095 1256 int i;
d7e09d03
PT
1257
1258 LIBCFS_CPT_ALLOC(rb, lnet_cpt_table(), cpt, sz);
1259 if (rb == NULL)
1260 return NULL;
1261
1262 rb->rb_pool = rbp;
1263
1264 for (i = 0; i < npages; i++) {
49c02a75
PT
1265 page = alloc_pages_node(
1266 cfs_cpt_spread_node(lnet_cpt_table(), cpt),
40113370 1267 GFP_KERNEL | __GFP_ZERO, 0);
d7e09d03
PT
1268 if (page == NULL) {
1269 while (--i >= 0)
1270 __free_page(rb->rb_kiov[i].kiov_page);
1271
1272 LIBCFS_FREE(rb, sz);
1273 return NULL;
1274 }
1275
1276 rb->rb_kiov[i].kiov_len = PAGE_CACHE_SIZE;
1277 rb->rb_kiov[i].kiov_offset = 0;
1278 rb->rb_kiov[i].kiov_page = page;
1279 }
1280
1281 return rb;
1282}
1283
2595fa36 1284static void
d7e09d03
PT
1285lnet_rtrpool_free_bufs(lnet_rtrbufpool_t *rbp)
1286{
7e7ab095
MS
1287 int npages = rbp->rbp_npages;
1288 int nbuffers = 0;
1289 lnet_rtrbuf_t *rb;
d7e09d03
PT
1290
1291 if (rbp->rbp_nbuffers == 0) /* not initialized or already freed */
1292 return;
1293
939af333
HE
1294 LASSERT(list_empty(&rbp->rbp_msgs));
1295 LASSERT(rbp->rbp_credits == rbp->rbp_nbuffers);
d7e09d03
PT
1296
1297 while (!list_empty(&rbp->rbp_bufs)) {
939af333 1298 LASSERT(rbp->rbp_credits > 0);
d7e09d03
PT
1299
1300 rb = list_entry(rbp->rbp_bufs.next,
c314c319 1301 lnet_rtrbuf_t, rb_list);
d7e09d03
PT
1302 list_del(&rb->rb_list);
1303 lnet_destroy_rtrbuf(rb, npages);
1304 nbuffers++;
1305 }
1306
939af333
HE
1307 LASSERT(rbp->rbp_nbuffers == nbuffers);
1308 LASSERT(rbp->rbp_credits == nbuffers);
d7e09d03 1309
d3d3d37a
JS
1310 rbp->rbp_nbuffers = 0;
1311 rbp->rbp_credits = 0;
d7e09d03
PT
1312}
1313
2595fa36 1314static int
d7e09d03
PT
1315lnet_rtrpool_alloc_bufs(lnet_rtrbufpool_t *rbp, int nbufs, int cpt)
1316{
1317 lnet_rtrbuf_t *rb;
7e7ab095 1318 int i;
d7e09d03
PT
1319
1320 if (rbp->rbp_nbuffers != 0) {
939af333 1321 LASSERT(rbp->rbp_nbuffers == nbufs);
d7e09d03
PT
1322 return 0;
1323 }
1324
1325 for (i = 0; i < nbufs; i++) {
1326 rb = lnet_new_rtrbuf(rbp, cpt);
1327
1328 if (rb == NULL) {
1329 CERROR("Failed to allocate %d router bufs of %d pages\n",
1330 nbufs, rbp->rbp_npages);
1331 return -ENOMEM;
1332 }
1333
1334 rbp->rbp_nbuffers++;
1335 rbp->rbp_credits++;
1336 rbp->rbp_mincredits++;
1337 list_add(&rb->rb_list, &rbp->rbp_bufs);
1338
1339 /* No allocation "under fire" */
1340 /* Otherwise we'd need code to schedule blocked msgs etc */
939af333 1341 LASSERT(!the_lnet.ln_routing);
d7e09d03
PT
1342 }
1343
939af333 1344 LASSERT(rbp->rbp_credits == nbufs);
d7e09d03
PT
1345 return 0;
1346}
1347
2595fa36 1348static void
d7e09d03
PT
1349lnet_rtrpool_init(lnet_rtrbufpool_t *rbp, int npages)
1350{
1351 INIT_LIST_HEAD(&rbp->rbp_msgs);
1352 INIT_LIST_HEAD(&rbp->rbp_bufs);
1353
1354 rbp->rbp_npages = npages;
1355 rbp->rbp_credits = 0;
1356 rbp->rbp_mincredits = 0;
1357}
1358
1359void
1360lnet_rtrpools_free(void)
1361{
1362 lnet_rtrbufpool_t *rtrp;
7e7ab095 1363 int i;
d7e09d03
PT
1364
1365 if (the_lnet.ln_rtrpools == NULL) /* uninitialized or freed */
1366 return;
1367
1368 cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) {
1369 lnet_rtrpool_free_bufs(&rtrp[0]);
1370 lnet_rtrpool_free_bufs(&rtrp[1]);
1371 lnet_rtrpool_free_bufs(&rtrp[2]);
1372 }
1373
1374 cfs_percpt_free(the_lnet.ln_rtrpools);
1375 the_lnet.ln_rtrpools = NULL;
1376}
1377
1378static int
1379lnet_nrb_tiny_calculate(int npages)
1380{
7e7ab095 1381 int nrbs = LNET_NRB_TINY;
d7e09d03
PT
1382
1383 if (tiny_router_buffers < 0) {
1384 LCONSOLE_ERROR_MSG(0x10c,
2d00bd17
JP
1385 "tiny_router_buffers=%d invalid when routing enabled\n",
1386 tiny_router_buffers);
d7e09d03
PT
1387 return -1;
1388 }
1389
1390 if (tiny_router_buffers > 0)
1391 nrbs = tiny_router_buffers;
1392
1393 nrbs /= LNET_CPT_NUMBER;
1394 return max(nrbs, LNET_NRB_TINY_MIN);
1395}
1396
1397static int
1398lnet_nrb_small_calculate(int npages)
1399{
7e7ab095 1400 int nrbs = LNET_NRB_SMALL;
d7e09d03
PT
1401
1402 if (small_router_buffers < 0) {
1403 LCONSOLE_ERROR_MSG(0x10c,
2d00bd17
JP
1404 "small_router_buffers=%d invalid when routing enabled\n",
1405 small_router_buffers);
d7e09d03
PT
1406 return -1;
1407 }
1408
1409 if (small_router_buffers > 0)
1410 nrbs = small_router_buffers;
1411
1412 nrbs /= LNET_CPT_NUMBER;
1413 return max(nrbs, LNET_NRB_SMALL_MIN);
1414}
1415
1416static int
1417lnet_nrb_large_calculate(int npages)
1418{
7e7ab095 1419 int nrbs = LNET_NRB_LARGE;
d7e09d03
PT
1420
1421 if (large_router_buffers < 0) {
1422 LCONSOLE_ERROR_MSG(0x10c,
2d00bd17
JP
1423 "large_router_buffers=%d invalid when routing enabled\n",
1424 large_router_buffers);
d7e09d03
PT
1425 return -1;
1426 }
1427
1428 if (large_router_buffers > 0)
1429 nrbs = large_router_buffers;
1430
1431 nrbs /= LNET_CPT_NUMBER;
1432 return max(nrbs, LNET_NRB_LARGE_MIN);
1433}
1434
1435int
1436lnet_rtrpools_alloc(int im_a_router)
1437{
1438 lnet_rtrbufpool_t *rtrp;
7e7ab095
MS
1439 int large_pages;
1440 int small_pages = 1;
1441 int nrb_tiny;
1442 int nrb_small;
1443 int nrb_large;
1444 int rc;
1445 int i;
d7e09d03 1446
e6157b1b
RG
1447 large_pages = (LNET_MTU + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1448
d7e09d03
PT
1449 if (!strcmp(forwarding, "")) {
1450 /* not set either way */
1451 if (!im_a_router)
1452 return 0;
1453 } else if (!strcmp(forwarding, "disabled")) {
1454 /* explicitly disabled */
1455 return 0;
1456 } else if (!strcmp(forwarding, "enabled")) {
1457 /* explicitly enabled */
1458 } else {
2d00bd17 1459 LCONSOLE_ERROR_MSG(0x10b, "'forwarding' not set to either 'enabled' or 'disabled'\n");
d7e09d03
PT
1460 return -EINVAL;
1461 }
1462
1463 nrb_tiny = lnet_nrb_tiny_calculate(0);
1464 if (nrb_tiny < 0)
1465 return -EINVAL;
1466
1467 nrb_small = lnet_nrb_small_calculate(small_pages);
1468 if (nrb_small < 0)
1469 return -EINVAL;
1470
1471 nrb_large = lnet_nrb_large_calculate(large_pages);
1472 if (nrb_large < 0)
1473 return -EINVAL;
1474
1475 the_lnet.ln_rtrpools = cfs_percpt_alloc(lnet_cpt_table(),
1476 LNET_NRBPOOLS *
1477 sizeof(lnet_rtrbufpool_t));
1478 if (the_lnet.ln_rtrpools == NULL) {
1479 LCONSOLE_ERROR_MSG(0x10c,
1480 "Failed to initialize router buffe pool\n");
1481 return -ENOMEM;
1482 }
1483
1484 cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) {
1485 lnet_rtrpool_init(&rtrp[0], 0);
1486 rc = lnet_rtrpool_alloc_bufs(&rtrp[0], nrb_tiny, i);
1487 if (rc != 0)
1488 goto failed;
1489
1490 lnet_rtrpool_init(&rtrp[1], small_pages);
1491 rc = lnet_rtrpool_alloc_bufs(&rtrp[1], nrb_small, i);
1492 if (rc != 0)
1493 goto failed;
1494
1495 lnet_rtrpool_init(&rtrp[2], large_pages);
1496 rc = lnet_rtrpool_alloc_bufs(&rtrp[2], nrb_large, i);
1497 if (rc != 0)
1498 goto failed;
1499 }
1500
1501 lnet_net_lock(LNET_LOCK_EX);
1502 the_lnet.ln_routing = 1;
1503 lnet_net_unlock(LNET_LOCK_EX);
1504
1505 return 0;
1506
1507 failed:
1508 lnet_rtrpools_free();
1509 return rc;
1510}
1511
1512int
a649ad1d 1513lnet_notify(lnet_ni_t *ni, lnet_nid_t nid, int alive, unsigned long when)
d7e09d03 1514{
7e7ab095
MS
1515 struct lnet_peer *lp = NULL;
1516 unsigned long now = cfs_time_current();
1517 int cpt = lnet_cpt_of_nid(nid);
d7e09d03 1518
59cfb96f 1519 LASSERT(!in_interrupt());
d7e09d03 1520
939af333 1521 CDEBUG(D_NET, "%s notifying %s: %s\n",
c314c319
JS
1522 (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid),
1523 libcfs_nid2str(nid),
1524 alive ? "up" : "down");
d7e09d03
PT
1525
1526 if (ni != NULL &&
1527 LNET_NIDNET(ni->ni_nid) != LNET_NIDNET(nid)) {
939af333 1528 CWARN("Ignoring notification of %s %s by %s (different net)\n",
c314c319
JS
1529 libcfs_nid2str(nid), alive ? "birth" : "death",
1530 libcfs_nid2str(ni->ni_nid));
d7e09d03
PT
1531 return -EINVAL;
1532 }
1533
1534 /* can't do predictions... */
1535 if (cfs_time_after(when, now)) {
2d00bd17
JP
1536 CWARN("Ignoring prediction from %s of %s %s %ld seconds in the future\n",
1537 (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid),
1538 libcfs_nid2str(nid), alive ? "up" : "down",
1539 cfs_duration_sec(cfs_time_sub(when, now)));
d7e09d03
PT
1540 return -EINVAL;
1541 }
1542
1543 if (ni != NULL && !alive && /* LND telling me she's down */
1544 !auto_down) { /* auto-down disabled */
1545 CDEBUG(D_NET, "Auto-down disabled\n");
1546 return 0;
1547 }
1548
1549 lnet_net_lock(cpt);
1550
1551 if (the_lnet.ln_shutdown) {
1552 lnet_net_unlock(cpt);
1553 return -ESHUTDOWN;
1554 }
1555
1556 lp = lnet_find_peer_locked(the_lnet.ln_peer_tables[cpt], nid);
1557 if (lp == NULL) {
1558 /* nid not found */
1559 lnet_net_unlock(cpt);
1560 CDEBUG(D_NET, "%s not found\n", libcfs_nid2str(nid));
1561 return 0;
1562 }
1563
4420cfd3
JS
1564 /*
1565 * We can't fully trust LND on reporting exact peer last_alive
d7e09d03
PT
1566 * if he notifies us about dead peer. For example ksocklnd can
1567 * call us with when == _time_when_the_node_was_booted_ if
4420cfd3
JS
1568 * no connections were successfully established
1569 */
d7e09d03
PT
1570 if (ni != NULL && !alive && when < lp->lp_last_alive)
1571 when = lp->lp_last_alive;
1572
1573 lnet_notify_locked(lp, ni == NULL, alive, when);
1574
1575 lnet_ni_notify_locked(ni, lp);
1576
1577 lnet_peer_decref_locked(lp);
1578
1579 lnet_net_unlock(cpt);
1580 return 0;
1581}
1582EXPORT_SYMBOL(lnet_notify);
This page took 0.507164 seconds and 5 git commands to generate.