1/*
2 * GPL HEADER START
3 *
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
19 *
20 * GPL HEADER END
21 */
22/*
23 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
25 *
26 * Copyright (c) 2011, 2015, Intel Corporation.
27 */
28/*
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
31 *
32 * lnet/klnds/o2iblnd/o2iblnd.c
33 *
34 * Author: Eric Barton <eric@bartonsoftware.com>
35 */
36
37#include <asm/div64.h>
38#include <asm/page.h>
39#include "o2iblnd.h"
40
41static lnd_t the_o2iblnd;
42
43struct kib_data kiblnd_data;
44
45static __u32 kiblnd_cksum(void *ptr, int nob)
46{
47 char *c = ptr;
48 __u32 sum = 0;
49
50 while (nob-- > 0)
51 sum = ((sum << 1) | (sum >> 31)) + *c++;
52
53 /* ensure I don't return 0 (== no checksum) */
54 return !sum ? 1 : sum;
55}
56
57static char *kiblnd_msgtype2str(int type)
58{
59 switch (type) {
60 case IBLND_MSG_CONNREQ:
61 return "CONNREQ";
62
63 case IBLND_MSG_CONNACK:
64 return "CONNACK";
65
66 case IBLND_MSG_NOOP:
67 return "NOOP";
68
69 case IBLND_MSG_IMMEDIATE:
70 return "IMMEDIATE";
71
72 case IBLND_MSG_PUT_REQ:
73 return "PUT_REQ";
74
75 case IBLND_MSG_PUT_NAK:
76 return "PUT_NAK";
77
78 case IBLND_MSG_PUT_ACK:
79 return "PUT_ACK";
80
81 case IBLND_MSG_PUT_DONE:
82 return "PUT_DONE";
83
84 case IBLND_MSG_GET_REQ:
85 return "GET_REQ";
86
87 case IBLND_MSG_GET_DONE:
88 return "GET_DONE";
89
90 default:
91 return "???";
92 }
93}
94
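/* Return the expected on-wire size for a message of the given type, or -1
 * for an unrecognised type; used to validate incoming message lengths. */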
95static int kiblnd_msgtype2size(int type)
96{
97 const int hdr_size = offsetof(struct kib_msg, ibm_u);
98
99 switch (type) {
100 case IBLND_MSG_CONNREQ:
101 case IBLND_MSG_CONNACK:
102 return hdr_size + sizeof(struct kib_connparams);
103
104 case IBLND_MSG_NOOP:
105 return hdr_size;
106
107 case IBLND_MSG_IMMEDIATE:
108 return offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[0]);
109
110 case IBLND_MSG_PUT_REQ:
111 return hdr_size + sizeof(struct kib_putreq_msg);
112
113 case IBLND_MSG_PUT_ACK:
114 return hdr_size + sizeof(struct kib_putack_msg);
115
116 case IBLND_MSG_GET_REQ:
117 return hdr_size + sizeof(struct kib_get_msg);
118
119 case IBLND_MSG_PUT_NAK:
120 case IBLND_MSG_PUT_DONE:
121 case IBLND_MSG_GET_DONE:
122 return hdr_size + sizeof(struct kib_completion_msg);
123 default:
124 return -1;
125 }
126}
127
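/* Sanity-check (and byte-swap when 'flip' is set) the RDMA descriptor
 * carried by a GET_REQ or PUT_ACK message; returns 0 on success, 1 when the
 * descriptor is truncated or describes an oversized payload. */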
128static int kiblnd_unpack_rd(struct kib_msg *msg, int flip)
129{
130 struct kib_rdma_desc *rd;
131 int msg_size;
132 int nob;
133 int n;
134 int i;
135
136 LASSERT(msg->ibm_type == IBLND_MSG_GET_REQ ||
137 msg->ibm_type == IBLND_MSG_PUT_ACK);
138
139 rd = msg->ibm_type == IBLND_MSG_GET_REQ ?
140 &msg->ibm_u.get.ibgm_rd :
141 &msg->ibm_u.putack.ibpam_rd;
142
143 if (flip) {
144 __swab32s(&rd->rd_key);
145 __swab32s(&rd->rd_nfrags);
146 }
147
148 n = rd->rd_nfrags;
149
150 nob = offsetof(struct kib_msg, ibm_u) +
151 kiblnd_rd_msg_size(rd, msg->ibm_type, n);
152
153 if (msg->ibm_nob < nob) {
154 CERROR("Short %s: %d(%d)\n",
155 kiblnd_msgtype2str(msg->ibm_type), msg->ibm_nob, nob);
156 return 1;
157 }
158
159 msg_size = kiblnd_rd_size(rd);
160 if (msg_size <= 0 || msg_size > LNET_MAX_PAYLOAD) {
161 CERROR("Bad msg_size: %d, should be 0 < n <= %d\n",
162 msg_size, LNET_MAX_PAYLOAD);
163 return 1;
164 }
165
166 if (!flip)
167 return 0;
168
169 for (i = 0; i < n; i++) {
170 __swab32s(&rd->rd_frags[i].rf_nob);
171 __swab64s(&rd->rd_frags[i].rf_addr);
172 }
173
174 return 0;
175}
176
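/* Fill in the common header of an outgoing message for 'dstnid' and, when
 * the 'cksum' tunable is set, checksum the whole ibm_nob bytes. */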
177void kiblnd_pack_msg(lnet_ni_t *ni, struct kib_msg *msg, int version,
178 int credits, lnet_nid_t dstnid, __u64 dststamp)
179{
180 struct kib_net *net = ni->ni_data;
181
182 /*
183 * CAVEAT EMPTOR! all message fields not set here should have been
184 * initialised previously.
185 */
186 msg->ibm_magic = IBLND_MSG_MAGIC;
187 msg->ibm_version = version;
188 /* ibm_type */
189 msg->ibm_credits = credits;
190 /* ibm_nob */
191 msg->ibm_cksum = 0;
192 msg->ibm_srcnid = ni->ni_nid;
193 msg->ibm_srcstamp = net->ibn_incarnation;
194 msg->ibm_dstnid = dstnid;
195 msg->ibm_dststamp = dststamp;
196
197 if (*kiblnd_tunables.kib_cksum) {
198 /* NB ibm_cksum zero while computing cksum */
199 msg->ibm_cksum = kiblnd_cksum(msg, msg->ibm_nob);
200 }
201}
202
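/* Validate an incoming message in place: check magic, version, length and
 * checksum, then byte-swap header and payload fields when the sender has
 * opposite endianness. Returns 0 or -EPROTO. */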
203int kiblnd_unpack_msg(struct kib_msg *msg, int nob)
204{
205 const int hdr_size = offsetof(struct kib_msg, ibm_u);
206 __u32 msg_cksum;
207 __u16 version;
208 int msg_nob;
209 int flip;
210
211 /* 6 bytes are enough to have received magic + version */
212 if (nob < 6) {
213 CERROR("Short message: %d\n", nob);
214 return -EPROTO;
215 }
216
217 if (msg->ibm_magic == IBLND_MSG_MAGIC) {
218 flip = 0;
219 } else if (msg->ibm_magic == __swab32(IBLND_MSG_MAGIC)) {
220 flip = 1;
221 } else {
222 CERROR("Bad magic: %08x\n", msg->ibm_magic);
223 return -EPROTO;
224 }
225
226 version = flip ? __swab16(msg->ibm_version) : msg->ibm_version;
227 if (version != IBLND_MSG_VERSION &&
228 version != IBLND_MSG_VERSION_1) {
229 CERROR("Bad version: %x\n", version);
230 return -EPROTO;
231 }
232
233 if (nob < hdr_size) {
234 CERROR("Short message: %d\n", nob);
235 return -EPROTO;
236 }
237
238 msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob;
239 if (msg_nob > nob) {
240 CERROR("Short message: got %d, wanted %d\n", nob, msg_nob);
241 return -EPROTO;
242 }
243
244 /*
245 * checksum must be computed with ibm_cksum zero and BEFORE anything
246 * gets flipped
247 */
248 msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum;
249 msg->ibm_cksum = 0;
250 if (msg_cksum &&
251 msg_cksum != kiblnd_cksum(msg, msg_nob)) {
252 CERROR("Bad checksum\n");
253 return -EPROTO;
254 }
255
256 msg->ibm_cksum = msg_cksum;
257
258 if (flip) {
259 /* leave magic unflipped as a clue to peer endianness */
260 msg->ibm_version = version;
261 CLASSERT(sizeof(msg->ibm_type) == 1);
262 CLASSERT(sizeof(msg->ibm_credits) == 1);
263 msg->ibm_nob = msg_nob;
264 __swab64s(&msg->ibm_srcnid);
265 __swab64s(&msg->ibm_srcstamp);
266 __swab64s(&msg->ibm_dstnid);
267 __swab64s(&msg->ibm_dststamp);
268 }
269
270 if (msg->ibm_srcnid == LNET_NID_ANY) {
271 CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid));
272 return -EPROTO;
273 }
274
275 if (msg_nob < kiblnd_msgtype2size(msg->ibm_type)) {
276 CERROR("Short %s: %d(%d)\n", kiblnd_msgtype2str(msg->ibm_type),
277 msg_nob, kiblnd_msgtype2size(msg->ibm_type));
278 return -EPROTO;
279 }
280
281 switch (msg->ibm_type) {
282 default:
283 CERROR("Unknown message type %x\n", msg->ibm_type);
284 return -EPROTO;
285
286 case IBLND_MSG_NOOP:
287 case IBLND_MSG_IMMEDIATE:
288 case IBLND_MSG_PUT_REQ:
289 break;
290
291 case IBLND_MSG_PUT_ACK:
292 case IBLND_MSG_GET_REQ:
293 if (kiblnd_unpack_rd(msg, flip))
294 return -EPROTO;
295 break;
296
297 case IBLND_MSG_PUT_NAK:
298 case IBLND_MSG_PUT_DONE:
299 case IBLND_MSG_GET_DONE:
300 if (flip)
301 __swab32s(&msg->ibm_u.completion.ibcm_status);
302 break;
303
304 case IBLND_MSG_CONNREQ:
305 case IBLND_MSG_CONNACK:
306 if (flip) {
307 __swab16s(&msg->ibm_u.connparams.ibcp_queue_depth);
308 __swab16s(&msg->ibm_u.connparams.ibcp_max_frags);
309 __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size);
310 }
311 break;
312 }
313 return 0;
314}
315
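/* Allocate and initialise a peer for 'nid' on the matching CPT; the caller
 * gets the initial reference and the peer is not yet in the peer table. */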
316int kiblnd_create_peer(lnet_ni_t *ni, struct kib_peer **peerp, lnet_nid_t nid)
317{
318 struct kib_peer *peer;
319 struct kib_net *net = ni->ni_data;
320 int cpt = lnet_cpt_of_nid(nid);
321 unsigned long flags;
322
323 LASSERT(net);
324 LASSERT(nid != LNET_NID_ANY);
325
326 LIBCFS_CPT_ALLOC(peer, lnet_cpt_table(), cpt, sizeof(*peer));
327 if (!peer) {
328 CERROR("Cannot allocate peer\n");
329 return -ENOMEM;
330 }
331
332 peer->ibp_ni = ni;
333 peer->ibp_nid = nid;
334 peer->ibp_error = 0;
335 peer->ibp_last_alive = 0;
336 peer->ibp_max_frags = kiblnd_cfg_rdma_frags(peer->ibp_ni);
337 peer->ibp_queue_depth = ni->ni_peertxcredits;
338 atomic_set(&peer->ibp_refcount, 1); /* 1 ref for caller */
339
340 INIT_LIST_HEAD(&peer->ibp_list); /* not in the peer table yet */
341 INIT_LIST_HEAD(&peer->ibp_conns);
342 INIT_LIST_HEAD(&peer->ibp_tx_queue);
343
344 write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
345
346 /* always called with a ref on ni, which prevents ni being shutdown */
347 LASSERT(!net->ibn_shutdown);
348
349 /* npeers only grows with the global lock held */
350 atomic_inc(&net->ibn_npeers);
351
352 write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
353
354 *peerp = peer;
355 return 0;
356}
357
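/* Final teardown of a peer whose refcount has reached zero: it must be
 * idle, unhashed and have no queued transmits. Decrementing ibn_npeers
 * last lets network shutdown wait for every peer to go away. */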
358void kiblnd_destroy_peer(struct kib_peer *peer)
359{
360 struct kib_net *net = peer->ibp_ni->ni_data;
361
362 LASSERT(net);
363 LASSERT(!atomic_read(&peer->ibp_refcount));
364 LASSERT(!kiblnd_peer_active(peer));
365 LASSERT(kiblnd_peer_idle(peer));
366 LASSERT(list_empty(&peer->ibp_tx_queue));
367
368 LIBCFS_FREE(peer, sizeof(*peer));
369
370 /*
371 * NB a peer's connections keep a reference on their peer until
372 * they are destroyed, so we can be assured that _all_ state to do
373 * with this peer has been cleaned up when its refcount drops to
374 * zero.
375 */
376 atomic_dec(&net->ibn_npeers);
377}
378
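/* Look up a peer by NID in the global peer hash; called with
 * kib_global_lock held. Any extra reference is the caller's to manage. */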
379struct kib_peer *kiblnd_find_peer_locked(lnet_nid_t nid)
380{
381 /*
382 * the caller is responsible for accounting the additional reference
383 * that this creates
384 */
385 struct list_head *peer_list = kiblnd_nid2peerlist(nid);
386 struct list_head *tmp;
387 struct kib_peer *peer;
388
389 list_for_each(tmp, peer_list) {
390 peer = list_entry(tmp, struct kib_peer, ibp_list);
391 LASSERT(!kiblnd_peer_idle(peer));
392
393 if (peer->ibp_nid != nid)
394 continue;
395
396 CDEBUG(D_NET, "got peer [%p] -> %s (%d) version: %x\n",
397 peer, libcfs_nid2str(nid),
398 atomic_read(&peer->ibp_refcount),
399 peer->ibp_version);
400 return peer;
401 }
402 return NULL;
403}
404
405void kiblnd_unlink_peer_locked(struct kib_peer *peer)
406{
407 LASSERT(list_empty(&peer->ibp_conns));
408
409 LASSERT(kiblnd_peer_active(peer));
410 list_del_init(&peer->ibp_list);
411 /* lose peerlist's ref */
412 kiblnd_peer_decref(peer);
413}
414
415static int kiblnd_get_peer_info(lnet_ni_t *ni, int index,
416 lnet_nid_t *nidp, int *count)
417{
418 struct kib_peer *peer;
419 struct list_head *ptmp;
420 int i;
421 unsigned long flags;
422
423 read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
424
425 for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) {
426 list_for_each(ptmp, &kiblnd_data.kib_peers[i]) {
427 peer = list_entry(ptmp, struct kib_peer, ibp_list);
428 LASSERT(!kiblnd_peer_idle(peer));
429
430 if (peer->ibp_ni != ni)
431 continue;
432
433 if (index-- > 0)
434 continue;
435
436 *nidp = peer->ibp_nid;
437 *count = atomic_read(&peer->ibp_refcount);
438
439 read_unlock_irqrestore(&kiblnd_data.kib_global_lock,
440 flags);
441 return 0;
442 }
443 }
444
445 read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
446 return -ENOENT;
447}
448
449static void kiblnd_del_peer_locked(struct kib_peer *peer)
450{
451 struct list_head *ctmp;
452 struct list_head *cnxt;
453 struct kib_conn *conn;
454
455 if (list_empty(&peer->ibp_conns)) {
456 kiblnd_unlink_peer_locked(peer);
457 } else {
458 list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) {
459 conn = list_entry(ctmp, struct kib_conn, ibc_list);
460
461 kiblnd_close_conn_locked(conn, 0);
462 }
463 /* NB closing peer's last conn unlinked it. */
464 }
465 /*
466 * NB peer now unlinked; might even be freed if the peer table had the
467 * last ref on it.
468 */
469}
470
471static int kiblnd_del_peer(lnet_ni_t *ni, lnet_nid_t nid)
472{
473 LIST_HEAD(zombies);
474 struct list_head *ptmp;
475 struct list_head *pnxt;
476 struct kib_peer *peer;
477 int lo;
478 int hi;
479 int i;
480 unsigned long flags;
481 int rc = -ENOENT;
482
483 write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
484
485 if (nid != LNET_NID_ANY) {
486 lo = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
487 hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
488 } else {
489 lo = 0;
490 hi = kiblnd_data.kib_peer_hash_size - 1;
491 }
492
493 for (i = lo; i <= hi; i++) {
494 list_for_each_safe(ptmp, pnxt, &kiblnd_data.kib_peers[i]) {
495 peer = list_entry(ptmp, struct kib_peer, ibp_list);
496 LASSERT(!kiblnd_peer_idle(peer));
497
498 if (peer->ibp_ni != ni)
499 continue;
500
501 if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid))
502 continue;
503
504 if (!list_empty(&peer->ibp_tx_queue)) {
505 LASSERT(list_empty(&peer->ibp_conns));
506
507 list_splice_init(&peer->ibp_tx_queue,
508 &zombies);
509 }
510
511 kiblnd_del_peer_locked(peer);
512 rc = 0; /* matched something */
513 }
514 }
515
516 write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
517
518 kiblnd_txlist_done(ni, &zombies, -EIO);
519
520 return rc;
521}
522
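/* Walk the peer table and return the index'th connection on this NI with a
 * reference held (used by the GET_CONN ioctl); NULL when out of range. */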
523static struct kib_conn *kiblnd_get_conn_by_idx(lnet_ni_t *ni, int index)
524{
525 struct kib_peer *peer;
526 struct list_head *ptmp;
527 struct kib_conn *conn;
528 struct list_head *ctmp;
529 int i;
530 unsigned long flags;
531
532 read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
533
534 for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) {
535 list_for_each(ptmp, &kiblnd_data.kib_peers[i]) {
536 peer = list_entry(ptmp, struct kib_peer, ibp_list);
537 LASSERT(!kiblnd_peer_idle(peer));
538
539 if (peer->ibp_ni != ni)
540 continue;
541
542 list_for_each(ctmp, &peer->ibp_conns) {
543 if (index-- > 0)
544 continue;
545
546 conn = list_entry(ctmp, struct kib_conn,
547 ibc_list);
548 kiblnd_conn_addref(conn);
549 read_unlock_irqrestore(
550 &kiblnd_data.kib_global_lock,
551 flags);
552 return conn;
553 }
554 }
555 }
556
557 read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
558 return NULL;
559}
560
561int kiblnd_translate_mtu(int value)
562{
563 switch (value) {
564 default:
565 return -1;
566 case 0:
567 return 0;
568 case 256:
569 return IB_MTU_256;
570 case 512:
571 return IB_MTU_512;
572 case 1024:
573 return IB_MTU_1024;
574 case 2048:
575 return IB_MTU_2048;
576 case 4096:
577 return IB_MTU_4096;
578 }
579}
580
581static void kiblnd_setup_mtu_locked(struct rdma_cm_id *cmid)
582{
583 int mtu;
584
585 /* XXX There is no path record for iWARP, set by netdev->change_mtu? */
586 if (!cmid->route.path_rec)
587 return;
588
589 mtu = kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu);
590 LASSERT(mtu >= 0);
591 if (mtu)
592 cmid->route.path_rec->mtu = mtu;
593}
594
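/* Pick a completion vector for this connection's CQ by hashing the peer NID
 * onto one of the CPUs in this CPT's cpumask. */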
595static int kiblnd_get_completion_vector(struct kib_conn *conn, int cpt)
596{
597 cpumask_t *mask;
598 int vectors;
599 int off;
600 int i;
601 lnet_nid_t nid = conn->ibc_peer->ibp_nid;
602
603 vectors = conn->ibc_cmid->device->num_comp_vectors;
604 if (vectors <= 1)
605 return 0;
606
607 mask = cfs_cpt_cpumask(lnet_cpt_table(), cpt);
608 if (!mask)
609 return 0;
610
611 /* hash NID to CPU id in this partition... */
612 off = do_div(nid, cpumask_weight(mask));
613 for_each_cpu(i, mask) {
614 if (!off--)
615 return i % vectors;
616 }
617
618 LBUG();
619 return 1;
620}
621
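/* Build a new connection on 'cmid' for 'peer': allocate RX buffers, create
 * the CQ and QP, and post the initial receives. See the CAVEAT EMPTOR note
 * below for who owns 'peer' and 'cmid' on success versus failure. */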
622struct kib_conn *kiblnd_create_conn(struct kib_peer *peer, struct rdma_cm_id *cmid,
623 int state, int version)
624{
625 /*
626 * CAVEAT EMPTOR:
627 * If the new conn is created successfully it takes over the caller's
628 * ref on 'peer'. It also "owns" 'cmid' and destroys it when it itself
629 * is destroyed. On failure, the caller's ref on 'peer' remains and
630 * she must dispose of 'cmid'. (Actually I'd block forever if I tried
631 * to destroy 'cmid' here since I'm called from the CM which still has
632 * its ref on 'cmid').
633 */
634 rwlock_t *glock = &kiblnd_data.kib_global_lock;
635 struct kib_net *net = peer->ibp_ni->ni_data;
636 struct kib_dev *dev;
637 struct ib_qp_init_attr *init_qp_attr;
638 struct kib_sched_info *sched;
639 struct ib_cq_init_attr cq_attr = {};
640 struct kib_conn *conn;
641 struct ib_cq *cq;
642 unsigned long flags;
643 int cpt;
644 int rc;
645 int i;
646
647 LASSERT(net);
648 LASSERT(!in_interrupt());
649
650 dev = net->ibn_dev;
651
652 cpt = lnet_cpt_of_nid(peer->ibp_nid);
653 sched = kiblnd_data.kib_scheds[cpt];
654
655 LASSERT(sched->ibs_nthreads > 0);
656
657 LIBCFS_CPT_ALLOC(init_qp_attr, lnet_cpt_table(), cpt,
658 sizeof(*init_qp_attr));
659 if (!init_qp_attr) {
660 CERROR("Can't allocate qp_attr for %s\n",
661 libcfs_nid2str(peer->ibp_nid));
662 goto failed_0;
663 }
664
665 LIBCFS_CPT_ALLOC(conn, lnet_cpt_table(), cpt, sizeof(*conn));
666 if (!conn) {
667 CERROR("Can't allocate connection for %s\n",
668 libcfs_nid2str(peer->ibp_nid));
669 goto failed_1;
670 }
671
672 conn->ibc_state = IBLND_CONN_INIT;
673 conn->ibc_version = version;
674 conn->ibc_peer = peer; /* I take the caller's ref */
675 cmid->context = conn; /* for future CM callbacks */
676 conn->ibc_cmid = cmid;
677 conn->ibc_max_frags = peer->ibp_max_frags;
678 conn->ibc_queue_depth = peer->ibp_queue_depth;
679
680 INIT_LIST_HEAD(&conn->ibc_early_rxs);
681 INIT_LIST_HEAD(&conn->ibc_tx_noops);
682 INIT_LIST_HEAD(&conn->ibc_tx_queue);
683 INIT_LIST_HEAD(&conn->ibc_tx_queue_rsrvd);
684 INIT_LIST_HEAD(&conn->ibc_tx_queue_nocred);
685 INIT_LIST_HEAD(&conn->ibc_active_txs);
686 spin_lock_init(&conn->ibc_lock);
687
688 LIBCFS_CPT_ALLOC(conn->ibc_connvars, lnet_cpt_table(), cpt,
689 sizeof(*conn->ibc_connvars));
690 if (!conn->ibc_connvars) {
691 CERROR("Can't allocate in-progress connection state\n");
692 goto failed_2;
693 }
694
695 write_lock_irqsave(glock, flags);
696 if (dev->ibd_failover) {
697 write_unlock_irqrestore(glock, flags);
698 CERROR("%s: failover in progress\n", dev->ibd_ifname);
699 goto failed_2;
700 }
701
702 if (dev->ibd_hdev->ibh_ibdev != cmid->device) {
703 /* wakeup failover thread and teardown connection */
704 if (kiblnd_dev_can_failover(dev)) {
705 list_add_tail(&dev->ibd_fail_list,
706 &kiblnd_data.kib_failed_devs);
707 wake_up(&kiblnd_data.kib_failover_waitq);
708 }
709
710 write_unlock_irqrestore(glock, flags);
711 CERROR("cmid HCA(%s), kib_dev(%s) need failover\n",
712 cmid->device->name, dev->ibd_ifname);
713 goto failed_2;
714 }
715
716 kiblnd_hdev_addref_locked(dev->ibd_hdev);
717 conn->ibc_hdev = dev->ibd_hdev;
718
719 kiblnd_setup_mtu_locked(cmid);
720
721 write_unlock_irqrestore(glock, flags);
722
723 LIBCFS_CPT_ALLOC(conn->ibc_rxs, lnet_cpt_table(), cpt,
724 IBLND_RX_MSGS(conn) * sizeof(struct kib_rx));
725 if (!conn->ibc_rxs) {
726 CERROR("Cannot allocate RX buffers\n");
727 goto failed_2;
728 }
729
730 rc = kiblnd_alloc_pages(&conn->ibc_rx_pages, cpt,
731 IBLND_RX_MSG_PAGES(conn));
732 if (rc)
733 goto failed_2;
734
735 kiblnd_map_rx_descs(conn);
736
737 cq_attr.cqe = IBLND_CQ_ENTRIES(conn);
738 cq_attr.comp_vector = kiblnd_get_completion_vector(conn, cpt);
739 cq = ib_create_cq(cmid->device,
740 kiblnd_cq_completion, kiblnd_cq_event, conn,
741 &cq_attr);
742 if (IS_ERR(cq)) {
743 CERROR("Failed to create CQ with %d CQEs: %ld\n",
744 IBLND_CQ_ENTRIES(conn), PTR_ERR(cq));
745 goto failed_2;
746 }
747
748 conn->ibc_cq = cq;
749
750 rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
751 if (rc) {
752 CERROR("Can't request completion notification: %d\n", rc);
753 goto failed_2;
754 }
755
756 init_qp_attr->event_handler = kiblnd_qp_event;
757 init_qp_attr->qp_context = conn;
758 init_qp_attr->cap.max_send_wr = IBLND_SEND_WRS(conn);
759 init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(conn);
760 init_qp_attr->cap.max_send_sge = 1;
761 init_qp_attr->cap.max_recv_sge = 1;
762 init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
763 init_qp_attr->qp_type = IB_QPT_RC;
764 init_qp_attr->send_cq = cq;
765 init_qp_attr->recv_cq = cq;
766
767 conn->ibc_sched = sched;
768
769 rc = rdma_create_qp(cmid, conn->ibc_hdev->ibh_pd, init_qp_attr);
770 if (rc) {
771 CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d\n",
772 rc, init_qp_attr->cap.max_send_wr,
773 init_qp_attr->cap.max_recv_wr);
774 goto failed_2;
775 }
776
777 LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr));
778
779 /* 1 ref for caller and each rxmsg */
780 atomic_set(&conn->ibc_refcount, 1 + IBLND_RX_MSGS(conn));
781 conn->ibc_nrx = IBLND_RX_MSGS(conn);
782
783 /* post receives */
784 for (i = 0; i < IBLND_RX_MSGS(conn); i++) {
785 rc = kiblnd_post_rx(&conn->ibc_rxs[i],
786 IBLND_POSTRX_NO_CREDIT);
787 if (rc) {
788 CERROR("Can't post rxmsg: %d\n", rc);
789
790 /* Make posted receives complete */
791 kiblnd_abort_receives(conn);
792
793 /*
794 * correct # of posted buffers
795 * NB locking needed now I'm racing with completion
796 */
797 spin_lock_irqsave(&sched->ibs_lock, flags);
798 conn->ibc_nrx -= IBLND_RX_MSGS(conn) - i;
799 spin_unlock_irqrestore(&sched->ibs_lock, flags);
800
801 /*
802 * cmid will be destroyed by CM(ofed) after cm_callback
803 * returned, so we can't refer to it anymore
804 * (by kiblnd_connd()->kiblnd_destroy_conn)
805 */
806 rdma_destroy_qp(conn->ibc_cmid);
807 conn->ibc_cmid = NULL;
808
809 /* Drop my own and unused rxbuffer refcounts */
810 while (i++ <= IBLND_RX_MSGS(conn))
811 kiblnd_conn_decref(conn);
812
813 return NULL;
814 }
815 }
816
817 /* Init successful! */
818 LASSERT(state == IBLND_CONN_ACTIVE_CONNECT ||
819 state == IBLND_CONN_PASSIVE_WAIT);
820 conn->ibc_state = state;
821
822 /* 1 more conn */
823 atomic_inc(&net->ibn_nconns);
824 return conn;
825
826 failed_2:
827 kiblnd_destroy_conn(conn, true);
828 failed_1:
829 LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr));
830 failed_0:
831 return NULL;
832}
833
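/* Tear down a connection whose refcount has dropped to zero: destroy the QP
 * and CQ, unmap and free the RX buffers and, unless the conn is still in
 * IBLND_CONN_INIT, drop the peer reference and destroy the cmid. */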
834void kiblnd_destroy_conn(struct kib_conn *conn, bool free_conn)
835{
836 struct rdma_cm_id *cmid = conn->ibc_cmid;
837 struct kib_peer *peer = conn->ibc_peer;
838 int rc;
839
840 LASSERT(!in_interrupt());
841 LASSERT(!atomic_read(&conn->ibc_refcount));
842 LASSERT(list_empty(&conn->ibc_early_rxs));
843 LASSERT(list_empty(&conn->ibc_tx_noops));
844 LASSERT(list_empty(&conn->ibc_tx_queue));
845 LASSERT(list_empty(&conn->ibc_tx_queue_rsrvd));
846 LASSERT(list_empty(&conn->ibc_tx_queue_nocred));
847 LASSERT(list_empty(&conn->ibc_active_txs));
848 LASSERT(!conn->ibc_noops_posted);
849 LASSERT(!conn->ibc_nsends_posted);
850
851 switch (conn->ibc_state) {
852 default:
853 /* conn must be completely disengaged from the network */
854 LBUG();
855
856 case IBLND_CONN_DISCONNECTED:
857 /* connvars should have been freed already */
858 LASSERT(!conn->ibc_connvars);
859 break;
860
861 case IBLND_CONN_INIT:
862 break;
863 }
864
865 /* conn->ibc_cmid might be destroyed by CM already */
866 if (cmid && cmid->qp)
867 rdma_destroy_qp(cmid);
868
869 if (conn->ibc_cq) {
870 rc = ib_destroy_cq(conn->ibc_cq);
871 if (rc)
872 CWARN("Error destroying CQ: %d\n", rc);
873 }
874
875 if (conn->ibc_rx_pages)
876 kiblnd_unmap_rx_descs(conn);
877
878 if (conn->ibc_rxs) {
879 LIBCFS_FREE(conn->ibc_rxs,
880 IBLND_RX_MSGS(conn) * sizeof(struct kib_rx));
881 }
882
883 if (conn->ibc_connvars)
884 LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
885
886 if (conn->ibc_hdev)
887 kiblnd_hdev_decref(conn->ibc_hdev);
888
889 /* See CAVEAT EMPTOR above in kiblnd_create_conn */
890 if (conn->ibc_state != IBLND_CONN_INIT) {
891 struct kib_net *net = peer->ibp_ni->ni_data;
892
893 kiblnd_peer_decref(peer);
894 rdma_destroy_id(cmid);
895 atomic_dec(&net->ibn_nconns);
896 }
897
898 LIBCFS_FREE(conn, sizeof(*conn));
899}
900
901int kiblnd_close_peer_conns_locked(struct kib_peer *peer, int why)
902{
903 struct kib_conn *conn;
904 struct list_head *ctmp;
905 struct list_head *cnxt;
906 int count = 0;
907
908 list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) {
909 conn = list_entry(ctmp, struct kib_conn, ibc_list);
910
911 CDEBUG(D_NET, "Closing conn -> %s, version: %x, reason: %d\n",
912 libcfs_nid2str(peer->ibp_nid),
913 conn->ibc_version, why);
914
915 kiblnd_close_conn_locked(conn, why);
916 count++;
917 }
918
919 return count;
920}
921
922int kiblnd_close_stale_conns_locked(struct kib_peer *peer,
923 int version, __u64 incarnation)
924{
925 struct kib_conn *conn;
926 struct list_head *ctmp;
927 struct list_head *cnxt;
928 int count = 0;
929
930 list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) {
931 conn = list_entry(ctmp, struct kib_conn, ibc_list);
932
933 if (conn->ibc_version == version &&
934 conn->ibc_incarnation == incarnation)
935 continue;
936
937 CDEBUG(D_NET,
938 "Closing stale conn -> %s version: %x, incarnation:%#llx(%x, %#llx)\n",
939 libcfs_nid2str(peer->ibp_nid),
940 conn->ibc_version, conn->ibc_incarnation,
941 version, incarnation);
942
943 kiblnd_close_conn_locked(conn, -ESTALE);
944 count++;
945 }
946
947 return count;
948}
949
950static int kiblnd_close_matching_conns(lnet_ni_t *ni, lnet_nid_t nid)
951{
952 struct kib_peer *peer;
953 struct list_head *ptmp;
954 struct list_head *pnxt;
955 int lo;
956 int hi;
957 int i;
958 unsigned long flags;
959 int count = 0;
960
961 write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
962
963 if (nid != LNET_NID_ANY) {
964 lo = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
965 hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
966 } else {
967 lo = 0;
968 hi = kiblnd_data.kib_peer_hash_size - 1;
969 }
970
971 for (i = lo; i <= hi; i++) {
972 list_for_each_safe(ptmp, pnxt, &kiblnd_data.kib_peers[i]) {
973 peer = list_entry(ptmp, struct kib_peer, ibp_list);
974 LASSERT(!kiblnd_peer_idle(peer));
975
976 if (peer->ibp_ni != ni)
977 continue;
978
979 if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid))
980 continue;
981
982 count += kiblnd_close_peer_conns_locked(peer, 0);
983 }
984 }
985
986 write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
987
988 /* wildcards always succeed */
989 if (nid == LNET_NID_ANY)
990 return 0;
991
992 return !count ? -ENOENT : 0;
993}
994
995static int kiblnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
996{
997 struct libcfs_ioctl_data *data = arg;
998 int rc = -EINVAL;
999
1000 switch (cmd) {
1001 case IOC_LIBCFS_GET_PEER: {
1002 lnet_nid_t nid = 0;
1003 int count = 0;
1004
1005 rc = kiblnd_get_peer_info(ni, data->ioc_count,
1006 &nid, &count);
1007 data->ioc_nid = nid;
1008 data->ioc_count = count;
1009 break;
1010 }
1011
1012 case IOC_LIBCFS_DEL_PEER: {
1013 rc = kiblnd_del_peer(ni, data->ioc_nid);
1014 break;
1015 }
1016 case IOC_LIBCFS_GET_CONN: {
1017 struct kib_conn *conn;
1018
1019 rc = 0;
1020 conn = kiblnd_get_conn_by_idx(ni, data->ioc_count);
1021 if (!conn) {
1022 rc = -ENOENT;
1023 break;
1024 }
1025
1026 LASSERT(conn->ibc_cmid);
1027 data->ioc_nid = conn->ibc_peer->ibp_nid;
1028 if (!conn->ibc_cmid->route.path_rec)
1029 data->ioc_u32[0] = 0; /* iWarp has no path MTU */
1030 else
1031 data->ioc_u32[0] =
1032 ib_mtu_enum_to_int(conn->ibc_cmid->route.path_rec->mtu);
1033 kiblnd_conn_decref(conn);
1034 break;
1035 }
1036 case IOC_LIBCFS_CLOSE_CONNECTION: {
1037 rc = kiblnd_close_matching_conns(ni, data->ioc_nid);
1038 break;
1039 }
1040
1041 default:
1042 break;
1043 }
1044
1045 return rc;
1046}
1047
1048static void kiblnd_query(lnet_ni_t *ni, lnet_nid_t nid, unsigned long *when)
1049{
1050 unsigned long last_alive = 0;
1051 unsigned long now = cfs_time_current();
1052 rwlock_t *glock = &kiblnd_data.kib_global_lock;
1053 struct kib_peer *peer;
1054 unsigned long flags;
1055
1056 read_lock_irqsave(glock, flags);
1057
1058 peer = kiblnd_find_peer_locked(nid);
1059 if (peer)
1060 last_alive = peer->ibp_last_alive;
1061
1062 read_unlock_irqrestore(glock, flags);
1063
1064 if (last_alive)
1065 *when = last_alive;
1066
1067 /*
1068 * peer is not persistent in hash, trigger peer creation
1069 * and connection establishment with a NULL tx
1070 */
1071 if (!peer)
1072 kiblnd_launch_tx(ni, NULL, nid);
1073
1074 CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago\n",
1075 libcfs_nid2str(nid), peer,
1076 last_alive ? cfs_duration_sec(now - last_alive) : -1);
1077}
1078
1079static void kiblnd_free_pages(struct kib_pages *p)
1080{
1081 int npages = p->ibp_npages;
1082 int i;
1083
1084 for (i = 0; i < npages; i++) {
1085 if (p->ibp_pages[i])
1086 __free_page(p->ibp_pages[i]);
1087 }
1088
1089 LIBCFS_FREE(p, offsetof(struct kib_pages, ibp_pages[npages]));
1090}
1091
1092int kiblnd_alloc_pages(struct kib_pages **pp, int cpt, int npages)
1093{
1094 struct kib_pages *p;
1095 int i;
1096
1097 LIBCFS_CPT_ALLOC(p, lnet_cpt_table(), cpt,
1098 offsetof(struct kib_pages, ibp_pages[npages]));
1099 if (!p) {
1100 CERROR("Can't allocate descriptor for %d pages\n", npages);
1101 return -ENOMEM;
1102 }
1103
1104 memset(p, 0, offsetof(struct kib_pages, ibp_pages[npages]));
1105 p->ibp_npages = npages;
1106
1107 for (i = 0; i < npages; i++) {
1108 p->ibp_pages[i] = alloc_pages_node(
1109 cfs_cpt_spread_node(lnet_cpt_table(), cpt),
1110 GFP_NOFS, 0);
1111 if (!p->ibp_pages[i]) {
1112 CERROR("Can't allocate page %d of %d\n", i, npages);
1113 kiblnd_free_pages(p);
1114 return -ENOMEM;
1115 }
1116 }
1117
1118 *pp = p;
1119 return 0;
1120}
1121
1122void kiblnd_unmap_rx_descs(struct kib_conn *conn)
1123{
1124 struct kib_rx *rx;
1125 int i;
1126
1127 LASSERT(conn->ibc_rxs);
1128 LASSERT(conn->ibc_hdev);
1129
1130 for (i = 0; i < IBLND_RX_MSGS(conn); i++) {
1131 rx = &conn->ibc_rxs[i];
1132
1133 LASSERT(rx->rx_nob >= 0); /* not posted */
1134
1135 kiblnd_dma_unmap_single(conn->ibc_hdev->ibh_ibdev,
1136 KIBLND_UNMAP_ADDR(rx, rx_msgunmap,
1137 rx->rx_msgaddr),
1138 IBLND_MSG_SIZE, DMA_FROM_DEVICE);
1139 }
1140
1141 kiblnd_free_pages(conn->ibc_rx_pages);
1142
1143 conn->ibc_rx_pages = NULL;
1144}
1145
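/* Carve the connection's RX pages into IBLND_MSG_SIZE slots and DMA-map one
 * RX descriptor per slot. */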
1146void kiblnd_map_rx_descs(struct kib_conn *conn)
1147{
1148 struct kib_rx *rx;
1149 struct page *pg;
1150 int pg_off;
1151 int ipg;
1152 int i;
1153
1154 for (pg_off = ipg = i = 0; i < IBLND_RX_MSGS(conn); i++) {
1155 pg = conn->ibc_rx_pages->ibp_pages[ipg];
1156 rx = &conn->ibc_rxs[i];
1157
1158 rx->rx_conn = conn;
1159 rx->rx_msg = (struct kib_msg *)(((char *)page_address(pg)) + pg_off);
1160
1161 rx->rx_msgaddr = kiblnd_dma_map_single(conn->ibc_hdev->ibh_ibdev,
1162 rx->rx_msg,
1163 IBLND_MSG_SIZE,
1164 DMA_FROM_DEVICE);
1165 LASSERT(!kiblnd_dma_mapping_error(conn->ibc_hdev->ibh_ibdev,
1166 rx->rx_msgaddr));
1167 KIBLND_UNMAP_ADDR_SET(rx, rx_msgunmap, rx->rx_msgaddr);
1168
1169 CDEBUG(D_NET, "rx %d: %p %#llx(%#llx)\n",
1170 i, rx->rx_msg, rx->rx_msgaddr,
1171 (__u64)(page_to_phys(pg) + pg_off));
1172
1173 pg_off += IBLND_MSG_SIZE;
1174 LASSERT(pg_off <= PAGE_SIZE);
1175
1176 if (pg_off == PAGE_SIZE) {
1177 pg_off = 0;
1178 ipg++;
1179 LASSERT(ipg <= IBLND_RX_MSG_PAGES(conn));
1180 }
1181 }
1182}
1183
1184static void kiblnd_unmap_tx_pool(struct kib_tx_pool *tpo)
1185{
1186 struct kib_hca_dev *hdev = tpo->tpo_hdev;
1187 struct kib_tx *tx;
1188 int i;
1189
1190 LASSERT(!tpo->tpo_pool.po_allocated);
1191
1192 if (!hdev)
1193 return;
1194
1195 for (i = 0; i < tpo->tpo_pool.po_size; i++) {
1196 tx = &tpo->tpo_tx_descs[i];
1197 kiblnd_dma_unmap_single(hdev->ibh_ibdev,
1198 KIBLND_UNMAP_ADDR(tx, tx_msgunmap,
1199 tx->tx_msgaddr),
1200 IBLND_MSG_SIZE, DMA_TO_DEVICE);
1201 }
1202
1203 kiblnd_hdev_decref(hdev);
1204 tpo->tpo_hdev = NULL;
1205}
1206
1207static struct kib_hca_dev *kiblnd_current_hdev(struct kib_dev *dev)
1208{
1209 struct kib_hca_dev *hdev;
1210 unsigned long flags;
1211 int i = 0;
d7e09d03
PT
1212
1213 read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
1214 while (dev->ibd_failover) {
1215 read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
1216 if (!(i++ % 50))
1217 CDEBUG(D_NET, "%s: Wait for failover\n",
1218 dev->ibd_ifname);
1219 set_current_state(TASK_INTERRUPTIBLE);
1220 schedule_timeout(cfs_time_seconds(1) / 100);
1221
1222 read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
1223 }
1224
1225 kiblnd_hdev_addref_locked(dev->ibd_hdev);
1226 hdev = dev->ibd_hdev;
1227
1228 read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
1229
1230 return hdev;
1231}
1232
1233static void kiblnd_map_tx_pool(struct kib_tx_pool *tpo)
1234{
1235 struct kib_pages *txpgs = tpo->tpo_tx_pages;
1236 struct kib_pool *pool = &tpo->tpo_pool;
1237 struct kib_net *net = pool->po_owner->ps_net;
1238 struct kib_dev *dev;
1239 struct page *page;
1240 struct kib_tx *tx;
1241 int page_offset;
1242 int ipage;
1243 int i;
1244
1245 LASSERT(net);
1246
1247 dev = net->ibn_dev;
1248
1249 /* pre-mapped messages are not bigger than 1 page */
1250 CLASSERT(IBLND_MSG_SIZE <= PAGE_SIZE);
1251
1252 /* No fancy arithmetic when we do the buffer calculations */
1253 CLASSERT(!(PAGE_SIZE % IBLND_MSG_SIZE));
1254
1255 tpo->tpo_hdev = kiblnd_current_hdev(dev);
1256
1257 for (ipage = page_offset = i = 0; i < pool->po_size; i++) {
1258 page = txpgs->ibp_pages[ipage];
1259 tx = &tpo->tpo_tx_descs[i];
1260
1261 tx->tx_msg = (struct kib_msg *)(((char *)page_address(page)) +
1262 page_offset);
1263
1264 tx->tx_msgaddr = kiblnd_dma_map_single(
1265 tpo->tpo_hdev->ibh_ibdev, tx->tx_msg,
1266 IBLND_MSG_SIZE, DMA_TO_DEVICE);
1267 LASSERT(!kiblnd_dma_mapping_error(tpo->tpo_hdev->ibh_ibdev,
1268 tx->tx_msgaddr));
1269 KIBLND_UNMAP_ADDR_SET(tx, tx_msgunmap, tx->tx_msgaddr);
1270
1271 list_add(&tx->tx_list, &pool->po_free_list);
1272
1273 page_offset += IBLND_MSG_SIZE;
1274 LASSERT(page_offset <= PAGE_SIZE);
1275
1276 if (page_offset == PAGE_SIZE) {
1277 page_offset = 0;
1278 ipage++;
1279 LASSERT(ipage <= txpgs->ibp_npages);
1280 }
1281 }
1282}
1283
1284struct ib_mr *kiblnd_find_rd_dma_mr(struct lnet_ni *ni, struct kib_rdma_desc *rd,
1285 int negotiated_nfrags)
1286{
1287 struct kib_net *net = ni->ni_data;
1288 struct kib_hca_dev *hdev = net->ibn_dev->ibd_hdev;
1289 struct lnet_ioctl_config_o2iblnd_tunables *tunables;
1290 __u16 nfrags;
1291 int mod;
1292
1293 tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib;
1294 mod = tunables->lnd_map_on_demand;
1295 nfrags = (negotiated_nfrags != -1) ? negotiated_nfrags : mod;
1296
1297 LASSERT(hdev->ibh_mrs);
1298
1299 if (mod > 0 && nfrags <= rd->rd_nfrags)
1300 return NULL;
1301
1302 return hdev->ibh_mrs;
1303}
1304
1305static void kiblnd_destroy_fmr_pool(struct kib_fmr_pool *fpo)
1306{
1307 LASSERT(!fpo->fpo_map_count);
1308
1309 if (fpo->fpo_is_fmr) {
1310 if (fpo->fmr.fpo_fmr_pool)
1311 ib_destroy_fmr_pool(fpo->fmr.fpo_fmr_pool);
1312 } else {
1313 struct kib_fast_reg_descriptor *frd, *tmp;
1314 int i = 0;
1315
1316 list_for_each_entry_safe(frd, tmp, &fpo->fast_reg.fpo_pool_list,
1317 frd_list) {
1318 list_del(&frd->frd_list);
1319 ib_dereg_mr(frd->frd_mr);
1320 LIBCFS_FREE(frd, sizeof(*frd));
1321 i++;
1322 }
1323 if (i < fpo->fast_reg.fpo_pool_size)
1324 CERROR("FastReg pool still has %d regions registered\n",
1325 fpo->fast_reg.fpo_pool_size - i);
1326 }
1327
1328 if (fpo->fpo_hdev)
1329 kiblnd_hdev_decref(fpo->fpo_hdev);
1330
1331 LIBCFS_FREE(fpo, sizeof(*fpo));
1332}
1333
1334static void kiblnd_destroy_fmr_pool_list(struct list_head *head)
1335{
1336 struct kib_fmr_pool *fpo, *tmp;
1337
1338 list_for_each_entry_safe(fpo, tmp, head, fpo_list) {
1339 list_del(&fpo->fpo_list);
1340 kiblnd_destroy_fmr_pool(fpo);
1341 }
1342}
1343
1344static int
1345kiblnd_fmr_pool_size(struct lnet_ioctl_config_o2iblnd_tunables *tunables,
1346 int ncpts)
1347{
1348 int size = tunables->lnd_fmr_pool_size / ncpts;
1349
1350 return max(IBLND_FMR_POOL, size);
1351}
1352
1353static int
1354kiblnd_fmr_flush_trigger(struct lnet_ioctl_config_o2iblnd_tunables *tunables,
1355 int ncpts)
1356{
1357 int size = tunables->lnd_fmr_flush_trigger / ncpts;
1358
1359 return max(IBLND_FMR_POOL_FLUSH, size);
1360}
1361
1362static int kiblnd_alloc_fmr_pool(struct kib_fmr_poolset *fps, struct kib_fmr_pool *fpo)
1363{
1364 struct ib_fmr_pool_param param = {
1365 .max_pages_per_fmr = LNET_MAX_PAYLOAD / PAGE_SIZE,
1366 .page_shift = PAGE_SHIFT,
1367 .access = (IB_ACCESS_LOCAL_WRITE |
1368 IB_ACCESS_REMOTE_WRITE),
1369 .pool_size = fps->fps_pool_size,
1370 .dirty_watermark = fps->fps_flush_trigger,
1371 .flush_function = NULL,
1372 .flush_arg = NULL,
1373 .cache = !!fps->fps_cache };
f66fb159
DE
1374 int rc = 0;
1375
1376 fpo->fmr.fpo_fmr_pool = ib_create_fmr_pool(fpo->fpo_hdev->ibh_pd,
1377 &param);
1378 if (IS_ERR(fpo->fmr.fpo_fmr_pool)) {
1379 rc = PTR_ERR(fpo->fmr.fpo_fmr_pool);
1380 if (rc != -ENOSYS)
1381 CERROR("Failed to create FMR pool: %d\n", rc);
1382 else
1383 CERROR("FMRs are not supported\n");
1384 }
1385
1386 return rc;
1387}
1388
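/* Pre-allocate a pool of FastReg descriptors (one ib_alloc_mr() each); this
 * is the registration path used when the device lacks FMR support but
 * advertises IB_DEVICE_MEM_MGT_EXTENSIONS. */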
1389static int kiblnd_alloc_freg_pool(struct kib_fmr_poolset *fps, struct kib_fmr_pool *fpo)
1390{
1391 struct kib_fast_reg_descriptor *frd, *tmp;
1392 int i, rc;
1393
1394 INIT_LIST_HEAD(&fpo->fast_reg.fpo_pool_list);
1395 fpo->fast_reg.fpo_pool_size = 0;
1396 for (i = 0; i < fps->fps_pool_size; i++) {
1397 LIBCFS_CPT_ALLOC(frd, lnet_cpt_table(), fps->fps_cpt,
1398 sizeof(*frd));
1399 if (!frd) {
1400 CERROR("Failed to allocate a new fast_reg descriptor\n");
1401 rc = -ENOMEM;
1402 goto out;
1403 }
1404
1405 frd->frd_mr = ib_alloc_mr(fpo->fpo_hdev->ibh_pd,
1406 IB_MR_TYPE_MEM_REG,
1407 LNET_MAX_PAYLOAD / PAGE_SIZE);
1408 if (IS_ERR(frd->frd_mr)) {
1409 rc = PTR_ERR(frd->frd_mr);
1410 CERROR("Failed to allocate ib_alloc_mr: %d\n", rc);
1411 frd->frd_mr = NULL;
1412 goto out_middle;
1413 }
1414
1415 frd->frd_valid = true;
1416
1417 list_add_tail(&frd->frd_list, &fpo->fast_reg.fpo_pool_list);
1418 fpo->fast_reg.fpo_pool_size++;
1419 }
1420
1421 return 0;
1422
1423out_middle:
1424 if (frd->frd_mr)
1425 ib_dereg_mr(frd->frd_mr);
1426 LIBCFS_FREE(frd, sizeof(*frd));
1427
1428out:
1429 list_for_each_entry_safe(frd, tmp, &fpo->fast_reg.fpo_pool_list,
1430 frd_list) {
1431 list_del(&frd->frd_list);
1432 ib_dereg_mr(frd->frd_mr);
1433 LIBCFS_FREE(frd, sizeof(*frd));
1434 }
1435
1436 return rc;
1437}
1438
1438
1439static int kiblnd_create_fmr_pool(struct kib_fmr_poolset *fps,
1440 struct kib_fmr_pool **pp_fpo)
1441{
1442 struct kib_dev *dev = fps->fps_net->ibn_dev;
1443 struct ib_device_attr *dev_attr;
1444 struct kib_fmr_pool *fpo;
1445 int rc;
1446
1447 LIBCFS_CPT_ALLOC(fpo, lnet_cpt_table(), fps->fps_cpt, sizeof(*fpo));
1448 if (!fpo)
1449 return -ENOMEM;
1450
1451 fpo->fpo_hdev = kiblnd_current_hdev(dev);
1452 dev_attr = &fpo->fpo_hdev->ibh_ibdev->attrs;
1453
1454 /* Check for FMR or FastReg support */
1455 fpo->fpo_is_fmr = 0;
1456 if (fpo->fpo_hdev->ibh_ibdev->alloc_fmr &&
1457 fpo->fpo_hdev->ibh_ibdev->dealloc_fmr &&
1458 fpo->fpo_hdev->ibh_ibdev->map_phys_fmr &&
1459 fpo->fpo_hdev->ibh_ibdev->unmap_fmr) {
1460 LCONSOLE_INFO("Using FMR for registration\n");
1461 fpo->fpo_is_fmr = 1;
1462 } else if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
1463 LCONSOLE_INFO("Using FastReg for registration\n");
1464 } else {
1465 rc = -ENOSYS;
80e05b34 1466 LCONSOLE_ERROR_MSG(rc, "IB device does not support FMRs nor FastRegs, can't register memory\n");
1467 goto out_fpo;
1468 }
1469
1470 if (fpo->fpo_is_fmr)
1471 rc = kiblnd_alloc_fmr_pool(fps, fpo);
1472 else
1473 rc = kiblnd_alloc_freg_pool(fps, fpo);
1474 if (rc)
1475 goto out_fpo;
1476
1477 fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
1478 fpo->fpo_owner = fps;
1479 *pp_fpo = fpo;
1480
1481 return 0;
1482
1483out_fpo:
1484 kiblnd_hdev_decref(fpo->fpo_hdev);
1485 LIBCFS_FREE(fpo, sizeof(*fpo));
1486 return rc;
1487}
1488
1489static void kiblnd_fail_fmr_poolset(struct kib_fmr_poolset *fps,
1490 struct list_head *zombies)
1491{
1492 if (!fps->fps_net) /* initialized? */
1493 return;
1494
1495 spin_lock(&fps->fps_lock);
1496
1497 while (!list_empty(&fps->fps_pool_list)) {
1498 struct kib_fmr_pool *fpo = list_entry(fps->fps_pool_list.next,
1499 struct kib_fmr_pool, fpo_list);
1500 fpo->fpo_failed = 1;
1501 list_del(&fpo->fpo_list);
1502 if (!fpo->fpo_map_count)
1503 list_add(&fpo->fpo_list, zombies);
1504 else
1505 list_add(&fpo->fpo_list, &fps->fps_failed_pool_list);
1506 }
1507
1508 spin_unlock(&fps->fps_lock);
1509}
1510
1511static void kiblnd_fini_fmr_poolset(struct kib_fmr_poolset *fps)
1512{
1513 if (fps->fps_net) { /* initialized? */
1514 kiblnd_destroy_fmr_pool_list(&fps->fps_failed_pool_list);
1515 kiblnd_destroy_fmr_pool_list(&fps->fps_pool_list);
1516 }
1517}
1518
1519static int
1520kiblnd_init_fmr_poolset(struct kib_fmr_poolset *fps, int cpt, int ncpts,
1521 struct kib_net *net,
1522 struct lnet_ioctl_config_o2iblnd_tunables *tunables)
1523{
1524 struct kib_fmr_pool *fpo;
1525 int rc;
1526
1527 memset(fps, 0, sizeof(*fps));
1528
1529 fps->fps_net = net;
1530 fps->fps_cpt = cpt;
1531
1532 fps->fps_pool_size = kiblnd_fmr_pool_size(tunables, ncpts);
1533 fps->fps_flush_trigger = kiblnd_fmr_flush_trigger(tunables, ncpts);
1534 fps->fps_cache = tunables->lnd_fmr_cache;
1535
1536 spin_lock_init(&fps->fps_lock);
1537 INIT_LIST_HEAD(&fps->fps_pool_list);
1538 INIT_LIST_HEAD(&fps->fps_failed_pool_list);
1539
1540 rc = kiblnd_create_fmr_pool(fps, &fpo);
1541 if (!rc)
1542 list_add_tail(&fpo->fpo_list, &fps->fps_pool_list);
1543
1544 return rc;
1545}
1546
1547static int kiblnd_fmr_pool_is_idle(struct kib_fmr_pool *fpo, unsigned long now)
1548{
1549 if (fpo->fpo_map_count) /* still in use */
1550 return 0;
1551 if (fpo->fpo_failed)
1552 return 1;
1553 return cfs_time_aftereq(now, fpo->fpo_deadline);
1554}
1555
1556static int
1557kiblnd_map_tx_pages(struct kib_tx *tx, struct kib_rdma_desc *rd)
1558{
1559 __u64 *pages = tx->tx_pages;
1560 struct kib_hca_dev *hdev;
1561 int npages;
1562 int size;
1563 int i;
1564
1565 hdev = tx->tx_pool->tpo_hdev;
1566
1567 for (i = 0, npages = 0; i < rd->rd_nfrags; i++) {
1568 for (size = 0; size < rd->rd_frags[i].rf_nob;
1569 size += hdev->ibh_page_size) {
1570 pages[npages++] = (rd->rd_frags[i].rf_addr &
1571 hdev->ibh_page_mask) + size;
1572 }
1573 }
1574
1575 return npages;
1576}
1577
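/* Return an FMR or FastReg mapping to its pool and retire idle pools that
 * have passed their deadline; the first pool on the list is kept. */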
1578void kiblnd_fmr_pool_unmap(struct kib_fmr *fmr, int status)
1579{
1580 LIST_HEAD(zombies);
1581 struct kib_fmr_pool *fpo = fmr->fmr_pool;
1582 struct kib_fmr_poolset *fps;
1583 unsigned long now = cfs_time_current();
1584 struct kib_fmr_pool *tmp;
1585 int rc;
1586
1587 if (!fpo)
1588 return;
1589
1590 fps = fpo->fpo_owner;
1591 if (fpo->fpo_is_fmr) {
1592 if (fmr->fmr_pfmr) {
1593 rc = ib_fmr_pool_unmap(fmr->fmr_pfmr);
1594 LASSERT(!rc);
1595 fmr->fmr_pfmr = NULL;
1596 }
1597
1598 if (status) {
1599 rc = ib_flush_fmr_pool(fpo->fmr.fpo_fmr_pool);
1600 LASSERT(!rc);
1601 }
1602 } else {
1603 struct kib_fast_reg_descriptor *frd = fmr->fmr_frd;
1604
1605 if (frd) {
1606 frd->frd_valid = false;
1607 spin_lock(&fps->fps_lock);
1608 list_add_tail(&frd->frd_list, &fpo->fast_reg.fpo_pool_list);
1609 spin_unlock(&fps->fps_lock);
1610 fmr->fmr_frd = NULL;
1611 }
1612 }
1613 fmr->fmr_pool = NULL;
1614
1615 spin_lock(&fps->fps_lock);
1616 fpo->fpo_map_count--; /* decref the pool */
1617
1618 list_for_each_entry_safe(fpo, tmp, &fps->fps_pool_list, fpo_list) {
1619 /* the first pool is persistent */
1620 if (fps->fps_pool_list.next == &fpo->fpo_list)
1621 continue;
1622
1623 if (kiblnd_fmr_pool_is_idle(fpo, now)) {
1624 list_move(&fpo->fpo_list, &zombies);
1625 fps->fps_version++;
1626 }
1627 }
1628 spin_unlock(&fps->fps_lock);
1629
1630 if (!list_empty(&zombies))
1631 kiblnd_destroy_fmr_pool_list(&zombies);
1632}
1633
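/* Map the fragments of 'tx' for RDMA with either an FMR pool mapping or a
 * FastReg descriptor, growing the pool set when every pool is busy. */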
1634int kiblnd_fmr_pool_map(struct kib_fmr_poolset *fps, struct kib_tx *tx,
1635 struct kib_rdma_desc *rd, __u32 nob, __u64 iov,
1636 struct kib_fmr *fmr)
1637{
1638 __u64 *pages = tx->tx_pages;
1639 bool is_rx = (rd != tx->tx_rd);
1640 bool tx_pages_mapped = 0;
1641 struct kib_fmr_pool *fpo;
1642 int npages = 0;
1643 __u64 version;
1644 int rc;
1645
1646 again:
1647 spin_lock(&fps->fps_lock);
1648 version = fps->fps_version;
1649 list_for_each_entry(fpo, &fps->fps_pool_list, fpo_list) {
1650 fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
1651 fpo->fpo_map_count++;
1652
1653 if (fpo->fpo_is_fmr) {
1654 struct ib_pool_fmr *pfmr;
1655
1656 spin_unlock(&fps->fps_lock);
1657
1658 if (!tx_pages_mapped) {
1659 npages = kiblnd_map_tx_pages(tx, rd);
1660 tx_pages_mapped = 1;
1661 }
1662
1663 pfmr = ib_fmr_pool_map_phys(fpo->fmr.fpo_fmr_pool,
1664 pages, npages, iov);
1665 if (likely(!IS_ERR(pfmr))) {
1666 fmr->fmr_key = is_rx ? pfmr->fmr->rkey :
1667 pfmr->fmr->lkey;
1668 fmr->fmr_frd = NULL;
1669 fmr->fmr_pfmr = pfmr;
1670 fmr->fmr_pool = fpo;
1671 return 0;
1672 }
1673 rc = PTR_ERR(pfmr);
1674 } else {
1675 if (!list_empty(&fpo->fast_reg.fpo_pool_list)) {
1676 struct kib_fast_reg_descriptor *frd;
1677 struct ib_reg_wr *wr;
1678 struct ib_mr *mr;
1679 int n;
1680
1681 frd = list_first_entry(&fpo->fast_reg.fpo_pool_list,
1682 struct kib_fast_reg_descriptor,
1683 frd_list);
1684 list_del(&frd->frd_list);
1685 spin_unlock(&fps->fps_lock);
1686
1687 mr = frd->frd_mr;
1688
1689 if (!frd->frd_valid) {
1690 __u32 key = is_rx ? mr->rkey : mr->lkey;
1691 struct ib_send_wr *inv_wr;
1692
1693 inv_wr = &frd->frd_inv_wr;
1694 memset(inv_wr, 0, sizeof(*inv_wr));
1695 inv_wr->opcode = IB_WR_LOCAL_INV;
1696 inv_wr->wr_id = IBLND_WID_MR;
1697 inv_wr->ex.invalidate_rkey = key;
1698
1699 /* Bump the key */
1700 key = ib_inc_rkey(key);
1701 ib_update_fast_reg_key(mr, key);
1702 }
1703
1704 n = ib_map_mr_sg(mr, tx->tx_frags,
1705 tx->tx_nfrags, NULL, PAGE_SIZE);
1706 if (unlikely(n != tx->tx_nfrags)) {
1707 CERROR("Failed to map mr %d/%d elements\n",
1708 n, tx->tx_nfrags);
1709 return n < 0 ? n : -EINVAL;
1710 }
1711
1712 mr->iova = iov;
1713
1714 /* Prepare FastReg WR */
1715 wr = &frd->frd_fastreg_wr;
1716 memset(wr, 0, sizeof(*wr));
1717 wr->wr.opcode = IB_WR_REG_MR;
1718 wr->wr.wr_id = IBLND_WID_MR;
1719 wr->wr.num_sge = 0;
1720 wr->wr.send_flags = 0;
1721 wr->mr = mr;
1722 wr->key = is_rx ? mr->rkey : mr->lkey;
1723 wr->access = (IB_ACCESS_LOCAL_WRITE |
1724 IB_ACCESS_REMOTE_WRITE);
1725
1726 fmr->fmr_key = is_rx ? mr->rkey : mr->lkey;
1727 fmr->fmr_frd = frd;
1728 fmr->fmr_pfmr = NULL;
1729 fmr->fmr_pool = fpo;
1730 return 0;
1731 }
1732 spin_unlock(&fps->fps_lock);
1733 rc = -EBUSY;
1734 }
1735
1736 spin_lock(&fps->fps_lock);
1737 fpo->fpo_map_count--;
1738 if (rc != -EAGAIN) {
1739 spin_unlock(&fps->fps_lock);
1740 return rc;
1741 }
1742
1743 /* EAGAIN and ... */
1744 if (version != fps->fps_version) {
1745 spin_unlock(&fps->fps_lock);
1746 goto again;
1747 }
1748 }
1749
1750 if (fps->fps_increasing) {
1751 spin_unlock(&fps->fps_lock);
1752 CDEBUG(D_NET, "Another thread is allocating new FMR pool, waiting for her to complete\n");
1753 schedule();
1754 goto again;
1755 }
1756
1757 if (time_before(cfs_time_current(), fps->fps_next_retry)) {
1758 /* someone failed recently */
1759 spin_unlock(&fps->fps_lock);
1760 return -EAGAIN;
1761 }
1762
1763 fps->fps_increasing = 1;
1764 spin_unlock(&fps->fps_lock);
1765
1766 CDEBUG(D_NET, "Allocate new FMR pool\n");
1767 rc = kiblnd_create_fmr_pool(fps, &fpo);
1768 spin_lock(&fps->fps_lock);
1769 fps->fps_increasing = 0;
1770 if (!rc) {
1771 fps->fps_version++;
1772 list_add_tail(&fpo->fpo_list, &fps->fps_pool_list);
1773 } else {
1774 fps->fps_next_retry = cfs_time_shift(IBLND_POOL_RETRY);
1775 }
1776 spin_unlock(&fps->fps_lock);
1777
1778 goto again;
1779}
1780
1781static void kiblnd_fini_pool(struct kib_pool *pool)
1782{
1783 LASSERT(list_empty(&pool->po_free_list));
1784 LASSERT(!pool->po_allocated);
1785
1786 CDEBUG(D_NET, "Finalize %s pool\n", pool->po_owner->ps_name);
1787}
1788
1789static void kiblnd_init_pool(struct kib_poolset *ps, struct kib_pool *pool, int size)
1790{
1791 CDEBUG(D_NET, "Initialize %s pool\n", ps->ps_name);
1792
1793 memset(pool, 0, sizeof(*pool));
1794 INIT_LIST_HEAD(&pool->po_free_list);
1795 pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
1796 pool->po_owner = ps;
1797 pool->po_size = size;
1798}
1799
1800static void kiblnd_destroy_pool_list(struct list_head *head)
1801{
1802 struct kib_pool *pool;
1803
1804 while (!list_empty(head)) {
1805 pool = list_entry(head->next, struct kib_pool, po_list);
1806 list_del(&pool->po_list);
1807
1808 LASSERT(pool->po_owner);
1809 pool->po_owner->ps_pool_destroy(pool);
1810 }
1811}
1812
1813static void kiblnd_fail_poolset(struct kib_poolset *ps, struct list_head *zombies)
1814{
1815 if (!ps->ps_net) /* initialized? */
1816 return;
1817
1818 spin_lock(&ps->ps_lock);
1819 while (!list_empty(&ps->ps_pool_list)) {
1820 struct kib_pool *po = list_entry(ps->ps_pool_list.next,
1821 struct kib_pool, po_list);
1822 po->po_failed = 1;
1823 list_del(&po->po_list);
1824 if (!po->po_allocated)
1825 list_add(&po->po_list, zombies);
1826 else
1827 list_add(&po->po_list, &ps->ps_failed_pool_list);
1828 }
1829 spin_unlock(&ps->ps_lock);
1830}
1831
1832static void kiblnd_fini_poolset(struct kib_poolset *ps)
1833{
1834 if (ps->ps_net) { /* initialized? */
1835 kiblnd_destroy_pool_list(&ps->ps_failed_pool_list);
1836 kiblnd_destroy_pool_list(&ps->ps_pool_list);
1837 }
1838}
1839
1840static int kiblnd_init_poolset(struct kib_poolset *ps, int cpt,
1841 struct kib_net *net, char *name, int size,
1842 kib_ps_pool_create_t po_create,
1843 kib_ps_pool_destroy_t po_destroy,
1844 kib_ps_node_init_t nd_init,
1845 kib_ps_node_fini_t nd_fini)
1846{
1847 struct kib_pool *pool;
1848 int rc;
1849
1850 memset(ps, 0, sizeof(*ps));
1851
1852 ps->ps_cpt = cpt;
1853 ps->ps_net = net;
1854 ps->ps_pool_create = po_create;
1855 ps->ps_pool_destroy = po_destroy;
1856 ps->ps_node_init = nd_init;
1857 ps->ps_node_fini = nd_fini;
1858 ps->ps_pool_size = size;
1859 if (strlcpy(ps->ps_name, name, sizeof(ps->ps_name))
1860 >= sizeof(ps->ps_name))
1861 return -E2BIG;
1862 spin_lock_init(&ps->ps_lock);
1863 INIT_LIST_HEAD(&ps->ps_pool_list);
1864 INIT_LIST_HEAD(&ps->ps_failed_pool_list);
1865
1866 rc = ps->ps_pool_create(ps, size, &pool);
1867 if (!rc)
1868 list_add(&pool->po_list, &ps->ps_pool_list);
1869 else
1870 CERROR("Failed to create the first pool for %s\n", ps->ps_name);
1871
1872 return rc;
1873}
1874
1875static int kiblnd_pool_is_idle(struct kib_pool *pool, unsigned long now)
1876{
1877 if (pool->po_allocated) /* still in use */
1878 return 0;
1879 if (pool->po_failed)
1880 return 1;
1881 return cfs_time_aftereq(now, pool->po_deadline);
1882}
1883
1884void kiblnd_pool_free_node(struct kib_pool *pool, struct list_head *node)
1885{
1886 LIST_HEAD(zombies);
1887 struct kib_poolset *ps = pool->po_owner;
1888 struct kib_pool *tmp;
ec3d17c0 1889 unsigned long now = cfs_time_current();
d7e09d03
PT
1890
1891 spin_lock(&ps->ps_lock);
1892
06ace26e 1893 if (ps->ps_node_fini)
d7e09d03
PT
1894 ps->ps_node_fini(pool, node);
1895
febe73bd 1896 LASSERT(pool->po_allocated > 0);
d7e09d03 1897 list_add(node, &pool->po_free_list);
74732797 1898 pool->po_allocated--;
d7e09d03
PT
1899
1900 list_for_each_entry_safe(pool, tmp, &ps->ps_pool_list, po_list) {
1901 /* the first pool is persistent */
1902 if (ps->ps_pool_list.next == &pool->po_list)
1903 continue;
1904
1905 if (kiblnd_pool_is_idle(pool, now))
1906 list_move(&pool->po_list, &zombies);
1907 }
1908 spin_unlock(&ps->ps_lock);
1909
1910 if (!list_empty(&zombies))
1911 kiblnd_destroy_pool_list(&zombies);
1912}
1913
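/*
 * Grab a free node from the poolset. If every pool is empty, either wait
 * for another thread that is already growing the poolset, give up if a
 * pool creation failed recently (ps_next_retry), or create a new pool of
 * ps_pool_size nodes and retry.
 *
 * Rough usage sketch (illustrative only, following the callers of this
 * API elsewhere in the LND):
 *
 *	node = kiblnd_pool_alloc_node(&tps->tps_poolset);
 *	if (!node)
 *		return -ENOMEM;
 *	...
 *	kiblnd_pool_free_node(pool, node);
 */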
8d9de3f4 1914struct list_head *kiblnd_pool_alloc_node(struct kib_poolset *ps)
d7e09d03 1915{
ec3d17c0 1916 struct list_head *node;
8d9de3f4 1917 struct kib_pool *pool;
ea363b41
LZ
1918 unsigned int interval = 1;
1919 unsigned long time_before;
1920 unsigned int trips = 0;
ec3d17c0 1921 int rc;
d7e09d03
PT
1922
1923 again:
1924 spin_lock(&ps->ps_lock);
1925 list_for_each_entry(pool, &ps->ps_pool_list, po_list) {
1926 if (list_empty(&pool->po_free_list))
1927 continue;
1928
74732797 1929 pool->po_allocated++;
d7e09d03
PT
1930 pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
1931 node = pool->po_free_list.next;
1932 list_del(node);
1933
06ace26e 1934 if (ps->ps_node_init) {
d7e09d03
PT
1935 /* still hold the lock */
1936 ps->ps_node_init(pool, node);
1937 }
1938 spin_unlock(&ps->ps_lock);
1939 return node;
1940 }
1941
1942 /* no available tx pool and ... */
1943 if (ps->ps_increasing) {
1944 /* another thread is allocating a new pool */
1945 spin_unlock(&ps->ps_lock);
ea363b41
LZ
1946 trips++;
1947 CDEBUG(D_NET, "Another thread is allocating new %s pool, waiting %d HZs for her to complete. trips = %d\n",
1948 ps->ps_name, interval, trips);
1949
1950 set_current_state(TASK_INTERRUPTIBLE);
1951 schedule_timeout(interval);
1952 if (interval < cfs_time_seconds(1))
1953 interval *= 2;
1954
d7e09d03
PT
1955 goto again;
1956 }
1957
699503bc 1958 if (time_before(cfs_time_current(), ps->ps_next_retry)) {
d7e09d03
PT
1959 /* someone failed recently */
1960 spin_unlock(&ps->ps_lock);
1961 return NULL;
1962 }
1963
1964 ps->ps_increasing = 1;
1965 spin_unlock(&ps->ps_lock);
1966
1967 CDEBUG(D_NET, "%s pool exhausted, allocate new pool\n", ps->ps_name);
ea363b41 1968 time_before = cfs_time_current();
d7e09d03 1969 rc = ps->ps_pool_create(ps, ps->ps_pool_size, &pool);
ea363b41
LZ
1970 CDEBUG(D_NET, "ps_pool_create took %lu HZ to complete",
1971 cfs_time_current() - time_before);
d7e09d03
PT
1972
1973 spin_lock(&ps->ps_lock);
1974 ps->ps_increasing = 0;
5fd88337 1975 if (!rc) {
d7e09d03
PT
1976 list_add_tail(&pool->po_list, &ps->ps_pool_list);
1977 } else {
1978 ps->ps_next_retry = cfs_time_shift(IBLND_POOL_RETRY);
1979 CERROR("Can't allocate new %s pool because out of memory\n",
1980 ps->ps_name);
1981 }
1982 spin_unlock(&ps->ps_lock);
1983
1984 goto again;
1985}
1986
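/*
 * Tear down a TX pool: unmap and free the premapped message pages, then
 * release every per-tx resource (pages, frags, work requests, sges, rdma
 * descriptors) before freeing the descriptor array and the pool itself.
 */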
8d9de3f4 1987static void kiblnd_destroy_tx_pool(struct kib_pool *pool)
d7e09d03 1988{
8d9de3f4 1989 struct kib_tx_pool *tpo = container_of(pool, struct kib_tx_pool, tpo_pool);
ec3d17c0 1990 int i;
d7e09d03 1991
5fd88337 1992 LASSERT(!pool->po_allocated);
d7e09d03 1993
06ace26e 1994 if (tpo->tpo_tx_pages) {
d7e09d03
PT
1995 kiblnd_unmap_tx_pool(tpo);
1996 kiblnd_free_pages(tpo->tpo_tx_pages);
1997 }
1998
06ace26e 1999 if (!tpo->tpo_tx_descs)
d7e09d03
PT
2000 goto out;
2001
2002 for (i = 0; i < pool->po_size; i++) {
8d9de3f4 2003 struct kib_tx *tx = &tpo->tpo_tx_descs[i];
d7e09d03
PT
2004
2005 list_del(&tx->tx_list);
06ace26e 2006 if (tx->tx_pages)
d7e09d03
PT
2007 LIBCFS_FREE(tx->tx_pages,
2008 LNET_MAX_IOV *
2009 sizeof(*tx->tx_pages));
06ace26e 2010 if (tx->tx_frags)
d7e09d03 2011 LIBCFS_FREE(tx->tx_frags,
147280d8
JS
2012 (1 + IBLND_MAX_RDMA_FRAGS) *
2013 sizeof(*tx->tx_frags));
06ace26e 2014 if (tx->tx_wrq)
d7e09d03
PT
2015 LIBCFS_FREE(tx->tx_wrq,
2016 (1 + IBLND_MAX_RDMA_FRAGS) *
2017 sizeof(*tx->tx_wrq));
06ace26e 2018 if (tx->tx_sge)
d7e09d03
PT
2019 LIBCFS_FREE(tx->tx_sge,
2020 (1 + IBLND_MAX_RDMA_FRAGS) *
2021 sizeof(*tx->tx_sge));
06ace26e 2022 if (tx->tx_rd)
d7e09d03 2023 LIBCFS_FREE(tx->tx_rd,
8d9de3f4 2024 offsetof(struct kib_rdma_desc,
d7e09d03
PT
2025 rd_frags[IBLND_MAX_RDMA_FRAGS]));
2026 }
2027
2028 LIBCFS_FREE(tpo->tpo_tx_descs,
8d9de3f4 2029 pool->po_size * sizeof(struct kib_tx));
d7e09d03
PT
2030out:
2031 kiblnd_fini_pool(pool);
a4e872f7 2032 LIBCFS_FREE(tpo, sizeof(*tpo));
d7e09d03
PT
2033}
2034
2035static int kiblnd_tx_pool_size(int ncpts)
2036{
2037 int ntx = *kiblnd_tunables.kib_ntx / ncpts;
2038
2039 return max(IBLND_TX_POOL, ntx);
2040}
2041
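/*
 * Create one TX pool on the poolset's CPT: allocate enough pages to back
 * 'size' IBLND_MSG_SIZE message buffers, then a kib_tx descriptor array.
 * Each tx gets scatterlist frags, work requests and sges sized for
 * 1 + IBLND_MAX_RDMA_FRAGS entries plus an rdma descriptor; tx_pages is
 * only allocated when the net uses an FMR poolset.
 */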
8d9de3f4
JS
2042static int kiblnd_create_tx_pool(struct kib_poolset *ps, int size,
2043 struct kib_pool **pp_po)
d7e09d03 2044{
ec3d17c0
MS
2045 int i;
2046 int npg;
8d9de3f4
JS
2047 struct kib_pool *pool;
2048 struct kib_tx_pool *tpo;
d7e09d03
PT
2049
2050 LIBCFS_CPT_ALLOC(tpo, lnet_cpt_table(), ps->ps_cpt, sizeof(*tpo));
06ace26e 2051 if (!tpo) {
d7e09d03
PT
2052 CERROR("Failed to allocate TX pool\n");
2053 return -ENOMEM;
2054 }
2055
2056 pool = &tpo->tpo_pool;
2057 kiblnd_init_pool(ps, pool, size);
2058 tpo->tpo_tx_descs = NULL;
2059 tpo->tpo_tx_pages = NULL;
2060
2061 npg = (size * IBLND_MSG_SIZE + PAGE_SIZE - 1) / PAGE_SIZE;
5fd88337 2062 if (kiblnd_alloc_pages(&tpo->tpo_tx_pages, ps->ps_cpt, npg)) {
d7e09d03 2063 CERROR("Can't allocate tx pages: %d\n", npg);
a4e872f7 2064 LIBCFS_FREE(tpo, sizeof(*tpo));
d7e09d03
PT
2065 return -ENOMEM;
2066 }
2067
2068 LIBCFS_CPT_ALLOC(tpo->tpo_tx_descs, lnet_cpt_table(), ps->ps_cpt,
8d9de3f4 2069 size * sizeof(struct kib_tx));
06ace26e 2070 if (!tpo->tpo_tx_descs) {
d7e09d03
PT
2071 CERROR("Can't allocate %d tx descriptors\n", size);
2072 ps->ps_pool_destroy(pool);
2073 return -ENOMEM;
2074 }
2075
8d9de3f4 2076 memset(tpo->tpo_tx_descs, 0, size * sizeof(struct kib_tx));
d7e09d03
PT
2077
2078 for (i = 0; i < size; i++) {
8d9de3f4 2079 struct kib_tx *tx = &tpo->tpo_tx_descs[i];
d7e09d03
PT
2080
2081 tx->tx_pool = tpo;
06ace26e 2082 if (ps->ps_net->ibn_fmr_ps) {
d7e09d03
PT
2083 LIBCFS_CPT_ALLOC(tx->tx_pages,
2084 lnet_cpt_table(), ps->ps_cpt,
2085 LNET_MAX_IOV * sizeof(*tx->tx_pages));
06ace26e 2086 if (!tx->tx_pages)
d7e09d03
PT
2087 break;
2088 }
2089
2090 LIBCFS_CPT_ALLOC(tx->tx_frags, lnet_cpt_table(), ps->ps_cpt,
147280d8
JS
2091 (1 + IBLND_MAX_RDMA_FRAGS) *
2092 sizeof(*tx->tx_frags));
06ace26e 2093 if (!tx->tx_frags)
d7e09d03
PT
2094 break;
2095
147280d8 2096 sg_init_table(tx->tx_frags, IBLND_MAX_RDMA_FRAGS + 1);
d7e09d03
PT
2097
2098 LIBCFS_CPT_ALLOC(tx->tx_wrq, lnet_cpt_table(), ps->ps_cpt,
2099 (1 + IBLND_MAX_RDMA_FRAGS) *
2100 sizeof(*tx->tx_wrq));
06ace26e 2101 if (!tx->tx_wrq)
d7e09d03
PT
2102 break;
2103
2104 LIBCFS_CPT_ALLOC(tx->tx_sge, lnet_cpt_table(), ps->ps_cpt,
2105 (1 + IBLND_MAX_RDMA_FRAGS) *
2106 sizeof(*tx->tx_sge));
06ace26e 2107 if (!tx->tx_sge)
d7e09d03
PT
2108 break;
2109
2110 LIBCFS_CPT_ALLOC(tx->tx_rd, lnet_cpt_table(), ps->ps_cpt,
8d9de3f4 2111 offsetof(struct kib_rdma_desc,
d7e09d03 2112 rd_frags[IBLND_MAX_RDMA_FRAGS]));
06ace26e 2113 if (!tx->tx_rd)
d7e09d03
PT
2114 break;
2115 }
2116
2117 if (i == size) {
2118 kiblnd_map_tx_pool(tpo);
2119 *pp_po = pool;
2120 return 0;
2121 }
2122
2123 ps->ps_pool_destroy(pool);
2124 return -ENOMEM;
2125}
2126
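/* Per-node init hook for TX pools: stamp the tx with the poolset's next cookie. */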
8d9de3f4 2127static void kiblnd_tx_init(struct kib_pool *pool, struct list_head *node)
d7e09d03 2128{
8d9de3f4
JS
2129 struct kib_tx_poolset *tps = container_of(pool->po_owner,
2130 struct kib_tx_poolset,
2131 tps_poolset);
2132 struct kib_tx *tx = list_entry(node, struct kib_tx, tx_list);
d7e09d03 2133
74732797 2134 tx->tx_cookie = tps->tps_next_tx_cookie++;
d7e09d03
PT
2135}
2136
8d9de3f4 2137static void kiblnd_net_fini_pools(struct kib_net *net)
d7e09d03 2138{
ec3d17c0 2139 int i;
d7e09d03
PT
2140
2141 cfs_cpt_for_each(i, lnet_cpt_table()) {
8d9de3f4
JS
2142 struct kib_tx_poolset *tps;
2143 struct kib_fmr_poolset *fps;
d7e09d03 2144
06ace26e 2145 if (net->ibn_tx_ps) {
d7e09d03
PT
2146 tps = net->ibn_tx_ps[i];
2147 kiblnd_fini_poolset(&tps->tps_poolset);
2148 }
2149
06ace26e 2150 if (net->ibn_fmr_ps) {
d7e09d03
PT
2151 fps = net->ibn_fmr_ps[i];
2152 kiblnd_fini_fmr_poolset(fps);
2153 }
d7e09d03
PT
2154 }
2155
06ace26e 2156 if (net->ibn_tx_ps) {
d7e09d03
PT
2157 cfs_percpt_free(net->ibn_tx_ps);
2158 net->ibn_tx_ps = NULL;
2159 }
2160
06ace26e 2161 if (net->ibn_fmr_ps) {
d7e09d03
PT
2162 cfs_percpt_free(net->ibn_fmr_ps);
2163 net->ibn_fmr_ps = NULL;
2164 }
d7e09d03
PT
2165}
2166
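/*
 * Set up this net's pools: when map-on-demand is enabled, create one FMR
 * poolset per CPT first, then the per-CPT TX poolsets (TX must be created
 * after FMR, see LU-2268). On any failure, everything created so far is
 * torn down again.
 */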
8d9de3f4 2167static int kiblnd_net_init_pools(struct kib_net *net, lnet_ni_t *ni, __u32 *cpts,
32c8deb8 2168 int ncpts)
d7e09d03 2169{
32c8deb8 2170 struct lnet_ioctl_config_o2iblnd_tunables *tunables;
ec3d17c0
MS
2171 unsigned long flags;
2172 int cpt;
32c8deb8 2173 int rc;
ec3d17c0 2174 int i;
d7e09d03 2175
32c8deb8
AS
2176 tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib;
2177
d7e09d03 2178 read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
32c8deb8 2179 if (!tunables->lnd_map_on_demand) {
ec3d17c0 2180 read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
d7e09d03
PT
2181 goto create_tx_pool;
2182 }
2183
2184 read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
2185
32c8deb8 2186 if (tunables->lnd_fmr_pool_size < *kiblnd_tunables.kib_ntx / 4) {
d7e09d03 2187 CERROR("Can't set fmr pool size (%d) < ntx / 4(%d)\n",
32c8deb8 2188 tunables->lnd_fmr_pool_size,
d7e09d03
PT
2189 *kiblnd_tunables.kib_ntx / 4);
2190 rc = -EINVAL;
2191 goto failed;
2192 }
2193
415bcb5c
OD
2194 /*
2195 * TX pool must be created later than FMR, see LU-2268
2196 * for details
2197 */
06ace26e 2198 LASSERT(!net->ibn_tx_ps);
d7e09d03 2199
415bcb5c
OD
2200 /*
2201 * premapping can fail if ibd_nmr > 1, so we always create
2202 * FMR pool and map-on-demand if premapping failed
7e221b60
JS
2203 *
 2204 * cfs_percpt_alloc is creating an array of struct kib_fmr_poolset
 2205 * The number of struct kib_fmr_poolsets created is equal to the
 2206 * number of CPTs that exist, i.e. net->ibn_fmr_ps[cpt].
415bcb5c 2207 */
d7e09d03 2208 net->ibn_fmr_ps = cfs_percpt_alloc(lnet_cpt_table(),
8d9de3f4 2209 sizeof(struct kib_fmr_poolset));
06ace26e 2210 if (!net->ibn_fmr_ps) {
d7e09d03
PT
2211 CERROR("Failed to allocate FMR pool array\n");
2212 rc = -ENOMEM;
2213 goto failed;
2214 }
2215
2216 for (i = 0; i < ncpts; i++) {
06ace26e 2217 cpt = !cpts ? i : cpts[i];
32c8deb8
AS
2218 rc = kiblnd_init_fmr_poolset(net->ibn_fmr_ps[cpt], cpt, ncpts,
2219 net, tunables);
7cadcc7c 2220 if (rc) {
d7e09d03
PT
2221 CERROR("Can't initialize FMR pool for CPT %d: %d\n",
2222 cpt, rc);
2223 goto failed;
2224 }
2225 }
2226
7cadcc7c 2227 if (i > 0)
d7e09d03 2228 LASSERT(i == ncpts);
d7e09d03
PT
2229
2230 create_tx_pool:
7e221b60
JS
2231 /*
 2232 * cfs_percpt_alloc is creating an array of struct kib_tx_poolset
 2233 * The number of struct kib_tx_poolsets created is equal to the
 2234 * number of CPTs that exist, i.e. net->ibn_tx_ps[cpt].
2235 */
d7e09d03 2236 net->ibn_tx_ps = cfs_percpt_alloc(lnet_cpt_table(),
8d9de3f4 2237 sizeof(struct kib_tx_poolset));
06ace26e 2238 if (!net->ibn_tx_ps) {
d7e09d03
PT
2239 CERROR("Failed to allocate tx pool array\n");
2240 rc = -ENOMEM;
2241 goto failed;
2242 }
2243
2244 for (i = 0; i < ncpts; i++) {
06ace26e 2245 cpt = !cpts ? i : cpts[i];
d7e09d03
PT
2246 rc = kiblnd_init_poolset(&net->ibn_tx_ps[cpt]->tps_poolset,
2247 cpt, net, "TX",
2248 kiblnd_tx_pool_size(ncpts),
2249 kiblnd_create_tx_pool,
2250 kiblnd_destroy_tx_pool,
2251 kiblnd_tx_init, NULL);
5fd88337 2252 if (rc) {
d7e09d03
PT
2253 CERROR("Can't initialize TX pool for CPT %d: %d\n",
2254 cpt, rc);
2255 goto failed;
2256 }
2257 }
2258
2259 return 0;
2260 failed:
2261 kiblnd_net_fini_pools(net);
5fd88337 2262 LASSERT(rc);
d7e09d03
PT
2263 return rc;
2264}
2265
8d9de3f4 2266static int kiblnd_hdev_get_attr(struct kib_hca_dev *hdev)
d7e09d03 2267{
4420cfd3
JS
2268 /*
2269 * It's safe to assume a HCA can handle a page size
2270 * matching that of the native system
2271 */
d7e09d03
PT
2272 hdev->ibh_page_shift = PAGE_SHIFT;
2273 hdev->ibh_page_size = 1 << PAGE_SHIFT;
2274 hdev->ibh_page_mask = ~((__u64)hdev->ibh_page_size - 1);
2275
cebfe5ca 2276 hdev->ibh_mr_size = hdev->ibh_ibdev->attrs.max_mr_size;
d7e09d03
PT
2277 if (hdev->ibh_mr_size == ~0ULL) {
2278 hdev->ibh_mr_shift = 64;
2279 return 0;
2280 }
2281
55f5a824 2282 CERROR("Invalid mr size: %#llx\n", hdev->ibh_mr_size);
d7e09d03
PT
2283 return -EINVAL;
2284}
2285
8d9de3f4 2286static void kiblnd_hdev_cleanup_mrs(struct kib_hca_dev *hdev)
d7e09d03 2287{
7cadcc7c 2288 if (!hdev->ibh_mrs)
d7e09d03
PT
2289 return;
2290
7cadcc7c 2291 ib_dereg_mr(hdev->ibh_mrs);
d7e09d03 2292
7cadcc7c 2293 hdev->ibh_mrs = NULL;
d7e09d03
PT
2294}
2295
8d9de3f4 2296void kiblnd_hdev_destroy(struct kib_hca_dev *hdev)
d7e09d03
PT
2297{
2298 kiblnd_hdev_cleanup_mrs(hdev);
2299
06ace26e 2300 if (hdev->ibh_pd)
d7e09d03
PT
2301 ib_dealloc_pd(hdev->ibh_pd);
2302
06ace26e 2303 if (hdev->ibh_cmid)
d7e09d03
PT
2304 rdma_destroy_id(hdev->ibh_cmid);
2305
2306 LIBCFS_FREE(hdev, sizeof(*hdev));
2307}
2308
8d9de3f4 2309static int kiblnd_hdev_setup_mrs(struct kib_hca_dev *hdev)
d7e09d03
PT
2310{
2311 struct ib_mr *mr;
ec3d17c0 2312 int rc;
ec3d17c0 2313 int acflags = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
d7e09d03
PT
2314
2315 rc = kiblnd_hdev_get_attr(hdev);
5fd88337 2316 if (rc)
d7e09d03
PT
2317 return rc;
2318
01738448
LB
2319 mr = ib_get_dma_mr(hdev->ibh_pd, acflags);
2320 if (IS_ERR(mr)) {
2321 CERROR("Failed ib_get_dma_mr : %ld\n", PTR_ERR(mr));
2322 kiblnd_hdev_cleanup_mrs(hdev);
2323 return PTR_ERR(mr);
2324 }
d7e09d03 2325
7cadcc7c 2326 hdev->ibh_mrs = mr;
d7e09d03 2327
d7e09d03
PT
2328 return 0;
2329}
2330
febe73bd 2331/* DUMMY */
7a3888a3
GM
2332static int kiblnd_dummy_callback(struct rdma_cm_id *cmid,
2333 struct rdma_cm_event *event)
febe73bd 2334{
d7e09d03
PT
2335 return 0;
2336}
2337
8d9de3f4 2338static int kiblnd_dev_need_failover(struct kib_dev *dev)
d7e09d03 2339{
ec3d17c0
MS
2340 struct rdma_cm_id *cmid;
2341 struct sockaddr_in srcaddr;
2342 struct sockaddr_in dstaddr;
2343 int rc;
d7e09d03 2344
06ace26e
JS
2345 if (!dev->ibd_hdev || /* initializing */
2346 !dev->ibd_hdev->ibh_cmid || /* listener is dead */
d7e09d03
PT
2347 *kiblnd_tunables.kib_dev_failover > 1) /* debugging */
2348 return 1;
2349
4420cfd3
JS
2350 /*
 2351 * XXX: it's UGLY, but I don't have a better way to find
d7e09d03
PT
2352 * ib-bonding HCA failover because:
2353 *
2354 * a. no reliable CM event for HCA failover...
2355 * b. no OFED API to get ib_device for current net_device...
2356 *
2357 * We have only two choices at this point:
2358 *
2359 * a. rdma_bind_addr(), it will conflict with listener cmid
4420cfd3
JS
2360 * b. rdma_resolve_addr() to zero addr
2361 */
d7e09d03
PT
2362 cmid = kiblnd_rdma_create_id(kiblnd_dummy_callback, dev, RDMA_PS_TCP,
2363 IB_QPT_RC);
2364 if (IS_ERR(cmid)) {
2365 rc = PTR_ERR(cmid);
2366 CERROR("Failed to create cmid for failover: %d\n", rc);
2367 return rc;
2368 }
2369
2370 memset(&srcaddr, 0, sizeof(srcaddr));
ec3d17c0 2371 srcaddr.sin_family = AF_INET;
d7e09d03
PT
2372 srcaddr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip);
2373
2374 memset(&dstaddr, 0, sizeof(dstaddr));
2375 dstaddr.sin_family = AF_INET;
2376 rc = rdma_resolve_addr(cmid, (struct sockaddr *)&srcaddr,
2377 (struct sockaddr *)&dstaddr, 1);
5fd88337 2378 if (rc || !cmid->device) {
5e8f6920
PT
2379 CERROR("Failed to bind %s:%pI4h to device(%p): %d\n",
2380 dev->ibd_ifname, &dev->ibd_ifip,
d7e09d03
PT
2381 cmid->device, rc);
2382 rdma_destroy_id(cmid);
2383 return rc;
2384 }
2385
199a0cc0
LZ
2386 rc = dev->ibd_hdev->ibh_ibdev != cmid->device; /* true for failover */
2387 rdma_destroy_id(cmid);
d7e09d03 2388
199a0cc0 2389 return rc;
d7e09d03
PT
2390}
2391
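/*
 * (Re)bind the device to its HCA. Any existing listener cmid is destroyed
 * first so the address can be re-bound, then a fresh cmid, PD and DMA MR
 * are set up and a new listener started. The new kib_hca_dev is swapped in
 * under the global lock and every pool on every net of this device is
 * failed so it gets rebuilt against the new HCA.
 */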
8d9de3f4 2392int kiblnd_dev_failover(struct kib_dev *dev)
d7e09d03 2393{
febe73bd
GM
2394 LIST_HEAD(zombie_tpo);
2395 LIST_HEAD(zombie_ppo);
2396 LIST_HEAD(zombie_fpo);
ec3d17c0 2397 struct rdma_cm_id *cmid = NULL;
8d9de3f4 2398 struct kib_hca_dev *hdev = NULL;
ec3d17c0 2399 struct ib_pd *pd;
8d9de3f4 2400 struct kib_net *net;
ec3d17c0
MS
2401 struct sockaddr_in addr;
2402 unsigned long flags;
2403 int rc = 0;
2404 int i;
d7e09d03 2405
febe73bd 2406 LASSERT(*kiblnd_tunables.kib_dev_failover > 1 ||
06ace26e 2407 dev->ibd_can_failover || !dev->ibd_hdev);
d7e09d03
PT
2408
2409 rc = kiblnd_dev_need_failover(dev);
2410 if (rc <= 0)
2411 goto out;
2412
06ace26e
JS
2413 if (dev->ibd_hdev &&
2414 dev->ibd_hdev->ibh_cmid) {
4420cfd3
JS
2415 /*
 2416 * XXX it's not good to close the old listener here,
d7e09d03
PT
2417 * because we can fail to create new listener.
2418 * But we have to close it now, otherwise rdma_bind_addr
4420cfd3
JS
 2419 * will return EADDRINUSE, which is unfortunate.
2420 */
d7e09d03
PT
2421 write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
2422
2423 cmid = dev->ibd_hdev->ibh_cmid;
4420cfd3
JS
2424 /*
2425 * make next schedule of kiblnd_dev_need_failover()
2426 * return 1 for me
2427 */
d7e09d03
PT
2428 dev->ibd_hdev->ibh_cmid = NULL;
2429 write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
2430
2431 rdma_destroy_id(cmid);
2432 }
2433
2434 cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, dev, RDMA_PS_TCP,
2435 IB_QPT_RC);
2436 if (IS_ERR(cmid)) {
2437 rc = PTR_ERR(cmid);
2438 CERROR("Failed to create cmid for failover: %d\n", rc);
2439 goto out;
2440 }
2441
2442 memset(&addr, 0, sizeof(addr));
2443 addr.sin_family = AF_INET;
2444 addr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip);
2445 addr.sin_port = htons(*kiblnd_tunables.kib_service);
2446
2447 /* Bind to failover device or port */
2448 rc = rdma_bind_addr(cmid, (struct sockaddr *)&addr);
5fd88337 2449 if (rc || !cmid->device) {
5e8f6920
PT
2450 CERROR("Failed to bind %s:%pI4h to device(%p): %d\n",
2451 dev->ibd_ifname, &dev->ibd_ifip,
d7e09d03
PT
2452 cmid->device, rc);
2453 rdma_destroy_id(cmid);
2454 goto out;
2455 }
2456
2457 LIBCFS_ALLOC(hdev, sizeof(*hdev));
06ace26e 2458 if (!hdev) {
d7e09d03
PT
2459 CERROR("Failed to allocate kib_hca_dev\n");
2460 rdma_destroy_id(cmid);
2461 rc = -ENOMEM;
2462 goto out;
2463 }
2464
2465 atomic_set(&hdev->ibh_ref, 1);
2466 hdev->ibh_dev = dev;
2467 hdev->ibh_cmid = cmid;
2468 hdev->ibh_ibdev = cmid->device;
2469
2470 pd = ib_alloc_pd(cmid->device);
2471 if (IS_ERR(pd)) {
2472 rc = PTR_ERR(pd);
2473 CERROR("Can't allocate PD: %d\n", rc);
2474 goto out;
2475 }
2476
2477 hdev->ibh_pd = pd;
2478
2479 rc = rdma_listen(cmid, 0);
5fd88337 2480 if (rc) {
d7e09d03
PT
2481 CERROR("Can't start new listener: %d\n", rc);
2482 goto out;
2483 }
2484
2485 rc = kiblnd_hdev_setup_mrs(hdev);
5fd88337 2486 if (rc) {
d7e09d03
PT
2487 CERROR("Can't setup device: %d\n", rc);
2488 goto out;
2489 }
2490
2491 write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
2492
6d37b171 2493 swap(dev->ibd_hdev, hdev); /* take over the refcount */
d7e09d03
PT
2494
2495 list_for_each_entry(net, &dev->ibd_nets, ibn_list) {
2496 cfs_cpt_for_each(i, lnet_cpt_table()) {
2497 kiblnd_fail_poolset(&net->ibn_tx_ps[i]->tps_poolset,
2498 &zombie_tpo);
2499
415bcb5c 2500 if (net->ibn_fmr_ps)
d7e09d03
PT
2501 kiblnd_fail_fmr_poolset(net->ibn_fmr_ps[i],
2502 &zombie_fpo);
d7e09d03
PT
2503 }
2504 }
2505
2506 write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
2507 out:
2508 if (!list_empty(&zombie_tpo))
2509 kiblnd_destroy_pool_list(&zombie_tpo);
2510 if (!list_empty(&zombie_ppo))
2511 kiblnd_destroy_pool_list(&zombie_ppo);
2512 if (!list_empty(&zombie_fpo))
2513 kiblnd_destroy_fmr_pool_list(&zombie_fpo);
06ace26e 2514 if (hdev)
d7e09d03
PT
2515 kiblnd_hdev_decref(hdev);
2516
5fd88337 2517 if (rc)
d7e09d03
PT
2518 dev->ibd_failed_failover++;
2519 else
2520 dev->ibd_failed_failover = 0;
2521
2522 return rc;
2523}
2524
8d9de3f4 2525void kiblnd_destroy_dev(struct kib_dev *dev)
d7e09d03 2526{
5fd88337 2527 LASSERT(!dev->ibd_nnets);
febe73bd 2528 LASSERT(list_empty(&dev->ibd_nets));
d7e09d03
PT
2529
2530 list_del(&dev->ibd_fail_list);
2531 list_del(&dev->ibd_list);
2532
06ace26e 2533 if (dev->ibd_hdev)
d7e09d03
PT
2534 kiblnd_hdev_decref(dev->ibd_hdev);
2535
2536 LIBCFS_FREE(dev, sizeof(*dev));
2537}
2538
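/*
 * Create a kib_dev for an IPoIB interface: query its IP address and state,
 * note whether it is a bonding master (and so can fail over), run an
 * initial kiblnd_dev_failover() to bind it, and add it to kib_devs.
 */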
8d9de3f4 2539static struct kib_dev *kiblnd_create_dev(char *ifname)
d7e09d03
PT
2540{
2541 struct net_device *netdev;
8d9de3f4 2542 struct kib_dev *dev;
ec3d17c0
MS
2543 __u32 netmask;
2544 __u32 ip;
2545 int up;
2546 int rc;
d7e09d03 2547
1ad6a73e 2548 rc = lnet_ipif_query(ifname, &up, &ip, &netmask);
5fd88337 2549 if (rc) {
d7e09d03
PT
2550 CERROR("Can't query IPoIB interface %s: %d\n",
2551 ifname, rc);
2552 return NULL;
2553 }
2554
2555 if (!up) {
2556 CERROR("Can't query IPoIB interface %s: it's down\n", ifname);
2557 return NULL;
2558 }
2559
2560 LIBCFS_ALLOC(dev, sizeof(*dev));
06ace26e 2561 if (!dev)
d7e09d03
PT
2562 return NULL;
2563
d7e09d03 2564 netdev = dev_get_by_name(&init_net, ifname);
06ace26e 2565 if (!netdev) {
d7e09d03
PT
2566 dev->ibd_can_failover = 0;
2567 } else {
2568 dev->ibd_can_failover = !!(netdev->flags & IFF_MASTER);
2569 dev_put(netdev);
2570 }
2571
2572 INIT_LIST_HEAD(&dev->ibd_nets);
2573 INIT_LIST_HEAD(&dev->ibd_list); /* not yet in kib_devs */
2574 INIT_LIST_HEAD(&dev->ibd_fail_list);
2575 dev->ibd_ifip = ip;
2576 strcpy(&dev->ibd_ifname[0], ifname);
2577
2578 /* initialize the device */
2579 rc = kiblnd_dev_failover(dev);
5fd88337 2580 if (rc) {
d7e09d03
PT
2581 CERROR("Can't initialize device: %d\n", rc);
2582 LIBCFS_FREE(dev, sizeof(*dev));
2583 return NULL;
2584 }
2585
c314c319 2586 list_add_tail(&dev->ibd_list, &kiblnd_data.kib_devs);
d7e09d03
PT
2587 return dev;
2588}
2589
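/*
 * Module-wide teardown, called when the last device goes away: flag
 * shutdown, wake the scheduler, connd and failover threads, wait for them
 * all to exit, then free the peer table and scheduler array.
 */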
febe73bd 2590static void kiblnd_base_shutdown(void)
d7e09d03 2591{
ec3d17c0
MS
2592 struct kib_sched_info *sched;
2593 int i;
d7e09d03 2594
febe73bd 2595 LASSERT(list_empty(&kiblnd_data.kib_devs));
d7e09d03 2596
d7e09d03
PT
2597 switch (kiblnd_data.kib_init) {
2598 default:
2599 LBUG();
2600
2601 case IBLND_INIT_ALL:
2602 case IBLND_INIT_DATA:
06ace26e 2603 LASSERT(kiblnd_data.kib_peers);
7a3888a3 2604 for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++)
febe73bd 2605 LASSERT(list_empty(&kiblnd_data.kib_peers[i]));
febe73bd
GM
2606 LASSERT(list_empty(&kiblnd_data.kib_connd_zombies));
2607 LASSERT(list_empty(&kiblnd_data.kib_connd_conns));
4d99b258
LZ
2608 LASSERT(list_empty(&kiblnd_data.kib_reconn_list));
2609 LASSERT(list_empty(&kiblnd_data.kib_reconn_wait));
d7e09d03
PT
2610
2611 /* flag threads to terminate; wake and wait for them to die */
2612 kiblnd_data.kib_shutdown = 1;
2613
4420cfd3
JS
2614 /*
2615 * NB: we really want to stop scheduler threads net by net
d7e09d03 2616 * instead of the whole module, this should be improved
4420cfd3
JS
 2617 * with dynamic LNet configuration
2618 */
d7e09d03
PT
2619 cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds)
2620 wake_up_all(&sched->ibs_waitq);
2621
2622 wake_up_all(&kiblnd_data.kib_connd_waitq);
2623 wake_up_all(&kiblnd_data.kib_failover_waitq);
2624
2625 i = 2;
5fd88337 2626 while (atomic_read(&kiblnd_data.kib_nthreads)) {
d7e09d03 2627 i++;
7a3888a3
GM
2628 /* power of 2 ? */
2629 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
d7e09d03
PT
2630 "Waiting for %d threads to terminate\n",
2631 atomic_read(&kiblnd_data.kib_nthreads));
d3caf4d5
PT
2632 set_current_state(TASK_UNINTERRUPTIBLE);
2633 schedule_timeout(cfs_time_seconds(1));
d7e09d03
PT
2634 }
2635
2636 /* fall through */
2637
2638 case IBLND_INIT_NOTHING:
2639 break;
2640 }
2641
06ace26e 2642 if (kiblnd_data.kib_peers) {
d7e09d03
PT
2643 LIBCFS_FREE(kiblnd_data.kib_peers,
2644 sizeof(struct list_head) *
2645 kiblnd_data.kib_peer_hash_size);
2646 }
2647
06ace26e 2648 if (kiblnd_data.kib_scheds)
d7e09d03
PT
2649 cfs_percpt_free(kiblnd_data.kib_scheds);
2650
d7e09d03
PT
2651 kiblnd_data.kib_init = IBLND_INIT_NOTHING;
2652 module_put(THIS_MODULE);
2653}
2654
439b4d45 2655static void kiblnd_shutdown(lnet_ni_t *ni)
d7e09d03 2656{
8d9de3f4 2657 struct kib_net *net = ni->ni_data;
ec3d17c0
MS
2658 rwlock_t *g_lock = &kiblnd_data.kib_global_lock;
2659 int i;
2660 unsigned long flags;
d7e09d03
PT
2661
2662 LASSERT(kiblnd_data.kib_init == IBLND_INIT_ALL);
2663
06ace26e 2664 if (!net)
d7e09d03
PT
2665 goto out;
2666
d7e09d03
PT
2667 write_lock_irqsave(g_lock, flags);
2668 net->ibn_shutdown = 1;
2669 write_unlock_irqrestore(g_lock, flags);
2670
2671 switch (net->ibn_init) {
2672 default:
2673 LBUG();
2674
2675 case IBLND_INIT_ALL:
2676 /* nuke all existing peers within this net */
2677 kiblnd_del_peer(ni, LNET_NID_ANY);
2678
2679 /* Wait for all peer state to clean up */
2680 i = 2;
5fd88337 2681 while (atomic_read(&net->ibn_npeers)) {
d7e09d03
PT
2682 i++;
2683 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n? */
2684 "%s: waiting for %d peers to disconnect\n",
2685 libcfs_nid2str(ni->ni_nid),
2686 atomic_read(&net->ibn_npeers));
d3caf4d5
PT
2687 set_current_state(TASK_UNINTERRUPTIBLE);
2688 schedule_timeout(cfs_time_seconds(1));
d7e09d03
PT
2689 }
2690
2691 kiblnd_net_fini_pools(net);
2692
2693 write_lock_irqsave(g_lock, flags);
2694 LASSERT(net->ibn_dev->ibd_nnets > 0);
2695 net->ibn_dev->ibd_nnets--;
2696 list_del(&net->ibn_list);
2697 write_unlock_irqrestore(g_lock, flags);
2698
2699 /* fall through */
2700
2701 case IBLND_INIT_NOTHING:
5fd88337 2702 LASSERT(!atomic_read(&net->ibn_nconns));
d7e09d03 2703
5fd88337 2704 if (net->ibn_dev && !net->ibn_dev->ibd_nnets)
d7e09d03
PT
2705 kiblnd_destroy_dev(net->ibn_dev);
2706
2707 break;
2708 }
2709
d7e09d03
PT
2710 net->ibn_init = IBLND_INIT_NOTHING;
2711 ni->ni_data = NULL;
2712
2713 LIBCFS_FREE(net, sizeof(*net));
2714
2715out:
2716 if (list_empty(&kiblnd_data.kib_devs))
2717 kiblnd_base_shutdown();
d7e09d03
PT
2718}
2719
febe73bd 2720static int kiblnd_base_startup(void)
d7e09d03 2721{
ec3d17c0
MS
2722 struct kib_sched_info *sched;
2723 int rc;
2724 int i;
d7e09d03 2725
febe73bd 2726 LASSERT(kiblnd_data.kib_init == IBLND_INIT_NOTHING);
d7e09d03
PT
2727
2728 try_module_get(THIS_MODULE);
7a3888a3
GM
2729 /* zero pointers, flags etc */
2730 memset(&kiblnd_data, 0, sizeof(kiblnd_data));
d7e09d03
PT
2731
2732 rwlock_init(&kiblnd_data.kib_global_lock);
2733
2734 INIT_LIST_HEAD(&kiblnd_data.kib_devs);
2735 INIT_LIST_HEAD(&kiblnd_data.kib_failed_devs);
2736
2737 kiblnd_data.kib_peer_hash_size = IBLND_PEER_HASH_SIZE;
2738 LIBCFS_ALLOC(kiblnd_data.kib_peers,
ec3d17c0 2739 sizeof(struct list_head) * kiblnd_data.kib_peer_hash_size);
06ace26e 2740 if (!kiblnd_data.kib_peers)
d7e09d03 2741 goto failed;
d7e09d03
PT
2742 for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++)
2743 INIT_LIST_HEAD(&kiblnd_data.kib_peers[i]);
2744
2745 spin_lock_init(&kiblnd_data.kib_connd_lock);
2746 INIT_LIST_HEAD(&kiblnd_data.kib_connd_conns);
2747 INIT_LIST_HEAD(&kiblnd_data.kib_connd_zombies);
4d99b258
LZ
2748 INIT_LIST_HEAD(&kiblnd_data.kib_reconn_list);
2749 INIT_LIST_HEAD(&kiblnd_data.kib_reconn_wait);
2750
d7e09d03
PT
2751 init_waitqueue_head(&kiblnd_data.kib_connd_waitq);
2752 init_waitqueue_head(&kiblnd_data.kib_failover_waitq);
2753
2754 kiblnd_data.kib_scheds = cfs_percpt_alloc(lnet_cpt_table(),
2755 sizeof(*sched));
06ace26e 2756 if (!kiblnd_data.kib_scheds)
d7e09d03
PT
2757 goto failed;
2758
2759 cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) {
ec3d17c0 2760 int nthrs;
d7e09d03
PT
2761
2762 spin_lock_init(&sched->ibs_lock);
2763 INIT_LIST_HEAD(&sched->ibs_conns);
2764 init_waitqueue_head(&sched->ibs_waitq);
2765
2766 nthrs = cfs_cpt_weight(lnet_cpt_table(), i);
2767 if (*kiblnd_tunables.kib_nscheds > 0) {
2768 nthrs = min(nthrs, *kiblnd_tunables.kib_nscheds);
2769 } else {
4420cfd3
JS
2770 /*
2771 * max to half of CPUs, another half is reserved for
2772 * upper layer modules
2773 */
d7e09d03
PT
2774 nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs);
2775 }
2776
2777 sched->ibs_nthreads_max = nthrs;
2778 sched->ibs_cpt = i;
2779 }
2780
2781 kiblnd_data.kib_error_qpa.qp_state = IB_QPS_ERR;
2782
2783 /* lists/ptrs/locks initialised */
2784 kiblnd_data.kib_init = IBLND_INIT_DATA;
2785 /*****************************************************/
2786
2787 rc = kiblnd_thread_start(kiblnd_connd, NULL, "kiblnd_connd");
5fd88337 2788 if (rc) {
d7e09d03
PT
2789 CERROR("Can't spawn o2iblnd connd: %d\n", rc);
2790 goto failed;
2791 }
2792
5fd88337 2793 if (*kiblnd_tunables.kib_dev_failover)
d7e09d03
PT
2794 rc = kiblnd_thread_start(kiblnd_failover_thread, NULL,
2795 "kiblnd_failover");
2796
5fd88337 2797 if (rc) {
d7e09d03
PT
2798 CERROR("Can't spawn o2iblnd failover thread: %d\n", rc);
2799 goto failed;
2800 }
2801
2802 /* flag everything initialised */
2803 kiblnd_data.kib_init = IBLND_INIT_ALL;
2804 /*****************************************************/
2805
2806 return 0;
2807
2808 failed:
2809 kiblnd_base_shutdown();
2810 return -ENETDOWN;
2811}
2812
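/*
 * Start scheduler threads for one CPT: on first use spawn up to the
 * configured maximum (or half of the CPT's CPUs by default); afterwards
 * add at most one more thread when a new interface shows up.
 */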
febe73bd 2813static int kiblnd_start_schedulers(struct kib_sched_info *sched)
d7e09d03 2814{
ec3d17c0
MS
2815 int rc = 0;
2816 int nthrs;
2817 int i;
d7e09d03 2818
5fd88337 2819 if (!sched->ibs_nthreads) {
d7e09d03
PT
2820 if (*kiblnd_tunables.kib_nscheds > 0) {
2821 nthrs = sched->ibs_nthreads_max;
2822 } else {
2823 nthrs = cfs_cpt_weight(lnet_cpt_table(),
2824 sched->ibs_cpt);
2825 nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs);
2826 nthrs = min(IBLND_N_SCHED_HIGH, nthrs);
2827 }
2828 } else {
2829 LASSERT(sched->ibs_nthreads <= sched->ibs_nthreads_max);
2830 /* increase one thread if there is new interface */
b6ee3824 2831 nthrs = sched->ibs_nthreads < sched->ibs_nthreads_max;
d7e09d03
PT
2832 }
2833
2834 for (i = 0; i < nthrs; i++) {
ec3d17c0
MS
2835 long id;
2836 char name[20];
7a3888a3 2837
d7e09d03
PT
2838 id = KIB_THREAD_ID(sched->ibs_cpt, sched->ibs_nthreads + i);
2839 snprintf(name, sizeof(name), "kiblnd_sd_%02ld_%02ld",
2840 KIB_THREAD_CPT(id), KIB_THREAD_TID(id));
2841 rc = kiblnd_thread_start(kiblnd_scheduler, (void *)id, name);
5fd88337 2842 if (!rc)
d7e09d03
PT
2843 continue;
2844
2845 CERROR("Can't spawn thread %d for scheduler[%d]: %d\n",
2846 sched->ibs_cpt, sched->ibs_nthreads + i, rc);
2847 break;
2848 }
2849
2850 sched->ibs_nthreads += i;
2851 return rc;
2852}
2853
8d9de3f4 2854static int kiblnd_dev_start_threads(struct kib_dev *dev, int newdev, __u32 *cpts,
7a3888a3 2855 int ncpts)
d7e09d03 2856{
ec3d17c0
MS
2857 int cpt;
2858 int rc;
2859 int i;
d7e09d03
PT
2860
2861 for (i = 0; i < ncpts; i++) {
2862 struct kib_sched_info *sched;
2863
06ace26e 2864 cpt = !cpts ? i : cpts[i];
d7e09d03
PT
2865 sched = kiblnd_data.kib_scheds[cpt];
2866
2867 if (!newdev && sched->ibs_nthreads > 0)
2868 continue;
2869
2870 rc = kiblnd_start_schedulers(kiblnd_data.kib_scheds[cpt]);
5fd88337 2871 if (rc) {
d7e09d03
PT
2872 CERROR("Failed to start scheduler threads for %s\n",
2873 dev->ibd_ifname);
2874 return rc;
2875 }
2876 }
2877 return 0;
2878}
2879
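/*
 * Find an existing kib_dev by interface name. An exact match wins;
 * otherwise the first device whose base name (the part before any ':')
 * matches is returned as an alias.
 */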
8d9de3f4 2880static struct kib_dev *kiblnd_dev_search(char *ifname)
d7e09d03 2881{
8d9de3f4
JS
2882 struct kib_dev *alias = NULL;
2883 struct kib_dev *dev;
ec3d17c0
MS
2884 char *colon;
2885 char *colon2;
d7e09d03
PT
2886
2887 colon = strchr(ifname, ':');
2888 list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) {
5fd88337 2889 if (!strcmp(&dev->ibd_ifname[0], ifname))
d7e09d03
PT
2890 return dev;
2891
06ace26e 2892 if (alias)
d7e09d03
PT
2893 continue;
2894
2895 colon2 = strchr(dev->ibd_ifname, ':');
06ace26e 2896 if (colon)
d7e09d03 2897 *colon = 0;
06ace26e 2898 if (colon2)
d7e09d03
PT
2899 *colon2 = 0;
2900
5fd88337 2901 if (!strcmp(&dev->ibd_ifname[0], ifname))
d7e09d03
PT
2902 alias = dev;
2903
06ace26e 2904 if (colon)
d7e09d03 2905 *colon = ':';
06ace26e 2906 if (colon2)
d7e09d03
PT
2907 *colon2 = ':';
2908 }
2909 return alias;
2910}
2911
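/*
 * Bring up an o2iblnd NI: do module-wide startup if needed, set up the
 * tunables, find or create the kib_dev for the requested IPoIB interface,
 * start scheduler threads for the NI's CPTs, create the pools, and finally
 * publish the net on the device's list.
 */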
439b4d45 2912static int kiblnd_startup(lnet_ni_t *ni)
d7e09d03 2913{
ec3d17c0 2914 char *ifname;
8d9de3f4
JS
2915 struct kib_dev *ibdev = NULL;
2916 struct kib_net *net;
473c4e01 2917 struct timespec64 tv;
ec3d17c0
MS
2918 unsigned long flags;
2919 int rc;
2920 int newdev;
d7e09d03 2921
febe73bd 2922 LASSERT(ni->ni_lnd == &the_o2iblnd);
d7e09d03
PT
2923
2924 if (kiblnd_data.kib_init == IBLND_INIT_NOTHING) {
2925 rc = kiblnd_base_startup();
5fd88337 2926 if (rc)
d7e09d03
PT
2927 return rc;
2928 }
2929
2930 LIBCFS_ALLOC(net, sizeof(*net));
2931 ni->ni_data = net;
06ace26e 2932 if (!net)
3247c4e5 2933 goto net_failed;
d7e09d03 2934
473c4e01
AB
2935 ktime_get_real_ts64(&tv);
2936 net->ibn_incarnation = tv.tv_sec * USEC_PER_SEC +
2937 tv.tv_nsec / NSEC_PER_USEC;
d7e09d03 2938
f6e50066 2939 rc = kiblnd_tunables_setup(ni);
025ba826
AS
2940 if (rc)
2941 goto net_failed;
d7e09d03 2942
06ace26e 2943 if (ni->ni_interfaces[0]) {
d7e09d03
PT
2944 /* Use the IPoIB interface specified in 'networks=' */
2945
febe73bd 2946 CLASSERT(LNET_MAX_INTERFACES > 1);
06ace26e 2947 if (ni->ni_interfaces[1]) {
d7e09d03
PT
2948 CERROR("Multiple interfaces not supported\n");
2949 goto failed;
2950 }
2951
2952 ifname = ni->ni_interfaces[0];
2953 } else {
2954 ifname = *kiblnd_tunables.kib_default_ipif;
2955 }
2956
2957 if (strlen(ifname) >= sizeof(ibdev->ibd_ifname)) {
2958 CERROR("IPoIB interface name too long: %s\n", ifname);
2959 goto failed;
2960 }
2961
2962 ibdev = kiblnd_dev_search(ifname);
2963
06ace26e 2964 newdev = !ibdev;
d7e09d03 2965 /* hmm...create kib_dev even for alias */
5fd88337 2966 if (!ibdev || strcmp(&ibdev->ibd_ifname[0], ifname))
d7e09d03
PT
2967 ibdev = kiblnd_create_dev(ifname);
2968
06ace26e 2969 if (!ibdev)
d7e09d03
PT
2970 goto failed;
2971
2972 net->ibn_dev = ibdev;
2973 ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip);
2974
2975 rc = kiblnd_dev_start_threads(ibdev, newdev,
2976 ni->ni_cpts, ni->ni_ncpts);
5fd88337 2977 if (rc)
d7e09d03
PT
2978 goto failed;
2979
32c8deb8 2980 rc = kiblnd_net_init_pools(net, ni, ni->ni_cpts, ni->ni_ncpts);
5fd88337 2981 if (rc) {
d7e09d03
PT
2982 CERROR("Failed to initialize NI pools: %d\n", rc);
2983 goto failed;
2984 }
2985
2986 write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
2987 ibdev->ibd_nnets++;
2988 list_add_tail(&net->ibn_list, &ibdev->ibd_nets);
2989 write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
2990
2991 net->ibn_init = IBLND_INIT_ALL;
2992
2993 return 0;
2994
2995failed:
06ace26e 2996 if (!net->ibn_dev && ibdev)
d7e09d03
PT
2997 kiblnd_destroy_dev(ibdev);
2998
3247c4e5 2999net_failed:
d7e09d03
PT
3000 kiblnd_shutdown(ni);
3001
3002 CDEBUG(D_NET, "kiblnd_startup failed\n");
3003 return -ENETDOWN;
3004}
3005
439b4d45
FZ
3006static lnd_t the_o2iblnd = {
3007 .lnd_type = O2IBLND,
3008 .lnd_startup = kiblnd_startup,
3009 .lnd_shutdown = kiblnd_shutdown,
3010 .lnd_ctl = kiblnd_ctl,
3011 .lnd_query = kiblnd_query,
3012 .lnd_send = kiblnd_send,
3013 .lnd_recv = kiblnd_recv,
3014};
3015
e0f94113 3016static void __exit ko2iblnd_exit(void)
d7e09d03
PT
3017{
3018 lnet_unregister_lnd(&the_o2iblnd);
d7e09d03
PT
3019}
3020
e0f94113 3021static int __init ko2iblnd_init(void)
d7e09d03 3022{
8d9de3f4
JS
3023 CLASSERT(sizeof(struct kib_msg) <= IBLND_MSG_SIZE);
3024 CLASSERT(offsetof(struct kib_msg,
c314c319
JS
3025 ibm_u.get.ibgm_rd.rd_frags[IBLND_MAX_RDMA_FRAGS])
3026 <= IBLND_MSG_SIZE);
8d9de3f4 3027 CLASSERT(offsetof(struct kib_msg,
c314c319
JS
3028 ibm_u.putack.ibpam_rd.rd_frags[IBLND_MAX_RDMA_FRAGS])
3029 <= IBLND_MSG_SIZE);
d7e09d03 3030
025ba826 3031 kiblnd_tunables_init();
d7e09d03
PT
3032
3033 lnet_register_lnd(&the_o2iblnd);
3034
3035 return 0;
3036}
3037
a0455471 3038MODULE_AUTHOR("OpenSFS, Inc. <http://www.lustre.org/>");
57878e17 3039MODULE_DESCRIPTION("OpenIB gen2 LNet Network Driver");
5b0e50b9 3040MODULE_VERSION("2.7.0");
d7e09d03
PT
3041MODULE_LICENSE("GPL");
3042
e0f94113
AD
3043module_init(ko2iblnd_init);
3044module_exit(ko2iblnd_exit);