Commit | Line | Data |
---|---|---|
d7e09d03 PT |
1 | /* |
2 | * GPL HEADER START | |
3 | * | |
4 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU General Public License version 2 only, | |
8 | * as published by the Free Software Foundation. | |
9 | * | |
10 | * This program is distributed in the hope that it will be useful, but | |
11 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | * General Public License version 2 for more details (a copy is included | |
14 | * in the LICENSE file that accompanied this code). | |
15 | * | |
16 | * You should have received a copy of the GNU General Public License | |
17 | * version 2 along with this program; If not, see | |
6a5b99a4 | 18 | * http://www.gnu.org/licenses/gpl-2.0.html |
d7e09d03 | 19 | * |
d7e09d03 PT |
20 | * GPL HEADER END |
21 | */ | |
22 | /* | |
23 | * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. | |
24 | * Use is subject to license terms. | |
25 | * | |
1dc563a6 | 26 | * Copyright (c) 2011, 2015, Intel Corporation. |
d7e09d03 PT |
27 | */ |
28 | /* | |
29 | * This file is part of Lustre, http://www.lustre.org/ | |
30 | * Lustre is a trademark of Sun Microsystems, Inc. | |
31 | * | |
32 | * lnet/klnds/o2iblnd/o2iblnd.c | |
33 | * | |
34 | * Author: Eric Barton <eric@bartonsoftware.com> | |
35 | */ | |
36 | ||
5f43264c | 37 | #include <asm/div64.h> |
d664d1fd JH |
38 | #include <asm/page.h> |
39 | #include "o2iblnd.h" | |
d7e09d03 | 40 | |
439b4d45 | 41 | static lnd_t the_o2iblnd; |
d7e09d03 | 42 | |
8d9de3f4 | 43 | struct kib_data kiblnd_data; |
d7e09d03 | 44 | |
febe73bd | 45 | static __u32 kiblnd_cksum(void *ptr, int nob) |
d7e09d03 | 46 | { |
ec3d17c0 MS |
47 | char *c = ptr; |
48 | __u32 sum = 0; | |
d7e09d03 PT |
49 | |
50 | while (nob-- > 0) | |
51 | sum = ((sum << 1) | (sum >> 31)) + *c++; | |
52 | ||
53 | /* ensure I don't return 0 (== no checksum) */ | |
5fd88337 | 54 | return !sum ? 1 : sum; |
d7e09d03 PT |
55 | } |
56 | ||
febe73bd | 57 | static char *kiblnd_msgtype2str(int type) |
d7e09d03 PT |
58 | { |
59 | switch (type) { | |
60 | case IBLND_MSG_CONNREQ: | |
61 | return "CONNREQ"; | |
62 | ||
63 | case IBLND_MSG_CONNACK: | |
64 | return "CONNACK"; | |
65 | ||
66 | case IBLND_MSG_NOOP: | |
67 | return "NOOP"; | |
68 | ||
69 | case IBLND_MSG_IMMEDIATE: | |
70 | return "IMMEDIATE"; | |
71 | ||
72 | case IBLND_MSG_PUT_REQ: | |
73 | return "PUT_REQ"; | |
74 | ||
75 | case IBLND_MSG_PUT_NAK: | |
76 | return "PUT_NAK"; | |
77 | ||
78 | case IBLND_MSG_PUT_ACK: | |
79 | return "PUT_ACK"; | |
80 | ||
81 | case IBLND_MSG_PUT_DONE: | |
82 | return "PUT_DONE"; | |
83 | ||
84 | case IBLND_MSG_GET_REQ: | |
85 | return "GET_REQ"; | |
86 | ||
87 | case IBLND_MSG_GET_DONE: | |
88 | return "GET_DONE"; | |
89 | ||
90 | default: | |
91 | return "???"; | |
92 | } | |
93 | } | |
94 | ||
febe73bd | 95 | static int kiblnd_msgtype2size(int type) |
d7e09d03 | 96 | { |
8d9de3f4 | 97 | const int hdr_size = offsetof(struct kib_msg, ibm_u); |
d7e09d03 PT |
98 | |
99 | switch (type) { | |
100 | case IBLND_MSG_CONNREQ: | |
101 | case IBLND_MSG_CONNACK: | |
8d9de3f4 | 102 | return hdr_size + sizeof(struct kib_connparams); |
d7e09d03 PT |
103 | |
104 | case IBLND_MSG_NOOP: | |
105 | return hdr_size; | |
106 | ||
107 | case IBLND_MSG_IMMEDIATE: | |
8d9de3f4 | 108 | return offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[0]); |
d7e09d03 PT |
109 | |
110 | case IBLND_MSG_PUT_REQ: | |
8d9de3f4 | 111 | return hdr_size + sizeof(struct kib_putreq_msg); |
d7e09d03 PT |
112 | |
113 | case IBLND_MSG_PUT_ACK: | |
8d9de3f4 | 114 | return hdr_size + sizeof(struct kib_putack_msg); |
d7e09d03 PT |
115 | |
116 | case IBLND_MSG_GET_REQ: | |
8d9de3f4 | 117 | return hdr_size + sizeof(struct kib_get_msg); |
d7e09d03 PT |
118 | |
119 | case IBLND_MSG_PUT_NAK: | |
120 | case IBLND_MSG_PUT_DONE: | |
121 | case IBLND_MSG_GET_DONE: | |
8d9de3f4 | 122 | return hdr_size + sizeof(struct kib_completion_msg); |
d7e09d03 PT |
123 | default: |
124 | return -1; | |
125 | } | |
126 | } | |
127 | ||
8d9de3f4 | 128 | static int kiblnd_unpack_rd(struct kib_msg *msg, int flip) |
d7e09d03 | 129 | { |
8d9de3f4 | 130 | struct kib_rdma_desc *rd; |
bbc2d82f | 131 | int msg_size; |
ec3d17c0 MS |
132 | int nob; |
133 | int n; | |
134 | int i; | |
d7e09d03 | 135 | |
febe73bd | 136 | LASSERT(msg->ibm_type == IBLND_MSG_GET_REQ || |
c314c319 | 137 | msg->ibm_type == IBLND_MSG_PUT_ACK); |
d7e09d03 PT |
138 | |
139 | rd = msg->ibm_type == IBLND_MSG_GET_REQ ? | |
140 | &msg->ibm_u.get.ibgm_rd : | |
141 | &msg->ibm_u.putack.ibpam_rd; | |
142 | ||
143 | if (flip) { | |
144 | __swab32s(&rd->rd_key); | |
145 | __swab32s(&rd->rd_nfrags); | |
146 | } | |
147 | ||
148 | n = rd->rd_nfrags; | |
149 | ||
8d9de3f4 | 150 | nob = offsetof(struct kib_msg, ibm_u) + |
d7e09d03 PT |
151 | kiblnd_rd_msg_size(rd, msg->ibm_type, n); |
152 | ||
153 | if (msg->ibm_nob < nob) { | |
154 | CERROR("Short %s: %d(%d)\n", | |
155 | kiblnd_msgtype2str(msg->ibm_type), msg->ibm_nob, nob); | |
156 | return 1; | |
157 | } | |
158 | ||
bbc2d82f JS |
159 | msg_size = kiblnd_rd_size(rd); |
160 | if (msg_size <= 0 || msg_size > LNET_MAX_PAYLOAD) { | |
161 | CERROR("Bad msg_size: %d, should be 0 < n <= %d\n", | |
162 | msg_size, LNET_MAX_PAYLOAD); | |
163 | return 1; | |
164 | } | |
165 | ||
d7e09d03 PT |
166 | if (!flip) |
167 | return 0; | |
168 | ||
169 | for (i = 0; i < n; i++) { | |
170 | __swab32s(&rd->rd_frags[i].rf_nob); | |
171 | __swab64s(&rd->rd_frags[i].rf_addr); | |
172 | } | |
173 | ||
174 | return 0; | |
175 | } | |
176 | ||
8d9de3f4 | 177 | void kiblnd_pack_msg(lnet_ni_t *ni, struct kib_msg *msg, int version, |
febe73bd | 178 | int credits, lnet_nid_t dstnid, __u64 dststamp) |
d7e09d03 | 179 | { |
8d9de3f4 | 180 | struct kib_net *net = ni->ni_data; |
d7e09d03 | 181 | |
4420cfd3 JS |
182 | /* |
183 | * CAVEAT EMPTOR! all message fields not set here should have been | |
184 | * initialised previously. | |
185 | */ | |
d7e09d03 PT |
186 | msg->ibm_magic = IBLND_MSG_MAGIC; |
187 | msg->ibm_version = version; | |
188 | /* ibm_type */ | |
189 | msg->ibm_credits = credits; | |
190 | /* ibm_nob */ | |
191 | msg->ibm_cksum = 0; | |
192 | msg->ibm_srcnid = ni->ni_nid; | |
193 | msg->ibm_srcstamp = net->ibn_incarnation; | |
194 | msg->ibm_dstnid = dstnid; | |
195 | msg->ibm_dststamp = dststamp; | |
196 | ||
197 | if (*kiblnd_tunables.kib_cksum) { | |
198 | /* NB ibm_cksum zero while computing cksum */ | |
199 | msg->ibm_cksum = kiblnd_cksum(msg, msg->ibm_nob); | |
200 | } | |
201 | } | |
202 | ||
8d9de3f4 | 203 | int kiblnd_unpack_msg(struct kib_msg *msg, int nob) |
d7e09d03 | 204 | { |
8d9de3f4 | 205 | const int hdr_size = offsetof(struct kib_msg, ibm_u); |
ec3d17c0 MS |
206 | __u32 msg_cksum; |
207 | __u16 version; | |
208 | int msg_nob; | |
209 | int flip; | |
d7e09d03 PT |
210 | |
211 | /* 6 bytes are enough to have received magic + version */ | |
212 | if (nob < 6) { | |
213 | CERROR("Short message: %d\n", nob); | |
214 | return -EPROTO; | |
215 | } | |
216 | ||
217 | if (msg->ibm_magic == IBLND_MSG_MAGIC) { | |
218 | flip = 0; | |
219 | } else if (msg->ibm_magic == __swab32(IBLND_MSG_MAGIC)) { | |
220 | flip = 1; | |
221 | } else { | |
222 | CERROR("Bad magic: %08x\n", msg->ibm_magic); | |
223 | return -EPROTO; | |
224 | } | |
225 | ||
226 | version = flip ? __swab16(msg->ibm_version) : msg->ibm_version; | |
227 | if (version != IBLND_MSG_VERSION && | |
228 | version != IBLND_MSG_VERSION_1) { | |
229 | CERROR("Bad version: %x\n", version); | |
230 | return -EPROTO; | |
231 | } | |
232 | ||
233 | if (nob < hdr_size) { | |
234 | CERROR("Short message: %d\n", nob); | |
235 | return -EPROTO; | |
236 | } | |
237 | ||
238 | msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob; | |
239 | if (msg_nob > nob) { | |
240 | CERROR("Short message: got %d, wanted %d\n", nob, msg_nob); | |
241 | return -EPROTO; | |
242 | } | |
243 | ||
4420cfd3 JS |
244 | /* |
245 | * checksum must be computed with ibm_cksum zero and BEFORE anything | |
246 | * gets flipped | |
247 | */ | |
d7e09d03 PT |
248 | msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum; |
249 | msg->ibm_cksum = 0; | |
5fd88337 | 250 | if (msg_cksum && |
d7e09d03 PT |
251 | msg_cksum != kiblnd_cksum(msg, msg_nob)) { |
252 | CERROR("Bad checksum\n"); | |
253 | return -EPROTO; | |
254 | } | |
255 | ||
256 | msg->ibm_cksum = msg_cksum; | |
257 | ||
258 | if (flip) { | |
259 | /* leave magic unflipped as a clue to peer endianness */ | |
260 | msg->ibm_version = version; | |
febe73bd GM |
261 | CLASSERT(sizeof(msg->ibm_type) == 1); |
262 | CLASSERT(sizeof(msg->ibm_credits) == 1); | |
d7e09d03 PT |
263 | msg->ibm_nob = msg_nob; |
264 | __swab64s(&msg->ibm_srcnid); | |
265 | __swab64s(&msg->ibm_srcstamp); | |
266 | __swab64s(&msg->ibm_dstnid); | |
267 | __swab64s(&msg->ibm_dststamp); | |
268 | } | |
269 | ||
270 | if (msg->ibm_srcnid == LNET_NID_ANY) { | |
271 | CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid)); | |
272 | return -EPROTO; | |
273 | } | |
274 | ||
275 | if (msg_nob < kiblnd_msgtype2size(msg->ibm_type)) { | |
276 | CERROR("Short %s: %d(%d)\n", kiblnd_msgtype2str(msg->ibm_type), | |
277 | msg_nob, kiblnd_msgtype2size(msg->ibm_type)); | |
278 | return -EPROTO; | |
279 | } | |
280 | ||
281 | switch (msg->ibm_type) { | |
282 | default: | |
283 | CERROR("Unknown message type %x\n", msg->ibm_type); | |
284 | return -EPROTO; | |
285 | ||
286 | case IBLND_MSG_NOOP: | |
287 | case IBLND_MSG_IMMEDIATE: | |
288 | case IBLND_MSG_PUT_REQ: | |
289 | break; | |
290 | ||
291 | case IBLND_MSG_PUT_ACK: | |
292 | case IBLND_MSG_GET_REQ: | |
293 | if (kiblnd_unpack_rd(msg, flip)) | |
294 | return -EPROTO; | |
295 | break; | |
296 | ||
297 | case IBLND_MSG_PUT_NAK: | |
298 | case IBLND_MSG_PUT_DONE: | |
299 | case IBLND_MSG_GET_DONE: | |
300 | if (flip) | |
301 | __swab32s(&msg->ibm_u.completion.ibcm_status); | |
302 | break; | |
303 | ||
304 | case IBLND_MSG_CONNREQ: | |
305 | case IBLND_MSG_CONNACK: | |
306 | if (flip) { | |
307 | __swab16s(&msg->ibm_u.connparams.ibcp_queue_depth); | |
308 | __swab16s(&msg->ibm_u.connparams.ibcp_max_frags); | |
309 | __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size); | |
310 | } | |
311 | break; | |
312 | } | |
313 | return 0; | |
314 | } | |
315 | ||
8d9de3f4 | 316 | int kiblnd_create_peer(lnet_ni_t *ni, struct kib_peer **peerp, lnet_nid_t nid) |
d7e09d03 | 317 | { |
8d9de3f4 JS |
318 | struct kib_peer *peer; |
319 | struct kib_net *net = ni->ni_data; | |
ec3d17c0 MS |
320 | int cpt = lnet_cpt_of_nid(nid); |
321 | unsigned long flags; | |
d7e09d03 | 322 | |
06ace26e | 323 | LASSERT(net); |
d7e09d03 PT |
324 | LASSERT(nid != LNET_NID_ANY); |
325 | ||
326 | LIBCFS_CPT_ALLOC(peer, lnet_cpt_table(), cpt, sizeof(*peer)); | |
06ace26e | 327 | if (!peer) { |
d7e09d03 PT |
328 | CERROR("Cannot allocate peer\n"); |
329 | return -ENOMEM; | |
330 | } | |
331 | ||
d7e09d03 PT |
332 | peer->ibp_ni = ni; |
333 | peer->ibp_nid = nid; | |
334 | peer->ibp_error = 0; | |
335 | peer->ibp_last_alive = 0; | |
9e7d5bf3 | 336 | peer->ibp_max_frags = kiblnd_cfg_rdma_frags(peer->ibp_ni); |
f6e50066 | 337 | peer->ibp_queue_depth = ni->ni_peertxcredits; |
d7e09d03 PT |
338 | atomic_set(&peer->ibp_refcount, 1); /* 1 ref for caller */ |
339 | ||
340 | INIT_LIST_HEAD(&peer->ibp_list); /* not in the peer table yet */ | |
341 | INIT_LIST_HEAD(&peer->ibp_conns); | |
342 | INIT_LIST_HEAD(&peer->ibp_tx_queue); | |
343 | ||
344 | write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); | |
345 | ||
346 | /* always called with a ref on ni, which prevents ni being shutdown */ | |
5fd88337 | 347 | LASSERT(!net->ibn_shutdown); |
d7e09d03 PT |
348 | |
349 | /* npeers only grows with the global lock held */ | |
350 | atomic_inc(&net->ibn_npeers); | |
351 | ||
352 | write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); | |
353 | ||
354 | *peerp = peer; | |
355 | return 0; | |
356 | } | |
357 | ||
8d9de3f4 | 358 | void kiblnd_destroy_peer(struct kib_peer *peer) |
d7e09d03 | 359 | { |
8d9de3f4 | 360 | struct kib_net *net = peer->ibp_ni->ni_data; |
d7e09d03 | 361 | |
06ace26e | 362 | LASSERT(net); |
5fd88337 | 363 | LASSERT(!atomic_read(&peer->ibp_refcount)); |
febe73bd | 364 | LASSERT(!kiblnd_peer_active(peer)); |
4d99b258 | 365 | LASSERT(kiblnd_peer_idle(peer)); |
febe73bd | 366 | LASSERT(list_empty(&peer->ibp_tx_queue)); |
d7e09d03 PT |
367 | |
368 | LIBCFS_FREE(peer, sizeof(*peer)); | |
369 | ||
4420cfd3 JS |
370 | /* |
371 | * NB a peer's connections keep a reference on their peer until | |
d7e09d03 PT |
372 | * they are destroyed, so we can be assured that _all_ state to do |
373 | * with this peer has been cleaned up when its refcount drops to | |
4420cfd3 JS |
374 | * zero. |
375 | */ | |
d7e09d03 PT |
376 | atomic_dec(&net->ibn_npeers); |
377 | } | |
378 | ||
8d9de3f4 | 379 | struct kib_peer *kiblnd_find_peer_locked(lnet_nid_t nid) |
d7e09d03 | 380 | { |
4420cfd3 JS |
381 | /* |
382 | * the caller is responsible for accounting the additional reference | |
383 | * that this creates | |
384 | */ | |
ec3d17c0 MS |
385 | struct list_head *peer_list = kiblnd_nid2peerlist(nid); |
386 | struct list_head *tmp; | |
8d9de3f4 | 387 | struct kib_peer *peer; |
d7e09d03 | 388 | |
febe73bd | 389 | list_for_each(tmp, peer_list) { |
8d9de3f4 | 390 | peer = list_entry(tmp, struct kib_peer, ibp_list); |
4d99b258 | 391 | LASSERT(!kiblnd_peer_idle(peer)); |
d7e09d03 PT |
392 | |
393 | if (peer->ibp_nid != nid) | |
394 | continue; | |
395 | ||
396 | CDEBUG(D_NET, "got peer [%p] -> %s (%d) version: %x\n", | |
397 | peer, libcfs_nid2str(nid), | |
398 | atomic_read(&peer->ibp_refcount), | |
399 | peer->ibp_version); | |
400 | return peer; | |
401 | } | |
402 | return NULL; | |
403 | } | |
404 | ||
8d9de3f4 | 405 | void kiblnd_unlink_peer_locked(struct kib_peer *peer) |
d7e09d03 | 406 | { |
febe73bd | 407 | LASSERT(list_empty(&peer->ibp_conns)); |
d7e09d03 | 408 | |
febe73bd | 409 | LASSERT(kiblnd_peer_active(peer)); |
d7e09d03 PT |
410 | list_del_init(&peer->ibp_list); |
411 | /* lose peerlist's ref */ | |
412 | kiblnd_peer_decref(peer); | |
413 | } | |
414 | ||
febe73bd GM |
415 | static int kiblnd_get_peer_info(lnet_ni_t *ni, int index, |
416 | lnet_nid_t *nidp, int *count) | |
d7e09d03 | 417 | { |
8d9de3f4 | 418 | struct kib_peer *peer; |
ec3d17c0 MS |
419 | struct list_head *ptmp; |
420 | int i; | |
421 | unsigned long flags; | |
d7e09d03 PT |
422 | |
423 | read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); | |
424 | ||
425 | for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) { | |
febe73bd | 426 | list_for_each(ptmp, &kiblnd_data.kib_peers[i]) { |
8d9de3f4 | 427 | peer = list_entry(ptmp, struct kib_peer, ibp_list); |
4d99b258 | 428 | LASSERT(!kiblnd_peer_idle(peer)); |
d7e09d03 PT |
429 | |
430 | if (peer->ibp_ni != ni) | |
431 | continue; | |
432 | ||
433 | if (index-- > 0) | |
434 | continue; | |
435 | ||
436 | *nidp = peer->ibp_nid; | |
437 | *count = atomic_read(&peer->ibp_refcount); | |
438 | ||
439 | read_unlock_irqrestore(&kiblnd_data.kib_global_lock, | |
440 | flags); | |
441 | return 0; | |
442 | } | |
443 | } | |
444 | ||
445 | read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); | |
446 | return -ENOENT; | |
447 | } | |
448 | ||
8d9de3f4 | 449 | static void kiblnd_del_peer_locked(struct kib_peer *peer) |
d7e09d03 | 450 | { |
ec3d17c0 MS |
451 | struct list_head *ctmp; |
452 | struct list_head *cnxt; | |
8d9de3f4 | 453 | struct kib_conn *conn; |
d7e09d03 PT |
454 | |
455 | if (list_empty(&peer->ibp_conns)) { | |
456 | kiblnd_unlink_peer_locked(peer); | |
457 | } else { | |
febe73bd | 458 | list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) { |
8d9de3f4 | 459 | conn = list_entry(ctmp, struct kib_conn, ibc_list); |
d7e09d03 PT |
460 | |
461 | kiblnd_close_conn_locked(conn, 0); | |
462 | } | |
463 | /* NB closing peer's last conn unlinked it. */ | |
464 | } | |
4420cfd3 JS |
465 | /* |
466 | * NB peer now unlinked; might even be freed if the peer table had the | |
467 | * last ref on it. | |
468 | */ | |
d7e09d03 PT |
469 | } |
470 | ||
febe73bd | 471 | static int kiblnd_del_peer(lnet_ni_t *ni, lnet_nid_t nid) |
d7e09d03 | 472 | { |
febe73bd | 473 | LIST_HEAD(zombies); |
ec3d17c0 MS |
474 | struct list_head *ptmp; |
475 | struct list_head *pnxt; | |
8d9de3f4 | 476 | struct kib_peer *peer; |
ec3d17c0 MS |
477 | int lo; |
478 | int hi; | |
479 | int i; | |
480 | unsigned long flags; | |
481 | int rc = -ENOENT; | |
d7e09d03 PT |
482 | |
483 | write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); | |
484 | ||
485 | if (nid != LNET_NID_ANY) { | |
d3d3d37a JS |
486 | lo = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers; |
487 | hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers; | |
d7e09d03 PT |
488 | } else { |
489 | lo = 0; | |
490 | hi = kiblnd_data.kib_peer_hash_size - 1; | |
491 | } | |
492 | ||
493 | for (i = lo; i <= hi; i++) { | |
febe73bd | 494 | list_for_each_safe(ptmp, pnxt, &kiblnd_data.kib_peers[i]) { |
8d9de3f4 | 495 | peer = list_entry(ptmp, struct kib_peer, ibp_list); |
4d99b258 | 496 | LASSERT(!kiblnd_peer_idle(peer)); |
d7e09d03 PT |
497 | |
498 | if (peer->ibp_ni != ni) | |
499 | continue; | |
500 | ||
501 | if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid)) | |
502 | continue; | |
503 | ||
504 | if (!list_empty(&peer->ibp_tx_queue)) { | |
febe73bd | 505 | LASSERT(list_empty(&peer->ibp_conns)); |
d7e09d03 PT |
506 | |
507 | list_splice_init(&peer->ibp_tx_queue, | |
c314c319 | 508 | &zombies); |
d7e09d03 PT |
509 | } |
510 | ||
511 | kiblnd_del_peer_locked(peer); | |
512 | rc = 0; /* matched something */ | |
513 | } | |
514 | } | |
515 | ||
516 | write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); | |
517 | ||
518 | kiblnd_txlist_done(ni, &zombies, -EIO); | |
519 | ||
520 | return rc; | |
521 | } | |
522 | ||
8d9de3f4 | 523 | static struct kib_conn *kiblnd_get_conn_by_idx(lnet_ni_t *ni, int index) |
d7e09d03 | 524 | { |
8d9de3f4 | 525 | struct kib_peer *peer; |
ec3d17c0 | 526 | struct list_head *ptmp; |
8d9de3f4 | 527 | struct kib_conn *conn; |
ec3d17c0 MS |
528 | struct list_head *ctmp; |
529 | int i; | |
530 | unsigned long flags; | |
d7e09d03 PT |
531 | |
532 | read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); | |
533 | ||
534 | for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) { | |
febe73bd | 535 | list_for_each(ptmp, &kiblnd_data.kib_peers[i]) { |
8d9de3f4 | 536 | peer = list_entry(ptmp, struct kib_peer, ibp_list); |
4d99b258 | 537 | LASSERT(!kiblnd_peer_idle(peer)); |
d7e09d03 PT |
538 | |
539 | if (peer->ibp_ni != ni) | |
540 | continue; | |
541 | ||
febe73bd | 542 | list_for_each(ctmp, &peer->ibp_conns) { |
d7e09d03 PT |
543 | if (index-- > 0) |
544 | continue; | |
545 | ||
8d9de3f4 | 546 | conn = list_entry(ctmp, struct kib_conn, |
c314c319 | 547 | ibc_list); |
d7e09d03 | 548 | kiblnd_conn_addref(conn); |
7a3888a3 GM |
549 | read_unlock_irqrestore( |
550 | &kiblnd_data.kib_global_lock, | |
551 | flags); | |
d7e09d03 PT |
552 | return conn; |
553 | } | |
554 | } | |
555 | } | |
556 | ||
557 | read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); | |
558 | return NULL; | |
559 | } | |
560 | ||
febe73bd | 561 | int kiblnd_translate_mtu(int value) |
d7e09d03 PT |
562 | { |
563 | switch (value) { | |
564 | default: | |
565 | return -1; | |
566 | case 0: | |
567 | return 0; | |
568 | case 256: | |
569 | return IB_MTU_256; | |
570 | case 512: | |
571 | return IB_MTU_512; | |
572 | case 1024: | |
573 | return IB_MTU_1024; | |
574 | case 2048: | |
575 | return IB_MTU_2048; | |
576 | case 4096: | |
577 | return IB_MTU_4096; | |
578 | } | |
579 | } | |
580 | ||
febe73bd | 581 | static void kiblnd_setup_mtu_locked(struct rdma_cm_id *cmid) |
d7e09d03 | 582 | { |
ec3d17c0 | 583 | int mtu; |
d7e09d03 PT |
584 | |
585 | /* XXX There is no path record for iWARP, set by netdev->change_mtu? */ | |
06ace26e | 586 | if (!cmid->route.path_rec) |
d7e09d03 PT |
587 | return; |
588 | ||
589 | mtu = kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu); | |
febe73bd | 590 | LASSERT(mtu >= 0); |
5fd88337 | 591 | if (mtu) |
d7e09d03 PT |
592 | cmid->route.path_rec->mtu = mtu; |
593 | } | |
594 | ||
8d9de3f4 | 595 | static int kiblnd_get_completion_vector(struct kib_conn *conn, int cpt) |
d7e09d03 | 596 | { |
ec3d17c0 MS |
597 | cpumask_t *mask; |
598 | int vectors; | |
599 | int off; | |
600 | int i; | |
601 | lnet_nid_t nid = conn->ibc_peer->ibp_nid; | |
d7e09d03 PT |
602 | |
603 | vectors = conn->ibc_cmid->device->num_comp_vectors; | |
604 | if (vectors <= 1) | |
605 | return 0; | |
606 | ||
607 | mask = cfs_cpt_cpumask(lnet_cpt_table(), cpt); | |
06ace26e | 608 | if (!mask) |
3867ea5a | 609 | return 0; |
d7e09d03 PT |
610 | |
611 | /* hash NID to CPU id in this partition... */ | |
4a316f79 OD |
612 | off = do_div(nid, cpumask_weight(mask)); |
613 | for_each_cpu(i, mask) { | |
5fd88337 | 614 | if (!off--) |
d7e09d03 PT |
615 | return i % vectors; |
616 | } | |
617 | ||
618 | LBUG(); | |
619 | return 1; | |
620 | } | |
621 | ||
8d9de3f4 | 622 | struct kib_conn *kiblnd_create_conn(struct kib_peer *peer, struct rdma_cm_id *cmid, |
24c198e9 | 623 | int state, int version) |
d7e09d03 | 624 | { |
4420cfd3 JS |
625 | /* |
626 | * CAVEAT EMPTOR: | |
d7e09d03 PT |
627 | * If the new conn is created successfully it takes over the caller's |
628 | * ref on 'peer'. It also "owns" 'cmid' and destroys it when it itself | |
629 | * is destroyed. On failure, the caller's ref on 'peer' remains and | |
630 | * she must dispose of 'cmid'. (Actually I'd block forever if I tried | |
631 | * to destroy 'cmid' here since I'm called from the CM which still has | |
4420cfd3 JS |
632 | * its ref on 'cmid'). |
633 | */ | |
ec3d17c0 | 634 | rwlock_t *glock = &kiblnd_data.kib_global_lock; |
8d9de3f4 JS |
635 | struct kib_net *net = peer->ibp_ni->ni_data; |
636 | struct kib_dev *dev; | |
d7e09d03 | 637 | struct ib_qp_init_attr *init_qp_attr; |
ec3d17c0 | 638 | struct kib_sched_info *sched; |
23908db4 | 639 | struct ib_cq_init_attr cq_attr = {}; |
8d9de3f4 | 640 | struct kib_conn *conn; |
ec3d17c0 MS |
641 | struct ib_cq *cq; |
642 | unsigned long flags; | |
643 | int cpt; | |
644 | int rc; | |
645 | int i; | |
d7e09d03 | 646 | |
06ace26e | 647 | LASSERT(net); |
d7e09d03 PT |
648 | LASSERT(!in_interrupt()); |
649 | ||
650 | dev = net->ibn_dev; | |
651 | ||
652 | cpt = lnet_cpt_of_nid(peer->ibp_nid); | |
653 | sched = kiblnd_data.kib_scheds[cpt]; | |
654 | ||
655 | LASSERT(sched->ibs_nthreads > 0); | |
656 | ||
657 | LIBCFS_CPT_ALLOC(init_qp_attr, lnet_cpt_table(), cpt, | |
658 | sizeof(*init_qp_attr)); | |
06ace26e | 659 | if (!init_qp_attr) { |
d7e09d03 PT |
660 | CERROR("Can't allocate qp_attr for %s\n", |
661 | libcfs_nid2str(peer->ibp_nid)); | |
662 | goto failed_0; | |
663 | } | |
664 | ||
665 | LIBCFS_CPT_ALLOC(conn, lnet_cpt_table(), cpt, sizeof(*conn)); | |
06ace26e | 666 | if (!conn) { |
d7e09d03 PT |
667 | CERROR("Can't allocate connection for %s\n", |
668 | libcfs_nid2str(peer->ibp_nid)); | |
669 | goto failed_1; | |
670 | } | |
671 | ||
672 | conn->ibc_state = IBLND_CONN_INIT; | |
673 | conn->ibc_version = version; | |
674 | conn->ibc_peer = peer; /* I take the caller's ref */ | |
675 | cmid->context = conn; /* for future CM callbacks */ | |
676 | conn->ibc_cmid = cmid; | |
a01fa108 AS |
677 | conn->ibc_max_frags = peer->ibp_max_frags; |
678 | conn->ibc_queue_depth = peer->ibp_queue_depth; | |
2fb44f2b | 679 | |
d7e09d03 PT |
680 | INIT_LIST_HEAD(&conn->ibc_early_rxs); |
681 | INIT_LIST_HEAD(&conn->ibc_tx_noops); | |
682 | INIT_LIST_HEAD(&conn->ibc_tx_queue); | |
683 | INIT_LIST_HEAD(&conn->ibc_tx_queue_rsrvd); | |
684 | INIT_LIST_HEAD(&conn->ibc_tx_queue_nocred); | |
685 | INIT_LIST_HEAD(&conn->ibc_active_txs); | |
686 | spin_lock_init(&conn->ibc_lock); | |
687 | ||
688 | LIBCFS_CPT_ALLOC(conn->ibc_connvars, lnet_cpt_table(), cpt, | |
689 | sizeof(*conn->ibc_connvars)); | |
06ace26e | 690 | if (!conn->ibc_connvars) { |
d7e09d03 PT |
691 | CERROR("Can't allocate in-progress connection state\n"); |
692 | goto failed_2; | |
693 | } | |
694 | ||
695 | write_lock_irqsave(glock, flags); | |
696 | if (dev->ibd_failover) { | |
697 | write_unlock_irqrestore(glock, flags); | |
698 | CERROR("%s: failover in progress\n", dev->ibd_ifname); | |
699 | goto failed_2; | |
700 | } | |
701 | ||
702 | if (dev->ibd_hdev->ibh_ibdev != cmid->device) { | |
703 | /* wakeup failover thread and teardown connection */ | |
704 | if (kiblnd_dev_can_failover(dev)) { | |
705 | list_add_tail(&dev->ibd_fail_list, | |
706 | &kiblnd_data.kib_failed_devs); | |
707 | wake_up(&kiblnd_data.kib_failover_waitq); | |
708 | } | |
709 | ||
710 | write_unlock_irqrestore(glock, flags); | |
711 | CERROR("cmid HCA(%s), kib_dev(%s) need failover\n", | |
712 | cmid->device->name, dev->ibd_ifname); | |
713 | goto failed_2; | |
714 | } | |
715 | ||
716 | kiblnd_hdev_addref_locked(dev->ibd_hdev); | |
717 | conn->ibc_hdev = dev->ibd_hdev; | |
718 | ||
719 | kiblnd_setup_mtu_locked(cmid); | |
720 | ||
721 | write_unlock_irqrestore(glock, flags); | |
722 | ||
723 | LIBCFS_CPT_ALLOC(conn->ibc_rxs, lnet_cpt_table(), cpt, | |
8d9de3f4 | 724 | IBLND_RX_MSGS(conn) * sizeof(struct kib_rx)); |
06ace26e | 725 | if (!conn->ibc_rxs) { |
d7e09d03 PT |
726 | CERROR("Cannot allocate RX buffers\n"); |
727 | goto failed_2; | |
728 | } | |
729 | ||
730 | rc = kiblnd_alloc_pages(&conn->ibc_rx_pages, cpt, | |
2fb44f2b | 731 | IBLND_RX_MSG_PAGES(conn)); |
5fd88337 | 732 | if (rc) |
d7e09d03 PT |
733 | goto failed_2; |
734 | ||
735 | kiblnd_map_rx_descs(conn); | |
736 | ||
2fb44f2b | 737 | cq_attr.cqe = IBLND_CQ_ENTRIES(conn); |
8e37210b | 738 | cq_attr.comp_vector = kiblnd_get_completion_vector(conn, cpt); |
d7e09d03 PT |
739 | cq = ib_create_cq(cmid->device, |
740 | kiblnd_cq_completion, kiblnd_cq_event, conn, | |
8e37210b | 741 | &cq_attr); |
d7e09d03 | 742 | if (IS_ERR(cq)) { |
2fb44f2b JF |
743 | CERROR("Failed to create CQ with %d CQEs: %ld\n", |
744 | IBLND_CQ_ENTRIES(conn), PTR_ERR(cq)); | |
d7e09d03 PT |
745 | goto failed_2; |
746 | } | |
747 | ||
748 | conn->ibc_cq = cq; | |
749 | ||
750 | rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); | |
5fd88337 | 751 | if (rc) { |
9c379663 | 752 | CERROR("Can't request completion notification: %d\n", rc); |
d7e09d03 PT |
753 | goto failed_2; |
754 | } | |
755 | ||
756 | init_qp_attr->event_handler = kiblnd_qp_event; | |
757 | init_qp_attr->qp_context = conn; | |
2fb44f2b JF |
758 | init_qp_attr->cap.max_send_wr = IBLND_SEND_WRS(conn); |
759 | init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(conn); | |
d7e09d03 PT |
760 | init_qp_attr->cap.max_send_sge = 1; |
761 | init_qp_attr->cap.max_recv_sge = 1; | |
762 | init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR; | |
763 | init_qp_attr->qp_type = IB_QPT_RC; | |
764 | init_qp_attr->send_cq = cq; | |
765 | init_qp_attr->recv_cq = cq; | |
766 | ||
767 | conn->ibc_sched = sched; | |
768 | ||
769 | rc = rdma_create_qp(cmid, conn->ibc_hdev->ibh_pd, init_qp_attr); | |
5fd88337 | 770 | if (rc) { |
d7e09d03 PT |
771 | CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d\n", |
772 | rc, init_qp_attr->cap.max_send_wr, | |
773 | init_qp_attr->cap.max_recv_wr); | |
774 | goto failed_2; | |
775 | } | |
776 | ||
777 | LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr)); | |
778 | ||
779 | /* 1 ref for caller and each rxmsg */ | |
2fb44f2b JF |
780 | atomic_set(&conn->ibc_refcount, 1 + IBLND_RX_MSGS(conn)); |
781 | conn->ibc_nrx = IBLND_RX_MSGS(conn); | |
d7e09d03 PT |
782 | |
783 | /* post receives */ | |
2fb44f2b | 784 | for (i = 0; i < IBLND_RX_MSGS(conn); i++) { |
d7e09d03 PT |
785 | rc = kiblnd_post_rx(&conn->ibc_rxs[i], |
786 | IBLND_POSTRX_NO_CREDIT); | |
5fd88337 | 787 | if (rc) { |
d7e09d03 PT |
788 | CERROR("Can't post rxmsg: %d\n", rc); |
789 | ||
790 | /* Make posted receives complete */ | |
791 | kiblnd_abort_receives(conn); | |
792 | ||
4420cfd3 JS |
793 | /* |
794 | * correct # of posted buffers | |
795 | * NB locking needed now I'm racing with completion | |
796 | */ | |
d7e09d03 | 797 | spin_lock_irqsave(&sched->ibs_lock, flags); |
2fb44f2b | 798 | conn->ibc_nrx -= IBLND_RX_MSGS(conn) - i; |
d7e09d03 PT |
799 | spin_unlock_irqrestore(&sched->ibs_lock, flags); |
800 | ||
4420cfd3 JS |
801 | /* |
802 | * cmid will be destroyed by CM(ofed) after cm_callback | |
d7e09d03 | 803 | * returned, so we can't refer it anymore |
4420cfd3 JS |
804 | * (by kiblnd_connd()->kiblnd_destroy_conn) |
805 | */ | |
d7e09d03 PT |
806 | rdma_destroy_qp(conn->ibc_cmid); |
807 | conn->ibc_cmid = NULL; | |
808 | ||
809 | /* Drop my own and unused rxbuffer refcounts */ | |
2fb44f2b | 810 | while (i++ <= IBLND_RX_MSGS(conn)) |
d7e09d03 PT |
811 | kiblnd_conn_decref(conn); |
812 | ||
813 | return NULL; | |
814 | } | |
815 | } | |
816 | ||
817 | /* Init successful! */ | |
febe73bd | 818 | LASSERT(state == IBLND_CONN_ACTIVE_CONNECT || |
c314c319 | 819 | state == IBLND_CONN_PASSIVE_WAIT); |
d7e09d03 PT |
820 | conn->ibc_state = state; |
821 | ||
822 | /* 1 more conn */ | |
823 | atomic_inc(&net->ibn_nconns); | |
824 | return conn; | |
825 | ||
826 | failed_2: | |
4d99b258 | 827 | kiblnd_destroy_conn(conn, true); |
d7e09d03 PT |
828 | failed_1: |
829 | LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr)); | |
830 | failed_0: | |
831 | return NULL; | |
832 | } | |
833 | ||
8d9de3f4 | 834 | void kiblnd_destroy_conn(struct kib_conn *conn, bool free_conn) |
d7e09d03 PT |
835 | { |
836 | struct rdma_cm_id *cmid = conn->ibc_cmid; | |
8d9de3f4 | 837 | struct kib_peer *peer = conn->ibc_peer; |
ec3d17c0 | 838 | int rc; |
d7e09d03 | 839 | |
febe73bd | 840 | LASSERT(!in_interrupt()); |
5fd88337 | 841 | LASSERT(!atomic_read(&conn->ibc_refcount)); |
febe73bd GM |
842 | LASSERT(list_empty(&conn->ibc_early_rxs)); |
843 | LASSERT(list_empty(&conn->ibc_tx_noops)); | |
844 | LASSERT(list_empty(&conn->ibc_tx_queue)); | |
845 | LASSERT(list_empty(&conn->ibc_tx_queue_rsrvd)); | |
846 | LASSERT(list_empty(&conn->ibc_tx_queue_nocred)); | |
847 | LASSERT(list_empty(&conn->ibc_active_txs)); | |
5fd88337 JS |
848 | LASSERT(!conn->ibc_noops_posted); |
849 | LASSERT(!conn->ibc_nsends_posted); | |
d7e09d03 PT |
850 | |
851 | switch (conn->ibc_state) { | |
852 | default: | |
853 | /* conn must be completely disengaged from the network */ | |
854 | LBUG(); | |
855 | ||
856 | case IBLND_CONN_DISCONNECTED: | |
857 | /* connvars should have been freed already */ | |
06ace26e | 858 | LASSERT(!conn->ibc_connvars); |
d7e09d03 PT |
859 | break; |
860 | ||
861 | case IBLND_CONN_INIT: | |
862 | break; | |
863 | } | |
864 | ||
865 | /* conn->ibc_cmid might be destroyed by CM already */ | |
06ace26e | 866 | if (cmid && cmid->qp) |
d7e09d03 PT |
867 | rdma_destroy_qp(cmid); |
868 | ||
06ace26e | 869 | if (conn->ibc_cq) { |
d7e09d03 | 870 | rc = ib_destroy_cq(conn->ibc_cq); |
5fd88337 | 871 | if (rc) |
d7e09d03 PT |
872 | CWARN("Error destroying CQ: %d\n", rc); |
873 | } | |
874 | ||
06ace26e | 875 | if (conn->ibc_rx_pages) |
d7e09d03 PT |
876 | kiblnd_unmap_rx_descs(conn); |
877 | ||
06ace26e | 878 | if (conn->ibc_rxs) { |
d7e09d03 | 879 | LIBCFS_FREE(conn->ibc_rxs, |
8d9de3f4 | 880 | IBLND_RX_MSGS(conn) * sizeof(struct kib_rx)); |
d7e09d03 PT |
881 | } |
882 | ||
06ace26e | 883 | if (conn->ibc_connvars) |
d7e09d03 PT |
884 | LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); |
885 | ||
06ace26e | 886 | if (conn->ibc_hdev) |
d7e09d03 PT |
887 | kiblnd_hdev_decref(conn->ibc_hdev); |
888 | ||
889 | /* See CAVEAT EMPTOR above in kiblnd_create_conn */ | |
890 | if (conn->ibc_state != IBLND_CONN_INIT) { | |
8d9de3f4 | 891 | struct kib_net *net = peer->ibp_ni->ni_data; |
d7e09d03 PT |
892 | |
893 | kiblnd_peer_decref(peer); | |
894 | rdma_destroy_id(cmid); | |
895 | atomic_dec(&net->ibn_nconns); | |
896 | } | |
897 | ||
898 | LIBCFS_FREE(conn, sizeof(*conn)); | |
899 | } | |
900 | ||
8d9de3f4 | 901 | int kiblnd_close_peer_conns_locked(struct kib_peer *peer, int why) |
d7e09d03 | 902 | { |
8d9de3f4 | 903 | struct kib_conn *conn; |
ec3d17c0 MS |
904 | struct list_head *ctmp; |
905 | struct list_head *cnxt; | |
906 | int count = 0; | |
d7e09d03 | 907 | |
febe73bd | 908 | list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) { |
8d9de3f4 | 909 | conn = list_entry(ctmp, struct kib_conn, ibc_list); |
d7e09d03 | 910 | |
2d00bd17 | 911 | CDEBUG(D_NET, "Closing conn -> %s, version: %x, reason: %d\n", |
d7e09d03 PT |
912 | libcfs_nid2str(peer->ibp_nid), |
913 | conn->ibc_version, why); | |
914 | ||
915 | kiblnd_close_conn_locked(conn, why); | |
916 | count++; | |
917 | } | |
918 | ||
919 | return count; | |
920 | } | |
921 | ||
8d9de3f4 | 922 | int kiblnd_close_stale_conns_locked(struct kib_peer *peer, |
c314c319 | 923 | int version, __u64 incarnation) |
d7e09d03 | 924 | { |
8d9de3f4 | 925 | struct kib_conn *conn; |
ec3d17c0 MS |
926 | struct list_head *ctmp; |
927 | struct list_head *cnxt; | |
928 | int count = 0; | |
d7e09d03 | 929 | |
febe73bd | 930 | list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) { |
8d9de3f4 | 931 | conn = list_entry(ctmp, struct kib_conn, ibc_list); |
d7e09d03 PT |
932 | |
933 | if (conn->ibc_version == version && | |
934 | conn->ibc_incarnation == incarnation) | |
935 | continue; | |
936 | ||
7a3888a3 GM |
937 | CDEBUG(D_NET, |
938 | "Closing stale conn -> %s version: %x, incarnation:%#llx(%x, %#llx)\n", | |
d7e09d03 PT |
939 | libcfs_nid2str(peer->ibp_nid), |
940 | conn->ibc_version, conn->ibc_incarnation, | |
941 | version, incarnation); | |
942 | ||
943 | kiblnd_close_conn_locked(conn, -ESTALE); | |
944 | count++; | |
945 | } | |
946 | ||
947 | return count; | |
948 | } | |
949 | ||
febe73bd | 950 | static int kiblnd_close_matching_conns(lnet_ni_t *ni, lnet_nid_t nid) |
d7e09d03 | 951 | { |
8d9de3f4 | 952 | struct kib_peer *peer; |
ec3d17c0 MS |
953 | struct list_head *ptmp; |
954 | struct list_head *pnxt; | |
955 | int lo; | |
956 | int hi; | |
957 | int i; | |
958 | unsigned long flags; | |
959 | int count = 0; | |
d7e09d03 PT |
960 | |
961 | write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); | |
962 | ||
d3d3d37a JS |
963 | if (nid != LNET_NID_ANY) { |
964 | lo = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers; | |
965 | hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers; | |
966 | } else { | |
d7e09d03 PT |
967 | lo = 0; |
968 | hi = kiblnd_data.kib_peer_hash_size - 1; | |
969 | } | |
970 | ||
971 | for (i = lo; i <= hi; i++) { | |
febe73bd | 972 | list_for_each_safe(ptmp, pnxt, &kiblnd_data.kib_peers[i]) { |
8d9de3f4 | 973 | peer = list_entry(ptmp, struct kib_peer, ibp_list); |
4d99b258 | 974 | LASSERT(!kiblnd_peer_idle(peer)); |
d7e09d03 PT |
975 | |
976 | if (peer->ibp_ni != ni) | |
977 | continue; | |
978 | ||
979 | if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid)) | |
980 | continue; | |
981 | ||
982 | count += kiblnd_close_peer_conns_locked(peer, 0); | |
983 | } | |
984 | } | |
985 | ||
986 | write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); | |
987 | ||
988 | /* wildcards always succeed */ | |
989 | if (nid == LNET_NID_ANY) | |
990 | return 0; | |
991 | ||
5fd88337 | 992 | return !count ? -ENOENT : 0; |
d7e09d03 PT |
993 | } |
994 | ||
439b4d45 | 995 | static int kiblnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg) |
d7e09d03 PT |
996 | { |
997 | struct libcfs_ioctl_data *data = arg; | |
ec3d17c0 | 998 | int rc = -EINVAL; |
d7e09d03 | 999 | |
a58a38ac | 1000 | switch (cmd) { |
d7e09d03 | 1001 | case IOC_LIBCFS_GET_PEER: { |
ec3d17c0 MS |
1002 | lnet_nid_t nid = 0; |
1003 | int count = 0; | |
d7e09d03 PT |
1004 | |
1005 | rc = kiblnd_get_peer_info(ni, data->ioc_count, | |
1006 | &nid, &count); | |
ec3d17c0 MS |
1007 | data->ioc_nid = nid; |
1008 | data->ioc_count = count; | |
d7e09d03 PT |
1009 | break; |
1010 | } | |
1011 | ||
1012 | case IOC_LIBCFS_DEL_PEER: { | |
1013 | rc = kiblnd_del_peer(ni, data->ioc_nid); | |
1014 | break; | |
1015 | } | |
1016 | case IOC_LIBCFS_GET_CONN: { | |
8d9de3f4 | 1017 | struct kib_conn *conn; |
d7e09d03 PT |
1018 | |
1019 | rc = 0; | |
1020 | conn = kiblnd_get_conn_by_idx(ni, data->ioc_count); | |
06ace26e | 1021 | if (!conn) { |
d7e09d03 PT |
1022 | rc = -ENOENT; |
1023 | break; | |
1024 | } | |
1025 | ||
06ace26e | 1026 | LASSERT(conn->ibc_cmid); |
d7e09d03 | 1027 | data->ioc_nid = conn->ibc_peer->ibp_nid; |
06ace26e | 1028 | if (!conn->ibc_cmid->route.path_rec) |
d7e09d03 PT |
1029 | data->ioc_u32[0] = 0; /* iWarp has no path MTU */ |
1030 | else | |
1031 | data->ioc_u32[0] = | |
1032 | ib_mtu_enum_to_int(conn->ibc_cmid->route.path_rec->mtu); | |
1033 | kiblnd_conn_decref(conn); | |
1034 | break; | |
1035 | } | |
1036 | case IOC_LIBCFS_CLOSE_CONNECTION: { | |
1037 | rc = kiblnd_close_matching_conns(ni, data->ioc_nid); | |
1038 | break; | |
1039 | } | |
1040 | ||
1041 | default: | |
1042 | break; | |
1043 | } | |
1044 | ||
1045 | return rc; | |
1046 | } | |
1047 | ||
439b4d45 | 1048 | static void kiblnd_query(lnet_ni_t *ni, lnet_nid_t nid, unsigned long *when) |
d7e09d03 | 1049 | { |
ec3d17c0 MS |
1050 | unsigned long last_alive = 0; |
1051 | unsigned long now = cfs_time_current(); | |
1052 | rwlock_t *glock = &kiblnd_data.kib_global_lock; | |
8d9de3f4 | 1053 | struct kib_peer *peer; |
ec3d17c0 | 1054 | unsigned long flags; |
d7e09d03 PT |
1055 | |
1056 | read_lock_irqsave(glock, flags); | |
1057 | ||
1058 | peer = kiblnd_find_peer_locked(nid); | |
4d99b258 | 1059 | if (peer) |
d7e09d03 | 1060 | last_alive = peer->ibp_last_alive; |
d7e09d03 PT |
1061 | |
1062 | read_unlock_irqrestore(glock, flags); | |
1063 | ||
5fd88337 | 1064 | if (last_alive) |
d7e09d03 PT |
1065 | *when = last_alive; |
1066 | ||
4420cfd3 JS |
1067 | /* |
1068 | * peer is not persistent in hash, trigger peer creation | |
1069 | * and connection establishment with a NULL tx | |
1070 | */ | |
06ace26e | 1071 | if (!peer) |
d7e09d03 PT |
1072 | kiblnd_launch_tx(ni, NULL, nid); |
1073 | ||
1074 | CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago\n", | |
1075 | libcfs_nid2str(nid), peer, | |
1076 | last_alive ? cfs_duration_sec(now - last_alive) : -1); | |
d7e09d03 PT |
1077 | } |
1078 | ||
8d9de3f4 | 1079 | static void kiblnd_free_pages(struct kib_pages *p) |
d7e09d03 | 1080 | { |
ec3d17c0 MS |
1081 | int npages = p->ibp_npages; |
1082 | int i; | |
d7e09d03 PT |
1083 | |
1084 | for (i = 0; i < npages; i++) { | |
06ace26e | 1085 | if (p->ibp_pages[i]) |
d7e09d03 PT |
1086 | __free_page(p->ibp_pages[i]); |
1087 | } | |
1088 | ||
8d9de3f4 | 1089 | LIBCFS_FREE(p, offsetof(struct kib_pages, ibp_pages[npages])); |
d7e09d03 PT |
1090 | } |
1091 | ||
8d9de3f4 | 1092 | int kiblnd_alloc_pages(struct kib_pages **pp, int cpt, int npages) |
d7e09d03 | 1093 | { |
8d9de3f4 | 1094 | struct kib_pages *p; |
ec3d17c0 | 1095 | int i; |
d7e09d03 PT |
1096 | |
1097 | LIBCFS_CPT_ALLOC(p, lnet_cpt_table(), cpt, | |
8d9de3f4 | 1098 | offsetof(struct kib_pages, ibp_pages[npages])); |
06ace26e | 1099 | if (!p) { |
d7e09d03 PT |
1100 | CERROR("Can't allocate descriptor for %d pages\n", npages); |
1101 | return -ENOMEM; | |
1102 | } | |
1103 | ||
8d9de3f4 | 1104 | memset(p, 0, offsetof(struct kib_pages, ibp_pages[npages])); |
d7e09d03 PT |
1105 | p->ibp_npages = npages; |
1106 | ||
1107 | for (i = 0; i < npages; i++) { | |
49c02a75 PT |
1108 | p->ibp_pages[i] = alloc_pages_node( |
1109 | cfs_cpt_spread_node(lnet_cpt_table(), cpt), | |
0be19afa | 1110 | GFP_NOFS, 0); |
06ace26e | 1111 | if (!p->ibp_pages[i]) { |
d7e09d03 PT |
1112 | CERROR("Can't allocate page %d of %d\n", i, npages); |
1113 | kiblnd_free_pages(p); | |
1114 | return -ENOMEM; | |
1115 | } | |
1116 | } | |
1117 | ||
1118 | *pp = p; | |
1119 | return 0; | |
1120 | } | |
1121 | ||
8d9de3f4 | 1122 | void kiblnd_unmap_rx_descs(struct kib_conn *conn) |
d7e09d03 | 1123 | { |
8d9de3f4 | 1124 | struct kib_rx *rx; |
ec3d17c0 | 1125 | int i; |
d7e09d03 | 1126 | |
06ace26e JS |
1127 | LASSERT(conn->ibc_rxs); |
1128 | LASSERT(conn->ibc_hdev); | |
d7e09d03 | 1129 | |
2fb44f2b | 1130 | for (i = 0; i < IBLND_RX_MSGS(conn); i++) { |
d7e09d03 PT |
1131 | rx = &conn->ibc_rxs[i]; |
1132 | ||
febe73bd | 1133 | LASSERT(rx->rx_nob >= 0); /* not posted */ |
d7e09d03 PT |
1134 | |
1135 | kiblnd_dma_unmap_single(conn->ibc_hdev->ibh_ibdev, | |
1136 | KIBLND_UNMAP_ADDR(rx, rx_msgunmap, | |
1137 | rx->rx_msgaddr), | |
1138 | IBLND_MSG_SIZE, DMA_FROM_DEVICE); | |
1139 | } | |
1140 | ||
1141 | kiblnd_free_pages(conn->ibc_rx_pages); | |
1142 | ||
1143 | conn->ibc_rx_pages = NULL; | |
1144 | } | |
1145 | ||
8d9de3f4 | 1146 | void kiblnd_map_rx_descs(struct kib_conn *conn) |
d7e09d03 | 1147 | { |
8d9de3f4 | 1148 | struct kib_rx *rx; |
ec3d17c0 MS |
1149 | struct page *pg; |
1150 | int pg_off; | |
1151 | int ipg; | |
1152 | int i; | |
d7e09d03 | 1153 | |
2fb44f2b | 1154 | for (pg_off = ipg = i = 0; i < IBLND_RX_MSGS(conn); i++) { |
d7e09d03 PT |
1155 | pg = conn->ibc_rx_pages->ibp_pages[ipg]; |
1156 | rx = &conn->ibc_rxs[i]; | |
1157 | ||
1158 | rx->rx_conn = conn; | |
8d9de3f4 | 1159 | rx->rx_msg = (struct kib_msg *)(((char *)page_address(pg)) + pg_off); |
d7e09d03 PT |
1160 | |
1161 | rx->rx_msgaddr = kiblnd_dma_map_single(conn->ibc_hdev->ibh_ibdev, | |
7a3888a3 GM |
1162 | rx->rx_msg, |
1163 | IBLND_MSG_SIZE, | |
d7e09d03 | 1164 | DMA_FROM_DEVICE); |
febe73bd | 1165 | LASSERT(!kiblnd_dma_mapping_error(conn->ibc_hdev->ibh_ibdev, |
c314c319 | 1166 | rx->rx_msgaddr)); |
d7e09d03 PT |
1167 | KIBLND_UNMAP_ADDR_SET(rx, rx_msgunmap, rx->rx_msgaddr); |
1168 | ||
1d8cb70c | 1169 | CDEBUG(D_NET, "rx %d: %p %#llx(%#llx)\n", |
d7e09d03 | 1170 | i, rx->rx_msg, rx->rx_msgaddr, |
d664d1fd | 1171 | (__u64)(page_to_phys(pg) + pg_off)); |
d7e09d03 PT |
1172 | |
1173 | pg_off += IBLND_MSG_SIZE; | |
febe73bd | 1174 | LASSERT(pg_off <= PAGE_SIZE); |
d7e09d03 PT |
1175 | |
1176 | if (pg_off == PAGE_SIZE) { | |
1177 | pg_off = 0; | |
1178 | ipg++; | |
2fb44f2b | 1179 | LASSERT(ipg <= IBLND_RX_MSG_PAGES(conn)); |
d7e09d03 PT |
1180 | } |
1181 | } | |
1182 | } | |
1183 | ||
8d9de3f4 | 1184 | static void kiblnd_unmap_tx_pool(struct kib_tx_pool *tpo) |
d7e09d03 | 1185 | { |
8d9de3f4 JS |
1186 | struct kib_hca_dev *hdev = tpo->tpo_hdev; |
1187 | struct kib_tx *tx; | |
ec3d17c0 | 1188 | int i; |
d7e09d03 | 1189 | |
5fd88337 | 1190 | LASSERT(!tpo->tpo_pool.po_allocated); |
d7e09d03 | 1191 | |
06ace26e | 1192 | if (!hdev) |
d7e09d03 PT |
1193 | return; |
1194 | ||
1195 | for (i = 0; i < tpo->tpo_pool.po_size; i++) { | |
1196 | tx = &tpo->tpo_tx_descs[i]; | |
1197 | kiblnd_dma_unmap_single(hdev->ibh_ibdev, | |
1198 | KIBLND_UNMAP_ADDR(tx, tx_msgunmap, | |
1199 | tx->tx_msgaddr), | |
1200 | IBLND_MSG_SIZE, DMA_TO_DEVICE); | |
1201 | } | |
1202 | ||
1203 | kiblnd_hdev_decref(hdev); | |
1204 | tpo->tpo_hdev = NULL; | |
1205 | } | |
1206 | ||
8d9de3f4 | 1207 | static struct kib_hca_dev *kiblnd_current_hdev(struct kib_dev *dev) |
d7e09d03 | 1208 | { |
8d9de3f4 | 1209 | struct kib_hca_dev *hdev; |
ec3d17c0 MS |
1210 | unsigned long flags; |
1211 | int i = 0; | |
d7e09d03 PT |
1212 | |
1213 | read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); | |
1214 | while (dev->ibd_failover) { | |
1215 | read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); | |
5fd88337 | 1216 | if (!(i++ % 50)) |
d7e09d03 PT |
1217 | CDEBUG(D_NET, "%s: Wait for failover\n", |
1218 | dev->ibd_ifname); | |
ea363b41 | 1219 | set_current_state(TASK_INTERRUPTIBLE); |
d7e09d03 PT |
1220 | schedule_timeout(cfs_time_seconds(1) / 100); |
1221 | ||
1222 | read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); | |
1223 | } | |
1224 | ||
1225 | kiblnd_hdev_addref_locked(dev->ibd_hdev); | |
1226 | hdev = dev->ibd_hdev; | |
1227 | ||
1228 | read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); | |
1229 | ||
1230 | return hdev; | |
1231 | } | |
1232 | ||
8d9de3f4 | 1233 | static void kiblnd_map_tx_pool(struct kib_tx_pool *tpo) |
d7e09d03 | 1234 | { |
8d9de3f4 JS |
1235 | struct kib_pages *txpgs = tpo->tpo_tx_pages; |
1236 | struct kib_pool *pool = &tpo->tpo_pool; | |
1237 | struct kib_net *net = pool->po_owner->ps_net; | |
1238 | struct kib_dev *dev; | |
ec3d17c0 | 1239 | struct page *page; |
8d9de3f4 | 1240 | struct kib_tx *tx; |
ec3d17c0 MS |
1241 | int page_offset; |
1242 | int ipage; | |
1243 | int i; | |
d7e09d03 | 1244 | |
06ace26e | 1245 | LASSERT(net); |
d7e09d03 PT |
1246 | |
1247 | dev = net->ibn_dev; | |
1248 | ||
1249 | /* pre-mapped messages are not bigger than 1 page */ | |
febe73bd | 1250 | CLASSERT(IBLND_MSG_SIZE <= PAGE_SIZE); |
d7e09d03 PT |
1251 | |
1252 | /* No fancy arithmetic when we do the buffer calculations */ | |
5fd88337 | 1253 | CLASSERT(!(PAGE_SIZE % IBLND_MSG_SIZE)); |
d7e09d03 PT |
1254 | |
1255 | tpo->tpo_hdev = kiblnd_current_hdev(dev); | |
1256 | ||
1257 | for (ipage = page_offset = i = 0; i < pool->po_size; i++) { | |
1258 | page = txpgs->ibp_pages[ipage]; | |
1259 | tx = &tpo->tpo_tx_descs[i]; | |
1260 | ||
8d9de3f4 | 1261 | tx->tx_msg = (struct kib_msg *)(((char *)page_address(page)) + |
d7e09d03 PT |
1262 | page_offset); |
1263 | ||
1264 | tx->tx_msgaddr = kiblnd_dma_map_single( | |
1265 | tpo->tpo_hdev->ibh_ibdev, tx->tx_msg, | |
1266 | IBLND_MSG_SIZE, DMA_TO_DEVICE); | |
febe73bd | 1267 | LASSERT(!kiblnd_dma_mapping_error(tpo->tpo_hdev->ibh_ibdev, |
c314c319 | 1268 | tx->tx_msgaddr)); |
d7e09d03 PT |
1269 | KIBLND_UNMAP_ADDR_SET(tx, tx_msgunmap, tx->tx_msgaddr); |
1270 | ||
1271 | list_add(&tx->tx_list, &pool->po_free_list); | |
1272 | ||
1273 | page_offset += IBLND_MSG_SIZE; | |
febe73bd | 1274 | LASSERT(page_offset <= PAGE_SIZE); |
d7e09d03 PT |
1275 | |
1276 | if (page_offset == PAGE_SIZE) { | |
1277 | page_offset = 0; | |
1278 | ipage++; | |
febe73bd | 1279 | LASSERT(ipage <= txpgs->ibp_npages); |
d7e09d03 PT |
1280 | } |
1281 | } | |
1282 | } | |
1283 | ||
8d9de3f4 | 1284 | struct ib_mr *kiblnd_find_rd_dma_mr(struct lnet_ni *ni, struct kib_rdma_desc *rd, |
2fb44f2b | 1285 | int negotiated_nfrags) |
d7e09d03 | 1286 | { |
8d9de3f4 JS |
1287 | struct kib_net *net = ni->ni_data; |
1288 | struct kib_hca_dev *hdev = net->ibn_dev->ibd_hdev; | |
32c8deb8 AS |
1289 | struct lnet_ioctl_config_o2iblnd_tunables *tunables; |
1290 | __u16 nfrags; | |
1291 | int mod; | |
1292 | ||
1293 | tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib; | |
1294 | mod = tunables->lnd_map_on_demand; | |
1295 | nfrags = (negotiated_nfrags != -1) ? negotiated_nfrags : mod; | |
2fb44f2b | 1296 | |
7cadcc7c | 1297 | LASSERT(hdev->ibh_mrs); |
d7e09d03 | 1298 | |
32c8deb8 | 1299 | if (mod > 0 && nfrags <= rd->rd_nfrags) |
d7e09d03 PT |
1300 | return NULL; |
1301 | ||
7cadcc7c | 1302 | return hdev->ibh_mrs; |
d7e09d03 PT |
1303 | } |
1304 | ||
8d9de3f4 | 1305 | static void kiblnd_destroy_fmr_pool(struct kib_fmr_pool *fpo) |
d7e09d03 | 1306 | { |
8daab0a4 | 1307 | LASSERT(!fpo->fpo_map_count); |
d7e09d03 | 1308 | |
80e05b34 DE |
1309 | if (fpo->fpo_is_fmr) { |
1310 | if (fpo->fmr.fpo_fmr_pool) | |
1311 | ib_destroy_fmr_pool(fpo->fmr.fpo_fmr_pool); | |
1312 | } else { | |
1313 | struct kib_fast_reg_descriptor *frd, *tmp; | |
1314 | int i = 0; | |
1315 | ||
1316 | list_for_each_entry_safe(frd, tmp, &fpo->fast_reg.fpo_pool_list, | |
1317 | frd_list) { | |
1318 | list_del(&frd->frd_list); | |
1319 | ib_dereg_mr(frd->frd_mr); | |
1320 | LIBCFS_FREE(frd, sizeof(*frd)); | |
1321 | i++; | |
1322 | } | |
1323 | if (i < fpo->fast_reg.fpo_pool_size) | |
1324 | CERROR("FastReg pool still has %d regions registered\n", | |
1325 | fpo->fast_reg.fpo_pool_size - i); | |
1326 | } | |
d7e09d03 | 1327 | |
8daab0a4 DE |
1328 | if (fpo->fpo_hdev) |
1329 | kiblnd_hdev_decref(fpo->fpo_hdev); | |
d7e09d03 | 1330 | |
8daab0a4 | 1331 | LIBCFS_FREE(fpo, sizeof(*fpo)); |
d7e09d03 PT |
1332 | } |
1333 | ||
febe73bd | 1334 | static void kiblnd_destroy_fmr_pool_list(struct list_head *head) |
d7e09d03 | 1335 | { |
8d9de3f4 | 1336 | struct kib_fmr_pool *fpo, *tmp; |
d7e09d03 | 1337 | |
0d33ec5f | 1338 | list_for_each_entry_safe(fpo, tmp, head, fpo_list) { |
8daab0a4 DE |
1339 | list_del(&fpo->fpo_list); |
1340 | kiblnd_destroy_fmr_pool(fpo); | |
d7e09d03 PT |
1341 | } |
1342 | } | |
1343 | ||
32c8deb8 AS |
1344 | static int |
1345 | kiblnd_fmr_pool_size(struct lnet_ioctl_config_o2iblnd_tunables *tunables, | |
1346 | int ncpts) | |
d7e09d03 | 1347 | { |
32c8deb8 | 1348 | int size = tunables->lnd_fmr_pool_size / ncpts; |
d7e09d03 PT |
1349 | |
1350 | return max(IBLND_FMR_POOL, size); | |
1351 | } | |
1352 | ||
32c8deb8 AS |
1353 | static int |
1354 | kiblnd_fmr_flush_trigger(struct lnet_ioctl_config_o2iblnd_tunables *tunables, | |
1355 | int ncpts) | |
d7e09d03 | 1356 | { |
32c8deb8 | 1357 | int size = tunables->lnd_fmr_flush_trigger / ncpts; |
d7e09d03 PT |
1358 | |
1359 | return max(IBLND_FMR_POOL_FLUSH, size); | |
1360 | } | |
1361 | ||
8d9de3f4 | 1362 | static int kiblnd_alloc_fmr_pool(struct kib_fmr_poolset *fps, struct kib_fmr_pool *fpo) |
d7e09d03 | 1363 | { |
d7e09d03 | 1364 | struct ib_fmr_pool_param param = { |
51078e25 | 1365 | .max_pages_per_fmr = LNET_MAX_PAYLOAD / PAGE_SIZE, |
ec3d17c0 MS |
1366 | .page_shift = PAGE_SHIFT, |
1367 | .access = (IB_ACCESS_LOCAL_WRITE | | |
e39f6efa | 1368 | IB_ACCESS_REMOTE_WRITE), |
ec3d17c0 | 1369 | .pool_size = fps->fps_pool_size, |
d7e09d03 PT |
1370 | .dirty_watermark = fps->fps_flush_trigger, |
1371 | .flush_function = NULL, | |
ec3d17c0 | 1372 | .flush_arg = NULL, |
32c8deb8 | 1373 | .cache = !!fps->fps_cache }; |
f66fb159 DE |
1374 | int rc = 0; |
1375 | ||
1376 | fpo->fmr.fpo_fmr_pool = ib_create_fmr_pool(fpo->fpo_hdev->ibh_pd, | |
1377 | ¶m); | |
1378 | if (IS_ERR(fpo->fmr.fpo_fmr_pool)) { | |
1379 | rc = PTR_ERR(fpo->fmr.fpo_fmr_pool); | |
1380 | if (rc != -ENOSYS) | |
1381 | CERROR("Failed to create FMR pool: %d\n", rc); | |
1382 | else | |
1383 | CERROR("FMRs are not supported\n"); | |
1384 | } | |
1385 | ||
1386 | return rc; | |
1387 | } | |
1388 | ||
8d9de3f4 | 1389 | static int kiblnd_alloc_freg_pool(struct kib_fmr_poolset *fps, struct kib_fmr_pool *fpo) |
80e05b34 DE |
1390 | { |
1391 | struct kib_fast_reg_descriptor *frd, *tmp; | |
1392 | int i, rc; | |
1393 | ||
1394 | INIT_LIST_HEAD(&fpo->fast_reg.fpo_pool_list); | |
1395 | fpo->fast_reg.fpo_pool_size = 0; | |
1396 | for (i = 0; i < fps->fps_pool_size; i++) { | |
1397 | LIBCFS_CPT_ALLOC(frd, lnet_cpt_table(), fps->fps_cpt, | |
1398 | sizeof(*frd)); | |
1399 | if (!frd) { | |
1400 | CERROR("Failed to allocate a new fast_reg descriptor\n"); | |
1401 | rc = -ENOMEM; | |
1402 | goto out; | |
1403 | } | |
1404 | ||
1405 | frd->frd_mr = ib_alloc_mr(fpo->fpo_hdev->ibh_pd, | |
1406 | IB_MR_TYPE_MEM_REG, | |
1407 | LNET_MAX_PAYLOAD / PAGE_SIZE); | |
1408 | if (IS_ERR(frd->frd_mr)) { | |
1409 | rc = PTR_ERR(frd->frd_mr); | |
1410 | CERROR("Failed to allocate ib_alloc_mr: %d\n", rc); | |
1411 | frd->frd_mr = NULL; | |
1412 | goto out_middle; | |
1413 | } | |
1414 | ||
1415 | frd->frd_valid = true; | |
1416 | ||
1417 | list_add_tail(&frd->frd_list, &fpo->fast_reg.fpo_pool_list); | |
1418 | fpo->fast_reg.fpo_pool_size++; | |
1419 | } | |
1420 | ||
1421 | return 0; | |
1422 | ||
1423 | out_middle: | |
1424 | if (frd->frd_mr) | |
1425 | ib_dereg_mr(frd->frd_mr); | |
1426 | LIBCFS_FREE(frd, sizeof(*frd)); | |
1427 | ||
1428 | out: | |
1429 | list_for_each_entry_safe(frd, tmp, &fpo->fast_reg.fpo_pool_list, | |
1430 | frd_list) { | |
1431 | list_del(&frd->frd_list); | |
1432 | ib_dereg_mr(frd->frd_mr); | |
1433 | LIBCFS_FREE(frd, sizeof(*frd)); | |
1434 | } | |
1435 | ||
1436 | return rc; | |
1437 | } | |
1438 | ||
8d9de3f4 JS |
1439 | static int kiblnd_create_fmr_pool(struct kib_fmr_poolset *fps, |
1440 | struct kib_fmr_pool **pp_fpo) | |
f66fb159 | 1441 | { |
8d9de3f4 | 1442 | struct kib_dev *dev = fps->fps_net->ibn_dev; |
80e05b34 | 1443 | struct ib_device_attr *dev_attr; |
8d9de3f4 | 1444 | struct kib_fmr_pool *fpo; |
d7e09d03 PT |
1445 | int rc; |
1446 | ||
1447 | LIBCFS_CPT_ALLOC(fpo, lnet_cpt_table(), fps->fps_cpt, sizeof(*fpo)); | |
06ace26e | 1448 | if (!fpo) |
d7e09d03 PT |
1449 | return -ENOMEM; |
1450 | ||
1451 | fpo->fpo_hdev = kiblnd_current_hdev(dev); | |
80e05b34 | 1452 | dev_attr = &fpo->fpo_hdev->ibh_ibdev->attrs; |
d7e09d03 | 1453 | |
80e05b34 DE |
1454 | /* Check for FMR or FastReg support */ |
1455 | fpo->fpo_is_fmr = 0; | |
f66fb159 DE |
1456 | if (fpo->fpo_hdev->ibh_ibdev->alloc_fmr && |
1457 | fpo->fpo_hdev->ibh_ibdev->dealloc_fmr && | |
1458 | fpo->fpo_hdev->ibh_ibdev->map_phys_fmr && | |
1459 | fpo->fpo_hdev->ibh_ibdev->unmap_fmr) { | |
1460 | LCONSOLE_INFO("Using FMR for registration\n"); | |
80e05b34 DE |
1461 | fpo->fpo_is_fmr = 1; |
1462 | } else if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { | |
1463 | LCONSOLE_INFO("Using FastReg for registration\n"); | |
f66fb159 DE |
1464 | } else { |
1465 | rc = -ENOSYS; | |
80e05b34 | 1466 | LCONSOLE_ERROR_MSG(rc, "IB device does not support FMRs nor FastRegs, can't register memory\n"); |
f66fb159 | 1467 | goto out_fpo; |
d7e09d03 PT |
1468 | } |
1469 | ||
80e05b34 DE |
1470 | if (fpo->fpo_is_fmr) |
1471 | rc = kiblnd_alloc_fmr_pool(fps, fpo); | |
1472 | else | |
1473 | rc = kiblnd_alloc_freg_pool(fps, fpo); | |
f66fb159 DE |
1474 | if (rc) |
1475 | goto out_fpo; | |
1476 | ||
d7e09d03 | 1477 | fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE); |
f66fb159 | 1478 | fpo->fpo_owner = fps; |
d7e09d03 PT |
1479 | *pp_fpo = fpo; |
1480 | ||
1481 | return 0; | |
f66fb159 DE |
1482 | |
1483 | out_fpo: | |
1484 | kiblnd_hdev_decref(fpo->fpo_hdev); | |
1485 | LIBCFS_FREE(fpo, sizeof(*fpo)); | |
1486 | return rc; | |
d7e09d03 PT |
1487 | } |
1488 | ||
8d9de3f4 | 1489 | static void kiblnd_fail_fmr_poolset(struct kib_fmr_poolset *fps, |
febe73bd | 1490 | struct list_head *zombies) |
d7e09d03 | 1491 | { |
06ace26e | 1492 | if (!fps->fps_net) /* intialized? */ |
d7e09d03 PT |
1493 | return; |
1494 | ||
1495 | spin_lock(&fps->fps_lock); | |
1496 | ||
1497 | while (!list_empty(&fps->fps_pool_list)) { | |
8d9de3f4 JS |
1498 | struct kib_fmr_pool *fpo = list_entry(fps->fps_pool_list.next, |
1499 | struct kib_fmr_pool, fpo_list); | |
d7e09d03 PT |
1500 | fpo->fpo_failed = 1; |
1501 | list_del(&fpo->fpo_list); | |
5fd88337 | 1502 | if (!fpo->fpo_map_count) |
d7e09d03 PT |
1503 | list_add(&fpo->fpo_list, zombies); |
1504 | else | |
1505 | list_add(&fpo->fpo_list, &fps->fps_failed_pool_list); | |
1506 | } | |
1507 | ||
1508 | spin_unlock(&fps->fps_lock); | |
1509 | } | |
1510 | ||
8d9de3f4 | 1511 | static void kiblnd_fini_fmr_poolset(struct kib_fmr_poolset *fps) |
d7e09d03 | 1512 | { |
06ace26e | 1513 | if (fps->fps_net) { /* initialized? */ |
d7e09d03 PT |
1514 | kiblnd_destroy_fmr_pool_list(&fps->fps_failed_pool_list); |
1515 | kiblnd_destroy_fmr_pool_list(&fps->fps_pool_list); | |
1516 | } | |
1517 | } | |
1518 | ||
32c8deb8 | 1519 | static int |
8d9de3f4 JS |
1520 | kiblnd_init_fmr_poolset(struct kib_fmr_poolset *fps, int cpt, int ncpts, |
1521 | struct kib_net *net, | |
32c8deb8 | 1522 | struct lnet_ioctl_config_o2iblnd_tunables *tunables) |
d7e09d03 | 1523 | { |
8d9de3f4 | 1524 | struct kib_fmr_pool *fpo; |
ec3d17c0 | 1525 | int rc; |
d7e09d03 | 1526 | |
a4e872f7 | 1527 | memset(fps, 0, sizeof(*fps)); |
d7e09d03 PT |
1528 | |
1529 | fps->fps_net = net; | |
1530 | fps->fps_cpt = cpt; | |
32c8deb8 AS |
1531 | |
1532 | fps->fps_pool_size = kiblnd_fmr_pool_size(tunables, ncpts); | |
1533 | fps->fps_flush_trigger = kiblnd_fmr_flush_trigger(tunables, ncpts); | |
1534 | fps->fps_cache = tunables->lnd_fmr_cache; | |
1535 | ||
d7e09d03 PT |
1536 | spin_lock_init(&fps->fps_lock); |
1537 | INIT_LIST_HEAD(&fps->fps_pool_list); | |
1538 | INIT_LIST_HEAD(&fps->fps_failed_pool_list); | |
1539 | ||
1540 | rc = kiblnd_create_fmr_pool(fps, &fpo); | |
5fd88337 | 1541 | if (!rc) |
d7e09d03 PT |
1542 | list_add_tail(&fpo->fpo_list, &fps->fps_pool_list); |
1543 | ||
1544 | return rc; | |
1545 | } | |
1546 | ||
8d9de3f4 | 1547 | static int kiblnd_fmr_pool_is_idle(struct kib_fmr_pool *fpo, unsigned long now) |
d7e09d03 | 1548 | { |
5fd88337 | 1549 | if (fpo->fpo_map_count) /* still in use */ |
d7e09d03 PT |
1550 | return 0; |
1551 | if (fpo->fpo_failed) | |
1552 | return 1; | |
1553 | return cfs_time_aftereq(now, fpo->fpo_deadline); | |
1554 | } | |
1555 | ||
80e05b34 | 1556 | static int |
8d9de3f4 | 1557 | kiblnd_map_tx_pages(struct kib_tx *tx, struct kib_rdma_desc *rd) |
80e05b34 DE |
1558 | { |
1559 | __u64 *pages = tx->tx_pages; | |
8d9de3f4 | 1560 | struct kib_hca_dev *hdev; |
80e05b34 DE |
1561 | int npages; |
1562 | int size; | |
1563 | int i; | |
1564 | ||
1565 | hdev = tx->tx_pool->tpo_hdev; | |
1566 | ||
1567 | for (i = 0, npages = 0; i < rd->rd_nfrags; i++) { | |
1568 | for (size = 0; size < rd->rd_frags[i].rf_nob; | |
1569 | size += hdev->ibh_page_size) { | |
1570 | pages[npages++] = (rd->rd_frags[i].rf_addr & | |
1571 | hdev->ibh_page_mask) + size; | |
1572 | } | |
1573 | } | |
1574 | ||
1575 | return npages; | |
1576 | } | |
1577 | ||
8d9de3f4 | 1578 | void kiblnd_fmr_pool_unmap(struct kib_fmr *fmr, int status) |
d7e09d03 | 1579 | { |
febe73bd | 1580 | LIST_HEAD(zombies); |
8d9de3f4 JS |
1581 | struct kib_fmr_pool *fpo = fmr->fmr_pool; |
1582 | struct kib_fmr_poolset *fps; | |
ec3d17c0 | 1583 | unsigned long now = cfs_time_current(); |
8d9de3f4 | 1584 | struct kib_fmr_pool *tmp; |
ec3d17c0 | 1585 | int rc; |
d7e09d03 | 1586 | |
1f199a0c DE |
1587 | if (!fpo) |
1588 | return; | |
d7e09d03 | 1589 | |
1f199a0c | 1590 | fps = fpo->fpo_owner; |
80e05b34 DE |
1591 | if (fpo->fpo_is_fmr) { |
1592 | if (fmr->fmr_pfmr) { | |
1593 | rc = ib_fmr_pool_unmap(fmr->fmr_pfmr); | |
1594 | LASSERT(!rc); | |
1595 | fmr->fmr_pfmr = NULL; | |
1596 | } | |
d7e09d03 | 1597 | |
80e05b34 DE |
1598 | if (status) { |
1599 | rc = ib_flush_fmr_pool(fpo->fmr.fpo_fmr_pool); | |
1600 | LASSERT(!rc); | |
1601 | } | |
1602 | } else { | |
1603 | struct kib_fast_reg_descriptor *frd = fmr->fmr_frd; | |
d7e09d03 | 1604 | |
80e05b34 DE |
1605 | if (frd) { |
1606 | frd->frd_valid = false; | |
1607 | spin_lock(&fps->fps_lock); | |
1608 | list_add_tail(&frd->frd_list, &fpo->fast_reg.fpo_pool_list); | |
1609 | spin_unlock(&fps->fps_lock); | |
1610 | fmr->fmr_frd = NULL; | |
1611 | } | |
1612 | } | |
d7e09d03 | 1613 | fmr->fmr_pool = NULL; |
d7e09d03 PT |
1614 | |
1615 | spin_lock(&fps->fps_lock); | |
74732797 | 1616 | fpo->fpo_map_count--; /* decref the pool */ |
d7e09d03 PT |
1617 | |
1618 | list_for_each_entry_safe(fpo, tmp, &fps->fps_pool_list, fpo_list) { | |
1619 | /* the first pool is persistent */ | |
1620 | if (fps->fps_pool_list.next == &fpo->fpo_list) | |
1621 | continue; | |
1622 | ||
1623 | if (kiblnd_fmr_pool_is_idle(fpo, now)) { | |
1624 | list_move(&fpo->fpo_list, &zombies); | |
74732797 | 1625 | fps->fps_version++; |
d7e09d03 PT |
1626 | } |
1627 | } | |
1628 | spin_unlock(&fps->fps_lock); | |
1629 | ||
1630 | if (!list_empty(&zombies)) | |
1631 | kiblnd_destroy_fmr_pool_list(&zombies); | |
1632 | } | |
1633 | ||
8d9de3f4 JS |
1634 | int kiblnd_fmr_pool_map(struct kib_fmr_poolset *fps, struct kib_tx *tx, |
1635 | struct kib_rdma_desc *rd, __u32 nob, __u64 iov, | |
1636 | struct kib_fmr *fmr) | |
d7e09d03 | 1637 | { |
80e05b34 DE |
1638 | __u64 *pages = tx->tx_pages; |
1639 | bool is_rx = (rd != tx->tx_rd); | |
1640 | bool tx_pages_mapped = 0; | |
8d9de3f4 | 1641 | struct kib_fmr_pool *fpo; |
80e05b34 | 1642 | int npages = 0; |
ec3d17c0 MS |
1643 | __u64 version; |
1644 | int rc; | |
d7e09d03 PT |
1645 | |
1646 | again: | |
1647 | spin_lock(&fps->fps_lock); | |
1648 | version = fps->fps_version; | |
1649 | list_for_each_entry(fpo, &fps->fps_pool_list, fpo_list) { | |
1650 | fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE); | |
1651 | fpo->fpo_map_count++; | |
d7e09d03 | 1652 | |
80e05b34 DE |
1653 | if (fpo->fpo_is_fmr) { |
1654 | struct ib_pool_fmr *pfmr; | |
1655 | ||
1656 | spin_unlock(&fps->fps_lock); | |
1657 | ||
1658 | if (!tx_pages_mapped) { | |
1659 | npages = kiblnd_map_tx_pages(tx, rd); | |
1660 | tx_pages_mapped = 1; | |
1661 | } | |
1662 | ||
1663 | pfmr = ib_fmr_pool_map_phys(fpo->fmr.fpo_fmr_pool, | |
1664 | pages, npages, iov); | |
1665 | if (likely(!IS_ERR(pfmr))) { | |
1666 | fmr->fmr_key = is_rx ? pfmr->fmr->rkey : | |
1667 | pfmr->fmr->lkey; | |
1668 | fmr->fmr_frd = NULL; | |
1669 | fmr->fmr_pfmr = pfmr; | |
1670 | fmr->fmr_pool = fpo; | |
1671 | return 0; | |
1672 | } | |
1673 | rc = PTR_ERR(pfmr); | |
1674 | } else { | |
1675 | if (!list_empty(&fpo->fast_reg.fpo_pool_list)) { | |
1676 | struct kib_fast_reg_descriptor *frd; | |
1677 | struct ib_reg_wr *wr; | |
1678 | struct ib_mr *mr; | |
1679 | int n; | |
1680 | ||
1681 | frd = list_first_entry(&fpo->fast_reg.fpo_pool_list, | |
1682 | struct kib_fast_reg_descriptor, | |
1683 | frd_list); | |
1684 | list_del(&frd->frd_list); | |
1685 | spin_unlock(&fps->fps_lock); | |
1686 | ||
1687 | mr = frd->frd_mr; | |
1688 | ||
1689 | if (!frd->frd_valid) { | |
1690 | __u32 key = is_rx ? mr->rkey : mr->lkey; | |
1691 | struct ib_send_wr *inv_wr; | |
1692 | ||
1693 | inv_wr = &frd->frd_inv_wr; | |
1694 | memset(inv_wr, 0, sizeof(*inv_wr)); | |
1695 | inv_wr->opcode = IB_WR_LOCAL_INV; | |
1696 | inv_wr->wr_id = IBLND_WID_MR; | |
1697 | inv_wr->ex.invalidate_rkey = key; | |
1698 | ||
1699 | /* Bump the key */ | |
1700 | key = ib_inc_rkey(key); | |
1701 | ib_update_fast_reg_key(mr, key); | |
1702 | } | |
1703 | ||
1704 | n = ib_map_mr_sg(mr, tx->tx_frags, | |
2f37dd13 | 1705 | tx->tx_nfrags, NULL, PAGE_SIZE); |
80e05b34 DE |
1706 | if (unlikely(n != tx->tx_nfrags)) { |
1707 | CERROR("Failed to map mr %d/%d elements\n", | |
1708 | n, tx->tx_nfrags); | |
1709 | return n < 0 ? n : -EINVAL; | |
1710 | } | |
1711 | ||
1712 | mr->iova = iov; | |
1713 | ||
1714 | /* Prepare FastReg WR */ | |
1715 | wr = &frd->frd_fastreg_wr; | |
1716 | memset(wr, 0, sizeof(*wr)); | |
1717 | wr->wr.opcode = IB_WR_REG_MR; | |
1718 | wr->wr.wr_id = IBLND_WID_MR; | |
1719 | wr->wr.num_sge = 0; | |
1720 | wr->wr.send_flags = 0; | |
1721 | wr->mr = mr; | |
1722 | wr->key = is_rx ? mr->rkey : mr->lkey; | |
1723 | wr->access = (IB_ACCESS_LOCAL_WRITE | | |
1724 | IB_ACCESS_REMOTE_WRITE); | |
1725 | ||
1726 | fmr->fmr_key = is_rx ? mr->rkey : mr->lkey; | |
1727 | fmr->fmr_frd = frd; | |
1728 | fmr->fmr_pfmr = NULL; | |
1729 | fmr->fmr_pool = fpo; | |
1730 | return 0; | |
1731 | } | |
1732 | spin_unlock(&fps->fps_lock); | |
1733 | rc = -EBUSY; | |
d7e09d03 PT |
1734 | } |
1735 | ||
1736 | spin_lock(&fps->fps_lock); | |
1737 | fpo->fpo_map_count--; | |
c1b2e0b5 | 1738 | if (rc != -EAGAIN) { |
d7e09d03 | 1739 | spin_unlock(&fps->fps_lock); |
c1b2e0b5 | 1740 | return rc; |
d7e09d03 PT |
1741 | } |
1742 | ||
1743 | /* EAGAIN and the pool list has changed since we started: retry */ |
1744 | if (version != fps->fps_version) { | |
1745 | spin_unlock(&fps->fps_lock); | |
1746 | goto again; | |
1747 | } | |
1748 | } | |
1749 | ||
1750 | if (fps->fps_increasing) { | |
1751 | spin_unlock(&fps->fps_lock); | |
c314c319 | 1752 | CDEBUG(D_NET, "Another thread is allocating a new FMR pool, waiting for it to complete\n"); |
d7e09d03 PT |
1753 | schedule(); |
1754 | goto again; | |
d7e09d03 PT |
1755 | } |
1756 | ||
699503bc | 1757 | if (time_before(cfs_time_current(), fps->fps_next_retry)) { |
d7e09d03 PT |
1758 | /* someone failed recently */ |
1759 | spin_unlock(&fps->fps_lock); | |
1760 | return -EAGAIN; | |
1761 | } | |
1762 | ||
1763 | fps->fps_increasing = 1; | |
1764 | spin_unlock(&fps->fps_lock); | |
1765 | ||
1766 | CDEBUG(D_NET, "Allocate new FMR pool\n"); | |
1767 | rc = kiblnd_create_fmr_pool(fps, &fpo); | |
1768 | spin_lock(&fps->fps_lock); | |
1769 | fps->fps_increasing = 0; | |
5fd88337 | 1770 | if (!rc) { |
d7e09d03 PT |
1771 | fps->fps_version++; |
1772 | list_add_tail(&fpo->fpo_list, &fps->fps_pool_list); | |
1773 | } else { | |
1774 | fps->fps_next_retry = cfs_time_shift(IBLND_POOL_RETRY); | |
1775 | } | |
1776 | spin_unlock(&fps->fps_lock); | |
1777 | ||
1778 | goto again; | |
1779 | } | |
1780 | ||
8d9de3f4 | 1781 | static void kiblnd_fini_pool(struct kib_pool *pool) |
d7e09d03 | 1782 | { |
febe73bd | 1783 | LASSERT(list_empty(&pool->po_free_list)); |
5fd88337 | 1784 | LASSERT(!pool->po_allocated); |
d7e09d03 PT |
1785 | |
1786 | CDEBUG(D_NET, "Finalize %s pool\n", pool->po_owner->ps_name); | |
1787 | } | |
1788 | ||
8d9de3f4 | 1789 | static void kiblnd_init_pool(struct kib_poolset *ps, struct kib_pool *pool, int size) |
d7e09d03 PT |
1790 | { |
1791 | CDEBUG(D_NET, "Initialize %s pool\n", ps->ps_name); | |
1792 | ||
a4e872f7 | 1793 | memset(pool, 0, sizeof(*pool)); |
d7e09d03 PT |
1794 | INIT_LIST_HEAD(&pool->po_free_list); |
1795 | pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE); | |
1796 | pool->po_owner = ps; | |
1797 | pool->po_size = size; | |
1798 | } | |
1799 | ||
febe73bd | 1800 | static void kiblnd_destroy_pool_list(struct list_head *head) |
d7e09d03 | 1801 | { |
8d9de3f4 | 1802 | struct kib_pool *pool; |
d7e09d03 PT |
1803 | |
1804 | while (!list_empty(head)) { | |
8d9de3f4 | 1805 | pool = list_entry(head->next, struct kib_pool, po_list); |
d7e09d03 PT |
1806 | list_del(&pool->po_list); |
1807 | ||
06ace26e | 1808 | LASSERT(pool->po_owner); |
d7e09d03 PT |
1809 | pool->po_owner->ps_pool_destroy(pool); |
1810 | } | |
1811 | } | |
1812 | ||
8d9de3f4 | 1813 | static void kiblnd_fail_poolset(struct kib_poolset *ps, struct list_head *zombies) |
d7e09d03 | 1814 | { |
06ace26e | 1815 | if (!ps->ps_net) /* initialized? */ |
d7e09d03 PT |
1816 | return; |
1817 | ||
1818 | spin_lock(&ps->ps_lock); | |
1819 | while (!list_empty(&ps->ps_pool_list)) { | |
8d9de3f4 JS |
1820 | struct kib_pool *po = list_entry(ps->ps_pool_list.next, |
1821 | struct kib_pool, po_list); | |
d7e09d03 PT |
1822 | po->po_failed = 1; |
1823 | list_del(&po->po_list); | |
5fd88337 | 1824 | if (!po->po_allocated) |
d7e09d03 PT |
1825 | list_add(&po->po_list, zombies); |
1826 | else | |
1827 | list_add(&po->po_list, &ps->ps_failed_pool_list); | |
1828 | } | |
1829 | spin_unlock(&ps->ps_lock); | |
1830 | } | |
1831 | ||
8d9de3f4 | 1832 | static void kiblnd_fini_poolset(struct kib_poolset *ps) |
d7e09d03 | 1833 | { |
06ace26e | 1834 | if (ps->ps_net) { /* initialized? */ |
d7e09d03 PT |
1835 | kiblnd_destroy_pool_list(&ps->ps_failed_pool_list); |
1836 | kiblnd_destroy_pool_list(&ps->ps_pool_list); | |
1837 | } | |
1838 | } | |
1839 | ||
8d9de3f4 JS |
1840 | static int kiblnd_init_poolset(struct kib_poolset *ps, int cpt, |
1841 | struct kib_net *net, char *name, int size, | |
febe73bd GM |
1842 | kib_ps_pool_create_t po_create, |
1843 | kib_ps_pool_destroy_t po_destroy, | |
1844 | kib_ps_node_init_t nd_init, | |
1845 | kib_ps_node_fini_t nd_fini) | |
d7e09d03 | 1846 | { |
8d9de3f4 | 1847 | struct kib_pool *pool; |
ec3d17c0 | 1848 | int rc; |
d7e09d03 | 1849 | |
a4e872f7 | 1850 | memset(ps, 0, sizeof(*ps)); |
d7e09d03 | 1851 | |
ec3d17c0 MS |
1852 | ps->ps_cpt = cpt; |
1853 | ps->ps_net = net; | |
d7e09d03 PT |
1854 | ps->ps_pool_create = po_create; |
1855 | ps->ps_pool_destroy = po_destroy; | |
1856 | ps->ps_node_init = nd_init; | |
1857 | ps->ps_node_fini = nd_fini; | |
1858 | ps->ps_pool_size = size; | |
1859 | if (strlcpy(ps->ps_name, name, sizeof(ps->ps_name)) | |
1860 | >= sizeof(ps->ps_name)) | |
1861 | return -E2BIG; | |
1862 | spin_lock_init(&ps->ps_lock); | |
1863 | INIT_LIST_HEAD(&ps->ps_pool_list); | |
1864 | INIT_LIST_HEAD(&ps->ps_failed_pool_list); | |
1865 | ||
1866 | rc = ps->ps_pool_create(ps, size, &pool); | |
5fd88337 | 1867 | if (!rc) |
d7e09d03 PT |
1868 | list_add(&pool->po_list, &ps->ps_pool_list); |
1869 | else | |
1870 | CERROR("Failed to create the first pool for %s\n", ps->ps_name); | |
1871 | ||
1872 | return rc; | |
1873 | } | |
1874 | ||
8d9de3f4 | 1875 | static int kiblnd_pool_is_idle(struct kib_pool *pool, unsigned long now) |
d7e09d03 | 1876 | { |
5fd88337 | 1877 | if (pool->po_allocated) /* still in use */ |
d7e09d03 PT |
1878 | return 0; |
1879 | if (pool->po_failed) | |
1880 | return 1; | |
1881 | return cfs_time_aftereq(now, pool->po_deadline); | |
1882 | } | |
1883 | ||
8d9de3f4 | 1884 | void kiblnd_pool_free_node(struct kib_pool *pool, struct list_head *node) |
d7e09d03 | 1885 | { |
febe73bd | 1886 | LIST_HEAD(zombies); |
8d9de3f4 JS |
1887 | struct kib_poolset *ps = pool->po_owner; |
1888 | struct kib_pool *tmp; | |
ec3d17c0 | 1889 | unsigned long now = cfs_time_current(); |
d7e09d03 PT |
1890 | |
1891 | spin_lock(&ps->ps_lock); | |
1892 | ||
06ace26e | 1893 | if (ps->ps_node_fini) |
d7e09d03 PT |
1894 | ps->ps_node_fini(pool, node); |
1895 | ||
febe73bd | 1896 | LASSERT(pool->po_allocated > 0); |
d7e09d03 | 1897 | list_add(node, &pool->po_free_list); |
74732797 | 1898 | pool->po_allocated--; |
d7e09d03 PT |
1899 | |
1900 | list_for_each_entry_safe(pool, tmp, &ps->ps_pool_list, po_list) { | |
1901 | /* the first pool is persistent */ | |
1902 | if (ps->ps_pool_list.next == &pool->po_list) | |
1903 | continue; | |
1904 | ||
1905 | if (kiblnd_pool_is_idle(pool, now)) | |
1906 | list_move(&pool->po_list, &zombies); | |
1907 | } | |
1908 | spin_unlock(&ps->ps_lock); | |
1909 | ||
1910 | if (!list_empty(&zombies)) | |
1911 | kiblnd_destroy_pool_list(&zombies); | |
1912 | } | |
1913 | ||
8d9de3f4 | 1914 | struct list_head *kiblnd_pool_alloc_node(struct kib_poolset *ps) |
d7e09d03 | 1915 | { |
ec3d17c0 | 1916 | struct list_head *node; |
8d9de3f4 | 1917 | struct kib_pool *pool; |
ea363b41 LZ |
1918 | unsigned int interval = 1; |
1919 | unsigned long time_before; | |
1920 | unsigned int trips = 0; | |
ec3d17c0 | 1921 | int rc; |
d7e09d03 PT |
1922 | |
1923 | again: | |
1924 | spin_lock(&ps->ps_lock); | |
1925 | list_for_each_entry(pool, &ps->ps_pool_list, po_list) { | |
1926 | if (list_empty(&pool->po_free_list)) | |
1927 | continue; | |
1928 | ||
74732797 | 1929 | pool->po_allocated++; |
d7e09d03 PT |
1930 | pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE); |
1931 | node = pool->po_free_list.next; | |
1932 | list_del(node); | |
1933 | ||
06ace26e | 1934 | if (ps->ps_node_init) { |
d7e09d03 PT |
1935 | /* still hold the lock */ |
1936 | ps->ps_node_init(pool, node); | |
1937 | } | |
1938 | spin_unlock(&ps->ps_lock); | |
1939 | return node; | |
1940 | } | |
1941 | ||
1942 | /* no available tx pool and ... */ | |
1943 | if (ps->ps_increasing) { | |
1944 | /* another thread is allocating a new pool */ | |
1945 | spin_unlock(&ps->ps_lock); | |
ea363b41 LZ |
1946 | trips++; |
1947 | CDEBUG(D_NET, "Another thread is allocating a new %s pool, waiting %d jiffies for it to complete. trips = %d\n",
1948 | ps->ps_name, interval, trips); | |
1949 | ||
1950 | set_current_state(TASK_INTERRUPTIBLE); | |
1951 | schedule_timeout(interval); | |
1952 | if (interval < cfs_time_seconds(1)) | |
1953 | interval *= 2; | |
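/*
 * Illustrative timing: the wait starts at one jiffy and doubles on each
 * trip until it reaches roughly one second, so waiters back off
 * exponentially instead of spinning while another thread grows the pool.
 */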
1954 | ||
d7e09d03 PT |
1955 | goto again; |
1956 | } | |
1957 | ||
699503bc | 1958 | if (time_before(cfs_time_current(), ps->ps_next_retry)) { |
d7e09d03 PT |
1959 | /* someone failed recently */ |
1960 | spin_unlock(&ps->ps_lock); | |
1961 | return NULL; | |
1962 | } | |
1963 | ||
1964 | ps->ps_increasing = 1; | |
1965 | spin_unlock(&ps->ps_lock); | |
1966 | ||
1967 | CDEBUG(D_NET, "%s pool exhausted, allocating new pool\n", ps->ps_name); |
ea363b41 | 1968 | time_before = cfs_time_current(); |
d7e09d03 | 1969 | rc = ps->ps_pool_create(ps, ps->ps_pool_size, &pool); |
ea363b41 LZ |
1970 | CDEBUG(D_NET, "ps_pool_create took %lu jiffies to complete\n", |
1971 | cfs_time_current() - time_before); | |
d7e09d03 PT |
1972 | |
1973 | spin_lock(&ps->ps_lock); | |
1974 | ps->ps_increasing = 0; | |
5fd88337 | 1975 | if (!rc) { |
d7e09d03 PT |
1976 | list_add_tail(&pool->po_list, &ps->ps_pool_list); |
1977 | } else { | |
1978 | ps->ps_next_retry = cfs_time_shift(IBLND_POOL_RETRY); | |
1979 | CERROR("Can't allocate new %s pool: out of memory\n", |
1980 | ps->ps_name); | |
1981 | } | |
1982 | spin_unlock(&ps->ps_lock); | |
1983 | ||
1984 | goto again; | |
1985 | } | |
1986 | ||
8d9de3f4 | 1987 | static void kiblnd_destroy_tx_pool(struct kib_pool *pool) |
d7e09d03 | 1988 | { |
8d9de3f4 | 1989 | struct kib_tx_pool *tpo = container_of(pool, struct kib_tx_pool, tpo_pool); |
ec3d17c0 | 1990 | int i; |
d7e09d03 | 1991 | |
5fd88337 | 1992 | LASSERT(!pool->po_allocated); |
d7e09d03 | 1993 | |
06ace26e | 1994 | if (tpo->tpo_tx_pages) { |
d7e09d03 PT |
1995 | kiblnd_unmap_tx_pool(tpo); |
1996 | kiblnd_free_pages(tpo->tpo_tx_pages); | |
1997 | } | |
1998 | ||
06ace26e | 1999 | if (!tpo->tpo_tx_descs) |
d7e09d03 PT |
2000 | goto out; |
2001 | ||
2002 | for (i = 0; i < pool->po_size; i++) { | |
8d9de3f4 | 2003 | struct kib_tx *tx = &tpo->tpo_tx_descs[i]; |
d7e09d03 PT |
2004 | |
2005 | list_del(&tx->tx_list); | |
06ace26e | 2006 | if (tx->tx_pages) |
d7e09d03 PT |
2007 | LIBCFS_FREE(tx->tx_pages, |
2008 | LNET_MAX_IOV * | |
2009 | sizeof(*tx->tx_pages)); | |
06ace26e | 2010 | if (tx->tx_frags) |
d7e09d03 | 2011 | LIBCFS_FREE(tx->tx_frags, |
147280d8 JS |
2012 | (1 + IBLND_MAX_RDMA_FRAGS) * |
2013 | sizeof(*tx->tx_frags)); | |
06ace26e | 2014 | if (tx->tx_wrq) |
d7e09d03 PT |
2015 | LIBCFS_FREE(tx->tx_wrq, |
2016 | (1 + IBLND_MAX_RDMA_FRAGS) * | |
2017 | sizeof(*tx->tx_wrq)); | |
06ace26e | 2018 | if (tx->tx_sge) |
d7e09d03 PT |
2019 | LIBCFS_FREE(tx->tx_sge, |
2020 | (1 + IBLND_MAX_RDMA_FRAGS) * | |
2021 | sizeof(*tx->tx_sge)); | |
06ace26e | 2022 | if (tx->tx_rd) |
d7e09d03 | 2023 | LIBCFS_FREE(tx->tx_rd, |
8d9de3f4 | 2024 | offsetof(struct kib_rdma_desc, |
d7e09d03 PT |
2025 | rd_frags[IBLND_MAX_RDMA_FRAGS])); |
2026 | } | |
2027 | ||
2028 | LIBCFS_FREE(tpo->tpo_tx_descs, | |
8d9de3f4 | 2029 | pool->po_size * sizeof(struct kib_tx)); |
d7e09d03 PT |
2030 | out: |
2031 | kiblnd_fini_pool(pool); | |
a4e872f7 | 2032 | LIBCFS_FREE(tpo, sizeof(*tpo)); |
d7e09d03 PT |
2033 | } |
2034 | ||
2035 | static int kiblnd_tx_pool_size(int ncpts) | |
2036 | { | |
2037 | int ntx = *kiblnd_tunables.kib_ntx / ncpts; | |
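/*
 * Illustrative sizing, with assumed values: if kib_ntx were 512 and
 * there were 4 CPTs, ntx would be 128 here, and the max() below keeps
 * each per-CPT pool at least IBLND_TX_POOL descriptors deep.
 */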
2038 | ||
2039 | return max(IBLND_TX_POOL, ntx); | |
2040 | } | |
2041 | ||
8d9de3f4 JS |
2042 | static int kiblnd_create_tx_pool(struct kib_poolset *ps, int size, |
2043 | struct kib_pool **pp_po) | |
d7e09d03 | 2044 | { |
ec3d17c0 MS |
2045 | int i; |
2046 | int npg; | |
8d9de3f4 JS |
2047 | struct kib_pool *pool; |
2048 | struct kib_tx_pool *tpo; | |
d7e09d03 PT |
2049 | |
2050 | LIBCFS_CPT_ALLOC(tpo, lnet_cpt_table(), ps->ps_cpt, sizeof(*tpo)); | |
06ace26e | 2051 | if (!tpo) { |
d7e09d03 PT |
2052 | CERROR("Failed to allocate TX pool\n"); |
2053 | return -ENOMEM; | |
2054 | } | |
2055 | ||
2056 | pool = &tpo->tpo_pool; | |
2057 | kiblnd_init_pool(ps, pool, size); | |
2058 | tpo->tpo_tx_descs = NULL; | |
2059 | tpo->tpo_tx_pages = NULL; | |
2060 | ||
2061 | npg = (size * IBLND_MSG_SIZE + PAGE_SIZE - 1) / PAGE_SIZE; | |
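/*
 * Round-up division, illustrated with assumed numbers: a 9000-byte
 * total with 4096-byte pages gives npg = (9000 + 4095) / 4096 = 3.
 */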
5fd88337 | 2062 | if (kiblnd_alloc_pages(&tpo->tpo_tx_pages, ps->ps_cpt, npg)) { |
d7e09d03 | 2063 | CERROR("Can't allocate tx pages: %d\n", npg); |
a4e872f7 | 2064 | LIBCFS_FREE(tpo, sizeof(*tpo)); |
d7e09d03 PT |
2065 | return -ENOMEM; |
2066 | } | |
2067 | ||
2068 | LIBCFS_CPT_ALLOC(tpo->tpo_tx_descs, lnet_cpt_table(), ps->ps_cpt, | |
8d9de3f4 | 2069 | size * sizeof(struct kib_tx)); |
06ace26e | 2070 | if (!tpo->tpo_tx_descs) { |
d7e09d03 PT |
2071 | CERROR("Can't allocate %d tx descriptors\n", size); |
2072 | ps->ps_pool_destroy(pool); | |
2073 | return -ENOMEM; | |
2074 | } | |
2075 | ||
8d9de3f4 | 2076 | memset(tpo->tpo_tx_descs, 0, size * sizeof(struct kib_tx)); |
d7e09d03 PT |
2077 | |
2078 | for (i = 0; i < size; i++) { | |
8d9de3f4 | 2079 | struct kib_tx *tx = &tpo->tpo_tx_descs[i]; |
d7e09d03 PT |
2080 | |
2081 | tx->tx_pool = tpo; | |
06ace26e | 2082 | if (ps->ps_net->ibn_fmr_ps) { |
d7e09d03 PT |
2083 | LIBCFS_CPT_ALLOC(tx->tx_pages, |
2084 | lnet_cpt_table(), ps->ps_cpt, | |
2085 | LNET_MAX_IOV * sizeof(*tx->tx_pages)); | |
06ace26e | 2086 | if (!tx->tx_pages) |
d7e09d03 PT |
2087 | break; |
2088 | } | |
2089 | ||
2090 | LIBCFS_CPT_ALLOC(tx->tx_frags, lnet_cpt_table(), ps->ps_cpt, | |
147280d8 JS |
2091 | (1 + IBLND_MAX_RDMA_FRAGS) * |
2092 | sizeof(*tx->tx_frags)); | |
06ace26e | 2093 | if (!tx->tx_frags) |
d7e09d03 PT |
2094 | break; |
2095 | ||
147280d8 | 2096 | sg_init_table(tx->tx_frags, IBLND_MAX_RDMA_FRAGS + 1); |
d7e09d03 PT |
2097 | |
2098 | LIBCFS_CPT_ALLOC(tx->tx_wrq, lnet_cpt_table(), ps->ps_cpt, | |
2099 | (1 + IBLND_MAX_RDMA_FRAGS) * | |
2100 | sizeof(*tx->tx_wrq)); | |
06ace26e | 2101 | if (!tx->tx_wrq) |
d7e09d03 PT |
2102 | break; |
2103 | ||
2104 | LIBCFS_CPT_ALLOC(tx->tx_sge, lnet_cpt_table(), ps->ps_cpt, | |
2105 | (1 + IBLND_MAX_RDMA_FRAGS) * | |
2106 | sizeof(*tx->tx_sge)); | |
06ace26e | 2107 | if (!tx->tx_sge) |
d7e09d03 PT |
2108 | break; |
2109 | ||
2110 | LIBCFS_CPT_ALLOC(tx->tx_rd, lnet_cpt_table(), ps->ps_cpt, | |
8d9de3f4 | 2111 | offsetof(struct kib_rdma_desc, |
d7e09d03 | 2112 | rd_frags[IBLND_MAX_RDMA_FRAGS])); |
06ace26e | 2113 | if (!tx->tx_rd) |
d7e09d03 PT |
2114 | break; |
2115 | } | |
2116 | ||
2117 | if (i == size) { | |
2118 | kiblnd_map_tx_pool(tpo); | |
2119 | *pp_po = pool; | |
2120 | return 0; | |
2121 | } | |
2122 | ||
2123 | ps->ps_pool_destroy(pool); | |
2124 | return -ENOMEM; | |
2125 | } | |
2126 | ||
8d9de3f4 | 2127 | static void kiblnd_tx_init(struct kib_pool *pool, struct list_head *node) |
d7e09d03 | 2128 | { |
8d9de3f4 JS |
2129 | struct kib_tx_poolset *tps = container_of(pool->po_owner, |
2130 | struct kib_tx_poolset, | |
2131 | tps_poolset); | |
2132 | struct kib_tx *tx = list_entry(node, struct kib_tx, tx_list); | |
d7e09d03 | 2133 | |
74732797 | 2134 | tx->tx_cookie = tps->tps_next_tx_cookie++; |
d7e09d03 PT |
2135 | } |
2136 | ||
8d9de3f4 | 2137 | static void kiblnd_net_fini_pools(struct kib_net *net) |
d7e09d03 | 2138 | { |
ec3d17c0 | 2139 | int i; |
d7e09d03 PT |
2140 | |
2141 | cfs_cpt_for_each(i, lnet_cpt_table()) { | |
8d9de3f4 JS |
2142 | struct kib_tx_poolset *tps; |
2143 | struct kib_fmr_poolset *fps; | |
d7e09d03 | 2144 | |
06ace26e | 2145 | if (net->ibn_tx_ps) { |
d7e09d03 PT |
2146 | tps = net->ibn_tx_ps[i]; |
2147 | kiblnd_fini_poolset(&tps->tps_poolset); | |
2148 | } | |
2149 | ||
06ace26e | 2150 | if (net->ibn_fmr_ps) { |
d7e09d03 PT |
2151 | fps = net->ibn_fmr_ps[i]; |
2152 | kiblnd_fini_fmr_poolset(fps); | |
2153 | } | |
d7e09d03 PT |
2154 | } |
2155 | ||
06ace26e | 2156 | if (net->ibn_tx_ps) { |
d7e09d03 PT |
2157 | cfs_percpt_free(net->ibn_tx_ps); |
2158 | net->ibn_tx_ps = NULL; | |
2159 | } | |
2160 | ||
06ace26e | 2161 | if (net->ibn_fmr_ps) { |
d7e09d03 PT |
2162 | cfs_percpt_free(net->ibn_fmr_ps); |
2163 | net->ibn_fmr_ps = NULL; | |
2164 | } | |
d7e09d03 PT |
2165 | } |
2166 | ||
8d9de3f4 | 2167 | static int kiblnd_net_init_pools(struct kib_net *net, lnet_ni_t *ni, __u32 *cpts, |
32c8deb8 | 2168 | int ncpts) |
d7e09d03 | 2169 | { |
32c8deb8 | 2170 | struct lnet_ioctl_config_o2iblnd_tunables *tunables; |
ec3d17c0 MS |
2171 | unsigned long flags; |
2172 | int cpt; | |
32c8deb8 | 2173 | int rc; |
ec3d17c0 | 2174 | int i; |
d7e09d03 | 2175 | |
32c8deb8 AS |
2176 | tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib; |
2177 | ||
d7e09d03 | 2178 | read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); |
32c8deb8 | 2179 | if (!tunables->lnd_map_on_demand) { |
ec3d17c0 | 2180 | read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); |
d7e09d03 PT |
2181 | goto create_tx_pool; |
2182 | } | |
2183 | ||
2184 | read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); | |
2185 | ||
32c8deb8 | 2186 | if (tunables->lnd_fmr_pool_size < *kiblnd_tunables.kib_ntx / 4) { |
d7e09d03 | 2187 | CERROR("Can't set fmr pool size (%d) < ntx / 4 (%d)\n", |
32c8deb8 | 2188 | tunables->lnd_fmr_pool_size, |
d7e09d03 PT |
2189 | *kiblnd_tunables.kib_ntx / 4); |
2190 | rc = -EINVAL; | |
2191 | goto failed; | |
2192 | } | |
2193 | ||
415bcb5c OD |
2194 | /* |
2195 | * TX pool must be created later than FMR, see LU-2268 | |
2196 | * for details | |
2197 | */ | |
06ace26e | 2198 | LASSERT(!net->ibn_tx_ps); |
d7e09d03 | 2199 | |
415bcb5c OD |
2200 | /* |
2201 | * premapping can fail if ibd_nmr > 1, so we always create | |
2202 | * an FMR pool and use map-on-demand if premapping failed |
7e221b60 JS |
2203 | * |
2204 | * cfs_percpt_alloc creates an array of struct kib_fmr_poolset. |
2205 | * The number of struct kib_fmr_poolsets created is equal to the |
2206 | * number of CPTs that exist, i.e. net->ibn_fmr_ps[cpt]. |
415bcb5c | 2207 | */ |
d7e09d03 | 2208 | net->ibn_fmr_ps = cfs_percpt_alloc(lnet_cpt_table(), |
8d9de3f4 | 2209 | sizeof(struct kib_fmr_poolset)); |
06ace26e | 2210 | if (!net->ibn_fmr_ps) { |
d7e09d03 PT |
2211 | CERROR("Failed to allocate FMR pool array\n"); |
2212 | rc = -ENOMEM; | |
2213 | goto failed; | |
2214 | } | |
2215 | ||
2216 | for (i = 0; i < ncpts; i++) { | |
06ace26e | 2217 | cpt = !cpts ? i : cpts[i]; |
32c8deb8 AS |
2218 | rc = kiblnd_init_fmr_poolset(net->ibn_fmr_ps[cpt], cpt, ncpts, |
2219 | net, tunables); | |
7cadcc7c | 2220 | if (rc) { |
d7e09d03 PT |
2221 | CERROR("Can't initialize FMR pool for CPT %d: %d\n", |
2222 | cpt, rc); | |
2223 | goto failed; | |
2224 | } | |
2225 | } | |
2226 | ||
7cadcc7c | 2227 | if (i > 0) |
d7e09d03 | 2228 | LASSERT(i == ncpts); |
d7e09d03 PT |
2229 | |
2230 | create_tx_pool: | |
7e221b60 JS |
2231 | /* |
2232 | * cfs_percpt_alloc creates an array of struct kib_tx_poolset. |
2233 | * The number of struct kib_tx_poolsets created is equal to the |
2234 | * number of CPTs that exist, i.e. net->ibn_tx_ps[cpt]. |
2235 | */ | |
d7e09d03 | 2236 | net->ibn_tx_ps = cfs_percpt_alloc(lnet_cpt_table(), |
8d9de3f4 | 2237 | sizeof(struct kib_tx_poolset)); |
06ace26e | 2238 | if (!net->ibn_tx_ps) { |
d7e09d03 PT |
2239 | CERROR("Failed to allocate tx pool array\n"); |
2240 | rc = -ENOMEM; | |
2241 | goto failed; | |
2242 | } | |
2243 | ||
2244 | for (i = 0; i < ncpts; i++) { | |
06ace26e | 2245 | cpt = !cpts ? i : cpts[i]; |
d7e09d03 PT |
2246 | rc = kiblnd_init_poolset(&net->ibn_tx_ps[cpt]->tps_poolset, |
2247 | cpt, net, "TX", | |
2248 | kiblnd_tx_pool_size(ncpts), | |
2249 | kiblnd_create_tx_pool, | |
2250 | kiblnd_destroy_tx_pool, | |
2251 | kiblnd_tx_init, NULL); | |
5fd88337 | 2252 | if (rc) { |
d7e09d03 PT |
2253 | CERROR("Can't initialize TX pool for CPT %d: %d\n", |
2254 | cpt, rc); | |
2255 | goto failed; | |
2256 | } | |
2257 | } | |
2258 | ||
2259 | return 0; | |
2260 | failed: | |
2261 | kiblnd_net_fini_pools(net); | |
5fd88337 | 2262 | LASSERT(rc); |
d7e09d03 PT |
2263 | return rc; |
2264 | } | |
2265 | ||
8d9de3f4 | 2266 | static int kiblnd_hdev_get_attr(struct kib_hca_dev *hdev) |
d7e09d03 | 2267 | { |
4420cfd3 JS |
2268 | /* |
2269 | * It's safe to assume an HCA can handle a page size |
2270 | * matching that of the native system | |
2271 | */ | |
d7e09d03 PT |
2272 | hdev->ibh_page_shift = PAGE_SHIFT; |
2273 | hdev->ibh_page_size = 1 << PAGE_SHIFT; | |
2274 | hdev->ibh_page_mask = ~((__u64)hdev->ibh_page_size - 1); | |
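/*
 * Worked example, assuming the common 4 KiB page size: ibh_page_size
 * is then 0x1000 and ibh_page_mask is ~0xfffULL = 0xfffffffffffff000,
 * so (addr & ibh_page_mask) rounds an address down to its page start.
 */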
2275 | ||
cebfe5ca | 2276 | hdev->ibh_mr_size = hdev->ibh_ibdev->attrs.max_mr_size; |
d7e09d03 PT |
2277 | if (hdev->ibh_mr_size == ~0ULL) { |
2278 | hdev->ibh_mr_shift = 64; | |
2279 | return 0; | |
2280 | } | |
2281 | ||
55f5a824 | 2282 | CERROR("Invalid mr size: %#llx\n", hdev->ibh_mr_size); |
d7e09d03 PT |
2283 | return -EINVAL; |
2284 | } | |
2285 | ||
8d9de3f4 | 2286 | static void kiblnd_hdev_cleanup_mrs(struct kib_hca_dev *hdev) |
d7e09d03 | 2287 | { |
7cadcc7c | 2288 | if (!hdev->ibh_mrs) |
d7e09d03 PT |
2289 | return; |
2290 | ||
7cadcc7c | 2291 | ib_dereg_mr(hdev->ibh_mrs); |
d7e09d03 | 2292 | |
7cadcc7c | 2293 | hdev->ibh_mrs = NULL; |
d7e09d03 PT |
2294 | } |
2295 | ||
8d9de3f4 | 2296 | void kiblnd_hdev_destroy(struct kib_hca_dev *hdev) |
d7e09d03 PT |
2297 | { |
2298 | kiblnd_hdev_cleanup_mrs(hdev); | |
2299 | ||
06ace26e | 2300 | if (hdev->ibh_pd) |
d7e09d03 PT |
2301 | ib_dealloc_pd(hdev->ibh_pd); |
2302 | ||
06ace26e | 2303 | if (hdev->ibh_cmid) |
d7e09d03 PT |
2304 | rdma_destroy_id(hdev->ibh_cmid); |
2305 | ||
2306 | LIBCFS_FREE(hdev, sizeof(*hdev)); | |
2307 | } | |
2308 | ||
8d9de3f4 | 2309 | static int kiblnd_hdev_setup_mrs(struct kib_hca_dev *hdev) |
d7e09d03 PT |
2310 | { |
2311 | struct ib_mr *mr; | |
ec3d17c0 | 2312 | int rc; |
ec3d17c0 | 2313 | int acflags = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE; |
d7e09d03 PT |
2314 | |
2315 | rc = kiblnd_hdev_get_attr(hdev); | |
5fd88337 | 2316 | if (rc) |
d7e09d03 PT |
2317 | return rc; |
2318 | ||
01738448 LB |
2319 | mr = ib_get_dma_mr(hdev->ibh_pd, acflags); |
2320 | if (IS_ERR(mr)) { | |
2321 | CERROR("Failed ib_get_dma_mr : %ld\n", PTR_ERR(mr)); | |
2322 | kiblnd_hdev_cleanup_mrs(hdev); | |
2323 | return PTR_ERR(mr); | |
2324 | } | |
d7e09d03 | 2325 | |
7cadcc7c | 2326 | hdev->ibh_mrs = mr; |
d7e09d03 | 2327 | |
d7e09d03 PT |
2328 | return 0; |
2329 | } | |
2330 | ||
febe73bd | 2331 | /* DUMMY */ |
7a3888a3 GM |
2332 | static int kiblnd_dummy_callback(struct rdma_cm_id *cmid, |
2333 | struct rdma_cm_event *event) | |
febe73bd | 2334 | { |
d7e09d03 PT |
2335 | return 0; |
2336 | } | |
2337 | ||
8d9de3f4 | 2338 | static int kiblnd_dev_need_failover(struct kib_dev *dev) |
d7e09d03 | 2339 | { |
ec3d17c0 MS |
2340 | struct rdma_cm_id *cmid; |
2341 | struct sockaddr_in srcaddr; | |
2342 | struct sockaddr_in dstaddr; | |
2343 | int rc; | |
d7e09d03 | 2344 | |
06ace26e JS |
2345 | if (!dev->ibd_hdev || /* initializing */ |
2346 | !dev->ibd_hdev->ibh_cmid || /* listener is dead */ | |
d7e09d03 PT |
2347 | *kiblnd_tunables.kib_dev_failover > 1) /* debugging */ |
2348 | return 1; | |
2349 | ||
4420cfd3 JS |
2350 | /* |
2351 | * XXX: it's UGLY, but there is no better way to detect |
d7e09d03 PT |
2352 | * ib-bonding HCA failover because: |
2353 | * | |
2354 | * a. no reliable CM event for HCA failover... | |
2355 | * b. no OFED API to get ib_device for current net_device... | |
2356 | * | |
2357 | * We have only two choices at this point: | |
2358 | * | |
2359 | * a. rdma_bind_addr(), it will conflict with listener cmid | |
4420cfd3 JS |
2360 | * b. rdma_resolve_addr() to zero addr |
2361 | */ | |
d7e09d03 PT |
2362 | cmid = kiblnd_rdma_create_id(kiblnd_dummy_callback, dev, RDMA_PS_TCP, |
2363 | IB_QPT_RC); | |
2364 | if (IS_ERR(cmid)) { | |
2365 | rc = PTR_ERR(cmid); | |
2366 | CERROR("Failed to create cmid for failover: %d\n", rc); | |
2367 | return rc; | |
2368 | } | |
2369 | ||
2370 | memset(&srcaddr, 0, sizeof(srcaddr)); | |
ec3d17c0 | 2371 | srcaddr.sin_family = AF_INET; |
d7e09d03 PT |
2372 | srcaddr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip); |
2373 | ||
2374 | memset(&dstaddr, 0, sizeof(dstaddr)); | |
2375 | dstaddr.sin_family = AF_INET; | |
2376 | rc = rdma_resolve_addr(cmid, (struct sockaddr *)&srcaddr, | |
2377 | (struct sockaddr *)&dstaddr, 1); | |
5fd88337 | 2378 | if (rc || !cmid->device) { |
5e8f6920 PT |
2379 | CERROR("Failed to bind %s:%pI4h to device(%p): %d\n", |
2380 | dev->ibd_ifname, &dev->ibd_ifip, | |
d7e09d03 PT |
2381 | cmid->device, rc); |
2382 | rdma_destroy_id(cmid); | |
2383 | return rc; | |
2384 | } | |
2385 | ||
199a0cc0 LZ |
2386 | rc = dev->ibd_hdev->ibh_ibdev != cmid->device; /* true for failover */ |
2387 | rdma_destroy_id(cmid); | |
d7e09d03 | 2388 | |
199a0cc0 | 2389 | return rc; |
d7e09d03 PT |
2390 | } |
2391 | ||
8d9de3f4 | 2392 | int kiblnd_dev_failover(struct kib_dev *dev) |
d7e09d03 | 2393 | { |
febe73bd GM |
2394 | LIST_HEAD(zombie_tpo); |
2395 | LIST_HEAD(zombie_ppo); | |
2396 | LIST_HEAD(zombie_fpo); | |
ec3d17c0 | 2397 | struct rdma_cm_id *cmid = NULL; |
8d9de3f4 | 2398 | struct kib_hca_dev *hdev = NULL; |
ec3d17c0 | 2399 | struct ib_pd *pd; |
8d9de3f4 | 2400 | struct kib_net *net; |
ec3d17c0 MS |
2401 | struct sockaddr_in addr; |
2402 | unsigned long flags; | |
2403 | int rc = 0; | |
2404 | int i; | |
d7e09d03 | 2405 | |
febe73bd | 2406 | LASSERT(*kiblnd_tunables.kib_dev_failover > 1 || |
06ace26e | 2407 | dev->ibd_can_failover || !dev->ibd_hdev); |
d7e09d03 PT |
2408 | |
2409 | rc = kiblnd_dev_need_failover(dev); | |
2410 | if (rc <= 0) | |
2411 | goto out; | |
2412 | ||
06ace26e JS |
2413 | if (dev->ibd_hdev && |
2414 | dev->ibd_hdev->ibh_cmid) { | |
4420cfd3 JS |
2415 | /* |
2416 | * XXX it's not good to close the old listener here, |
d7e09d03 PT |
2417 | * because creating the new listener can fail. |
2418 | * But we have to close it now, otherwise rdma_bind_addr | |
4420cfd3 JS |
2419 | * will return EADDRINUSE, which is unfortunate. |
2420 | */ | |
d7e09d03 PT |
2421 | write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); |
2422 | ||
2423 | cmid = dev->ibd_hdev->ibh_cmid; | |
4420cfd3 JS |
2424 | /* |
2425 | * make next schedule of kiblnd_dev_need_failover() | |
2426 | * return 1 for me | |
2427 | */ | |
d7e09d03 PT |
2428 | dev->ibd_hdev->ibh_cmid = NULL; |
2429 | write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); | |
2430 | ||
2431 | rdma_destroy_id(cmid); | |
2432 | } | |
2433 | ||
2434 | cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, dev, RDMA_PS_TCP, | |
2435 | IB_QPT_RC); | |
2436 | if (IS_ERR(cmid)) { | |
2437 | rc = PTR_ERR(cmid); | |
2438 | CERROR("Failed to create cmid for failover: %d\n", rc); | |
2439 | goto out; | |
2440 | } | |
2441 | ||
2442 | memset(&addr, 0, sizeof(addr)); | |
2443 | addr.sin_family = AF_INET; | |
2444 | addr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip); | |
2445 | addr.sin_port = htons(*kiblnd_tunables.kib_service); | |
2446 | ||
2447 | /* Bind to failover device or port */ | |
2448 | rc = rdma_bind_addr(cmid, (struct sockaddr *)&addr); | |
5fd88337 | 2449 | if (rc || !cmid->device) { |
5e8f6920 PT |
2450 | CERROR("Failed to bind %s:%pI4h to device(%p): %d\n", |
2451 | dev->ibd_ifname, &dev->ibd_ifip, | |
d7e09d03 PT |
2452 | cmid->device, rc); |
2453 | rdma_destroy_id(cmid); | |
2454 | goto out; | |
2455 | } | |
2456 | ||
2457 | LIBCFS_ALLOC(hdev, sizeof(*hdev)); | |
06ace26e | 2458 | if (!hdev) { |
d7e09d03 PT |
2459 | CERROR("Failed to allocate kib_hca_dev\n"); |
2460 | rdma_destroy_id(cmid); | |
2461 | rc = -ENOMEM; | |
2462 | goto out; | |
2463 | } | |
2464 | ||
2465 | atomic_set(&hdev->ibh_ref, 1); | |
2466 | hdev->ibh_dev = dev; | |
2467 | hdev->ibh_cmid = cmid; | |
2468 | hdev->ibh_ibdev = cmid->device; | |
2469 | ||
2470 | pd = ib_alloc_pd(cmid->device); | |
2471 | if (IS_ERR(pd)) { | |
2472 | rc = PTR_ERR(pd); | |
2473 | CERROR("Can't allocate PD: %d\n", rc); | |
2474 | goto out; | |
2475 | } | |
2476 | ||
2477 | hdev->ibh_pd = pd; | |
2478 | ||
2479 | rc = rdma_listen(cmid, 0); | |
5fd88337 | 2480 | if (rc) { |
d7e09d03 PT |
2481 | CERROR("Can't start new listener: %d\n", rc); |
2482 | goto out; | |
2483 | } | |
2484 | ||
2485 | rc = kiblnd_hdev_setup_mrs(hdev); | |
5fd88337 | 2486 | if (rc) { |
d7e09d03 PT |
2487 | CERROR("Can't setup device: %d\n", rc); |
2488 | goto out; | |
2489 | } | |
2490 | ||
2491 | write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); | |
2492 | ||
6d37b171 | 2493 | swap(dev->ibd_hdev, hdev); /* take over the refcount */ |
d7e09d03 PT |
2494 | |
2495 | list_for_each_entry(net, &dev->ibd_nets, ibn_list) { | |
2496 | cfs_cpt_for_each(i, lnet_cpt_table()) { | |
2497 | kiblnd_fail_poolset(&net->ibn_tx_ps[i]->tps_poolset, | |
2498 | &zombie_tpo); | |
2499 | ||
415bcb5c | 2500 | if (net->ibn_fmr_ps) |
d7e09d03 PT |
2501 | kiblnd_fail_fmr_poolset(net->ibn_fmr_ps[i], |
2502 | &zombie_fpo); | |
d7e09d03 PT |
2503 | } |
2504 | } | |
2505 | ||
2506 | write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); | |
2507 | out: | |
2508 | if (!list_empty(&zombie_tpo)) | |
2509 | kiblnd_destroy_pool_list(&zombie_tpo); | |
2510 | if (!list_empty(&zombie_ppo)) | |
2511 | kiblnd_destroy_pool_list(&zombie_ppo); | |
2512 | if (!list_empty(&zombie_fpo)) | |
2513 | kiblnd_destroy_fmr_pool_list(&zombie_fpo); | |
06ace26e | 2514 | if (hdev) |
d7e09d03 PT |
2515 | kiblnd_hdev_decref(hdev); |
2516 | ||
5fd88337 | 2517 | if (rc) |
d7e09d03 PT |
2518 | dev->ibd_failed_failover++; |
2519 | else | |
2520 | dev->ibd_failed_failover = 0; | |
2521 | ||
2522 | return rc; | |
2523 | } | |
2524 | ||
8d9de3f4 | 2525 | void kiblnd_destroy_dev(struct kib_dev *dev) |
d7e09d03 | 2526 | { |
5fd88337 | 2527 | LASSERT(!dev->ibd_nnets); |
febe73bd | 2528 | LASSERT(list_empty(&dev->ibd_nets)); |
d7e09d03 PT |
2529 | |
2530 | list_del(&dev->ibd_fail_list); | |
2531 | list_del(&dev->ibd_list); | |
2532 | ||
06ace26e | 2533 | if (dev->ibd_hdev) |
d7e09d03 PT |
2534 | kiblnd_hdev_decref(dev->ibd_hdev); |
2535 | ||
2536 | LIBCFS_FREE(dev, sizeof(*dev)); | |
2537 | } | |
2538 | ||
8d9de3f4 | 2539 | static struct kib_dev *kiblnd_create_dev(char *ifname) |
d7e09d03 PT |
2540 | { |
2541 | struct net_device *netdev; | |
8d9de3f4 | 2542 | struct kib_dev *dev; |
ec3d17c0 MS |
2543 | __u32 netmask; |
2544 | __u32 ip; | |
2545 | int up; | |
2546 | int rc; | |
d7e09d03 | 2547 | |
1ad6a73e | 2548 | rc = lnet_ipif_query(ifname, &up, &ip, &netmask); |
5fd88337 | 2549 | if (rc) { |
d7e09d03 PT |
2550 | CERROR("Can't query IPoIB interface %s: %d\n", |
2551 | ifname, rc); | |
2552 | return NULL; | |
2553 | } | |
2554 | ||
2555 | if (!up) { | |
2556 | CERROR("Can't query IPoIB interface %s: it's down\n", ifname); | |
2557 | return NULL; | |
2558 | } | |
2559 | ||
2560 | LIBCFS_ALLOC(dev, sizeof(*dev)); | |
06ace26e | 2561 | if (!dev) |
d7e09d03 PT |
2562 | return NULL; |
2563 | ||
d7e09d03 | 2564 | netdev = dev_get_by_name(&init_net, ifname); |
06ace26e | 2565 | if (!netdev) { |
d7e09d03 PT |
2566 | dev->ibd_can_failover = 0; |
2567 | } else { | |
2568 | dev->ibd_can_failover = !!(netdev->flags & IFF_MASTER); | |
2569 | dev_put(netdev); | |
2570 | } | |
2571 | ||
2572 | INIT_LIST_HEAD(&dev->ibd_nets); | |
2573 | INIT_LIST_HEAD(&dev->ibd_list); /* not yet in kib_devs */ | |
2574 | INIT_LIST_HEAD(&dev->ibd_fail_list); | |
2575 | dev->ibd_ifip = ip; | |
2576 | strcpy(&dev->ibd_ifname[0], ifname); | |
2577 | ||
2578 | /* initialize the device */ | |
2579 | rc = kiblnd_dev_failover(dev); | |
5fd88337 | 2580 | if (rc) { |
d7e09d03 PT |
2581 | CERROR("Can't initialize device: %d\n", rc); |
2582 | LIBCFS_FREE(dev, sizeof(*dev)); | |
2583 | return NULL; | |
2584 | } | |
2585 | ||
c314c319 | 2586 | list_add_tail(&dev->ibd_list, &kiblnd_data.kib_devs); |
d7e09d03 PT |
2587 | return dev; |
2588 | } | |
2589 | ||
febe73bd | 2590 | static void kiblnd_base_shutdown(void) |
d7e09d03 | 2591 | { |
ec3d17c0 MS |
2592 | struct kib_sched_info *sched; |
2593 | int i; | |
d7e09d03 | 2594 | |
febe73bd | 2595 | LASSERT(list_empty(&kiblnd_data.kib_devs)); |
d7e09d03 | 2596 | |
d7e09d03 PT |
2597 | switch (kiblnd_data.kib_init) { |
2598 | default: | |
2599 | LBUG(); | |
2600 | ||
2601 | case IBLND_INIT_ALL: | |
2602 | case IBLND_INIT_DATA: | |
06ace26e | 2603 | LASSERT(kiblnd_data.kib_peers); |
7a3888a3 | 2604 | for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) |
febe73bd | 2605 | LASSERT(list_empty(&kiblnd_data.kib_peers[i])); |
febe73bd GM |
2606 | LASSERT(list_empty(&kiblnd_data.kib_connd_zombies)); |
2607 | LASSERT(list_empty(&kiblnd_data.kib_connd_conns)); | |
4d99b258 LZ |
2608 | LASSERT(list_empty(&kiblnd_data.kib_reconn_list)); |
2609 | LASSERT(list_empty(&kiblnd_data.kib_reconn_wait)); | |
d7e09d03 PT |
2610 | |
2611 | /* flag threads to terminate; wake and wait for them to die */ | |
2612 | kiblnd_data.kib_shutdown = 1; | |
2613 | ||
4420cfd3 JS |
2614 | /* |
2615 | * NB: we really want to stop scheduler threads net by net | |
d7e09d03 | 2616 | * instead of for the whole module; this should be improved |
4420cfd3 JS |
2617 | * with dynamic LNet configuration |
2618 | */ | |
d7e09d03 PT |
2619 | cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) |
2620 | wake_up_all(&sched->ibs_waitq); | |
2621 | ||
2622 | wake_up_all(&kiblnd_data.kib_connd_waitq); | |
2623 | wake_up_all(&kiblnd_data.kib_failover_waitq); | |
2624 | ||
2625 | i = 2; | |
5fd88337 | 2626 | while (atomic_read(&kiblnd_data.kib_nthreads)) { |
d7e09d03 | 2627 | i++; |
7a3888a3 GM |
2628 | /* power of 2 ? */ |
2629 | CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, | |
d7e09d03 PT |
2630 | "Waiting for %d threads to terminate\n", |
2631 | atomic_read(&kiblnd_data.kib_nthreads)); | |
d3caf4d5 PT |
2632 | set_current_state(TASK_UNINTERRUPTIBLE); |
2633 | schedule_timeout(cfs_time_seconds(1)); | |
d7e09d03 PT |
2634 | } |
2635 | ||
2636 | /* fall through */ | |
2637 | ||
2638 | case IBLND_INIT_NOTHING: | |
2639 | break; | |
2640 | } | |
2641 | ||
06ace26e | 2642 | if (kiblnd_data.kib_peers) { |
d7e09d03 PT |
2643 | LIBCFS_FREE(kiblnd_data.kib_peers, |
2644 | sizeof(struct list_head) * | |
2645 | kiblnd_data.kib_peer_hash_size); | |
2646 | } | |
2647 | ||
06ace26e | 2648 | if (kiblnd_data.kib_scheds) |
d7e09d03 PT |
2649 | cfs_percpt_free(kiblnd_data.kib_scheds); |
2650 | ||
d7e09d03 PT |
2651 | kiblnd_data.kib_init = IBLND_INIT_NOTHING; |
2652 | module_put(THIS_MODULE); | |
2653 | } | |
2654 | ||
439b4d45 | 2655 | static void kiblnd_shutdown(lnet_ni_t *ni) |
d7e09d03 | 2656 | { |
8d9de3f4 | 2657 | struct kib_net *net = ni->ni_data; |
ec3d17c0 MS |
2658 | rwlock_t *g_lock = &kiblnd_data.kib_global_lock; |
2659 | int i; | |
2660 | unsigned long flags; | |
d7e09d03 PT |
2661 | |
2662 | LASSERT(kiblnd_data.kib_init == IBLND_INIT_ALL); | |
2663 | ||
06ace26e | 2664 | if (!net) |
d7e09d03 PT |
2665 | goto out; |
2666 | ||
d7e09d03 PT |
2667 | write_lock_irqsave(g_lock, flags); |
2668 | net->ibn_shutdown = 1; | |
2669 | write_unlock_irqrestore(g_lock, flags); | |
2670 | ||
2671 | switch (net->ibn_init) { | |
2672 | default: | |
2673 | LBUG(); | |
2674 | ||
2675 | case IBLND_INIT_ALL: | |
2676 | /* nuke all existing peers within this net */ | |
2677 | kiblnd_del_peer(ni, LNET_NID_ANY); | |
2678 | ||
2679 | /* Wait for all peer state to clean up */ | |
2680 | i = 2; | |
5fd88337 | 2681 | while (atomic_read(&net->ibn_npeers)) { |
d7e09d03 PT |
2682 | i++; |
2683 | CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n? */ | |
2684 | "%s: waiting for %d peers to disconnect\n", | |
2685 | libcfs_nid2str(ni->ni_nid), | |
2686 | atomic_read(&net->ibn_npeers)); | |
d3caf4d5 PT |
2687 | set_current_state(TASK_UNINTERRUPTIBLE); |
2688 | schedule_timeout(cfs_time_seconds(1)); | |
d7e09d03 PT |
2689 | } |
2690 | ||
2691 | kiblnd_net_fini_pools(net); | |
2692 | ||
2693 | write_lock_irqsave(g_lock, flags); | |
2694 | LASSERT(net->ibn_dev->ibd_nnets > 0); | |
2695 | net->ibn_dev->ibd_nnets--; | |
2696 | list_del(&net->ibn_list); | |
2697 | write_unlock_irqrestore(g_lock, flags); | |
2698 | ||
2699 | /* fall through */ | |
2700 | ||
2701 | case IBLND_INIT_NOTHING: | |
5fd88337 | 2702 | LASSERT(!atomic_read(&net->ibn_nconns)); |
d7e09d03 | 2703 | |
5fd88337 | 2704 | if (net->ibn_dev && !net->ibn_dev->ibd_nnets) |
d7e09d03 PT |
2705 | kiblnd_destroy_dev(net->ibn_dev); |
2706 | ||
2707 | break; | |
2708 | } | |
2709 | ||
d7e09d03 PT |
2710 | net->ibn_init = IBLND_INIT_NOTHING; |
2711 | ni->ni_data = NULL; | |
2712 | ||
2713 | LIBCFS_FREE(net, sizeof(*net)); | |
2714 | ||
2715 | out: | |
2716 | if (list_empty(&kiblnd_data.kib_devs)) | |
2717 | kiblnd_base_shutdown(); | |
d7e09d03 PT |
2718 | } |
2719 | ||
febe73bd | 2720 | static int kiblnd_base_startup(void) |
d7e09d03 | 2721 | { |
ec3d17c0 MS |
2722 | struct kib_sched_info *sched; |
2723 | int rc; | |
2724 | int i; | |
d7e09d03 | 2725 | |
febe73bd | 2726 | LASSERT(kiblnd_data.kib_init == IBLND_INIT_NOTHING); |
d7e09d03 PT |
2727 | |
2728 | try_module_get(THIS_MODULE); | |
7a3888a3 GM |
2729 | /* zero pointers, flags etc */ |
2730 | memset(&kiblnd_data, 0, sizeof(kiblnd_data)); | |
d7e09d03 PT |
2731 | |
2732 | rwlock_init(&kiblnd_data.kib_global_lock); | |
2733 | ||
2734 | INIT_LIST_HEAD(&kiblnd_data.kib_devs); | |
2735 | INIT_LIST_HEAD(&kiblnd_data.kib_failed_devs); | |
2736 | ||
2737 | kiblnd_data.kib_peer_hash_size = IBLND_PEER_HASH_SIZE; | |
2738 | LIBCFS_ALLOC(kiblnd_data.kib_peers, | |
ec3d17c0 | 2739 | sizeof(struct list_head) * kiblnd_data.kib_peer_hash_size); |
06ace26e | 2740 | if (!kiblnd_data.kib_peers) |
d7e09d03 | 2741 | goto failed; |
d7e09d03 PT |
2742 | for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) |
2743 | INIT_LIST_HEAD(&kiblnd_data.kib_peers[i]); | |
2744 | ||
2745 | spin_lock_init(&kiblnd_data.kib_connd_lock); | |
2746 | INIT_LIST_HEAD(&kiblnd_data.kib_connd_conns); | |
2747 | INIT_LIST_HEAD(&kiblnd_data.kib_connd_zombies); | |
4d99b258 LZ |
2748 | INIT_LIST_HEAD(&kiblnd_data.kib_reconn_list); |
2749 | INIT_LIST_HEAD(&kiblnd_data.kib_reconn_wait); | |
2750 | ||
d7e09d03 PT |
2751 | init_waitqueue_head(&kiblnd_data.kib_connd_waitq); |
2752 | init_waitqueue_head(&kiblnd_data.kib_failover_waitq); | |
2753 | ||
2754 | kiblnd_data.kib_scheds = cfs_percpt_alloc(lnet_cpt_table(), | |
2755 | sizeof(*sched)); | |
06ace26e | 2756 | if (!kiblnd_data.kib_scheds) |
d7e09d03 PT |
2757 | goto failed; |
2758 | ||
2759 | cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) { | |
ec3d17c0 | 2760 | int nthrs; |
d7e09d03 PT |
2761 | |
2762 | spin_lock_init(&sched->ibs_lock); | |
2763 | INIT_LIST_HEAD(&sched->ibs_conns); | |
2764 | init_waitqueue_head(&sched->ibs_waitq); | |
2765 | ||
2766 | nthrs = cfs_cpt_weight(lnet_cpt_table(), i); | |
2767 | if (*kiblnd_tunables.kib_nscheds > 0) { | |
2768 | nthrs = min(nthrs, *kiblnd_tunables.kib_nscheds); | |
2769 | } else { | |
4420cfd3 JS |
2770 | /* |
2771 | * cap at half of the CPUs; the other half is reserved for |
2772 | * upper-layer modules |
2773 | */ | |
d7e09d03 PT |
2774 | nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs); |
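/*
 * Illustrative: on a CPT spanning 8 CPUs this picks
 * max(IBLND_N_SCHED, 4) scheduler threads, and the outer min()
 * guarantees the count never exceeds the CPT's CPU count even if
 * IBLND_N_SCHED were larger.
 */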
2775 | } | |
2776 | ||
2777 | sched->ibs_nthreads_max = nthrs; | |
2778 | sched->ibs_cpt = i; | |
2779 | } | |
2780 | ||
2781 | kiblnd_data.kib_error_qpa.qp_state = IB_QPS_ERR; | |
2782 | ||
2783 | /* lists/ptrs/locks initialised */ | |
2784 | kiblnd_data.kib_init = IBLND_INIT_DATA; | |
2785 | /*****************************************************/ | |
2786 | ||
2787 | rc = kiblnd_thread_start(kiblnd_connd, NULL, "kiblnd_connd"); | |
5fd88337 | 2788 | if (rc) { |
d7e09d03 PT |
2789 | CERROR("Can't spawn o2iblnd connd: %d\n", rc); |
2790 | goto failed; | |
2791 | } | |
2792 | ||
5fd88337 | 2793 | if (*kiblnd_tunables.kib_dev_failover) |
d7e09d03 PT |
2794 | rc = kiblnd_thread_start(kiblnd_failover_thread, NULL, |
2795 | "kiblnd_failover"); | |
2796 | ||
5fd88337 | 2797 | if (rc) { |
d7e09d03 PT |
2798 | CERROR("Can't spawn o2iblnd failover thread: %d\n", rc); |
2799 | goto failed; | |
2800 | } | |
2801 | ||
2802 | /* flag everything initialised */ | |
2803 | kiblnd_data.kib_init = IBLND_INIT_ALL; | |
2804 | /*****************************************************/ | |
2805 | ||
2806 | return 0; | |
2807 | ||
2808 | failed: | |
2809 | kiblnd_base_shutdown(); | |
2810 | return -ENETDOWN; | |
2811 | } | |
2812 | ||
febe73bd | 2813 | static int kiblnd_start_schedulers(struct kib_sched_info *sched) |
d7e09d03 | 2814 | { |
ec3d17c0 MS |
2815 | int rc = 0; |
2816 | int nthrs; | |
2817 | int i; | |
d7e09d03 | 2818 | |
5fd88337 | 2819 | if (!sched->ibs_nthreads) { |
d7e09d03 PT |
2820 | if (*kiblnd_tunables.kib_nscheds > 0) { |
2821 | nthrs = sched->ibs_nthreads_max; | |
2822 | } else { | |
2823 | nthrs = cfs_cpt_weight(lnet_cpt_table(), | |
2824 | sched->ibs_cpt); | |
2825 | nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs); | |
2826 | nthrs = min(IBLND_N_SCHED_HIGH, nthrs); | |
2827 | } | |
2828 | } else { | |
2829 | LASSERT(sched->ibs_nthreads <= sched->ibs_nthreads_max); | |
2830 | /* increase one thread if there is new interface */ | |
b6ee3824 | 2831 | nthrs = sched->ibs_nthreads < sched->ibs_nthreads_max; |
d7e09d03 PT |
2832 | } |
2833 | ||
2834 | for (i = 0; i < nthrs; i++) { | |
ec3d17c0 MS |
2835 | long id; |
2836 | char name[20]; | |
7a3888a3 | 2837 | |
d7e09d03 PT |
2838 | id = KIB_THREAD_ID(sched->ibs_cpt, sched->ibs_nthreads + i); |
2839 | snprintf(name, sizeof(name), "kiblnd_sd_%02ld_%02ld", | |
2840 | KIB_THREAD_CPT(id), KIB_THREAD_TID(id)); | |
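/*
 * Example name (assuming KIB_THREAD_CPT()/KIB_THREAD_TID() recover the
 * two halves of the id): the second scheduler started on CPT 3 would
 * be called "kiblnd_sd_03_01".
 */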
2841 | rc = kiblnd_thread_start(kiblnd_scheduler, (void *)id, name); | |
5fd88337 | 2842 | if (!rc) |
d7e09d03 PT |
2843 | continue; |
2844 | ||
2845 | CERROR("Can't spawn thread %d for scheduler[%d]: %d\n", | |
2846 | sched->ibs_cpt, sched->ibs_nthreads + i, rc); | |
2847 | break; | |
2848 | } | |
2849 | ||
2850 | sched->ibs_nthreads += i; | |
2851 | return rc; | |
2852 | } | |
2853 | ||
8d9de3f4 | 2854 | static int kiblnd_dev_start_threads(struct kib_dev *dev, int newdev, __u32 *cpts, |
7a3888a3 | 2855 | int ncpts) |
d7e09d03 | 2856 | { |
ec3d17c0 MS |
2857 | int cpt; |
2858 | int rc; | |
2859 | int i; | |
d7e09d03 PT |
2860 | |
2861 | for (i = 0; i < ncpts; i++) { | |
2862 | struct kib_sched_info *sched; | |
2863 | ||
06ace26e | 2864 | cpt = !cpts ? i : cpts[i]; |
d7e09d03 PT |
2865 | sched = kiblnd_data.kib_scheds[cpt]; |
2866 | ||
2867 | if (!newdev && sched->ibs_nthreads > 0) | |
2868 | continue; | |
2869 | ||
2870 | rc = kiblnd_start_schedulers(kiblnd_data.kib_scheds[cpt]); | |
5fd88337 | 2871 | if (rc) { |
d7e09d03 PT |
2872 | CERROR("Failed to start scheduler threads for %s\n", |
2873 | dev->ibd_ifname); | |
2874 | return rc; | |
2875 | } | |
2876 | } | |
2877 | return 0; | |
2878 | } | |
2879 | ||
8d9de3f4 | 2880 | static struct kib_dev *kiblnd_dev_search(char *ifname) |
d7e09d03 | 2881 | { |
8d9de3f4 JS |
2882 | struct kib_dev *alias = NULL; |
2883 | struct kib_dev *dev; | |
ec3d17c0 MS |
2884 | char *colon; |
2885 | char *colon2; | |
d7e09d03 PT |
2886 | |
2887 | colon = strchr(ifname, ':'); | |
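/*
 * Alias matching, by way of an assumed example: an exact name match
 * always wins; otherwise both names are compared with any ":suffix"
 * stripped, so a lookup for "ib0:1" can return an existing "ib0"
 * device as its alias.
 */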
2888 | list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) { | |
5fd88337 | 2889 | if (!strcmp(&dev->ibd_ifname[0], ifname)) |
d7e09d03 PT |
2890 | return dev; |
2891 | ||
06ace26e | 2892 | if (alias) |
d7e09d03 PT |
2893 | continue; |
2894 | ||
2895 | colon2 = strchr(dev->ibd_ifname, ':'); | |
06ace26e | 2896 | if (colon) |
d7e09d03 | 2897 | *colon = 0; |
06ace26e | 2898 | if (colon2) |
d7e09d03 PT |
2899 | *colon2 = 0; |
2900 | ||
5fd88337 | 2901 | if (!strcmp(&dev->ibd_ifname[0], ifname)) |
d7e09d03 PT |
2902 | alias = dev; |
2903 | ||
06ace26e | 2904 | if (colon) |
d7e09d03 | 2905 | *colon = ':'; |
06ace26e | 2906 | if (colon2) |
d7e09d03 PT |
2907 | *colon2 = ':'; |
2908 | } | |
2909 | return alias; | |
2910 | } | |
2911 | ||
439b4d45 | 2912 | static int kiblnd_startup(lnet_ni_t *ni) |
d7e09d03 | 2913 | { |
ec3d17c0 | 2914 | char *ifname; |
8d9de3f4 JS |
2915 | struct kib_dev *ibdev = NULL; |
2916 | struct kib_net *net; | |
473c4e01 | 2917 | struct timespec64 tv; |
ec3d17c0 MS |
2918 | unsigned long flags; |
2919 | int rc; | |
2920 | int newdev; | |
d7e09d03 | 2921 | |
febe73bd | 2922 | LASSERT(ni->ni_lnd == &the_o2iblnd); |
d7e09d03 PT |
2923 | |
2924 | if (kiblnd_data.kib_init == IBLND_INIT_NOTHING) { | |
2925 | rc = kiblnd_base_startup(); | |
5fd88337 | 2926 | if (rc) |
d7e09d03 PT |
2927 | return rc; |
2928 | } | |
2929 | ||
2930 | LIBCFS_ALLOC(net, sizeof(*net)); | |
2931 | ni->ni_data = net; | |
06ace26e | 2932 | if (!net) |
3247c4e5 | 2933 | goto net_failed; |
d7e09d03 | 2934 | |
473c4e01 AB |
2935 | ktime_get_real_ts64(&tv); |
2936 | net->ibn_incarnation = tv.tv_sec * USEC_PER_SEC + | |
2937 | tv.tv_nsec / NSEC_PER_USEC; | |
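/*
 * Illustrative: the incarnation is the wall-clock startup time scaled
 * to microseconds (e.g. tv_sec = 1000000000 contributes 10^15), so
 * successive startups of the same NI virtually always differ.
 */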
d7e09d03 | 2938 | |
f6e50066 | 2939 | rc = kiblnd_tunables_setup(ni); |
025ba826 AS |
2940 | if (rc) |
2941 | goto net_failed; | |
d7e09d03 | 2942 | |
06ace26e | 2943 | if (ni->ni_interfaces[0]) { |
d7e09d03 PT |
2944 | /* Use the IPoIB interface specified in 'networks=' */ |
2945 | ||
febe73bd | 2946 | CLASSERT(LNET_MAX_INTERFACES > 1); |
06ace26e | 2947 | if (ni->ni_interfaces[1]) { |
d7e09d03 PT |
2948 | CERROR("Multiple interfaces not supported\n"); |
2949 | goto failed; | |
2950 | } | |
2951 | ||
2952 | ifname = ni->ni_interfaces[0]; | |
2953 | } else { | |
2954 | ifname = *kiblnd_tunables.kib_default_ipif; | |
2955 | } | |
2956 | ||
2957 | if (strlen(ifname) >= sizeof(ibdev->ibd_ifname)) { | |
2958 | CERROR("IPoIB interface name too long: %s\n", ifname); | |
2959 | goto failed; | |
2960 | } | |
2961 | ||
2962 | ibdev = kiblnd_dev_search(ifname); | |
2963 | ||
06ace26e | 2964 | newdev = !ibdev; |
d7e09d03 | 2965 | /* hmm...create kib_dev even for alias */ |
5fd88337 | 2966 | if (!ibdev || strcmp(&ibdev->ibd_ifname[0], ifname)) |
d7e09d03 PT |
2967 | ibdev = kiblnd_create_dev(ifname); |
2968 | ||
06ace26e | 2969 | if (!ibdev) |
d7e09d03 PT |
2970 | goto failed; |
2971 | ||
2972 | net->ibn_dev = ibdev; | |
2973 | ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip); | |
2974 | ||
2975 | rc = kiblnd_dev_start_threads(ibdev, newdev, | |
2976 | ni->ni_cpts, ni->ni_ncpts); | |
5fd88337 | 2977 | if (rc) |
d7e09d03 PT |
2978 | goto failed; |
2979 | ||
32c8deb8 | 2980 | rc = kiblnd_net_init_pools(net, ni, ni->ni_cpts, ni->ni_ncpts); |
5fd88337 | 2981 | if (rc) { |
d7e09d03 PT |
2982 | CERROR("Failed to initialize NI pools: %d\n", rc); |
2983 | goto failed; | |
2984 | } | |
2985 | ||
2986 | write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); | |
2987 | ibdev->ibd_nnets++; | |
2988 | list_add_tail(&net->ibn_list, &ibdev->ibd_nets); | |
2989 | write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); | |
2990 | ||
2991 | net->ibn_init = IBLND_INIT_ALL; | |
2992 | ||
2993 | return 0; | |
2994 | ||
2995 | failed: | |
06ace26e | 2996 | if (!net->ibn_dev && ibdev) |
d7e09d03 PT |
2997 | kiblnd_destroy_dev(ibdev); |
2998 | ||
3247c4e5 | 2999 | net_failed: |
d7e09d03 PT |
3000 | kiblnd_shutdown(ni); |
3001 | ||
3002 | CDEBUG(D_NET, "kiblnd_startup failed\n"); | |
3003 | return -ENETDOWN; | |
3004 | } | |
3005 | ||
439b4d45 FZ |
3006 | static lnd_t the_o2iblnd = { |
3007 | .lnd_type = O2IBLND, | |
3008 | .lnd_startup = kiblnd_startup, | |
3009 | .lnd_shutdown = kiblnd_shutdown, | |
3010 | .lnd_ctl = kiblnd_ctl, | |
3011 | .lnd_query = kiblnd_query, | |
3012 | .lnd_send = kiblnd_send, | |
3013 | .lnd_recv = kiblnd_recv, | |
3014 | }; | |
3015 | ||
e0f94113 | 3016 | static void __exit ko2iblnd_exit(void) |
d7e09d03 PT |
3017 | { |
3018 | lnet_unregister_lnd(&the_o2iblnd); | |
d7e09d03 PT |
3019 | } |
3020 | ||
e0f94113 | 3021 | static int __init ko2iblnd_init(void) |
d7e09d03 | 3022 | { |
8d9de3f4 JS |
3023 | CLASSERT(sizeof(struct kib_msg) <= IBLND_MSG_SIZE); |
3024 | CLASSERT(offsetof(struct kib_msg, | |
c314c319 JS |
3025 | ibm_u.get.ibgm_rd.rd_frags[IBLND_MAX_RDMA_FRAGS]) |
3026 | <= IBLND_MSG_SIZE); | |
8d9de3f4 | 3027 | CLASSERT(offsetof(struct kib_msg, |
c314c319 JS |
3028 | ibm_u.putack.ibpam_rd.rd_frags[IBLND_MAX_RDMA_FRAGS]) |
3029 | <= IBLND_MSG_SIZE); | |
d7e09d03 | 3030 | |
025ba826 | 3031 | kiblnd_tunables_init(); |
d7e09d03 PT |
3032 | |
3033 | lnet_register_lnd(&the_o2iblnd); | |
3034 | ||
3035 | return 0; | |
3036 | } | |
3037 | ||
a0455471 | 3038 | MODULE_AUTHOR("OpenSFS, Inc. <http://www.lustre.org/>"); |
57878e17 | 3039 | MODULE_DESCRIPTION("OpenIB gen2 LNet Network Driver"); |
5b0e50b9 | 3040 | MODULE_VERSION("2.7.0"); |
d7e09d03 PT |
3041 | MODULE_LICENSE("GPL"); |
3042 | ||
e0f94113 AD |
3043 | module_init(ko2iblnd_init); |
3044 | module_exit(ko2iblnd_exit); |