Commit | Line | Data |
---|---|---|
d7e09d03 PT |
1 | /* |
2 | * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. | |
3 | * | |
4 | * Copyright (c) 2011, 2012, Intel Corporation. | |
5 | * | |
6 | * Author: Zach Brown <zab@zabbo.net> | |
7 | * Author: Peter J. Braam <braam@clusterfs.com> | |
8 | * Author: Phil Schwan <phil@clusterfs.com> | |
9 | * Author: Eric Barton <eric@bartonsoftware.com> | |
10 | * | |
11 | * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ | |
12 | * | |
13 | * Portals is free software; you can redistribute it and/or | |
14 | * modify it under the terms of version 2 of the GNU General Public | |
15 | * License as published by the Free Software Foundation. | |
16 | * | |
17 | * Portals is distributed in the hope that it will be useful, | |
18 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
19 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
20 | * GNU General Public License for more details. | |
21 | * | |
d7e09d03 PT |
22 | */ |
23 | ||
24 | #include "socklnd.h" | |
25 | ||
ff13fd40 | 26 | struct ksock_tx * |
d7e09d03 PT |
27 | ksocknal_alloc_tx(int type, int size) |
28 | { | |
ff13fd40 | 29 | struct ksock_tx *tx = NULL; |
d7e09d03 PT |
30 | |
31 | if (type == KSOCK_MSG_NOOP) { | |
32 | LASSERT(size == KSOCK_NOOP_TX_SIZE); | |
33 | ||
34 | /* searching for a noop tx in free list */ | |
35 | spin_lock(&ksocknal_data.ksnd_tx_lock); | |
36 | ||
37 | if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) { | |
38 | tx = list_entry(ksocknal_data.ksnd_idle_noop_txs. \ | |
ff13fd40 | 39 | next, struct ksock_tx, tx_list); |
d7e09d03 PT |
40 | LASSERT(tx->tx_desc_size == size); |
41 | list_del(&tx->tx_list); | |
42 | } | |
43 | ||
44 | spin_unlock(&ksocknal_data.ksnd_tx_lock); | |
45 | } | |
46 | ||
06ace26e | 47 | if (!tx) |
d7e09d03 PT |
48 | LIBCFS_ALLOC(tx, size); |
49 | ||
06ace26e | 50 | if (!tx) |
d7e09d03 PT |
51 | return NULL; |
52 | ||
53 | atomic_set(&tx->tx_refcount, 1); | |
54 | tx->tx_zc_aborted = 0; | |
55 | tx->tx_zc_capable = 0; | |
56 | tx->tx_zc_checked = 0; | |
57 | tx->tx_desc_size = size; | |
58 | ||
59 | atomic_inc(&ksocknal_data.ksnd_nactive_txs); | |
60 | ||
61 | return tx; | |
62 | } | |
63 | ||
ff13fd40 | 64 | struct ksock_tx * |
d7e09d03 PT |
65 | ksocknal_alloc_tx_noop(__u64 cookie, int nonblk) |
66 | { | |
ff13fd40 | 67 | struct ksock_tx *tx; |
d7e09d03 PT |
68 | |
69 | tx = ksocknal_alloc_tx(KSOCK_MSG_NOOP, KSOCK_NOOP_TX_SIZE); | |
06ace26e | 70 | if (!tx) { |
d7e09d03 PT |
71 | CERROR("Can't allocate noop tx desc\n"); |
72 | return NULL; | |
73 | } | |
74 | ||
97d10d0a MS |
75 | tx->tx_conn = NULL; |
76 | tx->tx_lnetmsg = NULL; | |
77 | tx->tx_kiov = NULL; | |
78 | tx->tx_nkiov = 0; | |
79 | tx->tx_iov = tx->tx_frags.virt.iov; | |
80 | tx->tx_niov = 1; | |
81 | tx->tx_nonblk = nonblk; | |
d7e09d03 PT |
82 | |
83 | socklnd_init_msg(&tx->tx_msg, KSOCK_MSG_NOOP); | |
84 | tx->tx_msg.ksm_zc_cookies[1] = cookie; | |
85 | ||
86 | return tx; | |
87 | } | |
88 | ||
d7e09d03 | 89 | void |
ff13fd40 | 90 | ksocknal_free_tx(struct ksock_tx *tx) |
d7e09d03 PT |
91 | { |
92 | atomic_dec(&ksocknal_data.ksnd_nactive_txs); | |
93 | ||
06ace26e | 94 | if (!tx->tx_lnetmsg && tx->tx_desc_size == KSOCK_NOOP_TX_SIZE) { |
d7e09d03 PT |
95 | /* it's a noop tx */ |
96 | spin_lock(&ksocknal_data.ksnd_tx_lock); | |
97 | ||
98 | list_add(&tx->tx_list, &ksocknal_data.ksnd_idle_noop_txs); | |
99 | ||
100 | spin_unlock(&ksocknal_data.ksnd_tx_lock); | |
101 | } else { | |
102 | LIBCFS_FREE(tx, tx->tx_desc_size); | |
103 | } | |
104 | } | |
105 | ||
f9cd474f | 106 | static int |
ff13fd40 | 107 | ksocknal_send_iov(struct ksock_conn *conn, struct ksock_tx *tx) |
d7e09d03 | 108 | { |
97d10d0a MS |
109 | struct kvec *iov = tx->tx_iov; |
110 | int nob; | |
111 | int rc; | |
d7e09d03 | 112 | |
97d10d0a | 113 | LASSERT(tx->tx_niov > 0); |
d7e09d03 PT |
114 | |
115 | /* Never touch tx->tx_iov inside ksocknal_lib_send_iov() */ | |
116 | rc = ksocknal_lib_send_iov(conn, tx); | |
117 | ||
118 | if (rc <= 0) /* sent nothing? */ | |
71397095 | 119 | return rc; |
d7e09d03 PT |
120 | |
121 | nob = rc; | |
b31e64c4 | 122 | LASSERT(nob <= tx->tx_resid); |
d7e09d03 PT |
123 | tx->tx_resid -= nob; |
124 | ||
125 | /* "consume" iov */ | |
126 | do { | |
97d10d0a | 127 | LASSERT(tx->tx_niov > 0); |
d7e09d03 | 128 | |
9797fb0e | 129 | if (nob < (int)iov->iov_len) { |
d7e09d03 PT |
130 | iov->iov_base = (void *)((char *)iov->iov_base + nob); |
131 | iov->iov_len -= nob; | |
71397095 | 132 | return rc; |
d7e09d03 PT |
133 | } |
134 | ||
135 | nob -= iov->iov_len; | |
136 | tx->tx_iov = ++iov; | |
137 | tx->tx_niov--; | |
5fd88337 | 138 | } while (nob); |
d7e09d03 | 139 | |
71397095 | 140 | return rc; |
d7e09d03 PT |
141 | } |
142 | ||
f9cd474f | 143 | static int |
ff13fd40 | 144 | ksocknal_send_kiov(struct ksock_conn *conn, struct ksock_tx *tx) |
d7e09d03 | 145 | { |
97d10d0a MS |
146 | lnet_kiov_t *kiov = tx->tx_kiov; |
147 | int nob; | |
148 | int rc; | |
d7e09d03 | 149 | |
5fd88337 | 150 | LASSERT(!tx->tx_niov); |
97d10d0a | 151 | LASSERT(tx->tx_nkiov > 0); |
d7e09d03 PT |
152 | |
153 | /* Never touch tx->tx_kiov inside ksocknal_lib_send_kiov() */ | |
154 | rc = ksocknal_lib_send_kiov(conn, tx); | |
155 | ||
156 | if (rc <= 0) /* sent nothing? */ | |
71397095 | 157 | return rc; |
d7e09d03 PT |
158 | |
159 | nob = rc; | |
b31e64c4 | 160 | LASSERT(nob <= tx->tx_resid); |
d7e09d03 PT |
161 | tx->tx_resid -= nob; |
162 | ||
163 | /* "consume" kiov */ | |
164 | do { | |
165 | LASSERT(tx->tx_nkiov > 0); | |
166 | ||
65ffc679 AV |
167 | if (nob < (int)kiov->bv_len) { |
168 | kiov->bv_offset += nob; | |
169 | kiov->bv_len -= nob; | |
d7e09d03 PT |
170 | return rc; |
171 | } | |
172 | ||
65ffc679 | 173 | nob -= (int)kiov->bv_len; |
d7e09d03 PT |
174 | tx->tx_kiov = ++kiov; |
175 | tx->tx_nkiov--; | |
5fd88337 | 176 | } while (nob); |
d7e09d03 | 177 | |
71397095 | 178 | return rc; |
d7e09d03 PT |
179 | } |
180 | ||
f9cd474f | 181 | static int |
ff13fd40 | 182 | ksocknal_transmit(struct ksock_conn *conn, struct ksock_tx *tx) |
d7e09d03 | 183 | { |
97d10d0a MS |
184 | int rc; |
185 | int bufnob; | |
d7e09d03 | 186 | |
5fd88337 | 187 | if (ksocknal_data.ksnd_stall_tx) { |
d3caf4d5 PT |
188 | set_current_state(TASK_UNINTERRUPTIBLE); |
189 | schedule_timeout(cfs_time_seconds(ksocknal_data.ksnd_stall_tx)); | |
d7e09d03 PT |
190 | } |
191 | ||
5fd88337 | 192 | LASSERT(tx->tx_resid); |
d7e09d03 PT |
193 | |
194 | rc = ksocknal_connsock_addref(conn); | |
5fd88337 | 195 | if (rc) { |
b31e64c4 | 196 | LASSERT(conn->ksnc_closing); |
71397095 | 197 | return -ESHUTDOWN; |
d7e09d03 PT |
198 | } |
199 | ||
200 | do { | |
201 | if (ksocknal_data.ksnd_enomem_tx > 0) { | |
202 | /* testing... */ | |
203 | ksocknal_data.ksnd_enomem_tx--; | |
204 | rc = -EAGAIN; | |
5fd88337 | 205 | } else if (tx->tx_niov) { |
b31e64c4 | 206 | rc = ksocknal_send_iov(conn, tx); |
d7e09d03 | 207 | } else { |
b31e64c4 | 208 | rc = ksocknal_send_kiov(conn, tx); |
d7e09d03 PT |
209 | } |
210 | ||
fb4a1539 | 211 | bufnob = conn->ksnc_sock->sk->sk_wmem_queued; |
d7e09d03 PT |
212 | if (rc > 0) /* sent something? */ |
213 | conn->ksnc_tx_bufnob += rc; /* account it */ | |
214 | ||
215 | if (bufnob < conn->ksnc_tx_bufnob) { | |
4420cfd3 JS |
216 | /* |
217 | * allocated send buffer bytes < computed; infer | |
218 | * something got ACKed | |
219 | */ | |
d7e09d03 PT |
220 | conn->ksnc_tx_deadline = |
221 | cfs_time_shift(*ksocknal_tunables.ksnd_timeout); | |
222 | conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); | |
223 | conn->ksnc_tx_bufnob = bufnob; | |
224 | mb(); | |
225 | } | |
226 | ||
227 | if (rc <= 0) { /* Didn't write anything? */ | |
228 | ||
5fd88337 | 229 | if (!rc) /* some stacks return 0 instead of -EAGAIN */ |
d7e09d03 PT |
230 | rc = -EAGAIN; |
231 | ||
232 | /* Check if EAGAIN is due to memory pressure */ | |
a58a38ac | 233 | if (rc == -EAGAIN && ksocknal_lib_memory_pressure(conn)) |
d7e09d03 PT |
234 | rc = -ENOMEM; |
235 | ||
236 | break; | |
237 | } | |
238 | ||
239 | /* socket's wmem_queued now includes 'rc' bytes */ | |
b31e64c4 | 240 | atomic_sub(rc, &conn->ksnc_tx_nob); |
d7e09d03 PT |
241 | rc = 0; |
242 | ||
5fd88337 | 243 | } while (tx->tx_resid); |
d7e09d03 PT |
244 | |
245 | ksocknal_connsock_decref(conn); | |
71397095 | 246 | return rc; |
d7e09d03 PT |
247 | } |
248 | ||
f9cd474f | 249 | static int |
ff13fd40 | 250 | ksocknal_recv_iov(struct ksock_conn *conn) |
d7e09d03 | 251 | { |
f351bad2 | 252 | struct kvec *iov = conn->ksnc_rx_iov; |
97d10d0a MS |
253 | int nob; |
254 | int rc; | |
d7e09d03 | 255 | |
97d10d0a | 256 | LASSERT(conn->ksnc_rx_niov > 0); |
d7e09d03 | 257 | |
4420cfd3 JS |
258 | /* |
259 | * Never touch conn->ksnc_rx_iov or change connection | |
260 | * status inside ksocknal_lib_recv_iov | |
261 | */ | |
d7e09d03 PT |
262 | rc = ksocknal_lib_recv_iov(conn); |
263 | ||
264 | if (rc <= 0) | |
71397095 | 265 | return rc; |
d7e09d03 PT |
266 | |
267 | /* received something... */ | |
268 | nob = rc; | |
269 | ||
270 | conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); | |
271 | conn->ksnc_rx_deadline = | |
272 | cfs_time_shift(*ksocknal_tunables.ksnd_timeout); | |
273 | mb(); /* order with setting rx_started */ | |
274 | conn->ksnc_rx_started = 1; | |
275 | ||
276 | conn->ksnc_rx_nob_wanted -= nob; | |
277 | conn->ksnc_rx_nob_left -= nob; | |
278 | ||
279 | do { | |
97d10d0a | 280 | LASSERT(conn->ksnc_rx_niov > 0); |
d7e09d03 PT |
281 | |
282 | if (nob < (int)iov->iov_len) { | |
283 | iov->iov_len -= nob; | |
2101f98c | 284 | iov->iov_base += nob; |
71397095 | 285 | return -EAGAIN; |
d7e09d03 PT |
286 | } |
287 | ||
288 | nob -= iov->iov_len; | |
289 | conn->ksnc_rx_iov = ++iov; | |
290 | conn->ksnc_rx_niov--; | |
5fd88337 | 291 | } while (nob); |
d7e09d03 | 292 | |
71397095 | 293 | return rc; |
d7e09d03 PT |
294 | } |
295 | ||
f9cd474f | 296 | static int |
ff13fd40 | 297 | ksocknal_recv_kiov(struct ksock_conn *conn) |
d7e09d03 | 298 | { |
97d10d0a MS |
299 | lnet_kiov_t *kiov = conn->ksnc_rx_kiov; |
300 | int nob; | |
301 | int rc; | |
50ffcb7e | 302 | |
97d10d0a | 303 | LASSERT(conn->ksnc_rx_nkiov > 0); |
d7e09d03 | 304 | |
4420cfd3 JS |
305 | /* |
306 | * Never touch conn->ksnc_rx_kiov or change connection | |
307 | * status inside ksocknal_lib_recv_iov | |
308 | */ | |
d7e09d03 PT |
309 | rc = ksocknal_lib_recv_kiov(conn); |
310 | ||
311 | if (rc <= 0) | |
71397095 | 312 | return rc; |
d7e09d03 PT |
313 | |
314 | /* received something... */ | |
315 | nob = rc; | |
316 | ||
317 | conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); | |
318 | conn->ksnc_rx_deadline = | |
319 | cfs_time_shift(*ksocknal_tunables.ksnd_timeout); | |
320 | mb(); /* order with setting rx_started */ | |
321 | conn->ksnc_rx_started = 1; | |
322 | ||
323 | conn->ksnc_rx_nob_wanted -= nob; | |
324 | conn->ksnc_rx_nob_left -= nob; | |
325 | ||
326 | do { | |
97d10d0a | 327 | LASSERT(conn->ksnc_rx_nkiov > 0); |
d7e09d03 | 328 | |
65ffc679 AV |
329 | if (nob < (int)kiov->bv_len) { |
330 | kiov->bv_offset += nob; | |
331 | kiov->bv_len -= nob; | |
d7e09d03 PT |
332 | return -EAGAIN; |
333 | } | |
334 | ||
65ffc679 | 335 | nob -= kiov->bv_len; |
d7e09d03 PT |
336 | conn->ksnc_rx_kiov = ++kiov; |
337 | conn->ksnc_rx_nkiov--; | |
5fd88337 | 338 | } while (nob); |
d7e09d03 PT |
339 | |
340 | return 1; | |
341 | } | |
342 | ||
f9cd474f | 343 | static int |
ff13fd40 | 344 | ksocknal_receive(struct ksock_conn *conn) |
d7e09d03 | 345 | { |
4420cfd3 JS |
346 | /* |
347 | * Return 1 on success, 0 on EOF, < 0 on error. | |
d7e09d03 | 348 | * Caller checks ksnc_rx_nob_wanted to determine |
4420cfd3 JS |
349 | * progress/completion. |
350 | */ | |
97d10d0a | 351 | int rc; |
d7e09d03 | 352 | |
5fd88337 | 353 | if (ksocknal_data.ksnd_stall_rx) { |
d3caf4d5 PT |
354 | set_current_state(TASK_UNINTERRUPTIBLE); |
355 | schedule_timeout(cfs_time_seconds(ksocknal_data.ksnd_stall_rx)); | |
d7e09d03 PT |
356 | } |
357 | ||
358 | rc = ksocknal_connsock_addref(conn); | |
5fd88337 | 359 | if (rc) { |
b31e64c4 | 360 | LASSERT(conn->ksnc_closing); |
71397095 | 361 | return -ESHUTDOWN; |
d7e09d03 PT |
362 | } |
363 | ||
364 | for (;;) { | |
5fd88337 | 365 | if (conn->ksnc_rx_niov) |
b31e64c4 | 366 | rc = ksocknal_recv_iov(conn); |
d7e09d03 | 367 | else |
b31e64c4 | 368 | rc = ksocknal_recv_kiov(conn); |
d7e09d03 PT |
369 | |
370 | if (rc <= 0) { | |
371 | /* error/EOF or partial receive */ | |
372 | if (rc == -EAGAIN) { | |
373 | rc = 1; | |
5fd88337 | 374 | } else if (!rc && conn->ksnc_rx_started) { |
d7e09d03 PT |
375 | /* EOF in the middle of a message */ |
376 | rc = -EPROTO; | |
377 | } | |
378 | break; | |
379 | } | |
380 | ||
381 | /* Completed a fragment */ | |
382 | ||
5fd88337 | 383 | if (!conn->ksnc_rx_nob_wanted) { |
d7e09d03 PT |
384 | rc = 1; |
385 | break; | |
386 | } | |
387 | } | |
388 | ||
389 | ksocknal_connsock_decref(conn); | |
0a3bdb00 | 390 | return rc; |
d7e09d03 PT |
391 | } |
392 | ||
393 | void | |
ff13fd40 | 394 | ksocknal_tx_done(lnet_ni_t *ni, struct ksock_tx *tx) |
d7e09d03 | 395 | { |
97d10d0a | 396 | lnet_msg_t *lnetmsg = tx->tx_lnetmsg; |
5fd88337 | 397 | int rc = (!tx->tx_resid && !tx->tx_zc_aborted) ? 0 : -EIO; |
d7e09d03 | 398 | |
06ace26e | 399 | LASSERT(ni || tx->tx_conn); |
d7e09d03 | 400 | |
06ace26e | 401 | if (tx->tx_conn) |
d7e09d03 PT |
402 | ksocknal_conn_decref(tx->tx_conn); |
403 | ||
06ace26e | 404 | if (!ni && tx->tx_conn) |
d7e09d03 PT |
405 | ni = tx->tx_conn->ksnc_peer->ksnp_ni; |
406 | ||
b31e64c4 | 407 | ksocknal_free_tx(tx); |
06ace26e | 408 | if (lnetmsg) /* KSOCK_MSG_NOOP go without lnetmsg */ |
b31e64c4 | 409 | lnet_finalize(ni, lnetmsg, rc); |
d7e09d03 PT |
410 | } |
411 | ||
412 | void | |
b31e64c4 | 413 | ksocknal_txlist_done(lnet_ni_t *ni, struct list_head *txlist, int error) |
d7e09d03 | 414 | { |
ff13fd40 | 415 | struct ksock_tx *tx; |
d7e09d03 | 416 | |
b31e64c4 | 417 | while (!list_empty(txlist)) { |
ff13fd40 | 418 | tx = list_entry(txlist->next, struct ksock_tx, tx_list); |
d7e09d03 | 419 | |
06ace26e | 420 | if (error && tx->tx_lnetmsg) { |
d7e09d03 | 421 | CNETERR("Deleting packet type %d len %d %s->%s\n", |
b31e64c4 JS |
422 | le32_to_cpu(tx->tx_lnetmsg->msg_hdr.type), |
423 | le32_to_cpu(tx->tx_lnetmsg->msg_hdr.payload_length), | |
d7e09d03 PT |
424 | libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.src_nid)), |
425 | libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.dest_nid))); | |
426 | } else if (error) { | |
427 | CNETERR("Deleting noop packet\n"); | |
428 | } | |
429 | ||
97d10d0a | 430 | list_del(&tx->tx_list); |
d7e09d03 | 431 | |
97d10d0a MS |
432 | LASSERT(atomic_read(&tx->tx_refcount) == 1); |
433 | ksocknal_tx_done(ni, tx); | |
d7e09d03 PT |
434 | } |
435 | } | |
436 | ||
437 | static void | |
ff13fd40 | 438 | ksocknal_check_zc_req(struct ksock_tx *tx) |
d7e09d03 | 439 | { |
ff13fd40 JS |
440 | struct ksock_conn *conn = tx->tx_conn; |
441 | struct ksock_peer *peer = conn->ksnc_peer; | |
d7e09d03 | 442 | |
4420cfd3 JS |
443 | /* |
444 | * Set tx_msg.ksm_zc_cookies[0] to a unique non-zero cookie and add tx | |
d7e09d03 PT |
445 | * to ksnp_zc_req_list if some fragment of this message should be sent |
446 | * zero-copy. Our peer will send an ACK containing this cookie when | |
447 | * she has received this message to tell us we can signal completion. | |
448 | * tx_msg.ksm_zc_cookies[0] remains non-zero while tx is on | |
4420cfd3 JS |
449 | * ksnp_zc_req_list. |
450 | */ | |
97d10d0a MS |
451 | LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); |
452 | LASSERT(tx->tx_zc_capable); | |
d7e09d03 PT |
453 | |
454 | tx->tx_zc_checked = 1; | |
455 | ||
456 | if (conn->ksnc_proto == &ksocknal_protocol_v1x || | |
457 | !conn->ksnc_zc_capable) | |
458 | return; | |
459 | ||
4420cfd3 JS |
460 | /* |
461 | * assign cookie and queue tx to pending list, it will be released when | |
462 | * a matching ack is received. See ksocknal_handle_zcack() | |
463 | */ | |
d7e09d03 PT |
464 | ksocknal_tx_addref(tx); |
465 | ||
466 | spin_lock(&peer->ksnp_lock); | |
467 | ||
468 | /* ZC_REQ is going to be pinned to the peer */ | |
469 | tx->tx_deadline = | |
470 | cfs_time_shift(*ksocknal_tunables.ksnd_timeout); | |
471 | ||
5fd88337 | 472 | LASSERT(!tx->tx_msg.ksm_zc_cookies[0]); |
d7e09d03 PT |
473 | |
474 | tx->tx_msg.ksm_zc_cookies[0] = peer->ksnp_zc_next_cookie++; | |
475 | ||
5fd88337 | 476 | if (!peer->ksnp_zc_next_cookie) |
d7e09d03 PT |
477 | peer->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1; |
478 | ||
479 | list_add_tail(&tx->tx_zc_list, &peer->ksnp_zc_req_list); | |
480 | ||
481 | spin_unlock(&peer->ksnp_lock); | |
482 | } | |
483 | ||
484 | static void | |
ff13fd40 | 485 | ksocknal_uncheck_zc_req(struct ksock_tx *tx) |
d7e09d03 | 486 | { |
ff13fd40 | 487 | struct ksock_peer *peer = tx->tx_conn->ksnc_peer; |
d7e09d03 PT |
488 | |
489 | LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); | |
490 | LASSERT(tx->tx_zc_capable); | |
491 | ||
492 | tx->tx_zc_checked = 0; | |
493 | ||
494 | spin_lock(&peer->ksnp_lock); | |
495 | ||
5fd88337 | 496 | if (!tx->tx_msg.ksm_zc_cookies[0]) { |
d7e09d03 PT |
497 | /* Not waiting for an ACK */ |
498 | spin_unlock(&peer->ksnp_lock); | |
499 | return; | |
500 | } | |
501 | ||
502 | tx->tx_msg.ksm_zc_cookies[0] = 0; | |
503 | list_del(&tx->tx_zc_list); | |
504 | ||
505 | spin_unlock(&peer->ksnp_lock); | |
506 | ||
507 | ksocknal_tx_decref(tx); | |
508 | } | |
509 | ||
f9cd474f | 510 | static int |
ff13fd40 | 511 | ksocknal_process_transmit(struct ksock_conn *conn, struct ksock_tx *tx) |
d7e09d03 | 512 | { |
97d10d0a | 513 | int rc; |
d7e09d03 PT |
514 | |
515 | if (tx->tx_zc_capable && !tx->tx_zc_checked) | |
516 | ksocknal_check_zc_req(tx); | |
517 | ||
b31e64c4 | 518 | rc = ksocknal_transmit(conn, tx); |
d7e09d03 | 519 | |
97d10d0a | 520 | CDEBUG(D_NET, "send(%d) %d\n", tx->tx_resid, rc); |
d7e09d03 | 521 | |
5fd88337 | 522 | if (!tx->tx_resid) { |
d7e09d03 | 523 | /* Sent everything OK */ |
5fd88337 | 524 | LASSERT(!rc); |
d7e09d03 | 525 | |
71397095 | 526 | return 0; |
d7e09d03 PT |
527 | } |
528 | ||
529 | if (rc == -EAGAIN) | |
71397095 | 530 | return rc; |
d7e09d03 PT |
531 | |
532 | if (rc == -ENOMEM) { | |
533 | static int counter; | |
534 | ||
535 | counter++; /* exponential backoff warnings */ | |
536 | if ((counter & (-counter)) == counter) | |
323b0b2c | 537 | CWARN("%u ENOMEM tx %p\n", counter, conn); |
d7e09d03 PT |
538 | |
539 | /* Queue on ksnd_enomem_conns for retry after a timeout */ | |
540 | spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); | |
541 | ||
542 | /* enomem list takes over scheduler's ref... */ | |
b31e64c4 | 543 | LASSERT(conn->ksnc_tx_scheduled); |
d7e09d03 | 544 | list_add_tail(&conn->ksnc_tx_list, |
c314c319 | 545 | &ksocknal_data.ksnd_enomem_conns); |
d7e09d03 PT |
546 | if (!cfs_time_aftereq(cfs_time_add(cfs_time_current(), |
547 | SOCKNAL_ENOMEM_RETRY), | |
548 | ksocknal_data.ksnd_reaper_waketime)) | |
b31e64c4 | 549 | wake_up(&ksocknal_data.ksnd_reaper_waitq); |
d7e09d03 PT |
550 | |
551 | spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); | |
71397095 | 552 | return rc; |
d7e09d03 PT |
553 | } |
554 | ||
555 | /* Actual error */ | |
97d10d0a | 556 | LASSERT(rc < 0); |
d7e09d03 PT |
557 | |
558 | if (!conn->ksnc_closing) { | |
559 | switch (rc) { | |
560 | case -ECONNRESET: | |
2d00bd17 | 561 | LCONSOLE_WARN("Host %pI4h reset our connection while we were sending data; it may have rebooted.\n", |
5e8f6920 | 562 | &conn->ksnc_ipaddr); |
d7e09d03 PT |
563 | break; |
564 | default: | |
2d00bd17 | 565 | LCONSOLE_WARN("There was an unexpected network error while writing to %pI4h: %d.\n", |
5e8f6920 | 566 | &conn->ksnc_ipaddr, rc); |
d7e09d03 PT |
567 | break; |
568 | } | |
2d00bd17 JP |
569 | CDEBUG(D_NET, "[%p] Error %d on write to %s ip %pI4h:%d\n", |
570 | conn, rc, | |
d7e09d03 | 571 | libcfs_id2str(conn->ksnc_peer->ksnp_id), |
5e8f6920 | 572 | &conn->ksnc_ipaddr, |
d7e09d03 PT |
573 | conn->ksnc_port); |
574 | } | |
575 | ||
576 | if (tx->tx_zc_checked) | |
577 | ksocknal_uncheck_zc_req(tx); | |
578 | ||
579 | /* it's not an error if conn is being closed */ | |
b31e64c4 | 580 | ksocknal_close_conn_and_siblings(conn, (conn->ksnc_closing) ? 0 : rc); |
d7e09d03 | 581 | |
71397095 | 582 | return rc; |
d7e09d03 PT |
583 | } |
584 | ||
f9cd474f | 585 | static void |
ff13fd40 | 586 | ksocknal_launch_connection_locked(struct ksock_route *route) |
d7e09d03 | 587 | { |
d7e09d03 PT |
588 | /* called holding write lock on ksnd_global_lock */ |
589 | ||
97d10d0a MS |
590 | LASSERT(!route->ksnr_scheduled); |
591 | LASSERT(!route->ksnr_connecting); | |
5fd88337 | 592 | LASSERT(ksocknal_route_mask() & ~route->ksnr_connected); |
d7e09d03 PT |
593 | |
594 | route->ksnr_scheduled = 1; /* scheduling conn for connd */ | |
595 | ksocknal_route_addref(route); /* extra ref for connd */ | |
596 | ||
597 | spin_lock_bh(&ksocknal_data.ksnd_connd_lock); | |
598 | ||
599 | list_add_tail(&route->ksnr_connd_list, | |
c314c319 | 600 | &ksocknal_data.ksnd_connd_routes); |
d7e09d03 PT |
601 | wake_up(&ksocknal_data.ksnd_connd_waitq); |
602 | ||
603 | spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); | |
604 | } | |
605 | ||
/*
 * Launch a connection attempt on every route of @peer that
 * ksocknal_find_connectable_route_locked() currently reports as
 * connectable.
 *
 * Called holding write lock on ksnd_global_lock.
 */
void
ksocknal_launch_all_connections_locked(struct ksock_peer *peer)
{
	struct ksock_route *next;

	/* launch any/all connections that need it */
	while ((next = ksocknal_find_connectable_route_locked(peer)))
		ksocknal_launch_connection_locked(next);
}
621 | ||
ff13fd40 JS |
622 | struct ksock_conn * |
623 | ksocknal_find_conn_locked(struct ksock_peer *peer, struct ksock_tx *tx, int nonblk) | |
d7e09d03 | 624 | { |
97d10d0a | 625 | struct list_head *tmp; |
ff13fd40 JS |
626 | struct ksock_conn *conn; |
627 | struct ksock_conn *typed = NULL; | |
628 | struct ksock_conn *fallback = NULL; | |
97d10d0a MS |
629 | int tnob = 0; |
630 | int fnob = 0; | |
d7e09d03 | 631 | |
b31e64c4 | 632 | list_for_each(tmp, &peer->ksnp_conns) { |
ff13fd40 | 633 | struct ksock_conn *c = list_entry(tmp, struct ksock_conn, ksnc_list); |
97d10d0a | 634 | int nob = atomic_read(&c->ksnc_tx_nob) + |
b1ff8901 | 635 | c->ksnc_sock->sk->sk_wmem_queued; |
97d10d0a | 636 | int rc; |
d7e09d03 | 637 | |
97d10d0a | 638 | LASSERT(!c->ksnc_closing); |
06ace26e JS |
639 | LASSERT(c->ksnc_proto && |
640 | c->ksnc_proto->pro_match_tx); | |
d7e09d03 PT |
641 | |
642 | rc = c->ksnc_proto->pro_match_tx(c, tx, nonblk); | |
643 | ||
644 | switch (rc) { | |
645 | default: | |
646 | LBUG(); | |
647 | case SOCKNAL_MATCH_NO: /* protocol rejected the tx */ | |
648 | continue; | |
649 | ||
650 | case SOCKNAL_MATCH_YES: /* typed connection */ | |
06ace26e | 651 | if (!typed || tnob > nob || |
d7e09d03 PT |
652 | (tnob == nob && *ksocknal_tunables.ksnd_round_robin && |
653 | cfs_time_after(typed->ksnc_tx_last_post, c->ksnc_tx_last_post))) { | |
654 | typed = c; | |
655 | tnob = nob; | |
656 | } | |
657 | break; | |
658 | ||
659 | case SOCKNAL_MATCH_MAY: /* fallback connection */ | |
06ace26e | 660 | if (!fallback || fnob > nob || |
d7e09d03 PT |
661 | (fnob == nob && *ksocknal_tunables.ksnd_round_robin && |
662 | cfs_time_after(fallback->ksnc_tx_last_post, c->ksnc_tx_last_post))) { | |
663 | fallback = c; | |
97d10d0a | 664 | fnob = nob; |
d7e09d03 PT |
665 | } |
666 | break; | |
667 | } | |
668 | } | |
669 | ||
670 | /* prefer the typed selection */ | |
06ace26e | 671 | conn = (typed) ? typed : fallback; |
d7e09d03 | 672 | |
06ace26e | 673 | if (conn) |
d7e09d03 PT |
674 | conn->ksnc_tx_last_post = cfs_time_current(); |
675 | ||
676 | return conn; | |
677 | } | |
678 | ||
679 | void | |
ff13fd40 | 680 | ksocknal_tx_prep(struct ksock_conn *conn, struct ksock_tx *tx) |
d7e09d03 PT |
681 | { |
682 | conn->ksnc_proto->pro_pack(tx); | |
683 | ||
b31e64c4 | 684 | atomic_add(tx->tx_nob, &conn->ksnc_tx_nob); |
d7e09d03 PT |
685 | ksocknal_conn_addref(conn); /* +1 ref for tx */ |
686 | tx->tx_conn = conn; | |
687 | } | |
688 | ||
689 | void | |
ff13fd40 | 690 | ksocknal_queue_tx_locked(struct ksock_tx *tx, struct ksock_conn *conn) |
d7e09d03 | 691 | { |
ff13fd40 | 692 | struct ksock_sched *sched = conn->ksnc_scheduler; |
97d10d0a | 693 | ksock_msg_t *msg = &tx->tx_msg; |
ff13fd40 | 694 | struct ksock_tx *ztx = NULL; |
97d10d0a | 695 | int bufnob = 0; |
d7e09d03 | 696 | |
4420cfd3 JS |
697 | /* |
698 | * called holding global lock (read or irq-write) and caller may | |
d7e09d03 PT |
699 | * not have dropped this lock between finding conn and calling me, |
700 | * so we don't need the {get,put}connsock dance to deref | |
4420cfd3 JS |
701 | * ksnc_sock... |
702 | */ | |
d7e09d03 PT |
703 | LASSERT(!conn->ksnc_closing); |
704 | ||
5e8f6920 | 705 | CDEBUG(D_NET, "Sending to %s ip %pI4h:%d\n", |
c314c319 JS |
706 | libcfs_id2str(conn->ksnc_peer->ksnp_id), |
707 | &conn->ksnc_ipaddr, conn->ksnc_port); | |
d7e09d03 PT |
708 | |
709 | ksocknal_tx_prep(conn, tx); | |
710 | ||
4420cfd3 JS |
711 | /* |
712 | * Ensure the frags we've been given EXACTLY match the number of | |
d7e09d03 PT |
713 | * bytes we want to send. Many TCP/IP stacks disregard any total |
714 | * size parameters passed to them and just look at the frags. | |
715 | * | |
716 | * We always expect at least 1 mapped fragment containing the | |
4420cfd3 JS |
717 | * complete ksocknal message header. |
718 | */ | |
b31e64c4 | 719 | LASSERT(lnet_iov_nob(tx->tx_niov, tx->tx_iov) + |
97d10d0a MS |
720 | lnet_kiov_nob(tx->tx_nkiov, tx->tx_kiov) == |
721 | (unsigned int)tx->tx_nob); | |
722 | LASSERT(tx->tx_niov >= 1); | |
723 | LASSERT(tx->tx_resid == tx->tx_nob); | |
d7e09d03 | 724 | |
b31e64c4 | 725 | CDEBUG(D_NET, "Packet %p type %d, nob %d niov %d nkiov %d\n", |
06ace26e | 726 | tx, (tx->tx_lnetmsg) ? tx->tx_lnetmsg->msg_hdr.type : |
b31e64c4 JS |
727 | KSOCK_MSG_NOOP, |
728 | tx->tx_nob, tx->tx_niov, tx->tx_nkiov); | |
d7e09d03 PT |
729 | |
730 | /* | |
731 | * FIXME: SOCK_WMEM_QUEUED and SOCK_ERROR could block in __DARWIN8__ | |
732 | * but they're used inside spinlocks a lot. | |
733 | */ | |
fb4a1539 | 734 | bufnob = conn->ksnc_sock->sk->sk_wmem_queued; |
d7e09d03 PT |
735 | spin_lock_bh(&sched->kss_lock); |
736 | ||
5fd88337 | 737 | if (list_empty(&conn->ksnc_tx_queue) && !bufnob) { |
d7e09d03 PT |
738 | /* First packet starts the timeout */ |
739 | conn->ksnc_tx_deadline = | |
740 | cfs_time_shift(*ksocknal_tunables.ksnd_timeout); | |
741 | if (conn->ksnc_tx_bufnob > 0) /* something got ACKed */ | |
742 | conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); | |
743 | conn->ksnc_tx_bufnob = 0; | |
744 | mb(); /* order with adding to tx_queue */ | |
745 | } | |
746 | ||
747 | if (msg->ksm_type == KSOCK_MSG_NOOP) { | |
4420cfd3 JS |
748 | /* |
749 | * The packet is noop ZC ACK, try to piggyback the ack_cookie | |
750 | * on a normal packet so I don't need to send it | |
751 | */ | |
5fd88337 | 752 | LASSERT(msg->ksm_zc_cookies[1]); |
06ace26e | 753 | LASSERT(conn->ksnc_proto->pro_queue_tx_zcack); |
d7e09d03 PT |
754 | |
755 | if (conn->ksnc_proto->pro_queue_tx_zcack(conn, tx, 0)) | |
756 | ztx = tx; /* ZC ACK piggybacked on ztx release tx later */ | |
757 | ||
758 | } else { | |
4420cfd3 JS |
759 | /* |
760 | * It's a normal packet - can it piggback a noop zc-ack that | |
761 | * has been queued already? | |
762 | */ | |
5fd88337 | 763 | LASSERT(!msg->ksm_zc_cookies[1]); |
06ace26e | 764 | LASSERT(conn->ksnc_proto->pro_queue_tx_msg); |
d7e09d03 PT |
765 | |
766 | ztx = conn->ksnc_proto->pro_queue_tx_msg(conn, tx); | |
767 | /* ztx will be released later */ | |
768 | } | |
769 | ||
06ace26e | 770 | if (ztx) { |
b31e64c4 | 771 | atomic_sub(ztx->tx_nob, &conn->ksnc_tx_nob); |
d7e09d03 PT |
772 | list_add_tail(&ztx->tx_list, &sched->kss_zombie_noop_txs); |
773 | } | |
774 | ||
775 | if (conn->ksnc_tx_ready && /* able to send */ | |
776 | !conn->ksnc_tx_scheduled) { /* not scheduled to send */ | |
777 | /* +1 ref for scheduler */ | |
778 | ksocknal_conn_addref(conn); | |
c314c319 | 779 | list_add_tail(&conn->ksnc_tx_list, &sched->kss_tx_conns); |
d7e09d03 | 780 | conn->ksnc_tx_scheduled = 1; |
b31e64c4 | 781 | wake_up(&sched->kss_waitq); |
d7e09d03 PT |
782 | } |
783 | ||
784 | spin_unlock_bh(&sched->kss_lock); | |
785 | } | |
786 | ||
ff13fd40 JS |
787 | struct ksock_route * |
788 | ksocknal_find_connectable_route_locked(struct ksock_peer *peer) | |
d7e09d03 | 789 | { |
97d10d0a MS |
790 | unsigned long now = cfs_time_current(); |
791 | struct list_head *tmp; | |
ff13fd40 | 792 | struct ksock_route *route; |
d7e09d03 | 793 | |
b31e64c4 | 794 | list_for_each(tmp, &peer->ksnp_routes) { |
ff13fd40 | 795 | route = list_entry(tmp, struct ksock_route, ksnr_list); |
d7e09d03 | 796 | |
97d10d0a | 797 | LASSERT(!route->ksnr_connecting || route->ksnr_scheduled); |
d7e09d03 PT |
798 | |
799 | if (route->ksnr_scheduled) /* connections being established */ | |
800 | continue; | |
801 | ||
802 | /* all route types connected ? */ | |
5fd88337 | 803 | if (!(ksocknal_route_mask() & ~route->ksnr_connected)) |
d7e09d03 PT |
804 | continue; |
805 | ||
5fd88337 | 806 | if (!(!route->ksnr_retry_interval || /* first attempt */ |
d7e09d03 PT |
807 | cfs_time_aftereq(now, route->ksnr_timeout))) { |
808 | CDEBUG(D_NET, | |
2d00bd17 | 809 | "Too soon to retry route %pI4h (cnted %d, interval %ld, %ld secs later)\n", |
5e8f6920 | 810 | &route->ksnr_ipaddr, |
d7e09d03 PT |
811 | route->ksnr_connected, |
812 | route->ksnr_retry_interval, | |
813 | cfs_duration_sec(route->ksnr_timeout - now)); | |
814 | continue; | |
815 | } | |
816 | ||
71397095 | 817 | return route; |
d7e09d03 PT |
818 | } |
819 | ||
71397095 | 820 | return NULL; |
d7e09d03 PT |
821 | } |
822 | ||
ff13fd40 JS |
823 | struct ksock_route * |
824 | ksocknal_find_connecting_route_locked(struct ksock_peer *peer) | |
d7e09d03 | 825 | { |
97d10d0a | 826 | struct list_head *tmp; |
ff13fd40 | 827 | struct ksock_route *route; |
d7e09d03 | 828 | |
b31e64c4 | 829 | list_for_each(tmp, &peer->ksnp_routes) { |
ff13fd40 | 830 | route = list_entry(tmp, struct ksock_route, ksnr_list); |
d7e09d03 | 831 | |
97d10d0a | 832 | LASSERT(!route->ksnr_connecting || route->ksnr_scheduled); |
d7e09d03 PT |
833 | |
834 | if (route->ksnr_scheduled) | |
71397095 | 835 | return route; |
d7e09d03 PT |
836 | } |
837 | ||
71397095 | 838 | return NULL; |
d7e09d03 PT |
839 | } |
840 | ||
841 | int | |
ff13fd40 | 842 | ksocknal_launch_packet(lnet_ni_t *ni, struct ksock_tx *tx, lnet_process_id_t id) |
d7e09d03 | 843 | { |
ff13fd40 JS |
844 | struct ksock_peer *peer; |
845 | struct ksock_conn *conn; | |
97d10d0a MS |
846 | rwlock_t *g_lock; |
847 | int retry; | |
848 | int rc; | |
d7e09d03 | 849 | |
06ace26e | 850 | LASSERT(!tx->tx_conn); |
d7e09d03 PT |
851 | |
852 | g_lock = &ksocknal_data.ksnd_global_lock; | |
853 | ||
854 | for (retry = 0;; retry = 1) { | |
855 | read_lock(g_lock); | |
856 | peer = ksocknal_find_peer_locked(ni, id); | |
06ace26e JS |
857 | if (peer) { |
858 | if (!ksocknal_find_connectable_route_locked(peer)) { | |
d7e09d03 | 859 | conn = ksocknal_find_conn_locked(peer, tx, tx->tx_nonblk); |
06ace26e | 860 | if (conn) { |
4420cfd3 JS |
861 | /* |
862 | * I've got no routes that need to be | |
d7e09d03 | 863 | * connecting and I do have an actual |
4420cfd3 JS |
864 | * connection... |
865 | */ | |
b31e64c4 | 866 | ksocknal_queue_tx_locked(tx, conn); |
d7e09d03 | 867 | read_unlock(g_lock); |
71397095 | 868 | return 0; |
d7e09d03 PT |
869 | } |
870 | } | |
871 | } | |
872 | ||
873 | /* I'll need a write lock... */ | |
874 | read_unlock(g_lock); | |
875 | ||
876 | write_lock_bh(g_lock); | |
877 | ||
878 | peer = ksocknal_find_peer_locked(ni, id); | |
06ace26e | 879 | if (peer) |
d7e09d03 PT |
880 | break; |
881 | ||
882 | write_unlock_bh(g_lock); | |
883 | ||
5fd88337 | 884 | if (id.pid & LNET_PID_USERFLAG) { |
2d00bd17 JP |
885 | CERROR("Refusing to create a connection to userspace process %s\n", |
886 | libcfs_id2str(id)); | |
d7e09d03 PT |
887 | return -EHOSTUNREACH; |
888 | } | |
889 | ||
890 | if (retry) { | |
891 | CERROR("Can't find peer %s\n", libcfs_id2str(id)); | |
892 | return -EHOSTUNREACH; | |
893 | } | |
894 | ||
895 | rc = ksocknal_add_peer(ni, id, | |
896 | LNET_NIDADDR(id.nid), | |
897 | lnet_acceptor_port()); | |
5fd88337 | 898 | if (rc) { |
d7e09d03 PT |
899 | CERROR("Can't add peer %s: %d\n", |
900 | libcfs_id2str(id), rc); | |
901 | return rc; | |
902 | } | |
903 | } | |
904 | ||
905 | ksocknal_launch_all_connections_locked(peer); | |
906 | ||
907 | conn = ksocknal_find_conn_locked(peer, tx, tx->tx_nonblk); | |
06ace26e | 908 | if (conn) { |
d7e09d03 | 909 | /* Connection exists; queue message on it */ |
b31e64c4 | 910 | ksocknal_queue_tx_locked(tx, conn); |
d7e09d03 | 911 | write_unlock_bh(g_lock); |
71397095 | 912 | return 0; |
d7e09d03 PT |
913 | } |
914 | ||
915 | if (peer->ksnp_accepting > 0 || | |
06ace26e | 916 | ksocknal_find_connecting_route_locked(peer)) { |
d7e09d03 PT |
917 | /* the message is going to be pinned to the peer */ |
918 | tx->tx_deadline = | |
919 | cfs_time_shift(*ksocknal_tunables.ksnd_timeout); | |
920 | ||
921 | /* Queue the message until a connection is established */ | |
b31e64c4 | 922 | list_add_tail(&tx->tx_list, &peer->ksnp_tx_queue); |
d7e09d03 PT |
923 | write_unlock_bh(g_lock); |
924 | return 0; | |
925 | } | |
926 | ||
927 | write_unlock_bh(g_lock); | |
928 | ||
929 | /* NB Routes may be ignored if connections to them failed recently */ | |
930 | CNETERR("No usable routes to %s\n", libcfs_id2str(id)); | |
71397095 | 931 | return -EHOSTUNREACH; |
d7e09d03 PT |
932 | } |
933 | ||
934 | int | |
935 | ksocknal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) | |
936 | { | |
97d10d0a MS |
937 | int mpflag = 1; |
938 | int type = lntmsg->msg_type; | |
d7e09d03 | 939 | lnet_process_id_t target = lntmsg->msg_target; |
97d10d0a MS |
940 | unsigned int payload_niov = lntmsg->msg_niov; |
941 | struct kvec *payload_iov = lntmsg->msg_iov; | |
942 | lnet_kiov_t *payload_kiov = lntmsg->msg_kiov; | |
943 | unsigned int payload_offset = lntmsg->msg_offset; | |
944 | unsigned int payload_nob = lntmsg->msg_len; | |
ff13fd40 | 945 | struct ksock_tx *tx; |
97d10d0a MS |
946 | int desc_size; |
947 | int rc; | |
d7e09d03 | 948 | |
4420cfd3 JS |
949 | /* |
950 | * NB 'private' is different depending on what we're sending. | |
951 | * Just ignore it... | |
952 | */ | |
d7e09d03 PT |
953 | CDEBUG(D_NET, "sending %u bytes in %d frags to %s\n", |
954 | payload_nob, payload_niov, libcfs_id2str(target)); | |
955 | ||
5fd88337 | 956 | LASSERT(!payload_nob || payload_niov > 0); |
97d10d0a | 957 | LASSERT(payload_niov <= LNET_MAX_IOV); |
d7e09d03 | 958 | /* payload is either all vaddrs or all pages */ |
06ace26e | 959 | LASSERT(!(payload_kiov && payload_iov)); |
b31e64c4 | 960 | LASSERT(!in_interrupt()); |
d7e09d03 | 961 | |
06ace26e | 962 | if (payload_iov) |
ff13fd40 | 963 | desc_size = offsetof(struct ksock_tx, |
d7e09d03 PT |
964 | tx_frags.virt.iov[1 + payload_niov]); |
965 | else | |
ff13fd40 | 966 | desc_size = offsetof(struct ksock_tx, |
d7e09d03 PT |
967 | tx_frags.paged.kiov[payload_niov]); |
968 | ||
969 | if (lntmsg->msg_vmflush) | |
970 | mpflag = cfs_memory_pressure_get_and_set(); | |
971 | tx = ksocknal_alloc_tx(KSOCK_MSG_LNET, desc_size); | |
06ace26e | 972 | if (!tx) { |
d7e09d03 PT |
973 | CERROR("Can't allocate tx desc type %d size %d\n", |
974 | type, desc_size); | |
975 | if (lntmsg->msg_vmflush) | |
976 | cfs_memory_pressure_restore(mpflag); | |
71397095 | 977 | return -ENOMEM; |
d7e09d03 PT |
978 | } |
979 | ||
980 | tx->tx_conn = NULL; /* set when assigned a conn */ | |
981 | tx->tx_lnetmsg = lntmsg; | |
982 | ||
06ace26e | 983 | if (payload_iov) { |
d7e09d03 PT |
984 | tx->tx_kiov = NULL; |
985 | tx->tx_nkiov = 0; | |
986 | tx->tx_iov = tx->tx_frags.virt.iov; | |
987 | tx->tx_niov = 1 + | |
988 | lnet_extract_iov(payload_niov, &tx->tx_iov[1], | |
989 | payload_niov, payload_iov, | |
990 | payload_offset, payload_nob); | |
991 | } else { | |
992 | tx->tx_niov = 1; | |
993 | tx->tx_iov = &tx->tx_frags.paged.iov; | |
994 | tx->tx_kiov = tx->tx_frags.paged.kiov; | |
995 | tx->tx_nkiov = lnet_extract_kiov(payload_niov, tx->tx_kiov, | |
996 | payload_niov, payload_kiov, | |
997 | payload_offset, payload_nob); | |
998 | ||
999 | if (payload_nob >= *ksocknal_tunables.ksnd_zc_min_payload) | |
1000 | tx->tx_zc_capable = 1; | |
1001 | } | |
1002 | ||
1003 | socklnd_init_msg(&tx->tx_msg, KSOCK_MSG_LNET); | |
1004 | ||
1005 | /* The first fragment will be set later in pro_pack */ | |
1006 | rc = ksocknal_launch_packet(ni, tx, target); | |
aadbacc7 | 1007 | if (!mpflag) |
d7e09d03 | 1008 | cfs_memory_pressure_restore(mpflag); |
aadbacc7 | 1009 | |
5fd88337 | 1010 | if (!rc) |
71397095 | 1011 | return 0; |
d7e09d03 PT |
1012 | |
1013 | ksocknal_free_tx(tx); | |
71397095 | 1014 | return -EIO; |
d7e09d03 PT |
1015 | } |
1016 | ||
1017 | int | |
1018 | ksocknal_thread_start(int (*fn)(void *arg), void *arg, char *name) | |
1019 | { | |
9edf0f67 | 1020 | struct task_struct *task = kthread_run(fn, arg, "%s", name); |
d7e09d03 PT |
1021 | |
1022 | if (IS_ERR(task)) | |
1023 | return PTR_ERR(task); | |
1024 | ||
1025 | write_lock_bh(&ksocknal_data.ksnd_global_lock); | |
1026 | ksocknal_data.ksnd_nthreads++; | |
1027 | write_unlock_bh(&ksocknal_data.ksnd_global_lock); | |
1028 | return 0; | |
1029 | } | |
1030 | ||
1031 | void | |
b31e64c4 | 1032 | ksocknal_thread_fini(void) |
d7e09d03 PT |
1033 | { |
1034 | write_lock_bh(&ksocknal_data.ksnd_global_lock); | |
1035 | ksocknal_data.ksnd_nthreads--; | |
1036 | write_unlock_bh(&ksocknal_data.ksnd_global_lock); | |
1037 | } | |
1038 | ||
1039 | int | |
ff13fd40 | 1040 | ksocknal_new_packet(struct ksock_conn *conn, int nob_to_skip) |
d7e09d03 PT |
1041 | { |
1042 | static char ksocknal_slop_buffer[4096]; | |
1043 | ||
97d10d0a MS |
1044 | int nob; |
1045 | unsigned int niov; | |
1046 | int skipped; | |
d7e09d03 | 1047 | |
06ace26e | 1048 | LASSERT(conn->ksnc_proto); |
d7e09d03 | 1049 | |
5fd88337 | 1050 | if (*ksocknal_tunables.ksnd_eager_ack & conn->ksnc_type) { |
d7e09d03 PT |
1051 | /* Remind the socket to ack eagerly... */ |
1052 | ksocknal_lib_eager_ack(conn); | |
1053 | } | |
1054 | ||
5fd88337 | 1055 | if (!nob_to_skip) { /* right at next packet boundary now */ |
d7e09d03 PT |
1056 | conn->ksnc_rx_started = 0; |
1057 | mb(); /* racing with timeout thread */ | |
1058 | ||
1059 | switch (conn->ksnc_proto->pro_version) { | |
1060 | case KSOCK_PROTO_V2: | |
1061 | case KSOCK_PROTO_V3: | |
1062 | conn->ksnc_rx_state = SOCKNAL_RX_KSM_HEADER; | |
f351bad2 AV |
1063 | conn->ksnc_rx_iov = (struct kvec *)&conn->ksnc_rx_iov_space; |
1064 | conn->ksnc_rx_iov[0].iov_base = &conn->ksnc_msg; | |
d7e09d03 PT |
1065 | |
1066 | conn->ksnc_rx_nob_wanted = offsetof(ksock_msg_t, ksm_u); | |
1067 | conn->ksnc_rx_nob_left = offsetof(ksock_msg_t, ksm_u); | |
1068 | conn->ksnc_rx_iov[0].iov_len = offsetof(ksock_msg_t, ksm_u); | |
1069 | break; | |
1070 | ||
1071 | case KSOCK_PROTO_V1: | |
1072 | /* Receiving bare lnet_hdr_t */ | |
1073 | conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER; | |
1074 | conn->ksnc_rx_nob_wanted = sizeof(lnet_hdr_t); | |
1075 | conn->ksnc_rx_nob_left = sizeof(lnet_hdr_t); | |
1076 | ||
f351bad2 AV |
1077 | conn->ksnc_rx_iov = (struct kvec *)&conn->ksnc_rx_iov_space; |
1078 | conn->ksnc_rx_iov[0].iov_base = &conn->ksnc_msg.ksm_u.lnetmsg; | |
b31e64c4 | 1079 | conn->ksnc_rx_iov[0].iov_len = sizeof(lnet_hdr_t); |
d7e09d03 PT |
1080 | break; |
1081 | ||
1082 | default: | |
b31e64c4 | 1083 | LBUG(); |
d7e09d03 PT |
1084 | } |
1085 | conn->ksnc_rx_niov = 1; | |
1086 | ||
1087 | conn->ksnc_rx_kiov = NULL; | |
1088 | conn->ksnc_rx_nkiov = 0; | |
1089 | conn->ksnc_rx_csum = ~0; | |
71397095 | 1090 | return 1; |
d7e09d03 PT |
1091 | } |
1092 | ||
4420cfd3 JS |
1093 | /* |
1094 | * Set up to skip as much as possible now. If there's more left | |
1095 | * (ran out of iov entries) we'll get called again | |
1096 | */ | |
d7e09d03 PT |
1097 | conn->ksnc_rx_state = SOCKNAL_RX_SLOP; |
1098 | conn->ksnc_rx_nob_left = nob_to_skip; | |
f351bad2 | 1099 | conn->ksnc_rx_iov = (struct kvec *)&conn->ksnc_rx_iov_space; |
d7e09d03 PT |
1100 | skipped = 0; |
1101 | niov = 0; | |
1102 | ||
1103 | do { | |
462ef1e0 | 1104 | nob = min_t(int, nob_to_skip, sizeof(ksocknal_slop_buffer)); |
d7e09d03 PT |
1105 | |
1106 | conn->ksnc_rx_iov[niov].iov_base = ksocknal_slop_buffer; | |
1107 | conn->ksnc_rx_iov[niov].iov_len = nob; | |
1108 | niov++; | |
1109 | skipped += nob; | |
b2952d62 | 1110 | nob_to_skip -= nob; |
d7e09d03 | 1111 | |
5fd88337 | 1112 | } while (nob_to_skip && /* mustn't overflow conn's rx iov */ |
b31e64c4 | 1113 | niov < sizeof(conn->ksnc_rx_iov_space) / sizeof(struct iovec)); |
d7e09d03 PT |
1114 | |
1115 | conn->ksnc_rx_niov = niov; | |
1116 | conn->ksnc_rx_kiov = NULL; | |
1117 | conn->ksnc_rx_nkiov = 0; | |
1118 | conn->ksnc_rx_nob_wanted = skipped; | |
71397095 | 1119 | return 0; |
d7e09d03 PT |
1120 | } |
1121 | ||
/*
 * Advance the receive state machine of conn.
 *
 * While ksnc_rx_nob_wanted is non-zero, ksocknal_receive() is called to
 * pull in more bytes; once everything wanted has arrived, the handler
 * for the current ksnc_rx_state runs and may jump back to "again" after
 * arming a further read.
 *
 * Returns 0 when a NOOP was consumed, when the connection is left in
 * SOCKNAL_RX_PARSE after lnet_parse(), or when ksocknal_new_packet()
 * reports a packet boundary; -EAGAIN on a short read; -ESHUTDOWN when
 * ksocknal_receive() returned 0 (EOF); otherwise a negative errno, in
 * which case ksocknal_close_conn_and_siblings() has been called.
 */
f9cd474f | 1122 | static int |
ff13fd40 | 1123 | ksocknal_process_receive(struct ksock_conn *conn) |
d7e09d03 | 1124 | { |
97d10d0a | 1125 | lnet_hdr_t *lhdr; |
d7e09d03 | 1126 | lnet_process_id_t *id; |
97d10d0a | 1127 | int rc; |
d7e09d03 | 1128 | |
b31e64c4 | 1129 | LASSERT(atomic_read(&conn->ksnc_conn_refcount) > 0); |
d7e09d03 PT |
1130 | |
1131 | /* NB: sched lock NOT held */ | |
2b284326 | 1132 | /* SOCKNAL_RX_LNET_HEADER is here for backward compatibility */ |
97d10d0a MS |
1133 | LASSERT(conn->ksnc_rx_state == SOCKNAL_RX_KSM_HEADER || |
1134 | conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD || | |
1135 | conn->ksnc_rx_state == SOCKNAL_RX_LNET_HEADER || | |
1136 | conn->ksnc_rx_state == SOCKNAL_RX_SLOP); | |
/* Re-entered whenever a state handler below has armed a further read. */
d7e09d03 | 1137 | again: |
5fd88337 | 1138 | if (conn->ksnc_rx_nob_wanted) { |
d7e09d03 PT |
1139 | rc = ksocknal_receive(conn); |
1140 | ||
1141 | if (rc <= 0) { | |
b31e64c4 | 1142 | LASSERT(rc != -EAGAIN); |
d7e09d03 | 1143 | |
5fd88337 | 1144 | if (!rc) |
2d00bd17 JP |
1145 | CDEBUG(D_NET, "[%p] EOF from %s ip %pI4h:%d\n", |
1146 | conn, | |
1147 | libcfs_id2str(conn->ksnc_peer->ksnp_id), | |
1148 | &conn->ksnc_ipaddr, | |
1149 | conn->ksnc_port); | |
d7e09d03 | 1150 | else if (!conn->ksnc_closing) |
2d00bd17 JP |
1151 | CERROR("[%p] Error %d on read from %s ip %pI4h:%d\n", |
1152 | conn, rc, | |
1153 | libcfs_id2str(conn->ksnc_peer->ksnp_id), | |
1154 | &conn->ksnc_ipaddr, | |
1155 | conn->ksnc_port); | |
d7e09d03 PT |
1156 | |
1157 | /* it's not an error if conn is being closed */ | |
b31e64c4 JS |
1158 | ksocknal_close_conn_and_siblings(conn, |
1159 | (conn->ksnc_closing) ? 0 : rc); | |
5fd88337 | 1160 | return (!rc ? -ESHUTDOWN : rc); |
d7e09d03 PT |
1161 | } |
1162 | ||
5fd88337 | 1163 | if (conn->ksnc_rx_nob_wanted) { |
d7e09d03 | 1164 | /* short read */ |
71397095 | 1165 | return -EAGAIN; |
d7e09d03 PT |
1166 | } |
1167 | } | |
/* Everything wanted has arrived: act on it according to the rx state. */
1168 | switch (conn->ksnc_rx_state) { | |
1169 | case SOCKNAL_RX_KSM_HEADER: | |
1170 | if (conn->ksnc_flip) { | |
1171 | __swab32s(&conn->ksnc_msg.ksm_type); | |
1172 | __swab32s(&conn->ksnc_msg.ksm_csum); | |
1173 | __swab64s(&conn->ksnc_msg.ksm_zc_cookies[0]); | |
1174 | __swab64s(&conn->ksnc_msg.ksm_zc_cookies[1]); | |
1175 | } | |
1176 | ||
1177 | if (conn->ksnc_msg.ksm_type != KSOCK_MSG_NOOP && | |
1178 | conn->ksnc_msg.ksm_type != KSOCK_MSG_LNET) { | |
1179 | CERROR("%s: Unknown message type: %x\n", | |
1180 | libcfs_id2str(conn->ksnc_peer->ksnp_id), | |
1181 | conn->ksnc_msg.ksm_type); | |
1182 | ksocknal_new_packet(conn, 0); | |
1183 | ksocknal_close_conn_and_siblings(conn, -EPROTO); | |
71397095 | 1184 | return -EPROTO; |
d7e09d03 PT |
1185 | } |
1186 | ||
1187 | if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP && | |
5fd88337 | 1188 | conn->ksnc_msg.ksm_csum && /* has checksum */ |
d7e09d03 PT |
1189 | conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) { |
1190 | /* NOOP Checksum error */ | |
1191 | CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n", | |
1192 | libcfs_id2str(conn->ksnc_peer->ksnp_id), | |
1193 | conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum); | |
1194 | ksocknal_new_packet(conn, 0); | |
1195 | ksocknal_close_conn_and_siblings(conn, -EPROTO); | |
71397095 | 1196 | return -EIO; |
d7e09d03 PT |
1197 | } |
1198 | ||
5fd88337 | 1199 | if (conn->ksnc_msg.ksm_zc_cookies[1]) { |
d7e09d03 PT |
1200 | __u64 cookie = 0; |
1201 | ||
b31e64c4 | 1202 | LASSERT(conn->ksnc_proto != &ksocknal_protocol_v1x); |
d7e09d03 PT |
1203 | |
1204 | if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP) | |
1205 | cookie = conn->ksnc_msg.ksm_zc_cookies[0]; | |
1206 | ||
1207 | rc = conn->ksnc_proto->pro_handle_zcack(conn, cookie, | |
1208 | conn->ksnc_msg.ksm_zc_cookies[1]); | |
1209 | ||
5fd88337 | 1210 | if (rc) { |
b0f5aad5 | 1211 | CERROR("%s: Unknown ZC-ACK cookie: %llu, %llu\n", |
d7e09d03 PT |
1212 | libcfs_id2str(conn->ksnc_peer->ksnp_id), |
1213 | cookie, conn->ksnc_msg.ksm_zc_cookies[1]); | |
1214 | ksocknal_new_packet(conn, 0); | |
1215 | ksocknal_close_conn_and_siblings(conn, -EPROTO); | |
71397095 | 1216 | return rc; |
d7e09d03 PT |
1217 | } |
1218 | } | |
1219 | ||
1220 | if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP) { | |
b31e64c4 | 1221 | ksocknal_new_packet(conn, 0); |
d7e09d03 PT |
1222 | return 0; /* NOOP is done and just return */ |
1223 | } | |
1224 | ||
1225 | conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER; | |
1226 | conn->ksnc_rx_nob_wanted = sizeof(ksock_lnet_msg_t); | |
1227 | conn->ksnc_rx_nob_left = sizeof(ksock_lnet_msg_t); | |
1228 | ||
f351bad2 AV |
1229 | conn->ksnc_rx_iov = (struct kvec *)&conn->ksnc_rx_iov_space; |
1230 | conn->ksnc_rx_iov[0].iov_base = &conn->ksnc_msg.ksm_u.lnetmsg; | |
d7e09d03 PT |
1231 | conn->ksnc_rx_iov[0].iov_len = sizeof(ksock_lnet_msg_t); |
1232 | ||
1233 | conn->ksnc_rx_niov = 1; | |
1234 | conn->ksnc_rx_kiov = NULL; | |
1235 | conn->ksnc_rx_nkiov = 0; | |
1236 | ||
1237 | goto again; /* read lnet header now */ | |
1238 | ||
1239 | case SOCKNAL_RX_LNET_HEADER: | |
1240 | /* unpack message header */ | |
1241 | conn->ksnc_proto->pro_unpack(&conn->ksnc_msg); | |
1242 | ||
5fd88337 | 1243 | if (conn->ksnc_peer->ksnp_id.pid & LNET_PID_USERFLAG) { |
d7e09d03 PT |
1244 | /* Userspace peer */ |
1245 | lhdr = &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr; | |
97d10d0a | 1246 | id = &conn->ksnc_peer->ksnp_id; |
d7e09d03 PT |
1247 | |
1248 | /* Substitute process ID assigned at connection time */ | |
1249 | lhdr->src_pid = cpu_to_le32(id->pid); | |
1250 | lhdr->src_nid = cpu_to_le64(id->nid); | |
1251 | } | |
1252 | ||
1253 | conn->ksnc_rx_state = SOCKNAL_RX_PARSE; | |
1254 | ksocknal_conn_addref(conn); /* ++ref while parsing */ | |
1255 | ||
1256 | rc = lnet_parse(conn->ksnc_peer->ksnp_ni, | |
1257 | &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr, | |
1258 | conn->ksnc_peer->ksnp_id.nid, conn, 0); | |
1259 | if (rc < 0) { | |
1260 | /* I just received garbage: give up on this conn */ | |
1261 | ksocknal_new_packet(conn, 0); | |
b31e64c4 | 1262 | ksocknal_close_conn_and_siblings(conn, rc); |
d7e09d03 | 1263 | ksocknal_conn_decref(conn); |
71397095 | 1264 | return -EPROTO; |
d7e09d03 PT |
1265 | } |
1266 | ||
1267 | /* I'm racing with ksocknal_recv() */ | |
b31e64c4 JS |
1268 | LASSERT(conn->ksnc_rx_state == SOCKNAL_RX_PARSE || |
1269 | conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD); | |
d7e09d03 PT |
1270 | |
/* Still SOCKNAL_RX_PARSE: ksocknal_recv() has not moved the state on yet. */
1271 | if (conn->ksnc_rx_state != SOCKNAL_RX_LNET_PAYLOAD) | |
1272 | return 0; | |
1273 | ||
1274 | /* ksocknal_recv() got called */ | |
1275 | goto again; | |
1276 | ||
1277 | case SOCKNAL_RX_LNET_PAYLOAD: | |
1278 | /* payload all received */ | |
1279 | rc = 0; | |
1280 | ||
5fd88337 JS |
1281 | if (!conn->ksnc_rx_nob_left && /* not truncating */ |
1282 | conn->ksnc_msg.ksm_csum && /* has checksum */ | |
d7e09d03 PT |
1283 | conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) { |
1284 | CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n", | |
1285 | libcfs_id2str(conn->ksnc_peer->ksnp_id), | |
1286 | conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum); | |
1287 | rc = -EIO; | |
1288 | } | |
1289 | ||
5fd88337 | 1290 | if (!rc && conn->ksnc_msg.ksm_zc_cookies[0]) { |
d7e09d03 PT |
1291 | LASSERT(conn->ksnc_proto != &ksocknal_protocol_v1x); |
1292 | ||
1293 | lhdr = &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr; | |
97d10d0a | 1294 | id = &conn->ksnc_peer->ksnp_id; |
d7e09d03 PT |
1295 | |
1296 | rc = conn->ksnc_proto->pro_handle_zcreq(conn, | |
1297 | conn->ksnc_msg.ksm_zc_cookies[0], | |
1298 | *ksocknal_tunables.ksnd_nonblk_zcack || | |
1299 | le64_to_cpu(lhdr->src_nid) != id->nid); | |
1300 | } | |
1301 | ||
1302 | lnet_finalize(conn->ksnc_peer->ksnp_ni, conn->ksnc_cookie, rc); | |
1303 | ||
5fd88337 | 1304 | if (rc) { |
d7e09d03 | 1305 | ksocknal_new_packet(conn, 0); |
b31e64c4 | 1306 | ksocknal_close_conn_and_siblings(conn, rc); |
71397095 | 1307 | return -EPROTO; |
d7e09d03 PT |
1308 | } |
1309 | /* Fall through */ | |
1310 | ||
1311 | case SOCKNAL_RX_SLOP: | |
1312 | /* starting new packet? */ | |
/* ksocknal_new_packet() returns 1 once there is nothing left to skip. */
b31e64c4 | 1313 | if (ksocknal_new_packet(conn, conn->ksnc_rx_nob_left)) |
d7e09d03 PT |
1314 | return 0; /* come back later */ |
1315 | goto again; /* try to finish reading slop now */ | |
1316 | ||
1317 | default: | |
1318 | break; | |
1319 | } | |
1320 | ||
1321 | /* Not Reached */ | |
97d10d0a | 1322 | LBUG(); |
71397095 | 1323 | return -EINVAL; /* keep gcc happy */ |
d7e09d03 PT |
1324 | } |
1325 | ||
1326 | int | |
b31e64c4 | 1327 | ksocknal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed, |
c1b7b8eb | 1328 | struct iov_iter *to, unsigned int rlen) |
d7e09d03 | 1329 | { |
ff13fd40 JS |
1330 | struct ksock_conn *conn = private; |
1331 | struct ksock_sched *sched = conn->ksnc_scheduler; | |
d7e09d03 | 1332 | |
c1b7b8eb AV |
1333 | LASSERT(iov_iter_count(to) <= rlen); |
1334 | LASSERT(to->nr_segs <= LNET_MAX_IOV); | |
d7e09d03 PT |
1335 | |
1336 | conn->ksnc_cookie = msg; | |
c1b7b8eb | 1337 | conn->ksnc_rx_nob_wanted = iov_iter_count(to); |
97d10d0a | 1338 | conn->ksnc_rx_nob_left = rlen; |
d7e09d03 | 1339 | |
c1b7b8eb | 1340 | if (to->type & ITER_KVEC) { |
d7e09d03 PT |
1341 | conn->ksnc_rx_nkiov = 0; |
1342 | conn->ksnc_rx_kiov = NULL; | |
1343 | conn->ksnc_rx_iov = conn->ksnc_rx_iov_space.iov; | |
1344 | conn->ksnc_rx_niov = | |
1345 | lnet_extract_iov(LNET_MAX_IOV, conn->ksnc_rx_iov, | |
c1b7b8eb AV |
1346 | to->nr_segs, to->kvec, |
1347 | to->iov_offset, iov_iter_count(to)); | |
d7e09d03 PT |
1348 | } else { |
1349 | conn->ksnc_rx_niov = 0; | |
97d10d0a | 1350 | conn->ksnc_rx_iov = NULL; |
d7e09d03 PT |
1351 | conn->ksnc_rx_kiov = conn->ksnc_rx_iov_space.kiov; |
1352 | conn->ksnc_rx_nkiov = | |
1353 | lnet_extract_kiov(LNET_MAX_IOV, conn->ksnc_rx_kiov, | |
c1b7b8eb AV |
1354 | to->nr_segs, to->bvec, |
1355 | to->iov_offset, iov_iter_count(to)); | |
d7e09d03 PT |
1356 | } |
1357 | ||
97d10d0a | 1358 | LASSERT(conn->ksnc_rx_scheduled); |
d7e09d03 PT |
1359 | |
1360 | spin_lock_bh(&sched->kss_lock); | |
1361 | ||
1362 | switch (conn->ksnc_rx_state) { | |
1363 | case SOCKNAL_RX_PARSE_WAIT: | |
1364 | list_add_tail(&conn->ksnc_rx_list, &sched->kss_rx_conns); | |
b31e64c4 JS |
1365 | wake_up(&sched->kss_waitq); |
1366 | LASSERT(conn->ksnc_rx_ready); | |
d7e09d03 PT |
1367 | break; |
1368 | ||
1369 | case SOCKNAL_RX_PARSE: | |
1370 | /* scheduler hasn't noticed I'm parsing yet */ | |
1371 | break; | |
1372 | } | |
1373 | ||
1374 | conn->ksnc_rx_state = SOCKNAL_RX_LNET_PAYLOAD; | |
1375 | ||
1376 | spin_unlock_bh(&sched->kss_lock); | |
1377 | ksocknal_conn_decref(conn); | |
1378 | return 0; | |
1379 | } | |
1380 | ||
1381 | static inline int | |
ff13fd40 | 1382 | ksocknal_sched_cansleep(struct ksock_sched *sched) |
d7e09d03 | 1383 | { |
97d10d0a | 1384 | int rc; |
d7e09d03 PT |
1385 | |
1386 | spin_lock_bh(&sched->kss_lock); | |
1387 | ||
b6ee3824 | 1388 | rc = !ksocknal_data.ksnd_shuttingdown && |
d7e09d03 | 1389 | list_empty(&sched->kss_rx_conns) && |
b6ee3824 | 1390 | list_empty(&sched->kss_tx_conns); |
d7e09d03 PT |
1391 | |
1392 | spin_unlock_bh(&sched->kss_lock); | |
1393 | return rc; | |
1394 | } | |
1395 | ||
/*
 * Main loop of a scheduler thread.
 *
 * arg carries the thread id; KSOCK_THREAD_CPT() and KSOCK_THREAD_SID()
 * select the ksock_sched this thread serves.  Until ksnd_shuttingdown is
 * set, each pass handles at most one connection taken from kss_rx_conns
 * and one taken from kss_tx_conns, dropping kss_lock around the actual
 * receive and transmit calls.  When a pass did nothing the thread sleeps
 * on kss_waitq; after SOCKNAL_RESCHED passes that did work it calls
 * cond_resched() instead.  Always returns 0.
 */
1396 | int ksocknal_scheduler(void *arg) | |
1397 | { | |
97d10d0a | 1398 | struct ksock_sched_info *info; |
ff13fd40 JS |
1399 | struct ksock_sched *sched; |
1400 | struct ksock_conn *conn; | |
1401 | struct ksock_tx *tx; | |
97d10d0a MS |
1402 | int rc; |
1403 | int nloops = 0; | |
1404 | long id = (long)arg; | |
d7e09d03 PT |
1405 | |
1406 | info = ksocknal_data.ksnd_sched_info[KSOCK_THREAD_CPT(id)]; | |
1407 | sched = &info->ksi_scheds[KSOCK_THREAD_SID(id)]; | |
1408 | ||
1409 | cfs_block_allsigs(); | |
1410 | ||
1411 | rc = cfs_cpt_bind(lnet_cpt_table(), info->ksi_cpt); | |
5fd88337 | 1412 | if (rc) { |
d7e09d03 PT |
1413 | CERROR("Can't set CPT affinity to %d: %d\n", |
1414 | info->ksi_cpt, rc); | |
1415 | } | |
1416 | ||
1417 | spin_lock_bh(&sched->kss_lock); | |
1418 | ||
1419 | while (!ksocknal_data.ksnd_shuttingdown) { | |
1420 | int did_something = 0; | |
1421 | ||
1422 | /* Ensure I progress everything semi-fairly */ | |
1423 | ||
b31e64c4 | 1424 | if (!list_empty(&sched->kss_rx_conns)) { |
d7e09d03 | 1425 | conn = list_entry(sched->kss_rx_conns.next, |
ff13fd40 | 1426 | struct ksock_conn, ksnc_rx_list); |
d7e09d03 PT |
1427 | list_del(&conn->ksnc_rx_list); |
1428 | ||
1429 | LASSERT(conn->ksnc_rx_scheduled); | |
1430 | LASSERT(conn->ksnc_rx_ready); | |
1431 | ||
4420cfd3 JS |
1432 | /* |
1433 | * clear rx_ready in case receive isn't complete. | |
d7e09d03 PT |
1434 | * Do it BEFORE we call process_recv, since |
1435 | * data_ready can set it any time after we release | |
4420cfd3 JS |
1436 | * kss_lock. |
1437 | */ | |
d7e09d03 PT |
1438 | conn->ksnc_rx_ready = 0; |
1439 | spin_unlock_bh(&sched->kss_lock); | |
1440 | ||
1441 | rc = ksocknal_process_receive(conn); | |
1442 | ||
1443 | spin_lock_bh(&sched->kss_lock); | |
1444 | ||
1445 | /* I'm the only one that can clear this flag */ | |
1446 | LASSERT(conn->ksnc_rx_scheduled); | |
1447 | ||
1448 | /* Did process_receive get everything it wanted? */ | |
5fd88337 | 1449 | if (!rc) |
d7e09d03 PT |
1450 | conn->ksnc_rx_ready = 1; |
1451 | ||
1452 | if (conn->ksnc_rx_state == SOCKNAL_RX_PARSE) { | |
4420cfd3 JS |
1453 | /* |
1454 | * Conn blocked waiting for ksocknal_recv() | |
d7e09d03 | 1455 | * I change its state (under lock) to signal |
4420cfd3 JS |
1456 | * it can be rescheduled |
1457 | */ | |
d7e09d03 PT |
1458 | conn->ksnc_rx_state = SOCKNAL_RX_PARSE_WAIT; |
1459 | } else if (conn->ksnc_rx_ready) { | |
1460 | /* reschedule for rx */ | |
c314c319 JS |
1461 | list_add_tail(&conn->ksnc_rx_list, |
1462 | &sched->kss_rx_conns); | |
d7e09d03 PT |
1463 | } else { |
1464 | conn->ksnc_rx_scheduled = 0; | |
1465 | /* drop my ref */ | |
1466 | ksocknal_conn_decref(conn); | |
1467 | } | |
1468 | ||
1469 | did_something = 1; | |
1470 | } | |
1471 | ||
b31e64c4 | 1472 | if (!list_empty(&sched->kss_tx_conns)) { |
97d10d0a | 1473 | LIST_HEAD(zlist); |
d7e09d03 PT |
1474 | |
1475 | if (!list_empty(&sched->kss_zombie_noop_txs)) { | |
/* Move every zombie noop tx onto the local zlist; freed below, unlocked. */
c314c319 | 1476 | list_add(&zlist, &sched->kss_zombie_noop_txs); |
d7e09d03 PT |
1477 | list_del_init(&sched->kss_zombie_noop_txs); |
1478 | } | |
1479 | ||
1480 | conn = list_entry(sched->kss_tx_conns.next, | |
ff13fd40 | 1481 | struct ksock_conn, ksnc_tx_list); |
b31e64c4 | 1482 | list_del(&conn->ksnc_tx_list); |
d7e09d03 PT |
1483 | |
1484 | LASSERT(conn->ksnc_tx_scheduled); | |
1485 | LASSERT(conn->ksnc_tx_ready); | |
1486 | LASSERT(!list_empty(&conn->ksnc_tx_queue)); | |
1487 | ||
1488 | tx = list_entry(conn->ksnc_tx_queue.next, | |
ff13fd40 | 1489 | struct ksock_tx, tx_list); |
d7e09d03 PT |
1490 | |
1491 | if (conn->ksnc_tx_carrier == tx) | |
1492 | ksocknal_next_tx_carrier(conn); | |
1493 | ||
1494 | /* dequeue now so empty list => more to send */ | |
1495 | list_del(&tx->tx_list); | |
1496 | ||
4420cfd3 JS |
1497 | /* |
1498 | * Clear tx_ready in case send isn't complete. Do | |
d7e09d03 PT |
1499 | * it BEFORE we call process_transmit, since |
1500 | * write_space can set it any time after we release | |
4420cfd3 JS |
1501 | * kss_lock. |
1502 | */ | |
d7e09d03 PT |
1503 | conn->ksnc_tx_ready = 0; |
1504 | spin_unlock_bh(&sched->kss_lock); | |
1505 | ||
1506 | if (!list_empty(&zlist)) { | |
4420cfd3 JS |
1507 | /* |
1508 | * free zombie noop txs, it's fast because | |
1509 | * noop txs are just put in freelist | |
1510 | */ | |
d7e09d03 PT |
1511 | ksocknal_txlist_done(NULL, &zlist, 0); |
1512 | } | |
1513 | ||
1514 | rc = ksocknal_process_transmit(conn, tx); | |
1515 | ||
1516 | if (rc == -ENOMEM || rc == -EAGAIN) { | |
1517 | /* Incomplete send: replace tx on HEAD of tx_queue */ | |
1518 | spin_lock_bh(&sched->kss_lock); | |
c314c319 | 1519 | list_add(&tx->tx_list, &conn->ksnc_tx_queue); |
d7e09d03 PT |
1520 | } else { |
1521 | /* Complete send; tx -ref */ | |
1522 | ksocknal_tx_decref(tx); | |
1523 | ||
1524 | spin_lock_bh(&sched->kss_lock); | |
1525 | /* assume space for more */ | |
1526 | conn->ksnc_tx_ready = 1; | |
1527 | } | |
1528 | ||
1529 | if (rc == -ENOMEM) { | |
4420cfd3 JS |
1530 | /* |
1531 | * Do nothing; after a short timeout, this | |
1532 | * conn will be reposted on kss_tx_conns. | |
1533 | */ | |
d7e09d03 | 1534 | } else if (conn->ksnc_tx_ready && |
97d10d0a | 1535 | !list_empty(&conn->ksnc_tx_queue)) { |
d7e09d03 | 1536 | /* reschedule for tx */ |
97d10d0a | 1537 | list_add_tail(&conn->ksnc_tx_list, |
c314c319 | 1538 | &sched->kss_tx_conns); |
d7e09d03 PT |
1539 | } else { |
1540 | conn->ksnc_tx_scheduled = 0; | |
1541 | /* drop my ref */ | |
1542 | ksocknal_conn_decref(conn); | |
1543 | } | |
1544 | ||
1545 | did_something = 1; | |
1546 | } | |
1547 | if (!did_something || /* nothing to do */ | |
1548 | ++nloops == SOCKNAL_RESCHED) { /* hogging CPU? */ | |
1549 | spin_unlock_bh(&sched->kss_lock); | |
1550 | ||
1551 | nloops = 0; | |
1552 | ||
1553 | if (!did_something) { /* wait for something to do */ | |
46ffc934 | 1554 | rc = wait_event_interruptible_exclusive( |
d7e09d03 | 1555 | sched->kss_waitq, |
46ffc934 | 1556 | !ksocknal_sched_cansleep(sched)); |
5fd88337 | 1557 | LASSERT(!rc); |
d7e09d03 PT |
1558 | } else { |
1559 | cond_resched(); | |
1560 | } | |
1561 | ||
1562 | spin_lock_bh(&sched->kss_lock); | |
1563 | } | |
1564 | } | |
1565 | ||
1566 | spin_unlock_bh(&sched->kss_lock); | |
1567 | ksocknal_thread_fini(); | |
1568 | return 0; | |
1569 | } | |
1570 | ||
1571 | /* | |
1572 | * Add connection to kss_rx_conns of scheduler | |
1573 | * and wakeup the scheduler. | |
1574 | */ | |
ff13fd40 | 1575 | void ksocknal_read_callback(struct ksock_conn *conn) |
d7e09d03 | 1576 | { |
ff13fd40 | 1577 | struct ksock_sched *sched; |
d7e09d03 PT |
1578 | |
1579 | sched = conn->ksnc_scheduler; | |
1580 | ||
1581 | spin_lock_bh(&sched->kss_lock); | |
1582 | ||
1583 | conn->ksnc_rx_ready = 1; | |
1584 | ||
1585 | if (!conn->ksnc_rx_scheduled) { /* not being progressed */ | |
c314c319 | 1586 | list_add_tail(&conn->ksnc_rx_list, &sched->kss_rx_conns); |
d7e09d03 PT |
1587 | conn->ksnc_rx_scheduled = 1; |
1588 | /* extra ref for scheduler */ | |
1589 | ksocknal_conn_addref(conn); | |
1590 | ||
b31e64c4 | 1591 | wake_up(&sched->kss_waitq); |
d7e09d03 PT |
1592 | } |
1593 | spin_unlock_bh(&sched->kss_lock); | |
d7e09d03 PT |
1594 | } |
1595 | ||
1596 | /* | |
1597 | * Add connection to kss_tx_conns of scheduler | |
1598 | * and wakeup the scheduler. | |
1599 | */ | |
ff13fd40 | 1600 | void ksocknal_write_callback(struct ksock_conn *conn) |
d7e09d03 | 1601 | { |
ff13fd40 | 1602 | struct ksock_sched *sched; |
d7e09d03 PT |
1603 | |
1604 | sched = conn->ksnc_scheduler; | |
1605 | ||
1606 | spin_lock_bh(&sched->kss_lock); | |
1607 | ||
1608 | conn->ksnc_tx_ready = 1; | |
1609 | ||
995c8b4a GD |
1610 | if (!conn->ksnc_tx_scheduled && /* not being progressed */ |
1611 | !list_empty(&conn->ksnc_tx_queue)) { /* packets to send */ | |
c314c319 | 1612 | list_add_tail(&conn->ksnc_tx_list, &sched->kss_tx_conns); |
d7e09d03 PT |
1613 | conn->ksnc_tx_scheduled = 1; |
1614 | /* extra ref for scheduler */ | |
1615 | ksocknal_conn_addref(conn); | |
1616 | ||
b31e64c4 | 1617 | wake_up(&sched->kss_waitq); |
d7e09d03 PT |
1618 | } |
1619 | ||
1620 | spin_unlock_bh(&sched->kss_lock); | |
d7e09d03 PT |
1621 | } |
1622 | ||
ff13fd40 | 1623 | static struct ksock_proto * |
b31e64c4 | 1624 | ksocknal_parse_proto_version(ksock_hello_msg_t *hello) |
d7e09d03 | 1625 | { |
97d10d0a | 1626 | __u32 version = 0; |
d7e09d03 PT |
1627 | |
1628 | if (hello->kshm_magic == LNET_PROTO_MAGIC) | |
1629 | version = hello->kshm_version; | |
1630 | else if (hello->kshm_magic == __swab32(LNET_PROTO_MAGIC)) | |
1631 | version = __swab32(hello->kshm_version); | |
1632 | ||
5fd88337 | 1633 | if (version) { |
d7e09d03 PT |
1634 | #if SOCKNAL_VERSION_DEBUG |
1635 | if (*ksocknal_tunables.ksnd_protocol == 1) | |
1636 | return NULL; | |
1637 | ||
1638 | if (*ksocknal_tunables.ksnd_protocol == 2 && | |
1639 | version == KSOCK_PROTO_V3) | |
1640 | return NULL; | |
1641 | #endif | |
1642 | if (version == KSOCK_PROTO_V2) | |
1643 | return &ksocknal_protocol_v2x; | |
1644 | ||
1645 | if (version == KSOCK_PROTO_V3) | |
1646 | return &ksocknal_protocol_v3x; | |
1647 | ||
1648 | return NULL; | |
1649 | } | |
1650 | ||
1651 | if (hello->kshm_magic == le32_to_cpu(LNET_PROTO_TCP_MAGIC)) { | |
1652 | lnet_magicversion_t *hmv = (lnet_magicversion_t *)hello; | |
1653 | ||
b31e64c4 JS |
1654 | CLASSERT(sizeof(lnet_magicversion_t) == |
1655 | offsetof(ksock_hello_msg_t, kshm_src_nid)); | |
d7e09d03 | 1656 | |
b31e64c4 JS |
1657 | if (hmv->version_major == cpu_to_le16(KSOCK_PROTO_V1_MAJOR) && |
1658 | hmv->version_minor == cpu_to_le16(KSOCK_PROTO_V1_MINOR)) | |
d7e09d03 PT |
1659 | return &ksocknal_protocol_v1x; |
1660 | } | |
1661 | ||
1662 | return NULL; | |
1663 | } | |
1664 | ||
1665 | int | |
ff13fd40 | 1666 | ksocknal_send_hello(lnet_ni_t *ni, struct ksock_conn *conn, |
b31e64c4 | 1667 | lnet_nid_t peer_nid, ksock_hello_msg_t *hello) |
d7e09d03 PT |
1668 | { |
1669 | /* CAVEAT EMPTOR: this byte flips 'ipaddrs' */ | |
ff13fd40 | 1670 | struct ksock_net *net = (struct ksock_net *)ni->ni_data; |
d7e09d03 | 1671 | |
97d10d0a | 1672 | LASSERT(hello->kshm_nips <= LNET_MAX_INTERFACES); |
d7e09d03 PT |
1673 | |
1674 | /* rely on caller to hold a ref on socket so it wouldn't disappear */ | |
06ace26e | 1675 | LASSERT(conn->ksnc_proto); |
d7e09d03 | 1676 | |
97d10d0a MS |
1677 | hello->kshm_src_nid = ni->ni_nid; |
1678 | hello->kshm_dst_nid = peer_nid; | |
1679 | hello->kshm_src_pid = the_lnet.ln_pid; | |
d7e09d03 PT |
1680 | |
1681 | hello->kshm_src_incarnation = net->ksnn_incarnation; | |
97d10d0a | 1682 | hello->kshm_ctype = conn->ksnc_type; |
d7e09d03 PT |
1683 | |
1684 | return conn->ksnc_proto->pro_send_hello(conn, hello); | |
1685 | } | |
1686 | ||
f9cd474f | 1687 | static int |
d7e09d03 PT |
1688 | ksocknal_invert_type(int type) |
1689 | { | |
9d0b2b7a | 1690 | switch (type) { |
d7e09d03 PT |
1691 | case SOCKLND_CONN_ANY: |
1692 | case SOCKLND_CONN_CONTROL: | |
71397095 | 1693 | return type; |
d7e09d03 PT |
1694 | case SOCKLND_CONN_BULK_IN: |
1695 | return SOCKLND_CONN_BULK_OUT; | |
1696 | case SOCKLND_CONN_BULK_OUT: | |
1697 | return SOCKLND_CONN_BULK_IN; | |
1698 | default: | |
71397095 | 1699 | return SOCKLND_CONN_NONE; |
d7e09d03 PT |
1700 | } |
1701 | } | |
1702 | ||
1703 | int | |
ff13fd40 | 1704 | ksocknal_recv_hello(lnet_ni_t *ni, struct ksock_conn *conn, |
b31e64c4 JS |
1705 | ksock_hello_msg_t *hello, lnet_process_id_t *peerid, |
1706 | __u64 *incarnation) | |
d7e09d03 PT |
1707 | { |
1708 | /* Return < 0 fatal error | |
1709 | * 0 success | |
1710 | * EALREADY lost connection race | |
1711 | * EPROTO protocol version mismatch | |
1712 | */ | |
97d10d0a | 1713 | struct socket *sock = conn->ksnc_sock; |
06ace26e | 1714 | int active = !!conn->ksnc_proto; |
97d10d0a MS |
1715 | int timeout; |
1716 | int proto_match; | |
1717 | int rc; | |
ff13fd40 | 1718 | struct ksock_proto *proto; |
97d10d0a | 1719 | lnet_process_id_t recv_id; |
d7e09d03 PT |
1720 | |
1721 | /* socket type set on active connections - not set on passive */ | |
97d10d0a | 1722 | LASSERT(!active == !(conn->ksnc_type != SOCKLND_CONN_NONE)); |
d7e09d03 PT |
1723 | |
1724 | timeout = active ? *ksocknal_tunables.ksnd_timeout : | |
1725 | lnet_acceptor_timeout(); | |
1726 | ||
b31e64c4 | 1727 | rc = lnet_sock_read(sock, &hello->kshm_magic, sizeof(hello->kshm_magic), timeout); |
5fd88337 | 1728 | if (rc) { |
5e8f6920 | 1729 | CERROR("Error %d reading HELLO from %pI4h\n", |
c314c319 | 1730 | rc, &conn->ksnc_ipaddr); |
b31e64c4 | 1731 | LASSERT(rc < 0); |
d7e09d03 PT |
1732 | return rc; |
1733 | } | |
1734 | ||
1735 | if (hello->kshm_magic != LNET_PROTO_MAGIC && | |
1736 | hello->kshm_magic != __swab32(LNET_PROTO_MAGIC) && | |
b31e64c4 | 1737 | hello->kshm_magic != le32_to_cpu(LNET_PROTO_TCP_MAGIC)) { |
d7e09d03 | 1738 | /* Unexpected magic! */ |
2d00bd17 | 1739 | CERROR("Bad magic(1) %#08x (%#08x expected) from %pI4h\n", |
b31e64c4 | 1740 | __cpu_to_le32(hello->kshm_magic), |
2d00bd17 JP |
1741 | LNET_PROTO_TCP_MAGIC, |
1742 | &conn->ksnc_ipaddr); | |
d7e09d03 PT |
1743 | return -EPROTO; |
1744 | } | |
1745 | ||
1ad6a73e JS |
1746 | rc = lnet_sock_read(sock, &hello->kshm_version, |
1747 | sizeof(hello->kshm_version), timeout); | |
5fd88337 | 1748 | if (rc) { |
5e8f6920 | 1749 | CERROR("Error %d reading HELLO from %pI4h\n", |
c314c319 | 1750 | rc, &conn->ksnc_ipaddr); |
97d10d0a | 1751 | LASSERT(rc < 0); |
d7e09d03 PT |
1752 | return rc; |
1753 | } | |
1754 | ||
1755 | proto = ksocknal_parse_proto_version(hello); | |
06ace26e | 1756 | if (!proto) { |
d7e09d03 PT |
1757 | if (!active) { |
1758 | /* unknown protocol from peer, tell peer my protocol */ | |
1759 | conn->ksnc_proto = &ksocknal_protocol_v3x; | |
1760 | #if SOCKNAL_VERSION_DEBUG | |
1761 | if (*ksocknal_tunables.ksnd_protocol == 2) | |
1762 | conn->ksnc_proto = &ksocknal_protocol_v2x; | |
1763 | else if (*ksocknal_tunables.ksnd_protocol == 1) | |
1764 | conn->ksnc_proto = &ksocknal_protocol_v1x; | |
1765 | #endif | |
1766 | hello->kshm_nips = 0; | |
1767 | ksocknal_send_hello(ni, conn, ni->ni_nid, hello); | |
1768 | } | |
1769 | ||
2d00bd17 JP |
1770 | CERROR("Unknown protocol version (%d.x expected) from %pI4h\n", |
1771 | conn->ksnc_proto->pro_version, | |
1772 | &conn->ksnc_ipaddr); | |
d7e09d03 PT |
1773 | |
1774 | return -EPROTO; | |
1775 | } | |
1776 | ||
1777 | proto_match = (conn->ksnc_proto == proto); | |
1778 | conn->ksnc_proto = proto; | |
1779 | ||
1780 | /* receive the rest of hello message anyway */ | |
1781 | rc = conn->ksnc_proto->pro_recv_hello(conn, hello, timeout); | |
5fd88337 | 1782 | if (rc) { |
5e8f6920 PT |
1783 | CERROR("Error %d reading or checking hello from from %pI4h\n", |
1784 | rc, &conn->ksnc_ipaddr); | |
97d10d0a | 1785 | LASSERT(rc < 0); |
d7e09d03 PT |
1786 | return rc; |
1787 | } | |
1788 | ||
1789 | *incarnation = hello->kshm_src_incarnation; | |
1790 | ||
1791 | if (hello->kshm_src_nid == LNET_NID_ANY) { | |
2d00bd17 JP |
1792 | CERROR("Expecting a HELLO hdr with a NID, but got LNET_NID_ANY from %pI4h\n", |
1793 | &conn->ksnc_ipaddr); | |
d7e09d03 PT |
1794 | return -EPROTO; |
1795 | } | |
1796 | ||
1797 | if (!active && | |
1798 | conn->ksnc_port > LNET_ACCEPTOR_MAX_RESERVED_PORT) { | |
1799 | /* Userspace NAL assigns peer process ID from socket */ | |
1800 | recv_id.pid = conn->ksnc_port | LNET_PID_USERFLAG; | |
1801 | recv_id.nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), conn->ksnc_ipaddr); | |
1802 | } else { | |
1803 | recv_id.nid = hello->kshm_src_nid; | |
1804 | recv_id.pid = hello->kshm_src_pid; | |
1805 | } | |
1806 | ||
1807 | if (!active) { | |
1808 | *peerid = recv_id; | |
1809 | ||
1810 | /* peer determines type */ | |
1811 | conn->ksnc_type = ksocknal_invert_type(hello->kshm_ctype); | |
1812 | if (conn->ksnc_type == SOCKLND_CONN_NONE) { | |
5e8f6920 | 1813 | CERROR("Unexpected type %d from %s ip %pI4h\n", |
c314c319 JS |
1814 | hello->kshm_ctype, libcfs_id2str(*peerid), |
1815 | &conn->ksnc_ipaddr); | |
d7e09d03 PT |
1816 | return -EPROTO; |
1817 | } | |
1818 | ||
1819 | return 0; | |
1820 | } | |
1821 | ||
1822 | if (peerid->pid != recv_id.pid || | |
1823 | peerid->nid != recv_id.nid) { | |
2d00bd17 | 1824 | LCONSOLE_ERROR_MSG(0x130, "Connected successfully to %s on host %pI4h, but they claimed they were %s; please check your Lustre configuration.\n", |
d7e09d03 | 1825 | libcfs_id2str(*peerid), |
5e8f6920 | 1826 | &conn->ksnc_ipaddr, |
d7e09d03 PT |
1827 | libcfs_id2str(recv_id)); |
1828 | return -EPROTO; | |
1829 | } | |
1830 | ||
1831 | if (hello->kshm_ctype == SOCKLND_CONN_NONE) { | |
1832 | /* Possible protocol mismatch or I lost the connection race */ | |
1833 | return proto_match ? EALREADY : EPROTO; | |
1834 | } | |
1835 | ||
1836 | if (ksocknal_invert_type(hello->kshm_ctype) != conn->ksnc_type) { | |
5e8f6920 | 1837 | CERROR("Mismatched types: me %d, %s ip %pI4h %d\n", |
c314c319 JS |
1838 | conn->ksnc_type, libcfs_id2str(*peerid), |
1839 | &conn->ksnc_ipaddr, hello->kshm_ctype); | |
d7e09d03 PT |
1840 | return -EPROTO; |
1841 | } | |
1842 | ||
1843 | return 0; | |
1844 | } | |
1845 | ||
f9cd474f | 1846 | static int |
ff13fd40 | 1847 | ksocknal_connect(struct ksock_route *route) |
d7e09d03 | 1848 | { |
97d10d0a | 1849 | LIST_HEAD(zombies); |
ff13fd40 | 1850 | struct ksock_peer *peer = route->ksnr_peer; |
97d10d0a MS |
1851 | int type; |
1852 | int wanted; | |
1853 | struct socket *sock; | |
1854 | unsigned long deadline; | |
1855 | int retry_later = 0; | |
1856 | int rc = 0; | |
d7e09d03 PT |
1857 | |
1858 | deadline = cfs_time_add(cfs_time_current(), | |
1859 | cfs_time_seconds(*ksocknal_tunables.ksnd_timeout)); | |
1860 | ||
1861 | write_lock_bh(&ksocknal_data.ksnd_global_lock); | |
1862 | ||
97d10d0a MS |
1863 | LASSERT(route->ksnr_scheduled); |
1864 | LASSERT(!route->ksnr_connecting); | |
d7e09d03 PT |
1865 | |
1866 | route->ksnr_connecting = 1; | |
1867 | ||
1868 | for (;;) { | |
1869 | wanted = ksocknal_route_mask() & ~route->ksnr_connected; | |
1870 | ||
4420cfd3 JS |
1871 | /* |
1872 | * stop connecting if peer/route got closed under me, or | |
1873 | * route got connected while queued | |
1874 | */ | |
d7e09d03 | 1875 | if (peer->ksnp_closing || route->ksnr_deleted || |
5fd88337 | 1876 | !wanted) { |
d7e09d03 PT |
1877 | retry_later = 0; |
1878 | break; | |
1879 | } | |
1880 | ||
1881 | /* reschedule if peer is connecting to me */ | |
1882 | if (peer->ksnp_accepting > 0) { | |
1883 | CDEBUG(D_NET, | |
1884 | "peer %s(%d) already connecting to me, retry later.\n", | |
1885 | libcfs_nid2str(peer->ksnp_id.nid), peer->ksnp_accepting); | |
1886 | retry_later = 1; | |
1887 | } | |
1888 | ||
1889 | if (retry_later) /* needs reschedule */ | |
1890 | break; | |
1891 | ||
5fd88337 | 1892 | if (wanted & (1 << SOCKLND_CONN_ANY)) { |
d7e09d03 | 1893 | type = SOCKLND_CONN_ANY; |
5fd88337 | 1894 | } else if (wanted & (1 << SOCKLND_CONN_CONTROL)) { |
d7e09d03 | 1895 | type = SOCKLND_CONN_CONTROL; |
5fd88337 | 1896 | } else if (wanted & (1 << SOCKLND_CONN_BULK_IN)) { |
d7e09d03 PT |
1897 | type = SOCKLND_CONN_BULK_IN; |
1898 | } else { | |
5fd88337 | 1899 | LASSERT(wanted & (1 << SOCKLND_CONN_BULK_OUT)); |
d7e09d03 PT |
1900 | type = SOCKLND_CONN_BULK_OUT; |
1901 | } | |
1902 | ||
1903 | write_unlock_bh(&ksocknal_data.ksnd_global_lock); | |
1904 | ||
1905 | if (cfs_time_aftereq(cfs_time_current(), deadline)) { | |
1906 | rc = -ETIMEDOUT; | |
1907 | lnet_connect_console_error(rc, peer->ksnp_id.nid, | |
1908 | route->ksnr_ipaddr, | |
1909 | route->ksnr_port); | |
1910 | goto failed; | |
1911 | } | |
1912 | ||
1913 | rc = lnet_connect(&sock, peer->ksnp_id.nid, | |
1914 | route->ksnr_myipaddr, | |
1915 | route->ksnr_ipaddr, route->ksnr_port); | |
5fd88337 | 1916 | if (rc) |
d7e09d03 PT |
1917 | goto failed; |
1918 | ||
1919 | rc = ksocknal_create_conn(peer->ksnp_ni, route, sock, type); | |
1920 | if (rc < 0) { | |
1921 | lnet_connect_console_error(rc, peer->ksnp_id.nid, | |
1922 | route->ksnr_ipaddr, | |
1923 | route->ksnr_port); | |
1924 | goto failed; | |
1925 | } | |
1926 | ||
4420cfd3 JS |
1927 | /* |
1928 | * A +ve RC means I have to retry because I lost the connection | |
1929 | * race or I have to renegotiate protocol version | |
1930 | */ | |
5fd88337 | 1931 | retry_later = (rc); |
d7e09d03 PT |
1932 | if (retry_later) |
1933 | CDEBUG(D_NET, "peer %s: conn race, retry later.\n", | |
1934 | libcfs_nid2str(peer->ksnp_id.nid)); | |
1935 | ||
1936 | write_lock_bh(&ksocknal_data.ksnd_global_lock); | |
1937 | } | |
1938 | ||
1939 | route->ksnr_scheduled = 0; | |
1940 | route->ksnr_connecting = 0; | |
1941 | ||
1942 | if (retry_later) { | |
4420cfd3 JS |
1943 | /* |
1944 | * re-queue for attention; this frees me up to handle | |
1945 | * the peer's incoming connection request | |
1946 | */ | |
d7e09d03 | 1947 | if (rc == EALREADY || |
5fd88337 | 1948 | (!rc && peer->ksnp_accepting > 0)) { |
4420cfd3 JS |
1949 | /* |
1950 | * We want to introduce a delay before next | |
d7e09d03 PT |
1951 | * attempt to connect if we lost conn race, |
1952 | * but the race is resolved quickly usually, | |
4420cfd3 JS |
1953 | * so min_reconnectms should be good heuristic |
1954 | */ | |
d7e09d03 | 1955 | route->ksnr_retry_interval = |
51078e25 | 1956 | cfs_time_seconds(*ksocknal_tunables.ksnd_min_reconnectms) / 1000; |
d7e09d03 PT |
1957 | route->ksnr_timeout = cfs_time_add(cfs_time_current(), |
1958 | route->ksnr_retry_interval); | |
1959 | } | |
1960 | ||
1961 | ksocknal_launch_connection_locked(route); | |
1962 | } | |
1963 | ||
1964 | write_unlock_bh(&ksocknal_data.ksnd_global_lock); | |
1965 | return retry_later; | |
1966 | ||
1967 | failed: | |
1968 | write_lock_bh(&ksocknal_data.ksnd_global_lock); | |
1969 | ||
1970 | route->ksnr_scheduled = 0; | |
1971 | route->ksnr_connecting = 0; | |
1972 | ||
1973 | /* This is a retry rather than a new connection */ | |
1974 | route->ksnr_retry_interval *= 2; | |
1975 | route->ksnr_retry_interval = | |
0c575417 | 1976 | max(route->ksnr_retry_interval, |
51078e25 | 1977 | cfs_time_seconds(*ksocknal_tunables.ksnd_min_reconnectms) / 1000); |
d7e09d03 | 1978 | route->ksnr_retry_interval = |
0c575417 | 1979 | min(route->ksnr_retry_interval, |
51078e25 | 1980 | cfs_time_seconds(*ksocknal_tunables.ksnd_max_reconnectms) / 1000); |
d7e09d03 | 1981 | |
5fd88337 | 1982 | LASSERT(route->ksnr_retry_interval); |
d7e09d03 PT |
1983 | route->ksnr_timeout = cfs_time_add(cfs_time_current(), |
1984 | route->ksnr_retry_interval); | |
1985 | ||
1986 | if (!list_empty(&peer->ksnp_tx_queue) && | |
5fd88337 | 1987 | !peer->ksnp_accepting && |
06ace26e | 1988 | !ksocknal_find_connecting_route_locked(peer)) { |
ff13fd40 | 1989 | struct ksock_conn *conn; |
d7e09d03 | 1990 | |
4420cfd3 JS |
1991 | /* |
1992 | * ksnp_tx_queue is queued on a conn on successful | |
1993 | * connection for V1.x and V2.x | |
1994 | */ | |
b31e64c4 | 1995 | if (!list_empty(&peer->ksnp_conns)) { |
d7e09d03 | 1996 | conn = list_entry(peer->ksnp_conns.next, |
ff13fd40 | 1997 | struct ksock_conn, ksnc_list); |
b31e64c4 | 1998 | LASSERT(conn->ksnc_proto == &ksocknal_protocol_v3x); |
d7e09d03 PT |
1999 | } |
2000 | ||
4420cfd3 JS |
2001 | /* |
2002 | * take all the blocked packets while I've got the lock and | |
2003 | * complete below... | |
2004 | */ | |
d7e09d03 PT |
2005 | list_splice_init(&peer->ksnp_tx_queue, &zombies); |
2006 | } | |
2007 | ||
d7e09d03 PT |
2008 | write_unlock_bh(&ksocknal_data.ksnd_global_lock); |
2009 | ||
2010 | ksocknal_peer_failed(peer); | |
2011 | ksocknal_txlist_done(peer->ksnp_ni, &zombies, 1); | |
2012 | return 0; | |
2013 | } | |
2014 | ||
2015 | /* | |
2016 | * check whether we need to create more connds. | |
2017 | * It will try to create new thread if it's necessary, @timeout can | |
2018 | * be updated if failed to create, so caller wouldn't keep try while | |
2019 | * running out of resource. | |
2020 | */ | |
2021 | static int | |
74ad578f | 2022 | ksocknal_connd_check_start(time64_t sec, long *timeout) |
d7e09d03 PT |
2023 | { |
2024 | char name[16]; | |
2025 | int rc; | |
2026 | int total = ksocknal_data.ksnd_connd_starting + | |
2027 | ksocknal_data.ksnd_connd_running; | |
2028 | ||
2029 | if (unlikely(ksocknal_data.ksnd_init < SOCKNAL_INIT_ALL)) { | |
2030 | /* still in initializing */ | |
2031 | return 0; | |
2032 | } | |
2033 | ||
2034 | if (total >= *ksocknal_tunables.ksnd_nconnds_max || | |
2035 | total > ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV) { | |
4420cfd3 JS |
2036 | /* |
2037 | * can't create more connd, or still have enough | |
2038 | * threads to handle more connecting | |
2039 | */ | |
d7e09d03 PT |
2040 | return 0; |
2041 | } | |
2042 | ||
2043 | if (list_empty(&ksocknal_data.ksnd_connd_routes)) { | |
2044 | /* no pending connecting request */ | |
2045 | return 0; | |
2046 | } | |
2047 | ||
2048 | if (sec - ksocknal_data.ksnd_connd_failed_stamp <= 1) { | |
2049 | /* may run out of resource, retry later */ | |
2050 | *timeout = cfs_time_seconds(1); | |
2051 | return 0; | |
2052 | } | |
2053 | ||
2054 | if (ksocknal_data.ksnd_connd_starting > 0) { | |
2055 | /* serialize starting to avoid flood */ | |
2056 | return 0; | |
2057 | } | |
2058 | ||
2059 | ksocknal_data.ksnd_connd_starting_stamp = sec; | |
2060 | ksocknal_data.ksnd_connd_starting++; | |
2061 | spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); | |
2062 | ||
2063 | /* NB: total is the next id */ | |
2064 | snprintf(name, sizeof(name), "socknal_cd%02d", total); | |
2065 | rc = ksocknal_thread_start(ksocknal_connd, NULL, name); | |
2066 | ||
2067 | spin_lock_bh(&ksocknal_data.ksnd_connd_lock); | |
5fd88337 | 2068 | if (!rc) |
d7e09d03 PT |
2069 | return 1; |
2070 | ||
2071 | /* we tried ... */ | |
2072 | LASSERT(ksocknal_data.ksnd_connd_starting > 0); | |
2073 | ksocknal_data.ksnd_connd_starting--; | |
74ad578f | 2074 | ksocknal_data.ksnd_connd_failed_stamp = ktime_get_real_seconds(); |
d7e09d03 PT |
2075 | |
2076 | return 1; | |
2077 | } | |
2078 | ||
2079 | /* | |
2080 | * check whether current thread can exit, it will return 1 if there are too | |
2081 | * many threads and no creating in past 120 seconds. | |
2082 | * Also, this function may update @timeout to make caller come back | |
2083 | * again to recheck these conditions. | |
2084 | */ | |
2085 | static int | |
74ad578f | 2086 | ksocknal_connd_check_stop(time64_t sec, long *timeout) |
d7e09d03 PT |
2087 | { |
2088 | int val; | |
2089 | ||
2090 | if (unlikely(ksocknal_data.ksnd_init < SOCKNAL_INIT_ALL)) { | |
2091 | /* still in initializing */ | |
2092 | return 0; | |
2093 | } | |
2094 | ||
2095 | if (ksocknal_data.ksnd_connd_starting > 0) { | |
2096 | /* in progress of starting new thread */ | |
2097 | return 0; | |
2098 | } | |
2099 | ||
2100 | if (ksocknal_data.ksnd_connd_running <= | |
2101 | *ksocknal_tunables.ksnd_nconnds) { /* can't shrink */ | |
2102 | return 0; | |
2103 | } | |
2104 | ||
2105 | /* created thread in past 120 seconds? */ | |
2106 | val = (int)(ksocknal_data.ksnd_connd_starting_stamp + | |
2107 | SOCKNAL_CONND_TIMEOUT - sec); | |
2108 | ||
2109 | *timeout = (val > 0) ? cfs_time_seconds(val) : | |
2110 | cfs_time_seconds(SOCKNAL_CONND_TIMEOUT); | |
2111 | if (val > 0) | |
2112 | return 0; | |
2113 | ||
2114 | /* no creating in past 120 seconds */ | |
2115 | ||
2116 | return ksocknal_data.ksnd_connd_running > | |
2117 | ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV; | |
2118 | } | |
2119 | ||
4420cfd3 JS |
2120 | /* |
2121 | * Go through connd_routes queue looking for a route that we can process | |
2122 | * right now, @timeout_p can be updated if we need to come back later | |
2123 | */ | |
ff13fd40 | 2124 | static struct ksock_route * |
d7e09d03 PT |
2125 | ksocknal_connd_get_route_locked(signed long *timeout_p) |
2126 | { | |
ff13fd40 | 2127 | struct ksock_route *route; |
97d10d0a | 2128 | unsigned long now; |
d7e09d03 PT |
2129 | |
2130 | now = cfs_time_current(); | |
2131 | ||
2132 | /* connd_routes can contain both pending and ordinary routes */ | |
c314c319 JS |
2133 | list_for_each_entry(route, &ksocknal_data.ksnd_connd_routes, |
2134 | ksnr_connd_list) { | |
5fd88337 | 2135 | if (!route->ksnr_retry_interval || |
d7e09d03 PT |
2136 | cfs_time_aftereq(now, route->ksnr_timeout)) |
2137 | return route; | |
2138 | ||
2139 | if (*timeout_p == MAX_SCHEDULE_TIMEOUT || | |
2140 | (int)*timeout_p > (int)(route->ksnr_timeout - now)) | |
2141 | *timeout_p = (int)(route->ksnr_timeout - now); | |
2142 | } | |
2143 | ||
2144 | return NULL; | |
2145 | } | |
2146 | ||
2147 | int | |
b31e64c4 | 2148 | ksocknal_connd(void *arg) |
d7e09d03 | 2149 | { |
97d10d0a | 2150 | spinlock_t *connd_lock = &ksocknal_data.ksnd_connd_lock; |
ff13fd40 | 2151 | struct ksock_connreq *cr; |
97d10d0a MS |
2152 | wait_queue_t wait; |
2153 | int nloops = 0; | |
2154 | int cons_retry = 0; | |
d7e09d03 | 2155 | |
97d10d0a | 2156 | cfs_block_allsigs(); |
d7e09d03 | 2157 | |
9e795d35 | 2158 | init_waitqueue_entry(&wait, current); |
d7e09d03 PT |
2159 | |
2160 | spin_lock_bh(connd_lock); | |
2161 | ||
2162 | LASSERT(ksocknal_data.ksnd_connd_starting > 0); | |
2163 | ksocknal_data.ksnd_connd_starting--; | |
2164 | ksocknal_data.ksnd_connd_running++; | |
2165 | ||
2166 | while (!ksocknal_data.ksnd_shuttingdown) { | |
ff13fd40 | 2167 | struct ksock_route *route = NULL; |
74ad578f | 2168 | time64_t sec = ktime_get_real_seconds(); |
d7e09d03 | 2169 | long timeout = MAX_SCHEDULE_TIMEOUT; |
97d10d0a | 2170 | int dropped_lock = 0; |
d7e09d03 PT |
2171 | |
2172 | if (ksocknal_connd_check_stop(sec, &timeout)) { | |
2173 | /* wakeup another one to check stop */ | |
2174 | wake_up(&ksocknal_data.ksnd_connd_waitq); | |
2175 | break; | |
2176 | } | |
2177 | ||
2178 | if (ksocknal_connd_check_start(sec, &timeout)) { | |
2179 | /* created new thread */ | |
2180 | dropped_lock = 1; | |
2181 | } | |
2182 | ||
2183 | if (!list_empty(&ksocknal_data.ksnd_connd_connreqs)) { | |
2184 | /* Connection accepted by the listener */ | |
ff13fd40 JS |
2185 | cr = list_entry(ksocknal_data.ksnd_connd_connreqs.next, |
2186 | struct ksock_connreq, ksncr_list); | |
d7e09d03 PT |
2187 | |
2188 | list_del(&cr->ksncr_list); | |
2189 | spin_unlock_bh(connd_lock); | |
2190 | dropped_lock = 1; | |
2191 | ||
2192 | ksocknal_create_conn(cr->ksncr_ni, NULL, | |
2193 | cr->ksncr_sock, SOCKLND_CONN_NONE); | |
2194 | lnet_ni_decref(cr->ksncr_ni); | |
2195 | LIBCFS_FREE(cr, sizeof(*cr)); | |
2196 | ||
2197 | spin_lock_bh(connd_lock); | |
2198 | } | |
2199 | ||
4420cfd3 JS |
2200 | /* |
2201 | * Only handle an outgoing connection request if there | |
d7e09d03 | 2202 | * is a thread left to handle incoming connections and |
4420cfd3 JS |
2203 | * create new connd |
2204 | */ | |
d7e09d03 PT |
2205 | if (ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV < |
2206 | ksocknal_data.ksnd_connd_running) { | |
2207 | route = ksocknal_connd_get_route_locked(&timeout); | |
2208 | } | |
06ace26e | 2209 | if (route) { |
b31e64c4 | 2210 | list_del(&route->ksnr_connd_list); |
d7e09d03 PT |
2211 | ksocknal_data.ksnd_connd_connecting++; |
2212 | spin_unlock_bh(connd_lock); | |
2213 | dropped_lock = 1; | |
2214 | ||
2215 | if (ksocknal_connect(route)) { | |
2216 | /* consecutive retry */ | |
2217 | if (cons_retry++ > SOCKNAL_INSANITY_RECONN) { | |
2d00bd17 | 2218 | CWARN("massive consecutive re-connecting to %pI4h\n", |
5e8f6920 | 2219 | &route->ksnr_ipaddr); |
d7e09d03 PT |
2220 | cons_retry = 0; |
2221 | } | |
2222 | } else { | |
2223 | cons_retry = 0; | |
2224 | } | |
2225 | ||
2226 | ksocknal_route_decref(route); | |
2227 | ||
2228 | spin_lock_bh(connd_lock); | |
2229 | ksocknal_data.ksnd_connd_connecting--; | |
2230 | } | |
2231 | ||
2232 | if (dropped_lock) { | |
2233 | if (++nloops < SOCKNAL_RESCHED) | |
2234 | continue; | |
2235 | spin_unlock_bh(connd_lock); | |
2236 | nloops = 0; | |
2237 | cond_resched(); | |
2238 | spin_lock_bh(connd_lock); | |
2239 | continue; | |
2240 | } | |
2241 | ||
2242 | /* Nothing to do for 'timeout' */ | |
2243 | set_current_state(TASK_INTERRUPTIBLE); | |
2244 | add_wait_queue_exclusive(&ksocknal_data.ksnd_connd_waitq, &wait); | |
2245 | spin_unlock_bh(connd_lock); | |
2246 | ||
2247 | nloops = 0; | |
b7efb98d | 2248 | schedule_timeout(timeout); |
d7e09d03 | 2249 | |
d7e09d03 PT |
2250 | remove_wait_queue(&ksocknal_data.ksnd_connd_waitq, &wait); |
2251 | spin_lock_bh(connd_lock); | |
2252 | } | |
2253 | ksocknal_data.ksnd_connd_running--; | |
2254 | spin_unlock_bh(connd_lock); | |
2255 | ||
2256 | ksocknal_thread_fini(); | |
2257 | return 0; | |
2258 | } | |
2259 | ||
ff13fd40 JS |
2260 | static struct ksock_conn * |
2261 | ksocknal_find_timed_out_conn(struct ksock_peer *peer) | |
d7e09d03 PT |
2262 | { |
2263 | /* We're called with a shared lock on ksnd_global_lock */ | |
ff13fd40 | 2264 | struct ksock_conn *conn; |
97d10d0a | 2265 | struct list_head *ctmp; |
d7e09d03 | 2266 | |
b31e64c4 | 2267 | list_for_each(ctmp, &peer->ksnp_conns) { |
97d10d0a | 2268 | int error; |
50ffcb7e | 2269 | |
ff13fd40 | 2270 | conn = list_entry(ctmp, struct ksock_conn, ksnc_list); |
d7e09d03 PT |
2271 | |
2272 | /* Don't need the {get,put}connsock dance to deref ksnc_sock */ | |
97d10d0a | 2273 | LASSERT(!conn->ksnc_closing); |
d7e09d03 | 2274 | |
4420cfd3 JS |
2275 | /* |
2276 | * SOCK_ERROR will reset error code of socket in | |
2277 | * some platform (like Darwin8.x) | |
2278 | */ | |
fb4a1539 | 2279 | error = conn->ksnc_sock->sk->sk_err; |
5fd88337 | 2280 | if (error) { |
d7e09d03 PT |
2281 | ksocknal_conn_addref(conn); |
2282 | ||
2283 | switch (error) { | |
2284 | case ECONNRESET: | |
2d00bd17 | 2285 | CNETERR("A connection with %s (%pI4h:%d) was reset; it may have rebooted.\n", |
d7e09d03 | 2286 | libcfs_id2str(peer->ksnp_id), |
5e8f6920 | 2287 | &conn->ksnc_ipaddr, |
d7e09d03 PT |
2288 | conn->ksnc_port); |
2289 | break; | |
2290 | case ETIMEDOUT: | |
2d00bd17 | 2291 | CNETERR("A connection with %s (%pI4h:%d) timed out; the network or node may be down.\n", |
d7e09d03 | 2292 | libcfs_id2str(peer->ksnp_id), |
5e8f6920 | 2293 | &conn->ksnc_ipaddr, |
d7e09d03 PT |
2294 | conn->ksnc_port); |
2295 | break; | |
2296 | default: | |
2d00bd17 JP |
2297 | CNETERR("An unexpected network error %d occurred with %s (%pI4h:%d\n", |
2298 | error, | |
d7e09d03 | 2299 | libcfs_id2str(peer->ksnp_id), |
5e8f6920 | 2300 | &conn->ksnc_ipaddr, |
d7e09d03 PT |
2301 | conn->ksnc_port); |
2302 | break; | |
2303 | } | |
2304 | ||
71397095 | 2305 | return conn; |
d7e09d03 PT |
2306 | } |
2307 | ||
2308 | if (conn->ksnc_rx_started && | |
2309 | cfs_time_aftereq(cfs_time_current(), | |
2310 | conn->ksnc_rx_deadline)) { | |
2311 | /* Timed out incomplete incoming message */ | |
2312 | ksocknal_conn_addref(conn); | |
2d00bd17 | 2313 | CNETERR("Timeout receiving from %s (%pI4h:%d), state %d wanted %d left %d\n", |
d7e09d03 | 2314 | libcfs_id2str(peer->ksnp_id), |
5e8f6920 | 2315 | &conn->ksnc_ipaddr, |
d7e09d03 PT |
2316 | conn->ksnc_port, |
2317 | conn->ksnc_rx_state, | |
2318 | conn->ksnc_rx_nob_wanted, | |
2319 | conn->ksnc_rx_nob_left); | |
71397095 | 2320 | return conn; |
d7e09d03 PT |
2321 | } |
2322 | ||
2323 | if ((!list_empty(&conn->ksnc_tx_queue) || | |
5fd88337 | 2324 | conn->ksnc_sock->sk->sk_wmem_queued) && |
d7e09d03 PT |
2325 | cfs_time_aftereq(cfs_time_current(), |
2326 | conn->ksnc_tx_deadline)) { | |
4420cfd3 JS |
2327 | /* |
2328 | * Timed out messages queued for sending or | |
2329 | * buffered in the socket's send buffer | |
2330 | */ | |
d7e09d03 | 2331 | ksocknal_conn_addref(conn); |
2d00bd17 | 2332 | CNETERR("Timeout sending data to %s (%pI4h:%d) the network or that node may be down.\n", |
d7e09d03 | 2333 | libcfs_id2str(peer->ksnp_id), |
5e8f6920 | 2334 | &conn->ksnc_ipaddr, |
d7e09d03 | 2335 | conn->ksnc_port); |
71397095 | 2336 | return conn; |
d7e09d03 PT |
2337 | } |
2338 | } | |
2339 | ||
71397095 | 2340 | return NULL; |
d7e09d03 PT |
2341 | } |
2342 | ||
2343 | static inline void | |
ff13fd40 | 2344 | ksocknal_flush_stale_txs(struct ksock_peer *peer) |
d7e09d03 | 2345 | { |
ff13fd40 JS |
2346 | struct ksock_tx *tx; |
2347 | struct ksock_tx *tmp; | |
97d10d0a | 2348 | LIST_HEAD(stale_txs); |
d7e09d03 PT |
2349 | |
2350 | write_lock_bh(&ksocknal_data.ksnd_global_lock); | |
2351 | ||
0daec763 | 2352 | list_for_each_entry_safe(tx, tmp, &peer->ksnp_tx_queue, tx_list) { |
d7e09d03 PT |
2353 | if (!cfs_time_aftereq(cfs_time_current(), |
2354 | tx->tx_deadline)) | |
2355 | break; | |
2356 | ||
b31e64c4 JS |
2357 | list_del(&tx->tx_list); |
2358 | list_add_tail(&tx->tx_list, &stale_txs); | |
d7e09d03 PT |
2359 | } |
2360 | ||
2361 | write_unlock_bh(&ksocknal_data.ksnd_global_lock); | |
2362 | ||
2363 | ksocknal_txlist_done(peer->ksnp_ni, &stale_txs, 1); | |
2364 | } | |
2365 | ||
f9cd474f | 2366 | static int |
ff13fd40 | 2367 | ksocknal_send_keepalive_locked(struct ksock_peer *peer) |
a161de86 | 2368 | __must_hold(&ksocknal_data.ksnd_global_lock) |
d7e09d03 | 2369 | { |
ff13fd40 JS |
2370 | struct ksock_sched *sched; |
2371 | struct ksock_conn *conn; | |
2372 | struct ksock_tx *tx; | |
d7e09d03 PT |
2373 | |
2374 | if (list_empty(&peer->ksnp_conns)) /* last_alive will be updated by create_conn */ | |
2375 | return 0; | |
2376 | ||
2377 | if (peer->ksnp_proto != &ksocknal_protocol_v3x) | |
2378 | return 0; | |
2379 | ||
2380 | if (*ksocknal_tunables.ksnd_keepalive <= 0 || | |
699503bc GKH |
2381 | time_before(cfs_time_current(), |
2382 | cfs_time_add(peer->ksnp_last_alive, | |
2383 | cfs_time_seconds(*ksocknal_tunables.ksnd_keepalive)))) | |
d7e09d03 PT |
2384 | return 0; |
2385 | ||
699503bc | 2386 | if (time_before(cfs_time_current(), peer->ksnp_send_keepalive)) |
d7e09d03 PT |
2387 | return 0; |
2388 | ||
4420cfd3 JS |
2389 | /* |
2390 | * retry 10 secs later, so we wouldn't put pressure | |
2391 | * on this peer if we failed to send keepalive this time | |
2392 | */ | |
d7e09d03 PT |
2393 | peer->ksnp_send_keepalive = cfs_time_shift(10); |
2394 | ||
2395 | conn = ksocknal_find_conn_locked(peer, NULL, 1); | |
06ace26e | 2396 | if (conn) { |
d7e09d03 PT |
2397 | sched = conn->ksnc_scheduler; |
2398 | ||
2399 | spin_lock_bh(&sched->kss_lock); | |
2400 | if (!list_empty(&conn->ksnc_tx_queue)) { | |
2401 | spin_unlock_bh(&sched->kss_lock); | |
2402 | /* there is an queued ACK, don't need keepalive */ | |
2403 | return 0; | |
2404 | } | |
2405 | ||
2406 | spin_unlock_bh(&sched->kss_lock); | |
2407 | } | |
2408 | ||
2409 | read_unlock(&ksocknal_data.ksnd_global_lock); | |
2410 | ||
2411 | /* cookie = 1 is reserved for keepalive PING */ | |
2412 | tx = ksocknal_alloc_tx_noop(1, 1); | |
06ace26e | 2413 | if (!tx) { |
d7e09d03 PT |
2414 | read_lock(&ksocknal_data.ksnd_global_lock); |
2415 | return -ENOMEM; | |
2416 | } | |
2417 | ||
5fd88337 | 2418 | if (!ksocknal_launch_packet(peer->ksnp_ni, tx, peer->ksnp_id)) { |
d7e09d03 PT |
2419 | read_lock(&ksocknal_data.ksnd_global_lock); |
2420 | return 1; | |
2421 | } | |
2422 | ||
2423 | ksocknal_free_tx(tx); | |
2424 | read_lock(&ksocknal_data.ksnd_global_lock); | |
2425 | ||
2426 | return -EIO; | |
2427 | } | |
2428 | ||
f9cd474f | 2429 | static void |
b31e64c4 | 2430 | ksocknal_check_peer_timeouts(int idx) |
d7e09d03 | 2431 | { |
97d10d0a | 2432 | struct list_head *peers = &ksocknal_data.ksnd_peers[idx]; |
ff13fd40 JS |
2433 | struct ksock_peer *peer; |
2434 | struct ksock_conn *conn; | |
2435 | struct ksock_tx *tx; | |
d7e09d03 PT |
2436 | |
2437 | again: | |
4420cfd3 JS |
2438 | /* |
2439 | * NB. We expect to have a look at all the peers and not find any | |
d7e09d03 | 2440 | * connections to time out, so we just use a shared lock while we |
4420cfd3 JS |
2441 | * take a look... |
2442 | */ | |
d7e09d03 PT |
2443 | read_lock(&ksocknal_data.ksnd_global_lock); |
2444 | ||
2445 | list_for_each_entry(peer, peers, ksnp_list) { | |
97d10d0a MS |
2446 | unsigned long deadline = 0; |
2447 | int resid = 0; | |
2448 | int n = 0; | |
d7e09d03 | 2449 | |
5fd88337 | 2450 | if (ksocknal_send_keepalive_locked(peer)) { |
d7e09d03 PT |
2451 | read_unlock(&ksocknal_data.ksnd_global_lock); |
2452 | goto again; | |
2453 | } | |
2454 | ||
b31e64c4 | 2455 | conn = ksocknal_find_timed_out_conn(peer); |
d7e09d03 | 2456 | |
06ace26e | 2457 | if (conn) { |
d7e09d03 PT |
2458 | read_unlock(&ksocknal_data.ksnd_global_lock); |
2459 | ||
b31e64c4 | 2460 | ksocknal_close_conn_and_siblings(conn, -ETIMEDOUT); |
d7e09d03 | 2461 | |
4420cfd3 JS |
2462 | /* |
2463 | * NB we won't find this one again, but we can't | |
d7e09d03 | 2464 | * just proceed with the next peer, since we dropped |
4420cfd3 JS |
2465 | * ksnd_global_lock and it might be dead already! |
2466 | */ | |
d7e09d03 PT |
2467 | ksocknal_conn_decref(conn); |
2468 | goto again; | |
2469 | } | |
2470 | ||
4420cfd3 JS |
2471 | /* |
2472 | * we can't process stale txs right here because we're | |
2473 | * holding only shared lock | |
2474 | */ | |
b31e64c4 | 2475 | if (!list_empty(&peer->ksnp_tx_queue)) { |
ff13fd40 JS |
2476 | struct ksock_tx *tx = list_entry(peer->ksnp_tx_queue.next, |
2477 | struct ksock_tx, tx_list); | |
d7e09d03 PT |
2478 | |
2479 | if (cfs_time_aftereq(cfs_time_current(), | |
2480 | tx->tx_deadline)) { | |
d7e09d03 PT |
2481 | ksocknal_peer_addref(peer); |
2482 | read_unlock(&ksocknal_data.ksnd_global_lock); | |
2483 | ||
2484 | ksocknal_flush_stale_txs(peer); | |
2485 | ||
2486 | ksocknal_peer_decref(peer); | |
2487 | goto again; | |
2488 | } | |
2489 | } | |
2490 | ||
2491 | if (list_empty(&peer->ksnp_zc_req_list)) | |
2492 | continue; | |
2493 | ||
2494 | spin_lock(&peer->ksnp_lock); | |
2495 | list_for_each_entry(tx, &peer->ksnp_zc_req_list, tx_zc_list) { | |
2496 | if (!cfs_time_aftereq(cfs_time_current(), | |
2497 | tx->tx_deadline)) | |
2498 | break; | |
2499 | /* ignore the TX if connection is being closed */ | |
2500 | if (tx->tx_conn->ksnc_closing) | |
2501 | continue; | |
2502 | n++; | |
2503 | } | |
2504 | ||
5fd88337 | 2505 | if (!n) { |
d7e09d03 PT |
2506 | spin_unlock(&peer->ksnp_lock); |
2507 | continue; | |
2508 | } | |
2509 | ||
2510 | tx = list_entry(peer->ksnp_zc_req_list.next, | |
ff13fd40 | 2511 | struct ksock_tx, tx_zc_list); |
d7e09d03 | 2512 | deadline = tx->tx_deadline; |
97d10d0a MS |
2513 | resid = tx->tx_resid; |
2514 | conn = tx->tx_conn; | |
d7e09d03 PT |
2515 | ksocknal_conn_addref(conn); |
2516 | ||
2517 | spin_unlock(&peer->ksnp_lock); | |
2518 | read_unlock(&ksocknal_data.ksnd_global_lock); | |
2519 | ||
2d00bd17 | 2520 | CERROR("Total %d stale ZC_REQs for peer %s detected; the oldest(%p) timed out %ld secs ago, resid: %d, wmem: %d\n", |
d7e09d03 PT |
2521 | n, libcfs_nid2str(peer->ksnp_id.nid), tx, |
2522 | cfs_duration_sec(cfs_time_current() - deadline), | |
fb4a1539 | 2523 | resid, conn->ksnc_sock->sk->sk_wmem_queued); |
d7e09d03 | 2524 | |
b31e64c4 | 2525 | ksocknal_close_conn_and_siblings(conn, -ETIMEDOUT); |
d7e09d03 PT |
2526 | ksocknal_conn_decref(conn); |
2527 | goto again; | |
2528 | } | |
2529 | ||
2530 | read_unlock(&ksocknal_data.ksnd_global_lock); | |
2531 | } | |
2532 | ||
2533 | int | |
b31e64c4 | 2534 | ksocknal_reaper(void *arg) |
d7e09d03 | 2535 | { |
97d10d0a | 2536 | wait_queue_t wait; |
ff13fd40 JS |
2537 | struct ksock_conn *conn; |
2538 | struct ksock_sched *sched; | |
97d10d0a MS |
2539 | struct list_head enomem_conns; |
2540 | int nenomem_conns; | |
2541 | long timeout; | |
2542 | int i; | |
2543 | int peer_index = 0; | |
2544 | unsigned long deadline = cfs_time_current(); | |
2545 | ||
2546 | cfs_block_allsigs(); | |
d7e09d03 PT |
2547 | |
2548 | INIT_LIST_HEAD(&enomem_conns); | |
9e795d35 | 2549 | init_waitqueue_entry(&wait, current); |
d7e09d03 PT |
2550 | |
2551 | spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); | |
2552 | ||
2553 | while (!ksocknal_data.ksnd_shuttingdown) { | |
b31e64c4 JS |
2554 | if (!list_empty(&ksocknal_data.ksnd_deathrow_conns)) { |
2555 | conn = list_entry(ksocknal_data.ksnd_deathrow_conns.next, | |
ff13fd40 | 2556 | struct ksock_conn, ksnc_list); |
b31e64c4 | 2557 | list_del(&conn->ksnc_list); |
d7e09d03 PT |
2558 | |
2559 | spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); | |
2560 | ||
2561 | ksocknal_terminate_conn(conn); | |
2562 | ksocknal_conn_decref(conn); | |
2563 | ||
2564 | spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); | |
2565 | continue; | |
2566 | } | |
2567 | ||
b31e64c4 JS |
2568 | if (!list_empty(&ksocknal_data.ksnd_zombie_conns)) { |
2569 | conn = list_entry(ksocknal_data.ksnd_zombie_conns.next, | |
ff13fd40 | 2570 | struct ksock_conn, ksnc_list); |
b31e64c4 | 2571 | list_del(&conn->ksnc_list); |
d7e09d03 PT |
2572 | |
2573 | spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); | |
2574 | ||
2575 | ksocknal_destroy_conn(conn); | |
2576 | ||
2577 | spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); | |
2578 | continue; | |
2579 | } | |
2580 | ||
b31e64c4 | 2581 | if (!list_empty(&ksocknal_data.ksnd_enomem_conns)) { |
d7e09d03 | 2582 | list_add(&enomem_conns, |
c314c319 | 2583 | &ksocknal_data.ksnd_enomem_conns); |
d7e09d03 PT |
2584 | list_del_init(&ksocknal_data.ksnd_enomem_conns); |
2585 | } | |
2586 | ||
2587 | spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); | |
2588 | ||
2589 | /* reschedule all the connections that stalled with ENOMEM... */ | |
2590 | nenomem_conns = 0; | |
b31e64c4 | 2591 | while (!list_empty(&enomem_conns)) { |
ff13fd40 | 2592 | conn = list_entry(enomem_conns.next, struct ksock_conn, |
c314c319 | 2593 | ksnc_tx_list); |
b31e64c4 | 2594 | list_del(&conn->ksnc_tx_list); |
d7e09d03 PT |
2595 | |
2596 | sched = conn->ksnc_scheduler; | |
2597 | ||
2598 | spin_lock_bh(&sched->kss_lock); | |
2599 | ||
2600 | LASSERT(conn->ksnc_tx_scheduled); | |
2601 | conn->ksnc_tx_ready = 1; | |
2602 | list_add_tail(&conn->ksnc_tx_list, | |
c314c319 | 2603 | &sched->kss_tx_conns); |
d7e09d03 PT |
2604 | wake_up(&sched->kss_waitq); |
2605 | ||
2606 | spin_unlock_bh(&sched->kss_lock); | |
2607 | nenomem_conns++; | |
2608 | } | |
2609 | ||
2610 | /* careful with the jiffy wrap... */ | |
2611 | while ((timeout = cfs_time_sub(deadline, | |
2612 | cfs_time_current())) <= 0) { | |
2613 | const int n = 4; | |
2614 | const int p = 1; | |
97d10d0a | 2615 | int chunk = ksocknal_data.ksnd_peer_hash_size; |
d7e09d03 | 2616 | |
4420cfd3 JS |
2617 | /* |
2618 | * Time to check for timeouts on a few more peers: I do | |
d7e09d03 PT |
2619 | * checks every 'p' seconds on a proportion of the peer |
2620 | * table and I need to check every connection 'n' times | |
2621 | * within a timeout interval, to ensure I detect a | |
2622 | * timeout on any connection within (n+1)/n times the | |
4420cfd3 JS |
2623 | * timeout interval. |
2624 | */ | |
d7e09d03 PT |
2625 | if (*ksocknal_tunables.ksnd_timeout > n * p) |
2626 | chunk = (chunk * n * p) / | |
2627 | *ksocknal_tunables.ksnd_timeout; | |
5fd88337 | 2628 | if (!chunk) |
d7e09d03 PT |
2629 | chunk = 1; |
2630 | ||
2631 | for (i = 0; i < chunk; i++) { | |
b31e64c4 | 2632 | ksocknal_check_peer_timeouts(peer_index); |
d7e09d03 PT |
2633 | peer_index = (peer_index + 1) % |
2634 | ksocknal_data.ksnd_peer_hash_size; | |
2635 | } | |
2636 | ||
2637 | deadline = cfs_time_add(deadline, cfs_time_seconds(p)); | |
2638 | } | |
2639 | ||
5fd88337 | 2640 | if (nenomem_conns) { |
4420cfd3 JS |
2641 | /* |
2642 | * Reduce my timeout if I rescheduled ENOMEM conns. | |
d7e09d03 | 2643 | * This also prevents me getting woken immediately |
4420cfd3 JS |
2644 | * if any go back on my enomem list. |
2645 | */ | |
d7e09d03 PT |
2646 | timeout = SOCKNAL_ENOMEM_RETRY; |
2647 | } | |
2648 | ksocknal_data.ksnd_reaper_waketime = | |
2649 | cfs_time_add(cfs_time_current(), timeout); | |
2650 | ||
b31e64c4 JS |
2651 | set_current_state(TASK_INTERRUPTIBLE); |
2652 | add_wait_queue(&ksocknal_data.ksnd_reaper_waitq, &wait); | |
d7e09d03 PT |
2653 | |
2654 | if (!ksocknal_data.ksnd_shuttingdown && | |
b31e64c4 JS |
2655 | list_empty(&ksocknal_data.ksnd_deathrow_conns) && |
2656 | list_empty(&ksocknal_data.ksnd_zombie_conns)) | |
b7efb98d | 2657 | schedule_timeout(timeout); |
d7e09d03 | 2658 | |
b31e64c4 JS |
2659 | set_current_state(TASK_RUNNING); |
2660 | remove_wait_queue(&ksocknal_data.ksnd_reaper_waitq, &wait); | |
d7e09d03 PT |
2661 | |
2662 | spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); | |
2663 | } | |
2664 | ||
2665 | spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); | |
2666 | ||
2667 | ksocknal_thread_fini(); | |
2668 | return 0; | |
2669 | } |