/* net/ipv4/tcp_ipv4.c — as of commit "net: remove inet6_reqsk_alloc" (deliverable/linux.git) */
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/netdma.h>
#include <net/secure_seq.h>
#include <net/tcp_memcontrol.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <linux/crypto.h>
#include <linux/scatterlist.h>

int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_low_latency);

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
{
	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
					  ip_hdr(skb)->saddr,
					  tcp_hdr(skb)->dest,
					  tcp_hdr(skb)->source);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (twp == NULL || (sysctl_tcp_tw_reuse &&
			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

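/* Editor's note (not in the original source): the "+ 65535 + 2" above is,
 * roughly speaking, chosen so that the reused connection's first sequence
 * number lies beyond anything the old TIME-WAIT incarnation could still have
 * in flight within a 64 KB window, so stray old segments cannot be mistaken
 * for new data.
 */
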
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     sock_owned_by_user(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	inet->inet_rcv_saddr = inet->inet_saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq	   = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
		tcp_fetch_timewait_stamp(sk, &rt->dst);

	inet->inet_dport = usin->sin_port;
	inet->inet_daddr = daddr;

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket. */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);

	if (!tp->write_seq && likely(!tp->repair))
		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
							   inet->inet_daddr,
							   inet->inet_sport,
							   usin->sin_port);

	inet->inet_id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);

	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

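/* Editor's note: tcp_v4_connect() is the AF_INET handler behind connect(2)
 * on a TCP socket. A minimal userspace trigger (illustrative only, address
 * 192.0.2.1 is a documentation address):
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in sin = { .sin_family = AF_INET,
 *				   .sin_port   = htons(80) };
 *	inet_pton(AF_INET, "192.0.2.1", &sin.sin_addr);
 *	connect(fd, (struct sockaddr *)&sin, sizeof(sin));
 */
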
/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
static void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);
	u32 mtu = tcp_sk(sk)->mtu_info;

	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *fastopen;
	__u32 seq, snd_una;
	__u32 remaining;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	if (icmp_skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}

	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
			 iph->saddr, th->source, inet_iif(icmp_skb));
	if (!sk) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = tp->fastopen_rsk;
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always <576 bytes, so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff || fastopen)
			break;

		if (sock_owned_by_user(sk))
			break;

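		/* Editor's note: the revert below undoes one exponential
		 * backoff step and recomputes the RTO from scratch: the base
		 * RTO (from srtt if available, else TCP_TIMEOUT_INIT) shifted
		 * left by the remaining backoff count. "remaining" is then how
		 * much of that reverted RTO is still outstanding, measured
		 * from the head skb's original transmit timestamp.
		 */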
		icsk->icsk_backoff--;
		inet_csk(sk)->icsk_rto = (tp->srtt_us ? __tcp_set_rto(tp) :
			TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
		tcp_bound_rto(sk);

		skb = tcp_write_queue_head(sk);
		BUG_ON(!skb);

		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
				tcp_time_stamp - TCP_SKB_CB(skb)->when);

		if (remaining) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
		struct request_sock *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = inet_csk_search_req(sk, &prev, th->dest,
					  iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		WARN_ON(req->sk);

		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(sk, req, prev);
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket
		 * is already accepted it is treated as a connected one below.
		 */
		if (fastopen && fastopen->sk == NULL)
			break;

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}

void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(skb->len, saddr, daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So that we build reply only basing on parameters
 *		arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

#ifdef CONFIG_TCP_MD5SIG
	hash_location = tcp_parse_md5sig_option(th);
	if (!sk && hash_location) {
		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we are not losing security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
					     &tcp_hashinfo, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb));
		/* don't send rst if it can't find key */
		if (!sk1)
			return;
		rcu_read_lock();
		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto release_sk1;

		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto release_sk1;
	} else {
		key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					     &ip_hdr(skb)->saddr,
					     AF_INET) : NULL;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	net = dev_net(skb_dst(skb)->dev);
	arg.tos = ip_hdr(skb)->tos;
	ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
			      ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);

#ifdef CONFIG_TCP_MD5SIG
release_sk1:
	if (sk1) {
		rcu_read_unlock();
		sock_put(sk1);
	}
#endif
}

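/* Editor's note on the option block built above: TCPOLEN_MD5SIG is 18 bytes
 * (kind, length, 16-byte digest), so it is prefixed with two NOPs to reach
 * the 20-byte, 32-bit aligned TCPOLEN_MD5SIG_ALIGNED that rep.opt[] is sized
 * for; the digest itself lands in rep.opt[1..4].
 */
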
/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct ip_reply_arg arg;
	struct net *net = dev_net(skb_dst(skb)->dev);

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
			      ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
}

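/* Editor's note: the timestamp block above follows the same encoding pattern
 * as the MD5 option - NOP, NOP, TCPOPT_TIMESTAMP, TCPOLEN_TIMESTAMP (10)
 * packed into one 32-bit word, followed by TSval and TSecr, 12 bytes total
 * (TCPOLEN_TSTAMP_ALIGNED); when an MD5 key is also present its option starts
 * at rep.opt[3], which is what "offset = (tsecr) ? 3 : 0" encodes.
 */
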
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
			tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
			tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
			tcp_time_stamp,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
					  AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
			      struct request_sock *req,
			      u16 queue_mapping,
			      struct tcp_fastopen_cookie *foc)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		skb_set_queue_mapping(skb, queue_mapping);
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
		if (!tcp_rsk(req)->snt_synack && !err)
			tcp_rsk(req)->snt_synack = tcp_time_stamp;
	}

	return err;
}

static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
{
	int res = tcp_v4_send_synack(sk, NULL, req, 0, NULL);

	if (!res) {
		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
	}
	return res;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}

/*
 * Return true if a syncookie should be sent
 */
bool tcp_syn_flood_action(struct sock *sk,
			  const struct sk_buff *skb,
			  const char *proto)
{
	const char *msg = "Dropping request";
	bool want_cookie = false;
	struct listen_sock *lopt;

#ifdef CONFIG_SYN_COOKIES
	if (sysctl_tcp_syncookies) {
		msg = "Sending cookies";
		want_cookie = true;
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
	} else
#endif
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);

	lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
	if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) {
		lopt->synflood_warned = 1;
		pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
			proto, ntohs(tcp_hdr(skb)->dest), msg);
	}
	return want_cookie;
}
EXPORT_SYMBOL(tcp_syn_flood_action);

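/* Editor's note: as used above, sysctl_tcp_syncookies acts as a tristate:
 * 0 never sends cookies, 1 sends them when the SYN queue overflows, and 2
 * sends them unconditionally (tcp_v4_conn_request() below also checks
 * "== 2", and the flood warning is suppressed in that mode since cookies
 * are then the expected behaviour).
 */
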
/*
 * Save and compile IPv4 options into the request_sock if needed.
 */
static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
{
	const struct ip_options *opt = &(IPCB(skb)->opt);
	struct ip_options_rcu *dopt = NULL;

	if (opt && opt->optlen) {
		int opt_size = sizeof(*dopt) + opt->optlen;

		dopt = kmalloc(opt_size, GFP_ATOMIC);
		if (dopt) {
			if (ip_options_echo(&dopt->opt, skb)) {
				kfree(dopt);
				dopt = NULL;
			}
		}
	}
	return dopt;
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
					 const union tcp_md5_addr *addr,
					 int family)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       sock_owned_by_user(sk) ||
				       lockdep_is_held(&sk->sk_lock.slock));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;
		if (!memcmp(&key->addr, addr, size))
			return key;
	}
	return NULL;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);

struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
					 struct sock *addr_sk)
{
	union tcp_md5_addr *addr;

	addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
						      struct request_sock *req)
{
	union tcp_md5_addr *addr;

	addr = (union tcp_md5_addr *)&inet_rsk(req)->ir_rmt_addr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   sock_owned_by_user(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
				 int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (!cmd.tcpm_key || !cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
				      AF_INET);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
			      GFP_KERNEL);
}

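/* Editor's note: tcp_v4_parse_md5_keys() above is reached via the TCP_MD5SIG
 * socket option. A hedged userspace sketch (illustrative only; the peer
 * address and key are made up):
 *
 *	struct tcp_md5sig md5;
 *	memset(&md5, 0, sizeof(md5));
 *	((struct sockaddr_in *)&md5.tcpm_addr)->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1",
 *		  &((struct sockaddr_in *)&md5.tcpm_addr)->sin_addr);
 *	md5.tcpm_keylen = 6;
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * A zero tcpm_keylen deletes the key, as the !cmd.tcpm_keylen branch shows.
 */
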
static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
					__be32 daddr, __be32 saddr, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;

	bp = &hp->md5_blk.ip4;

	/*
	 * 1. the TCP pseudo-header (in the order: source IP address,
	 * destination IP address, zero-padded protocol number, and
	 * segment length)
	 */
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	sg_init_one(&sg, bp, sizeof(*bp));
	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;
	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
			const struct sock *sk, const struct request_sock *req,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) {
		saddr = inet_sk(sk)->inet_saddr;
		daddr = inet_sk(sk)->inet_daddr;
	} else if (req) {
		saddr = inet_rsk(req)->ir_loc_addr;
		daddr = inet_rsk(req)->ir_rmt_addr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;

	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
{
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
					  AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
					     : "");
		return true;
	}
	return false;
}

#endif

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_v4_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
};
#endif

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_options_received tmp_opt;
	struct request_sock *req;
	struct inet_request_sock *ireq;
	struct tcp_sock *tp = tcp_sk(sk);
	struct dst_entry *dst = NULL;
	__be32 saddr = ip_hdr(skb)->saddr;
	__be32 daddr = ip_hdr(skb)->daddr;
	__u32 isn = TCP_SKB_CB(skb)->when;
	bool want_cookie = false, fastopen;
	struct flowi4 fl4;
	struct tcp_fastopen_cookie foc = { .len = -1 };
	int err;

	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	/* TW buckets are converted to open requests without
	 * limitations, they conserve resources and peer is
	 * evidently real one.
	 */
	if ((sysctl_tcp_syncookies == 2 ||
	     inet_csk_reqsk_queue_is_full(sk)) && !isn) {
		want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
		if (!want_cookie)
			goto drop;
	}

	/* Accept backlog is full. If we have already queued enough
	 * of warm entries in syn queue, drop request. It is better than
	 * clogging syn queue with openreqs with exponentially increasing
	 * timeout.
	 */
	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
		goto drop;
	}

	req = inet_reqsk_alloc(&tcp_request_sock_ops);
	if (!req)
		goto drop;

#ifdef CONFIG_TCP_MD5SIG
	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif

	tcp_clear_options(&tmp_opt);
	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
	tmp_opt.user_mss  = tp->rx_opt.user_mss;
	tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);

	if (want_cookie && !tmp_opt.saw_tstamp)
		tcp_clear_options(&tmp_opt);

	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
	tcp_openreq_init(req, &tmp_opt, skb, sk);

	ireq = inet_rsk(req);
	ireq->ir_loc_addr = daddr;
	ireq->ir_rmt_addr = saddr;
	ireq->no_srccheck = inet_sk(sk)->transparent;
	ireq->opt = tcp_v4_save_options(skb);

	if (security_inet_conn_request(sk, skb, req))
		goto drop_and_free;

	if (!want_cookie || tmp_opt.tstamp_ok)
		TCP_ECN_create_request(req, skb, sock_net(sk));

	if (want_cookie) {
		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
		req->cookie_ts = tmp_opt.tstamp_ok;
	} else if (!isn) {
		/* VJ's idea. We save last timestamp seen
		 * from the destination in peer table, when entering
		 * state TIME-WAIT, and check against it before
		 * accepting new connection request.
		 *
		 * If "isn" is not zero, this request hit alive
		 * timewait bucket, so that all the necessary checks
		 * are made in the function processing timewait state.
		 */
		if (tmp_opt.saw_tstamp &&
		    tcp_death_row.sysctl_tw_recycle &&
		    (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
		    fl4.daddr == saddr) {
			if (!tcp_peer_is_proven(req, dst, true)) {
				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
				goto drop_and_release;
			}
		}
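		/* Editor's note: in the clause below,
		 * "sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
		 * (sysctl_max_syn_backlog >> 2)" is true once fewer than a
		 * quarter of the backlog slots remain, i.e. the SYN queue is
		 * more than three-quarters full; past that point, with
		 * syncookies off, only destinations proven alive via the
		 * peer table are still admitted.
		 */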
		/* Kill the following clause, if you dislike this way. */
		else if (!sysctl_tcp_syncookies &&
			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
			  (sysctl_max_syn_backlog >> 2)) &&
			 !tcp_peer_is_proven(req, dst, false)) {
			/* Without syncookies last quarter of
			 * backlog is filled with destinations,
			 * proven to be alive.
			 * It means that we continue to communicate
			 * to destinations, already remembered
			 * to the moment of synflood.
			 */
			LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
				       &saddr, ntohs(tcp_hdr(skb)->source));
			goto drop_and_release;
		}

		isn = tcp_v4_init_sequence(skb);
	}
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		goto drop_and_free;

	tcp_rsk(req)->snt_isn = isn;
	tcp_openreq_init_rwin(req, sk, dst);
	fastopen = !want_cookie &&
		   tcp_try_fastopen(sk, skb, req, &foc, dst);
	err = tcp_v4_send_synack(sk, dst, req,
				 skb_get_queue_mapping(skb), &foc);
	if (!fastopen) {
		if (err || want_cookie)
			goto drop_and_free;

		tcp_rsk(req)->listener = NULL;
		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
	}

	return 0;

drop_and_release:
	dst_release(dst);
drop_and_free:
	reqsk_free(req);
drop:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);

/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	newinet->inet_daddr   = ireq->ir_rmt_addr;
	newinet->inet_rcv_saddr = ireq->ir_loc_addr;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = ireq->opt;
	rcu_assign_pointer(newinet->inet_opt, inet_opt);
	ireq->opt	      = NULL;
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric_advmss(dst);
	if (tcp_sk(sk)->rx_opt.user_mss &&
	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key != NULL) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	__inet_hash_nolisten(newsk, NULL);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	return NULL;
put_and_exit:
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);
	const struct iphdr *iph = ip_hdr(skb);
	struct sock *nsk;
	struct request_sock **prev;
	/* Find possible connection requests. */
	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
						       iph->saddr, iph->daddr);
	if (req)
		return tcp_check_req(sk, skb, req, prev, false);

	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
			th->source, iph->daddr, th->dest, inet_iif(skb));

	if (nsk) {
		if (nsk->sk_state != TCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;
		}
		inet_twsk_put(inet_twsk(nsk));
		return NULL;
	}

#ifdef CONFIG_SYN_COOKIES
	if (!th->syn)
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
	return sk;
}

/* The socket must have its spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * We really want to reject the packet as early as possible
	 * if:
	 *  o We're expecting an MD5'd packet and this is no MD5 tcp option
	 *  o There is an MD5 option and we're not expecting one
	 */
	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard;
#endif

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst = sk->sk_rx_dst;

		sock_rps_save_rxhash(sk, skb);
		if (dst) {
			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    dst->ops->check(dst, 0) == NULL) {
				dst_release(dst);
				sk->sk_rx_dst = NULL;
			}
		}
		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
		return 0;
	}

	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;

		if (nsk != sk) {
			sock_rps_save_rxhash(nsk, skb);
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);

void tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif);
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk->sk_state != TCP_TIME_WAIT) {
			struct dst_entry *dst = sk->sk_rx_dst;

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
}

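/* Editor's note: the lookup above is the receive-side "early demux"
 * shortcut: for established sockets it stashes the socket in skb->sk and,
 * when the cached sk->sk_rx_dst is still valid for the incoming interface,
 * reuses it via skb_dst_set_noref(), skipping a routing lookup;
 * tcp_v4_do_rcv() re-validates that cached dst on its fast path.
 */
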
b2fb4f54
ED
1645/* Packet is added to VJ-style prequeue for processing in process
1646 * context, if a reader task is waiting. Apparently, this exciting
1647 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1648 * failed somewhere. Latency? Burstiness? Well, at least now we will
1649 * see, why it failed. 8)8) --ANK
1650 *
1651 */
1652bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1653{
1654 struct tcp_sock *tp = tcp_sk(sk);
1655
1656 if (sysctl_tcp_low_latency || !tp->ucopy.task)
1657 return false;
1658
1659 if (skb->len <= tcp_hdrlen(skb) &&
1660 skb_queue_len(&tp->ucopy.prequeue) == 0)
1661 return false;
1662
58717686 1663 skb_dst_force(skb);
b2fb4f54
ED
1664 __skb_queue_tail(&tp->ucopy.prequeue, skb);
1665 tp->ucopy.memory += skb->truesize;
1666 if (tp->ucopy.memory > sk->sk_rcvbuf) {
1667 struct sk_buff *skb1;
1668
1669 BUG_ON(sock_owned_by_user(sk));
1670
1671 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1672 sk_backlog_rcv(sk, skb1);
1673 NET_INC_STATS_BH(sock_net(sk),
1674 LINUX_MIB_TCPPREQUEUEDROPPED);
1675 }
1676
1677 tp->ucopy.memory = 0;
1678 } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1679 wake_up_interruptible_sync_poll(sk_sleep(sk),
1680 POLLIN | POLLRDNORM | POLLRDBAND);
1681 if (!inet_csk_ack_scheduled(sk))
1682 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1683 (3 * tcp_rto_min(sk)) / 4,
1684 TCP_RTO_MAX);
1685 }
1686 return true;
1687}
1688EXPORT_SYMBOL(tcp_prequeue);
1689
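The sysctl_tcp_low_latency test at the top of tcp_prequeue() gives administrators a switch to bypass prequeueing entirely. A minimal user-space sketch, assuming procfs is mounted at /proc and the program runs with sufficient privilege; the path corresponds to the sysctl_tcp_tw_reuse-style knobs declared at the top of this file:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_low_latency", "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	fputs("1\n", f);	/* any non-zero value disables VJ-style prequeueing */
	fclose(f);
	return 0;
}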
1da177e4
LT
1690/*
1691 * From tcp_input.c
1692 */
1693
1694int tcp_v4_rcv(struct sk_buff *skb)
1695{
eddc9ec5 1696 const struct iphdr *iph;
cf533ea5 1697 const struct tcphdr *th;
1da177e4
LT
1698 struct sock *sk;
1699 int ret;
a86b1e30 1700 struct net *net = dev_net(skb->dev);
1da177e4
LT
1701
1702 if (skb->pkt_type != PACKET_HOST)
1703 goto discard_it;
1704
1705 /* Count it even if it's bad */
63231bdd 1706 TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1da177e4
LT
1707
1708 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1709 goto discard_it;
1710
aa8223c7 1711 th = tcp_hdr(skb);
1da177e4
LT
1712
1713 if (th->doff < sizeof(struct tcphdr) / 4)
1714 goto bad_packet;
1715 if (!pskb_may_pull(skb, th->doff * 4))
1716 goto discard_it;
1717
1718 /* An explanation is required here, I think.
1719 * Packet length and doff are validated by header prediction,
caa20d9a 1720 * provided the case of th->doff == 0 is eliminated.
1da177e4 1721 * So, we defer the checks. */
ed70fcfc
TH
1722
1723 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
6a5dc9e5 1724 goto csum_error;
1da177e4 1725
aa8223c7 1726 th = tcp_hdr(skb);
eddc9ec5 1727 iph = ip_hdr(skb);
1da177e4
LT
1728 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1729 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1730 skb->len - th->doff * 4);
1731 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1732 TCP_SKB_CB(skb)->when = 0;
b82d1bb4 1733 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1da177e4
LT
1734 TCP_SKB_CB(skb)->sacked = 0;
1735
9a1f27c4 1736 sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1da177e4
LT
1737 if (!sk)
1738 goto no_tcp_socket;
1739
bb134d5d
ED
1740process:
1741 if (sk->sk_state == TCP_TIME_WAIT)
1742 goto do_time_wait;
1743
6cce09f8
ED
1744 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1745 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
d218d111 1746 goto discard_and_relse;
6cce09f8 1747 }
d218d111 1748
1da177e4
LT
1749 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1750 goto discard_and_relse;
b59c2701 1751 nf_reset(skb);
1da177e4 1752
fda9ef5d 1753 if (sk_filter(sk, skb))
1da177e4
LT
1754 goto discard_and_relse;
1755
8b80cda5 1756 sk_mark_napi_id(sk, skb);
1da177e4
LT
1757 skb->dev = NULL;
1758
c6366184 1759 bh_lock_sock_nested(sk);
1da177e4
LT
1760 ret = 0;
1761 if (!sock_owned_by_user(sk)) {
1a2449a8
CL
1762#ifdef CONFIG_NET_DMA
1763 struct tcp_sock *tp = tcp_sk(sk);
1764 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
a2bd1140 1765 tp->ucopy.dma_chan = net_dma_find_channel();
1a2449a8 1766 if (tp->ucopy.dma_chan)
1da177e4 1767 ret = tcp_v4_do_rcv(sk, skb);
1a2449a8
CL
1768 else
1769#endif
1770 {
1771 if (!tcp_prequeue(sk, skb))
ae8d7f88 1772 ret = tcp_v4_do_rcv(sk, skb);
1a2449a8 1773 }
da882c1f
ED
1774 } else if (unlikely(sk_add_backlog(sk, skb,
1775 sk->sk_rcvbuf + sk->sk_sndbuf))) {
6b03a53a 1776 bh_unlock_sock(sk);
6cce09f8 1777 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
6b03a53a
ZY
1778 goto discard_and_relse;
1779 }
1da177e4
LT
1780 bh_unlock_sock(sk);
1781
1782 sock_put(sk);
1783
1784 return ret;
1785
1786no_tcp_socket:
1787 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1788 goto discard_it;
1789
1790 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
6a5dc9e5
ED
1791csum_error:
1792 TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
1da177e4 1793bad_packet:
63231bdd 1794 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1da177e4 1795 } else {
cfb6eeb4 1796 tcp_v4_send_reset(NULL, skb);
1da177e4
LT
1797 }
1798
1799discard_it:
1800 /* Discard frame. */
1801 kfree_skb(skb);
e905a9ed 1802 return 0;
1da177e4
LT
1803
1804discard_and_relse:
1805 sock_put(sk);
1806 goto discard_it;
1807
1808do_time_wait:
1809 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
9469c7b4 1810 inet_twsk_put(inet_twsk(sk));
1da177e4
LT
1811 goto discard_it;
1812 }
1813
6a5dc9e5 1814 if (skb->len < (th->doff << 2)) {
9469c7b4 1815 inet_twsk_put(inet_twsk(sk));
6a5dc9e5
ED
1816 goto bad_packet;
1817 }
1818 if (tcp_checksum_complete(skb)) {
1819 inet_twsk_put(inet_twsk(sk));
1820 goto csum_error;
1da177e4 1821 }
9469c7b4 1822 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1da177e4 1823 case TCP_TW_SYN: {
c346dca1 1824 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
c67499c0 1825 &tcp_hashinfo,
da5e3630 1826 iph->saddr, th->source,
eddc9ec5 1827 iph->daddr, th->dest,
463c84b9 1828 inet_iif(skb));
1da177e4 1829 if (sk2) {
9469c7b4
YH
1830 inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1831 inet_twsk_put(inet_twsk(sk));
1da177e4
LT
1832 sk = sk2;
1833 goto process;
1834 }
1835 /* Fall through to ACK */
1836 }
1837 case TCP_TW_ACK:
1838 tcp_v4_timewait_ack(sk, skb);
1839 break;
1840 case TCP_TW_RST:
1841 goto no_tcp_socket;
1842 case TCP_TW_SUCCESS:;
1843 }
1844 goto discard_it;
1845}
1846
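The TCP_SKB_CB() bookkeeping in tcp_v4_rcv() above encodes the rule that SYN and FIN each occupy one unit of sequence space on top of the payload bytes. A worked example in plain user-space arithmetic, with hypothetical values:

#include <stdio.h>

int main(void)
{
	unsigned int seq = 1000;		/* ntohl(th->seq) */
	unsigned int syn = 1, fin = 0;		/* header flag bits */
	unsigned int skb_len = 60;		/* total segment length in bytes */
	unsigned int doff = 8;			/* header length in 32-bit words */
	unsigned int payload = skb_len - doff * 4;	/* 60 - 32 = 28 bytes */
	unsigned int end_seq = seq + syn + fin + payload;

	printf("segment covers seq %u..%u (%u data bytes)\n",
	       seq, end_seq, payload);		/* prints 1000..1029 */
	return 0;
}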
ccb7c410
DM
1847static struct timewait_sock_ops tcp_timewait_sock_ops = {
1848 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1849 .twsk_unique = tcp_twsk_unique,
1850 .twsk_destructor= tcp_twsk_destructor,
ccb7c410 1851};
1da177e4 1852
63d02d15 1853void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
5d299f3d
ED
1854{
1855 struct dst_entry *dst = skb_dst(skb);
1856
1857 dst_hold(dst);
1858 sk->sk_rx_dst = dst;
1859 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1860}
63d02d15 1861EXPORT_SYMBOL(inet_sk_rx_dst_set);
5d299f3d 1862
3b401a81 1863const struct inet_connection_sock_af_ops ipv4_specific = {
543d9cfe
ACM
1864 .queue_xmit = ip_queue_xmit,
1865 .send_check = tcp_v4_send_check,
1866 .rebuild_header = inet_sk_rebuild_header,
5d299f3d 1867 .sk_rx_dst_set = inet_sk_rx_dst_set,
543d9cfe
ACM
1868 .conn_request = tcp_v4_conn_request,
1869 .syn_recv_sock = tcp_v4_syn_recv_sock,
543d9cfe
ACM
1870 .net_header_len = sizeof(struct iphdr),
1871 .setsockopt = ip_setsockopt,
1872 .getsockopt = ip_getsockopt,
1873 .addr2sockaddr = inet_csk_addr2sockaddr,
1874 .sockaddr_len = sizeof(struct sockaddr_in),
ab1e0a13 1875 .bind_conflict = inet_csk_bind_conflict,
3fdadf7d 1876#ifdef CONFIG_COMPAT
543d9cfe
ACM
1877 .compat_setsockopt = compat_ip_setsockopt,
1878 .compat_getsockopt = compat_ip_getsockopt,
3fdadf7d 1879#endif
1da177e4 1880};
4bc2f18b 1881EXPORT_SYMBOL(ipv4_specific);
1da177e4 1882
cfb6eeb4 1883#ifdef CONFIG_TCP_MD5SIG
b2e4b3de 1884static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
cfb6eeb4 1885 .md5_lookup = tcp_v4_md5_lookup,
49a72dfb 1886 .calc_md5_hash = tcp_v4_md5_hash_skb,
cfb6eeb4 1887 .md5_parse = tcp_v4_parse_md5_keys,
cfb6eeb4 1888};
b6332e6c 1889#endif
cfb6eeb4 1890
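For orientation, the md5_parse hook above (tcp_v4_parse_md5_keys) is reached from user space through the TCP_MD5SIG socket option. A hedged sketch of installing an RFC 2385 key for one peer; set_md5_key() is a hypothetical helper, and key management and error handling are left to the caller:

#include <linux/tcp.h>		/* struct tcp_md5sig, TCP_MD5SIG */
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>

static int set_md5_key(int fd, const struct sockaddr_in *peer,
		       const void *key, unsigned int keylen)
{
	struct tcp_md5sig md5 = { .tcpm_keylen = keylen };

	memcpy(&md5.tcpm_addr, peer, sizeof(*peer));	/* key is per-peer */
	memcpy(md5.tcpm_key, key, keylen);	/* <= TCP_MD5SIG_MAXKEYLEN */
	return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
}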
1da177e4
LT
 1891/* NOTE: A lot of things are set to zero explicitly by the call to
 1892 * sk_alloc(), so they need not be done here.
1893 */
1894static int tcp_v4_init_sock(struct sock *sk)
1895{
6687e988 1896 struct inet_connection_sock *icsk = inet_csk(sk);
1da177e4 1897
900f65d3 1898 tcp_init_sock(sk);
1da177e4 1899
8292a17a 1900 icsk->icsk_af_ops = &ipv4_specific;
900f65d3 1901
cfb6eeb4 1902#ifdef CONFIG_TCP_MD5SIG
ac807fa8 1903 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
cfb6eeb4 1904#endif
1da177e4 1905
1da177e4
LT
1906 return 0;
1907}
1908
7d06b2e0 1909void tcp_v4_destroy_sock(struct sock *sk)
1da177e4
LT
1910{
1911 struct tcp_sock *tp = tcp_sk(sk);
1912
1913 tcp_clear_xmit_timers(sk);
1914
6687e988 1915 tcp_cleanup_congestion_control(sk);
317a76f9 1916
1da177e4 1917 	/* Clean up the write buffer. */
fe067e8a 1918 tcp_write_queue_purge(sk);
1da177e4
LT
1919
1920 /* Cleans up our, hopefully empty, out_of_order_queue. */
e905a9ed 1921 __skb_queue_purge(&tp->out_of_order_queue);
1da177e4 1922
cfb6eeb4
YH
1923#ifdef CONFIG_TCP_MD5SIG
1924 /* Clean up the MD5 key list, if any */
1925 if (tp->md5sig_info) {
a915da9b 1926 tcp_clear_md5_list(sk);
a8afca03 1927 kfree_rcu(tp->md5sig_info, rcu);
cfb6eeb4
YH
1928 tp->md5sig_info = NULL;
1929 }
1930#endif
1931
1a2449a8
CL
1932#ifdef CONFIG_NET_DMA
1933 /* Cleans up our sk_async_wait_queue */
e905a9ed 1934 __skb_queue_purge(&sk->sk_async_wait_queue);
1a2449a8
CL
1935#endif
1936
1da177e4
LT
 1937 	/* Clean the prequeue; it really must be empty */
1938 __skb_queue_purge(&tp->ucopy.prequeue);
1939
1940 /* Clean up a referenced TCP bind bucket. */
463c84b9 1941 if (inet_csk(sk)->icsk_bind_hash)
ab1e0a13 1942 inet_put_port(sk);
1da177e4 1943
168a8f58 1944 BUG_ON(tp->fastopen_rsk != NULL);
435cf559 1945
cf60af03
YC
1946 /* If socket is aborted during connect operation */
1947 tcp_free_fastopen_req(tp);
1948
180d8cd9 1949 sk_sockets_allocated_dec(sk);
d1a4c0b3 1950 sock_release_memcg(sk);
1da177e4 1951}
1da177e4
LT
1952EXPORT_SYMBOL(tcp_v4_destroy_sock);
1953
1954#ifdef CONFIG_PROC_FS
1955/* Proc filesystem TCP sock list dumping. */
1956
a8b690f9
TH
1957/*
1958 * Get next listener socket follow cur. If cur is NULL, get first socket
1959 * starting from bucket given in st->bucket; when st->bucket is zero the
1960 * very first socket in the hash table is returned.
1961 */
1da177e4
LT
1962static void *listening_get_next(struct seq_file *seq, void *cur)
1963{
463c84b9 1964 struct inet_connection_sock *icsk;
c25eb3bf 1965 struct hlist_nulls_node *node;
1da177e4 1966 struct sock *sk = cur;
5caea4ea 1967 struct inet_listen_hashbucket *ilb;
5799de0b 1968 struct tcp_iter_state *st = seq->private;
a4146b1b 1969 struct net *net = seq_file_net(seq);
1da177e4
LT
1970
1971 if (!sk) {
a8b690f9 1972 ilb = &tcp_hashinfo.listening_hash[st->bucket];
5caea4ea 1973 spin_lock_bh(&ilb->lock);
c25eb3bf 1974 sk = sk_nulls_head(&ilb->head);
a8b690f9 1975 st->offset = 0;
1da177e4
LT
1976 goto get_sk;
1977 }
5caea4ea 1978 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1da177e4 1979 ++st->num;
a8b690f9 1980 ++st->offset;
1da177e4
LT
1981
1982 if (st->state == TCP_SEQ_STATE_OPENREQ) {
60236fdd 1983 struct request_sock *req = cur;
1da177e4 1984
72a3effa 1985 icsk = inet_csk(st->syn_wait_sk);
1da177e4
LT
1986 req = req->dl_next;
1987 while (1) {
1988 while (req) {
bdccc4ca 1989 if (req->rsk_ops->family == st->family) {
1da177e4
LT
1990 cur = req;
1991 goto out;
1992 }
1993 req = req->dl_next;
1994 }
72a3effa 1995 if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1da177e4
LT
1996 break;
1997get_req:
463c84b9 1998 req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1da177e4 1999 }
1bde5ac4 2000 sk = sk_nulls_next(st->syn_wait_sk);
1da177e4 2001 st->state = TCP_SEQ_STATE_LISTENING;
463c84b9 2002 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1da177e4 2003 } else {
e905a9ed 2004 icsk = inet_csk(sk);
463c84b9
ACM
2005 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2006 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1da177e4 2007 goto start_req;
463c84b9 2008 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1bde5ac4 2009 sk = sk_nulls_next(sk);
1da177e4
LT
2010 }
2011get_sk:
c25eb3bf 2012 sk_nulls_for_each_from(sk, node) {
8475ef9f
PE
2013 if (!net_eq(sock_net(sk), net))
2014 continue;
2015 if (sk->sk_family == st->family) {
1da177e4
LT
2016 cur = sk;
2017 goto out;
2018 }
e905a9ed 2019 icsk = inet_csk(sk);
463c84b9
ACM
2020 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2021 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1da177e4
LT
2022start_req:
2023 st->uid = sock_i_uid(sk);
2024 st->syn_wait_sk = sk;
2025 st->state = TCP_SEQ_STATE_OPENREQ;
2026 st->sbucket = 0;
2027 goto get_req;
2028 }
463c84b9 2029 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1da177e4 2030 }
5caea4ea 2031 spin_unlock_bh(&ilb->lock);
a8b690f9 2032 st->offset = 0;
0f7ff927 2033 if (++st->bucket < INET_LHTABLE_SIZE) {
5caea4ea
ED
2034 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2035 spin_lock_bh(&ilb->lock);
c25eb3bf 2036 sk = sk_nulls_head(&ilb->head);
1da177e4
LT
2037 goto get_sk;
2038 }
2039 cur = NULL;
2040out:
2041 return cur;
2042}
2043
2044static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2045{
a8b690f9
TH
2046 struct tcp_iter_state *st = seq->private;
2047 void *rc;
2048
2049 st->bucket = 0;
2050 st->offset = 0;
2051 rc = listening_get_next(seq, NULL);
1da177e4
LT
2052
2053 while (rc && *pos) {
2054 rc = listening_get_next(seq, rc);
2055 --*pos;
2056 }
2057 return rc;
2058}
2059
05dbc7b5 2060static inline bool empty_bucket(const struct tcp_iter_state *st)
6eac5604 2061{
05dbc7b5 2062 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
6eac5604
AK
2063}
2064
a8b690f9
TH
2065/*
 2066 * Get the first established socket starting from the bucket given in st->bucket.
2067 * If st->bucket is zero, the very first socket in the hash is returned.
2068 */
1da177e4
LT
2069static void *established_get_first(struct seq_file *seq)
2070{
5799de0b 2071 struct tcp_iter_state *st = seq->private;
a4146b1b 2072 struct net *net = seq_file_net(seq);
1da177e4
LT
2073 void *rc = NULL;
2074
a8b690f9
TH
2075 st->offset = 0;
2076 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1da177e4 2077 struct sock *sk;
3ab5aee7 2078 struct hlist_nulls_node *node;
9db66bdc 2079 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1da177e4 2080
6eac5604
AK
2081 /* Lockless fast path for the common case of empty buckets */
2082 if (empty_bucket(st))
2083 continue;
2084
9db66bdc 2085 spin_lock_bh(lock);
3ab5aee7 2086 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
f40c8174 2087 if (sk->sk_family != st->family ||
878628fb 2088 !net_eq(sock_net(sk), net)) {
1da177e4
LT
2089 continue;
2090 }
2091 rc = sk;
2092 goto out;
2093 }
9db66bdc 2094 spin_unlock_bh(lock);
1da177e4
LT
2095 }
2096out:
2097 return rc;
2098}
2099
2100static void *established_get_next(struct seq_file *seq, void *cur)
2101{
2102 struct sock *sk = cur;
3ab5aee7 2103 struct hlist_nulls_node *node;
5799de0b 2104 struct tcp_iter_state *st = seq->private;
a4146b1b 2105 struct net *net = seq_file_net(seq);
1da177e4
LT
2106
2107 ++st->num;
a8b690f9 2108 ++st->offset;
1da177e4 2109
05dbc7b5 2110 sk = sk_nulls_next(sk);
1da177e4 2111
3ab5aee7 2112 sk_nulls_for_each_from(sk, node) {
878628fb 2113 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
05dbc7b5 2114 return sk;
1da177e4
LT
2115 }
2116
05dbc7b5
ED
2117 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2118 ++st->bucket;
2119 return established_get_first(seq);
1da177e4
LT
2120}
2121
2122static void *established_get_idx(struct seq_file *seq, loff_t pos)
2123{
a8b690f9
TH
2124 struct tcp_iter_state *st = seq->private;
2125 void *rc;
2126
2127 st->bucket = 0;
2128 rc = established_get_first(seq);
1da177e4
LT
2129
2130 while (rc && pos) {
2131 rc = established_get_next(seq, rc);
2132 --pos;
7174259e 2133 }
1da177e4
LT
2134 return rc;
2135}
2136
2137static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2138{
2139 void *rc;
5799de0b 2140 struct tcp_iter_state *st = seq->private;
1da177e4 2141
1da177e4
LT
2142 st->state = TCP_SEQ_STATE_LISTENING;
2143 rc = listening_get_idx(seq, &pos);
2144
2145 if (!rc) {
1da177e4
LT
2146 st->state = TCP_SEQ_STATE_ESTABLISHED;
2147 rc = established_get_idx(seq, pos);
2148 }
2149
2150 return rc;
2151}
2152
a8b690f9
TH
2153static void *tcp_seek_last_pos(struct seq_file *seq)
2154{
2155 struct tcp_iter_state *st = seq->private;
2156 int offset = st->offset;
2157 int orig_num = st->num;
2158 void *rc = NULL;
2159
2160 switch (st->state) {
2161 case TCP_SEQ_STATE_OPENREQ:
2162 case TCP_SEQ_STATE_LISTENING:
2163 if (st->bucket >= INET_LHTABLE_SIZE)
2164 break;
2165 st->state = TCP_SEQ_STATE_LISTENING;
2166 rc = listening_get_next(seq, NULL);
2167 while (offset-- && rc)
2168 rc = listening_get_next(seq, rc);
2169 if (rc)
2170 break;
2171 st->bucket = 0;
05dbc7b5 2172 st->state = TCP_SEQ_STATE_ESTABLISHED;
a8b690f9
TH
2173 /* Fallthrough */
2174 case TCP_SEQ_STATE_ESTABLISHED:
a8b690f9
TH
2175 if (st->bucket > tcp_hashinfo.ehash_mask)
2176 break;
2177 rc = established_get_first(seq);
2178 while (offset-- && rc)
2179 rc = established_get_next(seq, rc);
2180 }
2181
2182 st->num = orig_num;
2183
2184 return rc;
2185}
2186
1da177e4
LT
2187static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2188{
5799de0b 2189 struct tcp_iter_state *st = seq->private;
a8b690f9
TH
2190 void *rc;
2191
2192 if (*pos && *pos == st->last_pos) {
2193 rc = tcp_seek_last_pos(seq);
2194 if (rc)
2195 goto out;
2196 }
2197
1da177e4
LT
2198 st->state = TCP_SEQ_STATE_LISTENING;
2199 st->num = 0;
a8b690f9
TH
2200 st->bucket = 0;
2201 st->offset = 0;
2202 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2203
2204out:
2205 st->last_pos = *pos;
2206 return rc;
1da177e4
LT
2207}
2208
2209static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2210{
a8b690f9 2211 struct tcp_iter_state *st = seq->private;
1da177e4 2212 void *rc = NULL;
1da177e4
LT
2213
2214 if (v == SEQ_START_TOKEN) {
2215 rc = tcp_get_idx(seq, 0);
2216 goto out;
2217 }
1da177e4
LT
2218
2219 switch (st->state) {
2220 case TCP_SEQ_STATE_OPENREQ:
2221 case TCP_SEQ_STATE_LISTENING:
2222 rc = listening_get_next(seq, v);
2223 if (!rc) {
1da177e4 2224 st->state = TCP_SEQ_STATE_ESTABLISHED;
a8b690f9
TH
2225 st->bucket = 0;
2226 st->offset = 0;
1da177e4
LT
2227 rc = established_get_first(seq);
2228 }
2229 break;
2230 case TCP_SEQ_STATE_ESTABLISHED:
1da177e4
LT
2231 rc = established_get_next(seq, v);
2232 break;
2233 }
2234out:
2235 ++*pos;
a8b690f9 2236 st->last_pos = *pos;
1da177e4
LT
2237 return rc;
2238}
2239
2240static void tcp_seq_stop(struct seq_file *seq, void *v)
2241{
5799de0b 2242 struct tcp_iter_state *st = seq->private;
1da177e4
LT
2243
2244 switch (st->state) {
2245 case TCP_SEQ_STATE_OPENREQ:
2246 if (v) {
463c84b9
ACM
2247 struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2248 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1da177e4
LT
2249 }
2250 case TCP_SEQ_STATE_LISTENING:
2251 if (v != SEQ_START_TOKEN)
5caea4ea 2252 spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
1da177e4 2253 break;
1da177e4
LT
2254 case TCP_SEQ_STATE_ESTABLISHED:
2255 if (v)
9db66bdc 2256 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1da177e4
LT
2257 break;
2258 }
2259}
2260
73cb88ec 2261int tcp_seq_open(struct inode *inode, struct file *file)
1da177e4 2262{
d9dda78b 2263 struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
1da177e4 2264 struct tcp_iter_state *s;
52d6f3f1 2265 int err;
1da177e4 2266
52d6f3f1
DL
2267 err = seq_open_net(inode, file, &afinfo->seq_ops,
2268 sizeof(struct tcp_iter_state));
2269 if (err < 0)
2270 return err;
f40c8174 2271
52d6f3f1 2272 s = ((struct seq_file *)file->private_data)->private;
1da177e4 2273 s->family = afinfo->family;
a8b690f9 2274 s->last_pos = 0;
f40c8174
DL
2275 return 0;
2276}
73cb88ec 2277EXPORT_SYMBOL(tcp_seq_open);
f40c8174 2278
6f8b13bc 2279int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
1da177e4
LT
2280{
2281 int rc = 0;
2282 struct proc_dir_entry *p;
2283
9427c4b3
DL
2284 afinfo->seq_ops.start = tcp_seq_start;
2285 afinfo->seq_ops.next = tcp_seq_next;
2286 afinfo->seq_ops.stop = tcp_seq_stop;
2287
84841c3c 2288 p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
73cb88ec 2289 afinfo->seq_fops, afinfo);
84841c3c 2290 if (!p)
1da177e4
LT
2291 rc = -ENOMEM;
2292 return rc;
2293}
4bc2f18b 2294EXPORT_SYMBOL(tcp_proc_register);
1da177e4 2295
6f8b13bc 2296void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
1da177e4 2297{
ece31ffd 2298 remove_proc_entry(afinfo->name, net->proc_net);
1da177e4 2299}
4bc2f18b 2300EXPORT_SYMBOL(tcp_proc_unregister);
1da177e4 2301
cf533ea5 2302static void get_openreq4(const struct sock *sk, const struct request_sock *req,
652586df 2303 struct seq_file *f, int i, kuid_t uid)
1da177e4 2304{
2e6599cb 2305 const struct inet_request_sock *ireq = inet_rsk(req);
a399a805 2306 long delta = req->expires - jiffies;
1da177e4 2307
5e659e4c 2308 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
652586df 2309 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
1da177e4 2310 i,
634fb979 2311 ireq->ir_loc_addr,
c720c7e8 2312 ntohs(inet_sk(sk)->inet_sport),
634fb979
ED
2313 ireq->ir_rmt_addr,
2314 ntohs(ireq->ir_rmt_port),
1da177e4
LT
2315 TCP_SYN_RECV,
2316 0, 0, /* could print option size, but that is af dependent. */
2317 1, /* timers active (only the expire timer) */
a399a805 2318 jiffies_delta_to_clock_t(delta),
e6c022a4 2319 req->num_timeout,
a7cb5a49 2320 from_kuid_munged(seq_user_ns(f), uid),
1da177e4
LT
2321 0, /* non standard timer */
2322 0, /* open_requests have no inode */
2323 atomic_read(&sk->sk_refcnt),
652586df 2324 req);
1da177e4
LT
2325}
2326
652586df 2327static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
1da177e4
LT
2328{
2329 int timer_active;
2330 unsigned long timer_expires;
cf533ea5 2331 const struct tcp_sock *tp = tcp_sk(sk);
cf4c6bf8 2332 const struct inet_connection_sock *icsk = inet_csk(sk);
cf533ea5 2333 const struct inet_sock *inet = inet_sk(sk);
168a8f58 2334 struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
c720c7e8
ED
2335 __be32 dest = inet->inet_daddr;
2336 __be32 src = inet->inet_rcv_saddr;
2337 __u16 destp = ntohs(inet->inet_dport);
2338 __u16 srcp = ntohs(inet->inet_sport);
49d09007 2339 int rx_queue;
1da177e4 2340
6ba8a3b1
ND
2341 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2342 icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2343 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
1da177e4 2344 timer_active = 1;
463c84b9
ACM
2345 timer_expires = icsk->icsk_timeout;
2346 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
1da177e4 2347 timer_active = 4;
463c84b9 2348 timer_expires = icsk->icsk_timeout;
cf4c6bf8 2349 } else if (timer_pending(&sk->sk_timer)) {
1da177e4 2350 timer_active = 2;
cf4c6bf8 2351 timer_expires = sk->sk_timer.expires;
1da177e4
LT
2352 } else {
2353 timer_active = 0;
2354 timer_expires = jiffies;
2355 }
2356
49d09007
ED
2357 if (sk->sk_state == TCP_LISTEN)
2358 rx_queue = sk->sk_ack_backlog;
2359 else
2360 /*
 2361 		 * because we don't lock the socket, we might find a transient negative value
2362 */
2363 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2364
5e659e4c 2365 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
652586df 2366 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
cf4c6bf8 2367 i, src, srcp, dest, destp, sk->sk_state,
47da8ee6 2368 tp->write_seq - tp->snd_una,
49d09007 2369 rx_queue,
1da177e4 2370 timer_active,
a399a805 2371 jiffies_delta_to_clock_t(timer_expires - jiffies),
463c84b9 2372 icsk->icsk_retransmits,
a7cb5a49 2373 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
6687e988 2374 icsk->icsk_probes_out,
cf4c6bf8
IJ
2375 sock_i_ino(sk),
2376 atomic_read(&sk->sk_refcnt), sk,
7be87351
SH
2377 jiffies_to_clock_t(icsk->icsk_rto),
2378 jiffies_to_clock_t(icsk->icsk_ack.ato),
463c84b9 2379 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
1da177e4 2380 tp->snd_cwnd,
168a8f58
JC
2381 sk->sk_state == TCP_LISTEN ?
2382 (fastopenq ? fastopenq->max_qlen : 0) :
652586df 2383 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
1da177e4
LT
2384}
2385
cf533ea5 2386static void get_timewait4_sock(const struct inet_timewait_sock *tw,
652586df 2387 struct seq_file *f, int i)
1da177e4 2388{
23f33c2d 2389 __be32 dest, src;
1da177e4 2390 __u16 destp, srcp;
e2a1d3e4 2391 s32 delta = tw->tw_ttd - inet_tw_time_stamp();
1da177e4
LT
2392
2393 dest = tw->tw_daddr;
2394 src = tw->tw_rcv_saddr;
2395 destp = ntohs(tw->tw_dport);
2396 srcp = ntohs(tw->tw_sport);
2397
5e659e4c 2398 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
652586df 2399 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
1da177e4 2400 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
a399a805 2401 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
652586df 2402 atomic_read(&tw->tw_refcnt), tw);
1da177e4
LT
2403}
2404
2405#define TMPSZ 150
2406
2407static int tcp4_seq_show(struct seq_file *seq, void *v)
2408{
5799de0b 2409 struct tcp_iter_state *st;
05dbc7b5 2410 struct sock *sk = v;
1da177e4 2411
652586df 2412 seq_setwidth(seq, TMPSZ - 1);
1da177e4 2413 if (v == SEQ_START_TOKEN) {
652586df 2414 seq_puts(seq, " sl local_address rem_address st tx_queue "
1da177e4
LT
2415 "rx_queue tr tm->when retrnsmt uid timeout "
2416 "inode");
2417 goto out;
2418 }
2419 st = seq->private;
2420
2421 switch (st->state) {
2422 case TCP_SEQ_STATE_LISTENING:
2423 case TCP_SEQ_STATE_ESTABLISHED:
05dbc7b5 2424 if (sk->sk_state == TCP_TIME_WAIT)
652586df 2425 get_timewait4_sock(v, seq, st->num);
05dbc7b5 2426 else
652586df 2427 get_tcp4_sock(v, seq, st->num);
1da177e4
LT
2428 break;
2429 case TCP_SEQ_STATE_OPENREQ:
652586df 2430 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid);
1da177e4
LT
2431 break;
2432 }
1da177e4 2433out:
652586df 2434 seq_pad(seq, '\n');
1da177e4
LT
2435 return 0;
2436}
2437
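A companion user-space reader for the record format produced by tcp4_seq_show() and get_tcp4_sock() above; this is a sketch that parses only the leading fields of each /proc/net/tcp row (the kernel prints addresses and ports as hex in network byte order, per the seq_printf() format strings in this file):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/net/tcp", "r");
	char line[512];
	unsigned int sl, laddr, lport, raddr, rport, state;

	if (!f)
		return 1;
	fgets(line, sizeof(line), f);	/* skip the header row */
	while (fgets(line, sizeof(line), f)) {
		if (sscanf(line, "%u: %x:%x %x:%x %x",
			   &sl, &laddr, &lport, &raddr, &rport, &state) == 6)
			printf("slot %4u state %02X local port %u\n",
			       sl, state, lport);
	}
	fclose(f);
	return 0;
}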
73cb88ec
AV
2438static const struct file_operations tcp_afinfo_seq_fops = {
2439 .owner = THIS_MODULE,
2440 .open = tcp_seq_open,
2441 .read = seq_read,
2442 .llseek = seq_lseek,
2443 .release = seq_release_net
2444};
2445
1da177e4 2446static struct tcp_seq_afinfo tcp4_seq_afinfo = {
1da177e4
LT
2447 .name = "tcp",
2448 .family = AF_INET,
73cb88ec 2449 .seq_fops = &tcp_afinfo_seq_fops,
9427c4b3
DL
2450 .seq_ops = {
2451 .show = tcp4_seq_show,
2452 },
1da177e4
LT
2453};
2454
2c8c1e72 2455static int __net_init tcp4_proc_init_net(struct net *net)
757764f6
PE
2456{
2457 return tcp_proc_register(net, &tcp4_seq_afinfo);
2458}
2459
2c8c1e72 2460static void __net_exit tcp4_proc_exit_net(struct net *net)
757764f6
PE
2461{
2462 tcp_proc_unregister(net, &tcp4_seq_afinfo);
2463}
2464
2465static struct pernet_operations tcp4_net_ops = {
2466 .init = tcp4_proc_init_net,
2467 .exit = tcp4_proc_exit_net,
2468};
2469
1da177e4
LT
2470int __init tcp4_proc_init(void)
2471{
757764f6 2472 return register_pernet_subsys(&tcp4_net_ops);
1da177e4
LT
2473}
2474
2475void tcp4_proc_exit(void)
2476{
757764f6 2477 unregister_pernet_subsys(&tcp4_net_ops);
1da177e4
LT
2478}
2479#endif /* CONFIG_PROC_FS */
2480
2481struct proto tcp_prot = {
2482 .name = "TCP",
2483 .owner = THIS_MODULE,
2484 .close = tcp_close,
2485 .connect = tcp_v4_connect,
2486 .disconnect = tcp_disconnect,
463c84b9 2487 .accept = inet_csk_accept,
1da177e4
LT
2488 .ioctl = tcp_ioctl,
2489 .init = tcp_v4_init_sock,
2490 .destroy = tcp_v4_destroy_sock,
2491 .shutdown = tcp_shutdown,
2492 .setsockopt = tcp_setsockopt,
2493 .getsockopt = tcp_getsockopt,
1da177e4 2494 .recvmsg = tcp_recvmsg,
7ba42910
CG
2495 .sendmsg = tcp_sendmsg,
2496 .sendpage = tcp_sendpage,
1da177e4 2497 .backlog_rcv = tcp_v4_do_rcv,
46d3ceab 2498 .release_cb = tcp_release_cb,
563d34d0 2499 .mtu_reduced = tcp_v4_mtu_reduced,
ab1e0a13
ACM
2500 .hash = inet_hash,
2501 .unhash = inet_unhash,
2502 .get_port = inet_csk_get_port,
1da177e4 2503 .enter_memory_pressure = tcp_enter_memory_pressure,
c9bee3b7 2504 .stream_memory_free = tcp_stream_memory_free,
1da177e4 2505 .sockets_allocated = &tcp_sockets_allocated,
0a5578cf 2506 .orphan_count = &tcp_orphan_count,
1da177e4
LT
2507 .memory_allocated = &tcp_memory_allocated,
2508 .memory_pressure = &tcp_memory_pressure,
a4fe34bf 2509 .sysctl_mem = sysctl_tcp_mem,
1da177e4
LT
2510 .sysctl_wmem = sysctl_tcp_wmem,
2511 .sysctl_rmem = sysctl_tcp_rmem,
2512 .max_header = MAX_TCP_HEADER,
2513 .obj_size = sizeof(struct tcp_sock),
3ab5aee7 2514 .slab_flags = SLAB_DESTROY_BY_RCU,
6d6ee43e 2515 .twsk_prot = &tcp_timewait_sock_ops,
60236fdd 2516 .rsk_prot = &tcp_request_sock_ops,
39d8cda7 2517 .h.hashinfo = &tcp_hashinfo,
7ba42910 2518 .no_autobind = true,
543d9cfe
ACM
2519#ifdef CONFIG_COMPAT
2520 .compat_setsockopt = compat_tcp_setsockopt,
2521 .compat_getsockopt = compat_tcp_getsockopt,
2522#endif
c255a458 2523#ifdef CONFIG_MEMCG_KMEM
d1a4c0b3
GC
2524 .init_cgroup = tcp_init_cgroup,
2525 .destroy_cgroup = tcp_destroy_cgroup,
2526 .proto_cgroup = tcp_proto_cgroup,
2527#endif
1da177e4 2528};
4bc2f18b 2529EXPORT_SYMBOL(tcp_prot);
1da177e4 2530
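For orientation only: the tcp_prot method table above is what binds the BSD socket calls to the functions in this file, so connect(2) on an AF_INET stream socket ends up in tcp_v4_connect(). A minimal user-space client sketch; 127.0.0.1:7 is an arbitrary example address:

#include <arpa/inet.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in sa = {
		.sin_family = AF_INET,
		.sin_port = htons(7),	/* "echo" port, purely illustrative */
	};
	int fd = socket(AF_INET, SOCK_STREAM, 0);	/* bound to tcp_prot */

	if (fd < 0)
		return 1;
	inet_pton(AF_INET, "127.0.0.1", &sa.sin_addr);
	if (connect(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0)
		perror("connect");	/* kernel side: tcp_v4_connect() */
	close(fd);
	return 0;
}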
046ee902
DL
2531static int __net_init tcp_sk_init(struct net *net)
2532{
5d134f1c 2533 net->ipv4.sysctl_tcp_ecn = 2;
be9f4a44 2534 return 0;
046ee902
DL
2535}
2536
2537static void __net_exit tcp_sk_exit(struct net *net)
2538{
b099ce26
EB
2539}
2540
2541static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2542{
2543 inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
046ee902
DL
2544}
2545
2546static struct pernet_operations __net_initdata tcp_sk_ops = {
b099ce26
EB
2547 .init = tcp_sk_init,
2548 .exit = tcp_sk_exit,
2549 .exit_batch = tcp_sk_exit_batch,
046ee902
DL
2550};
2551
9b0f976f 2552void __init tcp_v4_init(void)
1da177e4 2553{
5caea4ea 2554 inet_hashinfo_init(&tcp_hashinfo);
6a1b3054 2555 if (register_pernet_subsys(&tcp_sk_ops))
1da177e4 2556 panic("Failed to create the TCP control socket.\n");
1da177e4 2557}