net: Remove checks for dst_ops->redirect being NULL.
net/ipv4/tcp_ipv4.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * IPv4 specific functions
9 *
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 *
18 * This program is free software; you can redistribute it and/or
19 * modify it under the terms of the GNU General Public License
20 * as published by the Free Software Foundation; either version
21 * 2 of the License, or (at your option) any later version.
22 */
23
24 /*
25 * Changes:
26 * David S. Miller : New socket lookup architecture.
27 * This code is dedicated to John Dyson.
28 * David S. Miller : Change semantics of established hash,
29 * half is devoted to TIME_WAIT sockets
30 * and the rest go in the other half.
31 * Andi Kleen : Add support for syncookies and fixed
32 * some bugs: ip options weren't passed to
33 * the TCP layer, missed a check for an
34 * ACK bit.
35 * Andi Kleen : Implemented fast path mtu discovery.
36 * Fixed many serious bugs in the
37 * request_sock handling and moved
38 * most of it into the af independent code.
39 * Added tail drop and some other bugfixes.
40 * Added new listen semantics.
41 * Mike McLagan : Routing by source
42 * Juan Jose Ciarlante: ip_dynaddr bits
43 * Andi Kleen: various fixes.
44 * Vitaly E. Lavrov : Transparent proxy revived after year
45 * coma.
46 * Andi Kleen : Fix new listen.
47 * Andi Kleen : Fix accept error reporting.
48 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
49 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
50 * a single port at the same time.
51 */
52
53 #define pr_fmt(fmt) "TCP: " fmt
54
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/netdma.h>
76 #include <net/secure_seq.h>
77 #include <net/tcp_memcontrol.h>
78
79 #include <linux/inet.h>
80 #include <linux/ipv6.h>
81 #include <linux/stddef.h>
82 #include <linux/proc_fs.h>
83 #include <linux/seq_file.h>
84
85 #include <linux/crypto.h>
86 #include <linux/scatterlist.h>
87
88 int sysctl_tcp_tw_reuse __read_mostly;
89 int sysctl_tcp_low_latency __read_mostly;
90 EXPORT_SYMBOL(sysctl_tcp_low_latency);
91
92
93 #ifdef CONFIG_TCP_MD5SIG
94 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
95 __be32 daddr, __be32 saddr, const struct tcphdr *th);
96 #endif
97
98 struct inet_hashinfo tcp_hashinfo;
99 EXPORT_SYMBOL(tcp_hashinfo);
100
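/* Derive the initial sequence number for a new connection from the
 * address/port 4-tuple of the incoming segment.
 */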
101 static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
102 {
103 return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
104 ip_hdr(skb)->saddr,
105 tcp_hdr(skb)->dest,
106 tcp_hdr(skb)->source);
107 }
108
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110 {
111 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
112 struct tcp_sock *tp = tcp_sk(sk);
113
114 /* With PAWS, it is safe from the viewpoint
115 of data integrity. Even without PAWS it is safe provided sequence
116 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
117
118 Actually, the idea is close to VJ's: only the timestamp cache is
119 held not per host but per port pair, and the TW bucket is used as
120 the state holder.
121 
122 If the TW bucket has already been destroyed we fall back to VJ's
123 scheme and use the initial timestamp retrieved from the peer table.
124 */
125 if (tcptw->tw_ts_recent_stamp &&
126 (twp == NULL || (sysctl_tcp_tw_reuse &&
127 get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
128 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
129 if (tp->write_seq == 0)
130 tp->write_seq = 1;
131 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
132 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
133 sock_hold(sktw);
134 return 1;
135 }
136
137 return 0;
138 }
139 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
140
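/* TCP_REPAIR path: initialize connection state and mark the socket
 * established without sending a SYN (used below when tp->repair is set).
 */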
141 static int tcp_repair_connect(struct sock *sk)
142 {
143 tcp_connect_init(sk);
144 tcp_finish_connect(sk, NULL);
145
146 return 0;
147 }
148
149 /* This will initiate an outgoing connection. */
150 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
151 {
152 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
153 struct inet_sock *inet = inet_sk(sk);
154 struct tcp_sock *tp = tcp_sk(sk);
155 __be16 orig_sport, orig_dport;
156 __be32 daddr, nexthop;
157 struct flowi4 *fl4;
158 struct rtable *rt;
159 int err;
160 struct ip_options_rcu *inet_opt;
161
162 if (addr_len < sizeof(struct sockaddr_in))
163 return -EINVAL;
164
165 if (usin->sin_family != AF_INET)
166 return -EAFNOSUPPORT;
167
168 nexthop = daddr = usin->sin_addr.s_addr;
169 inet_opt = rcu_dereference_protected(inet->inet_opt,
170 sock_owned_by_user(sk));
171 if (inet_opt && inet_opt->opt.srr) {
172 if (!daddr)
173 return -EINVAL;
174 nexthop = inet_opt->opt.faddr;
175 }
176
177 orig_sport = inet->inet_sport;
178 orig_dport = usin->sin_port;
179 fl4 = &inet->cork.fl.u.ip4;
180 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
181 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
182 IPPROTO_TCP,
183 orig_sport, orig_dport, sk, true);
184 if (IS_ERR(rt)) {
185 err = PTR_ERR(rt);
186 if (err == -ENETUNREACH)
187 IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
188 return err;
189 }
190
191 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
192 ip_rt_put(rt);
193 return -ENETUNREACH;
194 }
195
196 if (!inet_opt || !inet_opt->opt.srr)
197 daddr = fl4->daddr;
198
199 if (!inet->inet_saddr)
200 inet->inet_saddr = fl4->saddr;
201 inet->inet_rcv_saddr = inet->inet_saddr;
202
203 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
204 /* Reset inherited state */
205 tp->rx_opt.ts_recent = 0;
206 tp->rx_opt.ts_recent_stamp = 0;
207 if (likely(!tp->repair))
208 tp->write_seq = 0;
209 }
210
211 if (tcp_death_row.sysctl_tw_recycle &&
212 !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
213 tcp_fetch_timewait_stamp(sk, &rt->dst);
214
215 inet->inet_dport = usin->sin_port;
216 inet->inet_daddr = daddr;
217
218 inet_csk(sk)->icsk_ext_hdr_len = 0;
219 if (inet_opt)
220 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
221
222 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
223
224 /* Socket identity is still unknown (sport may be zero).
225 * However we set the state to SYN-SENT and, without releasing the
226 * socket lock, select a source port, enter ourselves into the hash
227 * tables and complete initialization after this.
228 */
229 tcp_set_state(sk, TCP_SYN_SENT);
230 err = inet_hash_connect(&tcp_death_row, sk);
231 if (err)
232 goto failure;
233
234 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
235 inet->inet_sport, inet->inet_dport, sk);
236 if (IS_ERR(rt)) {
237 err = PTR_ERR(rt);
238 rt = NULL;
239 goto failure;
240 }
241 /* OK, now commit destination to socket. */
242 sk->sk_gso_type = SKB_GSO_TCPV4;
243 sk_setup_caps(sk, &rt->dst);
244
245 if (!tp->write_seq && likely(!tp->repair))
246 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
247 inet->inet_daddr,
248 inet->inet_sport,
249 usin->sin_port);
250
251 inet->inet_id = tp->write_seq ^ jiffies;
252
253 if (likely(!tp->repair))
254 err = tcp_connect(sk);
255 else
256 err = tcp_repair_connect(sk);
257
258 rt = NULL;
259 if (err)
260 goto failure;
261
262 return 0;
263
264 failure:
265 /*
266 * This unhashes the socket and releases the local port,
267 * if necessary.
268 */
269 tcp_set_state(sk, TCP_CLOSE);
270 ip_rt_put(rt);
271 sk->sk_route_caps = 0;
272 inet->inet_dport = 0;
273 return err;
274 }
275 EXPORT_SYMBOL(tcp_v4_connect);
276
277 /*
278 * This routine does path mtu discovery as defined in RFC1191.
279 */
280 static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
281 {
282 struct dst_entry *dst;
283 struct inet_sock *inet = inet_sk(sk);
284
285 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
286 * sent out by Linux are always < 576 bytes, so they should go through
287 * unfragmented).
288 */
289 if (sk->sk_state == TCP_LISTEN)
290 return;
291
292 /* We don't check in the dst entry whether PMTU discovery is forbidden
293 * on this route. We just assume that no packet-too-big packets
294 * are sent back when PMTU discovery is not active.
295 * There is a small race when the user changes this flag in the
296 * route, but I think that's acceptable.
297 */
298 if ((dst = __sk_dst_check(sk, 0)) == NULL)
299 return;
300
301 dst->ops->update_pmtu(dst, mtu);
302
303 /* Something is about to go wrong... Remember the soft error
304 * in case this connection is not able to recover.
305 */
306 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
307 sk->sk_err_soft = EMSGSIZE;
308
309 mtu = dst_mtu(dst);
310
311 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
312 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
313 tcp_sync_mss(sk, mtu);
314
315 /* Resend the TCP packet because it's
316 * clear that the old packet has been
317 * dropped. This is the new "fast" path mtu
318 * discovery.
319 */
320 tcp_simple_retransmit(sk);
321 } /* else let the usual retransmit timer handle it */
322 }
323
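/* Handle an incoming ICMP redirect: revalidate the cached route and let its
 * ->redirect method update it. Every dst_ops now provides a ->redirect
 * handler, so only the dst itself needs a NULL check.
 */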
324 static void do_redirect(struct sk_buff *skb, struct sock *sk)
325 {
326 struct dst_entry *dst = __sk_dst_check(sk, 0);
327
328 if (dst)
329 dst->ops->redirect(dst, skb);
330 }
331
332 /*
333 * This routine is called by the ICMP module when it gets some
334 * sort of error condition. If err < 0 then the socket should
335 * be closed and the error returned to the user. If err > 0
336 * it's just the icmp type << 8 | icmp code. After adjustment
337 * header points to the first 8 bytes of the tcp header. We need
338 * to find the appropriate port.
339 *
340 * The locking strategy used here is very "optimistic". When
341 * someone else accesses the socket the ICMP is just dropped
342 * and for some paths there is no check at all.
343 * A more general error queue to queue errors for later handling
344 * is probably better.
345 *
346 */
347
348 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
349 {
350 const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
351 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
352 struct inet_connection_sock *icsk;
353 struct tcp_sock *tp;
354 struct inet_sock *inet;
355 const int type = icmp_hdr(icmp_skb)->type;
356 const int code = icmp_hdr(icmp_skb)->code;
357 struct sock *sk;
358 struct sk_buff *skb;
359 __u32 seq;
360 __u32 remaining;
361 int err;
362 struct net *net = dev_net(icmp_skb->dev);
363
364 if (icmp_skb->len < (iph->ihl << 2) + 8) {
365 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
366 return;
367 }
368
369 sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
370 iph->saddr, th->source, inet_iif(icmp_skb));
371 if (!sk) {
372 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
373 return;
374 }
375 if (sk->sk_state == TCP_TIME_WAIT) {
376 inet_twsk_put(inet_twsk(sk));
377 return;
378 }
379
380 bh_lock_sock(sk);
381 /* If too many ICMPs get dropped on busy
382 * servers this needs to be solved differently.
383 */
384 if (sock_owned_by_user(sk))
385 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
386
387 if (sk->sk_state == TCP_CLOSE)
388 goto out;
389
390 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
391 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
392 goto out;
393 }
394
395 icsk = inet_csk(sk);
396 tp = tcp_sk(sk);
397 seq = ntohl(th->seq);
398 if (sk->sk_state != TCP_LISTEN &&
399 !between(seq, tp->snd_una, tp->snd_nxt)) {
400 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
401 goto out;
402 }
403
404 switch (type) {
405 case ICMP_REDIRECT:
406 do_redirect(icmp_skb, sk);
407 goto out;
408 case ICMP_SOURCE_QUENCH:
409 /* Just silently ignore these. */
410 goto out;
411 case ICMP_PARAMETERPROB:
412 err = EPROTO;
413 break;
414 case ICMP_DEST_UNREACH:
415 if (code > NR_ICMP_UNREACH)
416 goto out;
417
418 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
419 if (!sock_owned_by_user(sk))
420 do_pmtu_discovery(sk, iph, info);
421 goto out;
422 }
423
424 err = icmp_err_convert[code].errno;
425 /* check if icmp_skb allows revert of backoff
426 * (see draft-zimmermann-tcp-lcd) */
427 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
428 break;
429 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
430 !icsk->icsk_backoff)
431 break;
432
433 if (sock_owned_by_user(sk))
434 break;
435
436 icsk->icsk_backoff--;
437 inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
438 TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
439 tcp_bound_rto(sk);
440
441 skb = tcp_write_queue_head(sk);
442 BUG_ON(!skb);
443
444 remaining = icsk->icsk_rto - min(icsk->icsk_rto,
445 tcp_time_stamp - TCP_SKB_CB(skb)->when);
446
447 if (remaining) {
448 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
449 remaining, TCP_RTO_MAX);
450 } else {
451 /* RTO revert clocked out retransmission.
452 * Will retransmit now */
453 tcp_retransmit_timer(sk);
454 }
455
456 break;
457 case ICMP_TIME_EXCEEDED:
458 err = EHOSTUNREACH;
459 break;
460 default:
461 goto out;
462 }
463
464 switch (sk->sk_state) {
465 struct request_sock *req, **prev;
466 case TCP_LISTEN:
467 if (sock_owned_by_user(sk))
468 goto out;
469
470 req = inet_csk_search_req(sk, &prev, th->dest,
471 iph->daddr, iph->saddr);
472 if (!req)
473 goto out;
474
475 /* ICMPs are not backlogged, hence we cannot get
476 an established socket here.
477 */
478 WARN_ON(req->sk);
479
480 if (seq != tcp_rsk(req)->snt_isn) {
481 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
482 goto out;
483 }
484
485 /*
486 * Still in SYN_RECV, just remove it silently.
487 * There is no good way to pass the error to the newly
488 * created socket, and POSIX does not want network
489 * errors returned from accept().
490 */
491 inet_csk_reqsk_queue_drop(sk, req, prev);
492 goto out;
493
494 case TCP_SYN_SENT:
495 case TCP_SYN_RECV: /* Cannot happen.
496 It can, e.g., if SYNs crossed.
497 */
498 if (!sock_owned_by_user(sk)) {
499 sk->sk_err = err;
500
501 sk->sk_error_report(sk);
502
503 tcp_done(sk);
504 } else {
505 sk->sk_err_soft = err;
506 }
507 goto out;
508 }
509
510 /* If we've already connected we will keep trying
511 * until we time out, or the user gives up.
512 *
513 * RFC 1122 4.2.3.9 allows us to treat as hard errors
514 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
515 * but it is obsoleted by PMTU discovery).
516 *
517 * Note that in the modern internet, where routing is unreliable
518 * and broken firewalls sit in every dark corner sending random
519 * errors ordered by their masters, even these two messages finally
520 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
521 *
522 * Now we are in compliance with RFCs.
523 * --ANK (980905)
524 */
525
526 inet = inet_sk(sk);
527 if (!sock_owned_by_user(sk) && inet->recverr) {
528 sk->sk_err = err;
529 sk->sk_error_report(sk);
530 } else { /* Only an error on timeout */
531 sk->sk_err_soft = err;
532 }
533
534 out:
535 bh_unlock_sock(sk);
536 sock_put(sk);
537 }
538
539 static void __tcp_v4_send_check(struct sk_buff *skb,
540 __be32 saddr, __be32 daddr)
541 {
542 struct tcphdr *th = tcp_hdr(skb);
543
544 if (skb->ip_summed == CHECKSUM_PARTIAL) {
545 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
546 skb->csum_start = skb_transport_header(skb) - skb->head;
547 skb->csum_offset = offsetof(struct tcphdr, check);
548 } else {
549 th->check = tcp_v4_check(skb->len, saddr, daddr,
550 csum_partial(th,
551 th->doff << 2,
552 skb->csum));
553 }
554 }
555
556 /* This routine computes an IPv4 TCP checksum. */
557 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
558 {
559 const struct inet_sock *inet = inet_sk(sk);
560
561 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
562 }
563 EXPORT_SYMBOL(tcp_v4_send_check);
564
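/* Prepare the checksum of a GSO segment: seed th->check with the
 * pseudo-header sum and mark the skb CHECKSUM_PARTIAL so the checksum is
 * completed later.
 */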
565 int tcp_v4_gso_send_check(struct sk_buff *skb)
566 {
567 const struct iphdr *iph;
568 struct tcphdr *th;
569
570 if (!pskb_may_pull(skb, sizeof(*th)))
571 return -EINVAL;
572
573 iph = ip_hdr(skb);
574 th = tcp_hdr(skb);
575
576 th->check = 0;
577 skb->ip_summed = CHECKSUM_PARTIAL;
578 __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
579 return 0;
580 }
581
582 /*
583 * This routine will send an RST to the other tcp.
584 *
585 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
586 * for the reset?
587 * Answer: if a packet caused a RST, it is not for a socket
588 * existing in our system; if it is matched to a socket,
589 * it is just a duplicate segment or a bug in the other side's TCP.
590 * So we build the reply based only on the parameters
591 * that arrived with the segment.
592 * Exception: precedence violation. We do not implement it in any case.
593 */
594
595 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
596 {
597 const struct tcphdr *th = tcp_hdr(skb);
598 struct {
599 struct tcphdr th;
600 #ifdef CONFIG_TCP_MD5SIG
601 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
602 #endif
603 } rep;
604 struct ip_reply_arg arg;
605 #ifdef CONFIG_TCP_MD5SIG
606 struct tcp_md5sig_key *key;
607 const __u8 *hash_location = NULL;
608 unsigned char newhash[16];
609 int genhash;
610 struct sock *sk1 = NULL;
611 #endif
612 struct net *net;
613
614 /* Never send a reset in response to a reset. */
615 if (th->rst)
616 return;
617
618 if (skb_rtable(skb)->rt_type != RTN_LOCAL)
619 return;
620
621 /* Swap the send and the receive. */
622 memset(&rep, 0, sizeof(rep));
623 rep.th.dest = th->source;
624 rep.th.source = th->dest;
625 rep.th.doff = sizeof(struct tcphdr) / 4;
626 rep.th.rst = 1;
627
628 if (th->ack) {
629 rep.th.seq = th->ack_seq;
630 } else {
631 rep.th.ack = 1;
632 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
633 skb->len - (th->doff << 2));
634 }
635
636 memset(&arg, 0, sizeof(arg));
637 arg.iov[0].iov_base = (unsigned char *)&rep;
638 arg.iov[0].iov_len = sizeof(rep.th);
639
640 #ifdef CONFIG_TCP_MD5SIG
641 hash_location = tcp_parse_md5sig_option(th);
642 if (!sk && hash_location) {
643 /*
644 * The active side is lost. Try to find the listening socket through
645 * the source port, and then find the MD5 key through the listening
646 * socket. We do not loosen security here:
647 * the incoming packet is checked against the MD5 hash of the found key;
648 * no RST is generated if the MD5 hash doesn't match.
649 */
650 sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
651 &tcp_hashinfo, ip_hdr(skb)->daddr,
652 ntohs(th->source), inet_iif(skb));
653 /* don't send a RST if we can't find a key */
654 if (!sk1)
655 return;
656 rcu_read_lock();
657 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
658 &ip_hdr(skb)->saddr, AF_INET);
659 if (!key)
660 goto release_sk1;
661
662 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
663 if (genhash || memcmp(hash_location, newhash, 16) != 0)
664 goto release_sk1;
665 } else {
666 key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
667 &ip_hdr(skb)->saddr,
668 AF_INET) : NULL;
669 }
670
671 if (key) {
672 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
673 (TCPOPT_NOP << 16) |
674 (TCPOPT_MD5SIG << 8) |
675 TCPOLEN_MD5SIG);
676 /* Update length and the length the header thinks exists */
677 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
678 rep.th.doff = arg.iov[0].iov_len / 4;
679
680 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
681 key, ip_hdr(skb)->saddr,
682 ip_hdr(skb)->daddr, &rep.th);
683 }
684 #endif
685 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
686 ip_hdr(skb)->saddr, /* XXX */
687 arg.iov[0].iov_len, IPPROTO_TCP, 0);
688 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
689 arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
690 /* When the socket is gone, all binding information is lost.
691 * Routing might fail in this case. Use the iif for the oif to
692 * make sure we can deliver it.
693 */
694 arg.bound_dev_if = sk ? sk->sk_bound_dev_if : inet_iif(skb);
695
696 net = dev_net(skb_dst(skb)->dev);
697 arg.tos = ip_hdr(skb)->tos;
698 ip_send_unicast_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
699 ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
700
701 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
702 TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
703
704 #ifdef CONFIG_TCP_MD5SIG
705 release_sk1:
706 if (sk1) {
707 rcu_read_unlock();
708 sock_put(sk1);
709 }
710 #endif
711 }
712
713 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
714    outside socket context, is certainly ugly. What can I do?
715 */
716
717 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
718 u32 win, u32 ts, int oif,
719 struct tcp_md5sig_key *key,
720 int reply_flags, u8 tos)
721 {
722 const struct tcphdr *th = tcp_hdr(skb);
723 struct {
724 struct tcphdr th;
725 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
726 #ifdef CONFIG_TCP_MD5SIG
727 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
728 #endif
729 ];
730 } rep;
731 struct ip_reply_arg arg;
732 struct net *net = dev_net(skb_dst(skb)->dev);
733
734 memset(&rep.th, 0, sizeof(struct tcphdr));
735 memset(&arg, 0, sizeof(arg));
736
737 arg.iov[0].iov_base = (unsigned char *)&rep;
738 arg.iov[0].iov_len = sizeof(rep.th);
739 if (ts) {
740 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
741 (TCPOPT_TIMESTAMP << 8) |
742 TCPOLEN_TIMESTAMP);
743 rep.opt[1] = htonl(tcp_time_stamp);
744 rep.opt[2] = htonl(ts);
745 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
746 }
747
748 /* Swap the send and the receive. */
749 rep.th.dest = th->source;
750 rep.th.source = th->dest;
751 rep.th.doff = arg.iov[0].iov_len / 4;
752 rep.th.seq = htonl(seq);
753 rep.th.ack_seq = htonl(ack);
754 rep.th.ack = 1;
755 rep.th.window = htons(win);
756
757 #ifdef CONFIG_TCP_MD5SIG
758 if (key) {
759 int offset = (ts) ? 3 : 0;
760
761 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
762 (TCPOPT_NOP << 16) |
763 (TCPOPT_MD5SIG << 8) |
764 TCPOLEN_MD5SIG);
765 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
766 rep.th.doff = arg.iov[0].iov_len/4;
767
768 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
769 key, ip_hdr(skb)->saddr,
770 ip_hdr(skb)->daddr, &rep.th);
771 }
772 #endif
773 arg.flags = reply_flags;
774 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
775 ip_hdr(skb)->saddr, /* XXX */
776 arg.iov[0].iov_len, IPPROTO_TCP, 0);
777 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
778 if (oif)
779 arg.bound_dev_if = oif;
780 arg.tos = tos;
781 ip_send_unicast_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
782 ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
783
784 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
785 }
786
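/* ACK a segment received for a TIME-WAIT socket, echoing the timestamp and
 * MD5 key (if any) stored in the timewait bucket.
 */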
787 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
788 {
789 struct inet_timewait_sock *tw = inet_twsk(sk);
790 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
791
792 tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
793 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
794 tcptw->tw_ts_recent,
795 tw->tw_bound_dev_if,
796 tcp_twsk_md5_key(tcptw),
797 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
798 tw->tw_tos
799 );
800
801 inet_twsk_put(tw);
802 }
803
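/* ACK a segment on behalf of a SYN-RECV request_sock that has no full
 * socket yet.
 */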
804 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
805 struct request_sock *req)
806 {
807 tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
808 tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
809 req->ts_recent,
810 0,
811 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
812 AF_INET),
813 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
814 ip_hdr(skb)->tos);
815 }
816
817 /*
818 * Send a SYN-ACK after having received a SYN.
819 * This still operates on a request_sock only, not on a big
820 * socket.
821 */
822 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
823 struct request_sock *req,
824 struct request_values *rvp,
825 u16 queue_mapping,
826 bool nocache)
827 {
828 const struct inet_request_sock *ireq = inet_rsk(req);
829 struct flowi4 fl4;
830 int err = -1;
831 struct sk_buff * skb;
832
833 /* First, grab a route. */
834 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req, nocache)) == NULL)
835 return -1;
836
837 skb = tcp_make_synack(sk, dst, req, rvp);
838
839 if (skb) {
840 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
841
842 skb_set_queue_mapping(skb, queue_mapping);
843 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
844 ireq->rmt_addr,
845 ireq->opt);
846 err = net_xmit_eval(err);
847 }
848
849 return err;
850 }
851
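/* Retransmit the SYN-ACK for a pending request_sock and count it as a
 * retransmitted segment.
 */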
852 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
853 struct request_values *rvp)
854 {
855 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
856 return tcp_v4_send_synack(sk, NULL, req, rvp, 0, false);
857 }
858
859 /*
860 * IPv4 request_sock destructor.
861 */
862 static void tcp_v4_reqsk_destructor(struct request_sock *req)
863 {
864 kfree(inet_rsk(req)->opt);
865 }
866
867 /*
868 * Return true if a syncookie should be sent
869 */
870 bool tcp_syn_flood_action(struct sock *sk,
871 const struct sk_buff *skb,
872 const char *proto)
873 {
874 const char *msg = "Dropping request";
875 bool want_cookie = false;
876 struct listen_sock *lopt;
877
878
879
880 #ifdef CONFIG_SYN_COOKIES
881 if (sysctl_tcp_syncookies) {
882 msg = "Sending cookies";
883 want_cookie = true;
884 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
885 } else
886 #endif
887 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
888
889 lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
890 if (!lopt->synflood_warned) {
891 lopt->synflood_warned = 1;
892 pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
893 proto, ntohs(tcp_hdr(skb)->dest), msg);
894 }
895 return want_cookie;
896 }
897 EXPORT_SYMBOL(tcp_syn_flood_action);
898
899 /*
900 * Save and compile IPv4 options into the request_sock if needed.
901 */
902 static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
903 struct sk_buff *skb)
904 {
905 const struct ip_options *opt = &(IPCB(skb)->opt);
906 struct ip_options_rcu *dopt = NULL;
907
908 if (opt && opt->optlen) {
909 int opt_size = sizeof(*dopt) + opt->optlen;
910
911 dopt = kmalloc(opt_size, GFP_ATOMIC);
912 if (dopt) {
913 if (ip_options_echo(&dopt->opt, skb)) {
914 kfree(dopt);
915 dopt = NULL;
916 }
917 }
918 }
919 return dopt;
920 }
921
922 #ifdef CONFIG_TCP_MD5SIG
923 /*
924 * RFC2385 MD5 checksumming requires a mapping of
925 * IP address->MD5 Key.
926 * We need to maintain these in the sk structure.
927 */
928
929 /* Find the Key structure for an address. */
930 struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
931 const union tcp_md5_addr *addr,
932 int family)
933 {
934 struct tcp_sock *tp = tcp_sk(sk);
935 struct tcp_md5sig_key *key;
936 struct hlist_node *pos;
937 unsigned int size = sizeof(struct in_addr);
938 struct tcp_md5sig_info *md5sig;
939
940 /* caller either holds rcu_read_lock() or socket lock */
941 md5sig = rcu_dereference_check(tp->md5sig_info,
942 sock_owned_by_user(sk) ||
943 lockdep_is_held(&sk->sk_lock.slock));
944 if (!md5sig)
945 return NULL;
946 #if IS_ENABLED(CONFIG_IPV6)
947 if (family == AF_INET6)
948 size = sizeof(struct in6_addr);
949 #endif
950 hlist_for_each_entry_rcu(key, pos, &md5sig->head, node) {
951 if (key->family != family)
952 continue;
953 if (!memcmp(&key->addr, addr, size))
954 return key;
955 }
956 return NULL;
957 }
958 EXPORT_SYMBOL(tcp_md5_do_lookup);
959
960 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
961 struct sock *addr_sk)
962 {
963 union tcp_md5_addr *addr;
964
965 addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
966 return tcp_md5_do_lookup(sk, addr, AF_INET);
967 }
968 EXPORT_SYMBOL(tcp_v4_md5_lookup);
969
970 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
971 struct request_sock *req)
972 {
973 union tcp_md5_addr *addr;
974
975 addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr;
976 return tcp_md5_do_lookup(sk, addr, AF_INET);
977 }
978
979 /* This can be called on a newly created socket, from other files */
980 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
981 int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
982 {
983 /* Add Key to the list */
984 struct tcp_md5sig_key *key;
985 struct tcp_sock *tp = tcp_sk(sk);
986 struct tcp_md5sig_info *md5sig;
987
988 key = tcp_md5_do_lookup(sk, addr, family);
989 if (key) {
990 /* Pre-existing entry - just update that one. */
991 memcpy(key->key, newkey, newkeylen);
992 key->keylen = newkeylen;
993 return 0;
994 }
995
996 md5sig = rcu_dereference_protected(tp->md5sig_info,
997 sock_owned_by_user(sk));
998 if (!md5sig) {
999 md5sig = kmalloc(sizeof(*md5sig), gfp);
1000 if (!md5sig)
1001 return -ENOMEM;
1002
1003 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1004 INIT_HLIST_HEAD(&md5sig->head);
1005 rcu_assign_pointer(tp->md5sig_info, md5sig);
1006 }
1007
1008 key = sock_kmalloc(sk, sizeof(*key), gfp);
1009 if (!key)
1010 return -ENOMEM;
1011 if (hlist_empty(&md5sig->head) && !tcp_alloc_md5sig_pool(sk)) {
1012 sock_kfree_s(sk, key, sizeof(*key));
1013 return -ENOMEM;
1014 }
1015
1016 memcpy(key->key, newkey, newkeylen);
1017 key->keylen = newkeylen;
1018 key->family = family;
1019 memcpy(&key->addr, addr,
1020 (family == AF_INET6) ? sizeof(struct in6_addr) :
1021 sizeof(struct in_addr));
1022 hlist_add_head_rcu(&key->node, &md5sig->head);
1023 return 0;
1024 }
1025 EXPORT_SYMBOL(tcp_md5_do_add);
1026
1027 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
1028 {
1029 struct tcp_sock *tp = tcp_sk(sk);
1030 struct tcp_md5sig_key *key;
1031 struct tcp_md5sig_info *md5sig;
1032
1033 key = tcp_md5_do_lookup(sk, addr, family);
1034 if (!key)
1035 return -ENOENT;
1036 hlist_del_rcu(&key->node);
1037 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1038 kfree_rcu(key, rcu);
1039 md5sig = rcu_dereference_protected(tp->md5sig_info,
1040 sock_owned_by_user(sk));
1041 if (hlist_empty(&md5sig->head))
1042 tcp_free_md5sig_pool();
1043 return 0;
1044 }
1045 EXPORT_SYMBOL(tcp_md5_do_del);
1046
1047 void tcp_clear_md5_list(struct sock *sk)
1048 {
1049 struct tcp_sock *tp = tcp_sk(sk);
1050 struct tcp_md5sig_key *key;
1051 struct hlist_node *pos, *n;
1052 struct tcp_md5sig_info *md5sig;
1053
1054 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1055
1056 if (!hlist_empty(&md5sig->head))
1057 tcp_free_md5sig_pool();
1058 hlist_for_each_entry_safe(key, pos, n, &md5sig->head, node) {
1059 hlist_del_rcu(&key->node);
1060 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1061 kfree_rcu(key, rcu);
1062 }
1063 }
1064
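/* Parse a TCP_MD5SIG setsockopt request and add or delete the MD5 key for
 * the given peer address; a zero tcpm_keylen deletes the key. A sketch of
 * the userspace side (illustrative only, variable names are hypothetical):
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = key_len };
 *
 *	memcpy(&md5.tcpm_addr, &peer_sin, sizeof(peer_sin));
 *	memcpy(md5.tcpm_key, key, key_len);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */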
1065 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1066 int optlen)
1067 {
1068 struct tcp_md5sig cmd;
1069 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1070
1071 if (optlen < sizeof(cmd))
1072 return -EINVAL;
1073
1074 if (copy_from_user(&cmd, optval, sizeof(cmd)))
1075 return -EFAULT;
1076
1077 if (sin->sin_family != AF_INET)
1078 return -EINVAL;
1079
1080 if (!cmd.tcpm_key || !cmd.tcpm_keylen)
1081 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1082 AF_INET);
1083
1084 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1085 return -EINVAL;
1086
1087 return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1088 AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1089 GFP_KERNEL);
1090 }
1091
1092 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1093 __be32 daddr, __be32 saddr, int nbytes)
1094 {
1095 struct tcp4_pseudohdr *bp;
1096 struct scatterlist sg;
1097
1098 bp = &hp->md5_blk.ip4;
1099
1100 /*
1101 * 1. the TCP pseudo-header (in the order: source IP address,
1102 * destination IP address, zero-padded protocol number, and
1103 * segment length)
1104 */
1105 bp->saddr = saddr;
1106 bp->daddr = daddr;
1107 bp->pad = 0;
1108 bp->protocol = IPPROTO_TCP;
1109 bp->len = cpu_to_be16(nbytes);
1110
1111 sg_init_one(&sg, bp, sizeof(*bp));
1112 return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1113 }
1114
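/* Compute the RFC 2385 MD5 signature over the pseudo-header, the TCP header
 * and the key only (no payload); used for the stateless replies built by
 * tcp_v4_send_reset() and tcp_v4_send_ack().
 */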
1115 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1116 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1117 {
1118 struct tcp_md5sig_pool *hp;
1119 struct hash_desc *desc;
1120
1121 hp = tcp_get_md5sig_pool();
1122 if (!hp)
1123 goto clear_hash_noput;
1124 desc = &hp->md5_desc;
1125
1126 if (crypto_hash_init(desc))
1127 goto clear_hash;
1128 if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1129 goto clear_hash;
1130 if (tcp_md5_hash_header(hp, th))
1131 goto clear_hash;
1132 if (tcp_md5_hash_key(hp, key))
1133 goto clear_hash;
1134 if (crypto_hash_final(desc, md5_hash))
1135 goto clear_hash;
1136
1137 tcp_put_md5sig_pool();
1138 return 0;
1139
1140 clear_hash:
1141 tcp_put_md5sig_pool();
1142 clear_hash_noput:
1143 memset(md5_hash, 0, 16);
1144 return 1;
1145 }
1146
1147 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1148 const struct sock *sk, const struct request_sock *req,
1149 const struct sk_buff *skb)
1150 {
1151 struct tcp_md5sig_pool *hp;
1152 struct hash_desc *desc;
1153 const struct tcphdr *th = tcp_hdr(skb);
1154 __be32 saddr, daddr;
1155
1156 if (sk) {
1157 saddr = inet_sk(sk)->inet_saddr;
1158 daddr = inet_sk(sk)->inet_daddr;
1159 } else if (req) {
1160 saddr = inet_rsk(req)->loc_addr;
1161 daddr = inet_rsk(req)->rmt_addr;
1162 } else {
1163 const struct iphdr *iph = ip_hdr(skb);
1164 saddr = iph->saddr;
1165 daddr = iph->daddr;
1166 }
1167
1168 hp = tcp_get_md5sig_pool();
1169 if (!hp)
1170 goto clear_hash_noput;
1171 desc = &hp->md5_desc;
1172
1173 if (crypto_hash_init(desc))
1174 goto clear_hash;
1175
1176 if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1177 goto clear_hash;
1178 if (tcp_md5_hash_header(hp, th))
1179 goto clear_hash;
1180 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1181 goto clear_hash;
1182 if (tcp_md5_hash_key(hp, key))
1183 goto clear_hash;
1184 if (crypto_hash_final(desc, md5_hash))
1185 goto clear_hash;
1186
1187 tcp_put_md5sig_pool();
1188 return 0;
1189
1190 clear_hash:
1191 tcp_put_md5sig_pool();
1192 clear_hash_noput:
1193 memset(md5_hash, 0, 16);
1194 return 1;
1195 }
1196 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1197
1198 static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
1199 {
1200 /*
1201 * This gets called for each TCP segment that arrives
1202 * so we want to be efficient.
1203 * We have 3 drop cases:
1204 * o No MD5 hash and one expected.
1205 * o MD5 hash and we're not expecting one.
1206 * o MD5 hash and it's wrong.
1207 */
1208 const __u8 *hash_location = NULL;
1209 struct tcp_md5sig_key *hash_expected;
1210 const struct iphdr *iph = ip_hdr(skb);
1211 const struct tcphdr *th = tcp_hdr(skb);
1212 int genhash;
1213 unsigned char newhash[16];
1214
1215 hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1216 AF_INET);
1217 hash_location = tcp_parse_md5sig_option(th);
1218
1219 /* We've parsed the options - do we have a hash? */
1220 if (!hash_expected && !hash_location)
1221 return false;
1222
1223 if (hash_expected && !hash_location) {
1224 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1225 return true;
1226 }
1227
1228 if (!hash_expected && hash_location) {
1229 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1230 return true;
1231 }
1232
1233 /* Okay, so we have both hash_expected and hash_location -
1234 * so we need to calculate the hash.
1235 */
1236 genhash = tcp_v4_md5_hash_skb(newhash,
1237 hash_expected,
1238 NULL, NULL, skb);
1239
1240 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1241 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1242 &iph->saddr, ntohs(th->source),
1243 &iph->daddr, ntohs(th->dest),
1244 genhash ? " tcp_v4_calc_md5_hash failed"
1245 : "");
1246 return true;
1247 }
1248 return false;
1249 }
1250
1251 #endif
1252
1253 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1254 .family = PF_INET,
1255 .obj_size = sizeof(struct tcp_request_sock),
1256 .rtx_syn_ack = tcp_v4_rtx_synack,
1257 .send_ack = tcp_v4_reqsk_send_ack,
1258 .destructor = tcp_v4_reqsk_destructor,
1259 .send_reset = tcp_v4_send_reset,
1260 .syn_ack_timeout = tcp_syn_ack_timeout,
1261 };
1262
1263 #ifdef CONFIG_TCP_MD5SIG
1264 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1265 .md5_lookup = tcp_v4_reqsk_md5_lookup,
1266 .calc_md5_hash = tcp_v4_md5_hash_skb,
1267 };
1268 #endif
1269
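/* Handle an incoming SYN on a listening socket: allocate a request_sock,
 * parse the options (including TCP cookie transactions), decide whether to
 * answer with a syncookie, pick an ISN and send the SYN-ACK.
 */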
1270 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1271 {
1272 struct tcp_extend_values tmp_ext;
1273 struct tcp_options_received tmp_opt;
1274 const u8 *hash_location;
1275 struct request_sock *req;
1276 struct inet_request_sock *ireq;
1277 struct tcp_sock *tp = tcp_sk(sk);
1278 struct dst_entry *dst = NULL;
1279 __be32 saddr = ip_hdr(skb)->saddr;
1280 __be32 daddr = ip_hdr(skb)->daddr;
1281 __u32 isn = TCP_SKB_CB(skb)->when;
1282 bool want_cookie = false;
1283
1284 /* Never answer SYNs sent to broadcast or multicast */
1285 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1286 goto drop;
1287
1288 /* TW buckets are converted to open requests without
1289 * limitation; they conserve resources and the peer is
1290 * evidently a real one.
1291 */
1292 if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1293 want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
1294 if (!want_cookie)
1295 goto drop;
1296 }
1297
1298 /* The accept backlog is full. If we have already queued enough
1299 * warm entries in the SYN queue, drop the request. That is better than
1300 * clogging the SYN queue with openreqs with exponentially increasing
1301 * timeouts.
1302 */
1303 if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1304 goto drop;
1305
1306 req = inet_reqsk_alloc(&tcp_request_sock_ops);
1307 if (!req)
1308 goto drop;
1309
1310 #ifdef CONFIG_TCP_MD5SIG
1311 tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1312 #endif
1313
1314 tcp_clear_options(&tmp_opt);
1315 tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1316 tmp_opt.user_mss = tp->rx_opt.user_mss;
1317 tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
1318
1319 if (tmp_opt.cookie_plus > 0 &&
1320 tmp_opt.saw_tstamp &&
1321 !tp->rx_opt.cookie_out_never &&
1322 (sysctl_tcp_cookie_size > 0 ||
1323 (tp->cookie_values != NULL &&
1324 tp->cookie_values->cookie_desired > 0))) {
1325 u8 *c;
1326 u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1327 int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1328
1329 if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1330 goto drop_and_release;
1331
1332 /* Secret recipe starts with IP addresses */
1333 *mess++ ^= (__force u32)daddr;
1334 *mess++ ^= (__force u32)saddr;
1335
1336 /* plus variable length Initiator Cookie */
1337 c = (u8 *)mess;
1338 while (l-- > 0)
1339 *c++ ^= *hash_location++;
1340
1341 want_cookie = false; /* not our kind of cookie */
1342 tmp_ext.cookie_out_never = 0; /* false */
1343 tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1344 } else if (!tp->rx_opt.cookie_in_always) {
1345 /* redundant indications, but ensure initialization. */
1346 tmp_ext.cookie_out_never = 1; /* true */
1347 tmp_ext.cookie_plus = 0;
1348 } else {
1349 goto drop_and_release;
1350 }
1351 tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1352
1353 if (want_cookie && !tmp_opt.saw_tstamp)
1354 tcp_clear_options(&tmp_opt);
1355
1356 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1357 tcp_openreq_init(req, &tmp_opt, skb);
1358
1359 ireq = inet_rsk(req);
1360 ireq->loc_addr = daddr;
1361 ireq->rmt_addr = saddr;
1362 ireq->no_srccheck = inet_sk(sk)->transparent;
1363 ireq->opt = tcp_v4_save_options(sk, skb);
1364
1365 if (security_inet_conn_request(sk, skb, req))
1366 goto drop_and_free;
1367
1368 if (!want_cookie || tmp_opt.tstamp_ok)
1369 TCP_ECN_create_request(req, skb);
1370
1371 if (want_cookie) {
1372 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1373 req->cookie_ts = tmp_opt.tstamp_ok;
1374 } else if (!isn) {
1375 struct flowi4 fl4;
1376
1377 /* VJ's idea. We save the last timestamp seen
1378 * from the destination in the peer table when entering
1379 * TIME-WAIT state, and check against it before
1380 * accepting a new connection request.
1381 *
1382 * If "isn" is not zero, this request hit an alive
1383 * timewait bucket, so all the necessary checks
1384 * are made in the function processing the timewait state.
1385 */
1386 if (tmp_opt.saw_tstamp &&
1387 tcp_death_row.sysctl_tw_recycle &&
1388 (dst = inet_csk_route_req(sk, &fl4, req, want_cookie)) != NULL &&
1389 fl4.daddr == saddr) {
1390 if (!tcp_peer_is_proven(req, dst, true)) {
1391 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1392 goto drop_and_release;
1393 }
1394 }
1395 /* Kill the following clause, if you dislike this way. */
1396 else if (!sysctl_tcp_syncookies &&
1397 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1398 (sysctl_max_syn_backlog >> 2)) &&
1399 !tcp_peer_is_proven(req, dst, false)) {
1400 /* Without syncookies the last quarter of the
1401 * backlog is filled only with destinations
1402 * proven to be alive.
1403 * It means that we continue to communicate
1404 * with destinations already remembered
1405 * at the moment of the synflood.
1406 */
1407 LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
1408 &saddr, ntohs(tcp_hdr(skb)->source));
1409 goto drop_and_release;
1410 }
1411
1412 isn = tcp_v4_init_sequence(skb);
1413 }
1414 tcp_rsk(req)->snt_isn = isn;
1415 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1416
1417 if (tcp_v4_send_synack(sk, dst, req,
1418 (struct request_values *)&tmp_ext,
1419 skb_get_queue_mapping(skb),
1420 want_cookie) ||
1421 want_cookie)
1422 goto drop_and_free;
1423
1424 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1425 return 0;
1426
1427 drop_and_release:
1428 dst_release(dst);
1429 drop_and_free:
1430 reqsk_free(req);
1431 drop:
1432 return 0;
1433 }
1434 EXPORT_SYMBOL(tcp_v4_conn_request);
1435
1436
1437 /*
1438 * The three-way handshake has completed - we got a valid synack -
1439 * now create the new socket.
1440 */
1441 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1442 struct request_sock *req,
1443 struct dst_entry *dst)
1444 {
1445 struct inet_request_sock *ireq;
1446 struct inet_sock *newinet;
1447 struct tcp_sock *newtp;
1448 struct sock *newsk;
1449 #ifdef CONFIG_TCP_MD5SIG
1450 struct tcp_md5sig_key *key;
1451 #endif
1452 struct ip_options_rcu *inet_opt;
1453
1454 if (sk_acceptq_is_full(sk))
1455 goto exit_overflow;
1456
1457 newsk = tcp_create_openreq_child(sk, req, skb);
1458 if (!newsk)
1459 goto exit_nonewsk;
1460
1461 newsk->sk_gso_type = SKB_GSO_TCPV4;
1462
1463 newtp = tcp_sk(newsk);
1464 newinet = inet_sk(newsk);
1465 ireq = inet_rsk(req);
1466 newinet->inet_daddr = ireq->rmt_addr;
1467 newinet->inet_rcv_saddr = ireq->loc_addr;
1468 newinet->inet_saddr = ireq->loc_addr;
1469 inet_opt = ireq->opt;
1470 rcu_assign_pointer(newinet->inet_opt, inet_opt);
1471 ireq->opt = NULL;
1472 newinet->mc_index = inet_iif(skb);
1473 newinet->mc_ttl = ip_hdr(skb)->ttl;
1474 newinet->rcv_tos = ip_hdr(skb)->tos;
1475 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1476 if (inet_opt)
1477 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1478 newinet->inet_id = newtp->write_seq ^ jiffies;
1479
1480 if (!dst) {
1481 dst = inet_csk_route_child_sock(sk, newsk, req);
1482 if (!dst)
1483 goto put_and_exit;
1484 } else {
1485 /* syncookie case : see end of cookie_v4_check() */
1486 }
1487 sk_setup_caps(newsk, dst);
1488
1489 tcp_mtup_init(newsk);
1490 tcp_sync_mss(newsk, dst_mtu(dst));
1491 newtp->advmss = dst_metric_advmss(dst);
1492 if (tcp_sk(sk)->rx_opt.user_mss &&
1493 tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1494 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1495
1496 tcp_initialize_rcv_mss(newsk);
1497 if (tcp_rsk(req)->snt_synack)
1498 tcp_valid_rtt_meas(newsk,
1499 tcp_time_stamp - tcp_rsk(req)->snt_synack);
1500 newtp->total_retrans = req->retrans;
1501
1502 #ifdef CONFIG_TCP_MD5SIG
1503 /* Copy over the MD5 key from the original socket */
1504 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1505 AF_INET);
1506 if (key != NULL) {
1507 /*
1508 * We're using one, so create a matching key
1509 * on the newsk structure. If we fail to get
1510 * memory, then we end up not copying the key
1511 * across. Shucks.
1512 */
1513 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1514 AF_INET, key->key, key->keylen, GFP_ATOMIC);
1515 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1516 }
1517 #endif
1518
1519 if (__inet_inherit_port(sk, newsk) < 0)
1520 goto put_and_exit;
1521 __inet_hash_nolisten(newsk, NULL);
1522
1523 return newsk;
1524
1525 exit_overflow:
1526 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1527 exit_nonewsk:
1528 dst_release(dst);
1529 exit:
1530 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1531 return NULL;
1532 put_and_exit:
1533 tcp_clear_xmit_timers(newsk);
1534 tcp_cleanup_congestion_control(newsk);
1535 bh_unlock_sock(newsk);
1536 sock_put(newsk);
1537 goto exit;
1538 }
1539 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1540
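/* A segment arrived on a listening socket: match it against pending
 * request_socks first, then against an already established child; as a last
 * resort (with CONFIG_SYN_COOKIES) a non-SYN segment may complete a
 * syncookie handshake via cookie_v4_check().
 */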
1541 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1542 {
1543 struct tcphdr *th = tcp_hdr(skb);
1544 const struct iphdr *iph = ip_hdr(skb);
1545 struct sock *nsk;
1546 struct request_sock **prev;
1547 /* Find possible connection requests. */
1548 struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1549 iph->saddr, iph->daddr);
1550 if (req)
1551 return tcp_check_req(sk, skb, req, prev);
1552
1553 nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1554 th->source, iph->daddr, th->dest, inet_iif(skb));
1555
1556 if (nsk) {
1557 if (nsk->sk_state != TCP_TIME_WAIT) {
1558 bh_lock_sock(nsk);
1559 return nsk;
1560 }
1561 inet_twsk_put(inet_twsk(nsk));
1562 return NULL;
1563 }
1564
1565 #ifdef CONFIG_SYN_COOKIES
1566 if (!th->syn)
1567 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1568 #endif
1569 return sk;
1570 }
1571
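/* Verify or prepare the TCP checksum of an incoming segment. Returns zero if
 * the checksum is valid or the check is deferred (the pseudo-header sum is
 * kept in skb->csum); a non-zero return means the packet is known bad.
 */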
1572 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1573 {
1574 const struct iphdr *iph = ip_hdr(skb);
1575
1576 if (skb->ip_summed == CHECKSUM_COMPLETE) {
1577 if (!tcp_v4_check(skb->len, iph->saddr,
1578 iph->daddr, skb->csum)) {
1579 skb->ip_summed = CHECKSUM_UNNECESSARY;
1580 return 0;
1581 }
1582 }
1583
1584 skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1585 skb->len, IPPROTO_TCP, 0);
1586
1587 if (skb->len <= 76) {
1588 return __skb_checksum_complete(skb);
1589 }
1590 return 0;
1591 }
1592
1593
1594 /* The socket must have its spinlock held when we get
1595 * here.
1596 *
1597 * We have a potential double-lock case here, so even when
1598 * doing backlog processing we use the BH locking scheme.
1599 * This is because we cannot sleep with the original spinlock
1600 * held.
1601 */
1602 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1603 {
1604 struct sock *rsk;
1605 #ifdef CONFIG_TCP_MD5SIG
1606 /*
1607 * We really want to reject the packet as early as possible
1608 * if:
1609 *  o We're expecting an MD5'd packet and there is no MD5 TCP option
1610 * o There is an MD5 option and we're not expecting one
1611 */
1612 if (tcp_v4_inbound_md5_hash(sk, skb))
1613 goto discard;
1614 #endif
1615
1616 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1617 sock_rps_save_rxhash(sk, skb);
1618 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1619 rsk = sk;
1620 goto reset;
1621 }
1622 return 0;
1623 }
1624
1625 if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1626 goto csum_err;
1627
1628 if (sk->sk_state == TCP_LISTEN) {
1629 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1630 if (!nsk)
1631 goto discard;
1632
1633 if (nsk != sk) {
1634 sock_rps_save_rxhash(nsk, skb);
1635 if (tcp_child_process(sk, nsk, skb)) {
1636 rsk = nsk;
1637 goto reset;
1638 }
1639 return 0;
1640 }
1641 } else
1642 sock_rps_save_rxhash(sk, skb);
1643
1644 if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1645 rsk = sk;
1646 goto reset;
1647 }
1648 return 0;
1649
1650 reset:
1651 tcp_v4_send_reset(rsk, skb);
1652 discard:
1653 kfree_skb(skb);
1654 /* Be careful here. If this function gets more complicated and
1655 * gcc suffers from register pressure on the x86, sk (in %ebx)
1656 * might be destroyed here. This current version compiles correctly,
1657 * but you have been warned.
1658 */
1659 return 0;
1660
1661 csum_err:
1662 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1663 goto discard;
1664 }
1665 EXPORT_SYMBOL(tcp_v4_do_rcv);
1666
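/* Early demux: look up an established socket for this packet before routing
 * and, if it holds a cached rx dst for the same incoming interface, attach
 * that dst to the skb so the input path can skip the route lookup.
 */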
1667 void tcp_v4_early_demux(struct sk_buff *skb)
1668 {
1669 struct net *net = dev_net(skb->dev);
1670 const struct iphdr *iph;
1671 const struct tcphdr *th;
1672 struct net_device *dev;
1673 struct sock *sk;
1674
1675 if (skb->pkt_type != PACKET_HOST)
1676 return;
1677
1678 if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct tcphdr)))
1679 return;
1680
1681 iph = ip_hdr(skb);
1682 th = (struct tcphdr *) ((char *)iph + ip_hdrlen(skb));
1683
1684 if (th->doff < sizeof(struct tcphdr) / 4)
1685 return;
1686
1687 if (!pskb_may_pull(skb, ip_hdrlen(skb) + th->doff * 4))
1688 return;
1689
1690 dev = skb->dev;
1691 sk = __inet_lookup_established(net, &tcp_hashinfo,
1692 iph->saddr, th->source,
1693 iph->daddr, ntohs(th->dest),
1694 dev->ifindex);
1695 if (sk) {
1696 skb->sk = sk;
1697 skb->destructor = sock_edemux;
1698 if (sk->sk_state != TCP_TIME_WAIT) {
1699 struct dst_entry *dst = sk->sk_rx_dst;
1700 if (dst)
1701 dst = dst_check(dst, 0);
1702 if (dst) {
1703 struct rtable *rt = (struct rtable *) dst;
1704
1705 if (rt->rt_iif == dev->ifindex)
1706 skb_dst_set_noref(skb, dst);
1707 }
1708 }
1709 }
1710 }
1711
1712 /*
1713 * From tcp_input.c
1714 */
1715
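/* Main IPv4 TCP receive routine: validate the header and checksum, look up
 * the owning socket and hand the segment to tcp_v4_do_rcv() - directly, via
 * the prequeue, or via the socket backlog when the socket is owned by the
 * user. TIME-WAIT sockets are handled separately below.
 */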
1716 int tcp_v4_rcv(struct sk_buff *skb)
1717 {
1718 const struct iphdr *iph;
1719 const struct tcphdr *th;
1720 struct sock *sk;
1721 int ret;
1722 struct net *net = dev_net(skb->dev);
1723
1724 if (skb->pkt_type != PACKET_HOST)
1725 goto discard_it;
1726
1727 /* Count it even if it's bad */
1728 TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1729
1730 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1731 goto discard_it;
1732
1733 th = tcp_hdr(skb);
1734
1735 if (th->doff < sizeof(struct tcphdr) / 4)
1736 goto bad_packet;
1737 if (!pskb_may_pull(skb, th->doff * 4))
1738 goto discard_it;
1739
1740 /* An explanation is required here, I think.
1741 * Packet length and doff are validated by header prediction,
1742 * provided the case of th->doff==0 is eliminated.
1743 * So, we defer the checks. */
1744 if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1745 goto bad_packet;
1746
1747 th = tcp_hdr(skb);
1748 iph = ip_hdr(skb);
1749 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1750 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1751 skb->len - th->doff * 4);
1752 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1753 TCP_SKB_CB(skb)->when = 0;
1754 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1755 TCP_SKB_CB(skb)->sacked = 0;
1756
1757 sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1758 if (!sk)
1759 goto no_tcp_socket;
1760
1761 process:
1762 if (sk->sk_state == TCP_TIME_WAIT)
1763 goto do_time_wait;
1764
1765 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1766 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1767 goto discard_and_relse;
1768 }
1769
1770 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1771 goto discard_and_relse;
1772 nf_reset(skb);
1773
1774 if (sk_filter(sk, skb))
1775 goto discard_and_relse;
1776
1777 skb->dev = NULL;
1778
1779 bh_lock_sock_nested(sk);
1780 ret = 0;
1781 if (!sock_owned_by_user(sk)) {
1782 #ifdef CONFIG_NET_DMA
1783 struct tcp_sock *tp = tcp_sk(sk);
1784 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1785 tp->ucopy.dma_chan = net_dma_find_channel();
1786 if (tp->ucopy.dma_chan)
1787 ret = tcp_v4_do_rcv(sk, skb);
1788 else
1789 #endif
1790 {
1791 if (!tcp_prequeue(sk, skb))
1792 ret = tcp_v4_do_rcv(sk, skb);
1793 }
1794 } else if (unlikely(sk_add_backlog(sk, skb,
1795 sk->sk_rcvbuf + sk->sk_sndbuf))) {
1796 bh_unlock_sock(sk);
1797 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1798 goto discard_and_relse;
1799 }
1800 bh_unlock_sock(sk);
1801
1802 sock_put(sk);
1803
1804 return ret;
1805
1806 no_tcp_socket:
1807 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1808 goto discard_it;
1809
1810 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1811 bad_packet:
1812 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1813 } else {
1814 tcp_v4_send_reset(NULL, skb);
1815 }
1816
1817 discard_it:
1818 /* Discard frame. */
1819 kfree_skb(skb);
1820 return 0;
1821
1822 discard_and_relse:
1823 sock_put(sk);
1824 goto discard_it;
1825
1826 do_time_wait:
1827 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1828 inet_twsk_put(inet_twsk(sk));
1829 goto discard_it;
1830 }
1831
1832 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1833 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1834 inet_twsk_put(inet_twsk(sk));
1835 goto discard_it;
1836 }
1837 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1838 case TCP_TW_SYN: {
1839 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1840 &tcp_hashinfo,
1841 iph->daddr, th->dest,
1842 inet_iif(skb));
1843 if (sk2) {
1844 inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1845 inet_twsk_put(inet_twsk(sk));
1846 sk = sk2;
1847 goto process;
1848 }
1849 /* Fall through to ACK */
1850 }
1851 case TCP_TW_ACK:
1852 tcp_v4_timewait_ack(sk, skb);
1853 break;
1854 case TCP_TW_RST:
1855 goto no_tcp_socket;
1856 case TCP_TW_SUCCESS:;
1857 }
1858 goto discard_it;
1859 }
1860
1861 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1862 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1863 .twsk_unique = tcp_twsk_unique,
1864 .twsk_destructor= tcp_twsk_destructor,
1865 };
1866
1867 const struct inet_connection_sock_af_ops ipv4_specific = {
1868 .queue_xmit = ip_queue_xmit,
1869 .send_check = tcp_v4_send_check,
1870 .rebuild_header = inet_sk_rebuild_header,
1871 .conn_request = tcp_v4_conn_request,
1872 .syn_recv_sock = tcp_v4_syn_recv_sock,
1873 .net_header_len = sizeof(struct iphdr),
1874 .setsockopt = ip_setsockopt,
1875 .getsockopt = ip_getsockopt,
1876 .addr2sockaddr = inet_csk_addr2sockaddr,
1877 .sockaddr_len = sizeof(struct sockaddr_in),
1878 .bind_conflict = inet_csk_bind_conflict,
1879 #ifdef CONFIG_COMPAT
1880 .compat_setsockopt = compat_ip_setsockopt,
1881 .compat_getsockopt = compat_ip_getsockopt,
1882 #endif
1883 };
1884 EXPORT_SYMBOL(ipv4_specific);
1885
1886 #ifdef CONFIG_TCP_MD5SIG
1887 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1888 .md5_lookup = tcp_v4_md5_lookup,
1889 .calc_md5_hash = tcp_v4_md5_hash_skb,
1890 .md5_parse = tcp_v4_parse_md5_keys,
1891 };
1892 #endif
1893
1894 /* NOTE: A lot of things are set to zero explicitly by the call to
1895 *       sk_alloc(), so they need not be done here.
1896 */
1897 static int tcp_v4_init_sock(struct sock *sk)
1898 {
1899 struct inet_connection_sock *icsk = inet_csk(sk);
1900
1901 tcp_init_sock(sk);
1902
1903 icsk->icsk_af_ops = &ipv4_specific;
1904
1905 #ifdef CONFIG_TCP_MD5SIG
1906 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1907 #endif
1908
1909 return 0;
1910 }
1911
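/* Release per-socket TCP state when an IPv4 TCP socket is destroyed: timers,
 * queued skbs, MD5 keys, the cached sendmsg page, cookie values and the
 * bound port.
 */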
1912 void tcp_v4_destroy_sock(struct sock *sk)
1913 {
1914 struct tcp_sock *tp = tcp_sk(sk);
1915
1916 tcp_clear_xmit_timers(sk);
1917
1918 tcp_cleanup_congestion_control(sk);
1919
1920 /* Clean up the write buffer. */
1921 tcp_write_queue_purge(sk);
1922
1923 /* Cleans up our, hopefully empty, out_of_order_queue. */
1924 __skb_queue_purge(&tp->out_of_order_queue);
1925
1926 #ifdef CONFIG_TCP_MD5SIG
1927 /* Clean up the MD5 key list, if any */
1928 if (tp->md5sig_info) {
1929 tcp_clear_md5_list(sk);
1930 kfree_rcu(tp->md5sig_info, rcu);
1931 tp->md5sig_info = NULL;
1932 }
1933 #endif
1934
1935 #ifdef CONFIG_NET_DMA
1936 /* Cleans up our sk_async_wait_queue */
1937 __skb_queue_purge(&sk->sk_async_wait_queue);
1938 #endif
1939
1940 /* Clean prequeue, it must be empty really */
1941 __skb_queue_purge(&tp->ucopy.prequeue);
1942
1943 /* Clean up a referenced TCP bind bucket. */
1944 if (inet_csk(sk)->icsk_bind_hash)
1945 inet_put_port(sk);
1946
1947 /*
1948 * If sendmsg cached page exists, toss it.
1949 */
1950 if (sk->sk_sndmsg_page) {
1951 __free_page(sk->sk_sndmsg_page);
1952 sk->sk_sndmsg_page = NULL;
1953 }
1954
1955 /* TCP Cookie Transactions */
1956 if (tp->cookie_values != NULL) {
1957 kref_put(&tp->cookie_values->kref,
1958 tcp_cookie_values_release);
1959 tp->cookie_values = NULL;
1960 }
1961
1962 sk_sockets_allocated_dec(sk);
1963 sock_release_memcg(sk);
1964 }
1965 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1966
1967 #ifdef CONFIG_PROC_FS
1968 /* Proc filesystem TCP sock list dumping. */
1969
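/* tw_head()/tw_next() walk the TIME_WAIT chain of an ehash bucket,
 * returning NULL once the nulls-terminated list is exhausted.
 */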
1970 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1971 {
1972 return hlist_nulls_empty(head) ? NULL :
1973 list_entry(head->first, struct inet_timewait_sock, tw_node);
1974 }
1975
1976 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1977 {
1978 return !is_a_nulls(tw->tw_node.next) ?
1979 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1980 }
1981
1982 /*
1983  * Get the next listener socket following cur. If cur is NULL, get the first
1984  * socket starting from the bucket given in st->bucket; when st->bucket is
1985  * zero, the very first socket in the hash table is returned.
1986 */
1987 static void *listening_get_next(struct seq_file *seq, void *cur)
1988 {
1989 struct inet_connection_sock *icsk;
1990 struct hlist_nulls_node *node;
1991 struct sock *sk = cur;
1992 struct inet_listen_hashbucket *ilb;
1993 struct tcp_iter_state *st = seq->private;
1994 struct net *net = seq_file_net(seq);
1995
1996 if (!sk) {
1997 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1998 spin_lock_bh(&ilb->lock);
1999 sk = sk_nulls_head(&ilb->head);
2000 st->offset = 0;
2001 goto get_sk;
2002 }
2003 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2004 ++st->num;
2005 ++st->offset;
2006
2007 if (st->state == TCP_SEQ_STATE_OPENREQ) {
2008 struct request_sock *req = cur;
2009
2010 icsk = inet_csk(st->syn_wait_sk);
2011 req = req->dl_next;
2012 while (1) {
2013 while (req) {
2014 if (req->rsk_ops->family == st->family) {
2015 cur = req;
2016 goto out;
2017 }
2018 req = req->dl_next;
2019 }
2020 if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2021 break;
2022 get_req:
2023 req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2024 }
2025 sk = sk_nulls_next(st->syn_wait_sk);
2026 st->state = TCP_SEQ_STATE_LISTENING;
2027 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2028 } else {
2029 icsk = inet_csk(sk);
2030 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2031 if (reqsk_queue_len(&icsk->icsk_accept_queue))
2032 goto start_req;
2033 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2034 sk = sk_nulls_next(sk);
2035 }
2036 get_sk:
2037 sk_nulls_for_each_from(sk, node) {
2038 if (!net_eq(sock_net(sk), net))
2039 continue;
2040 if (sk->sk_family == st->family) {
2041 cur = sk;
2042 goto out;
2043 }
2044 icsk = inet_csk(sk);
2045 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2046 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2047 start_req:
2048 st->uid = sock_i_uid(sk);
2049 st->syn_wait_sk = sk;
2050 st->state = TCP_SEQ_STATE_OPENREQ;
2051 st->sbucket = 0;
2052 goto get_req;
2053 }
2054 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2055 }
2056 spin_unlock_bh(&ilb->lock);
2057 st->offset = 0;
2058 if (++st->bucket < INET_LHTABLE_SIZE) {
2059 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2060 spin_lock_bh(&ilb->lock);
2061 sk = sk_nulls_head(&ilb->head);
2062 goto get_sk;
2063 }
2064 cur = NULL;
2065 out:
2066 return cur;
2067 }
2068
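/* Return the listening socket at position *pos, counting from the start
 * of the listening hash, by repeatedly calling listening_get_next().
 */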
2069 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2070 {
2071 struct tcp_iter_state *st = seq->private;
2072 void *rc;
2073
2074 st->bucket = 0;
2075 st->offset = 0;
2076 rc = listening_get_next(seq, NULL);
2077
2078 while (rc && *pos) {
2079 rc = listening_get_next(seq, rc);
2080 --*pos;
2081 }
2082 return rc;
2083 }
2084
2085 static inline bool empty_bucket(struct tcp_iter_state *st)
2086 {
2087 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2088 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2089 }
2090
2091 /*
2092  * Get the first established socket, starting from the bucket given in st->bucket.
2093 * If st->bucket is zero, the very first socket in the hash is returned.
2094 */
2095 static void *established_get_first(struct seq_file *seq)
2096 {
2097 struct tcp_iter_state *st = seq->private;
2098 struct net *net = seq_file_net(seq);
2099 void *rc = NULL;
2100
2101 st->offset = 0;
2102 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2103 struct sock *sk;
2104 struct hlist_nulls_node *node;
2105 struct inet_timewait_sock *tw;
2106 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2107
2108 /* Lockless fast path for the common case of empty buckets */
2109 if (empty_bucket(st))
2110 continue;
2111
2112 spin_lock_bh(lock);
2113 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2114 if (sk->sk_family != st->family ||
2115 !net_eq(sock_net(sk), net)) {
2116 continue;
2117 }
2118 rc = sk;
2119 goto out;
2120 }
2121 st->state = TCP_SEQ_STATE_TIME_WAIT;
2122 inet_twsk_for_each(tw, node,
2123 &tcp_hashinfo.ehash[st->bucket].twchain) {
2124 if (tw->tw_family != st->family ||
2125 !net_eq(twsk_net(tw), net)) {
2126 continue;
2127 }
2128 rc = tw;
2129 goto out;
2130 }
2131 spin_unlock_bh(lock);
2132 st->state = TCP_SEQ_STATE_ESTABLISHED;
2133 }
2134 out:
2135 return rc;
2136 }
2137
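/* Advance from cur to the next established or TIME_WAIT socket, moving
 * on to the next non-empty ehash bucket once both chains of the current
 * bucket are exhausted.
 */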
2138 static void *established_get_next(struct seq_file *seq, void *cur)
2139 {
2140 struct sock *sk = cur;
2141 struct inet_timewait_sock *tw;
2142 struct hlist_nulls_node *node;
2143 struct tcp_iter_state *st = seq->private;
2144 struct net *net = seq_file_net(seq);
2145
2146 ++st->num;
2147 ++st->offset;
2148
2149 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2150 tw = cur;
2151 tw = tw_next(tw);
2152 get_tw:
2153 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2154 tw = tw_next(tw);
2155 }
2156 if (tw) {
2157 cur = tw;
2158 goto out;
2159 }
2160 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2161 st->state = TCP_SEQ_STATE_ESTABLISHED;
2162
2163 		/* Look for the next non-empty bucket */
2164 st->offset = 0;
2165 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2166 empty_bucket(st))
2167 ;
2168 if (st->bucket > tcp_hashinfo.ehash_mask)
2169 return NULL;
2170
2171 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2172 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2173 } else
2174 sk = sk_nulls_next(sk);
2175
2176 sk_nulls_for_each_from(sk, node) {
2177 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2178 goto found;
2179 }
2180
2181 st->state = TCP_SEQ_STATE_TIME_WAIT;
2182 tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2183 goto get_tw;
2184 found:
2185 cur = sk;
2186 out:
2187 return cur;
2188 }
2189
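/* Return the established/TIME_WAIT socket at position pos, counting
 * from the first non-empty ehash bucket.
 */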
2190 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2191 {
2192 struct tcp_iter_state *st = seq->private;
2193 void *rc;
2194
2195 st->bucket = 0;
2196 rc = established_get_first(seq);
2197
2198 while (rc && pos) {
2199 rc = established_get_next(seq, rc);
2200 --pos;
2201 }
2202 return rc;
2203 }
2204
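/* Position the iterator at entry pos: listening sockets are walked
 * first, then the established/TIME_WAIT hash.
 */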
2205 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2206 {
2207 void *rc;
2208 struct tcp_iter_state *st = seq->private;
2209
2210 st->state = TCP_SEQ_STATE_LISTENING;
2211 rc = listening_get_idx(seq, &pos);
2212
2213 if (!rc) {
2214 st->state = TCP_SEQ_STATE_ESTABLISHED;
2215 rc = established_get_idx(seq, pos);
2216 }
2217
2218 return rc;
2219 }
2220
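/* Fast-forward to the bucket/offset recorded on the previous read of
 * the seq_file, so a sequential read need not rescan from the start.
 */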
2221 static void *tcp_seek_last_pos(struct seq_file *seq)
2222 {
2223 struct tcp_iter_state *st = seq->private;
2224 int offset = st->offset;
2225 int orig_num = st->num;
2226 void *rc = NULL;
2227
2228 switch (st->state) {
2229 case TCP_SEQ_STATE_OPENREQ:
2230 case TCP_SEQ_STATE_LISTENING:
2231 if (st->bucket >= INET_LHTABLE_SIZE)
2232 break;
2233 st->state = TCP_SEQ_STATE_LISTENING;
2234 rc = listening_get_next(seq, NULL);
2235 while (offset-- && rc)
2236 rc = listening_get_next(seq, rc);
2237 if (rc)
2238 break;
2239 st->bucket = 0;
2240 /* Fallthrough */
2241 case TCP_SEQ_STATE_ESTABLISHED:
2242 case TCP_SEQ_STATE_TIME_WAIT:
2243 st->state = TCP_SEQ_STATE_ESTABLISHED;
2244 if (st->bucket > tcp_hashinfo.ehash_mask)
2245 break;
2246 rc = established_get_first(seq);
2247 while (offset-- && rc)
2248 rc = established_get_next(seq, rc);
2249 }
2250
2251 st->num = orig_num;
2252
2253 return rc;
2254 }
2255
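/* seq_file ->start(): resume where the last read left off if possible,
 * otherwise walk to position *pos from the beginning.
 */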
2256 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2257 {
2258 struct tcp_iter_state *st = seq->private;
2259 void *rc;
2260
2261 if (*pos && *pos == st->last_pos) {
2262 rc = tcp_seek_last_pos(seq);
2263 if (rc)
2264 goto out;
2265 }
2266
2267 st->state = TCP_SEQ_STATE_LISTENING;
2268 st->num = 0;
2269 st->bucket = 0;
2270 st->offset = 0;
2271 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2272
2273 out:
2274 st->last_pos = *pos;
2275 return rc;
2276 }
2277
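/* seq_file ->next(): step to the following socket, switching from the
 * listening hash to the established hash when the former runs out.
 */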
2278 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2279 {
2280 struct tcp_iter_state *st = seq->private;
2281 void *rc = NULL;
2282
2283 if (v == SEQ_START_TOKEN) {
2284 rc = tcp_get_idx(seq, 0);
2285 goto out;
2286 }
2287
2288 switch (st->state) {
2289 case TCP_SEQ_STATE_OPENREQ:
2290 case TCP_SEQ_STATE_LISTENING:
2291 rc = listening_get_next(seq, v);
2292 if (!rc) {
2293 st->state = TCP_SEQ_STATE_ESTABLISHED;
2294 st->bucket = 0;
2295 st->offset = 0;
2296 rc = established_get_first(seq);
2297 }
2298 break;
2299 case TCP_SEQ_STATE_ESTABLISHED:
2300 case TCP_SEQ_STATE_TIME_WAIT:
2301 rc = established_get_next(seq, v);
2302 break;
2303 }
2304 out:
2305 ++*pos;
2306 st->last_pos = *pos;
2307 return rc;
2308 }
2309
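/* seq_file ->stop(): drop whatever lock the iterator still holds for
 * its current state.
 */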
2310 static void tcp_seq_stop(struct seq_file *seq, void *v)
2311 {
2312 struct tcp_iter_state *st = seq->private;
2313
2314 switch (st->state) {
2315 case TCP_SEQ_STATE_OPENREQ:
2316 if (v) {
2317 struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2318 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2319 }
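		/* Fall through: the listening bucket lock is still held */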
2320 case TCP_SEQ_STATE_LISTENING:
2321 if (v != SEQ_START_TOKEN)
2322 spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2323 break;
2324 case TCP_SEQ_STATE_TIME_WAIT:
2325 case TCP_SEQ_STATE_ESTABLISHED:
2326 if (v)
2327 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2328 break;
2329 }
2330 }
2331
2332 int tcp_seq_open(struct inode *inode, struct file *file)
2333 {
2334 struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2335 struct tcp_iter_state *s;
2336 int err;
2337
2338 err = seq_open_net(inode, file, &afinfo->seq_ops,
2339 sizeof(struct tcp_iter_state));
2340 if (err < 0)
2341 return err;
2342
2343 s = ((struct seq_file *)file->private_data)->private;
2344 s->family = afinfo->family;
2345 s->last_pos = 0;
2346 return 0;
2347 }
2348 EXPORT_SYMBOL(tcp_seq_open);
2349
2350 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2351 {
2352 int rc = 0;
2353 struct proc_dir_entry *p;
2354
2355 afinfo->seq_ops.start = tcp_seq_start;
2356 afinfo->seq_ops.next = tcp_seq_next;
2357 afinfo->seq_ops.stop = tcp_seq_stop;
2358
2359 p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2360 afinfo->seq_fops, afinfo);
2361 if (!p)
2362 rc = -ENOMEM;
2363 return rc;
2364 }
2365 EXPORT_SYMBOL(tcp_proc_register);
2366
2367 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2368 {
2369 proc_net_remove(net, afinfo->name);
2370 }
2371 EXPORT_SYMBOL(tcp_proc_unregister);
2372
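/* Format one /proc/net/tcp line for an embryonic (SYN_RECV) request
 * socket sitting in the listener's SYN table.
 */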
2373 static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2374 struct seq_file *f, int i, int uid, int *len)
2375 {
2376 const struct inet_request_sock *ireq = inet_rsk(req);
2377 int ttd = req->expires - jiffies;
2378
2379 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2380 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
2381 i,
2382 ireq->loc_addr,
2383 ntohs(inet_sk(sk)->inet_sport),
2384 ireq->rmt_addr,
2385 ntohs(ireq->rmt_port),
2386 TCP_SYN_RECV,
2387 		0, 0, /* could print the option size, but that is AF-dependent. */
2388 1, /* timers active (only the expire timer) */
2389 jiffies_to_clock_t(ttd),
2390 req->retrans,
2391 uid,
2392 		0, /* non-standard timer */
2393 0, /* open_requests have no inode */
2394 atomic_read(&sk->sk_refcnt),
2395 req,
2396 len);
2397 }
2398
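/* Format one /proc/net/tcp line for a full socket (listening or
 * established), including queue sizes and pending-timer information.
 */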
2399 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2400 {
2401 int timer_active;
2402 unsigned long timer_expires;
2403 const struct tcp_sock *tp = tcp_sk(sk);
2404 const struct inet_connection_sock *icsk = inet_csk(sk);
2405 const struct inet_sock *inet = inet_sk(sk);
2406 __be32 dest = inet->inet_daddr;
2407 __be32 src = inet->inet_rcv_saddr;
2408 __u16 destp = ntohs(inet->inet_dport);
2409 __u16 srcp = ntohs(inet->inet_sport);
2410 int rx_queue;
2411
2412 if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2413 timer_active = 1;
2414 timer_expires = icsk->icsk_timeout;
2415 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2416 timer_active = 4;
2417 timer_expires = icsk->icsk_timeout;
2418 } else if (timer_pending(&sk->sk_timer)) {
2419 timer_active = 2;
2420 timer_expires = sk->sk_timer.expires;
2421 } else {
2422 timer_active = 0;
2423 timer_expires = jiffies;
2424 }
2425
2426 if (sk->sk_state == TCP_LISTEN)
2427 rx_queue = sk->sk_ack_backlog;
2428 else
2429 /*
2430 		 * Because we don't lock the socket, we might find a transient negative value.
2431 */
2432 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2433
2434 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2435 "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
2436 i, src, srcp, dest, destp, sk->sk_state,
2437 tp->write_seq - tp->snd_una,
2438 rx_queue,
2439 timer_active,
2440 jiffies_to_clock_t(timer_expires - jiffies),
2441 icsk->icsk_retransmits,
2442 sock_i_uid(sk),
2443 icsk->icsk_probes_out,
2444 sock_i_ino(sk),
2445 atomic_read(&sk->sk_refcnt), sk,
2446 jiffies_to_clock_t(icsk->icsk_rto),
2447 jiffies_to_clock_t(icsk->icsk_ack.ato),
2448 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2449 tp->snd_cwnd,
2450 tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2451 len);
2452 }
2453
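/* Format one /proc/net/tcp line for a TIME_WAIT socket. */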
2454 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2455 struct seq_file *f, int i, int *len)
2456 {
2457 __be32 dest, src;
2458 __u16 destp, srcp;
2459 int ttd = tw->tw_ttd - jiffies;
2460
2461 if (ttd < 0)
2462 ttd = 0;
2463
2464 dest = tw->tw_daddr;
2465 src = tw->tw_rcv_saddr;
2466 destp = ntohs(tw->tw_dport);
2467 srcp = ntohs(tw->tw_sport);
2468
2469 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2470 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2471 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2472 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2473 atomic_read(&tw->tw_refcnt), tw, len);
2474 }
2475
2476 #define TMPSZ 150
2477
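/* seq_file ->show(): print the header line for the start token,
 * otherwise dispatch on the iterator state to the formatters above.
 */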
2478 static int tcp4_seq_show(struct seq_file *seq, void *v)
2479 {
2480 struct tcp_iter_state *st;
2481 int len;
2482
2483 if (v == SEQ_START_TOKEN) {
2484 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2485 " sl local_address rem_address st tx_queue "
2486 "rx_queue tr tm->when retrnsmt uid timeout "
2487 "inode");
2488 goto out;
2489 }
2490 st = seq->private;
2491
2492 switch (st->state) {
2493 case TCP_SEQ_STATE_LISTENING:
2494 case TCP_SEQ_STATE_ESTABLISHED:
2495 get_tcp4_sock(v, seq, st->num, &len);
2496 break;
2497 case TCP_SEQ_STATE_OPENREQ:
2498 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2499 break;
2500 case TCP_SEQ_STATE_TIME_WAIT:
2501 get_timewait4_sock(v, seq, st->num, &len);
2502 break;
2503 }
2504 seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2505 out:
2506 return 0;
2507 }
2508
2509 static const struct file_operations tcp_afinfo_seq_fops = {
2510 .owner = THIS_MODULE,
2511 .open = tcp_seq_open,
2512 .read = seq_read,
2513 .llseek = seq_lseek,
2514 .release = seq_release_net
2515 };
2516
2517 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2518 .name = "tcp",
2519 .family = AF_INET,
2520 .seq_fops = &tcp_afinfo_seq_fops,
2521 .seq_ops = {
2522 .show = tcp4_seq_show,
2523 },
2524 };
2525
2526 static int __net_init tcp4_proc_init_net(struct net *net)
2527 {
2528 return tcp_proc_register(net, &tcp4_seq_afinfo);
2529 }
2530
2531 static void __net_exit tcp4_proc_exit_net(struct net *net)
2532 {
2533 tcp_proc_unregister(net, &tcp4_seq_afinfo);
2534 }
2535
2536 static struct pernet_operations tcp4_net_ops = {
2537 .init = tcp4_proc_init_net,
2538 .exit = tcp4_proc_exit_net,
2539 };
2540
2541 int __init tcp4_proc_init(void)
2542 {
2543 return register_pernet_subsys(&tcp4_net_ops);
2544 }
2545
2546 void tcp4_proc_exit(void)
2547 {
2548 unregister_pernet_subsys(&tcp4_net_ops);
2549 }
2550 #endif /* CONFIG_PROC_FS */
2551
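/* GRO receive hook for IPv4/TCP: verify the TCP checksum when a
 * complete hardware checksum is available, flush segments whose
 * checksum fails or is missing, then hand off to the generic
 * tcp_gro_receive().
 */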
2552 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2553 {
2554 const struct iphdr *iph = skb_gro_network_header(skb);
2555
2556 switch (skb->ip_summed) {
2557 case CHECKSUM_COMPLETE:
2558 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2559 skb->csum)) {
2560 skb->ip_summed = CHECKSUM_UNNECESSARY;
2561 break;
2562 }
2563
2564 /* fall through */
2565 case CHECKSUM_NONE:
2566 NAPI_GRO_CB(skb)->flush = 1;
2567 return NULL;
2568 }
2569
2570 return tcp_gro_receive(head, skb);
2571 }
2572
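/* GRO complete hook for IPv4/TCP: restore the pseudo-header checksum
 * and mark the merged skb as TCPv4 GSO before the generic completion.
 */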
2573 int tcp4_gro_complete(struct sk_buff *skb)
2574 {
2575 const struct iphdr *iph = ip_hdr(skb);
2576 struct tcphdr *th = tcp_hdr(skb);
2577
2578 th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2579 iph->saddr, iph->daddr, 0);
2580 skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2581
2582 return tcp_gro_complete(skb);
2583 }
2584
2585 struct proto tcp_prot = {
2586 .name = "TCP",
2587 .owner = THIS_MODULE,
2588 .close = tcp_close,
2589 .connect = tcp_v4_connect,
2590 .disconnect = tcp_disconnect,
2591 .accept = inet_csk_accept,
2592 .ioctl = tcp_ioctl,
2593 .init = tcp_v4_init_sock,
2594 .destroy = tcp_v4_destroy_sock,
2595 .shutdown = tcp_shutdown,
2596 .setsockopt = tcp_setsockopt,
2597 .getsockopt = tcp_getsockopt,
2598 .recvmsg = tcp_recvmsg,
2599 .sendmsg = tcp_sendmsg,
2600 .sendpage = tcp_sendpage,
2601 .backlog_rcv = tcp_v4_do_rcv,
2602 .release_cb = tcp_release_cb,
2603 .hash = inet_hash,
2604 .unhash = inet_unhash,
2605 .get_port = inet_csk_get_port,
2606 .enter_memory_pressure = tcp_enter_memory_pressure,
2607 .sockets_allocated = &tcp_sockets_allocated,
2608 .orphan_count = &tcp_orphan_count,
2609 .memory_allocated = &tcp_memory_allocated,
2610 .memory_pressure = &tcp_memory_pressure,
2611 .sysctl_wmem = sysctl_tcp_wmem,
2612 .sysctl_rmem = sysctl_tcp_rmem,
2613 .max_header = MAX_TCP_HEADER,
2614 .obj_size = sizeof(struct tcp_sock),
2615 .slab_flags = SLAB_DESTROY_BY_RCU,
2616 .twsk_prot = &tcp_timewait_sock_ops,
2617 .rsk_prot = &tcp_request_sock_ops,
2618 .h.hashinfo = &tcp_hashinfo,
2619 .no_autobind = true,
2620 #ifdef CONFIG_COMPAT
2621 .compat_setsockopt = compat_tcp_setsockopt,
2622 .compat_getsockopt = compat_tcp_getsockopt,
2623 #endif
2624 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
2625 .init_cgroup = tcp_init_cgroup,
2626 .destroy_cgroup = tcp_destroy_cgroup,
2627 .proto_cgroup = tcp_proto_cgroup,
2628 #endif
2629 };
2630 EXPORT_SYMBOL(tcp_prot);
2631
2632 static int __net_init tcp_sk_init(struct net *net)
2633 {
2634 return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2635 PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2636 }
2637
2638 static void __net_exit tcp_sk_exit(struct net *net)
2639 {
2640 inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2641 }
2642
2643 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2644 {
2645 inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2646 }
2647
2648 static struct pernet_operations __net_initdata tcp_sk_ops = {
2649 .init = tcp_sk_init,
2650 .exit = tcp_sk_exit,
2651 .exit_batch = tcp_sk_exit_batch,
2652 };
2653
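/* Boot-time initialisation: set up the TCP hash tables and register
 * the per-namespace control sockets; failure here is fatal.
 */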
2654 void __init tcp_v4_init(void)
2655 {
2656 inet_hashinfo_init(&tcp_hashinfo);
2657 if (register_pernet_subsys(&tcp_sk_ops))
2658 panic("Failed to create the TCP control socket.\n");
2659 }