net/ipv6/ip6_output.c

   1 /*
   2  *      IPv6 output functions
   3  *      Linux INET6 implementation
   4  *
   5  *      Authors:
   6  *      Pedro Roque             <roque@di.fc.ul.pt>
   7  *
   8  *      $Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
   9  *
  10  *      Based on linux/net/ipv4/ip_output.c
  11  *
  12  *      This program is free software; you can redistribute it and/or
  13  *      modify it under the terms of the GNU General Public License
  14  *      as published by the Free Software Foundation; either version
  15  *      2 of the License, or (at your option) any later version.
  16  *
  17  *      Changes:
   18  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
  19  *                              extension headers are implemented.
  20  *                              route changes now work.
  21  *                              ip6_forward does not confuse sniffers.
  22  *                              etc.
  23  *
  24  *      H. von Brand    :       Added missing #include <linux/string.h>
  25  *      Imran Patel     :       frag id should be in NBO
  26  *      Kazunori MIYAZAWA @USAGI
  27  *                      :       add ip6_append_data and related functions
  28  *                              for datagram xmit
  29  */
  30
  31 #include <linux/errno.h>
  32 #include <linux/kernel.h>
  33 #include <linux/string.h>
  34 #include <linux/socket.h>
  35 #include <linux/net.h>
  36 #include <linux/netdevice.h>
  37 #include <linux/if_arp.h>
  38 #include <linux/in6.h>
  39 #include <linux/tcp.h>
  40 #include <linux/route.h>
  41 #include <linux/module.h>
  42
  43 #include <linux/netfilter.h>
  44 #include <linux/netfilter_ipv6.h>
  45
  46 #include <net/sock.h>
  47 #include <net/snmp.h>
  48
  49 #include <net/ipv6.h>
  50 #include <net/ndisc.h>
  51 #include <net/protocol.h>
  52 #include <net/ip6_route.h>
  53 #include <net/addrconf.h>
  54 #include <net/rawv6.h>
  55 #include <net/icmp.h>
  56 #include <net/xfrm.h>
  57 #include <net/checksum.h>
  58
  59 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
  60
  61 static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
  62 {
  63         static u32 ipv6_fragmentation_id = 1;
  64         static DEFINE_SPINLOCK(ip6_id_lock);
  65
  66         spin_lock_bh(&ip6_id_lock);
  67         fhdr->identification = htonl(ipv6_fragmentation_id);
  68         if (++ipv6_fragmentation_id == 0)
  69                 ipv6_fragmentation_id = 1;
  70         spin_unlock_bh(&ip6_id_lock);
  71 }
  72
  73 int __ip6_local_out(struct sk_buff *skb)
  74 {
  75         int len;
  76
  77         len = skb->len - sizeof(struct ipv6hdr);
  78         if (len > IPV6_MAXPLEN)
  79                 len = 0;
  80         ipv6_hdr(skb)->payload_len = htons(len);
  81
  82         return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev,
  83                        dst_output);
  84 }
  85
  86 int ip6_local_out(struct sk_buff *skb)
  87 {
  88         int err;
  89
  90         err = __ip6_local_out(skb);
  91         if (likely(err == 1))
  92                 err = dst_output(skb);
  93
  94         return err;
  95 }
  96 EXPORT_SYMBOL_GPL(ip6_local_out);
  97
  98 static int ip6_output_finish(struct sk_buff *skb)
  99 {
 100         struct dst_entry *dst = skb->dst;
 101
 102         if (dst->hh)
 103                 return neigh_hh_output(dst->hh, skb);
 104         else if (dst->neighbour)
 105                 return dst->neighbour->output(skb);
 106
 107         IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 108         kfree_skb(skb);
 109         return -EINVAL;
 110
 111 }
 112
 113 /* dev_loopback_xmit for use with netfilter. */
 114 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
 115 {
 116         skb_reset_mac_header(newskb);
 117         __skb_pull(newskb, skb_network_offset(newskb));
 118         newskb->pkt_type = PACKET_LOOPBACK;
 119         newskb->ip_summed = CHECKSUM_UNNECESSARY;
 120         BUG_TRAP(newskb->dst);
 121
 122         netif_rx(newskb);
 123         return 0;
 124 }
 125
 126
 127 static int ip6_output2(struct sk_buff *skb)
 128 {
 129         struct dst_entry *dst = skb->dst;
 130         struct net_device *dev = dst->dev;
 131
 132         skb->protocol = htons(ETH_P_IPV6);
 133         skb->dev = dev;
 134
 135         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
 136                 struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
 137                 struct inet6_dev *idev = ip6_dst_idev(skb->dst);
 138
 139                 if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
 140                     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
 141                                         &ipv6_hdr(skb)->saddr)) {
 142                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 143
 144                         /* Do not check for IFF_ALLMULTI; multicast routing
 145                            is not supported in any case.
 146                          */
 147                         if (newskb)
 148                                 NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
 149                                         NULL, newskb->dev,
 150                                         ip6_dev_loopback_xmit);
 151
 152                         if (ipv6_hdr(skb)->hop_limit == 0) {
 153                                 IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS);
 154                                 kfree_skb(skb);
 155                                 return 0;
 156                         }
 157                 }
 158
 159                 IP6_INC_STATS(idev, IPSTATS_MIB_OUTMCASTPKTS);
 160         }
 161
 162         return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
 163                        ip6_output_finish);
 164 }
 165
 166 static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
 167 {
 168         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
 169
 170         return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
 171                skb->dst->dev->mtu : dst_mtu(skb->dst);
 172 }
 173
 174 int ip6_output(struct sk_buff *skb)
 175 {
 176         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
 177                                 dst_allfrag(skb->dst))
 178                 return ip6_fragment(skb, ip6_output2);
 179         else
 180                 return ip6_output2(skb);
 181 }
 182
 183 /*
 184  *      xmit an sk_buff (used by TCP)
 185  */
 186
 187 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
 188              struct ipv6_txoptions *opt, int ipfragok)
 189 {
 190         struct ipv6_pinfo *np = inet6_sk(sk);
 191         struct in6_addr *first_hop = &fl->fl6_dst;
 192         struct dst_entry *dst = skb->dst;
 193         struct ipv6hdr *hdr;
 194         u8  proto = fl->proto;
 195         int seg_len = skb->len;
 196         int hlimit, tclass;
 197         u32 mtu;
 198
 199         if (opt) {
 200                 unsigned int head_room;
 201
 202                 /* First: exthdrs may take lots of space (~8K for now)
 203                    MAX_HEADER is not enough.
 204                  */
 205                 head_room = opt->opt_nflen + opt->opt_flen;
 206                 seg_len += head_room;
 207                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
 208
 209                 if (skb_headroom(skb) < head_room) {
 210                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
 211                         if (skb2 == NULL) {
 212                                 IP6_INC_STATS(ip6_dst_idev(skb->dst),
 213                                               IPSTATS_MIB_OUTDISCARDS);
 214                                 kfree_skb(skb);
 215                                 return -ENOBUFS;
 216                         }
 217                         kfree_skb(skb);
 218                         skb = skb2;
 219                         if (sk)
 220                                 skb_set_owner_w(skb, sk);
 221                 }
 222                 if (opt->opt_flen)
 223                         ipv6_push_frag_opts(skb, opt, &proto);
 224                 if (opt->opt_nflen)
 225                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
 226         }
 227
 228         skb_push(skb, sizeof(struct ipv6hdr));
 229         skb_reset_network_header(skb);
 230         hdr = ipv6_hdr(skb);
 231
 232         /*
 233          *      Fill in the IPv6 header
 234          */
 235
 236         hlimit = -1;
 237         if (np)
 238                 hlimit = np->hop_limit;
 239         if (hlimit < 0)
 240                 hlimit = ip6_dst_hoplimit(dst);
 241
 242         tclass = -1;
 243         if (np)
 244                 tclass = np->tclass;
 245         if (tclass < 0)
 246                 tclass = 0;
 247
 248         *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
 249
 250         hdr->payload_len = htons(seg_len);
 251         hdr->nexthdr = proto;
 252         hdr->hop_limit = hlimit;
 253
 254         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
 255         ipv6_addr_copy(&hdr->daddr, first_hop);
 256
 257         skb->priority = sk->sk_priority;
 258         skb->mark = sk->sk_mark;
 259
 260         mtu = dst_mtu(dst);
 261         if ((skb->len <= mtu) || ipfragok || skb_is_gso(skb)) {
 262                 IP6_INC_STATS(ip6_dst_idev(skb->dst),
 263                               IPSTATS_MIB_OUTREQUESTS);
 264                 return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
 265                                 dst_output);
 266         }
 267
 268         if (net_ratelimit())
 269                 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
 270         skb->dev = dst->dev;
 271         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
 272         IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
 273         kfree_skb(skb);
 274         return -EMSGSIZE;
 275 }
 276
 277 EXPORT_SYMBOL(ip6_xmit);
 278
 279 /*
 280  *      To avoid extra problems ND packets are send through this
 281  *      routine. It's code duplication but I really want to avoid
 282  *      extra checks since ipv6_build_header is used by TCP (which
 283  *      is for us performance critical)
 284  */
 285
 286 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
 287                struct in6_addr *saddr, struct in6_addr *daddr,
 288                int proto, int len)
 289 {
 290         struct ipv6_pinfo *np = inet6_sk(sk);
 291         struct ipv6hdr *hdr;
 292         int totlen;
 293
 294         skb->protocol = htons(ETH_P_IPV6);
 295         skb->dev = dev;
 296
 297         totlen = len + sizeof(struct ipv6hdr);
 298
 299         skb_reset_network_header(skb);
 300         skb_put(skb, sizeof(struct ipv6hdr));
 301         hdr = ipv6_hdr(skb);
 302
 303         *(__be32*)hdr = htonl(0x60000000);
 304
 305         hdr->payload_len = htons(len);
 306         hdr->nexthdr = proto;
 307         hdr->hop_limit = np->hop_limit;
 308
 309         ipv6_addr_copy(&hdr->saddr, saddr);
 310         ipv6_addr_copy(&hdr->daddr, daddr);
 311
 312         return 0;
 313 }
 314
 315 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 316 {
 317         struct ip6_ra_chain *ra;
 318         struct sock *last = NULL;
 319
 320         read_lock(&ip6_ra_lock);
 321         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 322                 struct sock *sk = ra->sk;
 323                 if (sk && ra->sel == sel &&
 324                     (!sk->sk_bound_dev_if ||
 325                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
 326                         if (last) {
 327                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 328                                 if (skb2)
 329                                         rawv6_rcv(last, skb2);
 330                         }
 331                         last = sk;
 332                 }
 333         }
 334
 335         if (last) {
 336                 rawv6_rcv(last, skb);
 337                 read_unlock(&ip6_ra_lock);
 338                 return 1;
 339         }
 340         read_unlock(&ip6_ra_lock);
 341         return 0;
 342 }
 343
 344 static int ip6_forward_proxy_check(struct sk_buff *skb)
 345 {
 346         struct ipv6hdr *hdr = ipv6_hdr(skb);
 347         u8 nexthdr = hdr->nexthdr;
 348         int offset;
 349
 350         if (ipv6_ext_hdr(nexthdr)) {
 351                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
 352                 if (offset < 0)
 353                         return 0;
 354         } else
 355                 offset = sizeof(struct ipv6hdr);
 356
 357         if (nexthdr == IPPROTO_ICMPV6) {
 358                 struct icmp6hdr *icmp6;
 359
 360                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 361                                          offset + 1 - skb->data)))
 362                         return 0;
 363
 364                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 365
 366                 switch (icmp6->icmp6_type) {
 367                 case NDISC_ROUTER_SOLICITATION:
 368                 case NDISC_ROUTER_ADVERTISEMENT:
 369                 case NDISC_NEIGHBOUR_SOLICITATION:
 370                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 371                 case NDISC_REDIRECT:
 372                         /* For reaction involving unicast neighbor discovery
 373                          * message destined to the proxied address, pass it to
 374                          * input function.
 375                          */
 376                         return 1;
 377                 default:
 378                         break;
 379                 }
 380         }
 381
 382         /*
 383          * The proxying router can't forward traffic sent to a link-local
 384          * address, so signal the sender and discard the packet. This
 385          * behavior is clarified by the MIPv6 specification.
 386          */
 387         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 388                 dst_link_failure(skb);
 389                 return -1;
 390         }
 391
 392         return 0;
 393 }
 394
 395 static inline int ip6_forward_finish(struct sk_buff *skb)
 396 {
 397         return dst_output(skb);
 398 }
 399
 400 int ip6_forward(struct sk_buff *skb)
 401 {
 402         struct dst_entry *dst = skb->dst;
 403         struct ipv6hdr *hdr = ipv6_hdr(skb);
 404         struct inet6_skb_parm *opt = IP6CB(skb);
 405         struct net *net = dev_net(dst->dev);
 406
 407         if (ipv6_devconf.forwarding == 0)
 408                 goto error;
 409
 410         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 411                 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 412                 goto drop;
 413         }
 414
 415         skb_forward_csum(skb);
 416
 417         /*
 418          *      We DO NOT make any processing on
 419          *      RA packets, pushing them to user level AS IS
 420          *      without ane WARRANTY that application will be able
 421          *      to interpret them. The reason is that we
 422          *      cannot make anything clever here.
 423          *
 424          *      We are not end-node, so that if packet contains
 425          *      AH/ESP, we cannot make anything.
 426          *      Defragmentation also would be mistake, RA packets
 427          *      cannot be fragmented, because there is no warranty
 428          *      that different fragments will go along one path. --ANK
 429          */
 430         if (opt->ra) {
 431                 u8 *ptr = skb_network_header(skb) + opt->ra;
 432                 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
 433                         return 0;
 434         }
 435
 436         /*
 437          *      check and decrement ttl
 438          */
 439         if (hdr->hop_limit <= 1) {
 440                 /* Force OUTPUT device used as source address */
 441                 skb->dev = dst->dev;
 442                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
 443                             0, skb->dev);
 444                 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
 445
 446                 kfree_skb(skb);
 447                 return -ETIMEDOUT;
 448         }
 449
 450         /* XXX: idev->cnf.proxy_ndp? */
 451         if (ipv6_devconf.proxy_ndp &&
 452             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
 453                 int proxied = ip6_forward_proxy_check(skb);
 454                 if (proxied > 0)
 455                         return ip6_input(skb);
 456                 else if (proxied < 0) {
 457                         IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 458                         goto drop;
 459                 }
 460         }
 461
 462         if (!xfrm6_route_forward(skb)) {
 463                 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 464                 goto drop;
 465         }
 466         dst = skb->dst;
 467
 468         /* IPv6 specs say nothing about it, but it is clear that we cannot
 469            send redirects to source routed frames.
 470            We don't send redirects to frames decapsulated from IPsec.
 471          */
 472         if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
 473             !skb->sp) {
 474                 struct in6_addr *target = NULL;
 475                 struct rt6_info *rt;
 476                 struct neighbour *n = dst->neighbour;
 477
 478                 /*
 479                  *      incoming and outgoing devices are the same
 480                  *      send a redirect.
 481                  */
 482
 483                 rt = (struct rt6_info *) dst;
 484                 if ((rt->rt6i_flags & RTF_GATEWAY))
 485                         target = (struct in6_addr*)&n->primary_key;
 486                 else
 487                         target = &hdr->daddr;
 488
 489                 /* Limit redirects both by destination (here)
 490                    and by source (inside ndisc_send_redirect)
 491                  */
 492                 if (xrlim_allow(dst, 1*HZ))
 493                         ndisc_send_redirect(skb, n, target);
 494         } else {
 495                 int addrtype = ipv6_addr_type(&hdr->saddr);
 496
 497                 /* This check is security critical. */
 498                 if (addrtype & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK))
 499                         goto error;
 500                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
 501                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
 502                                 ICMPV6_NOT_NEIGHBOUR, 0, skb->dev);
 503                         goto error;
 504                 }
 505         }
 506
 507         if (skb->len > dst_mtu(dst)) {
 508                 /* Again, force OUTPUT device used as source address */
 509                 skb->dev = dst->dev;
 510                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
 511                 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
 512                 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
 513                 kfree_skb(skb);
 514                 return -EMSGSIZE;
 515         }
 516
 517         if (skb_cow(skb, dst->dev->hard_header_len)) {
 518                 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
 519                 goto drop;
 520         }
 521
 522         hdr = ipv6_hdr(skb);
 523
 524         /* Mangling hops number delayed to point after skb COW */
 525
 526         hdr->hop_limit--;
 527
 528         IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 529         return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
 530                        ip6_forward_finish);
 531
 532 error:
 533         IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
 534 drop:
 535         kfree_skb(skb);
 536         return -EINVAL;
 537 }
 538
 539 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 540 {
 541         to->pkt_type = from->pkt_type;
 542         to->priority = from->priority;
 543         to->protocol = from->protocol;
 544         dst_release(to->dst);
 545         to->dst = dst_clone(from->dst);
 546         to->dev = from->dev;
 547         to->mark = from->mark;
 548
 549 #ifdef CONFIG_NET_SCHED
 550         to->tc_index = from->tc_index;
 551 #endif
 552         nf_copy(to, from);
 553 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
 554     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
 555         to->nf_trace = from->nf_trace;
 556 #endif
 557         skb_copy_secmark(to, from);
 558 }
 559
 560 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
 561 {
 562         u16 offset = sizeof(struct ipv6hdr);
 563         struct ipv6_opt_hdr *exthdr =
 564                                 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
 565         unsigned int packet_len = skb->tail - skb->network_header;
 566         int found_rhdr = 0;
 567         *nexthdr = &ipv6_hdr(skb)->nexthdr;
 568
 569         while (offset + 1 <= packet_len) {
 570
 571                 switch (**nexthdr) {
 572
 573                 case NEXTHDR_HOP:
 574                         break;
 575                 case NEXTHDR_ROUTING:
 576                         found_rhdr = 1;
 577                         break;
 578                 case NEXTHDR_DEST:
 579 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 580                         if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
 581                                 break;
 582 #endif
 583                         if (found_rhdr)
 584                                 return offset;
 585                         break;
 586                 default :
 587                         return offset;
 588                 }
 589
 590                 offset += ipv6_optlen(exthdr);
 591                 *nexthdr = &exthdr->nexthdr;
 592                 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
 593                                                  offset);
 594         }
 595
 596         return offset;
 597 }
 598
 599 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 600 {
 601         struct net_device *dev;
 602         struct sk_buff *frag;
 603         struct rt6_info *rt = (struct rt6_info*)skb->dst;
 604         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
 605         struct ipv6hdr *tmp_hdr;
 606         struct frag_hdr *fh;
 607         unsigned int mtu, hlen, left, len;
 608         __be32 frag_id = 0;
 609         int ptr, offset = 0, err=0;
 610         u8 *prevhdr, nexthdr = 0;
 611
 612         dev = rt->u.dst.dev;
 613         hlen = ip6_find_1stfragopt(skb, &prevhdr);
 614         nexthdr = *prevhdr;
 615
 616         mtu = ip6_skb_dst_mtu(skb);
 617
 618         /* We must not fragment if the socket is set to force MTU discovery
 619          * or if the skb it not generated by a local socket.  (This last
 620          * check should be redundant, but it's free.)
 621          */
 622         if (!skb->local_df) {
 623                 skb->dev = skb->dst->dev;
 624                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
 625                 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
 626                 kfree_skb(skb);
 627                 return -EMSGSIZE;
 628         }
 629
 630         if (np && np->frag_size < mtu) {
 631                 if (np->frag_size)
 632                         mtu = np->frag_size;
 633         }
 634         mtu -= hlen + sizeof(struct frag_hdr);
 635
 636         if (skb_shinfo(skb)->frag_list) {
 637                 int first_len = skb_pagelen(skb);
 638                 int truesizes = 0;
 639
 640                 if (first_len - hlen > mtu ||
 641                     ((first_len - hlen) & 7) ||
 642                     skb_cloned(skb))
 643                         goto slow_path;
 644
 645                 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
 646                         /* Correct geometry. */
 647                         if (frag->len > mtu ||
 648                             ((frag->len & 7) && frag->next) ||
 649                             skb_headroom(frag) < hlen)
 650                             goto slow_path;
 651
 652                         /* Partially cloned skb? */
 653                         if (skb_shared(frag))
 654                                 goto slow_path;
 655
 656                         BUG_ON(frag->sk);
 657                         if (skb->sk) {
 658                                 sock_hold(skb->sk);
 659                                 frag->sk = skb->sk;
 660                                 frag->destructor = sock_wfree;
 661                                 truesizes += frag->truesize;
 662                         }
 663                 }
 664
 665                 err = 0;
 666                 offset = 0;
 667                 frag = skb_shinfo(skb)->frag_list;
 668                 skb_shinfo(skb)->frag_list = NULL;
 669                 /* BUILD HEADER */
 670
 671                 *prevhdr = NEXTHDR_FRAGMENT;
 672                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 673                 if (!tmp_hdr) {
 674                         IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
 675                         return -ENOMEM;
 676                 }
 677
 678                 __skb_pull(skb, hlen);
 679                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
 680                 __skb_push(skb, hlen);
 681                 skb_reset_network_header(skb);
 682                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
 683
 684                 ipv6_select_ident(skb, fh);
 685                 fh->nexthdr = nexthdr;
 686                 fh->reserved = 0;
 687                 fh->frag_off = htons(IP6_MF);
 688                 frag_id = fh->identification;
 689
 690                 first_len = skb_pagelen(skb);
 691                 skb->data_len = first_len - skb_headlen(skb);
 692                 skb->truesize -= truesizes;
 693                 skb->len = first_len;
 694                 ipv6_hdr(skb)->payload_len = htons(first_len -
 695                                                    sizeof(struct ipv6hdr));
 696
 697                 dst_hold(&rt->u.dst);
 698
 699                 for (;;) {
 700                         /* Prepare header of the next frame,
 701                          * before previous one went down. */
 702                         if (frag) {
 703                                 frag->ip_summed = CHECKSUM_NONE;
 704                                 skb_reset_transport_header(frag);
 705                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
 706                                 __skb_push(frag, hlen);
 707                                 skb_reset_network_header(frag);
 708                                 memcpy(skb_network_header(frag), tmp_hdr,
 709                                        hlen);
 710                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
 711                                 fh->nexthdr = nexthdr;
 712                                 fh->reserved = 0;
 713                                 fh->frag_off = htons(offset);
 714                                 if (frag->next != NULL)
 715                                         fh->frag_off |= htons(IP6_MF);
 716                                 fh->identification = frag_id;
 717                                 ipv6_hdr(frag)->payload_len =
 718                                                 htons(frag->len -
 719                                                       sizeof(struct ipv6hdr));
 720                                 ip6_copy_metadata(frag, skb);
 721                         }
 722
 723                         err = output(skb);
 724                         if(!err)
 725                                 IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGCREATES);
 726
 727                         if (err || !frag)
 728                                 break;
 729
 730                         skb = frag;
 731                         frag = skb->next;
 732                         skb->next = NULL;
 733                 }
 734
 735                 kfree(tmp_hdr);
 736
 737                 if (err == 0) {
 738                         IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGOKS);
 739                         dst_release(&rt->u.dst);
 740                         return 0;
 741                 }
 742
 743                 while (frag) {
 744                         skb = frag->next;
 745                         kfree_skb(frag);
 746                         frag = skb;
 747                 }
 748
 749                 IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGFAILS);
 750                 dst_release(&rt->u.dst);
 751                 return err;
 752         }
 753
 754 slow_path:
 755         left = skb->len - hlen;         /* Space per frame */
 756         ptr = hlen;                     /* Where to start from */
 757
 758         /*
 759          *      Fragment the datagram.
 760          */
 761
 762         *prevhdr = NEXTHDR_FRAGMENT;
 763
 764         /*
 765          *      Keep copying data until we run out.
 766          */
 767         while(left > 0) {
 768                 len = left;
 769                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 770                 if (len > mtu)
 771                         len = mtu;
 772                 /* IF: we are not sending upto and including the packet end
 773                    then align the next start on an eight byte boundary */
 774                 if (len < left) {
 775                         len &= ~7;
 776                 }
 777                 /*
 778                  *      Allocate buffer.
 779                  */
 780
 781                 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
 782                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
 783                         IP6_INC_STATS(ip6_dst_idev(skb->dst),
 784                                       IPSTATS_MIB_FRAGFAILS);
 785                         err = -ENOMEM;
 786                         goto fail;
 787                 }
 788
 789                 /*
 790                  *      Set up data on packet
 791                  */
 792
 793                 ip6_copy_metadata(frag, skb);
 794                 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
 795                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
 796                 skb_reset_network_header(frag);
 797                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
 798                 frag->transport_header = (frag->network_header + hlen +
 799                                           sizeof(struct frag_hdr));
 800
 801                 /*
 802                  *      Charge the memory for the fragment to any owner
 803                  *      it might possess
 804                  */
 805                 if (skb->sk)
 806                         skb_set_owner_w(frag, skb->sk);
 807
 808                 /*
 809                  *      Copy the packet header into the new buffer.
 810                  */
 811                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
 812
 813                 /*
 814                  *      Build fragment header.
 815                  */
 816                 fh->nexthdr = nexthdr;
 817                 fh->reserved = 0;
 818                 if (!frag_id) {
 819                         ipv6_select_ident(skb, fh);
 820                         frag_id = fh->identification;
 821                 } else
 822                         fh->identification = frag_id;
 823
 824                 /*
 825                  *      Copy a block of the IP datagram.
 826                  */
 827                 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
 828                         BUG();
 829                 left -= len;
 830
 831                 fh->frag_off = htons(offset);
 832                 if (left > 0)
 833                         fh->frag_off |= htons(IP6_MF);
 834                 ipv6_hdr(frag)->payload_len = htons(frag->len -
 835                                                     sizeof(struct ipv6hdr));
 836
 837                 ptr += len;
 838                 offset += len;
 839
 840                 /*
 841                  *      Put this fragment into the sending queue.
 842                  */
 843                 err = output(frag);
 844                 if (err)
 845                         goto fail;
 846
 847                 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGCREATES);
 848         }
 849         IP6_INC_STATS(ip6_dst_idev(skb->dst),
 850                       IPSTATS_MIB_FRAGOKS);
 851         kfree_skb(skb);
 852         return err;
 853
 854 fail:
 855         IP6_INC_STATS(ip6_dst_idev(skb->dst),
 856                       IPSTATS_MIB_FRAGFAILS);
 857         kfree_skb(skb);
 858         return err;
 859 }
 860
 861 static inline int ip6_rt_check(struct rt6key *rt_key,
 862                                struct in6_addr *fl_addr,
 863                                struct in6_addr *addr_cache)
 864 {
 865         return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
 866                 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
 867 }
 868
 869 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
 870                                           struct dst_entry *dst,
 871                                           struct flowi *fl)
 872 {
 873         struct ipv6_pinfo *np = inet6_sk(sk);
 874         struct rt6_info *rt = (struct rt6_info *)dst;
 875
 876         if (!dst)
 877                 goto out;
 878
 879         /* Yes, checking route validity in not connected
 880          * case is not very simple. Take into account,
 881          * that we do not support routing by source, TOS,
 882          * and MSG_DONTROUTE            --ANK (980726)
 883          *
 884          * 1. ip6_rt_check(): If route was host route,
 885          *    check that cached destination is current.
 886          *    If it is network route, we still may
 887          *    check its validity using saved pointer
 888          *    to the last used address: daddr_cache.
 889          *    We do not want to save whole address now,
 890          *    (because main consumer of this service
 891          *    is tcp, which has not this problem),
 892          *    so that the last trick works only on connected
 893          *    sockets.
 894          * 2. oif also should be the same.
 895          */
 896         if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
 897 #ifdef CONFIG_IPV6_SUBTREES
 898             ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
 899 #endif
 900             (fl->oif && fl->oif != dst->dev->ifindex)) {
 901                 dst_release(dst);
 902                 dst = NULL;
 903         }
 904
 905 out:
 906         return dst;
 907 }
 908
 909 static int ip6_dst_lookup_tail(struct sock *sk,
 910                                struct dst_entry **dst, struct flowi *fl)
 911 {
 912         int err;
 913         struct net *net = sock_net(sk);
 914
 915         if (*dst == NULL)
 916                 *dst = ip6_route_output(net, sk, fl);
 917
 918         if ((err = (*dst)->error))
 919                 goto out_err_release;
 920
 921         if (ipv6_addr_any(&fl->fl6_src)) {
 922                 err = ipv6_dev_get_saddr(ip6_dst_idev(*dst)->dev,
 923                                          &fl->fl6_dst,
 924                                          sk ? inet6_sk(sk)->srcprefs : 0,
 925                                          &fl->fl6_src);
 926                 if (err)
 927                         goto out_err_release;
 928         }
 929
 930 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 931                 /*
 932                  * Here if the dst entry we've looked up
 933                  * has a neighbour entry that is in the INCOMPLETE
 934                  * state and the src address from the flow is
 935                  * marked as OPTIMISTIC, we release the found
 936                  * dst entry and replace it instead with the
 937                  * dst entry of the nexthop router
 938                  */
 939                 if (!((*dst)->neighbour->nud_state & NUD_VALID)) {
 940                         struct inet6_ifaddr *ifp;
 941                         struct flowi fl_gw;
 942                         int redirect;
 943
 944                         ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
 945                                               (*dst)->dev, 1);
 946
 947                         redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
 948                         if (ifp)
 949                                 in6_ifa_put(ifp);
 950
 951                         if (redirect) {
 952                                 /*
 953                                  * We need to get the dst entry for the
 954                                  * default router instead
 955                                  */
 956                                 dst_release(*dst);
 957                                 memcpy(&fl_gw, fl, sizeof(struct flowi));
 958                                 memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
 959                                 *dst = ip6_route_output(net, sk, &fl_gw);
 960                                 if ((err = (*dst)->error))
 961                                         goto out_err_release;
 962                         }
 963                 }
 964 #endif
 965
 966         return 0;
 967
 968 out_err_release:
 969         if (err == -ENETUNREACH)
 970                 IP6_INC_STATS_BH(NULL, IPSTATS_MIB_OUTNOROUTES);
 971         dst_release(*dst);
 972         *dst = NULL;
 973         return err;
 974 }
 975
 976 /**
 977  *      ip6_dst_lookup - perform route lookup on flow
 978  *      @sk: socket which provides route info
 979  *      @dst: pointer to dst_entry * for result
 980  *      @fl: flow to lookup
 981  *
 982  *      This function performs a route lookup on the given flow.
 983  *
 984  *      It returns zero on success, or a standard errno code on error.
 985  */
 986 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
 987 {
 988         *dst = NULL;
 989         return ip6_dst_lookup_tail(sk, dst, fl);
 990 }
 991 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
 992
 993 /**
 994  *      ip6_sk_dst_lookup - perform socket cached route lookup on flow
 995  *      @sk: socket which provides the dst cache and route info
 996  *      @dst: pointer to dst_entry * for result
 997  *      @fl: flow to lookup
 998  *
 999  *      This function performs a route lookup on the given flow with the
1000  *      possibility of using the cached route in the socket if it is valid.
1001  *      It will take the socket dst lock when operating on the dst cache.
1002  *      As a result, this function can only be used in process context.
1003  *
1004  *      It returns zero on success, or a standard errno code on error.
1005  */
1006 int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1007 {
1008         *dst = NULL;
1009         if (sk) {
1010                 *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1011                 *dst = ip6_sk_dst_check(sk, *dst, fl);
1012         }
1013
1014         return ip6_dst_lookup_tail(sk, dst, fl);
1015 }
1016 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
1017
1018 static inline int ip6_ufo_append_data(struct sock *sk,
1019                         int getfrag(void *from, char *to, int offset, int len,
1020                         int odd, struct sk_buff *skb),
1021                         void *from, int length, int hh_len, int fragheaderlen,
1022                         int transhdrlen, int mtu,unsigned int flags)
1023
1024 {
1025         struct sk_buff *skb;
1026         int err;
1027
1028         /* There is support for UDP large send offload by network
1029          * device, so create one single skb packet containing complete
1030          * udp datagram
1031          */
1032         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1033                 skb = sock_alloc_send_skb(sk,
1034                         hh_len + fragheaderlen + transhdrlen + 20,
1035                         (flags & MSG_DONTWAIT), &err);
1036                 if (skb == NULL)
1037                         return -ENOMEM;
1038
1039                 /* reserve space for Hardware header */
1040                 skb_reserve(skb, hh_len);
1041
1042                 /* create space for UDP/IP header */
1043                 skb_put(skb,fragheaderlen + transhdrlen);
1044
1045                 /* initialize network header pointer */
1046                 skb_reset_network_header(skb);
1047
1048                 /* initialize protocol header pointer */
1049                 skb->transport_header = skb->network_header + fragheaderlen;
1050
1051                 skb->ip_summed = CHECKSUM_PARTIAL;
1052                 skb->csum = 0;
1053                 sk->sk_sndmsg_off = 0;
1054         }
1055
1056         err = skb_append_datato_frags(sk,skb, getfrag, from,
1057                                       (length - transhdrlen));
1058         if (!err) {
1059                 struct frag_hdr fhdr;
1060
1061                 /* specify the length of each IP datagram fragment*/
1062                 skb_shinfo(skb)->gso_size = mtu - fragheaderlen -
1063                                             sizeof(struct frag_hdr);
1064                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1065                 ipv6_select_ident(skb, &fhdr);
1066                 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1067                 __skb_queue_tail(&sk->sk_write_queue, skb);
1068
1069                 return 0;
1070         }
1071         /* There is not enough support do UPD LSO,
1072          * so follow normal path
1073          */
1074         kfree_skb(skb);
1075
1076         return err;
1077 }
1078
1079 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1080         int offset, int len, int odd, struct sk_buff *skb),
1081         void *from, int length, int transhdrlen,
1082         int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
1083         struct rt6_info *rt, unsigned int flags)
1084 {
1085         struct inet_sock *inet = inet_sk(sk);
1086         struct ipv6_pinfo *np = inet6_sk(sk);
1087         struct sk_buff *skb;
1088         unsigned int maxfraglen, fragheaderlen;
1089         int exthdrlen;
1090         int hh_len;
1091         int mtu;
1092         int copy;
1093         int err;
1094         int offset = 0;
1095         int csummode = CHECKSUM_NONE;
1096
1097         if (flags&MSG_PROBE)
1098                 return 0;
1099         if (skb_queue_empty(&sk->sk_write_queue)) {
1100                 /*
1101                  * setup for corking
1102                  */
1103                 if (opt) {
1104                         if (np->cork.opt == NULL) {
1105                                 np->cork.opt = kmalloc(opt->tot_len,
1106                                                        sk->sk_allocation);
1107                                 if (unlikely(np->cork.opt == NULL))
1108                                         return -ENOBUFS;
1109                         } else if (np->cork.opt->tot_len < opt->tot_len) {
1110                                 printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
1111                                 return -EINVAL;
1112                         }
1113                         memcpy(np->cork.opt, opt, opt->tot_len);
1114                         inet->cork.flags |= IPCORK_OPT;
1115                         /* need source address above miyazawa*/
1116                 }
1117                 dst_hold(&rt->u.dst);
1118                 inet->cork.dst = &rt->u.dst;
1119                 inet->cork.fl = *fl;
1120                 np->cork.hop_limit = hlimit;
1121                 np->cork.tclass = tclass;
1122                 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1123                       rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
1124                 if (np->frag_size < mtu) {
1125                         if (np->frag_size)
1126                                 mtu = np->frag_size;
1127                 }
1128                 inet->cork.fragsize = mtu;
1129                 if (dst_allfrag(rt->u.dst.path))
1130                         inet->cork.flags |= IPCORK_ALLFRAG;
1131                 inet->cork.length = 0;
1132                 sk->sk_sndmsg_page = NULL;
1133                 sk->sk_sndmsg_off = 0;
1134                 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
1135                             rt->rt6i_nfheader_len;
1136                 length += exthdrlen;
1137                 transhdrlen += exthdrlen;
1138         } else {
1139                 rt = (struct rt6_info *)inet->cork.dst;
1140                 fl = &inet->cork.fl;
1141                 if (inet->cork.flags & IPCORK_OPT)
1142                         opt = np->cork.opt;
1143                 transhdrlen = 0;
1144                 exthdrlen = 0;
1145                 mtu = inet->cork.fragsize;
1146         }
1147
1148         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1149
1150         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1151                         (opt ? opt->opt_nflen : 0);
1152         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1153
1154         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1155                 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1156                         ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
1157                         return -EMSGSIZE;
1158                 }
1159         }
1160
1161         /*
1162          * Let's try using as much space as possible.
1163          * Use MTU if total length of the message fits into the MTU.
1164          * Otherwise, we need to reserve fragment header and
1165          * fragment alignment (= 8-15 octects, in total).
1166          *
1167          * Note that we may need to "move" the data from the tail of
1168          * of the buffer to the new fragment when we split
1169          * the message.
1170          *
1171          * FIXME: It may be fragmented into multiple chunks
1172          *        at once if non-fragmentable extension headers
1173          *        are too large.
1174          * --yoshfuji
1175          */
1176
1177         inet->cork.length += length;
1178         if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
1179             (rt->u.dst.dev->features & NETIF_F_UFO)) {
1180
1181                 err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
1182                                           fragheaderlen, transhdrlen, mtu,
1183                                           flags);
1184                 if (err)
1185                         goto error;
1186                 return 0;
1187         }
1188
1189         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1190                 goto alloc_new_skb;
1191
1192         while (length > 0) {
1193                 /* Check if the remaining data fits into current packet. */
1194                 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1195                 if (copy < length)
1196                         copy = maxfraglen - skb->len;
1197
1198                 if (copy <= 0) {
1199                         char *data;
1200                         unsigned int datalen;
1201                         unsigned int fraglen;
1202                         unsigned int fraggap;
1203                         unsigned int alloclen;
1204                         struct sk_buff *skb_prev;
1205 alloc_new_skb:
1206                         skb_prev = skb;
1207
1208                         /* There's no room in the current skb */
1209                         if (skb_prev)
1210                                 fraggap = skb_prev->len - maxfraglen;
1211                         else
1212                                 fraggap = 0;
1213
1214                         /*
1215                          * If remaining data exceeds the mtu,
1216                          * we know we need more fragment(s).
1217                          */
1218                         datalen = length + fraggap;
1219                         if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1220                                 datalen = maxfraglen - fragheaderlen;
1221
1222                         fraglen = datalen + fragheaderlen;
1223                         if ((flags & MSG_MORE) &&
1224                             !(rt->u.dst.dev->features&NETIF_F_SG))
1225                                 alloclen = mtu;
1226                         else
1227                                 alloclen = datalen + fragheaderlen;
1228
1229                         /*
1230                          * The last fragment gets additional space at tail.
1231                          * Note: we overallocate on fragments with MSG_MODE
1232                          * because we have no idea if we're the last one.
1233                          */
1234                         if (datalen == length + fraggap)
1235                                 alloclen += rt->u.dst.trailer_len;
1236
1237                         /*
1238                          * We just reserve space for fragment header.
1239                          * Note: this may be overallocation if the message
1240                          * (without MSG_MORE) fits into the MTU.
1241                          */
1242                         alloclen += sizeof(struct frag_hdr);
1243
1244                         if (transhdrlen) {
1245                                 skb = sock_alloc_send_skb(sk,
1246                                                 alloclen + hh_len,
1247                                                 (flags & MSG_DONTWAIT), &err);
1248                         } else {
1249                                 skb = NULL;
1250                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1251                                     2 * sk->sk_sndbuf)
1252                                         skb = sock_wmalloc(sk,
1253                                                            alloclen + hh_len, 1,
1254                                                            sk->sk_allocation);
1255                                 if (unlikely(skb == NULL))
1256                                         err = -ENOBUFS;
1257                         }
1258                         if (skb == NULL)
1259                                 goto error;
1260                         /*
1261                          *      Fill in the control structures
1262                          */
1263                         skb->ip_summed = csummode;
1264                         skb->csum = 0;
1265                         /* reserve for fragmentation */
1266                         skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1267
1268                         /*
1269                          *      Find where to start putting bytes
1270                          */
1271                         data = skb_put(skb, fraglen);
1272                         skb_set_network_header(skb, exthdrlen);
1273                         data += fragheaderlen;
1274                         skb->transport_header = (skb->network_header +
1275                                                  fragheaderlen);
1276                         if (fraggap) {
1277                                 skb->csum = skb_copy_and_csum_bits(
1278                                         skb_prev, maxfraglen,
1279                                         data + transhdrlen, fraggap, 0);
1280                                 skb_prev->csum = csum_sub(skb_prev->csum,
1281                                                           skb->csum);
1282                                 data += fraggap;
1283                                 pskb_trim_unique(skb_prev, maxfraglen);
1284                         }
1285                         copy = datalen - transhdrlen - fraggap;
1286                         if (copy < 0) {
1287                                 err = -EINVAL;
1288                                 kfree_skb(skb);
1289                                 goto error;
1290                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1291                                 err = -EFAULT;
1292                                 kfree_skb(skb);
1293                                 goto error;
1294                         }
1295
1296                         offset += copy;
1297                         length -= datalen - fraggap;
1298                         transhdrlen = 0;
1299                         exthdrlen = 0;
1300                         csummode = CHECKSUM_NONE;
1301
1302                         /*
1303                          * Put the packet on the pending queue
1304                          */
1305                         __skb_queue_tail(&sk->sk_write_queue, skb);
1306                         continue;
1307                 }
1308
1309                 if (copy > length)
1310                         copy = length;
1311
1312                 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1313                         unsigned int off;
1314
1315                         off = skb->len;
1316                         if (getfrag(from, skb_put(skb, copy),
1317                                                 offset, copy, off, skb) < 0) {
1318                                 __skb_trim(skb, off);
1319                                 err = -EFAULT;
1320                                 goto error;
1321                         }
1322                 } else {
1323                         int i = skb_shinfo(skb)->nr_frags;
1324                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1325                         struct page *page = sk->sk_sndmsg_page;
1326                         int off = sk->sk_sndmsg_off;
1327                         unsigned int left;
1328
1329                         if (page && (left = PAGE_SIZE - off) > 0) {
1330                                 if (copy >= left)
1331                                         copy = left;
1332                                 if (page != frag->page) {
1333                                         if (i == MAX_SKB_FRAGS) {
1334                                                 err = -EMSGSIZE;
1335                                                 goto error;
1336                                         }
1337                                         get_page(page);
1338                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1339                                         frag = &skb_shinfo(skb)->frags[i];
1340                                 }
1341                         } else if(i < MAX_SKB_FRAGS) {
1342                                 if (copy > PAGE_SIZE)
1343                                         copy = PAGE_SIZE;
1344                                 page = alloc_pages(sk->sk_allocation, 0);
1345                                 if (page == NULL) {
1346                                         err = -ENOMEM;
1347                                         goto error;
1348                                 }
1349                                 sk->sk_sndmsg_page = page;
1350                                 sk->sk_sndmsg_off = 0;
1351
1352                                 skb_fill_page_desc(skb, i, page, 0, 0);
1353                                 frag = &skb_shinfo(skb)->frags[i];
1354                         } else {
1355                                 err = -EMSGSIZE;
1356                                 goto error;
1357                         }
1358                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1359                                 err = -EFAULT;
1360                                 goto error;
1361                         }
1362                         sk->sk_sndmsg_off += copy;
1363                         frag->size += copy;
1364                         skb->len += copy;
1365                         skb->data_len += copy;
1366                         skb->truesize += copy;
1367                         atomic_add(copy, &sk->sk_wmem_alloc);
1368                 }
1369                 offset += copy;
1370                 length -= copy;
1371         }
1372         return 0;
1373 error:
1374         inet->cork.length -= length;
1375         IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1376         return err;
1377 }
1378
1379 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1380 {
1381         inet->cork.flags &= ~IPCORK_OPT;
1382         kfree(np->cork.opt);
1383         np->cork.opt = NULL;
1384         if (inet->cork.dst) {
1385                 dst_release(inet->cork.dst);
1386                 inet->cork.dst = NULL;
1387                 inet->cork.flags &= ~IPCORK_ALLFRAG;
1388         }
1389         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1390 }
1391
1392 int ip6_push_pending_frames(struct sock *sk)
1393 {
1394         struct sk_buff *skb, *tmp_skb;
1395         struct sk_buff **tail_skb;
1396         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1397         struct inet_sock *inet = inet_sk(sk);
1398         struct ipv6_pinfo *np = inet6_sk(sk);
1399         struct ipv6hdr *hdr;
1400         struct ipv6_txoptions *opt = np->cork.opt;
1401         struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
1402         struct flowi *fl = &inet->cork.fl;
1403         unsigned char proto = fl->proto;
1404         int err = 0;
1405
1406         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1407                 goto out;
1408         tail_skb = &(skb_shinfo(skb)->frag_list);
1409
1410         /* move skb->data to ip header from ext header */
1411         if (skb->data < skb_network_header(skb))
1412                 __skb_pull(skb, skb_network_offset(skb));
1413         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1414                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1415                 *tail_skb = tmp_skb;
1416                 tail_skb = &(tmp_skb->next);
1417                 skb->len += tmp_skb->len;
1418                 skb->data_len += tmp_skb->len;
1419                 skb->truesize += tmp_skb->truesize;
1420                 __sock_put(tmp_skb->sk);
1421                 tmp_skb->destructor = NULL;
1422                 tmp_skb->sk = NULL;
1423         }
1424
1425         /* Allow local fragmentation. */
1426         if (np->pmtudisc < IPV6_PMTUDISC_DO)
1427                 skb->local_df = 1;
1428
1429         ipv6_addr_copy(final_dst, &fl->fl6_dst);
1430         __skb_pull(skb, skb_network_header_len(skb));
1431         if (opt && opt->opt_flen)
1432                 ipv6_push_frag_opts(skb, opt, &proto);
1433         if (opt && opt->opt_nflen)
1434                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1435
1436         skb_push(skb, sizeof(struct ipv6hdr));
1437         skb_reset_network_header(skb);
1438         hdr = ipv6_hdr(skb);
1439
1440         *(__be32*)hdr = fl->fl6_flowlabel |
1441                      htonl(0x60000000 | ((int)np->cork.tclass << 20));
1442
1443         hdr->hop_limit = np->cork.hop_limit;
1444         hdr->nexthdr = proto;
1445         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1446         ipv6_addr_copy(&hdr->daddr, final_dst);
1447
1448         skb->priority = sk->sk_priority;
1449         skb->mark = sk->sk_mark;
1450
1451         skb->dst = dst_clone(&rt->u.dst);
1452         IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
1453         if (proto == IPPROTO_ICMPV6) {
1454                 struct inet6_dev *idev = ip6_dst_idev(skb->dst);
1455
1456                 ICMP6MSGOUT_INC_STATS_BH(idev, icmp6_hdr(skb)->icmp6_type);
1457                 ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTMSGS);
1458         }
1459
1460         err = ip6_local_out(skb);
1461         if (err) {
1462                 if (err > 0)
1463                         err = np->recverr ? net_xmit_errno(err) : 0;
1464                 if (err)
1465                         goto error;
1466         }
1467
1468 out:
1469         ip6_cork_release(inet, np);
1470         return err;
1471 error:
1472         goto out;
1473 }
1474
1475 void ip6_flush_pending_frames(struct sock *sk)
1476 {
1477         struct sk_buff *skb;
1478
1479         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1480                 if (skb->dst)
1481                         IP6_INC_STATS(ip6_dst_idev(skb->dst),
1482                                       IPSTATS_MIB_OUTDISCARDS);
1483                 kfree_skb(skb);
1484         }
1485
1486         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1487 }