net/ipv6/ip6_output.c
1 /*
2 * IPv6 output functions
3 * Linux INET6 implementation
4 *
5 * Authors:
6 * Pedro Roque <roque@di.fc.ul.pt>
7 *
8 * Based on linux/net/ipv4/ip_output.c
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 *
15 * Changes:
16 * A.N.Kuznetsov : arithmetic in fragmentation.
17 * extension headers are implemented.
18 * route changes now work.
19 * ip6_forward does not confuse sniffers.
20 * etc.
21 *
22 * H. von Brand : Added missing #include <linux/string.h>
23 * Imran Patel : frag id should be in NBO
24 * Kazunori MIYAZAWA @USAGI
25 * : add ip6_append_data and related functions
26 * for datagram xmit
27 */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40
41 #include <linux/netfilter.h>
42 #include <linux/netfilter_ipv6.h>
43
44 #include <net/sock.h>
45 #include <net/snmp.h>
46
47 #include <net/ipv6.h>
48 #include <net/ndisc.h>
49 #include <net/protocol.h>
50 #include <net/ip6_route.h>
51 #include <net/addrconf.h>
52 #include <net/rawv6.h>
53 #include <net/icmp.h>
54 #include <net/xfrm.h>
55 #include <net/checksum.h>
56 #include <linux/mroute6.h>
57
58 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
59
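/*
 * Pick the Identification value for a fragment header from a single
 * global counter, serialized by a spinlock; the counter wraps but
 * skips zero.
 */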
60 static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
61 {
62 static u32 ipv6_fragmentation_id = 1;
63 static DEFINE_SPINLOCK(ip6_id_lock);
64
65 spin_lock_bh(&ip6_id_lock);
66 fhdr->identification = htonl(ipv6_fragmentation_id);
67 if (++ipv6_fragmentation_id == 0)
68 ipv6_fragmentation_id = 1;
69 spin_unlock_bh(&ip6_id_lock);
70 }
71
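/*
 * Fill in the IPv6 payload length (0 if it would not fit in the
 * 16-bit field) and run the NF_INET_LOCAL_OUT netfilter hook; the
 * caller passes the packet on to dst_output() when the hook verdict
 * allows it.
 */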
72 int __ip6_local_out(struct sk_buff *skb)
73 {
74 int len;
75
76 len = skb->len - sizeof(struct ipv6hdr);
77 if (len > IPV6_MAXPLEN)
78 len = 0;
79 ipv6_hdr(skb)->payload_len = htons(len);
80
81 return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev,
82 dst_output);
83 }
84
85 int ip6_local_out(struct sk_buff *skb)
86 {
87 int err;
88
89 err = __ip6_local_out(skb);
90 if (likely(err == 1))
91 err = dst_output(skb);
92
93 return err;
94 }
95 EXPORT_SYMBOL_GPL(ip6_local_out);
96
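/*
 * Final transmit step: use the cached hardware header if the route
 * has one, otherwise go through the neighbour output function; with
 * no usable neighbour the packet is counted as OUTNOROUTES and dropped.
 */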
97 static int ip6_output_finish(struct sk_buff *skb)
98 {
99 struct dst_entry *dst = skb->dst;
100
101 if (dst->hh)
102 return neigh_hh_output(dst->hh, skb);
103 else if (dst->neighbour)
104 return dst->neighbour->output(skb);
105
106 IP6_INC_STATS_BH(dev_net(dst->dev),
107 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
108 kfree_skb(skb);
109 return -EINVAL;
110
111 }
112
113 /* dev_loopback_xmit for use with netfilter. */
114 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
115 {
116 skb_reset_mac_header(newskb);
117 __skb_pull(newskb, skb_network_offset(newskb));
118 newskb->pkt_type = PACKET_LOOPBACK;
119 newskb->ip_summed = CHECKSUM_UNNECESSARY;
120 WARN_ON(!newskb->dst);
121
122 netif_rx(newskb);
123 return 0;
124 }
125
126
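/*
 * Second stage of output.  For multicast destinations a clone may be
 * looped back to the local stack (when this host listens on the group
 * or multicast routing needs a copy), and such a packet is discarded
 * if its hop limit is already zero.  Everything else continues through
 * the POST_ROUTING netfilter hook towards ip6_output_finish().
 */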
127 static int ip6_output2(struct sk_buff *skb)
128 {
129 struct dst_entry *dst = skb->dst;
130 struct net_device *dev = dst->dev;
131
132 skb->protocol = htons(ETH_P_IPV6);
133 skb->dev = dev;
134
135 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
136 struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
137 struct inet6_dev *idev = ip6_dst_idev(skb->dst);
138
139 if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
140 ((mroute6_socket(dev_net(dev)) &&
141 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
142 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
143 &ipv6_hdr(skb)->saddr))) {
144 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
145
146 /* Do not check for IFF_ALLMULTI; multicast routing
147 is not supported in any case.
148 */
149 if (newskb)
150 NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
151 NULL, newskb->dev,
152 ip6_dev_loopback_xmit);
153
154 if (ipv6_hdr(skb)->hop_limit == 0) {
155 IP6_INC_STATS(dev_net(dev), idev,
156 IPSTATS_MIB_OUTDISCARDS);
157 kfree_skb(skb);
158 return 0;
159 }
160 }
161
162 IP6_INC_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCASTPKTS);
163 }
164
165 return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
166 ip6_output_finish);
167 }
168
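/* MTU to apply to this skb: the device MTU when the sending socket does
 * its own PMTU probing (IPV6_PMTUDISC_PROBE), otherwise the route MTU. */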
169 static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
170 {
171 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
172
173 return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
174 skb->dst->dev->mtu : dst_mtu(skb->dst);
175 }
176
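/*
 * dst_output() entry point.  Discard the packet if IPv6 is disabled on
 * the outgoing device; fragment it when it exceeds the path MTU (and is
 * not GSO) or when the route requires all packets to be fragmented;
 * otherwise hand it straight to ip6_output2().
 */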
177 int ip6_output(struct sk_buff *skb)
178 {
179 struct inet6_dev *idev = ip6_dst_idev(skb->dst);
180 if (unlikely(idev->cnf.disable_ipv6)) {
181 IP6_INC_STATS(dev_net(skb->dst->dev), idev,
182 IPSTATS_MIB_OUTDISCARDS);
183 kfree_skb(skb);
184 return 0;
185 }
186
187 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
188 dst_allfrag(skb->dst))
189 return ip6_fragment(skb, ip6_output2);
190 else
191 return ip6_output2(skb);
192 }
193
194 /*
195 * xmit an sk_buff (used by TCP)
196 */
197
198 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
199 struct ipv6_txoptions *opt, int ipfragok)
200 {
201 struct net *net = sock_net(sk);
202 struct ipv6_pinfo *np = inet6_sk(sk);
203 struct in6_addr *first_hop = &fl->fl6_dst;
204 struct dst_entry *dst = skb->dst;
205 struct ipv6hdr *hdr;
206 u8 proto = fl->proto;
207 int seg_len = skb->len;
208 int hlimit, tclass;
209 u32 mtu;
210
211 if (opt) {
212 unsigned int head_room;
213
214 /* First: exthdrs may take lots of space (~8K for now);
215 MAX_HEADER is not enough.
216 */
217 head_room = opt->opt_nflen + opt->opt_flen;
218 seg_len += head_room;
219 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
220
221 if (skb_headroom(skb) < head_room) {
222 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
223 if (skb2 == NULL) {
224 IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
225 IPSTATS_MIB_OUTDISCARDS);
226 kfree_skb(skb);
227 return -ENOBUFS;
228 }
229 kfree_skb(skb);
230 skb = skb2;
231 if (sk)
232 skb_set_owner_w(skb, sk);
233 }
234 if (opt->opt_flen)
235 ipv6_push_frag_opts(skb, opt, &proto);
236 if (opt->opt_nflen)
237 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
238 }
239
240 skb_push(skb, sizeof(struct ipv6hdr));
241 skb_reset_network_header(skb);
242 hdr = ipv6_hdr(skb);
243
244 /* Allow local fragmentation. */
245 if (ipfragok)
246 skb->local_df = 1;
247
248 /*
249 * Fill in the IPv6 header
250 */
251
252 hlimit = -1;
253 if (np)
254 hlimit = np->hop_limit;
255 if (hlimit < 0)
256 hlimit = ip6_dst_hoplimit(dst);
257
258 tclass = -1;
259 if (np)
260 tclass = np->tclass;
261 if (tclass < 0)
262 tclass = 0;
263
264 *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
265
266 hdr->payload_len = htons(seg_len);
267 hdr->nexthdr = proto;
268 hdr->hop_limit = hlimit;
269
270 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
271 ipv6_addr_copy(&hdr->daddr, first_hop);
272
273 skb->priority = sk->sk_priority;
274 skb->mark = sk->sk_mark;
275
276 mtu = dst_mtu(dst);
277 if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
278 IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
279 IPSTATS_MIB_OUTREQUESTS);
280 return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
281 dst_output);
282 }
283
284 if (net_ratelimit())
285 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
286 skb->dev = dst->dev;
287 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
288 IP6_INC_STATS(net, ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
289 kfree_skb(skb);
290 return -EMSGSIZE;
291 }
292
293 EXPORT_SYMBOL(ip6_xmit);
294
295 /*
296 * To avoid extra problems, ND packets are sent through this
297 * routine. It's code duplication, but I really want to avoid
298 * extra checks since ipv6_build_header is used by TCP (which
299 * is performance critical for us).
300 */
301
302 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
303 const struct in6_addr *saddr, const struct in6_addr *daddr,
304 int proto, int len)
305 {
306 struct ipv6_pinfo *np = inet6_sk(sk);
307 struct ipv6hdr *hdr;
308 int totlen;
309
310 skb->protocol = htons(ETH_P_IPV6);
311 skb->dev = dev;
312
313 totlen = len + sizeof(struct ipv6hdr);
314
315 skb_reset_network_header(skb);
316 skb_put(skb, sizeof(struct ipv6hdr));
317 hdr = ipv6_hdr(skb);
318
319 *(__be32*)hdr = htonl(0x60000000);
320
321 hdr->payload_len = htons(len);
322 hdr->nexthdr = proto;
323 hdr->hop_limit = np->hop_limit;
324
325 ipv6_addr_copy(&hdr->saddr, saddr);
326 ipv6_addr_copy(&hdr->daddr, daddr);
327
328 return 0;
329 }
330
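/*
 * Deliver a packet carrying a Router Alert option to every raw socket
 * registered for the matching alert value: clones go to all but the
 * last listener, which receives the original skb.  Returns 1 when the
 * packet was consumed, 0 otherwise.
 */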
331 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
332 {
333 struct ip6_ra_chain *ra;
334 struct sock *last = NULL;
335
336 read_lock(&ip6_ra_lock);
337 for (ra = ip6_ra_chain; ra; ra = ra->next) {
338 struct sock *sk = ra->sk;
339 if (sk && ra->sel == sel &&
340 (!sk->sk_bound_dev_if ||
341 sk->sk_bound_dev_if == skb->dev->ifindex)) {
342 if (last) {
343 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
344 if (skb2)
345 rawv6_rcv(last, skb2);
346 }
347 last = sk;
348 }
349 }
350
351 if (last) {
352 rawv6_rcv(last, skb);
353 read_unlock(&ip6_ra_lock);
354 return 1;
355 }
356 read_unlock(&ip6_ra_lock);
357 return 0;
358 }
359
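/*
 * Decide how to treat a packet addressed to a proxied destination:
 * returns 1 for unicast neighbour discovery messages that should go to
 * the local input path, -1 when the packet must be dropped (link-local
 * destination), and 0 to keep forwarding it.
 */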
360 static int ip6_forward_proxy_check(struct sk_buff *skb)
361 {
362 struct ipv6hdr *hdr = ipv6_hdr(skb);
363 u8 nexthdr = hdr->nexthdr;
364 int offset;
365
366 if (ipv6_ext_hdr(nexthdr)) {
367 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
368 if (offset < 0)
369 return 0;
370 } else
371 offset = sizeof(struct ipv6hdr);
372
373 if (nexthdr == IPPROTO_ICMPV6) {
374 struct icmp6hdr *icmp6;
375
376 if (!pskb_may_pull(skb, (skb_network_header(skb) +
377 offset + 1 - skb->data)))
378 return 0;
379
380 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
381
382 switch (icmp6->icmp6_type) {
383 case NDISC_ROUTER_SOLICITATION:
384 case NDISC_ROUTER_ADVERTISEMENT:
385 case NDISC_NEIGHBOUR_SOLICITATION:
386 case NDISC_NEIGHBOUR_ADVERTISEMENT:
387 case NDISC_REDIRECT:
388 /* For a unicast neighbor discovery message destined
389 * to the proxied address, pass it to the input
390 * function.
391 */
392 return 1;
393 default:
394 break;
395 }
396 }
397
398 /*
399 * The proxying router can't forward traffic sent to a link-local
400 * address, so signal the sender and discard the packet. This
401 * behavior is clarified by the MIPv6 specification.
402 */
403 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
404 dst_link_failure(skb);
405 return -1;
406 }
407
408 return 0;
409 }
410
411 static inline int ip6_forward_finish(struct sk_buff *skb)
412 {
413 return dst_output(skb);
414 }
415
416 int ip6_forward(struct sk_buff *skb)
417 {
418 struct dst_entry *dst = skb->dst;
419 struct ipv6hdr *hdr = ipv6_hdr(skb);
420 struct inet6_skb_parm *opt = IP6CB(skb);
421 struct net *net = dev_net(dst->dev);
422
423 if (net->ipv6.devconf_all->forwarding == 0)
424 goto error;
425
426 if (skb_warn_if_lro(skb))
427 goto drop;
428
429 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
430 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
431 goto drop;
432 }
433
434 skb_forward_csum(skb);
435
436 /*
437 * We DO NOT do any processing on
438 * RA packets, pushing them to user level AS IS
439 * without any WARRANTY that the application will be able
440 * to interpret them. The reason is that we
441 * cannot do anything clever here.
442 *
443 * We are not an end node, so if the packet contains
444 * AH/ESP we cannot do anything.
445 * Defragmentation would also be a mistake; RA packets
446 * cannot be fragmented, because there is no warranty
447 * that different fragments will go along one path. --ANK
448 */
449 if (opt->ra) {
450 u8 *ptr = skb_network_header(skb) + opt->ra;
451 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
452 return 0;
453 }
454
455 /*
456 * check and decrement hop limit
457 */
458 if (hdr->hop_limit <= 1) {
459 /* Force the OUTPUT device to be used for the source address of the ICMP error */
460 skb->dev = dst->dev;
461 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
462 0, skb->dev);
463 IP6_INC_STATS_BH(net,
464 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
465
466 kfree_skb(skb);
467 return -ETIMEDOUT;
468 }
469
470 /* XXX: idev->cnf.proxy_ndp? */
471 if (net->ipv6.devconf_all->proxy_ndp &&
472 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
473 int proxied = ip6_forward_proxy_check(skb);
474 if (proxied > 0)
475 return ip6_input(skb);
476 else if (proxied < 0) {
477 IP6_INC_STATS(net, ip6_dst_idev(dst),
478 IPSTATS_MIB_INDISCARDS);
479 goto drop;
480 }
481 }
482
483 if (!xfrm6_route_forward(skb)) {
484 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
485 goto drop;
486 }
487 dst = skb->dst;
488
489 /* IPv6 specs say nothing about it, but it is clear that we cannot
490 send redirects to source routed frames.
491 We don't send redirects to frames decapsulated from IPsec.
492 */
493 if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
494 !skb_sec_path(skb)) {
495 struct in6_addr *target = NULL;
496 struct rt6_info *rt;
497 struct neighbour *n = dst->neighbour;
498
499 /*
500 * incoming and outgoing devices are the same;
501 * send a redirect.
502 */
503
504 rt = (struct rt6_info *) dst;
505 if ((rt->rt6i_flags & RTF_GATEWAY))
506 target = (struct in6_addr*)&n->primary_key;
507 else
508 target = &hdr->daddr;
509
510 /* Limit redirects both by destination (here)
511 and by source (inside ndisc_send_redirect)
512 */
513 if (xrlim_allow(dst, 1*HZ))
514 ndisc_send_redirect(skb, n, target);
515 } else {
516 int addrtype = ipv6_addr_type(&hdr->saddr);
517
518 /* This check is security critical. */
519 if (addrtype == IPV6_ADDR_ANY ||
520 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
521 goto error;
522 if (addrtype & IPV6_ADDR_LINKLOCAL) {
523 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
524 ICMPV6_NOT_NEIGHBOUR, 0, skb->dev);
525 goto error;
526 }
527 }
528
529 if (skb->len > dst_mtu(dst)) {
530 /* Again, force the OUTPUT device to be used for the source address */
531 skb->dev = dst->dev;
532 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
533 IP6_INC_STATS_BH(net,
534 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
535 IP6_INC_STATS_BH(net,
536 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
537 kfree_skb(skb);
538 return -EMSGSIZE;
539 }
540
541 if (skb_cow(skb, dst->dev->hard_header_len)) {
542 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
543 goto drop;
544 }
545
546 hdr = ipv6_hdr(skb);
547
548 /* Mangling of the hop limit is deferred until after the skb COW */
549
550 hdr->hop_limit--;
551
552 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
553 return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
554 ip6_forward_finish);
555
556 error:
557 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
558 drop:
559 kfree_skb(skb);
560 return -EINVAL;
561 }
562
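/* Copy per-packet metadata (packet type, priority, protocol, dst, device,
 * mark, netfilter and security state) from the original skb to a fragment. */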
563 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
564 {
565 to->pkt_type = from->pkt_type;
566 to->priority = from->priority;
567 to->protocol = from->protocol;
568 dst_release(to->dst);
569 to->dst = dst_clone(from->dst);
570 to->dev = from->dev;
571 to->mark = from->mark;
572
573 #ifdef CONFIG_NET_SCHED
574 to->tc_index = from->tc_index;
575 #endif
576 nf_copy(to, from);
577 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
578 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
579 to->nf_trace = from->nf_trace;
580 #endif
581 skb_copy_secmark(to, from);
582 }
583
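/*
 * Walk the extension header chain and return the offset at which a
 * Fragment header must be inserted: after Hop-by-Hop, Routing and any
 * Destination Options header carrying a Home Address option.  *nexthdr
 * is left pointing at the Next Header byte the caller will overwrite
 * with NEXTHDR_FRAGMENT.
 */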
584 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
585 {
586 u16 offset = sizeof(struct ipv6hdr);
587 struct ipv6_opt_hdr *exthdr =
588 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
589 unsigned int packet_len = skb->tail - skb->network_header;
590 int found_rhdr = 0;
591 *nexthdr = &ipv6_hdr(skb)->nexthdr;
592
593 while (offset + 1 <= packet_len) {
594
595 switch (**nexthdr) {
596
597 case NEXTHDR_HOP:
598 break;
599 case NEXTHDR_ROUTING:
600 found_rhdr = 1;
601 break;
602 case NEXTHDR_DEST:
603 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
604 if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
605 break;
606 #endif
607 if (found_rhdr)
608 return offset;
609 break;
610 default :
611 return offset;
612 }
613
614 offset += ipv6_optlen(exthdr);
615 *nexthdr = &exthdr->nexthdr;
616 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
617 offset);
618 }
619
620 return offset;
621 }
622
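/*
 * Fragment an oversized packet.  If the skb already carries a suitably
 * sized frag_list, each list member becomes one fragment (fast path);
 * otherwise new skbs are allocated and the payload is copied into them
 * chunk by chunk (slow path).  Packets that may not be fragmented
 * locally get an ICMPv6 Packet Too Big error instead.
 */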
623 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
624 {
625 struct sk_buff *frag;
626 struct rt6_info *rt = (struct rt6_info*)skb->dst;
627 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
628 struct ipv6hdr *tmp_hdr;
629 struct frag_hdr *fh;
630 unsigned int mtu, hlen, left, len;
631 __be32 frag_id = 0;
632 int ptr, offset = 0, err=0;
633 u8 *prevhdr, nexthdr = 0;
634 struct net *net = dev_net(skb->dst->dev);
635
636 hlen = ip6_find_1stfragopt(skb, &prevhdr);
637 nexthdr = *prevhdr;
638
639 mtu = ip6_skb_dst_mtu(skb);
640
641 /* We must not fragment if the socket is set to force MTU discovery
642 * or if the skb is not generated by a local socket. (This last
643 * check should be redundant, but it's free.)
644 */
645 if (!skb->local_df) {
646 skb->dev = skb->dst->dev;
647 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
648 IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
649 IPSTATS_MIB_FRAGFAILS);
650 kfree_skb(skb);
651 return -EMSGSIZE;
652 }
653
654 if (np && np->frag_size < mtu) {
655 if (np->frag_size)
656 mtu = np->frag_size;
657 }
658 mtu -= hlen + sizeof(struct frag_hdr);
659
660 if (skb_shinfo(skb)->frag_list) {
661 int first_len = skb_pagelen(skb);
662 int truesizes = 0;
663
664 if (first_len - hlen > mtu ||
665 ((first_len - hlen) & 7) ||
666 skb_cloned(skb))
667 goto slow_path;
668
669 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
670 /* Correct geometry. */
671 if (frag->len > mtu ||
672 ((frag->len & 7) && frag->next) ||
673 skb_headroom(frag) < hlen)
674 goto slow_path;
675
676 /* Partially cloned skb? */
677 if (skb_shared(frag))
678 goto slow_path;
679
680 BUG_ON(frag->sk);
681 if (skb->sk) {
682 sock_hold(skb->sk);
683 frag->sk = skb->sk;
684 frag->destructor = sock_wfree;
685 truesizes += frag->truesize;
686 }
687 }
688
689 err = 0;
690 offset = 0;
691 frag = skb_shinfo(skb)->frag_list;
692 skb_shinfo(skb)->frag_list = NULL;
693 /* BUILD HEADER */
694
695 *prevhdr = NEXTHDR_FRAGMENT;
696 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
697 if (!tmp_hdr) {
698 IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
699 IPSTATS_MIB_FRAGFAILS);
700 return -ENOMEM;
701 }
702
703 __skb_pull(skb, hlen);
704 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
705 __skb_push(skb, hlen);
706 skb_reset_network_header(skb);
707 memcpy(skb_network_header(skb), tmp_hdr, hlen);
708
709 ipv6_select_ident(skb, fh);
710 fh->nexthdr = nexthdr;
711 fh->reserved = 0;
712 fh->frag_off = htons(IP6_MF);
713 frag_id = fh->identification;
714
715 first_len = skb_pagelen(skb);
716 skb->data_len = first_len - skb_headlen(skb);
717 skb->truesize -= truesizes;
718 skb->len = first_len;
719 ipv6_hdr(skb)->payload_len = htons(first_len -
720 sizeof(struct ipv6hdr));
721
722 dst_hold(&rt->u.dst);
723
724 for (;;) {
725 /* Prepare header of the next frame,
726 * before the previous one goes down. */
727 if (frag) {
728 frag->ip_summed = CHECKSUM_NONE;
729 skb_reset_transport_header(frag);
730 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
731 __skb_push(frag, hlen);
732 skb_reset_network_header(frag);
733 memcpy(skb_network_header(frag), tmp_hdr,
734 hlen);
735 offset += skb->len - hlen - sizeof(struct frag_hdr);
736 fh->nexthdr = nexthdr;
737 fh->reserved = 0;
738 fh->frag_off = htons(offset);
739 if (frag->next != NULL)
740 fh->frag_off |= htons(IP6_MF);
741 fh->identification = frag_id;
742 ipv6_hdr(frag)->payload_len =
743 htons(frag->len -
744 sizeof(struct ipv6hdr));
745 ip6_copy_metadata(frag, skb);
746 }
747
748 err = output(skb);
749 if(!err)
750 IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
751 IPSTATS_MIB_FRAGCREATES);
752
753 if (err || !frag)
754 break;
755
756 skb = frag;
757 frag = skb->next;
758 skb->next = NULL;
759 }
760
761 kfree(tmp_hdr);
762
763 if (err == 0) {
764 IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
765 IPSTATS_MIB_FRAGOKS);
766 dst_release(&rt->u.dst);
767 return 0;
768 }
769
770 while (frag) {
771 skb = frag->next;
772 kfree_skb(frag);
773 frag = skb;
774 }
775
776 IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
777 IPSTATS_MIB_FRAGFAILS);
778 dst_release(&rt->u.dst);
779 return err;
780 }
781
782 slow_path:
783 left = skb->len - hlen; /* Space per frame */
784 ptr = hlen; /* Where to start from */
785
786 /*
787 * Fragment the datagram.
788 */
789
790 *prevhdr = NEXTHDR_FRAGMENT;
791
792 /*
793 * Keep copying data until we run out.
794 */
795 while(left > 0) {
796 len = left;
797 /* IF: it doesn't fit, use 'mtu' - the data space left */
798 if (len > mtu)
799 len = mtu;
800 /* IF: we are not sending up to and including the packet end,
801 then align the next start on an eight-byte boundary */
802 if (len < left) {
803 len &= ~7;
804 }
805 /*
806 * Allocate buffer.
807 */
808
809 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
810 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
811 IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
812 IPSTATS_MIB_FRAGFAILS);
813 err = -ENOMEM;
814 goto fail;
815 }
816
817 /*
818 * Set up data on packet
819 */
820
821 ip6_copy_metadata(frag, skb);
822 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
823 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
824 skb_reset_network_header(frag);
825 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
826 frag->transport_header = (frag->network_header + hlen +
827 sizeof(struct frag_hdr));
828
829 /*
830 * Charge the memory for the fragment to any owner
831 * it might possess
832 */
833 if (skb->sk)
834 skb_set_owner_w(frag, skb->sk);
835
836 /*
837 * Copy the packet header into the new buffer.
838 */
839 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
840
841 /*
842 * Build fragment header.
843 */
844 fh->nexthdr = nexthdr;
845 fh->reserved = 0;
846 if (!frag_id) {
847 ipv6_select_ident(skb, fh);
848 frag_id = fh->identification;
849 } else
850 fh->identification = frag_id;
851
852 /*
853 * Copy a block of the IP datagram.
854 */
855 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
856 BUG();
857 left -= len;
858
859 fh->frag_off = htons(offset);
860 if (left > 0)
861 fh->frag_off |= htons(IP6_MF);
862 ipv6_hdr(frag)->payload_len = htons(frag->len -
863 sizeof(struct ipv6hdr));
864
865 ptr += len;
866 offset += len;
867
868 /*
869 * Put this fragment into the sending queue.
870 */
871 err = output(frag);
872 if (err)
873 goto fail;
874
875 IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
876 IPSTATS_MIB_FRAGCREATES);
877 }
878 IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
879 IPSTATS_MIB_FRAGOKS);
880 kfree_skb(skb);
881 return err;
882
883 fail:
884 IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
885 IPSTATS_MIB_FRAGFAILS);
886 kfree_skb(skb);
887 return err;
888 }
889
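/*
 * Helper for ip6_sk_dst_check(): nonzero means neither the cached host
 * route nor the socket's cached address matches the address from the
 * flow, so the cached route cannot be reused as-is.
 */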
890 static inline int ip6_rt_check(struct rt6key *rt_key,
891 struct in6_addr *fl_addr,
892 struct in6_addr *addr_cache)
893 {
894 return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
895 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
896 }
897
898 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
899 struct dst_entry *dst,
900 struct flowi *fl)
901 {
902 struct ipv6_pinfo *np = inet6_sk(sk);
903 struct rt6_info *rt = (struct rt6_info *)dst;
904
905 if (!dst)
906 goto out;
907
908 /* Yes, checking route validity in the unconnected
909 * case is not very simple. Take into account
910 * that we do not support routing by source, TOS,
911 * or MSG_DONTROUTE --ANK (980726)
912 *
913 * 1. ip6_rt_check(): If the route was a host route,
914 * check that the cached destination is current.
915 * If it is a network route, we may still
916 * check its validity using a saved pointer
917 * to the last used address: daddr_cache.
918 * We do not want to save the whole address now
919 * (because the main consumer of this service
920 * is TCP, which does not have this problem),
921 * so this last trick works only on connected
922 * sockets.
923 * 2. The oif must also be the same.
924 */
925 if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
926 #ifdef CONFIG_IPV6_SUBTREES
927 ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
928 #endif
929 (fl->oif && fl->oif != dst->dev->ifindex)) {
930 dst_release(dst);
931 dst = NULL;
932 }
933
934 out:
935 return dst;
936 }
937
938 static int ip6_dst_lookup_tail(struct sock *sk,
939 struct dst_entry **dst, struct flowi *fl)
940 {
941 int err;
942 struct net *net = sock_net(sk);
943
944 if (*dst == NULL)
945 *dst = ip6_route_output(net, sk, fl);
946
947 if ((err = (*dst)->error))
948 goto out_err_release;
949
950 if (ipv6_addr_any(&fl->fl6_src)) {
951 err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
952 &fl->fl6_dst,
953 sk ? inet6_sk(sk)->srcprefs : 0,
954 &fl->fl6_src);
955 if (err)
956 goto out_err_release;
957 }
958
959 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
960 /*
961 * Here, if the dst entry we've looked up
962 * has a neighbour entry that is in the INCOMPLETE
963 * state and the src address from the flow is
964 * marked as OPTIMISTIC, we release the found
965 * dst entry and replace it with the dst entry
966 * of the nexthop router.
967 */
968 if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
969 struct inet6_ifaddr *ifp;
970 struct flowi fl_gw;
971 int redirect;
972
973 ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
974 (*dst)->dev, 1);
975
976 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
977 if (ifp)
978 in6_ifa_put(ifp);
979
980 if (redirect) {
981 /*
982 * We need to get the dst entry for the
983 * default router instead
984 */
985 dst_release(*dst);
986 memcpy(&fl_gw, fl, sizeof(struct flowi));
987 memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
988 *dst = ip6_route_output(net, sk, &fl_gw);
989 if ((err = (*dst)->error))
990 goto out_err_release;
991 }
992 }
993 #endif
994
995 return 0;
996
997 out_err_release:
998 if (err == -ENETUNREACH)
999 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1000 dst_release(*dst);
1001 *dst = NULL;
1002 return err;
1003 }
1004
1005 /**
1006 * ip6_dst_lookup - perform route lookup on flow
1007 * @sk: socket which provides route info
1008 * @dst: pointer to dst_entry * for result
1009 * @fl: flow to lookup
1010 *
1011 * This function performs a route lookup on the given flow.
1012 *
1013 * It returns zero on success, or a standard errno code on error.
1014 */
1015 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1016 {
1017 *dst = NULL;
1018 return ip6_dst_lookup_tail(sk, dst, fl);
1019 }
1020 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1021
1022 /**
1023 * ip6_sk_dst_lookup - perform socket cached route lookup on flow
1024 * @sk: socket which provides the dst cache and route info
1025 * @dst: pointer to dst_entry * for result
1026 * @fl: flow to lookup
1027 *
1028 * This function performs a route lookup on the given flow with the
1029 * possibility of using the cached route in the socket if it is valid.
1030 * It will take the socket dst lock when operating on the dst cache.
1031 * As a result, this function can only be used in process context.
1032 *
1033 * It returns zero on success, or a standard errno code on error.
1034 */
1035 int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1036 {
1037 *dst = NULL;
1038 if (sk) {
1039 *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1040 *dst = ip6_sk_dst_check(sk, *dst, fl);
1041 }
1042
1043 return ip6_dst_lookup_tail(sk, dst, fl);
1044 }
1045 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
1046
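/*
 * UFO path for ip6_append_data(): build (or extend) one large skb that
 * carries the whole UDP datagram and let the device segment it, with
 * gso_size set to the payload that fits in each fragment and the
 * fragment identification chosen up front.
 */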
1047 static inline int ip6_ufo_append_data(struct sock *sk,
1048 int getfrag(void *from, char *to, int offset, int len,
1049 int odd, struct sk_buff *skb),
1050 void *from, int length, int hh_len, int fragheaderlen,
1051 int transhdrlen, int mtu,unsigned int flags)
1052
1053 {
1054 struct sk_buff *skb;
1055 int err;
1056
1057 /* The network device supports UDP large send offload, so
1058 * create one single skb packet containing the complete
1059 * UDP datagram
1060 */
1061 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1062 skb = sock_alloc_send_skb(sk,
1063 hh_len + fragheaderlen + transhdrlen + 20,
1064 (flags & MSG_DONTWAIT), &err);
1065 if (skb == NULL)
1066 return -ENOMEM;
1067
1068 /* reserve space for Hardware header */
1069 skb_reserve(skb, hh_len);
1070
1071 /* create space for UDP/IP header */
1072 skb_put(skb,fragheaderlen + transhdrlen);
1073
1074 /* initialize network header pointer */
1075 skb_reset_network_header(skb);
1076
1077 /* initialize protocol header pointer */
1078 skb->transport_header = skb->network_header + fragheaderlen;
1079
1080 skb->ip_summed = CHECKSUM_PARTIAL;
1081 skb->csum = 0;
1082 sk->sk_sndmsg_off = 0;
1083 }
1084
1085 err = skb_append_datato_frags(sk,skb, getfrag, from,
1086 (length - transhdrlen));
1087 if (!err) {
1088 struct frag_hdr fhdr;
1089
1090 /* specify the length of each IP datagram fragment*/
1091 skb_shinfo(skb)->gso_size = mtu - fragheaderlen -
1092 sizeof(struct frag_hdr);
1093 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1094 ipv6_select_ident(skb, &fhdr);
1095 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1096 __skb_queue_tail(&sk->sk_write_queue, skb);
1097
1098 return 0;
1099 }
1100 /* There is not enough support to do UDP LSO,
1101 * so follow the normal path
1102 */
1103 kfree_skb(skb);
1104
1105 return err;
1106 }
1107
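/*
 * Append data to the pending (corked) output queue.  The first call
 * records the transmit parameters (options, route, hop limit, traffic
 * class, MTU) in the cork state; this and later calls then carve the
 * data into MTU-sized skbs on sk->sk_write_queue, which are turned into
 * real packets by ip6_push_pending_frames().
 */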
1108 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1109 int offset, int len, int odd, struct sk_buff *skb),
1110 void *from, int length, int transhdrlen,
1111 int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
1112 struct rt6_info *rt, unsigned int flags)
1113 {
1114 struct inet_sock *inet = inet_sk(sk);
1115 struct ipv6_pinfo *np = inet6_sk(sk);
1116 struct sk_buff *skb;
1117 unsigned int maxfraglen, fragheaderlen;
1118 int exthdrlen;
1119 int hh_len;
1120 int mtu;
1121 int copy;
1122 int err;
1123 int offset = 0;
1124 int csummode = CHECKSUM_NONE;
1125
1126 if (flags&MSG_PROBE)
1127 return 0;
1128 if (skb_queue_empty(&sk->sk_write_queue)) {
1129 /*
1130 * setup for corking
1131 */
1132 if (opt) {
1133 if (np->cork.opt == NULL) {
1134 np->cork.opt = kmalloc(opt->tot_len,
1135 sk->sk_allocation);
1136 if (unlikely(np->cork.opt == NULL))
1137 return -ENOBUFS;
1138 } else if (np->cork.opt->tot_len < opt->tot_len) {
1139 printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
1140 return -EINVAL;
1141 }
1142 memcpy(np->cork.opt, opt, opt->tot_len);
1143 inet->cork.flags |= IPCORK_OPT;
1144 /* need source address above miyazawa*/
1145 }
1146 dst_hold(&rt->u.dst);
1147 inet->cork.dst = &rt->u.dst;
1148 inet->cork.fl = *fl;
1149 np->cork.hop_limit = hlimit;
1150 np->cork.tclass = tclass;
1151 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1152 rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
1153 if (np->frag_size < mtu) {
1154 if (np->frag_size)
1155 mtu = np->frag_size;
1156 }
1157 inet->cork.fragsize = mtu;
1158 if (dst_allfrag(rt->u.dst.path))
1159 inet->cork.flags |= IPCORK_ALLFRAG;
1160 inet->cork.length = 0;
1161 sk->sk_sndmsg_page = NULL;
1162 sk->sk_sndmsg_off = 0;
1163 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
1164 rt->rt6i_nfheader_len;
1165 length += exthdrlen;
1166 transhdrlen += exthdrlen;
1167 } else {
1168 rt = (struct rt6_info *)inet->cork.dst;
1169 fl = &inet->cork.fl;
1170 if (inet->cork.flags & IPCORK_OPT)
1171 opt = np->cork.opt;
1172 transhdrlen = 0;
1173 exthdrlen = 0;
1174 mtu = inet->cork.fragsize;
1175 }
1176
1177 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1178
1179 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1180 (opt ? opt->opt_nflen : 0);
1181 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1182
1183 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1184 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1185 ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
1186 return -EMSGSIZE;
1187 }
1188 }
1189
1190 /*
1191 * Let's try using as much space as possible.
1192 * Use the MTU if the total length of the message fits into the MTU.
1193 * Otherwise, we need to reserve fragment header and
1194 * fragment alignment (= 8-15 octets, in total).
1195 *
1196 * Note that we may need to "move" the data from the tail
1197 * of the buffer to the new fragment when we split
1198 * the message.
1199 *
1200 * FIXME: It may be fragmented into multiple chunks
1201 * at once if non-fragmentable extension headers
1202 * are too large.
1203 * --yoshfuji
1204 */
1205
1206 inet->cork.length += length;
1207 if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
1208 (rt->u.dst.dev->features & NETIF_F_UFO)) {
1209
1210 err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
1211 fragheaderlen, transhdrlen, mtu,
1212 flags);
1213 if (err)
1214 goto error;
1215 return 0;
1216 }
1217
1218 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1219 goto alloc_new_skb;
1220
1221 while (length > 0) {
1222 /* Check if the remaining data fits into current packet. */
1223 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1224 if (copy < length)
1225 copy = maxfraglen - skb->len;
1226
1227 if (copy <= 0) {
1228 char *data;
1229 unsigned int datalen;
1230 unsigned int fraglen;
1231 unsigned int fraggap;
1232 unsigned int alloclen;
1233 struct sk_buff *skb_prev;
1234 alloc_new_skb:
1235 skb_prev = skb;
1236
1237 /* There's no room in the current skb */
1238 if (skb_prev)
1239 fraggap = skb_prev->len - maxfraglen;
1240 else
1241 fraggap = 0;
1242
1243 /*
1244 * If remaining data exceeds the mtu,
1245 * we know we need more fragment(s).
1246 */
1247 datalen = length + fraggap;
1248 if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1249 datalen = maxfraglen - fragheaderlen;
1250
1251 fraglen = datalen + fragheaderlen;
1252 if ((flags & MSG_MORE) &&
1253 !(rt->u.dst.dev->features&NETIF_F_SG))
1254 alloclen = mtu;
1255 else
1256 alloclen = datalen + fragheaderlen;
1257
1258 /*
1259 * The last fragment gets additional space at tail.
1260 * Note: we overallocate on fragments with MSG_MORE
1261 * because we have no idea if we're the last one.
1262 */
1263 if (datalen == length + fraggap)
1264 alloclen += rt->u.dst.trailer_len;
1265
1266 /*
1267 * We just reserve space for the fragment header.
1268 * Note: this may be an overallocation if the message
1269 * (without MSG_MORE) fits into the MTU.
1270 */
1271 alloclen += sizeof(struct frag_hdr);
1272
1273 if (transhdrlen) {
1274 skb = sock_alloc_send_skb(sk,
1275 alloclen + hh_len,
1276 (flags & MSG_DONTWAIT), &err);
1277 } else {
1278 skb = NULL;
1279 if (atomic_read(&sk->sk_wmem_alloc) <=
1280 2 * sk->sk_sndbuf)
1281 skb = sock_wmalloc(sk,
1282 alloclen + hh_len, 1,
1283 sk->sk_allocation);
1284 if (unlikely(skb == NULL))
1285 err = -ENOBUFS;
1286 }
1287 if (skb == NULL)
1288 goto error;
1289 /*
1290 * Fill in the control structures
1291 */
1292 skb->ip_summed = csummode;
1293 skb->csum = 0;
1294 /* reserve for fragmentation */
1295 skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1296
1297 /*
1298 * Find where to start putting bytes
1299 */
1300 data = skb_put(skb, fraglen);
1301 skb_set_network_header(skb, exthdrlen);
1302 data += fragheaderlen;
1303 skb->transport_header = (skb->network_header +
1304 fragheaderlen);
1305 if (fraggap) {
1306 skb->csum = skb_copy_and_csum_bits(
1307 skb_prev, maxfraglen,
1308 data + transhdrlen, fraggap, 0);
1309 skb_prev->csum = csum_sub(skb_prev->csum,
1310 skb->csum);
1311 data += fraggap;
1312 pskb_trim_unique(skb_prev, maxfraglen);
1313 }
1314 copy = datalen - transhdrlen - fraggap;
1315 if (copy < 0) {
1316 err = -EINVAL;
1317 kfree_skb(skb);
1318 goto error;
1319 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1320 err = -EFAULT;
1321 kfree_skb(skb);
1322 goto error;
1323 }
1324
1325 offset += copy;
1326 length -= datalen - fraggap;
1327 transhdrlen = 0;
1328 exthdrlen = 0;
1329 csummode = CHECKSUM_NONE;
1330
1331 /*
1332 * Put the packet on the pending queue
1333 */
1334 __skb_queue_tail(&sk->sk_write_queue, skb);
1335 continue;
1336 }
1337
1338 if (copy > length)
1339 copy = length;
1340
1341 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1342 unsigned int off;
1343
1344 off = skb->len;
1345 if (getfrag(from, skb_put(skb, copy),
1346 offset, copy, off, skb) < 0) {
1347 __skb_trim(skb, off);
1348 err = -EFAULT;
1349 goto error;
1350 }
1351 } else {
1352 int i = skb_shinfo(skb)->nr_frags;
1353 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1354 struct page *page = sk->sk_sndmsg_page;
1355 int off = sk->sk_sndmsg_off;
1356 unsigned int left;
1357
1358 if (page && (left = PAGE_SIZE - off) > 0) {
1359 if (copy >= left)
1360 copy = left;
1361 if (page != frag->page) {
1362 if (i == MAX_SKB_FRAGS) {
1363 err = -EMSGSIZE;
1364 goto error;
1365 }
1366 get_page(page);
1367 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1368 frag = &skb_shinfo(skb)->frags[i];
1369 }
1370 } else if(i < MAX_SKB_FRAGS) {
1371 if (copy > PAGE_SIZE)
1372 copy = PAGE_SIZE;
1373 page = alloc_pages(sk->sk_allocation, 0);
1374 if (page == NULL) {
1375 err = -ENOMEM;
1376 goto error;
1377 }
1378 sk->sk_sndmsg_page = page;
1379 sk->sk_sndmsg_off = 0;
1380
1381 skb_fill_page_desc(skb, i, page, 0, 0);
1382 frag = &skb_shinfo(skb)->frags[i];
1383 } else {
1384 err = -EMSGSIZE;
1385 goto error;
1386 }
1387 if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1388 err = -EFAULT;
1389 goto error;
1390 }
1391 sk->sk_sndmsg_off += copy;
1392 frag->size += copy;
1393 skb->len += copy;
1394 skb->data_len += copy;
1395 skb->truesize += copy;
1396 atomic_add(copy, &sk->sk_wmem_alloc);
1397 }
1398 offset += copy;
1399 length -= copy;
1400 }
1401 return 0;
1402 error:
1403 inet->cork.length -= length;
1404 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1405 return err;
1406 }
1407
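/* Release the cork state: cached options, the held route and the saved
 * flow. */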
1408 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1409 {
1410 inet->cork.flags &= ~IPCORK_OPT;
1411 kfree(np->cork.opt);
1412 np->cork.opt = NULL;
1413 if (inet->cork.dst) {
1414 dst_release(inet->cork.dst);
1415 inet->cork.dst = NULL;
1416 inet->cork.flags &= ~IPCORK_ALLFRAG;
1417 }
1418 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1419 }
1420
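/*
 * Turn the skbs queued by ip6_append_data() into one packet: chain the
 * extra skbs onto the first skb's frag_list, push the extension headers
 * and the IPv6 header, then send the result via ip6_local_out().
 */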
1421 int ip6_push_pending_frames(struct sock *sk)
1422 {
1423 struct sk_buff *skb, *tmp_skb;
1424 struct sk_buff **tail_skb;
1425 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1426 struct inet_sock *inet = inet_sk(sk);
1427 struct ipv6_pinfo *np = inet6_sk(sk);
1428 struct net *net = sock_net(sk);
1429 struct ipv6hdr *hdr;
1430 struct ipv6_txoptions *opt = np->cork.opt;
1431 struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
1432 struct flowi *fl = &inet->cork.fl;
1433 unsigned char proto = fl->proto;
1434 int err = 0;
1435
1436 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1437 goto out;
1438 tail_skb = &(skb_shinfo(skb)->frag_list);
1439
1440 /* move skb->data to ip header from ext header */
1441 if (skb->data < skb_network_header(skb))
1442 __skb_pull(skb, skb_network_offset(skb));
1443 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1444 __skb_pull(tmp_skb, skb_network_header_len(skb));
1445 *tail_skb = tmp_skb;
1446 tail_skb = &(tmp_skb->next);
1447 skb->len += tmp_skb->len;
1448 skb->data_len += tmp_skb->len;
1449 skb->truesize += tmp_skb->truesize;
1450 __sock_put(tmp_skb->sk);
1451 tmp_skb->destructor = NULL;
1452 tmp_skb->sk = NULL;
1453 }
1454
1455 /* Allow local fragmentation. */
1456 if (np->pmtudisc < IPV6_PMTUDISC_DO)
1457 skb->local_df = 1;
1458
1459 ipv6_addr_copy(final_dst, &fl->fl6_dst);
1460 __skb_pull(skb, skb_network_header_len(skb));
1461 if (opt && opt->opt_flen)
1462 ipv6_push_frag_opts(skb, opt, &proto);
1463 if (opt && opt->opt_nflen)
1464 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1465
1466 skb_push(skb, sizeof(struct ipv6hdr));
1467 skb_reset_network_header(skb);
1468 hdr = ipv6_hdr(skb);
1469
1470 *(__be32*)hdr = fl->fl6_flowlabel |
1471 htonl(0x60000000 | ((int)np->cork.tclass << 20));
1472
1473 hdr->hop_limit = np->cork.hop_limit;
1474 hdr->nexthdr = proto;
1475 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1476 ipv6_addr_copy(&hdr->daddr, final_dst);
1477
1478 skb->priority = sk->sk_priority;
1479 skb->mark = sk->sk_mark;
1480
1481 skb->dst = dst_clone(&rt->u.dst);
1482 IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
1483 if (proto == IPPROTO_ICMPV6) {
1484 struct inet6_dev *idev = ip6_dst_idev(skb->dst);
1485
1486 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1487 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1488 }
1489
1490 err = ip6_local_out(skb);
1491 if (err) {
1492 if (err > 0)
1493 err = np->recverr ? net_xmit_errno(err) : 0;
1494 if (err)
1495 goto error;
1496 }
1497
1498 out:
1499 ip6_cork_release(inet, np);
1500 return err;
1501 error:
1502 goto out;
1503 }
1504
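/* Discard any data still queued on the socket without sending it and
 * release the cork state. */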
1505 void ip6_flush_pending_frames(struct sock *sk)
1506 {
1507 struct sk_buff *skb;
1508
1509 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1510 if (skb->dst)
1511 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb->dst),
1512 IPSTATS_MIB_OUTDISCARDS);
1513 kfree_skb(skb);
1514 }
1515
1516 ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1517 }