net/ipv6/ip6_output.c
1 /*
2 * IPv6 output functions
3 * Linux INET6 implementation
4 *
5 * Authors:
6 * Pedro Roque <roque@di.fc.ul.pt>
7 *
8 * Based on linux/net/ipv4/ip_output.c
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 *
15 * Changes:
16 * A.N.Kuznetsov : arithmetics in fragmentation.
17 * extension headers are implemented.
18 * route changes now work.
19 * ip6_forward does not confuse sniffers.
20 * etc.
21 *
22 * H. von Brand : Added missing #include <linux/string.h>
23 * Imran Patel : frag id should be in NBO
24 * Kazunori MIYAZAWA @USAGI
25 * : add ip6_append_data and related functions
26 * for datagram xmit
27 */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44
45 #include <net/sock.h>
46 #include <net/snmp.h>
47
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58 #include <net/l3mdev.h>
59 #include <net/lwtunnel.h>
60
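/*
 * ip6_finish_output2 - final transmit step: resolve the neighbour for the
 * route's nexthop and hand the packet to it via dst_neigh_output().
 * Multicast destinations are looped back to local listeners through the
 * POST_ROUTING hook when required, and lwtunnel output redirection is
 * honoured before the neighbour lookup.
 */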
61 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
62 {
63 struct dst_entry *dst = skb_dst(skb);
64 struct net_device *dev = dst->dev;
65 struct neighbour *neigh;
66 struct in6_addr *nexthop;
67 int ret;
68
69 skb->protocol = htons(ETH_P_IPV6);
70 skb->dev = dev;
71
72 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
73 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
74
75 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
76 ((mroute6_socket(net, skb) &&
77 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
78 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
79 &ipv6_hdr(skb)->saddr))) {
80 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
81
82 /* Do not check for IFF_ALLMULTI; multicast routing
83 is not supported in any case.
84 */
85 if (newskb)
86 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
87 net, sk, newskb, NULL, newskb->dev,
88 dev_loopback_xmit);
89
90 if (ipv6_hdr(skb)->hop_limit == 0) {
91 IP6_INC_STATS(net, idev,
92 IPSTATS_MIB_OUTDISCARDS);
93 kfree_skb(skb);
94 return 0;
95 }
96 }
97
98 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
99
100 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
101 IPV6_ADDR_SCOPE_NODELOCAL &&
102 !(dev->flags & IFF_LOOPBACK)) {
103 kfree_skb(skb);
104 return 0;
105 }
106 }
107
108 if (lwtunnel_xmit_redirect(dst->lwtstate)) {
109 int res = lwtunnel_xmit(skb);
110
111 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
112 return res;
113 }
114
115 rcu_read_lock_bh();
116 nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
117 neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
118 if (unlikely(!neigh))
119 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
120 if (!IS_ERR(neigh)) {
121 ret = dst_neigh_output(dst, neigh, skb);
122 rcu_read_unlock_bh();
123 return ret;
124 }
125 rcu_read_unlock_bh();
126
127 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
128 kfree_skb(skb);
129 return -EINVAL;
130 }
131
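/*
 * ip6_finish_output - fragment the packet if it exceeds the path MTU (and
 * is not GSO) or if the route forces fragmentation, otherwise pass it
 * straight to ip6_finish_output2().
 */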
132 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
133 {
134 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
135 dst_allfrag(skb_dst(skb)) ||
136 (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
137 return ip6_fragment(net, sk, skb, ip6_finish_output2);
138 else
139 return ip6_finish_output2(net, sk, skb);
140 }
141
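/*
 * ip6_output - generic IPv6 output entry point: discard the packet if IPv6
 * is disabled on the egress device, otherwise run the POST_ROUTING
 * netfilter hook and continue in ip6_finish_output().
 */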
142 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
143 {
144 struct net_device *dev = skb_dst(skb)->dev;
145 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
146
147 if (unlikely(idev->cnf.disable_ipv6)) {
148 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
149 kfree_skb(skb);
150 return 0;
151 }
152
153 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
154 net, sk, skb, NULL, dev,
155 ip6_finish_output,
156 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
157 }
158
159 /*
160 * xmit an sk_buff (used by TCP, SCTP and DCCP)
161 * Note : socket lock is not held for SYNACK packets, but might be modified
162 * by calls to skb_set_owner_w() and ipv6_local_error(),
163 * which are using proper atomic operations or spinlocks.
164 */
165 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
166 struct ipv6_txoptions *opt, int tclass)
167 {
168 struct net *net = sock_net(sk);
169 const struct ipv6_pinfo *np = inet6_sk(sk);
170 struct in6_addr *first_hop = &fl6->daddr;
171 struct dst_entry *dst = skb_dst(skb);
172 struct ipv6hdr *hdr;
173 u8 proto = fl6->flowi6_proto;
174 int seg_len = skb->len;
175 int hlimit = -1;
176 u32 mtu;
177
178 if (opt) {
179 unsigned int head_room;
180
181 /* First: exthdrs may take lots of space (~8K for now)
182 MAX_HEADER is not enough.
183 */
184 head_room = opt->opt_nflen + opt->opt_flen;
185 seg_len += head_room;
186 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
187
188 if (skb_headroom(skb) < head_room) {
189 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
190 if (!skb2) {
191 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
192 IPSTATS_MIB_OUTDISCARDS);
193 kfree_skb(skb);
194 return -ENOBUFS;
195 }
196 consume_skb(skb);
197 skb = skb2;
198 /* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
199 * it is safe to call in our context (socket lock not held)
200 */
201 skb_set_owner_w(skb, (struct sock *)sk);
202 }
203 if (opt->opt_flen)
204 ipv6_push_frag_opts(skb, opt, &proto);
205 if (opt->opt_nflen)
206 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
207 }
208
209 skb_push(skb, sizeof(struct ipv6hdr));
210 skb_reset_network_header(skb);
211 hdr = ipv6_hdr(skb);
212
213 /*
214 * Fill in the IPv6 header
215 */
216 if (np)
217 hlimit = np->hop_limit;
218 if (hlimit < 0)
219 hlimit = ip6_dst_hoplimit(dst);
220
221 ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
222 np->autoflowlabel, fl6));
223
224 hdr->payload_len = htons(seg_len);
225 hdr->nexthdr = proto;
226 hdr->hop_limit = hlimit;
227
228 hdr->saddr = fl6->saddr;
229 hdr->daddr = *first_hop;
230
231 skb->protocol = htons(ETH_P_IPV6);
232 skb->priority = sk->sk_priority;
233 skb->mark = sk->sk_mark;
234
235 mtu = dst_mtu(dst);
236 if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
237 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
238 IPSTATS_MIB_OUT, skb->len);
239
240 /* if egress device is enslaved to an L3 master device pass the
241 * skb to its handler for processing
242 */
243 skb = l3mdev_ip6_out((struct sock *)sk, skb);
244 if (unlikely(!skb))
245 return 0;
246
247 /* hooks should never assume socket lock is held.
248 * we promote our socket to non const
249 */
250 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
251 net, (struct sock *)sk, skb, NULL, dst->dev,
252 dst_output);
253 }
254
255 skb->dev = dst->dev;
256 /* ipv6_local_error() does not require socket lock,
257 * we promote our socket to non const
258 */
259 ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
260
261 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
262 kfree_skb(skb);
263 return -EMSGSIZE;
264 }
265 EXPORT_SYMBOL(ip6_xmit);
266
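/*
 * ip6_call_ra_chain - deliver a Router Alert packet to every raw socket on
 * ip6_ra_chain that matches this RA selector (and, if bound, this device).
 * Returns 1 if the skb was consumed by at least one socket, 0 otherwise.
 */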
267 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
268 {
269 struct ip6_ra_chain *ra;
270 struct sock *last = NULL;
271
272 read_lock(&ip6_ra_lock);
273 for (ra = ip6_ra_chain; ra; ra = ra->next) {
274 struct sock *sk = ra->sk;
275 if (sk && ra->sel == sel &&
276 (!sk->sk_bound_dev_if ||
277 sk->sk_bound_dev_if == skb->dev->ifindex)) {
278 if (last) {
279 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
280 if (skb2)
281 rawv6_rcv(last, skb2);
282 }
283 last = sk;
284 }
285 }
286
287 if (last) {
288 rawv6_rcv(last, skb);
289 read_unlock(&ip6_ra_lock);
290 return 1;
291 }
292 read_unlock(&ip6_ra_lock);
293 return 0;
294 }
295
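/*
 * ip6_forward_proxy_check - decide what to do with a packet destined to a
 * proxied address: 1 means hand it to local input (NDISC messages),
 * -1 means drop it (link-local destination), 0 means keep forwarding.
 */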
296 static int ip6_forward_proxy_check(struct sk_buff *skb)
297 {
298 struct ipv6hdr *hdr = ipv6_hdr(skb);
299 u8 nexthdr = hdr->nexthdr;
300 __be16 frag_off;
301 int offset;
302
303 if (ipv6_ext_hdr(nexthdr)) {
304 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
305 if (offset < 0)
306 return 0;
307 } else
308 offset = sizeof(struct ipv6hdr);
309
310 if (nexthdr == IPPROTO_ICMPV6) {
311 struct icmp6hdr *icmp6;
312
313 if (!pskb_may_pull(skb, (skb_network_header(skb) +
314 offset + 1 - skb->data)))
315 return 0;
316
317 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
318
319 switch (icmp6->icmp6_type) {
320 case NDISC_ROUTER_SOLICITATION:
321 case NDISC_ROUTER_ADVERTISEMENT:
322 case NDISC_NEIGHBOUR_SOLICITATION:
323 case NDISC_NEIGHBOUR_ADVERTISEMENT:
324 case NDISC_REDIRECT:
325 /* For reaction involving unicast neighbor discovery
326 * message destined to the proxied address, pass it to
327 * input function.
328 */
329 return 1;
330 default:
331 break;
332 }
333 }
334
335 /*
336 * The proxying router can't forward traffic sent to a link-local
337 * address, so signal the sender and discard the packet. This
338 * behavior is clarified by the MIPv6 specification.
339 */
340 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
341 dst_link_failure(skb);
342 return -1;
343 }
344
345 return 0;
346 }
347
348 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
349 struct sk_buff *skb)
350 {
351 return dst_output(net, sk, skb);
352 }
353
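/*
 * ip6_dst_mtu_forward - MTU to use when forwarding via this dst: a locked
 * route metric wins, otherwise fall back to the egress device's
 * per-interface IPv6 MTU.
 */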
354 static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
355 {
356 unsigned int mtu;
357 struct inet6_dev *idev;
358
359 if (dst_metric_locked(dst, RTAX_MTU)) {
360 mtu = dst_metric_raw(dst, RTAX_MTU);
361 if (mtu)
362 return mtu;
363 }
364
365 mtu = IPV6_MIN_MTU;
366 rcu_read_lock();
367 idev = __in6_dev_get(dst->dev);
368 if (idev)
369 mtu = idev->cnf.mtu6;
370 rcu_read_unlock();
371
372 return mtu;
373 }
374
375 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
376 {
377 if (skb->len <= mtu)
378 return false;
379
380 /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
381 if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
382 return true;
383
384 if (skb->ignore_df)
385 return false;
386
387 if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
388 return false;
389
390 return true;
391 }
392
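/*
 * ip6_forward - forward one IPv6 packet: validate it (forwarding enabled,
 * hop limit, xfrm policy), handle Router Alert and proxy NDP, send
 * redirects or ICMP errors where appropriate, decrement the hop limit and
 * hand the packet to the FORWARD netfilter hook.
 */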
393 int ip6_forward(struct sk_buff *skb)
394 {
395 struct dst_entry *dst = skb_dst(skb);
396 struct ipv6hdr *hdr = ipv6_hdr(skb);
397 struct inet6_skb_parm *opt = IP6CB(skb);
398 struct net *net = dev_net(dst->dev);
399 u32 mtu;
400
401 if (net->ipv6.devconf_all->forwarding == 0)
402 goto error;
403
404 if (skb->pkt_type != PACKET_HOST)
405 goto drop;
406
407 if (unlikely(skb->sk))
408 goto drop;
409
410 if (skb_warn_if_lro(skb))
411 goto drop;
412
413 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
414 __IP6_INC_STATS(net, ip6_dst_idev(dst),
415 IPSTATS_MIB_INDISCARDS);
416 goto drop;
417 }
418
419 skb_forward_csum(skb);
420
421 /*
422 * We DO NOT make any processing on
423 * RA packets, pushing them to user level AS IS
424 * without any WARRANTY that application will be able
425 * to interpret them. The reason is that we
426 * cannot make anything clever here.
427 *
428 * We are not end-node, so that if packet contains
429 * AH/ESP, we cannot make anything.
430 * Defragmentation also would be mistake, RA packets
431 * cannot be fragmented, because there is no warranty
432 * that different fragments will go along one path. --ANK
433 */
434 if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
435 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
436 return 0;
437 }
438
439 /*
440 * check and decrement ttl
441 */
442 if (hdr->hop_limit <= 1) {
443 /* Force OUTPUT device used as source address */
444 skb->dev = dst->dev;
445 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
446 __IP6_INC_STATS(net, ip6_dst_idev(dst),
447 IPSTATS_MIB_INHDRERRORS);
448
449 kfree_skb(skb);
450 return -ETIMEDOUT;
451 }
452
453 /* XXX: idev->cnf.proxy_ndp? */
454 if (net->ipv6.devconf_all->proxy_ndp &&
455 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
456 int proxied = ip6_forward_proxy_check(skb);
457 if (proxied > 0)
458 return ip6_input(skb);
459 else if (proxied < 0) {
460 __IP6_INC_STATS(net, ip6_dst_idev(dst),
461 IPSTATS_MIB_INDISCARDS);
462 goto drop;
463 }
464 }
465
466 if (!xfrm6_route_forward(skb)) {
467 __IP6_INC_STATS(net, ip6_dst_idev(dst),
468 IPSTATS_MIB_INDISCARDS);
469 goto drop;
470 }
471 dst = skb_dst(skb);
472
473 /* IPv6 specs say nothing about it, but it is clear that we cannot
474 send redirects to source routed frames.
475 We don't send redirects to frames decapsulated from IPsec.
476 */
477 if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
478 struct in6_addr *target = NULL;
479 struct inet_peer *peer;
480 struct rt6_info *rt;
481
482 /*
483 * incoming and outgoing devices are the same
484 * send a redirect.
485 */
486
487 rt = (struct rt6_info *) dst;
488 if (rt->rt6i_flags & RTF_GATEWAY)
489 target = &rt->rt6i_gateway;
490 else
491 target = &hdr->daddr;
492
493 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
494
495 /* Limit redirects both by destination (here)
496 and by source (inside ndisc_send_redirect)
497 */
498 if (inet_peer_xrlim_allow(peer, 1*HZ))
499 ndisc_send_redirect(skb, target);
500 if (peer)
501 inet_putpeer(peer);
502 } else {
503 int addrtype = ipv6_addr_type(&hdr->saddr);
504
505 /* This check is security critical. */
506 if (addrtype == IPV6_ADDR_ANY ||
507 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
508 goto error;
509 if (addrtype & IPV6_ADDR_LINKLOCAL) {
510 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
511 ICMPV6_NOT_NEIGHBOUR, 0);
512 goto error;
513 }
514 }
515
516 mtu = ip6_dst_mtu_forward(dst);
517 if (mtu < IPV6_MIN_MTU)
518 mtu = IPV6_MIN_MTU;
519
520 if (ip6_pkt_too_big(skb, mtu)) {
521 /* Again, force OUTPUT device used as source address */
522 skb->dev = dst->dev;
523 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
524 __IP6_INC_STATS(net, ip6_dst_idev(dst),
525 IPSTATS_MIB_INTOOBIGERRORS);
526 __IP6_INC_STATS(net, ip6_dst_idev(dst),
527 IPSTATS_MIB_FRAGFAILS);
528 kfree_skb(skb);
529 return -EMSGSIZE;
530 }
531
532 if (skb_cow(skb, dst->dev->hard_header_len)) {
533 __IP6_INC_STATS(net, ip6_dst_idev(dst),
534 IPSTATS_MIB_OUTDISCARDS);
535 goto drop;
536 }
537
538 hdr = ipv6_hdr(skb);
539
540 /* Mangling hops number delayed to point after skb COW */
541
542 hdr->hop_limit--;
543
544 __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
545 __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
546 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
547 net, NULL, skb, skb->dev, dst->dev,
548 ip6_forward_finish);
549
550 error:
551 __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
552 drop:
553 kfree_skb(skb);
554 return -EINVAL;
555 }
556
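/* Copy per-packet metadata (dst, device, marks, priority, ...) from the
 * original skb to a freshly built fragment.
 */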
557 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
558 {
559 to->pkt_type = from->pkt_type;
560 to->priority = from->priority;
561 to->protocol = from->protocol;
562 skb_dst_drop(to);
563 skb_dst_set(to, dst_clone(skb_dst(from)));
564 to->dev = from->dev;
565 to->mark = from->mark;
566
567 #ifdef CONFIG_NET_SCHED
568 to->tc_index = from->tc_index;
569 #endif
570 nf_copy(to, from);
571 skb_copy_secmark(to, from);
572 }
573
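/*
 * ip6_fragment - split an oversized packet into fragments.  The fast path
 * reuses an existing frag_list when its geometry already matches the MTU;
 * otherwise the slow path copies the payload into newly allocated fragment
 * skbs.  Each fragment is handed to @output.
 */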
574 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
575 int (*output)(struct net *, struct sock *, struct sk_buff *))
576 {
577 struct sk_buff *frag;
578 struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
579 struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
580 inet6_sk(skb->sk) : NULL;
581 struct ipv6hdr *tmp_hdr;
582 struct frag_hdr *fh;
583 unsigned int mtu, hlen, left, len;
584 int hroom, troom;
585 __be32 frag_id;
586 int ptr, offset = 0, err = 0;
587 u8 *prevhdr, nexthdr = 0;
588
589 hlen = ip6_find_1stfragopt(skb, &prevhdr);
590 nexthdr = *prevhdr;
591
592 mtu = ip6_skb_dst_mtu(skb);
593
594 /* We must not fragment if the socket is set to force MTU discovery
595 * or if the skb is not generated by a local socket.
596 */
597 if (unlikely(!skb->ignore_df && skb->len > mtu))
598 goto fail_toobig;
599
600 if (IP6CB(skb)->frag_max_size) {
601 if (IP6CB(skb)->frag_max_size > mtu)
602 goto fail_toobig;
603
604 /* don't send fragments larger than what we received */
605 mtu = IP6CB(skb)->frag_max_size;
606 if (mtu < IPV6_MIN_MTU)
607 mtu = IPV6_MIN_MTU;
608 }
609
610 if (np && np->frag_size < mtu) {
611 if (np->frag_size)
612 mtu = np->frag_size;
613 }
614 if (mtu < hlen + sizeof(struct frag_hdr) + 8)
615 goto fail_toobig;
616 mtu -= hlen + sizeof(struct frag_hdr);
617
618 frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
619 &ipv6_hdr(skb)->saddr);
620
621 if (skb->ip_summed == CHECKSUM_PARTIAL &&
622 (err = skb_checksum_help(skb)))
623 goto fail;
624
625 hroom = LL_RESERVED_SPACE(rt->dst.dev);
626 if (skb_has_frag_list(skb)) {
627 int first_len = skb_pagelen(skb);
628 struct sk_buff *frag2;
629
630 if (first_len - hlen > mtu ||
631 ((first_len - hlen) & 7) ||
632 skb_cloned(skb) ||
633 skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
634 goto slow_path;
635
636 skb_walk_frags(skb, frag) {
637 /* Correct geometry. */
638 if (frag->len > mtu ||
639 ((frag->len & 7) && frag->next) ||
640 skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
641 goto slow_path_clean;
642
643 /* Partially cloned skb? */
644 if (skb_shared(frag))
645 goto slow_path_clean;
646
647 BUG_ON(frag->sk);
648 if (skb->sk) {
649 frag->sk = skb->sk;
650 frag->destructor = sock_wfree;
651 }
652 skb->truesize -= frag->truesize;
653 }
654
655 err = 0;
656 offset = 0;
657 /* BUILD HEADER */
658
659 *prevhdr = NEXTHDR_FRAGMENT;
660 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
661 if (!tmp_hdr) {
662 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
663 IPSTATS_MIB_FRAGFAILS);
664 err = -ENOMEM;
665 goto fail;
666 }
667 frag = skb_shinfo(skb)->frag_list;
668 skb_frag_list_init(skb);
669
670 __skb_pull(skb, hlen);
671 fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
672 __skb_push(skb, hlen);
673 skb_reset_network_header(skb);
674 memcpy(skb_network_header(skb), tmp_hdr, hlen);
675
676 fh->nexthdr = nexthdr;
677 fh->reserved = 0;
678 fh->frag_off = htons(IP6_MF);
679 fh->identification = frag_id;
680
681 first_len = skb_pagelen(skb);
682 skb->data_len = first_len - skb_headlen(skb);
683 skb->len = first_len;
684 ipv6_hdr(skb)->payload_len = htons(first_len -
685 sizeof(struct ipv6hdr));
686
687 dst_hold(&rt->dst);
688
689 for (;;) {
690 /* Prepare header of the next frame,
691 * before previous one went down. */
692 if (frag) {
693 frag->ip_summed = CHECKSUM_NONE;
694 skb_reset_transport_header(frag);
695 fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
696 __skb_push(frag, hlen);
697 skb_reset_network_header(frag);
698 memcpy(skb_network_header(frag), tmp_hdr,
699 hlen);
700 offset += skb->len - hlen - sizeof(struct frag_hdr);
701 fh->nexthdr = nexthdr;
702 fh->reserved = 0;
703 fh->frag_off = htons(offset);
704 if (frag->next)
705 fh->frag_off |= htons(IP6_MF);
706 fh->identification = frag_id;
707 ipv6_hdr(frag)->payload_len =
708 htons(frag->len -
709 sizeof(struct ipv6hdr));
710 ip6_copy_metadata(frag, skb);
711 }
712
713 err = output(net, sk, skb);
714 if (!err)
715 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
716 IPSTATS_MIB_FRAGCREATES);
717
718 if (err || !frag)
719 break;
720
721 skb = frag;
722 frag = skb->next;
723 skb->next = NULL;
724 }
725
726 kfree(tmp_hdr);
727
728 if (err == 0) {
729 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
730 IPSTATS_MIB_FRAGOKS);
731 ip6_rt_put(rt);
732 return 0;
733 }
734
735 kfree_skb_list(frag);
736
737 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
738 IPSTATS_MIB_FRAGFAILS);
739 ip6_rt_put(rt);
740 return err;
741
742 slow_path_clean:
743 skb_walk_frags(skb, frag2) {
744 if (frag2 == frag)
745 break;
746 frag2->sk = NULL;
747 frag2->destructor = NULL;
748 skb->truesize += frag2->truesize;
749 }
750 }
751
752 slow_path:
753 left = skb->len - hlen; /* Space per frame */
754 ptr = hlen; /* Where to start from */
755
756 /*
757 * Fragment the datagram.
758 */
759
760 *prevhdr = NEXTHDR_FRAGMENT;
761 troom = rt->dst.dev->needed_tailroom;
762
763 /*
764 * Keep copying data until we run out.
765 */
766 while (left > 0) {
767 len = left;
768 /* IF: it doesn't fit, use 'mtu' - the data space left */
769 if (len > mtu)
770 len = mtu;
771 /* IF: we are not sending up to and including the packet end
772 then align the next start on an eight byte boundary */
773 if (len < left) {
774 len &= ~7;
775 }
776
777 /* Allocate buffer */
778 frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
779 hroom + troom, GFP_ATOMIC);
780 if (!frag) {
781 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
782 IPSTATS_MIB_FRAGFAILS);
783 err = -ENOMEM;
784 goto fail;
785 }
786
787 /*
788 * Set up data on packet
789 */
790
791 ip6_copy_metadata(frag, skb);
792 skb_reserve(frag, hroom);
793 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
794 skb_reset_network_header(frag);
795 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
796 frag->transport_header = (frag->network_header + hlen +
797 sizeof(struct frag_hdr));
798
799 /*
800 * Charge the memory for the fragment to any owner
801 * it might possess
802 */
803 if (skb->sk)
804 skb_set_owner_w(frag, skb->sk);
805
806 /*
807 * Copy the packet header into the new buffer.
808 */
809 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
810
811 /*
812 * Build fragment header.
813 */
814 fh->nexthdr = nexthdr;
815 fh->reserved = 0;
816 fh->identification = frag_id;
817
818 /*
819 * Copy a block of the IP datagram.
820 */
821 BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
822 len));
823 left -= len;
824
825 fh->frag_off = htons(offset);
826 if (left > 0)
827 fh->frag_off |= htons(IP6_MF);
828 ipv6_hdr(frag)->payload_len = htons(frag->len -
829 sizeof(struct ipv6hdr));
830
831 ptr += len;
832 offset += len;
833
834 /*
835 * Put this fragment into the sending queue.
836 */
837 err = output(net, sk, frag);
838 if (err)
839 goto fail;
840
841 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
842 IPSTATS_MIB_FRAGCREATES);
843 }
844 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
845 IPSTATS_MIB_FRAGOKS);
846 consume_skb(skb);
847 return err;
848
849 fail_toobig:
850 if (skb->sk && dst_allfrag(skb_dst(skb)))
851 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
852
853 skb->dev = skb_dst(skb)->dev;
854 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
855 err = -EMSGSIZE;
856
857 fail:
858 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
859 IPSTATS_MIB_FRAGFAILS);
860 kfree_skb(skb);
861 return err;
862 }
863
864 static inline int ip6_rt_check(const struct rt6key *rt_key,
865 const struct in6_addr *fl_addr,
866 const struct in6_addr *addr_cache)
867 {
868 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
869 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
870 }
871
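/*
 * ip6_sk_dst_check - validate a cached socket dst against the current flow;
 * release it and return NULL when the destination, source subtree or
 * outgoing interface no longer matches.
 */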
872 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
873 struct dst_entry *dst,
874 const struct flowi6 *fl6)
875 {
876 struct ipv6_pinfo *np = inet6_sk(sk);
877 struct rt6_info *rt;
878
879 if (!dst)
880 goto out;
881
882 if (dst->ops->family != AF_INET6) {
883 dst_release(dst);
884 return NULL;
885 }
886
887 rt = (struct rt6_info *)dst;
888 /* Yes, checking route validity in the not-connected
889 * case is not very simple. Take into account,
890 * that we do not support routing by source, TOS,
891 * and MSG_DONTROUTE --ANK (980726)
892 *
893 * 1. ip6_rt_check(): If route was host route,
894 * check that cached destination is current.
895 * If it is network route, we still may
896 * check its validity using saved pointer
897 * to the last used address: daddr_cache.
898 * We do not want to save whole address now,
899 * (because main consumer of this service
900 * is tcp, which does not have this problem),
901 * so that the last trick works only on connected
902 * sockets.
903 * 2. oif also should be the same.
904 */
905 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
906 #ifdef CONFIG_IPV6_SUBTREES
907 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
908 #endif
909 (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
910 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
911 dst_release(dst);
912 dst = NULL;
913 }
914
915 out:
916 return dst;
917 }
918
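/*
 * ip6_dst_lookup_tail - common tail of the dst lookup helpers: pick a
 * source address if the flow still carries the unspecified one, perform
 * the route lookup, and (with optimistic DAD) fall back to the default
 * router's dst when the chosen source address is optimistic and the
 * nexthop neighbour is not yet valid.
 */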
919 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
920 struct dst_entry **dst, struct flowi6 *fl6)
921 {
922 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
923 struct neighbour *n;
924 struct rt6_info *rt;
925 #endif
926 int err;
927 int flags = 0;
928
929 /* The correct way to handle this would be to do
930 * ip6_route_get_saddr, and then ip6_route_output; however,
931 * the route-specific preferred source forces the
932 * ip6_route_output call _before_ ip6_route_get_saddr.
933 *
934 * In source specific routing (no src=any default route),
935 * ip6_route_output will fail given src=any saddr, though, so
936 * that's why we try it again later.
937 */
938 if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
939 struct rt6_info *rt;
940 bool had_dst = *dst != NULL;
941
942 if (!had_dst)
943 *dst = ip6_route_output(net, sk, fl6);
944 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
945 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
946 sk ? inet6_sk(sk)->srcprefs : 0,
947 &fl6->saddr);
948 if (err)
949 goto out_err_release;
950
951 /* If we had an erroneous initial result, pretend it
952 * never existed and let the SA-enabled version take
953 * over.
954 */
955 if (!had_dst && (*dst)->error) {
956 dst_release(*dst);
957 *dst = NULL;
958 }
959
960 if (fl6->flowi6_oif)
961 flags |= RT6_LOOKUP_F_IFACE;
962 }
963
964 if (!*dst)
965 *dst = ip6_route_output_flags(net, sk, fl6, flags);
966
967 err = (*dst)->error;
968 if (err)
969 goto out_err_release;
970
971 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
972 /*
973 * Here if the dst entry we've looked up
974 * has a neighbour entry that is in the INCOMPLETE
975 * state and the src address from the flow is
976 * marked as OPTIMISTIC, we release the found
977 * dst entry and replace it instead with the
978 * dst entry of the nexthop router
979 */
980 rt = (struct rt6_info *) *dst;
981 rcu_read_lock_bh();
982 n = __ipv6_neigh_lookup_noref(rt->dst.dev,
983 rt6_nexthop(rt, &fl6->daddr));
984 err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
985 rcu_read_unlock_bh();
986
987 if (err) {
988 struct inet6_ifaddr *ifp;
989 struct flowi6 fl_gw6;
990 int redirect;
991
992 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
993 (*dst)->dev, 1);
994
995 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
996 if (ifp)
997 in6_ifa_put(ifp);
998
999 if (redirect) {
1000 /*
1001 * We need to get the dst entry for the
1002 * default router instead
1003 */
1004 dst_release(*dst);
1005 memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1006 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1007 *dst = ip6_route_output(net, sk, &fl_gw6);
1008 err = (*dst)->error;
1009 if (err)
1010 goto out_err_release;
1011 }
1012 }
1013 #endif
1014
1015 return 0;
1016
1017 out_err_release:
1018 dst_release(*dst);
1019 *dst = NULL;
1020
1021 if (err == -ENETUNREACH)
1022 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1023 return err;
1024 }
1025
1026 /**
1027 * ip6_dst_lookup - perform route lookup on flow
1028 * @sk: socket which provides route info
1029 * @dst: pointer to dst_entry * for result
1030 * @fl6: flow to lookup
1031 *
1032 * This function performs a route lookup on the given flow.
1033 *
1034 * It returns zero on success, or a standard errno code on error.
1035 */
1036 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1037 struct flowi6 *fl6)
1038 {
1039 *dst = NULL;
1040 return ip6_dst_lookup_tail(net, sk, dst, fl6);
1041 }
1042 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1043
1044 /**
1045 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1046 * @sk: socket which provides route info
1047 * @fl6: flow to lookup
1048 * @final_dst: final destination address for ipsec lookup
1049 *
1050 * This function performs a route lookup on the given flow.
1051 *
1052 * It returns a valid dst pointer on success, or a pointer encoded
1053 * error code.
1054 */
1055 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1056 const struct in6_addr *final_dst)
1057 {
1058 struct dst_entry *dst = NULL;
1059 int err;
1060
1061 err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1062 if (err)
1063 return ERR_PTR(err);
1064 if (final_dst)
1065 fl6->daddr = *final_dst;
1066
1067 return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1068 }
1069 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1070
1071 /**
1072 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1073 * @sk: socket which provides the dst cache and route info
1074 * @fl6: flow to lookup
1075 * @final_dst: final destination address for ipsec lookup
1076 *
1077 * This function performs a route lookup on the given flow with the
1078 * possibility of using the cached route in the socket if it is valid.
1079 * It will take the socket dst lock when operating on the dst cache.
1080 * As a result, this function can only be used in process context.
1081 *
1082 * It returns a valid dst pointer on success, or a pointer encoded
1083 * error code.
1084 */
1085 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1086 const struct in6_addr *final_dst)
1087 {
1088 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1089
1090 dst = ip6_sk_dst_check(sk, dst, fl6);
1091 if (!dst)
1092 dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
1093
1094 return dst;
1095 }
1096 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1097
1098 static inline int ip6_ufo_append_data(struct sock *sk,
1099 struct sk_buff_head *queue,
1100 int getfrag(void *from, char *to, int offset, int len,
1101 int odd, struct sk_buff *skb),
1102 void *from, int length, int hh_len, int fragheaderlen,
1103 int exthdrlen, int transhdrlen, int mtu,
1104 unsigned int flags, const struct flowi6 *fl6)
1105
1106 {
1107 struct sk_buff *skb;
1108 int err;
1109
1110 /* There is support for UDP large send offload by network
1111 * device, so create one single skb packet containing complete
1112 * udp datagram
1113 */
1114 skb = skb_peek_tail(queue);
1115 if (!skb) {
1116 skb = sock_alloc_send_skb(sk,
1117 hh_len + fragheaderlen + transhdrlen + 20,
1118 (flags & MSG_DONTWAIT), &err);
1119 if (!skb)
1120 return err;
1121
1122 /* reserve space for Hardware header */
1123 skb_reserve(skb, hh_len);
1124
1125 /* create space for UDP/IP header */
1126 skb_put(skb, fragheaderlen + transhdrlen);
1127
1128 /* initialize network header pointer */
1129 skb_set_network_header(skb, exthdrlen);
1130
1131 /* initialize protocol header pointer */
1132 skb->transport_header = skb->network_header + fragheaderlen;
1133
1134 skb->protocol = htons(ETH_P_IPV6);
1135 skb->csum = 0;
1136
1137 __skb_queue_tail(queue, skb);
1138 } else if (skb_is_gso(skb)) {
1139 goto append;
1140 }
1141
1142 skb->ip_summed = CHECKSUM_PARTIAL;
1143 /* Specify the length of each IPv6 datagram fragment.
1144 * It has to be a multiple of 8.
1145 */
1146 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1147 sizeof(struct frag_hdr)) & ~7;
1148 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1149 skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
1150 &fl6->daddr,
1151 &fl6->saddr);
1152
1153 append:
1154 return skb_append_datato_frags(sk, skb, getfrag, from,
1155 (length - transhdrlen));
1156 }
1157
1158 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1159 gfp_t gfp)
1160 {
1161 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1162 }
1163
1164 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1165 gfp_t gfp)
1166 {
1167 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1168 }
1169
1170 static void ip6_append_data_mtu(unsigned int *mtu,
1171 int *maxfraglen,
1172 unsigned int fragheaderlen,
1173 struct sk_buff *skb,
1174 struct rt6_info *rt,
1175 unsigned int orig_mtu)
1176 {
1177 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1178 if (!skb) {
1179 /* first fragment, reserve header_len */
1180 *mtu = orig_mtu - rt->dst.header_len;
1181
1182 } else {
1183 /*
1184 * this fragment is not first, the headers
1185 * space is regarded as data space.
1186 */
1187 *mtu = orig_mtu;
1188 }
1189 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1190 + fragheaderlen - sizeof(struct frag_hdr);
1191 }
1192 }
1193
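/*
 * ip6_setup_cork - initialise cork state for a corked send: duplicate the
 * tx options, take a reference on the route and record the hop limit,
 * traffic class and fragment size (MTU) to use for the queued data.
 */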
1194 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1195 struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1196 struct rt6_info *rt, struct flowi6 *fl6)
1197 {
1198 struct ipv6_pinfo *np = inet6_sk(sk);
1199 unsigned int mtu;
1200 struct ipv6_txoptions *opt = ipc6->opt;
1201
1202 /*
1203 * setup for corking
1204 */
1205 if (opt) {
1206 if (WARN_ON(v6_cork->opt))
1207 return -EINVAL;
1208
1209 v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation);
1210 if (unlikely(!v6_cork->opt))
1211 return -ENOBUFS;
1212
1213 v6_cork->opt->tot_len = opt->tot_len;
1214 v6_cork->opt->opt_flen = opt->opt_flen;
1215 v6_cork->opt->opt_nflen = opt->opt_nflen;
1216
1217 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1218 sk->sk_allocation);
1219 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1220 return -ENOBUFS;
1221
1222 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1223 sk->sk_allocation);
1224 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1225 return -ENOBUFS;
1226
1227 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1228 sk->sk_allocation);
1229 if (opt->hopopt && !v6_cork->opt->hopopt)
1230 return -ENOBUFS;
1231
1232 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1233 sk->sk_allocation);
1234 if (opt->srcrt && !v6_cork->opt->srcrt)
1235 return -ENOBUFS;
1236
1237 /* need source address above miyazawa*/
1238 }
1239 dst_hold(&rt->dst);
1240 cork->base.dst = &rt->dst;
1241 cork->fl.u.ip6 = *fl6;
1242 v6_cork->hop_limit = ipc6->hlimit;
1243 v6_cork->tclass = ipc6->tclass;
1244 if (rt->dst.flags & DST_XFRM_TUNNEL)
1245 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1246 rt->dst.dev->mtu : dst_mtu(&rt->dst);
1247 else
1248 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1249 rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1250 if (np->frag_size < mtu) {
1251 if (np->frag_size)
1252 mtu = np->frag_size;
1253 }
1254 cork->base.fragsize = mtu;
1255 if (dst_allfrag(rt->dst.path))
1256 cork->base.flags |= IPCORK_ALLFRAG;
1257 cork->base.length = 0;
1258
1259 return 0;
1260 }
1261
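/*
 * __ip6_append_data - core of ip6_append_data(): copy data from the caller
 * via @getfrag into the tail of the pending queue, allocating new skbs on
 * fragment boundaries and handling UFO, checksum offload and tx timestamps.
 */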
1262 static int __ip6_append_data(struct sock *sk,
1263 struct flowi6 *fl6,
1264 struct sk_buff_head *queue,
1265 struct inet_cork *cork,
1266 struct inet6_cork *v6_cork,
1267 struct page_frag *pfrag,
1268 int getfrag(void *from, char *to, int offset,
1269 int len, int odd, struct sk_buff *skb),
1270 void *from, int length, int transhdrlen,
1271 unsigned int flags, struct ipcm6_cookie *ipc6,
1272 const struct sockcm_cookie *sockc)
1273 {
1274 struct sk_buff *skb, *skb_prev = NULL;
1275 unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
1276 int exthdrlen = 0;
1277 int dst_exthdrlen = 0;
1278 int hh_len;
1279 int copy;
1280 int err;
1281 int offset = 0;
1282 __u8 tx_flags = 0;
1283 u32 tskey = 0;
1284 struct rt6_info *rt = (struct rt6_info *)cork->dst;
1285 struct ipv6_txoptions *opt = v6_cork->opt;
1286 int csummode = CHECKSUM_NONE;
1287 unsigned int maxnonfragsize, headersize;
1288
1289 skb = skb_peek_tail(queue);
1290 if (!skb) {
1291 exthdrlen = opt ? opt->opt_flen : 0;
1292 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1293 }
1294
1295 mtu = cork->fragsize;
1296 orig_mtu = mtu;
1297
1298 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1299
1300 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1301 (opt ? opt->opt_nflen : 0);
1302 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1303 sizeof(struct frag_hdr);
1304
1305 headersize = sizeof(struct ipv6hdr) +
1306 (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1307 (dst_allfrag(&rt->dst) ?
1308 sizeof(struct frag_hdr) : 0) +
1309 rt->rt6i_nfheader_len;
1310
1311 if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1312 (sk->sk_protocol == IPPROTO_UDP ||
1313 sk->sk_protocol == IPPROTO_RAW)) {
1314 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1315 sizeof(struct ipv6hdr));
1316 goto emsgsize;
1317 }
1318
1319 if (ip6_sk_ignore_df(sk))
1320 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1321 else
1322 maxnonfragsize = mtu;
1323
1324 if (cork->length + length > maxnonfragsize - headersize) {
1325 emsgsize:
1326 ipv6_local_error(sk, EMSGSIZE, fl6,
1327 mtu - headersize +
1328 sizeof(struct ipv6hdr));
1329 return -EMSGSIZE;
1330 }
1331
1332 /* CHECKSUM_PARTIAL only with no extension headers and when
1333 * we are not going to fragment
1334 */
1335 if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1336 headersize == sizeof(struct ipv6hdr) &&
1337 length < mtu - headersize &&
1338 !(flags & MSG_MORE) &&
1339 rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1340 csummode = CHECKSUM_PARTIAL;
1341
1342 if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1343 sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
1344 if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1345 sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1346 tskey = sk->sk_tskey++;
1347 }
1348
1349 /*
1350 * Let's try using as much space as possible.
1351 * Use MTU if total length of the message fits into the MTU.
1352 * Otherwise, we need to reserve fragment header and
1353 * fragment alignment (= 8-15 octets, in total).
1354 *
1355 * Note that we may need to "move" the data from the tail of
1356 * the buffer to the new fragment when we split
1357 * the message.
1358 *
1359 * FIXME: It may be fragmented into multiple chunks
1360 * at once if non-fragmentable extension headers
1361 * are too large.
1362 * --yoshfuji
1363 */
1364
1365 cork->length += length;
1366 if (((length > mtu) ||
1367 (skb && skb_is_gso(skb))) &&
1368 (sk->sk_protocol == IPPROTO_UDP) &&
1369 (rt->dst.dev->features & NETIF_F_UFO) &&
1370 (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) {
1371 err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
1372 hh_len, fragheaderlen, exthdrlen,
1373 transhdrlen, mtu, flags, fl6);
1374 if (err)
1375 goto error;
1376 return 0;
1377 }
1378
1379 if (!skb)
1380 goto alloc_new_skb;
1381
1382 while (length > 0) {
1383 /* Check if the remaining data fits into current packet. */
1384 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1385 if (copy < length)
1386 copy = maxfraglen - skb->len;
1387
1388 if (copy <= 0) {
1389 char *data;
1390 unsigned int datalen;
1391 unsigned int fraglen;
1392 unsigned int fraggap;
1393 unsigned int alloclen;
1394 alloc_new_skb:
1395 /* There's no room in the current skb */
1396 if (skb)
1397 fraggap = skb->len - maxfraglen;
1398 else
1399 fraggap = 0;
1400 /* update mtu and maxfraglen if necessary */
1401 if (!skb || !skb_prev)
1402 ip6_append_data_mtu(&mtu, &maxfraglen,
1403 fragheaderlen, skb, rt,
1404 orig_mtu);
1405
1406 skb_prev = skb;
1407
1408 /*
1409 * If remaining data exceeds the mtu,
1410 * we know we need more fragment(s).
1411 */
1412 datalen = length + fraggap;
1413
1414 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1415 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1416 if ((flags & MSG_MORE) &&
1417 !(rt->dst.dev->features&NETIF_F_SG))
1418 alloclen = mtu;
1419 else
1420 alloclen = datalen + fragheaderlen;
1421
1422 alloclen += dst_exthdrlen;
1423
1424 if (datalen != length + fraggap) {
1425 /*
1426 * this is not the last fragment, the trailer
1427 * space is regarded as data space.
1428 */
1429 datalen += rt->dst.trailer_len;
1430 }
1431
1432 alloclen += rt->dst.trailer_len;
1433 fraglen = datalen + fragheaderlen;
1434
1435 /*
1436 * We just reserve space for fragment header.
1437 * Note: this may be overallocation if the message
1438 * (without MSG_MORE) fits into the MTU.
1439 */
1440 alloclen += sizeof(struct frag_hdr);
1441
1442 if (transhdrlen) {
1443 skb = sock_alloc_send_skb(sk,
1444 alloclen + hh_len,
1445 (flags & MSG_DONTWAIT), &err);
1446 } else {
1447 skb = NULL;
1448 if (atomic_read(&sk->sk_wmem_alloc) <=
1449 2 * sk->sk_sndbuf)
1450 skb = sock_wmalloc(sk,
1451 alloclen + hh_len, 1,
1452 sk->sk_allocation);
1453 if (unlikely(!skb))
1454 err = -ENOBUFS;
1455 }
1456 if (!skb)
1457 goto error;
1458 /*
1459 * Fill in the control structures
1460 */
1461 skb->protocol = htons(ETH_P_IPV6);
1462 skb->ip_summed = csummode;
1463 skb->csum = 0;
1464 /* reserve for fragmentation and ipsec header */
1465 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1466 dst_exthdrlen);
1467
1468 /* Only the initial fragment is time stamped */
1469 skb_shinfo(skb)->tx_flags = tx_flags;
1470 tx_flags = 0;
1471 skb_shinfo(skb)->tskey = tskey;
1472 tskey = 0;
1473
1474 /*
1475 * Find where to start putting bytes
1476 */
1477 data = skb_put(skb, fraglen);
1478 skb_set_network_header(skb, exthdrlen);
1479 data += fragheaderlen;
1480 skb->transport_header = (skb->network_header +
1481 fragheaderlen);
1482 if (fraggap) {
1483 skb->csum = skb_copy_and_csum_bits(
1484 skb_prev, maxfraglen,
1485 data + transhdrlen, fraggap, 0);
1486 skb_prev->csum = csum_sub(skb_prev->csum,
1487 skb->csum);
1488 data += fraggap;
1489 pskb_trim_unique(skb_prev, maxfraglen);
1490 }
1491 copy = datalen - transhdrlen - fraggap;
1492
1493 if (copy < 0) {
1494 err = -EINVAL;
1495 kfree_skb(skb);
1496 goto error;
1497 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1498 err = -EFAULT;
1499 kfree_skb(skb);
1500 goto error;
1501 }
1502
1503 offset += copy;
1504 length -= datalen - fraggap;
1505 transhdrlen = 0;
1506 exthdrlen = 0;
1507 dst_exthdrlen = 0;
1508
1509 /*
1510 * Put the packet on the pending queue
1511 */
1512 __skb_queue_tail(queue, skb);
1513 continue;
1514 }
1515
1516 if (copy > length)
1517 copy = length;
1518
1519 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1520 unsigned int off;
1521
1522 off = skb->len;
1523 if (getfrag(from, skb_put(skb, copy),
1524 offset, copy, off, skb) < 0) {
1525 __skb_trim(skb, off);
1526 err = -EFAULT;
1527 goto error;
1528 }
1529 } else {
1530 int i = skb_shinfo(skb)->nr_frags;
1531
1532 err = -ENOMEM;
1533 if (!sk_page_frag_refill(sk, pfrag))
1534 goto error;
1535
1536 if (!skb_can_coalesce(skb, i, pfrag->page,
1537 pfrag->offset)) {
1538 err = -EMSGSIZE;
1539 if (i == MAX_SKB_FRAGS)
1540 goto error;
1541
1542 __skb_fill_page_desc(skb, i, pfrag->page,
1543 pfrag->offset, 0);
1544 skb_shinfo(skb)->nr_frags = ++i;
1545 get_page(pfrag->page);
1546 }
1547 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1548 if (getfrag(from,
1549 page_address(pfrag->page) + pfrag->offset,
1550 offset, copy, skb->len, skb) < 0)
1551 goto error_efault;
1552
1553 pfrag->offset += copy;
1554 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1555 skb->len += copy;
1556 skb->data_len += copy;
1557 skb->truesize += copy;
1558 atomic_add(copy, &sk->sk_wmem_alloc);
1559 }
1560 offset += copy;
1561 length -= copy;
1562 }
1563
1564 return 0;
1565
1566 error_efault:
1567 err = -EFAULT;
1568 error:
1569 cork->length -= length;
1570 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1571 return err;
1572 }
1573
1574 int ip6_append_data(struct sock *sk,
1575 int getfrag(void *from, char *to, int offset, int len,
1576 int odd, struct sk_buff *skb),
1577 void *from, int length, int transhdrlen,
1578 struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1579 struct rt6_info *rt, unsigned int flags,
1580 const struct sockcm_cookie *sockc)
1581 {
1582 struct inet_sock *inet = inet_sk(sk);
1583 struct ipv6_pinfo *np = inet6_sk(sk);
1584 int exthdrlen;
1585 int err;
1586
1587 if (flags&MSG_PROBE)
1588 return 0;
1589 if (skb_queue_empty(&sk->sk_write_queue)) {
1590 /*
1591 * setup for corking
1592 */
1593 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1594 ipc6, rt, fl6);
1595 if (err)
1596 return err;
1597
1598 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1599 length += exthdrlen;
1600 transhdrlen += exthdrlen;
1601 } else {
1602 fl6 = &inet->cork.fl.u.ip6;
1603 transhdrlen = 0;
1604 }
1605
1606 return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1607 &np->cork, sk_page_frag(sk), getfrag,
1608 from, length, transhdrlen, flags, ipc6, sockc);
1609 }
1610 EXPORT_SYMBOL_GPL(ip6_append_data);
1611
1612 static void ip6_cork_release(struct inet_cork_full *cork,
1613 struct inet6_cork *v6_cork)
1614 {
1615 if (v6_cork->opt) {
1616 kfree(v6_cork->opt->dst0opt);
1617 kfree(v6_cork->opt->dst1opt);
1618 kfree(v6_cork->opt->hopopt);
1619 kfree(v6_cork->opt->srcrt);
1620 kfree(v6_cork->opt);
1621 v6_cork->opt = NULL;
1622 }
1623
1624 if (cork->base.dst) {
1625 dst_release(cork->base.dst);
1626 cork->base.dst = NULL;
1627 cork->base.flags &= ~IPCORK_ALLFRAG;
1628 }
1629 memset(&cork->fl, 0, sizeof(cork->fl));
1630 }
1631
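/*
 * __ip6_make_skb - collapse the queued skbs into a single packet, push the
 * extension headers and the IPv6 header, and release the cork state.
 */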
1632 struct sk_buff *__ip6_make_skb(struct sock *sk,
1633 struct sk_buff_head *queue,
1634 struct inet_cork_full *cork,
1635 struct inet6_cork *v6_cork)
1636 {
1637 struct sk_buff *skb, *tmp_skb;
1638 struct sk_buff **tail_skb;
1639 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1640 struct ipv6_pinfo *np = inet6_sk(sk);
1641 struct net *net = sock_net(sk);
1642 struct ipv6hdr *hdr;
1643 struct ipv6_txoptions *opt = v6_cork->opt;
1644 struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1645 struct flowi6 *fl6 = &cork->fl.u.ip6;
1646 unsigned char proto = fl6->flowi6_proto;
1647
1648 skb = __skb_dequeue(queue);
1649 if (!skb)
1650 goto out;
1651 tail_skb = &(skb_shinfo(skb)->frag_list);
1652
1653 /* move skb->data to ip header from ext header */
1654 if (skb->data < skb_network_header(skb))
1655 __skb_pull(skb, skb_network_offset(skb));
1656 while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1657 __skb_pull(tmp_skb, skb_network_header_len(skb));
1658 *tail_skb = tmp_skb;
1659 tail_skb = &(tmp_skb->next);
1660 skb->len += tmp_skb->len;
1661 skb->data_len += tmp_skb->len;
1662 skb->truesize += tmp_skb->truesize;
1663 tmp_skb->destructor = NULL;
1664 tmp_skb->sk = NULL;
1665 }
1666
1667 /* Allow local fragmentation. */
1668 skb->ignore_df = ip6_sk_ignore_df(sk);
1669
1670 *final_dst = fl6->daddr;
1671 __skb_pull(skb, skb_network_header_len(skb));
1672 if (opt && opt->opt_flen)
1673 ipv6_push_frag_opts(skb, opt, &proto);
1674 if (opt && opt->opt_nflen)
1675 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1676
1677 skb_push(skb, sizeof(struct ipv6hdr));
1678 skb_reset_network_header(skb);
1679 hdr = ipv6_hdr(skb);
1680
1681 ip6_flow_hdr(hdr, v6_cork->tclass,
1682 ip6_make_flowlabel(net, skb, fl6->flowlabel,
1683 np->autoflowlabel, fl6));
1684 hdr->hop_limit = v6_cork->hop_limit;
1685 hdr->nexthdr = proto;
1686 hdr->saddr = fl6->saddr;
1687 hdr->daddr = *final_dst;
1688
1689 skb->priority = sk->sk_priority;
1690 skb->mark = sk->sk_mark;
1691
1692 skb_dst_set(skb, dst_clone(&rt->dst));
1693 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1694 if (proto == IPPROTO_ICMPV6) {
1695 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1696
1697 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1698 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1699 }
1700
1701 ip6_cork_release(cork, v6_cork);
1702 out:
1703 return skb;
1704 }
1705
1706 int ip6_send_skb(struct sk_buff *skb)
1707 {
1708 struct net *net = sock_net(skb->sk);
1709 struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1710 int err;
1711
1712 err = ip6_local_out(net, skb->sk, skb);
1713 if (err) {
1714 if (err > 0)
1715 err = net_xmit_errno(err);
1716 if (err)
1717 IP6_INC_STATS(net, rt->rt6i_idev,
1718 IPSTATS_MIB_OUTDISCARDS);
1719 }
1720
1721 return err;
1722 }
1723
1724 int ip6_push_pending_frames(struct sock *sk)
1725 {
1726 struct sk_buff *skb;
1727
1728 skb = ip6_finish_skb(sk);
1729 if (!skb)
1730 return 0;
1731
1732 return ip6_send_skb(skb);
1733 }
1734 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1735
1736 static void __ip6_flush_pending_frames(struct sock *sk,
1737 struct sk_buff_head *queue,
1738 struct inet_cork_full *cork,
1739 struct inet6_cork *v6_cork)
1740 {
1741 struct sk_buff *skb;
1742
1743 while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1744 if (skb_dst(skb))
1745 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1746 IPSTATS_MIB_OUTDISCARDS);
1747 kfree_skb(skb);
1748 }
1749
1750 ip6_cork_release(cork, v6_cork);
1751 }
1752
1753 void ip6_flush_pending_frames(struct sock *sk)
1754 {
1755 __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1756 &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1757 }
1758 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1759
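/*
 * ip6_make_skb - one-shot variant of the append/push pair: build the packet
 * on a private queue with its own cork state instead of the socket's
 * pending write queue.
 */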
1760 struct sk_buff *ip6_make_skb(struct sock *sk,
1761 int getfrag(void *from, char *to, int offset,
1762 int len, int odd, struct sk_buff *skb),
1763 void *from, int length, int transhdrlen,
1764 struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1765 struct rt6_info *rt, unsigned int flags,
1766 const struct sockcm_cookie *sockc)
1767 {
1768 struct inet_cork_full cork;
1769 struct inet6_cork v6_cork;
1770 struct sk_buff_head queue;
1771 int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1772 int err;
1773
1774 if (flags & MSG_PROBE)
1775 return NULL;
1776
1777 __skb_queue_head_init(&queue);
1778
1779 cork.base.flags = 0;
1780 cork.base.addr = 0;
1781 cork.base.opt = NULL;
1782 v6_cork.opt = NULL;
1783 err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
1784 if (err)
1785 return ERR_PTR(err);
1786
1787 if (ipc6->dontfrag < 0)
1788 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1789
1790 err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1791 &current->task_frag, getfrag, from,
1792 length + exthdrlen, transhdrlen + exthdrlen,
1793 flags, ipc6, sockc);
1794 if (err) {
1795 __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1796 return ERR_PTR(err);
1797 }
1798
1799 return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1800 }