net/ipv4/ip_output.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * The Internet Protocol (IP) output module.
7 *
8 * Version: $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
9 *
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Donald Becker, <becker@super.org>
13 * Alan Cox, <Alan.Cox@linux.org>
14 * Richard Underwood
15 * Stefan Becker, <stefanb@yello.ping.de>
16 * Jorge Cwik, <jorge@laser.satlink.net>
17 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
18 * Hirokazu Takahashi, <taka@valinux.co.jp>
19 *
20 * See ip_input.c for original log
21 *
22 * Fixes:
23 * Alan Cox : Missing nonblock feature in ip_build_xmit.
24 * Mike Kilburn : htons() missing in ip_build_xmit.
25 * Bradford Johnson: Fix faulty handling of some frames when
26 * no route is found.
27 * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit
28 * (in case the packet is not accepted by
29 * output firewall rules)
30 * Mike McLagan : Routing by source
31 * Alexey Kuznetsov: use new route cache
32 * Andi Kleen: Fix broken PMTU recovery and remove
33 * some redundant tests.
34 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
35 * Andi Kleen : Replace ip_reply with ip_send_reply.
36 * Andi Kleen : Split fast and slow ip_build_xmit path
37 * for decreased register pressure on x86
38 * and more readability.
39 * Marc Boucher : When call_out_firewall returns FW_QUEUE,
40 * silently drop skb instead of failing with -EPERM.
41 * Detlev Wengorz : Copy protocol for fragments.
42 * Hirokazu Takahashi: HW checksumming for outgoing UDP
43 * datagrams.
44 * Hirokazu Takahashi: sendfile() on UDP works now.
45 */
46
47 #include <asm/uaccess.h>
48 #include <asm/system.h>
49 #include <linux/module.h>
50 #include <linux/types.h>
51 #include <linux/kernel.h>
52 #include <linux/sched.h>
53 #include <linux/mm.h>
54 #include <linux/string.h>
55 #include <linux/errno.h>
56 #include <linux/config.h>
57
58 #include <linux/socket.h>
59 #include <linux/sockios.h>
60 #include <linux/in.h>
61 #include <linux/inet.h>
62 #include <linux/netdevice.h>
63 #include <linux/etherdevice.h>
64 #include <linux/proc_fs.h>
65 #include <linux/stat.h>
66 #include <linux/init.h>
67
68 #include <net/snmp.h>
69 #include <net/ip.h>
70 #include <net/protocol.h>
71 #include <net/route.h>
72 #include <net/tcp.h>
73 #include <net/udp.h>
74 #include <linux/skbuff.h>
75 #include <net/sock.h>
76 #include <net/arp.h>
77 #include <net/icmp.h>
78 #include <net/raw.h>
79 #include <net/checksum.h>
80 #include <net/inetpeer.h>
81 #include <net/checksum.h>
82 #include <linux/igmp.h>
83 #include <linux/netfilter_ipv4.h>
84 #include <linux/netfilter_bridge.h>
85 #include <linux/mroute.h>
86 #include <linux/netlink.h>
87
88 /*
89 * Shall we try to damage output packets if routing dev changes?
90 */
91
92 int sysctl_ip_dynaddr;
93 int sysctl_ip_default_ttl = IPDEFTTL;
94
95 /* Generate a checksum for an outgoing IP datagram. */
96 __inline__ void ip_send_check(struct iphdr *iph)
97 {
98 iph->check = 0;
99 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
100 }
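/*
 * Note added for exposition (not in the original source): ip_fast_csum()
 * computes the standard 16-bit ones'-complement checksum over iph->ihl
 * 32-bit words. Because the check field is zeroed first, a receiver that
 * runs the same sum over a header which still contains its checksum gets
 * 0 back for an intact header - which is the test the input path performs.
 */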
101
102 /* dev_loopback_xmit for use with netfilter. */
103 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
104 {
105 newskb->mac.raw = newskb->data;
106 __skb_pull(newskb, newskb->nh.raw - newskb->data);
107 newskb->pkt_type = PACKET_LOOPBACK;
108 newskb->ip_summed = CHECKSUM_UNNECESSARY;
109 BUG_TRAP(newskb->dst);
110 nf_reset(newskb);
111 netif_rx(newskb);
112 return 0;
113 }
114
115 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
116 {
117 int ttl = inet->uc_ttl;
118
119 if (ttl < 0)
120 ttl = dst_metric(dst, RTAX_HOPLIMIT);
121 return ttl;
122 }
123
124 /*
125 * Add an ip header to a skbuff and send it out.
126 *
127 */
128 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
129 u32 saddr, u32 daddr, struct ip_options *opt)
130 {
131 struct inet_sock *inet = inet_sk(sk);
132 struct rtable *rt = (struct rtable *)skb->dst;
133 struct iphdr *iph;
134
135 /* Build the IP header. */
136 if (opt)
137 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen);
138 else
139 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr));
140
141 iph->version = 4;
142 iph->ihl = 5;
143 iph->tos = inet->tos;
144 if (ip_dont_fragment(sk, &rt->u.dst))
145 iph->frag_off = htons(IP_DF);
146 else
147 iph->frag_off = 0;
148 iph->ttl = ip_select_ttl(inet, &rt->u.dst);
149 iph->daddr = rt->rt_dst;
150 iph->saddr = rt->rt_src;
151 iph->protocol = sk->sk_protocol;
152 iph->tot_len = htons(skb->len);
153 ip_select_ident(iph, &rt->u.dst, sk);
154 skb->nh.iph = iph;
155
156 if (opt && opt->optlen) {
157 iph->ihl += opt->optlen>>2;
158 ip_options_build(skb, opt, daddr, rt, 0);
159 }
160 ip_send_check(iph);
161
162 skb->priority = sk->sk_priority;
163
164 /* Send it out. */
165 return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
166 dst_output);
167 }
168
169 static inline int ip_finish_output2(struct sk_buff *skb)
170 {
171 struct dst_entry *dst = skb->dst;
172 struct hh_cache *hh = dst->hh;
173 struct net_device *dev = dst->dev;
174 int hh_len = LL_RESERVED_SPACE(dev);
175
176 /* Be paranoid, rather than too clever. */
177 if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
178 struct sk_buff *skb2;
179
180 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
181 if (skb2 == NULL) {
182 kfree_skb(skb);
183 return -ENOMEM;
184 }
185 if (skb->sk)
186 skb_set_owner_w(skb2, skb->sk);
187 kfree_skb(skb);
188 skb = skb2;
189 }
190
191 #ifdef CONFIG_BRIDGE_NETFILTER
192 /* bridge-netfilter defers calling some IP hooks to the bridge layer
193 * and still needs the conntrack reference.
194 */
195 if (skb->nf_bridge == NULL)
196 #endif
197 nf_reset(skb);
198
199 if (hh) {
200 int hh_alen;
201
202 read_lock_bh(&hh->hh_lock);
203 hh_alen = HH_DATA_ALIGN(hh->hh_len);
204 memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
205 read_unlock_bh(&hh->hh_lock);
206 skb_push(skb, hh->hh_len);
207 return hh->hh_output(skb);
208 } else if (dst->neighbour)
209 return dst->neighbour->output(skb);
210
211 if (net_ratelimit())
212 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
213 kfree_skb(skb);
214 return -EINVAL;
215 }
216
217 int ip_finish_output(struct sk_buff *skb)
218 {
219 struct net_device *dev = skb->dst->dev;
220
221 skb->dev = dev;
222 skb->protocol = htons(ETH_P_IP);
223
224 return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
225 ip_finish_output2);
226 }
227
228 int ip_mc_output(struct sk_buff *skb)
229 {
230 struct sock *sk = skb->sk;
231 struct rtable *rt = (struct rtable*)skb->dst;
232 struct net_device *dev = rt->u.dst.dev;
233
234 /*
235 * If the indicated interface is up and running, send the packet.
236 */
237 IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
238
239 skb->dev = dev;
240 skb->protocol = htons(ETH_P_IP);
241
242 /*
243 * Multicasts are looped back for other local users
244 */
245
246 if (rt->rt_flags&RTCF_MULTICAST) {
247 if ((!sk || inet_sk(sk)->mc_loop)
248 #ifdef CONFIG_IP_MROUTE
249 /* Small optimization: do not loop back non-local frames that
250 were returned after forwarding; they will be dropped by
251 ip_mr_input in any case.
252 Note that local frames are looped back to be delivered to
253 local recipients.
254
255 This check is duplicated in ip_mr_input at the moment.
256 */
257 && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
258 #endif
259 ) {
260 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
261 if (newskb)
262 NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
263 newskb->dev,
264 ip_dev_loopback_xmit);
265 }
266
267 /* Multicasts with ttl 0 must not go beyond the host */
268
269 if (skb->nh.iph->ttl == 0) {
270 kfree_skb(skb);
271 return 0;
272 }
273 }
274
275 if (rt->rt_flags&RTCF_BROADCAST) {
276 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
277 if (newskb)
278 NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
279 newskb->dev, ip_dev_loopback_xmit);
280 }
281
282 if (skb->len > dst_mtu(&rt->u.dst))
283 return ip_fragment(skb, ip_finish_output);
284 else
285 return ip_finish_output(skb);
286 }
287
288 int ip_output(struct sk_buff *skb)
289 {
290 IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
291
292 if (skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->tso_size)
293 return ip_fragment(skb, ip_finish_output);
294 else
295 return ip_finish_output(skb);
296 }
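/*
 * A note for the reader (added, not part of the original file): packets
 * queued through NF_IP_LOCAL_OUT above are handed to dst_output(), which
 * simply invokes skb->dst->output(). For unicast routes that hook is
 * ip_output(), for multicast routes it is ip_mc_output(), so both paths
 * eventually funnel into ip_finish_output() and the POST_ROUTING hook.
 */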
297
298 int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
299 {
300 struct sock *sk = skb->sk;
301 struct inet_sock *inet = inet_sk(sk);
302 struct ip_options *opt = inet->opt;
303 struct rtable *rt;
304 struct iphdr *iph;
305
306 /* Skip all of this if the packet is already routed,
307 * e.g. by something like SCTP.
308 */
309 rt = (struct rtable *) skb->dst;
310 if (rt != NULL)
311 goto packet_routed;
312
313 /* Make sure we can route this packet. */
314 rt = (struct rtable *)__sk_dst_check(sk, 0);
315 if (rt == NULL) {
316 u32 daddr;
317
318 /* Use correct destination address if we have options. */
319 daddr = inet->daddr;
320 if(opt && opt->srr)
321 daddr = opt->faddr;
322
323 {
324 struct flowi fl = { .oif = sk->sk_bound_dev_if,
325 .nl_u = { .ip4_u =
326 { .daddr = daddr,
327 .saddr = inet->saddr,
328 .tos = RT_CONN_FLAGS(sk) } },
329 .proto = sk->sk_protocol,
330 .uli_u = { .ports =
331 { .sport = inet->sport,
332 .dport = inet->dport } } };
333
334 /* If this fails, the retransmission mechanism of the transport
335 * layer will keep trying until a route appears or the connection
336 * times itself out.
337 */
338 if (ip_route_output_flow(&rt, &fl, sk, 0))
339 goto no_route;
340 }
341 __sk_dst_set(sk, &rt->u.dst);
342 tcp_v4_setup_caps(sk, &rt->u.dst);
343 }
344 skb->dst = dst_clone(&rt->u.dst);
345
346 packet_routed:
347 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
348 goto no_route;
349
350 /* OK, we know where to send it, allocate and build IP header. */
351 iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
352 *((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
353 iph->tot_len = htons(skb->len);
354 if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
355 iph->frag_off = htons(IP_DF);
356 else
357 iph->frag_off = 0;
358 iph->ttl = ip_select_ttl(inet, &rt->u.dst);
359 iph->protocol = sk->sk_protocol;
360 iph->saddr = rt->rt_src;
361 iph->daddr = rt->rt_dst;
362 skb->nh.iph = iph;
363 /* The transport layer sets skb->h.foo itself. */
364
365 if (opt && opt->optlen) {
366 iph->ihl += opt->optlen >> 2;
367 ip_options_build(skb, opt, inet->daddr, rt, 0);
368 }
369
370 ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs);
371
372 /* Add an IP checksum. */
373 ip_send_check(iph);
374
375 skb->priority = sk->sk_priority;
376
377 return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
378 dst_output);
379
380 no_route:
381 IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
382 kfree_skb(skb);
383 return -EHOSTUNREACH;
384 }
385
386
387 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
388 {
389 to->pkt_type = from->pkt_type;
390 to->priority = from->priority;
391 to->protocol = from->protocol;
392 to->security = from->security;
393 dst_release(to->dst);
394 to->dst = dst_clone(from->dst);
395 to->dev = from->dev;
396
397 /* Copy the flags to each fragment. */
398 IPCB(to)->flags = IPCB(from)->flags;
399
400 #ifdef CONFIG_NET_SCHED
401 to->tc_index = from->tc_index;
402 #endif
403 #ifdef CONFIG_NETFILTER
404 to->nfmark = from->nfmark;
405 to->nfcache = from->nfcache;
406 /* Connection association is same as pre-frag packet */
407 nf_conntrack_put(to->nfct);
408 to->nfct = from->nfct;
409 nf_conntrack_get(to->nfct);
410 to->nfctinfo = from->nfctinfo;
411 #ifdef CONFIG_BRIDGE_NETFILTER
412 nf_bridge_put(to->nf_bridge);
413 to->nf_bridge = from->nf_bridge;
414 nf_bridge_get(to->nf_bridge);
415 #endif
416 #endif
417 }
418
419 /*
420 * This IP datagram is too large to be sent in one piece. Break it up into
421 * smaller pieces (each consisting of an IP header plus a block of the
422 * data from the original IP datagram) that will still fit in a single
423 * device frame, and queue each such frame for sending.
424 */
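/*
 * Worked example (illustration only, not from the original source):
 * a 4000-byte datagram (20-byte header, 3980 bytes of data) sent over a
 * 1500-byte MTU is split into three fragments carrying 1480, 1480 and
 * 1020 data bytes. Their frag_off fields hold offsets 0, 185 and 370
 * (in 8-byte units, i.e. byte offsets 0, 1480 and 2960); MF is set on
 * the first two fragments and clear on the last.
 */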
425
426 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
427 {
428 struct iphdr *iph;
429 int raw = 0;
430 int ptr;
431 struct net_device *dev;
432 struct sk_buff *skb2;
433 unsigned int mtu, hlen, left, len, ll_rs;
434 int offset;
435 int not_last_frag;
436 struct rtable *rt = (struct rtable*)skb->dst;
437 int err = 0;
438
439 dev = rt->u.dst.dev;
440
441 /*
442 * Point into the IP datagram header.
443 */
444
445 iph = skb->nh.iph;
446
447 if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
448 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
449 htonl(dst_mtu(&rt->u.dst)));
450 kfree_skb(skb);
451 return -EMSGSIZE;
452 }
453
454 /*
455 * Setup starting values.
456 */
457
458 hlen = iph->ihl * 4;
459 mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */
460
461 /* When frag_list is given, use it. First, check its validity:
462 * some transformers could create a wrong frag_list or break an existing
463 * one; that is not prohibited. In this case fall back to copying.
464 *
465 * LATER: this step can be merged into the real generation of fragments;
466 * we can switch to copying when we see the first bad fragment.
467 */
468 if (skb_shinfo(skb)->frag_list) {
469 struct sk_buff *frag;
470 int first_len = skb_pagelen(skb);
471
472 if (first_len - hlen > mtu ||
473 ((first_len - hlen) & 7) ||
474 (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
475 skb_cloned(skb))
476 goto slow_path;
477
478 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
479 /* Correct geometry. */
480 if (frag->len > mtu ||
481 ((frag->len & 7) && frag->next) ||
482 skb_headroom(frag) < hlen)
483 goto slow_path;
484
485 /* Partially cloned skb? */
486 if (skb_shared(frag))
487 goto slow_path;
488
489 BUG_ON(frag->sk);
490 if (skb->sk) {
491 sock_hold(skb->sk);
492 frag->sk = skb->sk;
493 frag->destructor = sock_wfree;
494 skb->truesize -= frag->truesize;
495 }
496 }
497
498 /* Everything is OK. Generate! */
499
500 err = 0;
501 offset = 0;
502 frag = skb_shinfo(skb)->frag_list;
503 skb_shinfo(skb)->frag_list = NULL;
504 skb->data_len = first_len - skb_headlen(skb);
505 skb->len = first_len;
506 iph->tot_len = htons(first_len);
507 iph->frag_off = htons(IP_MF);
508 ip_send_check(iph);
509
510 for (;;) {
511 /* Prepare the header of the next frame
512 * before the previous one goes down. */
513 if (frag) {
514 frag->ip_summed = CHECKSUM_NONE;
515 frag->h.raw = frag->data;
516 frag->nh.raw = __skb_push(frag, hlen);
517 memcpy(frag->nh.raw, iph, hlen);
518 iph = frag->nh.iph;
519 iph->tot_len = htons(frag->len);
520 ip_copy_metadata(frag, skb);
521 if (offset == 0)
522 ip_options_fragment(frag);
523 offset += skb->len - hlen;
524 iph->frag_off = htons(offset>>3);
525 if (frag->next != NULL)
526 iph->frag_off |= htons(IP_MF);
527 /* Ready, complete checksum */
528 ip_send_check(iph);
529 }
530
531 err = output(skb);
532
533 if (err || !frag)
534 break;
535
536 skb = frag;
537 frag = skb->next;
538 skb->next = NULL;
539 }
540
541 if (err == 0) {
542 IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
543 return 0;
544 }
545
546 while (frag) {
547 skb = frag->next;
548 kfree_skb(frag);
549 frag = skb;
550 }
551 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
552 return err;
553 }
554
555 slow_path:
556 left = skb->len - hlen; /* Space per frame */
557 ptr = raw + hlen; /* Where to start from */
558
559 #ifdef CONFIG_BRIDGE_NETFILTER
560 /* for bridged IP traffic encapsulated inside e.g. a vlan header,
561 * we need to make room for the encapsulating header */
562 ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, nf_bridge_pad(skb));
563 mtu -= nf_bridge_pad(skb);
564 #else
565 ll_rs = LL_RESERVED_SPACE(rt->u.dst.dev);
566 #endif
567 /*
568 * Fragment the datagram.
569 */
570
571 offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
572 not_last_frag = iph->frag_off & htons(IP_MF);
573
574 /*
575 * Keep copying data until we run out.
576 */
577
578 while(left > 0) {
579 len = left;
580 /* IF: it doesn't fit, use 'mtu' - the data space left */
581 if (len > mtu)
582 len = mtu;
583 /* IF: we are not sending up to and including the packet end,
584 then align the next start on an eight byte boundary */
585 if (len < left) {
586 len &= ~7;
587 }
588 /*
589 * Allocate buffer.
590 */
591
592 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
593 NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
594 err = -ENOMEM;
595 goto fail;
596 }
597
598 /*
599 * Set up data on packet
600 */
601
602 ip_copy_metadata(skb2, skb);
603 skb_reserve(skb2, ll_rs);
604 skb_put(skb2, len + hlen);
605 skb2->nh.raw = skb2->data;
606 skb2->h.raw = skb2->data + hlen;
607
608 /*
609 * Charge the memory for the fragment to any owner
610 * it might possess
611 */
612
613 if (skb->sk)
614 skb_set_owner_w(skb2, skb->sk);
615
616 /*
617 * Copy the packet header into the new buffer.
618 */
619
620 memcpy(skb2->nh.raw, skb->data, hlen);
621
622 /*
623 * Copy a block of the IP datagram.
624 */
625 if (skb_copy_bits(skb, ptr, skb2->h.raw, len))
626 BUG();
627 left -= len;
628
629 /*
630 * Fill in the new header fields.
631 */
632 iph = skb2->nh.iph;
633 iph->frag_off = htons((offset >> 3));
634
635 /* ANK: dirty, but effective trick. Upgrade options only if
636 * the segment to be fragmented was THE FIRST (otherwise,
637 * options are already fixed) and make it ONCE
638 * on the initial skb, so that all the following fragments
639 * will inherit fixed options.
640 */
641 if (offset == 0)
642 ip_options_fragment(skb);
643
644 /*
645 * Added AC : If we are fragmenting a fragment that's not the
646 * last fragment then keep the MF bit set on each fragment
647 */
648 if (left > 0 || not_last_frag)
649 iph->frag_off |= htons(IP_MF);
650 ptr += len;
651 offset += len;
652
653 /*
654 * Put this fragment into the sending queue.
655 */
656
657 IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
658
659 iph->tot_len = htons(len + hlen);
660
661 ip_send_check(iph);
662
663 err = output(skb2);
664 if (err)
665 goto fail;
666 }
667 kfree_skb(skb);
668 IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
669 return err;
670
671 fail:
672 kfree_skb(skb);
673 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
674 return err;
675 }
676
677 int
678 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
679 {
680 struct iovec *iov = from;
681
682 if (skb->ip_summed == CHECKSUM_HW) {
683 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
684 return -EFAULT;
685 } else {
686 unsigned int csum = 0;
687 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
688 return -EFAULT;
689 skb->csum = csum_block_add(skb->csum, csum, odd);
690 }
691 return 0;
692 }
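/*
 * Commentary (added, not in the original): ip_generic_getfrag() is the
 * getfrag callback used for plain user-space iovecs. When the device will
 * checksum the packet (CHECKSUM_HW) a straight copy is enough; otherwise
 * the copy and the partial checksum are folded into a single pass and
 * accumulated into skb->csum.
 */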
693
694 static inline unsigned int
695 csum_page(struct page *page, int offset, int copy)
696 {
697 char *kaddr;
698 unsigned int csum;
699 kaddr = kmap(page);
700 csum = csum_partial(kaddr + offset, copy, 0);
701 kunmap(page);
702 return csum;
703 }
704
705 /*
706 * ip_append_data() and ip_append_page() can make one large IP datagram
707 * from many pieces of data. Each piece will be held on the socket
708 * until ip_push_pending_frames() is called. Each piece can be a page
709 * or non-page data.
710 *
711 * Transport protocols other than UDP - e.g. raw sockets - can
712 * potentially use this interface as well.
713 *
714 * LATER: length must be adjusted by the pad at the tail, when that is required.
715 */
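/*
 * Rough usage sketch (added for illustration; simplified from the way a
 * datagram protocol such as UDP drives this interface - the local variable
 * names are hypothetical):
 *
 *	err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len,
 *			     sizeof(struct udphdr), &ipc, rt,
 *			     msg->msg_flags);
 *	if (err)
 *		ip_flush_pending_frames(sk);
 *	else if (!(msg->msg_flags & MSG_MORE))
 *		err = ip_push_pending_frames(sk);
 *
 * Each call appends to sk->sk_write_queue; nothing reaches the wire until
 * ip_push_pending_frames() builds the final IP header and transmits.
 */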
716 int ip_append_data(struct sock *sk,
717 int getfrag(void *from, char *to, int offset, int len,
718 int odd, struct sk_buff *skb),
719 void *from, int length, int transhdrlen,
720 struct ipcm_cookie *ipc, struct rtable *rt,
721 unsigned int flags)
722 {
723 struct inet_sock *inet = inet_sk(sk);
724 struct sk_buff *skb;
725
726 struct ip_options *opt = NULL;
727 int hh_len;
728 int exthdrlen;
729 int mtu;
730 int copy;
731 int err;
732 int offset = 0;
733 unsigned int maxfraglen, fragheaderlen;
734 int csummode = CHECKSUM_NONE;
735
736 if (flags&MSG_PROBE)
737 return 0;
738
739 if (skb_queue_empty(&sk->sk_write_queue)) {
740 /*
741 * setup for corking.
742 */
743 opt = ipc->opt;
744 if (opt) {
745 if (inet->cork.opt == NULL) {
746 inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
747 if (unlikely(inet->cork.opt == NULL))
748 return -ENOBUFS;
749 }
750 memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
751 inet->cork.flags |= IPCORK_OPT;
752 inet->cork.addr = ipc->addr;
753 }
754 dst_hold(&rt->u.dst);
755 inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
756 inet->cork.rt = rt;
757 inet->cork.length = 0;
758 sk->sk_sndmsg_page = NULL;
759 sk->sk_sndmsg_off = 0;
760 if ((exthdrlen = rt->u.dst.header_len) != 0) {
761 length += exthdrlen;
762 transhdrlen += exthdrlen;
763 }
764 } else {
765 rt = inet->cork.rt;
766 if (inet->cork.flags & IPCORK_OPT)
767 opt = inet->cork.opt;
768
769 transhdrlen = 0;
770 exthdrlen = 0;
771 mtu = inet->cork.fragsize;
772 }
773 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
774
775 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
776 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
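/* Illustration (added, not in the original): with a 1500-byte mtu and no IP
 * options, fragheaderlen = 20 and maxfraglen = ((1500 - 20) & ~7) + 20 = 1500;
 * with 4 bytes of options, fragheaderlen = 24 and
 * maxfraglen = ((1476) & ~7) + 24 = 1496, so the data part of every
 * fragment stays a multiple of 8 bytes as fragmentation requires.
 */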
777
778 if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
779 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
780 return -EMSGSIZE;
781 }
782
783 /*
784 * transhdrlen > 0 means that this is the first fragment and we hope
785 * it will not be fragmented later.
786 */
787 if (transhdrlen &&
788 length + fragheaderlen <= mtu &&
789 rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
790 !exthdrlen)
791 csummode = CHECKSUM_HW;
792
793 inet->cork.length += length;
794
795 /* So, what's going on in the loop below?
796 *
797 * We use the calculated fragment length to generate a chain of skbs;
798 * each segment is an IP fragment, ready for sending to the network once
799 * the appropriate IP header has been added.
800 */
801
802 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
803 goto alloc_new_skb;
804
805 while (length > 0) {
806 /* Check if the remaining data fits into current packet. */
807 copy = mtu - skb->len;
808 if (copy < length)
809 copy = maxfraglen - skb->len;
810 if (copy <= 0) {
811 char *data;
812 unsigned int datalen;
813 unsigned int fraglen;
814 unsigned int fraggap;
815 unsigned int alloclen;
816 struct sk_buff *skb_prev;
817 alloc_new_skb:
818 skb_prev = skb;
819 if (skb_prev)
820 fraggap = skb_prev->len - maxfraglen;
821 else
822 fraggap = 0;
823
824 /*
825 * If remaining data exceeds the mtu,
826 * we know we need more fragment(s).
827 */
828 datalen = length + fraggap;
829 if (datalen > mtu - fragheaderlen)
830 datalen = maxfraglen - fragheaderlen;
831 fraglen = datalen + fragheaderlen;
832
833 if ((flags & MSG_MORE) &&
834 !(rt->u.dst.dev->features&NETIF_F_SG))
835 alloclen = mtu;
836 else
837 alloclen = datalen + fragheaderlen;
838
839 /* The last fragment gets additional space at tail.
840 * Note, with MSG_MORE we overallocate on fragments,
841 * because we have no idea what fragment will be
842 * the last.
843 */
844 if (datalen == length)
845 alloclen += rt->u.dst.trailer_len;
846
847 if (transhdrlen) {
848 skb = sock_alloc_send_skb(sk,
849 alloclen + hh_len + 15,
850 (flags & MSG_DONTWAIT), &err);
851 } else {
852 skb = NULL;
853 if (atomic_read(&sk->sk_wmem_alloc) <=
854 2 * sk->sk_sndbuf)
855 skb = sock_wmalloc(sk,
856 alloclen + hh_len + 15, 1,
857 sk->sk_allocation);
858 if (unlikely(skb == NULL))
859 err = -ENOBUFS;
860 }
861 if (skb == NULL)
862 goto error;
863
864 /*
865 * Fill in the control structures
866 */
867 skb->ip_summed = csummode;
868 skb->csum = 0;
869 skb_reserve(skb, hh_len);
870
871 /*
872 * Find where to start putting bytes.
873 */
874 data = skb_put(skb, fraglen);
875 skb->nh.raw = data + exthdrlen;
876 data += fragheaderlen;
877 skb->h.raw = data + exthdrlen;
878
879 if (fraggap) {
880 skb->csum = skb_copy_and_csum_bits(
881 skb_prev, maxfraglen,
882 data + transhdrlen, fraggap, 0);
883 skb_prev->csum = csum_sub(skb_prev->csum,
884 skb->csum);
885 data += fraggap;
886 skb_trim(skb_prev, maxfraglen);
887 }
888
889 copy = datalen - transhdrlen - fraggap;
890 if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
891 err = -EFAULT;
892 kfree_skb(skb);
893 goto error;
894 }
895
896 offset += copy;
897 length -= datalen - fraggap;
898 transhdrlen = 0;
899 exthdrlen = 0;
900 csummode = CHECKSUM_NONE;
901
902 /*
903 * Put the packet on the pending queue.
904 */
905 __skb_queue_tail(&sk->sk_write_queue, skb);
906 continue;
907 }
908
909 if (copy > length)
910 copy = length;
911
912 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
913 unsigned int off;
914
915 off = skb->len;
916 if (getfrag(from, skb_put(skb, copy),
917 offset, copy, off, skb) < 0) {
918 __skb_trim(skb, off);
919 err = -EFAULT;
920 goto error;
921 }
922 } else {
923 int i = skb_shinfo(skb)->nr_frags;
924 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
925 struct page *page = sk->sk_sndmsg_page;
926 int off = sk->sk_sndmsg_off;
927 unsigned int left;
928
929 if (page && (left = PAGE_SIZE - off) > 0) {
930 if (copy >= left)
931 copy = left;
932 if (page != frag->page) {
933 if (i == MAX_SKB_FRAGS) {
934 err = -EMSGSIZE;
935 goto error;
936 }
937 get_page(page);
938 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
939 frag = &skb_shinfo(skb)->frags[i];
940 }
941 } else if (i < MAX_SKB_FRAGS) {
942 if (copy > PAGE_SIZE)
943 copy = PAGE_SIZE;
944 page = alloc_pages(sk->sk_allocation, 0);
945 if (page == NULL) {
946 err = -ENOMEM;
947 goto error;
948 }
949 sk->sk_sndmsg_page = page;
950 sk->sk_sndmsg_off = 0;
951
952 skb_fill_page_desc(skb, i, page, 0, 0);
953 frag = &skb_shinfo(skb)->frags[i];
954 skb->truesize += PAGE_SIZE;
955 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
956 } else {
957 err = -EMSGSIZE;
958 goto error;
959 }
960 if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
961 err = -EFAULT;
962 goto error;
963 }
964 sk->sk_sndmsg_off += copy;
965 frag->size += copy;
966 skb->len += copy;
967 skb->data_len += copy;
968 }
969 offset += copy;
970 length -= copy;
971 }
972
973 return 0;
974
975 error:
976 inet->cork.length -= length;
977 IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
978 return err;
979 }
980
981 ssize_t ip_append_page(struct sock *sk, struct page *page,
982 int offset, size_t size, int flags)
983 {
984 struct inet_sock *inet = inet_sk(sk);
985 struct sk_buff *skb;
986 struct rtable *rt;
987 struct ip_options *opt = NULL;
988 int hh_len;
989 int mtu;
990 int len;
991 int err;
992 unsigned int maxfraglen, fragheaderlen, fraggap;
993
994 if (inet->hdrincl)
995 return -EPERM;
996
997 if (flags&MSG_PROBE)
998 return 0;
999
1000 if (skb_queue_empty(&sk->sk_write_queue))
1001 return -EINVAL;
1002
1003 rt = inet->cork.rt;
1004 if (inet->cork.flags & IPCORK_OPT)
1005 opt = inet->cork.opt;
1006
1007 if (!(rt->u.dst.dev->features&NETIF_F_SG))
1008 return -EOPNOTSUPP;
1009
1010 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1011 mtu = inet->cork.fragsize;
1012
1013 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1014 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1015
1016 if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1017 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
1018 return -EMSGSIZE;
1019 }
1020
1021 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1022 return -EINVAL;
1023
1024 inet->cork.length += size;
1025
1026 while (size > 0) {
1027 int i;
1028
1029 /* Check if the remaining data fits into current packet. */
1030 len = mtu - skb->len;
1031 if (len < size)
1032 len = maxfraglen - skb->len;
1033 if (len <= 0) {
1034 struct sk_buff *skb_prev;
1035 char *data;
1036 struct iphdr *iph;
1037 int alloclen;
1038
1039 skb_prev = skb;
1040 if (skb_prev)
1041 fraggap = skb_prev->len - maxfraglen;
1042 else
1043 fraggap = 0;
1044
1045 alloclen = fragheaderlen + hh_len + fraggap + 15;
1046 skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1047 if (unlikely(!skb)) {
1048 err = -ENOBUFS;
1049 goto error;
1050 }
1051
1052 /*
1053 * Fill in the control structures
1054 */
1055 skb->ip_summed = CHECKSUM_NONE;
1056 skb->csum = 0;
1057 skb_reserve(skb, hh_len);
1058
1059 /*
1060 * Find where to start putting bytes.
1061 */
1062 data = skb_put(skb, fragheaderlen + fraggap);
1063 skb->nh.iph = iph = (struct iphdr *)data;
1064 data += fragheaderlen;
1065 skb->h.raw = data;
1066
1067 if (fraggap) {
1068 skb->csum = skb_copy_and_csum_bits(
1069 skb_prev, maxfraglen,
1070 data, fraggap, 0);
1071 skb_prev->csum = csum_sub(skb_prev->csum,
1072 skb->csum);
1073 skb_trim(skb_prev, maxfraglen);
1074 }
1075
1076 /*
1077 * Put the packet on the pending queue.
1078 */
1079 __skb_queue_tail(&sk->sk_write_queue, skb);
1080 continue;
1081 }
1082
1083 i = skb_shinfo(skb)->nr_frags;
1084 if (len > size)
1085 len = size;
1086 if (skb_can_coalesce(skb, i, page, offset)) {
1087 skb_shinfo(skb)->frags[i-1].size += len;
1088 } else if (i < MAX_SKB_FRAGS) {
1089 get_page(page);
1090 skb_fill_page_desc(skb, i, page, offset, len);
1091 } else {
1092 err = -EMSGSIZE;
1093 goto error;
1094 }
1095
1096 if (skb->ip_summed == CHECKSUM_NONE) {
1097 unsigned int csum;
1098 csum = csum_page(page, offset, len);
1099 skb->csum = csum_block_add(skb->csum, csum, skb->len);
1100 }
1101
1102 skb->len += len;
1103 skb->data_len += len;
1104 offset += len;
1105 size -= len;
1106 }
1107 return 0;
1108
1109 error:
1110 inet->cork.length -= size;
1111 IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1112 return err;
1113 }
1114
1115 /*
1116 * Combine all pending IP fragments on the socket into one IP datagram
1117 * and push them out.
1118 */
1119 int ip_push_pending_frames(struct sock *sk)
1120 {
1121 struct sk_buff *skb, *tmp_skb;
1122 struct sk_buff **tail_skb;
1123 struct inet_sock *inet = inet_sk(sk);
1124 struct ip_options *opt = NULL;
1125 struct rtable *rt = inet->cork.rt;
1126 struct iphdr *iph;
1127 int df = 0;
1128 __u8 ttl;
1129 int err = 0;
1130
1131 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1132 goto out;
1133 tail_skb = &(skb_shinfo(skb)->frag_list);
1134
1135 /* move skb->data to ip header from ext header */
1136 if (skb->data < skb->nh.raw)
1137 __skb_pull(skb, skb->nh.raw - skb->data);
1138 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1139 __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1140 *tail_skb = tmp_skb;
1141 tail_skb = &(tmp_skb->next);
1142 skb->len += tmp_skb->len;
1143 skb->data_len += tmp_skb->len;
1144 skb->truesize += tmp_skb->truesize;
1145 __sock_put(tmp_skb->sk);
1146 tmp_skb->destructor = NULL;
1147 tmp_skb->sk = NULL;
1148 }
1149
1150 /* Unless the user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1151 * the frame generated here to be fragmented. No matter how transforms
1152 * change the size of the packet, it will come out.
1153 */
1154 if (inet->pmtudisc != IP_PMTUDISC_DO)
1155 skb->local_df = 1;
1156
1157 /* DF bit is set when we want to see DF on outgoing frames.
1158 * If local_df is set too, we still allow this frame to be fragmented
1159 * locally. */
1160 if (inet->pmtudisc == IP_PMTUDISC_DO ||
1161 (skb->len <= dst_mtu(&rt->u.dst) &&
1162 ip_dont_fragment(sk, &rt->u.dst)))
1163 df = htons(IP_DF);
1164
1165 if (inet->cork.flags & IPCORK_OPT)
1166 opt = inet->cork.opt;
1167
1168 if (rt->rt_type == RTN_MULTICAST)
1169 ttl = inet->mc_ttl;
1170 else
1171 ttl = ip_select_ttl(inet, &rt->u.dst);
1172
1173 iph = (struct iphdr *)skb->data;
1174 iph->version = 4;
1175 iph->ihl = 5;
1176 if (opt) {
1177 iph->ihl += opt->optlen>>2;
1178 ip_options_build(skb, opt, inet->cork.addr, rt, 0);
1179 }
1180 iph->tos = inet->tos;
1181 iph->tot_len = htons(skb->len);
1182 iph->frag_off = df;
1183 if (!df) {
1184 __ip_select_ident(iph, &rt->u.dst, 0);
1185 } else {
1186 iph->id = htons(inet->id++);
1187 }
1188 iph->ttl = ttl;
1189 iph->protocol = sk->sk_protocol;
1190 iph->saddr = rt->rt_src;
1191 iph->daddr = rt->rt_dst;
1192 ip_send_check(iph);
1193
1194 skb->priority = sk->sk_priority;
1195 skb->dst = dst_clone(&rt->u.dst);
1196
1197 /* Netfilter gets the whole, not yet fragmented skb. */
1198 err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
1199 skb->dst->dev, dst_output);
1200 if (err) {
1201 if (err > 0)
1202 err = inet->recverr ? net_xmit_errno(err) : 0;
1203 if (err)
1204 goto error;
1205 }
1206
1207 out:
1208 inet->cork.flags &= ~IPCORK_OPT;
1209 if (inet->cork.opt) {
1210 kfree(inet->cork.opt);
1211 inet->cork.opt = NULL;
1212 }
1213 if (inet->cork.rt) {
1214 ip_rt_put(inet->cork.rt);
1215 inet->cork.rt = NULL;
1216 }
1217 return err;
1218
1219 error:
1220 IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1221 goto out;
1222 }
1223
1224 /*
1225 * Throw away all pending data on the socket.
1226 */
1227 void ip_flush_pending_frames(struct sock *sk)
1228 {
1229 struct inet_sock *inet = inet_sk(sk);
1230 struct sk_buff *skb;
1231
1232 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
1233 kfree_skb(skb);
1234
1235 inet->cork.flags &= ~IPCORK_OPT;
1236 if (inet->cork.opt) {
1237 kfree(inet->cork.opt);
1238 inet->cork.opt = NULL;
1239 }
1240 if (inet->cork.rt) {
1241 ip_rt_put(inet->cork.rt);
1242 inet->cork.rt = NULL;
1243 }
1244 }
1245
1246
1247 /*
1248 * Fetch data from kernel space and fill in checksum if needed.
1249 */
1250 static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1251 int len, int odd, struct sk_buff *skb)
1252 {
1253 unsigned int csum;
1254
1255 csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1256 skb->csum = csum_block_add(skb->csum, csum, odd);
1257 return 0;
1258 }
1259
1260 /*
1261 * Generic function to send a packet as reply to another packet.
1262 * Used to send TCP resets so far. ICMP should use this function too.
1263 *
1264 * Should run single threaded per socket because it uses the sock
1265 * structure to pass arguments.
1266 *
1267 * LATER: switch from ip_build_xmit to ip_append_*
1268 */
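/*
 * Sketch of a typical caller (added for illustration; loosely modeled on
 * the TCP reset path, with hypothetical local names). The caller fills a
 * struct ip_reply_arg - only the iov, csum and csumoffset members used by
 * this function are shown - and lets ip_send_reply() do the routing and
 * transmission:
 *
 *	struct ip_reply_arg arg;
 *
 *	memset(&arg, 0, sizeof(arg));
 *	arg.iov[0].iov_base = &reply_hdr;
 *	arg.iov[0].iov_len  = sizeof(reply_hdr);
 *	arg.csum = csum_partial(&reply_hdr, sizeof(reply_hdr), 0);
 *	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 *	ip_send_reply(ctl_sk, skb, &arg, arg.iov[0].iov_len);
 */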
1269 void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1270 unsigned int len)
1271 {
1272 struct inet_sock *inet = inet_sk(sk);
1273 struct {
1274 struct ip_options opt;
1275 char data[40];
1276 } replyopts;
1277 struct ipcm_cookie ipc;
1278 u32 daddr;
1279 struct rtable *rt = (struct rtable*)skb->dst;
1280
1281 if (ip_options_echo(&replyopts.opt, skb))
1282 return;
1283
1284 daddr = ipc.addr = rt->rt_src;
1285 ipc.opt = NULL;
1286
1287 if (replyopts.opt.optlen) {
1288 ipc.opt = &replyopts.opt;
1289
1290 if (ipc.opt->srr)
1291 daddr = replyopts.opt.faddr;
1292 }
1293
1294 {
1295 struct flowi fl = { .nl_u = { .ip4_u =
1296 { .daddr = daddr,
1297 .saddr = rt->rt_spec_dst,
1298 .tos = RT_TOS(skb->nh.iph->tos) } },
1299 /* Not quite clean, but right. */
1300 .uli_u = { .ports =
1301 { .sport = skb->h.th->dest,
1302 .dport = skb->h.th->source } },
1303 .proto = sk->sk_protocol };
1304 if (ip_route_output_key(&rt, &fl))
1305 return;
1306 }
1307
1308 /* And let IP do all the hard work.
1309
1310 This chunk is not reentrant, hence the spinlock.
1311 Note that it relies on the fact that this function is called
1312 with BHs locally disabled and that sk cannot already be spinlocked.
1313 */
1314 bh_lock_sock(sk);
1315 inet->tos = skb->nh.iph->tos;
1316 sk->sk_priority = skb->priority;
1317 sk->sk_protocol = skb->nh.iph->protocol;
1318 ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1319 &ipc, rt, MSG_DONTWAIT);
1320 if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1321 if (arg->csumoffset >= 0)
1322 *((u16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum));
1323 skb->ip_summed = CHECKSUM_NONE;
1324 ip_push_pending_frames(sk);
1325 }
1326
1327 bh_unlock_sock(sk);
1328
1329 ip_rt_put(rt);
1330 }
1331
1332 /*
1333 * IP protocol layer initialiser
1334 */
1335
1336 static struct packet_type ip_packet_type = {
1337 .type = __constant_htons(ETH_P_IP),
1338 .func = ip_rcv,
1339 };
1340
1341 /*
1342 * IP registers the packet type and then calls the subprotocol initialisers
1343 */
1344
1345 void __init ip_init(void)
1346 {
1347 dev_add_pack(&ip_packet_type);
1348
1349 ip_rt_init();
1350 inet_initpeers();
1351
1352 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1353 igmp_mc_proc_init();
1354 #endif
1355 }
1356
1357 EXPORT_SYMBOL(ip_finish_output);
1358 EXPORT_SYMBOL(ip_fragment);
1359 EXPORT_SYMBOL(ip_generic_getfrag);
1360 EXPORT_SYMBOL(ip_queue_xmit);
1361 EXPORT_SYMBOL(ip_send_check);
1362
1363 #ifdef CONFIG_SYSCTL
1364 EXPORT_SYMBOL(sysctl_ip_default_ttl);
1365 #endif