net/ipv4/route.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
39 *
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
58 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <linux/prefetch.h>
95 #include <net/dst.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
98 #include <net/ip.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
103 #include <net/arp.h>
104 #include <net/tcp.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
112 #include <net/secure_seq.h>
113
114 #define RT_FL_TOS(oldflp4) \
115 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
116
117 #define IP_MAX_MTU 0xFFF0
118
119 #define RT_GC_TIMEOUT (300*HZ)
120
121 static int ip_rt_max_size;
122 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
123 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
124 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
125 static int ip_rt_redirect_number __read_mostly = 9;
126 static int ip_rt_redirect_load __read_mostly = HZ / 50;
127 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
128 static int ip_rt_error_cost __read_mostly = HZ;
129 static int ip_rt_error_burst __read_mostly = 5 * HZ;
130 static int ip_rt_gc_elasticity __read_mostly = 8;
131 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
132 static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
133 static int ip_rt_min_advmss __read_mostly = 256;
134 static int rt_chain_length_max __read_mostly = 20;
135
136 static struct delayed_work expires_work;
137 static unsigned long expires_ljiffies;
138
139 /*
140 * Interface to generic destination cache.
141 */
142
143 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
144 static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
145 static unsigned int ipv4_mtu(const struct dst_entry *dst);
146 static void ipv4_dst_destroy(struct dst_entry *dst);
147 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
148 static void ipv4_link_failure(struct sk_buff *skb);
149 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
150 static int rt_garbage_collect(struct dst_ops *ops);
151
152 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
153 int how)
154 {
155 }
156
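/*
 * Copy-on-write of dst metrics: when a writable metric slot is needed,
 * bind the route to its inet_peer and switch dst->_metrics over to the
 * peer's private array with cmpxchg(), seeding it from the shared
 * read-only values on first use.  Once the switch succeeds the cached
 * fib_info reference is no longer needed and is dropped.
 */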
157 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
158 {
159 struct rtable *rt = (struct rtable *) dst;
160 struct inet_peer *peer;
161 u32 *p = NULL;
162
163 if (!rt->peer)
164 rt_bind_peer(rt, rt->rt_dst, 1);
165
166 peer = rt->peer;
167 if (peer) {
168 u32 *old_p = __DST_METRICS_PTR(old);
169 unsigned long prev, new;
170
171 p = peer->metrics;
172 if (inet_metrics_new(peer))
173 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
174
175 new = (unsigned long) p;
176 prev = cmpxchg(&dst->_metrics, old, new);
177
178 if (prev != old) {
179 p = __DST_METRICS_PTR(prev);
180 if (prev & DST_METRICS_READ_ONLY)
181 p = NULL;
182 } else {
183 if (rt->fi) {
184 fib_info_put(rt->fi);
185 rt->fi = NULL;
186 }
187 }
188 }
189 return p;
190 }
191
192 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
193
194 static struct dst_ops ipv4_dst_ops = {
195 .family = AF_INET,
196 .protocol = cpu_to_be16(ETH_P_IP),
197 .gc = rt_garbage_collect,
198 .check = ipv4_dst_check,
199 .default_advmss = ipv4_default_advmss,
200 .mtu = ipv4_mtu,
201 .cow_metrics = ipv4_cow_metrics,
202 .destroy = ipv4_dst_destroy,
203 .ifdown = ipv4_dst_ifdown,
204 .negative_advice = ipv4_negative_advice,
205 .link_failure = ipv4_link_failure,
206 .update_pmtu = ip_rt_update_pmtu,
207 .local_out = __ip_local_out,
208 .neigh_lookup = ipv4_neigh_lookup,
209 };
210
211 #define ECN_OR_COST(class) TC_PRIO_##class
212
213 const __u8 ip_tos2prio[16] = {
214 TC_PRIO_BESTEFFORT,
215 ECN_OR_COST(BESTEFFORT),
216 TC_PRIO_BESTEFFORT,
217 ECN_OR_COST(BESTEFFORT),
218 TC_PRIO_BULK,
219 ECN_OR_COST(BULK),
220 TC_PRIO_BULK,
221 ECN_OR_COST(BULK),
222 TC_PRIO_INTERACTIVE,
223 ECN_OR_COST(INTERACTIVE),
224 TC_PRIO_INTERACTIVE,
225 ECN_OR_COST(INTERACTIVE),
226 TC_PRIO_INTERACTIVE_BULK,
227 ECN_OR_COST(INTERACTIVE_BULK),
228 TC_PRIO_INTERACTIVE_BULK,
229 ECN_OR_COST(INTERACTIVE_BULK)
230 };
231
232
233 /*
234 * Route cache.
235 */
236
237 /* The locking scheme is rather straightforward:
238 *
239 * 1) Read-Copy Update protects the buckets of the central route hash.
240 * 2) Only writers remove entries, and they hold the lock
241 * as they look at rtable reference counts.
242 * 3) Only readers acquire references to rtable entries,
243 * they do so with atomic increments and with the
244 * lock held.
245 */
246
247 struct rt_hash_bucket {
248 struct rtable __rcu *chain;
249 };
250
251 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
252 defined(CONFIG_PROVE_LOCKING)
253 /*
254 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
255 * The size of this table is a power of two and depends on the number of CPUs.
256 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
257 */
258 #ifdef CONFIG_LOCKDEP
259 # define RT_HASH_LOCK_SZ 256
260 #else
261 # if NR_CPUS >= 32
262 # define RT_HASH_LOCK_SZ 4096
263 # elif NR_CPUS >= 16
264 # define RT_HASH_LOCK_SZ 2048
265 # elif NR_CPUS >= 8
266 # define RT_HASH_LOCK_SZ 1024
267 # elif NR_CPUS >= 4
268 # define RT_HASH_LOCK_SZ 512
269 # else
270 # define RT_HASH_LOCK_SZ 256
271 # endif
272 #endif
273
274 static spinlock_t *rt_hash_locks;
275 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
276
277 static __init void rt_hash_lock_init(void)
278 {
279 int i;
280
281 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
282 GFP_KERNEL);
283 if (!rt_hash_locks)
284 panic("IP: failed to allocate rt_hash_locks\n");
285
286 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
287 spin_lock_init(&rt_hash_locks[i]);
288 }
289 #else
290 # define rt_hash_lock_addr(slot) NULL
291
292 static inline void rt_hash_lock_init(void)
293 {
294 }
295 #endif
296
297 static struct rt_hash_bucket *rt_hash_table __read_mostly;
298 static unsigned rt_hash_mask __read_mostly;
299 static unsigned int rt_hash_log __read_mostly;
300
301 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
302 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
303
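/*
 * The cache hash mixes destination, source and interface index with the
 * per-namespace generation id.  rt_genid is also stored in every entry,
 * so rt_cache_invalidate() can flush the cache cheaply by bumping it:
 * stale entries then fail rt_is_expired() and are reaped lazily.
 */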
304 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
305 int genid)
306 {
307 return jhash_3words((__force u32)daddr, (__force u32)saddr,
308 idx, genid)
309 & rt_hash_mask;
310 }
311
312 static inline int rt_genid(struct net *net)
313 {
314 return atomic_read(&net->ipv4.rt_genid);
315 }
316
317 #ifdef CONFIG_PROC_FS
318 struct rt_cache_iter_state {
319 struct seq_net_private p;
320 int bucket;
321 int genid;
322 };
323
324 static struct rtable *rt_cache_get_first(struct seq_file *seq)
325 {
326 struct rt_cache_iter_state *st = seq->private;
327 struct rtable *r = NULL;
328
329 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
330 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
331 continue;
332 rcu_read_lock_bh();
333 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
334 while (r) {
335 if (dev_net(r->dst.dev) == seq_file_net(seq) &&
336 r->rt_genid == st->genid)
337 return r;
338 r = rcu_dereference_bh(r->dst.rt_next);
339 }
340 rcu_read_unlock_bh();
341 }
342 return r;
343 }
344
345 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
346 struct rtable *r)
347 {
348 struct rt_cache_iter_state *st = seq->private;
349
350 r = rcu_dereference_bh(r->dst.rt_next);
351 while (!r) {
352 rcu_read_unlock_bh();
353 do {
354 if (--st->bucket < 0)
355 return NULL;
356 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
357 rcu_read_lock_bh();
358 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
359 }
360 return r;
361 }
362
363 static struct rtable *rt_cache_get_next(struct seq_file *seq,
364 struct rtable *r)
365 {
366 struct rt_cache_iter_state *st = seq->private;
367 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
368 if (dev_net(r->dst.dev) != seq_file_net(seq))
369 continue;
370 if (r->rt_genid == st->genid)
371 break;
372 }
373 return r;
374 }
375
376 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
377 {
378 struct rtable *r = rt_cache_get_first(seq);
379
380 if (r)
381 while (pos && (r = rt_cache_get_next(seq, r)))
382 --pos;
383 return pos ? NULL : r;
384 }
385
386 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
387 {
388 struct rt_cache_iter_state *st = seq->private;
389 if (*pos)
390 return rt_cache_get_idx(seq, *pos - 1);
391 st->genid = rt_genid(seq_file_net(seq));
392 return SEQ_START_TOKEN;
393 }
394
395 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
396 {
397 struct rtable *r;
398
399 if (v == SEQ_START_TOKEN)
400 r = rt_cache_get_first(seq);
401 else
402 r = rt_cache_get_next(seq, v);
403 ++*pos;
404 return r;
405 }
406
407 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
408 {
409 if (v && v != SEQ_START_TOKEN)
410 rcu_read_unlock_bh();
411 }
412
413 static int rt_cache_seq_show(struct seq_file *seq, void *v)
414 {
415 if (v == SEQ_START_TOKEN)
416 seq_printf(seq, "%-127s\n",
417 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
418 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
419 "HHUptod\tSpecDst");
420 else {
421 struct rtable *r = v;
422 struct neighbour *n;
423 int len, HHUptod;
424
425 rcu_read_lock();
426 n = dst_get_neighbour_noref(&r->dst);
427 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
428 rcu_read_unlock();
429
430 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
431 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
432 r->dst.dev ? r->dst.dev->name : "*",
433 (__force u32)r->rt_dst,
434 (__force u32)r->rt_gateway,
435 r->rt_flags, atomic_read(&r->dst.__refcnt),
436 r->dst.__use, 0, (__force u32)r->rt_src,
437 dst_metric_advmss(&r->dst) + 40,
438 dst_metric(&r->dst, RTAX_WINDOW),
439 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
440 dst_metric(&r->dst, RTAX_RTTVAR)),
441 r->rt_key_tos,
442 -1,
443 HHUptod,
444 r->rt_spec_dst, &len);
445
446 seq_printf(seq, "%*s\n", 127 - len, "");
447 }
448 return 0;
449 }
450
451 static const struct seq_operations rt_cache_seq_ops = {
452 .start = rt_cache_seq_start,
453 .next = rt_cache_seq_next,
454 .stop = rt_cache_seq_stop,
455 .show = rt_cache_seq_show,
456 };
457
458 static int rt_cache_seq_open(struct inode *inode, struct file *file)
459 {
460 return seq_open_net(inode, file, &rt_cache_seq_ops,
461 sizeof(struct rt_cache_iter_state));
462 }
463
464 static const struct file_operations rt_cache_seq_fops = {
465 .owner = THIS_MODULE,
466 .open = rt_cache_seq_open,
467 .read = seq_read,
468 .llseek = seq_lseek,
469 .release = seq_release_net,
470 };
471
472
473 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
474 {
475 int cpu;
476
477 if (*pos == 0)
478 return SEQ_START_TOKEN;
479
480 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
481 if (!cpu_possible(cpu))
482 continue;
483 *pos = cpu+1;
484 return &per_cpu(rt_cache_stat, cpu);
485 }
486 return NULL;
487 }
488
489 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
490 {
491 int cpu;
492
493 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
494 if (!cpu_possible(cpu))
495 continue;
496 *pos = cpu+1;
497 return &per_cpu(rt_cache_stat, cpu);
498 }
499 return NULL;
500
501 }
502
503 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
504 {
505
506 }
507
508 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
509 {
510 struct rt_cache_stat *st = v;
511
512 if (v == SEQ_START_TOKEN) {
513 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
514 return 0;
515 }
516
517 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
518 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
519 dst_entries_get_slow(&ipv4_dst_ops),
520 st->in_hit,
521 st->in_slow_tot,
522 st->in_slow_mc,
523 st->in_no_route,
524 st->in_brd,
525 st->in_martian_dst,
526 st->in_martian_src,
527
528 st->out_hit,
529 st->out_slow_tot,
530 st->out_slow_mc,
531
532 st->gc_total,
533 st->gc_ignored,
534 st->gc_goal_miss,
535 st->gc_dst_overflow,
536 st->in_hlist_search,
537 st->out_hlist_search
538 );
539 return 0;
540 }
541
542 static const struct seq_operations rt_cpu_seq_ops = {
543 .start = rt_cpu_seq_start,
544 .next = rt_cpu_seq_next,
545 .stop = rt_cpu_seq_stop,
546 .show = rt_cpu_seq_show,
547 };
548
549
550 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
551 {
552 return seq_open(file, &rt_cpu_seq_ops);
553 }
554
555 static const struct file_operations rt_cpu_seq_fops = {
556 .owner = THIS_MODULE,
557 .open = rt_cpu_seq_open,
558 .read = seq_read,
559 .llseek = seq_lseek,
560 .release = seq_release,
561 };
562
563 #ifdef CONFIG_IP_ROUTE_CLASSID
564 static int rt_acct_proc_show(struct seq_file *m, void *v)
565 {
566 struct ip_rt_acct *dst, *src;
567 unsigned int i, j;
568
569 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
570 if (!dst)
571 return -ENOMEM;
572
573 for_each_possible_cpu(i) {
574 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
575 for (j = 0; j < 256; j++) {
576 dst[j].o_bytes += src[j].o_bytes;
577 dst[j].o_packets += src[j].o_packets;
578 dst[j].i_bytes += src[j].i_bytes;
579 dst[j].i_packets += src[j].i_packets;
580 }
581 }
582
583 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
584 kfree(dst);
585 return 0;
586 }
587
588 static int rt_acct_proc_open(struct inode *inode, struct file *file)
589 {
590 return single_open(file, rt_acct_proc_show, NULL);
591 }
592
593 static const struct file_operations rt_acct_proc_fops = {
594 .owner = THIS_MODULE,
595 .open = rt_acct_proc_open,
596 .read = seq_read,
597 .llseek = seq_lseek,
598 .release = single_release,
599 };
600 #endif
601
602 static int __net_init ip_rt_do_proc_init(struct net *net)
603 {
604 struct proc_dir_entry *pde;
605
606 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
607 &rt_cache_seq_fops);
608 if (!pde)
609 goto err1;
610
611 pde = proc_create("rt_cache", S_IRUGO,
612 net->proc_net_stat, &rt_cpu_seq_fops);
613 if (!pde)
614 goto err2;
615
616 #ifdef CONFIG_IP_ROUTE_CLASSID
617 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
618 if (!pde)
619 goto err3;
620 #endif
621 return 0;
622
623 #ifdef CONFIG_IP_ROUTE_CLASSID
624 err3:
625 remove_proc_entry("rt_cache", net->proc_net_stat);
626 #endif
627 err2:
628 remove_proc_entry("rt_cache", net->proc_net);
629 err1:
630 return -ENOMEM;
631 }
632
633 static void __net_exit ip_rt_do_proc_exit(struct net *net)
634 {
635 remove_proc_entry("rt_cache", net->proc_net_stat);
636 remove_proc_entry("rt_cache", net->proc_net);
637 #ifdef CONFIG_IP_ROUTE_CLASSID
638 remove_proc_entry("rt_acct", net->proc_net);
639 #endif
640 }
641
642 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
643 .init = ip_rt_do_proc_init,
644 .exit = ip_rt_do_proc_exit,
645 };
646
647 static int __init ip_rt_proc_init(void)
648 {
649 return register_pernet_subsys(&ip_rt_proc_ops);
650 }
651
652 #else
653 static inline int ip_rt_proc_init(void)
654 {
655 return 0;
656 }
657 #endif /* CONFIG_PROC_FS */
658
659 static inline void rt_free(struct rtable *rt)
660 {
661 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
662 }
663
664 static inline void rt_drop(struct rtable *rt)
665 {
666 ip_rt_put(rt);
667 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
668 }
669
670 static inline int rt_fast_clean(struct rtable *rth)
671 {
672 /* Kill broadcast/multicast entries very aggressively, if they
673 collide in the hash table with more useful entries */
674 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
675 rt_is_input_route(rth) && rth->dst.rt_next;
676 }
677
678 static inline int rt_valuable(struct rtable *rth)
679 {
680 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
681 (rth->peer && rth->peer->pmtu_expires);
682 }
683
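/*
 * Decide whether an entry may be reclaimed: entries still referenced never
 * expire; ordinary entries get a grace period of tmo1 (not granted to
 * broadcast/multicast input entries that collide in their chain), and
 * "valuable" entries (redirected, notify, or with a learned PMTU) get the
 * longer tmo2.
 */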
684 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
685 {
686 unsigned long age;
687 int ret = 0;
688
689 if (atomic_read(&rth->dst.__refcnt))
690 goto out;
691
692 age = jiffies - rth->dst.lastuse;
693 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
694 (age <= tmo2 && rt_valuable(rth)))
695 goto out;
696 ret = 1;
697 out: return ret;
698 }
699
700 /* Bits of score are:
701 * 31: very valuable
702 * 30: not quite useless
703 * 29..0: usage counter
704 */
705 static inline u32 rt_score(struct rtable *rt)
706 {
707 u32 score = jiffies - rt->dst.lastuse;
708
709 score = ~score & ~(3<<30);
710
711 if (rt_valuable(rt))
712 score |= (1<<31);
713
714 if (rt_is_output_route(rt) ||
715 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
716 score |= (1<<30);
717
718 return score;
719 }
720
721 static inline bool rt_caching(const struct net *net)
722 {
723 return net->ipv4.current_rt_cache_rebuild_count <=
724 net->ipv4.sysctl_rt_cache_rebuild_count;
725 }
726
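/*
 * Branch-free key comparisons: XOR each pair of fields and OR the results,
 * so the expression is zero iff every field matches.  compare_hash_inputs()
 * checks only the dst/src keys and input interface (used to spot aliases
 * sharing the same hash inputs); compare_keys() additionally compares
 * mark, tos and oif for an exact cache hit.
 */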
727 static inline bool compare_hash_inputs(const struct rtable *rt1,
728 const struct rtable *rt2)
729 {
730 return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
731 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
732 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
733 }
734
735 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
736 {
737 return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
738 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
739 (rt1->rt_mark ^ rt2->rt_mark) |
740 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
741 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
742 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
743 }
744
745 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
746 {
747 return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
748 }
749
750 static inline int rt_is_expired(struct rtable *rth)
751 {
752 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
753 }
754
755 /*
756 * Perform a full scan of the hash table and free all entries.
757 * Can be called by a softirq or a process.
758 * In the latter case, we want to reschedule if necessary.
759 */
760 static void rt_do_flush(struct net *net, int process_context)
761 {
762 unsigned int i;
763 struct rtable *rth, *next;
764
765 for (i = 0; i <= rt_hash_mask; i++) {
766 struct rtable __rcu **pprev;
767 struct rtable *list;
768
769 if (process_context && need_resched())
770 cond_resched();
771 rth = rcu_access_pointer(rt_hash_table[i].chain);
772 if (!rth)
773 continue;
774
775 spin_lock_bh(rt_hash_lock_addr(i));
776
777 list = NULL;
778 pprev = &rt_hash_table[i].chain;
779 rth = rcu_dereference_protected(*pprev,
780 lockdep_is_held(rt_hash_lock_addr(i)));
781
782 while (rth) {
783 next = rcu_dereference_protected(rth->dst.rt_next,
784 lockdep_is_held(rt_hash_lock_addr(i)));
785
786 if (!net ||
787 net_eq(dev_net(rth->dst.dev), net)) {
788 rcu_assign_pointer(*pprev, next);
789 rcu_assign_pointer(rth->dst.rt_next, list);
790 list = rth;
791 } else {
792 pprev = &rth->dst.rt_next;
793 }
794 rth = next;
795 }
796
797 spin_unlock_bh(rt_hash_lock_addr(i));
798
799 for (; list; list = next) {
800 next = rcu_dereference_protected(list->dst.rt_next, 1);
801 rt_free(list);
802 }
803 }
804 }
805
806 /*
807 * While freeing expired entries, we compute average chain length
808 * and standard deviation, using fixed-point arithmetic.
809 * This is to have an estimation of rt_chain_length_max:
810 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
811 * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
812 */
813
814 #define FRACT_BITS 3
815 #define ONE (1UL << FRACT_BITS)
816
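/*
 * Worked example with the values above: FRACT_BITS = 3 gives ONE = 8, so
 * has_noalias() contributes 8 per counted entry and chain lengths (hence
 * avg and sd in rt_check_expire()) are kept in 1/8-entry fixed-point
 * units; "(avg + 4*sd) >> FRACT_BITS" converts back to whole entries
 * before updating rt_chain_length_max.
 */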
817 /*
818 * Given a hash chain and an item in this hash chain,
819 * find if a previous entry has the same hash_inputs
820 * (but differs on tos, mark or oif)
821 * Returns 0 if an alias is found.
822 * Returns ONE if rth has no alias before itself.
823 */
824 static int has_noalias(const struct rtable *head, const struct rtable *rth)
825 {
826 const struct rtable *aux = head;
827
828 while (aux != rth) {
829 if (compare_hash_inputs(aux, rth))
830 return 0;
831 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
832 }
833 return ONE;
834 }
835
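/*
 * Called periodically from rt_worker_func(): scan a slice of the hash
 * table proportional to the time since the previous pass (the whole table
 * roughly once per ip_rt_gc_timeout), free entries that are expired or
 * reclaimable per rt_may_expire(), and re-derive rt_chain_length_max from
 * the measured average chain length plus four standard deviations.
 */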
836 static void rt_check_expire(void)
837 {
838 static unsigned int rover;
839 unsigned int i = rover, goal;
840 struct rtable *rth;
841 struct rtable __rcu **rthp;
842 unsigned long samples = 0;
843 unsigned long sum = 0, sum2 = 0;
844 unsigned long delta;
845 u64 mult;
846
847 delta = jiffies - expires_ljiffies;
848 expires_ljiffies = jiffies;
849 mult = ((u64)delta) << rt_hash_log;
850 if (ip_rt_gc_timeout > 1)
851 do_div(mult, ip_rt_gc_timeout);
852 goal = (unsigned int)mult;
853 if (goal > rt_hash_mask)
854 goal = rt_hash_mask + 1;
855 for (; goal > 0; goal--) {
856 unsigned long tmo = ip_rt_gc_timeout;
857 unsigned long length;
858
859 i = (i + 1) & rt_hash_mask;
860 rthp = &rt_hash_table[i].chain;
861
862 if (need_resched())
863 cond_resched();
864
865 samples++;
866
867 if (rcu_dereference_raw(*rthp) == NULL)
868 continue;
869 length = 0;
870 spin_lock_bh(rt_hash_lock_addr(i));
871 while ((rth = rcu_dereference_protected(*rthp,
872 lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
873 prefetch(rth->dst.rt_next);
874 if (rt_is_expired(rth)) {
875 *rthp = rth->dst.rt_next;
876 rt_free(rth);
877 continue;
878 }
879 if (rth->dst.expires) {
880 /* Entry is expired even if it is in use */
881 if (time_before_eq(jiffies, rth->dst.expires)) {
882 nofree:
883 tmo >>= 1;
884 rthp = &rth->dst.rt_next;
885 /*
886 * We only count entries on
887 * a chain with equal hash inputs once
888 * so that entries for different QOS
889 * levels, and other non-hash input
890 * attributes don't unfairly skew
891 * the length computation
892 */
893 length += has_noalias(rt_hash_table[i].chain, rth);
894 continue;
895 }
896 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
897 goto nofree;
898
899 /* Cleanup aged off entries. */
900 *rthp = rth->dst.rt_next;
901 rt_free(rth);
902 }
903 spin_unlock_bh(rt_hash_lock_addr(i));
904 sum += length;
905 sum2 += length*length;
906 }
907 if (samples) {
908 unsigned long avg = sum / samples;
909 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
910 rt_chain_length_max = max_t(unsigned long,
911 ip_rt_gc_elasticity,
912 (avg + 4*sd) >> FRACT_BITS);
913 }
914 rover = i;
915 }
916
917 /*
918 * rt_worker_func() is run in process context.
919 * We call rt_check_expire() to scan part of the hash table.
920 */
921 static void rt_worker_func(struct work_struct *work)
922 {
923 rt_check_expire();
924 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
925 }
926
927 /*
928 * Perturbation of rt_genid by a small quantity [1..256]
929 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
930 * many times (2^24) without re-using a recent rt_genid.
931 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
932 */
933 static void rt_cache_invalidate(struct net *net)
934 {
935 unsigned char shuffle;
936
937 get_random_bytes(&shuffle, sizeof(shuffle));
938 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
939 inetpeer_invalidate_tree(AF_INET);
940 }
941
942 /*
943 * delay < 0 : invalidate cache (fast : entries will be deleted later)
944 * delay >= 0 : invalidate & flush cache (can be long)
945 */
946 void rt_cache_flush(struct net *net, int delay)
947 {
948 rt_cache_invalidate(net);
949 if (delay >= 0)
950 rt_do_flush(net, !in_softirq());
951 }
952
953 /* Flush previously invalidated entries from the cache */
954 void rt_cache_flush_batch(struct net *net)
955 {
956 rt_do_flush(net, !in_softirq());
957 }
958
959 static void rt_emergency_hash_rebuild(struct net *net)
960 {
961 if (net_ratelimit())
962 pr_warn("Route hash chain too long!\n");
963 rt_cache_invalidate(net);
964 }
965
966 /*
967 Short description of GC goals.
968
969 We want to build an algorithm which keeps the routing cache
970 at some equilibrium point, where the number of aged-off entries
971 is kept approximately equal to the number of newly generated ones.
972
973 The current expiration strength is the variable "expire".
974 We try to adjust it dynamically, so that when networking
975 is idle, expire is large enough to keep enough warm entries,
976 and when load increases, it shrinks to limit the cache size.
977 */
978
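/*
 * Invoked via dst_ops.gc when the dst entry count crosses gc_thresh.
 * Unless the table is oversized, it runs at most once per
 * ip_rt_gc_min_interval.  It computes a reclaim goal of
 * entries - (ip_rt_gc_elasticity << rt_hash_log), then sweeps buckets with
 * rt_may_expire(), halving the static "expire" strength and retrying until
 * the goal is met.  Returns 1 only on dst cache overflow, 0 otherwise.
 */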
979 static int rt_garbage_collect(struct dst_ops *ops)
980 {
981 static unsigned long expire = RT_GC_TIMEOUT;
982 static unsigned long last_gc;
983 static int rover;
984 static int equilibrium;
985 struct rtable *rth;
986 struct rtable __rcu **rthp;
987 unsigned long now = jiffies;
988 int goal;
989 int entries = dst_entries_get_fast(&ipv4_dst_ops);
990
991 /*
992 * Garbage collection is pretty expensive,
993 * do not make it too frequently.
994 */
995
996 RT_CACHE_STAT_INC(gc_total);
997
998 if (now - last_gc < ip_rt_gc_min_interval &&
999 entries < ip_rt_max_size) {
1000 RT_CACHE_STAT_INC(gc_ignored);
1001 goto out;
1002 }
1003
1004 entries = dst_entries_get_slow(&ipv4_dst_ops);
1005 /* Calculate number of entries, which we want to expire now. */
1006 goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
1007 if (goal <= 0) {
1008 if (equilibrium < ipv4_dst_ops.gc_thresh)
1009 equilibrium = ipv4_dst_ops.gc_thresh;
1010 goal = entries - equilibrium;
1011 if (goal > 0) {
1012 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1013 goal = entries - equilibrium;
1014 }
1015 } else {
1016 /* We are in a dangerous area. Try to reduce the cache really
1017 * aggressively.
1018 */
1019 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1020 equilibrium = entries - goal;
1021 }
1022
1023 if (now - last_gc >= ip_rt_gc_min_interval)
1024 last_gc = now;
1025
1026 if (goal <= 0) {
1027 equilibrium += goal;
1028 goto work_done;
1029 }
1030
1031 do {
1032 int i, k;
1033
1034 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1035 unsigned long tmo = expire;
1036
1037 k = (k + 1) & rt_hash_mask;
1038 rthp = &rt_hash_table[k].chain;
1039 spin_lock_bh(rt_hash_lock_addr(k));
1040 while ((rth = rcu_dereference_protected(*rthp,
1041 lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1042 if (!rt_is_expired(rth) &&
1043 !rt_may_expire(rth, tmo, expire)) {
1044 tmo >>= 1;
1045 rthp = &rth->dst.rt_next;
1046 continue;
1047 }
1048 *rthp = rth->dst.rt_next;
1049 rt_free(rth);
1050 goal--;
1051 }
1052 spin_unlock_bh(rt_hash_lock_addr(k));
1053 if (goal <= 0)
1054 break;
1055 }
1056 rover = k;
1057
1058 if (goal <= 0)
1059 goto work_done;
1060
1061 /* Goal is not achieved. We stop the process if:
1062
1063 - expire has been reduced to zero (otherwise expire is halved and we retry).
1064 - the table is not full.
1065 - we are called from interrupt context.
1066 - the jiffies check is just a fallback/debug loop breaker;
1067 we will not spin here for a long time in any case.
1068 */
1069
1070 RT_CACHE_STAT_INC(gc_goal_miss);
1071
1072 if (expire == 0)
1073 break;
1074
1075 expire >>= 1;
1076
1077 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1078 goto out;
1079 } while (!in_softirq() && time_before_eq(jiffies, now));
1080
1081 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1082 goto out;
1083 if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1084 goto out;
1085 if (net_ratelimit())
1086 pr_warn("dst cache overflow\n");
1087 RT_CACHE_STAT_INC(gc_dst_overflow);
1088 return 1;
1089
1090 work_done:
1091 expire += ip_rt_gc_min_interval;
1092 if (expire > ip_rt_gc_timeout ||
1093 dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1094 dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1095 expire = ip_rt_gc_timeout;
1096 out: return 0;
1097 }
1098
1099 /*
1100 * Returns number of entries in a hash chain that have different hash_inputs
1101 */
1102 static int slow_chain_length(const struct rtable *head)
1103 {
1104 int length = 0;
1105 const struct rtable *rth = head;
1106
1107 while (rth) {
1108 length += has_noalias(head, rth);
1109 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1110 }
1111 return length >> FRACT_BITS;
1112 }
1113
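/*
 * Neighbour (ARP) lookup for a route: keyed by the gateway when one is
 * set, by the destination otherwise, and by INADDR_ANY on loopback and
 * point-to-point devices.  A neighbour entry is created if none exists.
 */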
1114 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1115 {
1116 static const __be32 inaddr_any = 0;
1117 struct net_device *dev = dst->dev;
1118 const __be32 *pkey = daddr;
1119 const struct rtable *rt;
1120 struct neighbour *n;
1121
1122 rt = (const struct rtable *) dst;
1123
1124 if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1125 pkey = &inaddr_any;
1126 else if (rt->rt_gateway)
1127 pkey = (const __be32 *) &rt->rt_gateway;
1128
1129 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
1130 if (n)
1131 return n;
1132 return neigh_create(&arp_tbl, pkey, dev);
1133 }
1134
1135 static int rt_bind_neighbour(struct rtable *rt)
1136 {
1137 struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1138 if (IS_ERR(n))
1139 return PTR_ERR(n);
1140 dst_set_neighbour(&rt->dst, n);
1141
1142 return 0;
1143 }
1144
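/*
 * Insert "rt" into bucket "hash".  If an equivalent entry already exists
 * (compare_keys() and same netns) it is promoted to the chain head and
 * returned instead, and the new route is dropped.  Otherwise the
 * lowest-scoring unreferenced entry is evicted once the chain exceeds
 * ip_rt_gc_elasticity, an over-long chain triggers the emergency hash
 * rebuild, and when caching is disabled the route is returned uncached
 * with DST_NOCACHE set.
 */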
1145 static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1146 struct sk_buff *skb, int ifindex)
1147 {
1148 struct rtable *rth, *cand;
1149 struct rtable __rcu **rthp, **candp;
1150 unsigned long now;
1151 u32 min_score;
1152 int chain_length;
1153 int attempts = !in_softirq();
1154
1155 restart:
1156 chain_length = 0;
1157 min_score = ~(u32)0;
1158 cand = NULL;
1159 candp = NULL;
1160 now = jiffies;
1161
1162 if (!rt_caching(dev_net(rt->dst.dev))) {
1163 /*
1164 * If we're not caching, just tell the caller we
1165 * were successful and don't touch the route. The
1166 * caller holds the sole reference to the cache entry, and
1167 * it will be released when the caller is done with it.
1168 * If we drop it here, the callers have no way to resolve routes
1169 * when we're not caching. Instead, just point *rp at rt, so
1170 * the caller gets a single use out of the route
1171 * Note that we do rt_free on this new route entry, so that
1172 * once its refcount hits zero, we are still able to reap it
1173 * (Thanks Alexey)
1174 * Note: To avoid expensive rcu stuff for this uncached dst,
1175 * we set DST_NOCACHE so that dst_release() can free dst without
1176 * waiting a grace period.
1177 */
1178
1179 rt->dst.flags |= DST_NOCACHE;
1180 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1181 int err = rt_bind_neighbour(rt);
1182 if (err) {
1183 if (net_ratelimit())
1184 pr_warn("Neighbour table failure & not caching routes\n");
1185 ip_rt_put(rt);
1186 return ERR_PTR(err);
1187 }
1188 }
1189
1190 goto skip_hashing;
1191 }
1192
1193 rthp = &rt_hash_table[hash].chain;
1194
1195 spin_lock_bh(rt_hash_lock_addr(hash));
1196 while ((rth = rcu_dereference_protected(*rthp,
1197 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1198 if (rt_is_expired(rth)) {
1199 *rthp = rth->dst.rt_next;
1200 rt_free(rth);
1201 continue;
1202 }
1203 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1204 /* Put it first */
1205 *rthp = rth->dst.rt_next;
1206 /*
1207 * Since lookup is lockfree, the deletion
1208 * must be visible to another weakly ordered CPU before
1209 * the insertion at the start of the hash chain.
1210 */
1211 rcu_assign_pointer(rth->dst.rt_next,
1212 rt_hash_table[hash].chain);
1213 /*
1214 * Since lookup is lockfree, the update writes
1215 * must be ordered for consistency on SMP.
1216 */
1217 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1218
1219 dst_use(&rth->dst, now);
1220 spin_unlock_bh(rt_hash_lock_addr(hash));
1221
1222 rt_drop(rt);
1223 if (skb)
1224 skb_dst_set(skb, &rth->dst);
1225 return rth;
1226 }
1227
1228 if (!atomic_read(&rth->dst.__refcnt)) {
1229 u32 score = rt_score(rth);
1230
1231 if (score <= min_score) {
1232 cand = rth;
1233 candp = rthp;
1234 min_score = score;
1235 }
1236 }
1237
1238 chain_length++;
1239
1240 rthp = &rth->dst.rt_next;
1241 }
1242
1243 if (cand) {
1244 /* ip_rt_gc_elasticity used to be the average chain
1245 * length; when exceeded, gc becomes really aggressive.
1246 *
1247 * The second limit is less certain. At the moment it allows
1248 * only 2 entries per bucket. We will see.
1249 */
1250 if (chain_length > ip_rt_gc_elasticity) {
1251 *candp = cand->dst.rt_next;
1252 rt_free(cand);
1253 }
1254 } else {
1255 if (chain_length > rt_chain_length_max &&
1256 slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1257 struct net *net = dev_net(rt->dst.dev);
1258 int num = ++net->ipv4.current_rt_cache_rebuild_count;
1259 if (!rt_caching(net)) {
1260 pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
1261 rt->dst.dev->name, num);
1262 }
1263 rt_emergency_hash_rebuild(net);
1264 spin_unlock_bh(rt_hash_lock_addr(hash));
1265
1266 hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1267 ifindex, rt_genid(net));
1268 goto restart;
1269 }
1270 }
1271
1272 /* Try to bind the route to ARP only if it is an output
1273 route or on the unicast forwarding path.
1274 */
1275 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1276 int err = rt_bind_neighbour(rt);
1277 if (err) {
1278 spin_unlock_bh(rt_hash_lock_addr(hash));
1279
1280 if (err != -ENOBUFS) {
1281 rt_drop(rt);
1282 return ERR_PTR(err);
1283 }
1284
1285 /* Neighbour tables are full and nothing
1286 can be released. Try to shrink the route cache;
1287 it most likely holds some neighbour records.
1288 */
1289 if (attempts-- > 0) {
1290 int saved_elasticity = ip_rt_gc_elasticity;
1291 int saved_int = ip_rt_gc_min_interval;
1292 ip_rt_gc_elasticity = 1;
1293 ip_rt_gc_min_interval = 0;
1294 rt_garbage_collect(&ipv4_dst_ops);
1295 ip_rt_gc_min_interval = saved_int;
1296 ip_rt_gc_elasticity = saved_elasticity;
1297 goto restart;
1298 }
1299
1300 if (net_ratelimit())
1301 pr_warn("ipv4: Neighbour table overflow\n");
1302 rt_drop(rt);
1303 return ERR_PTR(-ENOBUFS);
1304 }
1305 }
1306
1307 rt->dst.rt_next = rt_hash_table[hash].chain;
1308
1309 /*
1310 * Since lookup is lockfree, we must make sure
1311 * previous writes to rt are committed to memory
1312 * before making rt visible to other CPUS.
1313 */
1314 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1315
1316 spin_unlock_bh(rt_hash_lock_addr(hash));
1317
1318 skip_hashing:
1319 if (skb)
1320 skb_dst_set(skb, &rt->dst);
1321 return rt;
1322 }
1323
1324 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1325
1326 static u32 rt_peer_genid(void)
1327 {
1328 return atomic_read(&__rt_peer_genid);
1329 }
1330
1331 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1332 {
1333 struct inet_peer *peer;
1334
1335 peer = inet_getpeer_v4(daddr, create);
1336
1337 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1338 inet_putpeer(peer);
1339 else
1340 rt->rt_peer_genid = rt_peer_genid();
1341 }
1342
1343 /*
1344 * Peer allocation may fail only in serious out-of-memory conditions. However
1345 * we can still generate some output.
1346 * Random ID selection looks a bit dangerous because we have no chance of
1347 * selecting an ID that is unique within a reasonable period of time.
1348 * But a broken packet identifier may be better than no packet at all.
1349 */
1350 static void ip_select_fb_ident(struct iphdr *iph)
1351 {
1352 static DEFINE_SPINLOCK(ip_fb_id_lock);
1353 static u32 ip_fallback_id;
1354 u32 salt;
1355
1356 spin_lock_bh(&ip_fb_id_lock);
1357 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1358 iph->id = htons(salt & 0xFFFF);
1359 ip_fallback_id = salt;
1360 spin_unlock_bh(&ip_fb_id_lock);
1361 }
1362
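/*
 * IP ID selection: when the route can be bound to an inet_peer and
 * DST_NOPEER is not set, the ID comes from the peer's per-destination
 * counter; otherwise we fall back to ip_select_fb_ident()'s hashed
 * global counter.
 */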
1363 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1364 {
1365 struct rtable *rt = (struct rtable *) dst;
1366
1367 if (rt && !(rt->dst.flags & DST_NOPEER)) {
1368 if (rt->peer == NULL)
1369 rt_bind_peer(rt, rt->rt_dst, 1);
1370
1371 /* If a peer is attached to the destination, it is never detached,
1372 so we need not grab a lock to dereference it.
1373 */
1374 if (rt->peer) {
1375 iph->id = htons(inet_getid(rt->peer, more));
1376 return;
1377 }
1378 } else if (!rt)
1379 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1380 __builtin_return_address(0));
1381
1382 ip_select_fb_ident(iph);
1383 }
1384 EXPORT_SYMBOL(__ip_select_ident);
1385
1386 static void rt_del(unsigned hash, struct rtable *rt)
1387 {
1388 struct rtable __rcu **rthp;
1389 struct rtable *aux;
1390
1391 rthp = &rt_hash_table[hash].chain;
1392 spin_lock_bh(rt_hash_lock_addr(hash));
1393 ip_rt_put(rt);
1394 while ((aux = rcu_dereference_protected(*rthp,
1395 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1396 if (aux == rt || rt_is_expired(aux)) {
1397 *rthp = aux->dst.rt_next;
1398 rt_free(aux);
1399 continue;
1400 }
1401 rthp = &aux->dst.rt_next;
1402 }
1403 spin_unlock_bh(rt_hash_lock_addr(hash));
1404 }
1405
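/*
 * Apply a gateway learned from an ICMP redirect (peer->redirect_learned.a4):
 * rebind the route's neighbour to the new gateway, reverting to the old one
 * if the lookup fails, and mark the route RTCF_REDIRECTED once the
 * neighbour is valid.
 */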
1406 static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1407 {
1408 struct rtable *rt = (struct rtable *) dst;
1409 __be32 orig_gw = rt->rt_gateway;
1410 struct neighbour *n, *old_n;
1411
1412 dst_confirm(&rt->dst);
1413
1414 rt->rt_gateway = peer->redirect_learned.a4;
1415
1416 n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1417 if (IS_ERR(n)) {
1418 rt->rt_gateway = orig_gw;
1419 return;
1420 }
1421 old_n = xchg(&rt->dst._neighbour, n);
1422 if (old_n)
1423 neigh_release(old_n);
1424 if (!(n->nud_state & NUD_VALID)) {
1425 neigh_event_send(n, NULL);
1426 } else {
1427 rt->rt_flags |= RTCF_REDIRECTED;
1428 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1429 }
1430 }
1431
1432 /* called in rcu_read_lock() section */
1433 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1434 __be32 saddr, struct net_device *dev)
1435 {
1436 int s, i;
1437 struct in_device *in_dev = __in_dev_get_rcu(dev);
1438 __be32 skeys[2] = { saddr, 0 };
1439 int ikeys[2] = { dev->ifindex, 0 };
1440 struct inet_peer *peer;
1441 struct net *net;
1442
1443 if (!in_dev)
1444 return;
1445
1446 net = dev_net(dev);
1447 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1448 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1449 ipv4_is_zeronet(new_gw))
1450 goto reject_redirect;
1451
1452 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1453 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1454 goto reject_redirect;
1455 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1456 goto reject_redirect;
1457 } else {
1458 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1459 goto reject_redirect;
1460 }
1461
1462 for (s = 0; s < 2; s++) {
1463 for (i = 0; i < 2; i++) {
1464 unsigned int hash;
1465 struct rtable __rcu **rthp;
1466 struct rtable *rt;
1467
1468 hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1469
1470 rthp = &rt_hash_table[hash].chain;
1471
1472 while ((rt = rcu_dereference(*rthp)) != NULL) {
1473 rthp = &rt->dst.rt_next;
1474
1475 if (rt->rt_key_dst != daddr ||
1476 rt->rt_key_src != skeys[s] ||
1477 rt->rt_oif != ikeys[i] ||
1478 rt_is_input_route(rt) ||
1479 rt_is_expired(rt) ||
1480 !net_eq(dev_net(rt->dst.dev), net) ||
1481 rt->dst.error ||
1482 rt->dst.dev != dev ||
1483 rt->rt_gateway != old_gw)
1484 continue;
1485
1486 if (!rt->peer)
1487 rt_bind_peer(rt, rt->rt_dst, 1);
1488
1489 peer = rt->peer;
1490 if (peer) {
1491 if (peer->redirect_learned.a4 != new_gw) {
1492 peer->redirect_learned.a4 = new_gw;
1493 atomic_inc(&__rt_peer_genid);
1494 }
1495 check_peer_redir(&rt->dst, peer);
1496 }
1497 }
1498 }
1499 }
1500 return;
1501
1502 reject_redirect:
1503 #ifdef CONFIG_IP_ROUTE_VERBOSE
1504 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1505 pr_info("Redirect from %pI4 on %s about %pI4 ignored\n"
1506 " Advised path = %pI4 -> %pI4\n",
1507 &old_gw, dev->name, &new_gw,
1508 &saddr, &daddr);
1509 #endif
1510 ;
1511 }
1512
1513 static bool peer_pmtu_expired(struct inet_peer *peer)
1514 {
1515 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1516
1517 return orig &&
1518 time_after_eq(jiffies, orig) &&
1519 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1520 }
1521
1522 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1523 {
1524 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1525
1526 return orig &&
1527 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1528 }
1529
1530 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1531 {
1532 struct rtable *rt = (struct rtable *)dst;
1533 struct dst_entry *ret = dst;
1534
1535 if (rt) {
1536 if (dst->obsolete > 0) {
1537 ip_rt_put(rt);
1538 ret = NULL;
1539 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1540 unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1541 rt->rt_oif,
1542 rt_genid(dev_net(dst->dev)));
1543 rt_del(hash, rt);
1544 ret = NULL;
1545 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1546 dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1547 }
1548 }
1549 return ret;
1550 }
1551
1552 /*
1553 * Algorithm:
1554 * 1. The first ip_rt_redirect_number redirects are sent
1555 * with exponential backoff, then we stop sending them at all,
1556 * assuming that the host ignores our redirects.
1557 * 2. If we did not see packets requiring redirects
1558 * during ip_rt_redirect_silence, we assume that the host
1559 * forgot the redirected route and start sending redirects again.
1560 *
1561 * This algorithm is much cheaper and more intelligent than dumb load limiting
1562 * in icmp.c.
1563 *
1564 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1565 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1566 */
1567
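/*
 * With the default tunables above (ip_rt_redirect_load = HZ/50, roughly
 * 20 ms, and ip_rt_redirect_silence = (HZ/50) << 10, roughly 20 s),
 * successive redirects to one host are spaced about 20 ms, 40 ms, ...,
 * 5 s apart; after ip_rt_redirect_number (9) of them we stay silent until
 * about 20 s pass without packets that would need a redirect.
 */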
1568 void ip_rt_send_redirect(struct sk_buff *skb)
1569 {
1570 struct rtable *rt = skb_rtable(skb);
1571 struct in_device *in_dev;
1572 struct inet_peer *peer;
1573 int log_martians;
1574
1575 rcu_read_lock();
1576 in_dev = __in_dev_get_rcu(rt->dst.dev);
1577 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1578 rcu_read_unlock();
1579 return;
1580 }
1581 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1582 rcu_read_unlock();
1583
1584 if (!rt->peer)
1585 rt_bind_peer(rt, rt->rt_dst, 1);
1586 peer = rt->peer;
1587 if (!peer) {
1588 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1589 return;
1590 }
1591
1592 /* No redirected packets during ip_rt_redirect_silence;
1593 * reset the algorithm.
1594 */
1595 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1596 peer->rate_tokens = 0;
1597
1598 /* Too many ignored redirects; do not send anything.
1599 * Set dst.rate_last to the last seen redirected packet.
1600 */
1601 if (peer->rate_tokens >= ip_rt_redirect_number) {
1602 peer->rate_last = jiffies;
1603 return;
1604 }
1605
1606 /* Check for load limit; set rate_last to the latest sent
1607 * redirect.
1608 */
1609 if (peer->rate_tokens == 0 ||
1610 time_after(jiffies,
1611 (peer->rate_last +
1612 (ip_rt_redirect_load << peer->rate_tokens)))) {
1613 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1614 peer->rate_last = jiffies;
1615 ++peer->rate_tokens;
1616 #ifdef CONFIG_IP_ROUTE_VERBOSE
1617 if (log_martians &&
1618 peer->rate_tokens == ip_rt_redirect_number &&
1619 net_ratelimit())
1620 pr_warn("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
1621 &ip_hdr(skb)->saddr, rt->rt_iif,
1622 &rt->rt_dst, &rt->rt_gateway);
1623 #endif
1624 }
1625 }
1626
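/*
 * Generate an ICMP destination-unreachable matching dst.error, rate
 * limited per destination via a token bucket on the inet_peer: tokens
 * accumulate with elapsed jiffies up to ip_rt_error_burst (5*HZ) and each
 * ICMP costs ip_rt_error_cost (HZ), i.e. short bursts of about five
 * messages, then roughly one per second.
 */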
1627 static int ip_error(struct sk_buff *skb)
1628 {
1629 struct rtable *rt = skb_rtable(skb);
1630 struct inet_peer *peer;
1631 unsigned long now;
1632 bool send;
1633 int code;
1634
1635 switch (rt->dst.error) {
1636 case EINVAL:
1637 default:
1638 goto out;
1639 case EHOSTUNREACH:
1640 code = ICMP_HOST_UNREACH;
1641 break;
1642 case ENETUNREACH:
1643 code = ICMP_NET_UNREACH;
1644 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1645 IPSTATS_MIB_INNOROUTES);
1646 break;
1647 case EACCES:
1648 code = ICMP_PKT_FILTERED;
1649 break;
1650 }
1651
1652 if (!rt->peer)
1653 rt_bind_peer(rt, rt->rt_dst, 1);
1654 peer = rt->peer;
1655
1656 send = true;
1657 if (peer) {
1658 now = jiffies;
1659 peer->rate_tokens += now - peer->rate_last;
1660 if (peer->rate_tokens > ip_rt_error_burst)
1661 peer->rate_tokens = ip_rt_error_burst;
1662 peer->rate_last = now;
1663 if (peer->rate_tokens >= ip_rt_error_cost)
1664 peer->rate_tokens -= ip_rt_error_cost;
1665 else
1666 send = false;
1667 }
1668 if (send)
1669 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1670
1671 out: kfree_skb(skb);
1672 return 0;
1673 }
1674
1675 /*
1676 * The last two values are not from the RFC but
1677 * are needed for AMPRnet AX.25 paths.
1678 */
1679
1680 static const unsigned short mtu_plateau[] =
1681 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1682
1683 static inline unsigned short guess_mtu(unsigned short old_mtu)
1684 {
1685 int i;
1686
1687 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1688 if (old_mtu > mtu_plateau[i])
1689 return mtu_plateau[i];
1690 return 68;
1691 }
1692
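/*
 * Handle an ICMP fragmentation-needed message.  If the advertised MTU is
 * missing or implausible (old BSD-derived stacks report 0 and mangle
 * tot_len), fall back to the next plateau below the original datagram
 * size.  The result, never below ip_rt_min_pmtu, is stored on the
 * inet_peer with an ip_rt_mtu_expires timeout.
 */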
1693 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1694 unsigned short new_mtu,
1695 struct net_device *dev)
1696 {
1697 unsigned short old_mtu = ntohs(iph->tot_len);
1698 unsigned short est_mtu = 0;
1699 struct inet_peer *peer;
1700
1701 peer = inet_getpeer_v4(iph->daddr, 1);
1702 if (peer) {
1703 unsigned short mtu = new_mtu;
1704
1705 if (new_mtu < 68 || new_mtu >= old_mtu) {
1706 /* BSD 4.2 derived systems incorrectly adjust
1707 * tot_len by the IP header length, and report
1708 * a zero MTU in the ICMP message.
1709 */
1710 if (mtu == 0 &&
1711 old_mtu >= 68 + (iph->ihl << 2))
1712 old_mtu -= iph->ihl << 2;
1713 mtu = guess_mtu(old_mtu);
1714 }
1715
1716 if (mtu < ip_rt_min_pmtu)
1717 mtu = ip_rt_min_pmtu;
1718 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1719 unsigned long pmtu_expires;
1720
1721 pmtu_expires = jiffies + ip_rt_mtu_expires;
1722 if (!pmtu_expires)
1723 pmtu_expires = 1UL;
1724
1725 est_mtu = mtu;
1726 peer->pmtu_learned = mtu;
1727 peer->pmtu_expires = pmtu_expires;
1728 atomic_inc(&__rt_peer_genid);
1729 }
1730
1731 inet_putpeer(peer);
1732 }
1733 return est_mtu ? : new_mtu;
1734 }
1735
1736 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1737 {
1738 unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1739
1740 if (!expires)
1741 return;
1742 if (time_before(jiffies, expires)) {
1743 u32 orig_dst_mtu = dst_mtu(dst);
1744 if (peer->pmtu_learned < orig_dst_mtu) {
1745 if (!peer->pmtu_orig)
1746 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1747 dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1748 }
1749 } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1750 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1751 }
1752
1753 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1754 {
1755 struct rtable *rt = (struct rtable *) dst;
1756 struct inet_peer *peer;
1757
1758 dst_confirm(dst);
1759
1760 if (!rt->peer)
1761 rt_bind_peer(rt, rt->rt_dst, 1);
1762 peer = rt->peer;
1763 if (peer) {
1764 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1765
1766 if (mtu < ip_rt_min_pmtu)
1767 mtu = ip_rt_min_pmtu;
1768 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1769
1770 pmtu_expires = jiffies + ip_rt_mtu_expires;
1771 if (!pmtu_expires)
1772 pmtu_expires = 1UL;
1773
1774 peer->pmtu_learned = mtu;
1775 peer->pmtu_expires = pmtu_expires;
1776
1777 atomic_inc(&__rt_peer_genid);
1778 rt->rt_peer_genid = rt_peer_genid();
1779 }
1780 check_peer_pmtu(dst, peer);
1781 }
1782 }
1783
1784
1785 static void ipv4_validate_peer(struct rtable *rt)
1786 {
1787 if (rt->rt_peer_genid != rt_peer_genid()) {
1788 struct inet_peer *peer;
1789
1790 if (!rt->peer)
1791 rt_bind_peer(rt, rt->rt_dst, 0);
1792
1793 peer = rt->peer;
1794 if (peer) {
1795 check_peer_pmtu(&rt->dst, peer);
1796
1797 if (peer->redirect_learned.a4 &&
1798 peer->redirect_learned.a4 != rt->rt_gateway)
1799 check_peer_redir(&rt->dst, peer);
1800 }
1801
1802 rt->rt_peer_genid = rt_peer_genid();
1803 }
1804 }
1805
1806 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1807 {
1808 struct rtable *rt = (struct rtable *) dst;
1809
1810 if (rt_is_expired(rt))
1811 return NULL;
1812 ipv4_validate_peer(rt);
1813 return dst;
1814 }
1815
1816 static void ipv4_dst_destroy(struct dst_entry *dst)
1817 {
1818 struct rtable *rt = (struct rtable *) dst;
1819 struct inet_peer *peer = rt->peer;
1820
1821 if (rt->fi) {
1822 fib_info_put(rt->fi);
1823 rt->fi = NULL;
1824 }
1825 if (peer) {
1826 rt->peer = NULL;
1827 inet_putpeer(peer);
1828 }
1829 }
1830
1831
1832 static void ipv4_link_failure(struct sk_buff *skb)
1833 {
1834 struct rtable *rt;
1835
1836 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1837
1838 rt = skb_rtable(skb);
1839 if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1840 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1841 }
1842
1843 static int ip_rt_bug(struct sk_buff *skb)
1844 {
1845 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1846 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1847 skb->dev ? skb->dev->name : "?");
1848 kfree_skb(skb);
1849 WARN_ON(1);
1850 return 0;
1851 }
1852
1853 /*
1854 We do not cache the source address of the outgoing interface,
1855 because it is used only by IP RR, TS and SRR options,
1856 so it is out of the fast path.
1857
1858 BTW remember: "addr" is allowed to be not aligned
1859 in IP options!
1860 */
1861
1862 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1863 {
1864 __be32 src;
1865
1866 if (rt_is_output_route(rt))
1867 src = ip_hdr(skb)->saddr;
1868 else {
1869 struct fib_result res;
1870 struct flowi4 fl4;
1871 struct iphdr *iph;
1872
1873 iph = ip_hdr(skb);
1874
1875 memset(&fl4, 0, sizeof(fl4));
1876 fl4.daddr = iph->daddr;
1877 fl4.saddr = iph->saddr;
1878 fl4.flowi4_tos = RT_TOS(iph->tos);
1879 fl4.flowi4_oif = rt->dst.dev->ifindex;
1880 fl4.flowi4_iif = skb->dev->ifindex;
1881 fl4.flowi4_mark = skb->mark;
1882
1883 rcu_read_lock();
1884 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1885 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1886 else
1887 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1888 RT_SCOPE_UNIVERSE);
1889 rcu_read_unlock();
1890 }
1891 memcpy(addr, &src, 4);
1892 }
1893
1894 #ifdef CONFIG_IP_ROUTE_CLASSID
1895 static void set_class_tag(struct rtable *rt, u32 tag)
1896 {
1897 if (!(rt->dst.tclassid & 0xFFFF))
1898 rt->dst.tclassid |= tag & 0xFFFF;
1899 if (!(rt->dst.tclassid & 0xFFFF0000))
1900 rt->dst.tclassid |= tag & 0xFFFF0000;
1901 }
1902 #endif
1903
1904 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1905 {
1906 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1907
1908 if (advmss == 0) {
1909 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1910 ip_rt_min_advmss);
1911 if (advmss > 65535 - 40)
1912 advmss = 65535 - 40;
1913 }
1914 return advmss;
1915 }
1916
1917 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1918 {
1919 const struct rtable *rt = (const struct rtable *) dst;
1920 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1921
1922 if (mtu && rt_is_output_route(rt))
1923 return mtu;
1924
1925 mtu = dst->dev->mtu;
1926
1927 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1928
1929 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1930 mtu = 576;
1931 }
1932
1933 if (mtu > IP_MAX_MTU)
1934 mtu = IP_MAX_MTU;
1935
1936 return mtu;
1937 }
1938
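/*
 * Attach metrics to a freshly built route: if an inet_peer exists (or is
 * created because FLOWI_FLAG_PRECOW_METRICS asked for writable metrics),
 * use the peer's array, seeding it from the fib_info on first use and
 * picking up any learned PMTU or redirect; otherwise point the dst at the
 * fib_info's metrics read-only.
 */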
1939 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1940 struct fib_info *fi)
1941 {
1942 struct inet_peer *peer;
1943 int create = 0;
1944
1945 /* If a peer entry exists for this destination, we must hook
1946 * it up in order to get at cached metrics.
1947 */
1948 if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1949 create = 1;
1950
1951 rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1952 if (peer) {
1953 rt->rt_peer_genid = rt_peer_genid();
1954 if (inet_metrics_new(peer))
1955 memcpy(peer->metrics, fi->fib_metrics,
1956 sizeof(u32) * RTAX_MAX);
1957 dst_init_metrics(&rt->dst, peer->metrics, false);
1958
1959 check_peer_pmtu(&rt->dst, peer);
1960
1961 if (peer->redirect_learned.a4 &&
1962 peer->redirect_learned.a4 != rt->rt_gateway) {
1963 rt->rt_gateway = peer->redirect_learned.a4;
1964 rt->rt_flags |= RTCF_REDIRECTED;
1965 }
1966 } else {
1967 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1968 rt->fi = fi;
1969 atomic_inc(&fi->fib_clntref);
1970 }
1971 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1972 }
1973 }
1974
1975 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1976 const struct fib_result *res,
1977 struct fib_info *fi, u16 type, u32 itag)
1978 {
1979 struct dst_entry *dst = &rt->dst;
1980
1981 if (fi) {
1982 if (FIB_RES_GW(*res) &&
1983 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1984 rt->rt_gateway = FIB_RES_GW(*res);
1985 rt_init_metrics(rt, fl4, fi);
1986 #ifdef CONFIG_IP_ROUTE_CLASSID
1987 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1988 #endif
1989 }
1990
1991 if (dst_mtu(dst) > IP_MAX_MTU)
1992 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1993 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1994 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1995
1996 #ifdef CONFIG_IP_ROUTE_CLASSID
1997 #ifdef CONFIG_IP_MULTIPLE_TABLES
1998 set_class_tag(rt, fib_rules_tclass(res));
1999 #endif
2000 set_class_tag(rt, itag);
2001 #endif
2002 }
2003
2004 static struct rtable *rt_dst_alloc(struct net_device *dev,
2005 bool nopolicy, bool noxfrm)
2006 {
2007 return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
2008 DST_HOST |
2009 (nopolicy ? DST_NOPOLICY : 0) |
2010 (noxfrm ? DST_NOXFRM : 0));
2011 }
2012
2013 /* called in rcu_read_lock() section */
2014 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2015 u8 tos, struct net_device *dev, int our)
2016 {
2017 unsigned int hash;
2018 struct rtable *rth;
2019 __be32 spec_dst;
2020 struct in_device *in_dev = __in_dev_get_rcu(dev);
2021 u32 itag = 0;
2022 int err;
2023
2024 /* Primary sanity checks. */
2025
2026 if (in_dev == NULL)
2027 return -EINVAL;
2028
2029 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2030 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
2031 goto e_inval;
2032
2033 if (ipv4_is_zeronet(saddr)) {
2034 if (!ipv4_is_local_multicast(daddr))
2035 goto e_inval;
2036 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2037 } else {
2038 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2039 &itag);
2040 if (err < 0)
2041 goto e_err;
2042 }
2043 rth = rt_dst_alloc(init_net.loopback_dev,
2044 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2045 if (!rth)
2046 goto e_nobufs;
2047
2048 #ifdef CONFIG_IP_ROUTE_CLASSID
2049 rth->dst.tclassid = itag;
2050 #endif
2051 rth->dst.output = ip_rt_bug;
2052
2053 rth->rt_key_dst = daddr;
2054 rth->rt_key_src = saddr;
2055 rth->rt_genid = rt_genid(dev_net(dev));
2056 rth->rt_flags = RTCF_MULTICAST;
2057 rth->rt_type = RTN_MULTICAST;
2058 rth->rt_key_tos = tos;
2059 rth->rt_dst = daddr;
2060 rth->rt_src = saddr;
2061 rth->rt_route_iif = dev->ifindex;
2062 rth->rt_iif = dev->ifindex;
2063 rth->rt_oif = 0;
2064 rth->rt_mark = skb->mark;
2065 rth->rt_gateway = daddr;
2066 rth->rt_spec_dst= spec_dst;
2067 rth->rt_peer_genid = 0;
2068 rth->peer = NULL;
2069 rth->fi = NULL;
2070 if (our) {
2071 rth->dst.input= ip_local_deliver;
2072 rth->rt_flags |= RTCF_LOCAL;
2073 }
2074
2075 #ifdef CONFIG_IP_MROUTE
2076 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2077 rth->dst.input = ip_mr_input;
2078 #endif
2079 RT_CACHE_STAT_INC(in_slow_mc);
2080
2081 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2082 rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2083 return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2084
2085 e_nobufs:
2086 return -ENOBUFS;
2087 e_inval:
2088 return -EINVAL;
2089 e_err:
2090 return err;
2091 }
2092
2093
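/*
 * Log a packet with a martian source address.  Per RFC 1812 the only
 * useful hint about the real sender is the link-layer header, so when
 * martian logging is enabled we print a rate-limited warning together
 * with a hex dump of the MAC header.
 */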
2094 static void ip_handle_martian_source(struct net_device *dev,
2095 struct in_device *in_dev,
2096 struct sk_buff *skb,
2097 __be32 daddr,
2098 __be32 saddr)
2099 {
2100 RT_CACHE_STAT_INC(in_martian_src);
2101 #ifdef CONFIG_IP_ROUTE_VERBOSE
2102 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2103 /*
2104 * RFC 1812 recommendation: if the source is martian,
2105 * the only hint we can log is the MAC header.
2106 */
2107 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
2108 &daddr, &saddr, dev->name);
2109 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2110 print_hex_dump(KERN_WARNING, "ll header: ",
2111 DUMP_PREFIX_OFFSET, 16, 1,
2112 skb_mac_header(skb),
2113 dev->hard_header_len, true);
2114 }
2115 }
2116 #endif
2117 }
2118
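/*
 * Create the cache entry for a forwarded packet: validate the source
 * address against the FIB (reverse path), decide whether an ICMP
 * redirect should be sent (same in/out device on shared media), refuse
 * non-IP traffic that is invalid for proxy arp, and finally allocate an
 * rtable whose handlers are ip_forward on input and ip_output on output.
 */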
2119 /* called in rcu_read_lock() section */
2120 static int __mkroute_input(struct sk_buff *skb,
2121 const struct fib_result *res,
2122 struct in_device *in_dev,
2123 __be32 daddr, __be32 saddr, u32 tos,
2124 struct rtable **result)
2125 {
2126 struct rtable *rth;
2127 int err;
2128 struct in_device *out_dev;
2129 unsigned int flags = 0;
2130 __be32 spec_dst;
2131 u32 itag;
2132
2133 /* get a working reference to the output device */
2134 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2135 if (out_dev == NULL) {
2136 if (net_ratelimit())
2137 pr_crit("Bug in ip_route_input_slow(). Please report.\n");
2138 return -EINVAL;
2139 }
2140
2141
2142 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2143 in_dev->dev, &spec_dst, &itag);
2144 if (err < 0) {
2145 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2146 saddr);
2147
2148 goto cleanup;
2149 }
2150
2151 if (err)
2152 flags |= RTCF_DIRECTSRC;
2153
2154 if (out_dev == in_dev && err &&
2155 (IN_DEV_SHARED_MEDIA(out_dev) ||
2156 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2157 flags |= RTCF_DOREDIRECT;
2158
2159 if (skb->protocol != htons(ETH_P_IP)) {
2160 /* Not IP (e.g. ARP). Do not create a route if it is
2161 * invalid for proxy arp. DNAT routes are always valid.
2162 *
2163 * The proxy arp feature has been extended to allow ARP
2164 * replies back on the same interface, to support
2165 * Private VLAN switch technologies. See arp.c.
2166 */
2167 if (out_dev == in_dev &&
2168 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2169 err = -EINVAL;
2170 goto cleanup;
2171 }
2172 }
2173
2174 rth = rt_dst_alloc(out_dev->dev,
2175 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2176 IN_DEV_CONF_GET(out_dev, NOXFRM));
2177 if (!rth) {
2178 err = -ENOBUFS;
2179 goto cleanup;
2180 }
2181
2182 rth->rt_key_dst = daddr;
2183 rth->rt_key_src = saddr;
2184 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2185 rth->rt_flags = flags;
2186 rth->rt_type = res->type;
2187 rth->rt_key_tos = tos;
2188 rth->rt_dst = daddr;
2189 rth->rt_src = saddr;
2190 rth->rt_route_iif = in_dev->dev->ifindex;
2191 rth->rt_iif = in_dev->dev->ifindex;
2192 rth->rt_oif = 0;
2193 rth->rt_mark = skb->mark;
2194 rth->rt_gateway = daddr;
2195 rth->rt_spec_dst= spec_dst;
2196 rth->rt_peer_genid = 0;
2197 rth->peer = NULL;
2198 rth->fi = NULL;
2199
2200 rth->dst.input = ip_forward;
2201 rth->dst.output = ip_output;
2202
2203 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2204
2205 *result = rth;
2206 err = 0;
2207 cleanup:
2208 return err;
2209 }
2210
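/*
 * Input-route creation wrapper: pick a nexthop when the FIB entry has
 * several (multipath), build the cache entry via __mkroute_input() and
 * insert it into the hash bucket keyed on (daddr, saddr, iif, genid).
 */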
2211 static int ip_mkroute_input(struct sk_buff *skb,
2212 struct fib_result *res,
2213 const struct flowi4 *fl4,
2214 struct in_device *in_dev,
2215 __be32 daddr, __be32 saddr, u32 tos)
2216 {
2217 struct rtable* rth = NULL;
2218 int err;
2219 unsigned hash;
2220
2221 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2222 if (res->fi && res->fi->fib_nhs > 1)
2223 fib_select_multipath(res);
2224 #endif
2225
2226 /* create a routing cache entry */
2227 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2228 if (err)
2229 return err;
2230
2231 /* put it into the cache */
2232 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2233 rt_genid(dev_net(rth->dst.dev)));
2234 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2235 if (IS_ERR(rth))
2236 return PTR_ERR(rth);
2237 return 0;
2238 }
2239
2240 /*
2241 * NOTE. We drop all packets that have a local source
2242 * address, because every properly looped-back packet
2243 * must already have the correct destination attached by the output routine.
2244 *
2245 * This approach solves two big problems:
2246 * 1. Non-simplex devices are handled properly.
2247 * 2. IP spoofing attempts are always filtered out.
2248 * Called with rcu_read_lock().
2249 */
2250
2251 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2252 u8 tos, struct net_device *dev)
2253 {
2254 struct fib_result res;
2255 struct in_device *in_dev = __in_dev_get_rcu(dev);
2256 struct flowi4 fl4;
2257 unsigned flags = 0;
2258 u32 itag = 0;
2259 struct rtable * rth;
2260 unsigned hash;
2261 __be32 spec_dst;
2262 int err = -EINVAL;
2263 struct net * net = dev_net(dev);
2264
2265 /* IP on this device is disabled. */
2266
2267 if (!in_dev)
2268 goto out;
2269
2270 /* Check for the most blatant martians, which cannot be detected
2271 by fib_lookup.
2272 */
2273
2274 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2275 ipv4_is_loopback(saddr))
2276 goto martian_source;
2277
2278 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2279 goto brd_input;
2280
2281 /* Accept zero addresses only to limited broadcast;
2282 * it is not clear whether this should be fixed. Waiting for complaints :-)
2283 */
2284 if (ipv4_is_zeronet(saddr))
2285 goto martian_source;
2286
2287 if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2288 goto martian_destination;
2289
2290 /*
2291 * Now we are ready to route packet.
2292 */
2293 fl4.flowi4_oif = 0;
2294 fl4.flowi4_iif = dev->ifindex;
2295 fl4.flowi4_mark = skb->mark;
2296 fl4.flowi4_tos = tos;
2297 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2298 fl4.daddr = daddr;
2299 fl4.saddr = saddr;
2300 err = fib_lookup(net, &fl4, &res);
2301 if (err != 0) {
2302 if (!IN_DEV_FORWARD(in_dev))
2303 goto e_hostunreach;
2304 goto no_route;
2305 }
2306
2307 RT_CACHE_STAT_INC(in_slow_tot);
2308
2309 if (res.type == RTN_BROADCAST)
2310 goto brd_input;
2311
2312 if (res.type == RTN_LOCAL) {
2313 err = fib_validate_source(skb, saddr, daddr, tos,
2314 net->loopback_dev->ifindex,
2315 dev, &spec_dst, &itag);
2316 if (err < 0)
2317 goto martian_source_keep_err;
2318 if (err)
2319 flags |= RTCF_DIRECTSRC;
2320 spec_dst = daddr;
2321 goto local_input;
2322 }
2323
2324 if (!IN_DEV_FORWARD(in_dev))
2325 goto e_hostunreach;
2326 if (res.type != RTN_UNICAST)
2327 goto martian_destination;
2328
2329 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2330 out: return err;
2331
2332 brd_input:
2333 if (skb->protocol != htons(ETH_P_IP))
2334 goto e_inval;
2335
2336 if (ipv4_is_zeronet(saddr))
2337 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2338 else {
2339 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2340 &itag);
2341 if (err < 0)
2342 goto martian_source_keep_err;
2343 if (err)
2344 flags |= RTCF_DIRECTSRC;
2345 }
2346 flags |= RTCF_BROADCAST;
2347 res.type = RTN_BROADCAST;
2348 RT_CACHE_STAT_INC(in_brd);
2349
2350 local_input:
2351 rth = rt_dst_alloc(net->loopback_dev,
2352 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2353 if (!rth)
2354 goto e_nobufs;
2355
2356 rth->dst.input= ip_local_deliver;
2357 rth->dst.output= ip_rt_bug;
2358 #ifdef CONFIG_IP_ROUTE_CLASSID
2359 rth->dst.tclassid = itag;
2360 #endif
2361
2362 rth->rt_key_dst = daddr;
2363 rth->rt_key_src = saddr;
2364 rth->rt_genid = rt_genid(net);
2365 rth->rt_flags = flags|RTCF_LOCAL;
2366 rth->rt_type = res.type;
2367 rth->rt_key_tos = tos;
2368 rth->rt_dst = daddr;
2369 rth->rt_src = saddr;
2373 rth->rt_route_iif = dev->ifindex;
2374 rth->rt_iif = dev->ifindex;
2375 rth->rt_oif = 0;
2376 rth->rt_mark = skb->mark;
2377 rth->rt_gateway = daddr;
2378 rth->rt_spec_dst= spec_dst;
2379 rth->rt_peer_genid = 0;
2380 rth->peer = NULL;
2381 rth->fi = NULL;
2382 if (res.type == RTN_UNREACHABLE) {
2383 rth->dst.input= ip_error;
2384 rth->dst.error= -err;
2385 rth->rt_flags &= ~RTCF_LOCAL;
2386 }
2387 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2388 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2389 err = 0;
2390 if (IS_ERR(rth))
2391 err = PTR_ERR(rth);
2392 goto out;
2393
2394 no_route:
2395 RT_CACHE_STAT_INC(in_no_route);
2396 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2397 res.type = RTN_UNREACHABLE;
2398 if (err == -ESRCH)
2399 err = -ENETUNREACH;
2400 goto local_input;
2401
2402 /*
2403 * Do not cache martian addresses: they should be logged (RFC1812)
2404 */
2405 martian_destination:
2406 RT_CACHE_STAT_INC(in_martian_dst);
2407 #ifdef CONFIG_IP_ROUTE_VERBOSE
2408 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2409 pr_warn("martian destination %pI4 from %pI4, dev %s\n",
2410 &daddr, &saddr, dev->name);
2411 #endif
2412
2413 e_hostunreach:
2414 err = -EHOSTUNREACH;
2415 goto out;
2416
2417 e_inval:
2418 err = -EINVAL;
2419 goto out;
2420
2421 e_nobufs:
2422 err = -ENOBUFS;
2423 goto out;
2424
2425 martian_source:
2426 err = -EINVAL;
2427 martian_source_keep_err:
2428 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2429 goto out;
2430 }
2431
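/*
 * Input routing fast path: look the packet up in the route cache under
 * rcu, matching on destination, source, input interface, TOS, mark and
 * namespace.  On a hit the cached dst is attached to the skb (by
 * reference or noref); otherwise fall through to multicast recognition
 * or to the slow path in ip_route_input_slow().
 */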
2432 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2433 u8 tos, struct net_device *dev, bool noref)
2434 {
2435 struct rtable * rth;
2436 unsigned hash;
2437 int iif = dev->ifindex;
2438 struct net *net;
2439 int res;
2440
2441 net = dev_net(dev);
2442
2443 rcu_read_lock();
2444
2445 if (!rt_caching(net))
2446 goto skip_cache;
2447
2448 tos &= IPTOS_RT_MASK;
2449 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2450
2451 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2452 rth = rcu_dereference(rth->dst.rt_next)) {
2453 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2454 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2455 (rth->rt_route_iif ^ iif) |
2456 (rth->rt_key_tos ^ tos)) == 0 &&
2457 rth->rt_mark == skb->mark &&
2458 net_eq(dev_net(rth->dst.dev), net) &&
2459 !rt_is_expired(rth)) {
2460 ipv4_validate_peer(rth);
2461 if (noref) {
2462 dst_use_noref(&rth->dst, jiffies);
2463 skb_dst_set_noref(skb, &rth->dst);
2464 } else {
2465 dst_use(&rth->dst, jiffies);
2466 skb_dst_set(skb, &rth->dst);
2467 }
2468 RT_CACHE_STAT_INC(in_hit);
2469 rcu_read_unlock();
2470 return 0;
2471 }
2472 RT_CACHE_STAT_INC(in_hlist_search);
2473 }
2474
2475 skip_cache:
2476 /* Multicast recognition logic was moved from the route cache to here.
2477 The problem was that too many Ethernet cards have broken/missing
2478 hardware multicast filters :-( As a result, a host on a multicast
2479 network acquires a lot of useless route cache entries, e.g. for
2480 SDR messages from all over the world. Now we try to get rid of them.
2481 Provided the software IP multicast filter is organized
2482 reasonably (at least, hashed), this does not cause a slowdown
2483 compared with route cache reject entries.
2484 Note that multicast routers are not affected, because a
2485 route cache entry is created eventually.
2486 */
2487 if (ipv4_is_multicast(daddr)) {
2488 struct in_device *in_dev = __in_dev_get_rcu(dev);
2489
2490 if (in_dev) {
2491 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2492 ip_hdr(skb)->protocol);
2493 if (our
2494 #ifdef CONFIG_IP_MROUTE
2495 ||
2496 (!ipv4_is_local_multicast(daddr) &&
2497 IN_DEV_MFORWARD(in_dev))
2498 #endif
2499 ) {
2500 int res = ip_route_input_mc(skb, daddr, saddr,
2501 tos, dev, our);
2502 rcu_read_unlock();
2503 return res;
2504 }
2505 }
2506 rcu_read_unlock();
2507 return -EINVAL;
2508 }
2509 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2510 rcu_read_unlock();
2511 return res;
2512 }
2513 EXPORT_SYMBOL(ip_route_input_common);
2514
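/*
 * Build the cache entry for a locally generated packet: classify the
 * destination (local, broadcast, multicast or plain unicast), allocate
 * the rtable, choose the output handler (ip_output by default,
 * ip_mc_output for non-loopback broadcast/multicast), wire ip_mr_input
 * as the input handler for forwarded multicast, and fill in the nexthop
 * and metrics via rt_set_nexthop().
 */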
2515 /* called with rcu_read_lock() */
2516 static struct rtable *__mkroute_output(const struct fib_result *res,
2517 const struct flowi4 *fl4,
2518 __be32 orig_daddr, __be32 orig_saddr,
2519 int orig_oif, __u8 orig_rtos,
2520 struct net_device *dev_out,
2521 unsigned int flags)
2522 {
2523 struct fib_info *fi = res->fi;
2524 struct in_device *in_dev;
2525 u16 type = res->type;
2526 struct rtable *rth;
2527
2528 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2529 return ERR_PTR(-EINVAL);
2530
2531 if (ipv4_is_lbcast(fl4->daddr))
2532 type = RTN_BROADCAST;
2533 else if (ipv4_is_multicast(fl4->daddr))
2534 type = RTN_MULTICAST;
2535 else if (ipv4_is_zeronet(fl4->daddr))
2536 return ERR_PTR(-EINVAL);
2537
2538 if (dev_out->flags & IFF_LOOPBACK)
2539 flags |= RTCF_LOCAL;
2540
2541 in_dev = __in_dev_get_rcu(dev_out);
2542 if (!in_dev)
2543 return ERR_PTR(-EINVAL);
2544
2545 if (type == RTN_BROADCAST) {
2546 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2547 fi = NULL;
2548 } else if (type == RTN_MULTICAST) {
2549 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2550 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2551 fl4->flowi4_proto))
2552 flags &= ~RTCF_LOCAL;
2553 /* If a multicast route does not exist, use
2554 * the default one, but do not gateway in this case.
2555 * Yes, it is a hack.
2556 */
2557 if (fi && res->prefixlen < 4)
2558 fi = NULL;
2559 }
2560
2561 rth = rt_dst_alloc(dev_out,
2562 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2563 IN_DEV_CONF_GET(in_dev, NOXFRM));
2564 if (!rth)
2565 return ERR_PTR(-ENOBUFS);
2566
2567 rth->dst.output = ip_output;
2568
2569 rth->rt_key_dst = orig_daddr;
2570 rth->rt_key_src = orig_saddr;
2571 rth->rt_genid = rt_genid(dev_net(dev_out));
2572 rth->rt_flags = flags;
2573 rth->rt_type = type;
2574 rth->rt_key_tos = orig_rtos;
2575 rth->rt_dst = fl4->daddr;
2576 rth->rt_src = fl4->saddr;
2577 rth->rt_route_iif = 0;
2578 rth->rt_iif = orig_oif ? : dev_out->ifindex;
2579 rth->rt_oif = orig_oif;
2580 rth->rt_mark = fl4->flowi4_mark;
2581 rth->rt_gateway = fl4->daddr;
2582 rth->rt_spec_dst= fl4->saddr;
2583 rth->rt_peer_genid = 0;
2584 rth->peer = NULL;
2585 rth->fi = NULL;
2586
2587 RT_CACHE_STAT_INC(out_slow_tot);
2588
2589 if (flags & RTCF_LOCAL) {
2590 rth->dst.input = ip_local_deliver;
2591 rth->rt_spec_dst = fl4->daddr;
2592 }
2593 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2594 rth->rt_spec_dst = fl4->saddr;
2595 if (flags & RTCF_LOCAL &&
2596 !(dev_out->flags & IFF_LOOPBACK)) {
2597 rth->dst.output = ip_mc_output;
2598 RT_CACHE_STAT_INC(out_slow_mc);
2599 }
2600 #ifdef CONFIG_IP_MROUTE
2601 if (type == RTN_MULTICAST) {
2602 if (IN_DEV_MFORWARD(in_dev) &&
2603 !ipv4_is_local_multicast(fl4->daddr)) {
2604 rth->dst.input = ip_mr_input;
2605 rth->dst.output = ip_mc_output;
2606 }
2607 }
2608 #endif
2609 }
2610
2611 rt_set_nexthop(rth, fl4, res, fi, type, 0);
2612
2613 return rth;
2614 }
2615
2616 /*
2617 * Major route resolver routine.
2618 * called with rcu_read_lock();
2619 */
2620
2621 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2622 {
2623 struct net_device *dev_out = NULL;
2624 __u8 tos = RT_FL_TOS(fl4);
2625 unsigned int flags = 0;
2626 struct fib_result res;
2627 struct rtable *rth;
2628 __be32 orig_daddr;
2629 __be32 orig_saddr;
2630 int orig_oif;
2631
2632 res.fi = NULL;
2633 #ifdef CONFIG_IP_MULTIPLE_TABLES
2634 res.r = NULL;
2635 #endif
2636
2637 orig_daddr = fl4->daddr;
2638 orig_saddr = fl4->saddr;
2639 orig_oif = fl4->flowi4_oif;
2640
2641 fl4->flowi4_iif = net->loopback_dev->ifindex;
2642 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2643 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2644 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2645
2646 rcu_read_lock();
2647 if (fl4->saddr) {
2648 rth = ERR_PTR(-EINVAL);
2649 if (ipv4_is_multicast(fl4->saddr) ||
2650 ipv4_is_lbcast(fl4->saddr) ||
2651 ipv4_is_zeronet(fl4->saddr))
2652 goto out;
2653
2654 /* I removed the check for oif == dev_out->oif here.
2655 It was wrong for two reasons:
2656 1. ip_dev_find(net, saddr) can return the wrong iface, if saddr
2657 is assigned to multiple interfaces.
2658 2. Moreover, we are allowed to send packets with the saddr
2659 of another iface. --ANK
2660 */
2661
2662 if (fl4->flowi4_oif == 0 &&
2663 (ipv4_is_multicast(fl4->daddr) ||
2664 ipv4_is_lbcast(fl4->daddr))) {
2665 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2666 dev_out = __ip_dev_find(net, fl4->saddr, false);
2667 if (dev_out == NULL)
2668 goto out;
2669
2670 /* Special hack: the user can direct multicasts
2671 and limited broadcast via the desired interface
2672 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2673 This hack is not just for fun, it allows
2674 vic, vat and friends to work.
2675 They bind a socket to loopback, set the ttl to zero
2676 and expect that it will work.
2677 From the viewpoint of the routing cache they are broken,
2678 because we are not allowed to build a multicast path
2679 with a loopback source addr (the routing cache
2680 cannot know that the ttl is zero, so the packet
2681 will not leave this host and the route is valid).
2682 Luckily, this hack is a good workaround.
2683 */
2684
2685 fl4->flowi4_oif = dev_out->ifindex;
2686 goto make_route;
2687 }
2688
2689 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2690 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2691 if (!__ip_dev_find(net, fl4->saddr, false))
2692 goto out;
2693 }
2694 }
2695
2696
2697 if (fl4->flowi4_oif) {
2698 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2699 rth = ERR_PTR(-ENODEV);
2700 if (dev_out == NULL)
2701 goto out;
2702
2703 /* RACE: Check return value of inet_select_addr instead. */
2704 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2705 rth = ERR_PTR(-ENETUNREACH);
2706 goto out;
2707 }
2708 if (ipv4_is_local_multicast(fl4->daddr) ||
2709 ipv4_is_lbcast(fl4->daddr)) {
2710 if (!fl4->saddr)
2711 fl4->saddr = inet_select_addr(dev_out, 0,
2712 RT_SCOPE_LINK);
2713 goto make_route;
2714 }
2715 if (fl4->saddr) {
2716 if (ipv4_is_multicast(fl4->daddr))
2717 fl4->saddr = inet_select_addr(dev_out, 0,
2718 fl4->flowi4_scope);
2719 else if (!fl4->daddr)
2720 fl4->saddr = inet_select_addr(dev_out, 0,
2721 RT_SCOPE_HOST);
2722 }
2723 }
2724
2725 if (!fl4->daddr) {
2726 fl4->daddr = fl4->saddr;
2727 if (!fl4->daddr)
2728 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2729 dev_out = net->loopback_dev;
2730 fl4->flowi4_oif = net->loopback_dev->ifindex;
2731 res.type = RTN_LOCAL;
2732 flags |= RTCF_LOCAL;
2733 goto make_route;
2734 }
2735
2736 if (fib_lookup(net, fl4, &res)) {
2737 res.fi = NULL;
2738 if (fl4->flowi4_oif) {
2739 /* Apparently, the routing tables are wrong. Assume
2740 that the destination is on link.
2741
2742 WHY? DW.
2743 Because we are allowed to send to an iface
2744 even if it has NO routes and NO assigned
2745 addresses. When oif is specified, the routing
2746 tables are looked up with only one purpose:
2747 to catch whether the destination is gatewayed, rather than
2748 direct. Moreover, if MSG_DONTROUTE is set,
2749 we send the packet, ignoring both the routing tables
2750 and the ifaddr state. --ANK
2751
2752
2753 We could do this even if oif is unknown
2754 (IPv6 likely does), but we do not.
2755 */
2756
2757 if (fl4->saddr == 0)
2758 fl4->saddr = inet_select_addr(dev_out, 0,
2759 RT_SCOPE_LINK);
2760 res.type = RTN_UNICAST;
2761 goto make_route;
2762 }
2763 rth = ERR_PTR(-ENETUNREACH);
2764 goto out;
2765 }
2766
2767 if (res.type == RTN_LOCAL) {
2768 if (!fl4->saddr) {
2769 if (res.fi->fib_prefsrc)
2770 fl4->saddr = res.fi->fib_prefsrc;
2771 else
2772 fl4->saddr = fl4->daddr;
2773 }
2774 dev_out = net->loopback_dev;
2775 fl4->flowi4_oif = dev_out->ifindex;
2776 res.fi = NULL;
2777 flags |= RTCF_LOCAL;
2778 goto make_route;
2779 }
2780
2781 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2782 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2783 fib_select_multipath(&res);
2784 else
2785 #endif
2786 if (!res.prefixlen &&
2787 res.table->tb_num_default > 1 &&
2788 res.type == RTN_UNICAST && !fl4->flowi4_oif)
2789 fib_select_default(&res);
2790
2791 if (!fl4->saddr)
2792 fl4->saddr = FIB_RES_PREFSRC(net, res);
2793
2794 dev_out = FIB_RES_DEV(res);
2795 fl4->flowi4_oif = dev_out->ifindex;
2796
2797
2798 make_route:
2799 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2800 tos, dev_out, flags);
2801 if (!IS_ERR(rth)) {
2802 unsigned int hash;
2803
2804 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2805 rt_genid(dev_net(dev_out)));
2806 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2807 }
2808
2809 out:
2810 rcu_read_unlock();
2811 return rth;
2812 }
2813
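/*
 * Output routing fast path: when caching is enabled, scan the hash
 * chain under rcu_read_lock_bh() for an output route matching the key
 * destination, source, oif, mark and TOS in the current namespace and
 * generation; on a miss fall back to ip_route_output_slow().
 */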
2814 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2815 {
2816 struct rtable *rth;
2817 unsigned int hash;
2818
2819 if (!rt_caching(net))
2820 goto slow_output;
2821
2822 hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2823
2824 rcu_read_lock_bh();
2825 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2826 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2827 if (rth->rt_key_dst == flp4->daddr &&
2828 rth->rt_key_src == flp4->saddr &&
2829 rt_is_output_route(rth) &&
2830 rth->rt_oif == flp4->flowi4_oif &&
2831 rth->rt_mark == flp4->flowi4_mark &&
2832 !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2833 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2834 net_eq(dev_net(rth->dst.dev), net) &&
2835 !rt_is_expired(rth)) {
2836 ipv4_validate_peer(rth);
2837 dst_use(&rth->dst, jiffies);
2838 RT_CACHE_STAT_INC(out_hit);
2839 rcu_read_unlock_bh();
2840 if (!flp4->saddr)
2841 flp4->saddr = rth->rt_src;
2842 if (!flp4->daddr)
2843 flp4->daddr = rth->rt_dst;
2844 return rth;
2845 }
2846 RT_CACHE_STAT_INC(out_hlist_search);
2847 }
2848 rcu_read_unlock_bh();
2849
2850 slow_output:
2851 return ip_route_output_slow(net, flp4);
2852 }
2853 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2854
2855 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2856 {
2857 return NULL;
2858 }
2859
2860 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2861 {
2862 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2863
2864 return mtu ? : dst->dev->mtu;
2865 }
2866
2867 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2868 {
2869 }
2870
2871 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2872 unsigned long old)
2873 {
2874 return NULL;
2875 }
2876
2877 static struct dst_ops ipv4_dst_blackhole_ops = {
2878 .family = AF_INET,
2879 .protocol = cpu_to_be16(ETH_P_IP),
2880 .destroy = ipv4_dst_destroy,
2881 .check = ipv4_blackhole_dst_check,
2882 .mtu = ipv4_blackhole_mtu,
2883 .default_advmss = ipv4_default_advmss,
2884 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2885 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2886 .neigh_lookup = ipv4_neigh_lookup,
2887 };
2888
2889 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2890 {
2891 struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2892 struct rtable *ort = (struct rtable *) dst_orig;
2893
2894 if (rt) {
2895 struct dst_entry *new = &rt->dst;
2896
2897 new->__use = 1;
2898 new->input = dst_discard;
2899 new->output = dst_discard;
2900 dst_copy_metrics(new, &ort->dst);
2901
2902 new->dev = ort->dst.dev;
2903 if (new->dev)
2904 dev_hold(new->dev);
2905
2906 rt->rt_key_dst = ort->rt_key_dst;
2907 rt->rt_key_src = ort->rt_key_src;
2908 rt->rt_key_tos = ort->rt_key_tos;
2909 rt->rt_route_iif = ort->rt_route_iif;
2910 rt->rt_iif = ort->rt_iif;
2911 rt->rt_oif = ort->rt_oif;
2912 rt->rt_mark = ort->rt_mark;
2913
2914 rt->rt_genid = rt_genid(net);
2915 rt->rt_flags = ort->rt_flags;
2916 rt->rt_type = ort->rt_type;
2917 rt->rt_dst = ort->rt_dst;
2918 rt->rt_src = ort->rt_src;
2919 rt->rt_gateway = ort->rt_gateway;
2920 rt->rt_spec_dst = ort->rt_spec_dst;
2921 rt->peer = ort->peer;
2922 if (rt->peer)
2923 atomic_inc(&rt->peer->refcnt);
2924 rt->fi = ort->fi;
2925 if (rt->fi)
2926 atomic_inc(&rt->fi->fib_clntref);
2927
2928 dst_free(new);
2929 }
2930
2931 dst_release(dst_orig);
2932
2933 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2934 }
2935
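/*
 * Resolve an output route and, when a transport protocol is specified
 * in the flow, run the result through xfrm_lookup() so that IPsec
 * policy can transform or reject it.
 */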
2936 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2937 struct sock *sk)
2938 {
2939 struct rtable *rt = __ip_route_output_key(net, flp4);
2940
2941 if (IS_ERR(rt))
2942 return rt;
2943
2944 if (flp4->flowi4_proto)
2945 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2946 flowi4_to_flowi(flp4),
2947 sk, 0);
2948
2949 return rt;
2950 }
2951 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2952
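/*
 * Serialize a cached route into an RTM_NEWROUTE netlink message: the
 * rtmsg header plus RTA_* attributes for destination, source, oif,
 * gateway, metrics and mark, and cache info (id, timestamps, expiry)
 * derived from the inet_peer entry when one is attached.
 */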
2953 static int rt_fill_info(struct net *net,
2954 struct sk_buff *skb, u32 pid, u32 seq, int event,
2955 int nowait, unsigned int flags)
2956 {
2957 struct rtable *rt = skb_rtable(skb);
2958 struct rtmsg *r;
2959 struct nlmsghdr *nlh;
2960 unsigned long expires = 0;
2961 const struct inet_peer *peer = rt->peer;
2962 u32 id = 0, ts = 0, tsage = 0, error;
2963
2964 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2965 if (nlh == NULL)
2966 return -EMSGSIZE;
2967
2968 r = nlmsg_data(nlh);
2969 r->rtm_family = AF_INET;
2970 r->rtm_dst_len = 32;
2971 r->rtm_src_len = 0;
2972 r->rtm_tos = rt->rt_key_tos;
2973 r->rtm_table = RT_TABLE_MAIN;
2974 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2975 r->rtm_type = rt->rt_type;
2976 r->rtm_scope = RT_SCOPE_UNIVERSE;
2977 r->rtm_protocol = RTPROT_UNSPEC;
2978 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2979 if (rt->rt_flags & RTCF_NOTIFY)
2980 r->rtm_flags |= RTM_F_NOTIFY;
2981
2982 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2983
2984 if (rt->rt_key_src) {
2985 r->rtm_src_len = 32;
2986 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2987 }
2988 if (rt->dst.dev)
2989 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2990 #ifdef CONFIG_IP_ROUTE_CLASSID
2991 if (rt->dst.tclassid)
2992 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2993 #endif
2994 if (rt_is_input_route(rt))
2995 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2996 else if (rt->rt_src != rt->rt_key_src)
2997 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2998
2999 if (rt->rt_dst != rt->rt_gateway)
3000 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
3001
3002 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
3003 goto nla_put_failure;
3004
3005 if (rt->rt_mark)
3006 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
3007
3008 error = rt->dst.error;
3009 if (peer) {
3010 inet_peer_refcheck(rt->peer);
3011 id = atomic_read(&peer->ip_id_count) & 0xffff;
3012 if (peer->tcp_ts_stamp) {
3013 ts = peer->tcp_ts;
3014 tsage = get_seconds() - peer->tcp_ts_stamp;
3015 }
3016 expires = ACCESS_ONCE(peer->pmtu_expires);
3017 if (expires) {
3018 if (time_before(jiffies, expires))
3019 expires -= jiffies;
3020 else
3021 expires = 0;
3022 }
3023 }
3024
3025 if (rt_is_input_route(rt)) {
3026 #ifdef CONFIG_IP_MROUTE
3027 __be32 dst = rt->rt_dst;
3028
3029 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
3030 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
3031 int err = ipmr_get_route(net, skb,
3032 rt->rt_src, rt->rt_dst,
3033 r, nowait);
3034 if (err <= 0) {
3035 if (!nowait) {
3036 if (err == 0)
3037 return 0;
3038 goto nla_put_failure;
3039 } else {
3040 if (err == -EMSGSIZE)
3041 goto nla_put_failure;
3042 error = err;
3043 }
3044 }
3045 } else
3046 #endif
3047 NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
3048 }
3049
3050 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
3051 expires, error) < 0)
3052 goto nla_put_failure;
3053
3054 return nlmsg_end(skb, nlh);
3055
3056 nla_put_failure:
3057 nlmsg_cancel(skb, nlh);
3058 return -EMSGSIZE;
3059 }
3060
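/*
 * RTM_GETROUTE handler (used by "ip route get"): build a dummy skb,
 * resolve the route either as an input lookup (when RTA_IIF is
 * supplied) or as an output lookup, and return the result to the
 * requester via rt_fill_info() and rtnl_unicast().
 */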
3061 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
3062 {
3063 struct net *net = sock_net(in_skb->sk);
3064 struct rtmsg *rtm;
3065 struct nlattr *tb[RTA_MAX+1];
3066 struct rtable *rt = NULL;
3067 __be32 dst = 0;
3068 __be32 src = 0;
3069 u32 iif;
3070 int err;
3071 int mark;
3072 struct sk_buff *skb;
3073
3074 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3075 if (err < 0)
3076 goto errout;
3077
3078 rtm = nlmsg_data(nlh);
3079
3080 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3081 if (skb == NULL) {
3082 err = -ENOBUFS;
3083 goto errout;
3084 }
3085
3086 /* Reserve room for dummy headers; this skb can pass
3087 through a good chunk of the routing engine.
3088 */
3089 skb_reset_mac_header(skb);
3090 skb_reset_network_header(skb);
3091
3092 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3093 ip_hdr(skb)->protocol = IPPROTO_ICMP;
3094 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3095
3096 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3097 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3098 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3099 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3100
3101 if (iif) {
3102 struct net_device *dev;
3103
3104 dev = __dev_get_by_index(net, iif);
3105 if (dev == NULL) {
3106 err = -ENODEV;
3107 goto errout_free;
3108 }
3109
3110 skb->protocol = htons(ETH_P_IP);
3111 skb->dev = dev;
3112 skb->mark = mark;
3113 local_bh_disable();
3114 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3115 local_bh_enable();
3116
3117 rt = skb_rtable(skb);
3118 if (err == 0 && rt->dst.error)
3119 err = -rt->dst.error;
3120 } else {
3121 struct flowi4 fl4 = {
3122 .daddr = dst,
3123 .saddr = src,
3124 .flowi4_tos = rtm->rtm_tos,
3125 .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3126 .flowi4_mark = mark,
3127 };
3128 rt = ip_route_output_key(net, &fl4);
3129
3130 err = 0;
3131 if (IS_ERR(rt))
3132 err = PTR_ERR(rt);
3133 }
3134
3135 if (err)
3136 goto errout_free;
3137
3138 skb_dst_set(skb, &rt->dst);
3139 if (rtm->rtm_flags & RTM_F_NOTIFY)
3140 rt->rt_flags |= RTCF_NOTIFY;
3141
3142 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3143 RTM_NEWROUTE, 0, 0);
3144 if (err <= 0)
3145 goto errout_free;
3146
3147 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3148 errout:
3149 return err;
3150
3151 errout_free:
3152 kfree_skb(skb);
3153 goto errout;
3154 }
3155
3156 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3157 {
3158 struct rtable *rt;
3159 int h, s_h;
3160 int idx, s_idx;
3161 struct net *net;
3162
3163 net = sock_net(skb->sk);
3164
3165 s_h = cb->args[0];
3166 if (s_h < 0)
3167 s_h = 0;
3168 s_idx = idx = cb->args[1];
3169 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3170 if (!rt_hash_table[h].chain)
3171 continue;
3172 rcu_read_lock_bh();
3173 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3174 rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3175 if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3176 continue;
3177 if (rt_is_expired(rt))
3178 continue;
3179 skb_dst_set_noref(skb, &rt->dst);
3180 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3181 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3182 1, NLM_F_MULTI) <= 0) {
3183 skb_dst_drop(skb);
3184 rcu_read_unlock_bh();
3185 goto done;
3186 }
3187 skb_dst_drop(skb);
3188 }
3189 rcu_read_unlock_bh();
3190 }
3191
3192 done:
3193 cb->args[0] = h;
3194 cb->args[1] = idx;
3195 return skb->len;
3196 }
3197
3198 void ip_rt_multicast_event(struct in_device *in_dev)
3199 {
3200 rt_cache_flush(dev_net(in_dev->dev), 0);
3201 }
3202
3203 #ifdef CONFIG_SYSCTL
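/*
 * Handler for the net.ipv4.route.flush sysctl: writing to
 * /proc/sys/net/ipv4/route/flush (e.g. "echo 0 > .../flush") passes the
 * written value to rt_cache_flush() and flushes the routing cache of
 * the owning network namespace.
 */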
3204 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3205 void __user *buffer,
3206 size_t *lenp, loff_t *ppos)
3207 {
3208 if (write) {
3209 int flush_delay;
3210 ctl_table ctl;
3211 struct net *net;
3212
3213 memcpy(&ctl, __ctl, sizeof(ctl));
3214 ctl.data = &flush_delay;
3215 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3216
3217 net = (struct net *)__ctl->extra1;
3218 rt_cache_flush(net, flush_delay);
3219 return 0;
3220 }
3221
3222 return -EINVAL;
3223 }
3224
3225 static ctl_table ipv4_route_table[] = {
3226 {
3227 .procname = "gc_thresh",
3228 .data = &ipv4_dst_ops.gc_thresh,
3229 .maxlen = sizeof(int),
3230 .mode = 0644,
3231 .proc_handler = proc_dointvec,
3232 },
3233 {
3234 .procname = "max_size",
3235 .data = &ip_rt_max_size,
3236 .maxlen = sizeof(int),
3237 .mode = 0644,
3238 .proc_handler = proc_dointvec,
3239 },
3240 {
3241 /* Deprecated. Use gc_min_interval_ms */
3242
3243 .procname = "gc_min_interval",
3244 .data = &ip_rt_gc_min_interval,
3245 .maxlen = sizeof(int),
3246 .mode = 0644,
3247 .proc_handler = proc_dointvec_jiffies,
3248 },
3249 {
3250 .procname = "gc_min_interval_ms",
3251 .data = &ip_rt_gc_min_interval,
3252 .maxlen = sizeof(int),
3253 .mode = 0644,
3254 .proc_handler = proc_dointvec_ms_jiffies,
3255 },
3256 {
3257 .procname = "gc_timeout",
3258 .data = &ip_rt_gc_timeout,
3259 .maxlen = sizeof(int),
3260 .mode = 0644,
3261 .proc_handler = proc_dointvec_jiffies,
3262 },
3263 {
3264 .procname = "gc_interval",
3265 .data = &ip_rt_gc_interval,
3266 .maxlen = sizeof(int),
3267 .mode = 0644,
3268 .proc_handler = proc_dointvec_jiffies,
3269 },
3270 {
3271 .procname = "redirect_load",
3272 .data = &ip_rt_redirect_load,
3273 .maxlen = sizeof(int),
3274 .mode = 0644,
3275 .proc_handler = proc_dointvec,
3276 },
3277 {
3278 .procname = "redirect_number",
3279 .data = &ip_rt_redirect_number,
3280 .maxlen = sizeof(int),
3281 .mode = 0644,
3282 .proc_handler = proc_dointvec,
3283 },
3284 {
3285 .procname = "redirect_silence",
3286 .data = &ip_rt_redirect_silence,
3287 .maxlen = sizeof(int),
3288 .mode = 0644,
3289 .proc_handler = proc_dointvec,
3290 },
3291 {
3292 .procname = "error_cost",
3293 .data = &ip_rt_error_cost,
3294 .maxlen = sizeof(int),
3295 .mode = 0644,
3296 .proc_handler = proc_dointvec,
3297 },
3298 {
3299 .procname = "error_burst",
3300 .data = &ip_rt_error_burst,
3301 .maxlen = sizeof(int),
3302 .mode = 0644,
3303 .proc_handler = proc_dointvec,
3304 },
3305 {
3306 .procname = "gc_elasticity",
3307 .data = &ip_rt_gc_elasticity,
3308 .maxlen = sizeof(int),
3309 .mode = 0644,
3310 .proc_handler = proc_dointvec,
3311 },
3312 {
3313 .procname = "mtu_expires",
3314 .data = &ip_rt_mtu_expires,
3315 .maxlen = sizeof(int),
3316 .mode = 0644,
3317 .proc_handler = proc_dointvec_jiffies,
3318 },
3319 {
3320 .procname = "min_pmtu",
3321 .data = &ip_rt_min_pmtu,
3322 .maxlen = sizeof(int),
3323 .mode = 0644,
3324 .proc_handler = proc_dointvec,
3325 },
3326 {
3327 .procname = "min_adv_mss",
3328 .data = &ip_rt_min_advmss,
3329 .maxlen = sizeof(int),
3330 .mode = 0644,
3331 .proc_handler = proc_dointvec,
3332 },
3333 { }
3334 };
3335
3336 static struct ctl_table empty[1];
3337
3338 static struct ctl_table ipv4_skeleton[] =
3339 {
3340 { .procname = "route",
3341 .mode = 0555, .child = ipv4_route_table},
3342 { .procname = "neigh",
3343 .mode = 0555, .child = empty},
3344 { }
3345 };
3346
3347 static __net_initdata struct ctl_path ipv4_path[] = {
3348 { .procname = "net", },
3349 { .procname = "ipv4", },
3350 { },
3351 };
3352
3353 static struct ctl_table ipv4_route_flush_table[] = {
3354 {
3355 .procname = "flush",
3356 .maxlen = sizeof(int),
3357 .mode = 0200,
3358 .proc_handler = ipv4_sysctl_rtcache_flush,
3359 },
3360 { },
3361 };
3362
3363 static __net_initdata struct ctl_path ipv4_route_path[] = {
3364 { .procname = "net", },
3365 { .procname = "ipv4", },
3366 { .procname = "route", },
3367 { },
3368 };
3369
3370 static __net_init int sysctl_route_net_init(struct net *net)
3371 {
3372 struct ctl_table *tbl;
3373
3374 tbl = ipv4_route_flush_table;
3375 if (!net_eq(net, &init_net)) {
3376 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3377 if (tbl == NULL)
3378 goto err_dup;
3379 }
3380 tbl[0].extra1 = net;
3381
3382 net->ipv4.route_hdr =
3383 register_net_sysctl_table(net, ipv4_route_path, tbl);
3384 if (net->ipv4.route_hdr == NULL)
3385 goto err_reg;
3386 return 0;
3387
3388 err_reg:
3389 if (tbl != ipv4_route_flush_table)
3390 kfree(tbl);
3391 err_dup:
3392 return -ENOMEM;
3393 }
3394
3395 static __net_exit void sysctl_route_net_exit(struct net *net)
3396 {
3397 struct ctl_table *tbl;
3398
3399 tbl = net->ipv4.route_hdr->ctl_table_arg;
3400 unregister_net_sysctl_table(net->ipv4.route_hdr);
3401 BUG_ON(tbl == ipv4_route_flush_table);
3402 kfree(tbl);
3403 }
3404
3405 static __net_initdata struct pernet_operations sysctl_route_ops = {
3406 .init = sysctl_route_net_init,
3407 .exit = sysctl_route_net_exit,
3408 };
3409 #endif
3410
3411 static __net_init int rt_genid_init(struct net *net)
3412 {
3413 get_random_bytes(&net->ipv4.rt_genid,
3414 sizeof(net->ipv4.rt_genid));
3415 get_random_bytes(&net->ipv4.dev_addr_genid,
3416 sizeof(net->ipv4.dev_addr_genid));
3417 return 0;
3418 }
3419
3420 static __net_initdata struct pernet_operations rt_genid_ops = {
3421 .init = rt_genid_init,
3422 };
3423
3424
3425 #ifdef CONFIG_IP_ROUTE_CLASSID
3426 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3427 #endif /* CONFIG_IP_ROUTE_CLASSID */
3428
3429 static __initdata unsigned long rhash_entries;
3430 static int __init set_rhash_entries(char *str)
3431 {
3432 if (!str)
3433 return 0;
3434 rhash_entries = simple_strtoul(str, &str, 0);
3435 return 1;
3436 }
3437 __setup("rhash_entries=", set_rhash_entries);
3438
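/*
 * Boot-time initialization of the IPv4 routing layer: allocate the dst
 * slab caches and the route hash table (sized from the "rhash_entries="
 * boot parameter or from available memory), start the periodic cache
 * expiry work, and register the proc files, sysctls and the
 * RTM_GETROUTE netlink handler.
 */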
3439 int __init ip_rt_init(void)
3440 {
3441 int rc = 0;
3442
3443 #ifdef CONFIG_IP_ROUTE_CLASSID
3444 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3445 if (!ip_rt_acct)
3446 panic("IP: failed to allocate ip_rt_acct\n");
3447 #endif
3448
3449 ipv4_dst_ops.kmem_cachep =
3450 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3451 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3452
3453 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3454
3455 if (dst_entries_init(&ipv4_dst_ops) < 0)
3456 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3457
3458 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3459 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3460
3461 rt_hash_table = (struct rt_hash_bucket *)
3462 alloc_large_system_hash("IP route cache",
3463 sizeof(struct rt_hash_bucket),
3464 rhash_entries,
3465 (totalram_pages >= 128 * 1024) ?
3466 15 : 17,
3467 0,
3468 &rt_hash_log,
3469 &rt_hash_mask,
3470 rhash_entries ? 0 : 512 * 1024);
3471 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3472 rt_hash_lock_init();
3473
3474 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3475 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3476
3477 devinet_init();
3478 ip_fib_init();
3479
3480 INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3481 expires_ljiffies = jiffies;
3482 schedule_delayed_work(&expires_work,
3483 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3484
3485 if (ip_rt_proc_init())
3486 pr_err("Unable to create route proc files\n");
3487 #ifdef CONFIG_XFRM
3488 xfrm_init();
3489 xfrm4_init(ip_rt_max_size);
3490 #endif
3491 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3492
3493 #ifdef CONFIG_SYSCTL
3494 register_pernet_subsys(&sysctl_route_ops);
3495 #endif
3496 register_pernet_subsys(&rt_genid_ops);
3497 return rc;
3498 }
3499
3500 #ifdef CONFIG_SYSCTL
3501 /*
3502 * We really need to sanitize the damn ipv4 init order, then all
3503 * this nonsense will go away.
3504 */
3505 void __init ip_static_sysctl_init(void)
3506 {
3507 register_sysctl_paths(ipv4_path, ipv4_skeleton);
3508 }
3509 #endif