fib_trie: Optimize fib_find_node
[deliverable/linux.git] / net / ipv4 / fib_trie.c
1 /*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
6 *
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
9 *
10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
11 * Agricultural Sciences.
12 *
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 *
15 * This work is based on the LPC-trie which is originally described in:
16 *
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
25 *
26 * Code from fib_hash has been reused which includes the following header:
27 *
28 *
29 * INET An implementation of the TCP/IP protocol suite for the LINUX
30 * operating system. INET is implemented using the BSD Socket
31 * interface as the means of communication with the user level.
32 *
33 * IPv4 FIB: lookup engine and maintenance routines.
34 *
35 *
36 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
37 *
38 * This program is free software; you can redistribute it and/or
39 * modify it under the terms of the GNU General Public License
40 * as published by the Free Software Foundation; either version
41 * 2 of the License, or (at your option) any later version.
42 *
43 * Substantial contributions to this work comes from:
44 *
45 * David S. Miller, <davem@davemloft.net>
46 * Stephen Hemminger <shemminger@osdl.org>
47 * Paul E. McKenney <paulmck@us.ibm.com>
48 * Patrick McHardy <kaber@trash.net>
49 */
50
51 #define VERSION "0.409"
52
53 #include <asm/uaccess.h>
54 #include <linux/bitops.h>
55 #include <linux/types.h>
56 #include <linux/kernel.h>
57 #include <linux/mm.h>
58 #include <linux/string.h>
59 #include <linux/socket.h>
60 #include <linux/sockios.h>
61 #include <linux/errno.h>
62 #include <linux/in.h>
63 #include <linux/inet.h>
64 #include <linux/inetdevice.h>
65 #include <linux/netdevice.h>
66 #include <linux/if_arp.h>
67 #include <linux/proc_fs.h>
68 #include <linux/rcupdate.h>
69 #include <linux/skbuff.h>
70 #include <linux/netlink.h>
71 #include <linux/init.h>
72 #include <linux/list.h>
73 #include <linux/slab.h>
74 #include <linux/export.h>
75 #include <net/net_namespace.h>
76 #include <net/ip.h>
77 #include <net/protocol.h>
78 #include <net/route.h>
79 #include <net/tcp.h>
80 #include <net/sock.h>
81 #include <net/ip_fib.h>
82 #include "fib_lookup.h"
83
/* Maximum trie depth tracked by the /proc statistics code */
#define MAX_STAT_DEPTH 32

/* Number of bits in a key: 32 for an IPv4 address */
#define KEYLENGTH (8*sizeof(t_key))

typedef unsigned int t_key;

/* A node with bits == 0 is a leaf; anything else is an internal tnode */
#define IS_TNODE(n) ((n)->bits)
#define IS_LEAF(n) (!(n)->bits)

/* Right shift that brings node _kv's child-index bits to the bottom */
#define get_shift(_kv) (KEYLENGTH - (_kv)->pos - (_kv)->bits)
/* Child index of _key within node _kv; any set bits above 'bits' after
 * the shift indicate a prefix mismatch in the skipped bits
 * (see fib_find_node for how this double duty is exploited)
 */
#define get_index(_key, _kv) (((_key) ^ (_kv)->key) >> get_shift(_kv))
95
/* One trie node.  The same structure serves both internal nodes
 * (bits > 0) and leaves (bits == 0); the trailing union holds either
 * the child pointer array or the leaf's per-prefix info list.
 */
struct tnode {
	t_key key;			/* significant prefix bits of the key */
	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
	struct tnode __rcu *parent;
	struct rcu_head rcu;
	union {
		/* The fields in this struct are valid if bits > 0 (TNODE) */
		struct {
			unsigned int full_children;  /* KEYLENGTH bits needed */
			unsigned int empty_children; /* KEYLENGTH bits needed */
			struct tnode __rcu *child[0];
		};
		/* This list pointer is valid if bits == 0 (LEAF) */
		struct hlist_head list;
	};
};
113
/* Per-prefix-length data hung off a leaf's list; one entry for each
 * distinct prefix length that shares the leaf's key.
 */
struct leaf_info {
	struct hlist_node hlist;
	int plen;
	u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
	struct list_head falh;	/* fib_alias entries for this prefix */
	struct rcu_head rcu;
};
121
#ifdef CONFIG_IP_FIB_TRIE_STATS
/* Per-cpu lookup counters, accumulated on the read side */
struct trie_use_stats {
	unsigned int gets;			/* total lookups */
	unsigned int backtrack;			/* lookup had to back up */
	unsigned int semantic_match_passed;
	unsigned int semantic_match_miss;
	unsigned int null_node_hit;		/* empty child slot hit */
	unsigned int resize_node_skipped;	/* resize aborted on ENOMEM */
};
#endif

/* Snapshot of the trie's shape, gathered when dumping via /proc */
struct trie_stat {
	unsigned int totdepth;
	unsigned int maxdepth;
	unsigned int tnodes;
	unsigned int leaves;
	unsigned int nullpointers;
	unsigned int prefixes;
	unsigned int nodesizes[MAX_STAT_DEPTH];
};
142
/* One routing table; 'trie' is the root node (NULL when empty) */
struct trie {
	struct tnode __rcu *trie;
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie_use_stats __percpu *stats;
#endif
};
149
static void tnode_put_child_reorg(struct tnode *tn, int i, struct tnode *n,
				  int wasfull);
static struct tnode *resize(struct trie *t, struct tnode *tn);
static struct tnode *inflate(struct trie *t, struct tnode *tn);
static struct tnode *halve(struct trie *t, struct tnode *tn);
/* tnodes to free after resize(); protected by RTNL */
static struct callback_head *tnode_free_head;
static size_t tnode_free_size;

/*
 * synchronize_rcu after call_rcu for that many pages; it should be especially
 * useful before resizing the root node with PREEMPT_NONE configs; the value was
 * obtained experimentally, aiming to avoid visible slowdown.
 */
static const int sync_pages = 128;
165
/* slab caches for fib_alias entries and leaf tnodes */
static struct kmem_cache *fn_alias_kmem __read_mostly;
static struct kmem_cache *trie_leaf_kmem __read_mostly;

/* caller must hold RTNL */
#define node_parent(n) rtnl_dereference((n)->parent)

/* caller must hold RCU read lock or RTNL */
#define node_parent_rcu(n) rcu_dereference_rtnl((n)->parent)

/* wrapper for rcu_assign_pointer; tolerates n == NULL so callers can
 * reparent a possibly-absent child without checking first
 */
static inline void node_set_parent(struct tnode *n, struct tnode *tp)
{
	if (n)
		rcu_assign_pointer(n->parent, tp);
}

#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER((n)->parent, p)
183
/* This provides us with the number of children in this node, in the case of a
 * leaf this will return 0 meaning none of the children are accessible.
 * For a leaf bits == 0, so (1ul << 0) & ~1ul == 0; for a tnode
 * (bits >= 1) the mask leaves 1ul << bits unchanged.
 */
static inline int tnode_child_length(const struct tnode *tn)
{
	return (1ul << tn->bits) & ~(1ul);
}
191
/*
 * Fetch child i of tn; caller must hold RTNL
 */
static inline struct tnode *tnode_get_child(const struct tnode *tn, unsigned int i)
{
	BUG_ON(i >= tnode_child_length(tn));

	return rtnl_dereference(tn->child[i]);
}
201
/*
 * Fetch child i of tn; caller must hold RCU read lock or RTNL
 */
static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
{
	BUG_ON(i >= tnode_child_length(tn));

	return rcu_dereference_rtnl(tn->child[i]);
}
211
212 static inline t_key mask_pfx(t_key k, unsigned int l)
213 {
214 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
215 }
216
217 static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
218 {
219 if (offset < KEYLENGTH)
220 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
221 else
222 return 0;
223 }
224
225 static inline int tkey_equals(t_key a, t_key b)
226 {
227 return a == b;
228 }
229
230 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
231 {
232 if (bits == 0 || offset >= KEYLENGTH)
233 return 1;
234 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
235 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
236 }
237
238 static inline int tkey_mismatch(t_key a, int offset, t_key b)
239 {
240 t_key diff = a ^ b;
241 int i = offset;
242
243 if (!diff)
244 return 0;
245 while ((diff << i) >> (KEYLENGTH-1) == 0)
246 i++;
247 return i;
248 }
249
250 /*
251 To understand this stuff, an understanding of keys and all their bits is
252 necessary. Every node in the trie has a key associated with it, but not
253 all of the bits in that key are significant.
254
255 Consider a node 'n' and its parent 'tp'.
256
257 If n is a leaf, every bit in its key is significant. Its presence is
258 necessitated by path compression, since during a tree traversal (when
259 searching for a leaf - unless we are doing an insertion) we will completely
260 ignore all skipped bits we encounter. Thus we need to verify, at the end of
261 a potentially successful search, that we have indeed been walking the
262 correct key path.
263
264 Note that we can never "miss" the correct key in the tree if present by
265 following the wrong path. Path compression ensures that segments of the key
266 that are the same for all keys with a given prefix are skipped, but the
267 skipped part *is* identical for each node in the subtrie below the skipped
268 bit! trie_insert() in this implementation takes care of that - note the
269 call to tkey_sub_equals() in trie_insert().
270
271 if n is an internal node - a 'tnode' here, the various parts of its key
272 have many different meanings.
273
274 Example:
275 _________________________________________________________________
276 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
277 -----------------------------------------------------------------
278 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
279
280 _________________________________________________________________
281 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
282 -----------------------------------------------------------------
283 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
284
285 tp->pos = 7
286 tp->bits = 3
287 n->pos = 15
288 n->bits = 4
289
290 First, let's just ignore the bits that come before the parent tp, that is
291 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
292 not use them for anything.
293
294 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
295 index into the parent's child array. That is, they will be used to find
296 'n' among tp's children.
297
298 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
299 for the node n.
300
301 All the bits we have seen so far are significant to the node n. The rest
302 of the bits are really not needed or indeed known in n->key.
303
304 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
305 n's child array, and will of course be different for each child.
306
307
308 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
309 at this point.
310
311 */
312
/* resize() thresholds, in percent of the (would-be) child array that
 * must be non-empty; the root gets laxer values so it stays large.
 */
static const int halve_threshold = 25;
static const int inflate_threshold = 50;
static const int halve_threshold_root = 15;
static const int inflate_threshold_root = 30;
317
/* RCU callback: actually release a fib_alias after the grace period */
static void __alias_free_mem(struct rcu_head *head)
{
	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
	kmem_cache_free(fn_alias_kmem, fa);
}

/* Defer freeing of a fib_alias until concurrent readers are done */
static inline void alias_free_mem_rcu(struct fib_alias *fa)
{
	call_rcu(&fa->rcu, __alias_free_mem);
}
328
/* Largest tnode 'bits' for which the whole node still fits in a page
 * (and therefore came from kmalloc rather than vmalloc; see tnode_alloc)
 */
#define TNODE_KMALLOC_MAX \
	ilog2((PAGE_SIZE - sizeof(struct tnode)) / sizeof(struct tnode *))

/* RCU callback: release a node via the allocator that produced it */
static void __node_free_rcu(struct rcu_head *head)
{
	struct tnode *n = container_of(head, struct tnode, rcu);

	if (IS_LEAF(n))
		kmem_cache_free(trie_leaf_kmem, n);
	else if (n->bits <= TNODE_KMALLOC_MAX)
		kfree(n);
	else
		vfree(n);
}

#define node_free(n) call_rcu(&n->rcu, __node_free_rcu)
345
/* Defer freeing of a leaf_info until after an RCU grace period */
static inline void free_leaf_info(struct leaf_info *leaf)
{
	kfree_rcu(leaf, rcu);
}
350
351 static struct tnode *tnode_alloc(size_t size)
352 {
353 if (size <= PAGE_SIZE)
354 return kzalloc(size, GFP_KERNEL);
355 else
356 return vzalloc(size);
357 }
358
/* Queue a tnode for freeing once the current resize pass finishes;
 * the embedded rcu_head doubles as the pending-list linkage, which is
 * safe because the list is only touched under RTNL.
 */
static void tnode_free_safe(struct tnode *tn)
{
	BUG_ON(IS_LEAF(tn));
	tn->rcu.next = tnode_free_head;
	tnode_free_head = &tn->rcu;
}

/* Hand every queued tnode to RCU; after sync_pages worth of memory has
 * been queued, force a grace period so freeing cannot outrun the
 * callbacks (important with PREEMPT_NONE, see sync_pages).
 */
static void tnode_free_flush(void)
{
	struct callback_head *head;

	while ((head = tnode_free_head)) {
		struct tnode *tn = container_of(head, struct tnode, rcu);

		tnode_free_head = head->next;
		tnode_free_size += offsetof(struct tnode, child[1 << tn->bits]);

		node_free(tn);
	}

	if (tnode_free_size >= PAGE_SIZE * sync_pages) {
		tnode_free_size = 0;
		synchronize_rcu();
	}
}
384
/* Allocate and initialise a new leaf for 'key'; returns NULL on OOM */
static struct tnode *leaf_new(t_key key)
{
	struct tnode *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
	if (l) {
		l->parent = NULL;
		/* set key and pos to reflect full key value
		 * any trailing zeros in the key should be ignored
		 * as the nodes are searched
		 */
		l->key = key;
		l->pos = KEYLENGTH;
		/* set bits to 0 indicating we are not a tnode */
		l->bits = 0;

		INIT_HLIST_HEAD(&l->list);
	}
	return l;
}
403
/* Allocate a leaf_info for prefix length 'plen' with an empty alias
 * list; returns NULL on OOM.
 */
static struct leaf_info *leaf_info_new(int plen)
{
	struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
	if (li) {
		li->plen = plen;
		li->mask_plen = ntohl(inet_make_mask(plen));
		INIT_LIST_HEAD(&li->falh);
	}
	return li;
}
414
/* Allocate an internal node whose key prefix covers 'pos' bits and
 * whose child array is indexed by the following 'bits' bits; all child
 * slots start empty.  Returns NULL on OOM.
 */
static struct tnode *tnode_new(t_key key, int pos, int bits)
{
	size_t sz = offsetof(struct tnode, child[1 << bits]);
	struct tnode *tn = tnode_alloc(sz);
	unsigned int shift = pos + bits;

	/* verify bits and pos their msb bits clear and values are valid */
	BUG_ON(!bits || (shift > KEYLENGTH));

	if (tn) {
		tn->parent = NULL;
		tn->pos = pos;
		tn->bits = bits;
		/* only the top 'pos' bits of the key are significant here */
		tn->key = mask_pfx(key, pos);
		tn->full_children = 0;
		tn->empty_children = 1<<bits;
	}

	pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
		 sizeof(struct tnode *) << bits);
	return tn;
}
437
438 /*
439 * Check whether a tnode 'n' is "full", i.e. it is an internal node
440 * and no bits are skipped. See discussion in dyntree paper p. 6
441 */
442
443 static inline int tnode_full(const struct tnode *tn, const struct tnode *n)
444 {
445 return n && IS_TNODE(n) && (n->pos == (tn->pos + tn->bits));
446 }
447
/* Set child i of tn, letting tnode_put_child_reorg recompute the old
 * slot's fullness itself (wasfull == -1).
 */
static inline void put_child(struct tnode *tn, int i,
		     struct tnode *n)
{
	tnode_put_child_reorg(tn, i, n, -1);
}
453
/*
 * Add a child at position i overwriting the old value.
 * Update the value of full_children and empty_children.
 * wasfull == -1 means "sample the old child's fullness here";
 * otherwise the caller sampled it earlier (trie_rebalance does so
 * before resize() may have replaced the child).
 */
static void tnode_put_child_reorg(struct tnode *tn, int i, struct tnode *n,
				  int wasfull)
{
	struct tnode *chi = rtnl_dereference(tn->child[i]);
	int isfull;

	BUG_ON(i >= 1<<tn->bits);

	/* update emptyChildren */
	if (n == NULL && chi != NULL)
		tn->empty_children++;
	else if (n != NULL && chi == NULL)
		tn->empty_children--;

	/* update fullChildren */
	if (wasfull == -1)
		wasfull = tnode_full(tn, chi);

	isfull = tnode_full(tn, n);
	if (wasfull && !isfull)
		tn->full_children--;
	else if (!wasfull && isfull)
		tn->full_children++;

	/* set the parent before publishing the child pointer so readers
	 * never see a reachable node with a stale parent
	 */
	node_set_parent(n, tn);

	rcu_assign_pointer(tn->child[i], n);
}
487
/* Upper bound on inflate()/halve() passes per resize() call */
#define MAX_WORK 10

/* Rebalance a single tnode: inflate() while the doubled node would
 * stay dense enough, else halve() while it is too sparse, each bounded
 * by MAX_WORK passes.  A node left with one child (or none) is
 * collapsed away.  Returns the replacement node; an allocation failure
 * simply leaves the node as it was.
 */
static struct tnode *resize(struct trie *t, struct tnode *tn)
{
	struct tnode *old_tn, *n = NULL;
	int inflate_threshold_use;
	int halve_threshold_use;
	int max_work;

	if (!tn)
		return NULL;

	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
		 tn, inflate_threshold, halve_threshold);

	/* No children */
	if (tn->empty_children > (tnode_child_length(tn) - 1))
		goto no_children;

	/* One child */
	if (tn->empty_children == (tnode_child_length(tn) - 1))
		goto one_child;
	/*
	 * Double as long as the resulting node has a number of
	 * nonempty nodes that are above the threshold.
	 *
	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson
	 * of the Helsinki University of Technology and Matti Tikkanen of
	 * Nokia Telecommunications, page 6:
	 * "A node is doubled if the ratio of non-empty children to all
	 * children in the *doubled* node is at least 'high'."
	 *
	 * 'high' is the percentage 'inflate_threshold'.  The doubled node
	 * has 2 * tnode_child_length(tn) slots; the non-empty slots are
	 * the current non-empty children plus one extra slot for each
	 * "full" child, since only full children are split in two by
	 * inflate().  So the condition
	 *
	 *   100 * (nonempty + full) / (2 * tnode_child_length(tn))
	 *						>= inflate_threshold
	 *
	 * becomes, after clearing the division and halving the constant:
	 *
	 *   50 * (tn->full_children + tnode_child_length(tn)
	 *	   - tn->empty_children)
	 *				>= inflate_threshold *
	 *				   tnode_child_length(tn)
	 */

	/* Keep root node larger */

	if (!node_parent(tn)) {
		inflate_threshold_use = inflate_threshold_root;
		halve_threshold_use = halve_threshold_root;
	} else {
		inflate_threshold_use = inflate_threshold;
		halve_threshold_use = halve_threshold;
	}

	max_work = MAX_WORK;
	while ((tn->full_children > 0 &&  max_work-- &&
		50 * (tn->full_children + tnode_child_length(tn)
		      - tn->empty_children)
		>= inflate_threshold_use * tnode_child_length(tn))) {

		old_tn = tn;
		tn = inflate(t, tn);

		if (IS_ERR(tn)) {
			/* keep the old node; just skip this resize */
			tn = old_tn;
#ifdef CONFIG_IP_FIB_TRIE_STATS
			this_cpu_inc(t->stats->resize_node_skipped);
#endif
			break;
		}
	}

	/* Return if at least one inflate is run */
	if (max_work != MAX_WORK)
		return tn;

	/*
	 * Halve as long as the number of empty children in this
	 * node is above threshold.
	 */

	max_work = MAX_WORK;
	while (tn->bits > 1 &&  max_work-- &&
	       100 * (tnode_child_length(tn) - tn->empty_children) <
	       halve_threshold_use * tnode_child_length(tn)) {

		old_tn = tn;
		tn = halve(t, tn);
		if (IS_ERR(tn)) {
			/* keep the old node; just skip this resize */
			tn = old_tn;
#ifdef CONFIG_IP_FIB_TRIE_STATS
			this_cpu_inc(t->stats->resize_node_skipped);
#endif
			break;
		}
	}


	/* Only one child remains */
	if (tn->empty_children == (tnode_child_length(tn) - 1)) {
		unsigned long i;
one_child:
		/* find the single remaining child (n stays NULL if none) */
		for (i = tnode_child_length(tn); !n && i;)
			n = tnode_get_child(tn, --i);
no_children:
		/* compress one level */
		node_set_parent(n, NULL);
		tnode_free_safe(tn);
		return n;
	}
	return tn;
}
641
642
/* Free a half-built tnode together with any children already attached;
 * only used on the error paths of inflate()/halve(), before the new
 * node was ever published to readers.
 */
static void tnode_clean_free(struct tnode *tn)
{
	struct tnode *tofree;
	int i;

	for (i = 0; i < tnode_child_length(tn); i++) {
		tofree = rtnl_dereference(tn->child[i]);
		if (tofree)
			node_free(tofree);
	}
	node_free(tn);
}
655
/* Replace oldtnode with a node using one more index bit (double the
 * children).  Full children with bits > 1 are themselves split into a
 * left/right pair one bit further down; leaves and skip-bit children
 * drop straight into their new slot.  Returns the new (unpublished)
 * node, or ERR_PTR(-ENOMEM) leaving oldtnode untouched.
 */
static struct tnode *inflate(struct trie *t, struct tnode *oldtnode)
{
	int olen = tnode_child_length(oldtnode);
	struct tnode *tn;
	int i;

	pr_debug("In inflate\n");

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);

	if (!tn)
		return ERR_PTR(-ENOMEM);

	/*
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and inflate
	 * of tnode is ignored.
	 */

	for (i = 0; i < olen; i++) {
		struct tnode *inode;

		inode = tnode_get_child(oldtnode, i);
		if (tnode_full(oldtnode, inode) && inode->bits > 1) {
			struct tnode *left, *right;
			/* single bit set at position inode->pos */
			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;

			left = tnode_new(inode->key&(~m), inode->pos + 1,
					 inode->bits - 1);
			if (!left)
				goto nomem;

			right = tnode_new(inode->key|m, inode->pos + 1,
					  inode->bits - 1);

			if (!right) {
				node_free(left);
				goto nomem;
			}

			put_child(tn, 2*i, left);
			put_child(tn, 2*i+1, right);
		}
	}

	for (i = 0; i < olen; i++) {
		struct tnode *inode = tnode_get_child(oldtnode, i);
		struct tnode *left, *right;
		int size, j;

		/* An empty child */
		if (inode == NULL)
			continue;

		/* A leaf or an internal node with skipped bits */
		if (!tnode_full(oldtnode, inode)) {
			put_child(tn,
				tkey_extract_bits(inode->key, tn->pos, tn->bits),
				inode);
			continue;
		}

		/* An internal node with two children */
		if (inode->bits == 1) {
			put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
			put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));

			tnode_free_safe(inode);
			continue;
		}

		/* An internal node with more than two children */

		/* We will replace this node 'inode' with two new
		 * ones, 'left' and 'right', each with half of the
		 * original children. The two new nodes will have
		 * a position one bit further down the key and this
		 * means that the "significant" part of their keys
		 * (see the discussion near the top of this file)
		 * will differ by one bit, which will be "0" in
		 * left's key and "1" in right's key. Since we are
		 * moving the key position by one step, the bit that
		 * we are moving away from - the bit at position
		 * (inode->pos) - is the one that will differ between
		 * left and right. So... we synthesize that bit in the
		 * two new keys.  The left/right pair was preallocated
		 * in the first pass above.
		 */

		left = tnode_get_child(tn, 2*i);
		put_child(tn, 2*i, NULL);

		BUG_ON(!left);

		right = tnode_get_child(tn, 2*i+1);
		put_child(tn, 2*i+1, NULL);

		BUG_ON(!right);

		/* first half of inode's children go left, second half right */
		size = tnode_child_length(left);
		for (j = 0; j < size; j++) {
			put_child(left, j, rtnl_dereference(inode->child[j]));
			put_child(right, j, rtnl_dereference(inode->child[j + size]));
		}
		put_child(tn, 2*i, resize(t, left));
		put_child(tn, 2*i+1, resize(t, right));

		tnode_free_safe(inode);
	}
	tnode_free_safe(oldtnode);
	return tn;
nomem:
	tnode_clean_free(tn);
	return ERR_PTR(-ENOMEM);
}
777
/* Replace oldtnode with a node using one fewer index bit (half the
 * children).  Sibling pairs that are both present are joined under a
 * new one-bit tnode; lone children drop straight into their new slot.
 * Returns the new (unpublished) node, or ERR_PTR(-ENOMEM) leaving
 * oldtnode untouched.
 */
static struct tnode *halve(struct trie *t, struct tnode *oldtnode)
{
	int olen = tnode_child_length(oldtnode);
	struct tnode *tn, *left, *right;
	int i;

	pr_debug("In halve\n");

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);

	if (!tn)
		return ERR_PTR(-ENOMEM);

	/*
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and halve
	 * of tnode is ignored.
	 */

	for (i = 0; i < olen; i += 2) {
		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);

		/* Two nonempty children */
		if (left && right) {
			struct tnode *newn;

			newn = tnode_new(left->key, tn->pos + tn->bits, 1);

			if (!newn)
				goto nomem;

			put_child(tn, i/2, newn);
		}

	}

	for (i = 0; i < olen; i += 2) {
		struct tnode *newBinNode;

		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);

		/* At least one of the children is empty */
		if (left == NULL) {
			if (right == NULL)    /* Both are empty */
				continue;
			put_child(tn, i/2, right);
			continue;
		}

		if (right == NULL) {
			put_child(tn, i/2, left);
			continue;
		}

		/* Two nonempty children: hang both under the binary
		 * tnode preallocated in the first pass
		 */
		newBinNode = tnode_get_child(tn, i/2);
		put_child(tn, i/2, NULL);
		put_child(newBinNode, 0, left);
		put_child(newBinNode, 1, right);
		put_child(tn, i/2, resize(t, newBinNode));
	}
	tnode_free_safe(oldtnode);
	return tn;
nomem:
	tnode_clean_free(tn);
	return ERR_PTR(-ENOMEM);
}
848
849 /* readside must use rcu_read_lock currently dump routines
850 via get_fa_head and dump */
851
852 static struct leaf_info *find_leaf_info(struct tnode *l, int plen)
853 {
854 struct hlist_head *head = &l->list;
855 struct leaf_info *li;
856
857 hlist_for_each_entry_rcu(li, head, hlist)
858 if (li->plen == plen)
859 return li;
860
861 return NULL;
862 }
863
864 static inline struct list_head *get_fa_head(struct tnode *l, int plen)
865 {
866 struct leaf_info *li = find_leaf_info(l, plen);
867
868 if (!li)
869 return NULL;
870
871 return &li->falh;
872 }
873
/* Insert 'new' into a leaf's info list, kept sorted by descending
 * prefix length so the longest prefix is encountered first.
 */
static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
{
	struct leaf_info *li = NULL, *last = NULL;

	if (hlist_empty(head)) {
		hlist_add_head_rcu(&new->hlist, head);
	} else {
		hlist_for_each_entry(li, head, hlist) {
			if (new->plen > li->plen)
				break;

			last = li;
		}
		/* last == NULL: 'new' has the largest plen, goes first */
		if (last)
			hlist_add_behind_rcu(&new->hlist, &last->hlist);
		else
			hlist_add_before_rcu(&new->hlist, &li->hlist);
	}
}
893
/* rcu_read_lock needs to be hold by caller from readside */
/* Walk the trie looking for the leaf whose key equals 'key' exactly;
 * returns that leaf or NULL.
 */
static struct tnode *fib_find_node(struct trie *t, u32 key)
{
	struct tnode *n = rcu_dereference_rtnl(t->trie);

	while (n) {
		unsigned long index = get_index(key, n);

		/* This bit of code is a bit tricky but it combines multiple
		 * checks into a single check.  The prefix consists of the
		 * prefix plus zeros for the bits in the cindex. The index
		 * is the difference between the key and this value.  From
		 * this we can actually derive several pieces of data.
		 *   if !(index >> bits)
		 *     we know the value is cindex
		 *   else
		 *     we have a mismatch in skip bits and failed
		 */
		if (index >> n->bits)
			return NULL;

		/* we have found a leaf. Prefixes have already been compared */
		if (IS_LEAF(n))
			break;

		n = rcu_dereference_rtnl(n->child[index]);
	}

	return n;
}
924
/* Walk from tn back up to the root, resize()-ing each node and
 * re-attaching the result to its parent; the root pointer itself is
 * republished with rcu_assign_pointer.  Queued tnodes are flushed to
 * RCU after every step.
 */
static void trie_rebalance(struct trie *t, struct tnode *tn)
{
	int wasfull;
	t_key cindex, key;
	struct tnode *tp;

	key = tn->key;

	while (tn != NULL && (tp = node_parent(tn)) != NULL) {
		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
		/* sample fullness before resize() may replace the child */
		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
		tn = resize(t, tn);

		tnode_put_child_reorg(tp, cindex, tn, wasfull);

		tp = node_parent(tn);
		if (!tp)
			rcu_assign_pointer(t->trie, tn);

		tnode_free_flush();
		if (!tp)
			break;
		tn = tp;
	}

	/* Handle last (top) tnode */
	if (IS_TNODE(tn))
		tn = resize(t, tn);

	rcu_assign_pointer(t->trie, tn);
	tnode_free_flush();
}
957
/* only used from updater-side */

/* Find or create the leaf for 'key', attach a fresh leaf_info for
 * 'plen' and return its alias list head (NULL on allocation failure).
 * May insert an intermediate tnode; the trie is rebalanced before
 * returning unless the key matched an existing leaf exactly.
 */
static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
{
	int pos, newpos;
	struct tnode *tp = NULL, *tn = NULL;
	struct tnode *n;
	struct tnode *l;
	int missbit;
	struct list_head *fa_head = NULL;
	struct leaf_info *li;
	t_key cindex;

	pos = 0;
	n = rtnl_dereference(t->trie);

	/* If we point to NULL, stop. Either the tree is empty and we should
	 * just put a new leaf in if, or we have reached an empty child slot,
	 * and we should just put our new leaf in that.
	 * If we point to a T_TNODE, check if it matches our key. Note that
	 * a T_TNODE might be skipping any number of bits - its 'pos' need
	 * not be the parent's 'pos'+'bits'!
	 *
	 * If it does match the current key, get pos/bits from it, extract
	 * the index from our key, push the T_TNODE and walk the tree.
	 *
	 * If it doesn't, we have to replace it with a new T_TNODE.
	 *
	 * If we point to a T_LEAF, it might or might not have the same key
	 * as we do. If it does, just change the value, update the T_LEAF's
	 * value, and return it.
	 * If it doesn't, we need to replace it with a T_TNODE.
	 */

	while (n && IS_TNODE(n)) {
		if (tkey_sub_equals(n->key, pos, n->pos-pos, key)) {
			tp = n;
			pos = n->pos + n->bits;
			n = tnode_get_child(n,
					    tkey_extract_bits(key,
							      n->pos,
							      n->bits));

			BUG_ON(n && node_parent(n) != tp);
		} else
			break;
	}

	/*
	 * n  ----> NULL, LEAF or TNODE
	 *
	 * tp is n's (parent) ----> NULL or TNODE
	 */

	BUG_ON(tp && IS_LEAF(tp));

	/* Case 1: n is a leaf. Compare prefixes */

	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
		li = leaf_info_new(plen);

		if (!li)
			return NULL;

		fa_head = &li->falh;
		insert_leaf_info(&n->list, li);
		goto done;
	}
	l = leaf_new(key);

	if (!l)
		return NULL;

	li = leaf_info_new(plen);

	if (!li) {
		node_free(l);
		return NULL;
	}

	fa_head = &li->falh;
	insert_leaf_info(&l->list, li);

	if (t->trie && n == NULL) {
		/* Case 2: n is NULL, and will just insert a new leaf */

		node_set_parent(l, tp);

		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
		put_child(tp, cindex, l);
	} else {
		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
		/*
		 *  Add a new tnode here
		 *  first tnode need some special handling
		 */

		if (n) {
			/* branch at the first bit where key and n->key differ */
			pos = tp ? tp->pos+tp->bits : 0;
			newpos = tkey_mismatch(key, pos, n->key);
			tn = tnode_new(n->key, newpos, 1);
		} else {
			newpos = 0;
			tn = tnode_new(key, newpos, 1); /* First tnode */
		}

		if (!tn) {
			free_leaf_info(li);
			node_free(l);
			return NULL;
		}

		node_set_parent(tn, tp);

		missbit = tkey_extract_bits(key, newpos, 1);
		put_child(tn, missbit, l);
		put_child(tn, 1-missbit, n);

		if (tp) {
			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
			put_child(tp, cindex, tn);
		} else {
			rcu_assign_pointer(t->trie, tn);
		}

		tp = tn;
	}

	if (tp && tp->pos + tp->bits > 32)
		pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
			tp, tp->pos, tp->bits, key, plen);

	/* Rebalance the trie */

	trie_rebalance(t, tp);
done:
	return fa_head;
}
1096
/*
 * Insert/replace/append a route in table @tb from netlink config @cfg.
 *
 * Caller must hold RTNL.  Returns 0 on success or a negative errno
 * (-EINVAL for bad prefix, -EEXIST with NLM_F_EXCL, -ENOENT without
 * NLM_F_CREATE, -ENOBUFS/-ENOMEM on allocation failure).
 */
int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
{
	struct trie *t = (struct trie *) tb->tb_data;
	struct fib_alias *fa, *new_fa;
	struct list_head *fa_head = NULL;
	struct fib_info *fi;
	int plen = cfg->fc_dst_len;
	u8 tos = cfg->fc_tos;
	u32 key, mask;
	int err;
	struct tnode *l;

	/* IPv4 prefixes are at most 32 bits. */
	if (plen > 32)
		return -EINVAL;

	key = ntohl(cfg->fc_dst);

	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);

	mask = ntohl(inet_make_mask(plen));

	/* Reject a destination with host bits set outside the prefix. */
	if (key & ~mask)
		return -EINVAL;

	key = key & mask;

	fi = fib_create_info(cfg);
	if (IS_ERR(fi)) {
		err = PTR_ERR(fi);
		goto err;
	}

	/* Locate existing leaf and first alias with matching tos/priority. */
	l = fib_find_node(t, key);
	fa = NULL;

	if (l) {
		fa_head = get_fa_head(l, plen);
		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
	}

	/* Now fa, if non-NULL, points to the first fib alias
	 * with the same keys [prefix,tos,priority], if such key already
	 * exists or to the node before which we will insert new one.
	 *
	 * If fa is NULL, we will need to allocate a new one and
	 * insert to the head of f.
	 *
	 * If f is NULL, no fib node matched the destination key
	 * and we need to allocate a new one of those as well.
	 */

	if (fa && fa->fa_tos == tos &&
	    fa->fa_info->fib_priority == fi->fib_priority) {
		struct fib_alias *fa_first, *fa_match;

		err = -EEXIST;
		if (cfg->fc_nlflags & NLM_F_EXCL)
			goto out;

		/* We have 2 goals:
		 * 1. Find exact match for type, scope, fib_info to avoid
		 * duplicate routes
		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
		 */
		fa_match = NULL;
		fa_first = fa;
		/* Step back one so the _continue loop starts at fa itself. */
		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
		list_for_each_entry_continue(fa, fa_head, fa_list) {
			if (fa->fa_tos != tos)
				break;
			if (fa->fa_info->fib_priority != fi->fib_priority)
				break;
			if (fa->fa_type == cfg->fc_type &&
			    fa->fa_info == fi) {
				fa_match = fa;
				break;
			}
		}

		if (cfg->fc_nlflags & NLM_F_REPLACE) {
			struct fib_info *fi_drop;
			u8 state;

			fa = fa_first;
			if (fa_match) {
				/* Replacing with an identical entry is a no-op. */
				if (fa == fa_match)
					err = 0;
				goto out;
			}
			err = -ENOBUFS;
			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
			if (new_fa == NULL)
				goto out;

			fi_drop = fa->fa_info;
			new_fa->fa_tos = fa->fa_tos;
			new_fa->fa_info = fi;
			new_fa->fa_type = cfg->fc_type;
			state = fa->fa_state;
			new_fa->fa_state = state & ~FA_S_ACCESSED;

			/* Swap in the replacement; readers see old or new. */
			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
			alias_free_mem_rcu(fa);

			fib_release_info(fi_drop);
			if (state & FA_S_ACCESSED)
				rt_cache_flush(cfg->fc_nlinfo.nl_net);
			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);

			goto succeeded;
		}
		/* Error if we find a perfect match which
		 * uses the same scope, type, and nexthop
		 * information.
		 */
		if (fa_match)
			goto out;

		if (!(cfg->fc_nlflags & NLM_F_APPEND))
			fa = fa_first;
	}
	err = -ENOENT;
	if (!(cfg->fc_nlflags & NLM_F_CREATE))
		goto out;

	err = -ENOBUFS;
	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
	if (new_fa == NULL)
		goto out;

	new_fa->fa_info = fi;
	new_fa->fa_tos = tos;
	new_fa->fa_type = cfg->fc_type;
	new_fa->fa_state = 0;
	/*
	 * Insert new entry to the list.
	 */

	if (!fa_head) {
		/* No leaf for this key/plen yet; create it. */
		fa_head = fib_insert_node(t, key, plen);
		if (unlikely(!fa_head)) {
			err = -ENOMEM;
			goto out_free_new_fa;
		}
	}

	if (!plen)
		tb->tb_num_default++;

	list_add_tail_rcu(&new_fa->fa_list,
			  (fa ? &fa->fa_list : fa_head));

	rt_cache_flush(cfg->fc_nlinfo.nl_net);
	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
		  &cfg->fc_nlinfo, 0);
succeeded:
	return 0;

out_free_new_fa:
	kmem_cache_free(fn_alias_kmem, new_fa);
out:
	fib_release_info(fi);
err:
	return err;
}
1266
/*
 * Semantic match: test leaf @l against flow @flp and fill in @res on a hit.
 *
 * Walks every leaf_info (prefix length) hung off the leaf, then every alias
 * on that prefix, applying tos/scope/oif filters and skipping dead entries.
 *
 * Returns 0 on match (res filled, fib_clntref taken unless
 * FIB_LOOKUP_NOREF), a negative fib_props error for reject-type routes,
 * or 1 when nothing matched (caller backtracks).
 *
 * should be called with rcu_read_lock
 */
static int check_leaf(struct fib_table *tb, struct trie *t, struct tnode *l,
		      t_key key, const struct flowi4 *flp,
		      struct fib_result *res, int fib_flags)
{
	struct leaf_info *li;
	struct hlist_head *hhead = &l->list;

	hlist_for_each_entry_rcu(li, hhead, hlist) {
		struct fib_alias *fa;

		/* Prefix check: key masked to this li's length must equal
		 * the leaf key.
		 */
		if (l->key != (key & li->mask_plen))
			continue;

		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
			struct fib_info *fi = fa->fa_info;
			int nhsel, err;

			if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
				continue;
			if (fi->fib_dead)
				continue;
			if (fa->fa_info->fib_scope < flp->flowi4_scope)
				continue;
			fib_alias_accessed(fa);
			err = fib_props[fa->fa_type].error;
			if (unlikely(err < 0)) {
				/* Reject-type route: report its error. */
#ifdef CONFIG_IP_FIB_TRIE_STATS
				this_cpu_inc(t->stats->semantic_match_passed);
#endif
				return err;
			}
			if (fi->fib_flags & RTNH_F_DEAD)
				continue;
			for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
				const struct fib_nh *nh = &fi->fib_nh[nhsel];

				if (nh->nh_flags & RTNH_F_DEAD)
					continue;
				if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
					continue;

#ifdef CONFIG_IP_FIB_TRIE_STATS
				this_cpu_inc(t->stats->semantic_match_passed);
#endif
				res->prefixlen = li->plen;
				res->nh_sel = nhsel;
				res->type = fa->fa_type;
				res->scope = fi->fib_scope;
				res->fi = fi;
				res->table = tb;
				res->fa_head = &li->falh;
				if (!(fib_flags & FIB_LOOKUP_NOREF))
					atomic_inc(&fi->fib_clntref);
				return 0;
			}
		}

#ifdef CONFIG_IP_FIB_TRIE_STATS
		this_cpu_inc(t->stats->semantic_match_miss);
#endif
	}

	return 1;
}
1332
/*
 * Return non-zero iff @key differs from node @n's prefix in any bit at or
 * above the prefix's least significant set bit.
 *
 * (prefix | -prefix) is a mask covering the lowest set bit of the prefix
 * and everything above it, so the result is zero exactly when key matches
 * the prefix over that region.
 */
static inline t_key prefix_mismatch(t_key key, struct tnode *n)
{
	t_key prefix = n->key;

	return (key ^ prefix) & (prefix | -prefix);
}
1339
/*
 * Longest-prefix-match lookup of flp->daddr in table @tb.
 *
 * Two phases: descend to the deepest candidate node, then backtrack,
 * stripping the least significant set bit of the child index at each
 * level, until check_leaf() accepts a leaf.
 *
 * Runs under rcu_read_lock.  Returns check_leaf()'s result (0 on match,
 * negative error for reject routes) or 1 when the trie is empty or no
 * prefix matches.
 */
int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
		     struct fib_result *res, int fib_flags)
{
	struct trie *t = (struct trie *)tb->tb_data;
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie_use_stats __percpu *stats = t->stats;
#endif
	const t_key key = ntohl(flp->daddr);
	struct tnode *n, *pn;
	t_key cindex;
	int ret = 1;

	rcu_read_lock();

	n = rcu_dereference(t->trie);
	if (!n)
		goto failed;

#ifdef CONFIG_IP_FIB_TRIE_STATS
	this_cpu_inc(stats->gets);
#endif

	pn = n;
	cindex = 0;

	/* Step 1: Travel to the longest prefix match in the trie */
	for (;;) {
		unsigned long index = get_index(key, n);

		/* This bit of code is a bit tricky but it combines multiple
		 * checks into a single check.  The prefix consists of the
		 * prefix plus zeros for the "bits" in the prefix. The index
		 * is the difference between the key and this value.  From
		 * this we can actually derive several pieces of data.
		 *   if !(index >> bits)
		 *     we know the value is child index
		 *   else
		 *     we have a mismatch in skip bits and failed
		 */
		if (index >> n->bits)
			break;

		/* we have found a leaf. Prefixes have already been compared */
		if (IS_LEAF(n))
			goto found;

		/* only record pn and cindex if we are going to be chopping
		 * bits later.  Otherwise we are just wasting cycles.
		 */
		if (index) {
			pn = n;
			cindex = index;
		}

		n = rcu_dereference(n->child[index]);
		if (unlikely(!n))
			goto backtrace;
	}

	/* Step 2: Sort out leaves and begin backtracing for longest prefix */
	for (;;) {
		/* record the pointer where our next node pointer is stored */
		struct tnode __rcu **cptr = n->child;

		/* This test verifies that none of the bits that differ
		 * between the key and the prefix exist in the region of
		 * the lsb and higher in the prefix.
		 */
		if (unlikely(prefix_mismatch(key, n)))
			goto backtrace;

		/* exit out and process leaf */
		if (unlikely(IS_LEAF(n)))
			break;

		/* Don't bother recording parent info.  Since we are in
		 * prefix match mode we will have to come back to wherever
		 * we started this traversal anyway
		 */

		/* NOTE: the backtrace label is *inside* this loop body so
		 * the descent paths above can jump straight into the
		 * bit-stripping machinery.
		 */
		while ((n = rcu_dereference(*cptr)) == NULL) {
backtrace:
#ifdef CONFIG_IP_FIB_TRIE_STATS
			if (!n)
				this_cpu_inc(stats->null_node_hit);
#endif
			/* If we are at cindex 0 there are no more bits for
			 * us to strip at this level so we must ascend back
			 * up one level to see if there are any more bits to
			 * be stripped there.
			 */
			while (!cindex) {
				t_key pkey = pn->key;

				pn = node_parent_rcu(pn);
				if (unlikely(!pn))
					goto failed;
#ifdef CONFIG_IP_FIB_TRIE_STATS
				this_cpu_inc(stats->backtrack);
#endif
				/* Get Child's index */
				cindex = get_index(pkey, pn);
			}

			/* strip the least significant bit from the cindex */
			cindex &= cindex - 1;

			/* grab pointer for next child node */
			cptr = &pn->child[cindex];
		}
	}

found:
	/* Step 3: Process the leaf, if that fails fall back to backtracing */
	ret = check_leaf(tb, t, n, key, flp, res, fib_flags);
	if (unlikely(ret > 0))
		goto backtrace;
failed:
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(fib_table_lookup);
1462
/*
 * Unlink leaf @l from the trie, rebalance its parent, and free the leaf.
 *
 * If the leaf is the root, the trie becomes empty.  Freeing is done via
 * node_free(); readers are protected by the RCU publish of the NULL/parent
 * pointer before the free.
 */
static void trie_leaf_remove(struct trie *t, struct tnode *l)
{
	struct tnode *tp = node_parent(l);

	pr_debug("entering trie_leaf_remove(%p)\n", l);

	if (tp) {
		t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
		put_child(tp, cindex, NULL);
		trie_rebalance(t, tp);
	} else
		RCU_INIT_POINTER(t->trie, NULL);

	node_free(l);
}
1481
/*
 * Delete the route described by @cfg from table @tb.
 *
 * Caller must hold RTNL.  Returns 0 on success, -EINVAL for a bad prefix,
 * or -ESRCH when no matching leaf/prefix/alias exists.
 */
int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
{
	struct trie *t = (struct trie *) tb->tb_data;
	u32 key, mask;
	int plen = cfg->fc_dst_len;
	u8 tos = cfg->fc_tos;
	struct fib_alias *fa, *fa_to_delete;
	struct list_head *fa_head;
	struct tnode *l;
	struct leaf_info *li;

	if (plen > 32)
		return -EINVAL;

	key = ntohl(cfg->fc_dst);
	mask = ntohl(inet_make_mask(plen));

	/* Host bits outside the prefix are invalid. */
	if (key & ~mask)
		return -EINVAL;

	key = key & mask;
	l = fib_find_node(t, key);

	if (!l)
		return -ESRCH;

	li = find_leaf_info(l, plen);

	if (!li)
		return -ESRCH;

	fa_head = &li->falh;
	fa = fib_find_alias(fa_head, tos, 0);

	if (!fa)
		return -ESRCH;

	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);

	fa_to_delete = NULL;
	/* Step back one so the _continue loop starts at fa itself. */
	fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
	list_for_each_entry_continue(fa, fa_head, fa_list) {
		struct fib_info *fi = fa->fa_info;

		if (fa->fa_tos != tos)
			break;

		/* Unspecified fields in cfg act as wildcards. */
		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
		     fa->fa_info->fib_scope == cfg->fc_scope) &&
		    (!cfg->fc_prefsrc ||
		     fi->fib_prefsrc == cfg->fc_prefsrc) &&
		    (!cfg->fc_protocol ||
		     fi->fib_protocol == cfg->fc_protocol) &&
		    fib_nh_match(cfg, fi) == 0) {
			fa_to_delete = fa;
			break;
		}
	}

	if (!fa_to_delete)
		return -ESRCH;

	fa = fa_to_delete;
	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
		  &cfg->fc_nlinfo, 0);

	list_del_rcu(&fa->fa_list);

	if (!plen)
		tb->tb_num_default--;

	/* Reap now-empty containers: leaf_info first, then the leaf. */
	if (list_empty(fa_head)) {
		hlist_del_rcu(&li->hlist);
		free_leaf_info(li);
	}

	if (hlist_empty(&l->list))
		trie_leaf_remove(t, l);

	if (fa->fa_state & FA_S_ACCESSED)
		rt_cache_flush(cfg->fc_nlinfo.nl_net);

	fib_release_info(fa->fa_info);
	alias_free_mem_rcu(fa);
	return 0;
}
1572
/*
 * Remove all aliases on @head whose fib_info is marked RTNH_F_DEAD.
 * Returns the number of aliases removed.
 */
static int trie_flush_list(struct list_head *head)
{
	struct fib_alias *fa, *fa_node;
	int found = 0;

	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
		struct fib_info *fi = fa->fa_info;

		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
			list_del_rcu(&fa->fa_list);
			fib_release_info(fa->fa_info);
			alias_free_mem_rcu(fa);
			found++;
		}
	}
	return found;
}
1590
/*
 * Flush dead aliases from every prefix on leaf @l, freeing any leaf_info
 * that becomes empty.  Returns the total number of aliases removed.
 * The leaf itself is not removed here; the caller handles empty leaves.
 */
static int trie_flush_leaf(struct tnode *l)
{
	int found = 0;
	struct hlist_head *lih = &l->list;
	struct hlist_node *tmp;
	struct leaf_info *li = NULL;

	hlist_for_each_entry_safe(li, tmp, lih, hlist) {
		found += trie_flush_list(&li->falh);

		if (list_empty(&li->falh)) {
			hlist_del_rcu(&li->hlist);
			free_leaf_info(li);
		}
	}
	return found;
}
1608
/*
 * Scan for the next right leaf starting at node p->child[idx]
 * Since we have back pointer, no recursion necessary.
 *
 * @p: tnode to start scanning from
 * @c: previously visited child of @p (NULL to start at child 0)
 *
 * Returns the next leaf in key order, or NULL when the walk has passed
 * the last leaf of the trie.
 */
static struct tnode *leaf_walk_rcu(struct tnode *p, struct tnode *c)
{
	do {
		t_key idx;

		/* Resume just after the child we came from. */
		if (c)
			idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
		else
			idx = 0;

		while (idx < 1u << p->bits) {
			c = tnode_get_child_rcu(p, idx++);
			if (!c)
				continue;

			if (IS_LEAF(c))
				return c;

			/* Rescan start scanning in new node */
			p = c;
			idx = 0;
		}

		/* Node empty, walk back up to parent */
		c = p;
	} while ((p = node_parent_rcu(c)) != NULL);

	return NULL; /* Root of trie */
}
1642
/* Return the first (lowest-key) leaf of trie @t, or NULL if empty. */
static struct tnode *trie_firstleaf(struct trie *t)
{
	struct tnode *n = rcu_dereference_rtnl(t->trie);

	if (!n)
		return NULL;

	if (IS_LEAF(n))		/* trie is just a leaf */
		return n;

	return leaf_walk_rcu(n, NULL);
}
1655
/* Return the leaf following @l in key order, or NULL if @l is the last. */
static struct tnode *trie_nextleaf(struct tnode *l)
{
	struct tnode *p = node_parent_rcu(l);

	if (!p)
		return NULL;	/* trie with just one leaf */

	return leaf_walk_rcu(p, l);
}
1665
/* Return the @index:th leaf (0-based, in key order), or NULL if the
 * trie has fewer than index+1 leaves.
 */
static struct tnode *trie_leafindex(struct trie *t, int index)
{
	struct tnode *leaf;

	for (leaf = trie_firstleaf(t); leaf && index > 0; index--)
		leaf = trie_nextleaf(leaf);

	return leaf;
}
1675
/*
 * Flush all routes whose fib_info is dead from table @tb.
 *
 * Caller must hold RTNL.  Leaf removal is deferred by one iteration
 * (via @ll) so trie_nextleaf() is never called on a leaf we just
 * unlinked.  Returns the number of aliases removed.
 */
int fib_table_flush(struct fib_table *tb)
{
	struct trie *t = (struct trie *) tb->tb_data;
	struct tnode *l, *ll = NULL;
	int found = 0;

	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
		found += trie_flush_leaf(l);

		/* Remove the *previous* leaf if flushing emptied it. */
		if (ll && hlist_empty(&ll->list))
			trie_leaf_remove(t, ll);
		ll = l;
	}

	/* Handle the final leaf, which the loop above never revisits. */
	if (ll && hlist_empty(&ll->list))
		trie_leaf_remove(t, ll);

	pr_debug("trie_flush found=%d\n", found);
	return found;
}
1700
/* Free table @tb and its embedded trie's per-cpu statistics. */
void fib_free_table(struct fib_table *tb)
{
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie *t = (struct trie *)tb->tb_data;

	free_percpu(t->stats);
#endif /* CONFIG_IP_FIB_TRIE_STATS */
	kfree(tb);
}
1710
/*
 * Dump the aliases of one prefix (@key/@plen) into a netlink skb.
 *
 * cb->args[5] holds the alias index to resume from after a partial dump.
 * Returns skb->len on completion or -1 when the skb filled up (the resume
 * index is saved so the next call continues where this one stopped).
 */
static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
			   struct fib_table *tb,
			   struct sk_buff *skb, struct netlink_callback *cb)
{
	int i, s_i;
	struct fib_alias *fa;
	__be32 xkey = htonl(key);

	s_i = cb->args[5];
	i = 0;

	/* rcu_read_lock is hold by caller */

	list_for_each_entry_rcu(fa, fah, fa_list) {
		/* Skip aliases already dumped in a previous call. */
		if (i < s_i) {
			i++;
			continue;
		}

		if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq,
				  RTM_NEWROUTE,
				  tb->tb_id,
				  fa->fa_type,
				  xkey,
				  plen,
				  fa->fa_tos,
				  fa->fa_info, NLM_F_MULTI) < 0) {
			cb->args[5] = i;
			return -1;
		}
		i++;
	}
	cb->args[5] = i;
	return skb->len;
}
1747
/*
 * Dump all prefixes hanging off leaf @l into a netlink skb.
 *
 * cb->args[4] holds the leaf_info index to resume from; cb->args[5]
 * (the per-prefix alias index) is reset when moving to a new prefix.
 * Returns skb->len on completion or -1 when the skb filled up.
 */
static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb,
			struct sk_buff *skb, struct netlink_callback *cb)
{
	struct leaf_info *li;
	int i, s_i;

	s_i = cb->args[4];
	i = 0;

	/* rcu_read_lock is hold by caller */
	hlist_for_each_entry_rcu(li, &l->list, hlist) {
		if (i < s_i) {
			i++;
			continue;
		}

		/* Fresh prefix: restart the alias cursor. */
		if (i > s_i)
			cb->args[5] = 0;

		if (list_empty(&li->falh))
			continue;

		if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
			cb->args[4] = i;
			return -1;
		}
		i++;
	}

	cb->args[4] = i;
	return skb->len;
}
1780
/*
 * Netlink dump of every route in table @tb.
 *
 * Continuation state across calls: cb->args[2] caches the last dumped
 * leaf key and cb->args[3] the leaf count, so a resumed dump can jump
 * straight back to the right leaf (falling back to a slow positional
 * rescan if that leaf has since been removed).
 */
int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
		   struct netlink_callback *cb)
{
	struct tnode *l;
	struct trie *t = (struct trie *) tb->tb_data;
	t_key key = cb->args[2];
	int count = cb->args[3];

	rcu_read_lock();
	/* Dump starting at last key.
	 * Note: 0.0.0.0/0 (ie default) is first key.
	 */
	if (count == 0)
		l = trie_firstleaf(t);
	else {
		/* Normally, continue from last key, but if that is missing
		 * fallback to using slow rescan
		 */
		l = fib_find_node(t, key);
		if (!l)
			l = trie_leafindex(t, count);
	}

	while (l) {
		cb->args[2] = l->key;
		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
			cb->args[3] = count;
			rcu_read_unlock();
			return -1;
		}

		++count;
		l = trie_nextleaf(l);
		/* Clear the per-leaf cursors (args[4], args[5]) before the
		 * next leaf so it starts from its first prefix/alias.
		 */
		memset(&cb->args[4], 0,
		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
	}
	cb->args[3] = count;
	rcu_read_unlock();

	return skb->len;
}
1822
/* Create the slab caches used for aliases and trie nodes (boot time). */
void __init fib_trie_init(void)
{
	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
					  sizeof(struct fib_alias),
					  0, SLAB_PANIC, NULL);

	/* One cache serves both leaves and leaf_info; size to the larger. */
	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
					   max(sizeof(struct tnode),
					       sizeof(struct leaf_info)),
					   0, SLAB_PANIC, NULL);
}
1834
1835
/*
 * Allocate a new fib_table with id @id backed by an empty trie.
 *
 * The struct trie lives in the flexible tb_data area appended to the
 * fib_table allocation.  Returns NULL on allocation failure.
 */
struct fib_table *fib_trie_table(u32 id)
{
	struct fib_table *tb;
	struct trie *t;

	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
		     GFP_KERNEL);
	if (tb == NULL)
		return NULL;

	tb->tb_id = id;
	tb->tb_default = -1;
	tb->tb_num_default = 0;

	t = (struct trie *) tb->tb_data;
	RCU_INIT_POINTER(t->trie, NULL);
#ifdef CONFIG_IP_FIB_TRIE_STATS
	/* Stats are mandatory when configured in; fail the whole table
	 * if the per-cpu allocation fails.
	 */
	t->stats = alloc_percpu(struct trie_use_stats);
	if (!t->stats) {
		kfree(tb);
		tb = NULL;
	}
#endif

	return tb;
}
1862
#ifdef CONFIG_PROC_FS
/* Depth first Trie walk iterator */
struct fib_trie_iter {
	struct seq_net_private p;
	struct fib_table *tb;		/* table currently being walked */
	struct tnode *tnode;		/* parent of the node last returned */
	unsigned int index;		/* next child index to visit in tnode */
	unsigned int depth;		/* current depth below the root */
};
1872
/*
 * Advance the depth-first walk and return the next node (tnode or leaf),
 * or NULL when the whole trie has been visited.  Iterator state in @iter
 * is updated so a subsequent call continues the walk.
 */
static struct tnode *fib_trie_get_next(struct fib_trie_iter *iter)
{
	struct tnode *tn = iter->tnode;
	unsigned int cindex = iter->index;
	struct tnode *p;

	/* A single entry routing table */
	if (!tn)
		return NULL;

	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
		 iter->tnode, iter->index, iter->depth);
rescan:
	while (cindex < (1<<tn->bits)) {
		struct tnode *n = tnode_get_child_rcu(tn, cindex);

		if (n) {
			if (IS_LEAF(n)) {
				/* Stay at this level; resume past the leaf. */
				iter->tnode = tn;
				iter->index = cindex + 1;
			} else {
				/* push down one level */
				iter->tnode = n;
				iter->index = 0;
				++iter->depth;
			}
			return n;
		}

		++cindex;
	}

	/* Current node exhausted, pop back up */
	p = node_parent_rcu(tn);
	if (p) {
		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
		tn = p;
		--iter->depth;
		goto rescan;
	}

	/* got root? */
	return NULL;
}
1917
/*
 * Initialize @iter for a walk of trie @t and return the root node, or
 * NULL for a missing/empty trie.  A lone-leaf trie leaves iter->tnode
 * NULL so fib_trie_get_next() terminates immediately.
 */
static struct tnode *fib_trie_get_first(struct fib_trie_iter *iter,
				       struct trie *t)
{
	struct tnode *n;

	if (!t)
		return NULL;

	n = rcu_dereference(t->trie);
	if (!n)
		return NULL;

	if (IS_TNODE(n)) {
		iter->tnode = n;
		iter->index = 0;
		iter->depth = 1;
	} else {
		iter->tnode = NULL;
		iter->index = 0;
		iter->depth = 0;
	}

	return n;
}
1942
/*
 * Walk trie @t once and aggregate structural statistics (leaf/tnode
 * counts, prefix count, depth, node size histogram, NULL child slots)
 * into @s for /proc/net/fib_triestats.
 */
static void trie_collect_stats(struct trie *t, struct trie_stat *s)
{
	struct tnode *n;
	struct fib_trie_iter iter;

	memset(s, 0, sizeof(*s));

	rcu_read_lock();
	for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
		if (IS_LEAF(n)) {
			struct leaf_info *li;

			s->leaves++;
			s->totdepth += iter.depth;
			if (iter.depth > s->maxdepth)
				s->maxdepth = iter.depth;

			/* Each leaf_info is one stored prefix. */
			hlist_for_each_entry_rcu(li, &n->list, hlist)
				++s->prefixes;
		} else {
			int i;

			s->tnodes++;
			if (n->bits < MAX_STAT_DEPTH)
				s->nodesizes[n->bits]++;

			for (i = 0; i < tnode_child_length(n); i++)
				if (!rcu_access_pointer(n->child[i]))
					s->nullpointers++;
		}
	}
	rcu_read_unlock();
}
1976
/*
 * This outputs /proc/net/fib_triestats
 *
 * Pretty-prints the collected trie_stat and an estimate of the memory
 * consumed by leaves, prefixes, tnodes and child pointer arrays.
 */
static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
{
	unsigned int i, max, pointers, bytes, avdepth;

	/* Average depth scaled by 100 to print two decimal places. */
	if (stat->leaves)
		avdepth = stat->totdepth*100 / stat->leaves;
	else
		avdepth = 0;

	seq_printf(seq, "\tAver depth:     %u.%02d\n",
		   avdepth / 100, avdepth % 100);
	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);

	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
	bytes = sizeof(struct tnode) * stat->leaves;

	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
	bytes += sizeof(struct leaf_info) * stat->prefixes;

	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
	bytes += sizeof(struct tnode) * stat->tnodes;

	/* Trim trailing empty buckets from the size histogram. */
	max = MAX_STAT_DEPTH;
	while (max > 0 && stat->nodesizes[max-1] == 0)
		max--;

	pointers = 0;
	for (i = 1; i < max; i++)
		if (stat->nodesizes[i] != 0) {
			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
			pointers += (1<<i) * stat->nodesizes[i];
		}
	seq_putc(seq, '\n');
	seq_printf(seq, "\tPointers: %u\n", pointers);

	bytes += sizeof(struct tnode *) * pointers;
	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
}
2019
#ifdef CONFIG_IP_FIB_TRIE_STATS
/*
 * Sum the per-cpu lookup counters and print them as the "Counters"
 * section of /proc/net/fib_triestats.
 */
static void trie_show_usage(struct seq_file *seq,
			    const struct trie_use_stats __percpu *stats)
{
	struct trie_use_stats s = { 0 };
	int cpu;

	/* loop through all of the CPUs and gather up the stats */
	for_each_possible_cpu(cpu) {
		const struct trie_use_stats *pcpu = per_cpu_ptr(stats, cpu);

		s.gets += pcpu->gets;
		s.backtrack += pcpu->backtrack;
		s.semantic_match_passed += pcpu->semantic_match_passed;
		s.semantic_match_miss += pcpu->semantic_match_miss;
		s.null_node_hit += pcpu->null_node_hit;
		s.resize_node_skipped += pcpu->resize_node_skipped;
	}

	seq_printf(seq, "\nCounters:\n---------\n");
	seq_printf(seq, "gets = %u\n", s.gets);
	seq_printf(seq, "backtracks = %u\n", s.backtrack);
	seq_printf(seq, "semantic match passed = %u\n",
		   s.semantic_match_passed);
	seq_printf(seq, "semantic match miss = %u\n", s.semantic_match_miss);
	seq_printf(seq, "null node hit= %u\n", s.null_node_hit);
	seq_printf(seq, "skipped node resize = %u\n\n", s.resize_node_skipped);
}
#endif /*  CONFIG_IP_FIB_TRIE_STATS */
2049
2050 static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2051 {
2052 if (tb->tb_id == RT_TABLE_LOCAL)
2053 seq_puts(seq, "Local:\n");
2054 else if (tb->tb_id == RT_TABLE_MAIN)
2055 seq_puts(seq, "Main:\n");
2056 else
2057 seq_printf(seq, "Id %d:\n", tb->tb_id);
2058 }
2059
2060
/*
 * seq_file show callback for /proc/net/fib_triestats: for every table in
 * every hash bucket, collect structural stats and (when configured)
 * usage counters, and print them.
 */
static int fib_triestat_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;
	unsigned int h;

	/* Leaves and tnodes share struct tnode in this implementation,
	 * hence the same sizeof twice.
	 */
	seq_printf(seq,
		   "Basic info: size of leaf:"
		   " %Zd bytes, size of tnode: %Zd bytes.\n",
		   sizeof(struct tnode), sizeof(struct tnode));

	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct fib_table *tb;

		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
			struct trie *t = (struct trie *) tb->tb_data;
			struct trie_stat stat;

			if (!t)
				continue;

			fib_table_print(seq, tb);

			trie_collect_stats(t, &stat);
			trie_show_stats(seq, &stat);
#ifdef CONFIG_IP_FIB_TRIE_STATS
			trie_show_usage(seq, t->stats);
#endif
		}
	}

	return 0;
}
2094
/* open() handler for /proc/net/fib_triestats (single-shot, per-netns). */
static int fib_triestat_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, fib_triestat_seq_show);
}
2099
/* file_operations for /proc/net/fib_triestats */
static const struct file_operations fib_triestat_fops = {
	.owner	= THIS_MODULE,
	.open	= fib_triestat_seq_open,
	.read	= seq_read,
	.llseek	= seq_lseek,
	.release = single_release_net,
};
2107
/*
 * Position the /proc/net/fib_trie iterator at global position @pos,
 * counting nodes across every table in every hash bucket.  Returns the
 * node at that position (iter->tb set to its table) or NULL past the end.
 */
static struct tnode *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
{
	struct fib_trie_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
	loff_t idx = 0;
	unsigned int h;

	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct fib_table *tb;

		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
			struct tnode *n;

			for (n = fib_trie_get_first(iter,
						    (struct trie *) tb->tb_data);
			     n; n = fib_trie_get_next(iter))
				if (pos == idx++) {
					iter->tb = tb;
					return n;
				}
		}
	}

	return NULL;
}
2134
/* seq_file start: take the RCU read lock (held until stop) and seek. */
static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	rcu_read_lock();
	return fib_trie_get_idx(seq, *pos);
}
2141
/*
 * seq_file next: advance to the next node, rolling over to the next
 * table on the same hash chain and then to subsequent hash chains.
 */
static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct fib_trie_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
	struct fib_table *tb = iter->tb;
	struct hlist_node *tb_node;
	unsigned int h;
	struct tnode *n;

	++*pos;
	/* next node in same table */
	n = fib_trie_get_next(iter);
	if (n)
		return n;

	/* walk rest of this hash chain */
	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
	while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
		if (n)
			goto found;
	}

	/* new hash chain */
	while (++h < FIB_TABLE_HASHSZ) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
			if (n)
				goto found;
		}
	}
	return NULL;

found:
	iter->tb = tb;
	return n;
}
2181
/* seq_file stop: drop the RCU read lock taken in fib_trie_seq_start. */
static void fib_trie_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}
2187
/* Write @n single-space characters to @seq for tree indentation. */
static void seq_indent(struct seq_file *seq, int n)
{
	for (; n > 0; n--)
		seq_puts(seq, "   ");
}
2193
/*
 * Return a printable name for route scope @s.  Unknown scopes are
 * formatted into the caller-supplied @buf, which must stay alive while
 * the returned pointer is used.
 */
static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
{
	switch (s) {
	case RT_SCOPE_UNIVERSE: return "universe";
	case RT_SCOPE_SITE:	return "site";
	case RT_SCOPE_LINK:	return "link";
	case RT_SCOPE_HOST:	return "host";
	case RT_SCOPE_NOWHERE:	return "nowhere";
	default:
		snprintf(buf, len, "scope=%d", s);
		return buf;
	}
}
2207
/* Printable names for RTN_* route types, indexed by type value. */
static const char *const rtn_type_names[__RTN_MAX] = {
	[RTN_UNSPEC] = "UNSPEC",
	[RTN_UNICAST] = "UNICAST",
	[RTN_LOCAL] = "LOCAL",
	[RTN_BROADCAST] = "BROADCAST",
	[RTN_ANYCAST] = "ANYCAST",
	[RTN_MULTICAST] = "MULTICAST",
	[RTN_BLACKHOLE] = "BLACKHOLE",
	[RTN_UNREACHABLE] = "UNREACHABLE",
	[RTN_PROHIBIT] = "PROHIBIT",
	[RTN_THROW] = "THROW",
	[RTN_NAT] = "NAT",
	[RTN_XRESOLVE] = "XRESOLVE",
};
2222
/*
 * Return a printable name for route type @t, formatting unknown types
 * into the caller-supplied @buf.
 */
static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
{
	if (t < __RTN_MAX && rtn_type_names[t])
		return rtn_type_names[t];
	snprintf(buf, len, "type %u", t);
	return buf;
}
2230
/*
 * Pretty print the trie
 *
 * seq_file show callback for /proc/net/fib_trie: one line per tnode
 * ("+--") or leaf ("|--"), with each leaf's prefixes/aliases indented
 * beneath it.
 */
static int fib_trie_seq_show(struct seq_file *seq, void *v)
{
	const struct fib_trie_iter *iter = seq->private;
	struct tnode *n = v;

	/* A parentless node is a trie root: print the table heading. */
	if (!node_parent_rcu(n))
		fib_table_print(seq, iter->tb);

	if (IS_TNODE(n)) {
		__be32 prf = htonl(n->key);

		seq_indent(seq, iter->depth-1);
		seq_printf(seq, "  +-- %pI4/%d %d %d %d\n",
			   &prf, n->pos, n->bits, n->full_children,
			   n->empty_children);
	} else {
		struct leaf_info *li;
		__be32 val = htonl(n->key);

		seq_indent(seq, iter->depth);
		seq_printf(seq, "  |-- %pI4\n", &val);

		hlist_for_each_entry_rcu(li, &n->list, hlist) {
			struct fib_alias *fa;

			list_for_each_entry_rcu(fa, &li->falh, fa_list) {
				char buf1[32], buf2[32];

				seq_indent(seq, iter->depth+1);
				seq_printf(seq, "  /%d %s %s", li->plen,
					   rtn_scope(buf1, sizeof(buf1),
						     fa->fa_info->fib_scope),
					   rtn_type(buf2, sizeof(buf2),
						    fa->fa_type));
				if (fa->fa_tos)
					seq_printf(seq, " tos=%d", fa->fa_tos);
				seq_putc(seq, '\n');
			}
		}
	}

	return 0;
}
2275
/* seq_operations for /proc/net/fib_trie */
static const struct seq_operations fib_trie_seq_ops = {
	.start  = fib_trie_seq_start,
	.next   = fib_trie_seq_next,
	.stop   = fib_trie_seq_stop,
	.show   = fib_trie_seq_show,
};
2282
/* open() handler for /proc/net/fib_trie (per-netns seq_file). */
static int fib_trie_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &fib_trie_seq_ops,
			    sizeof(struct fib_trie_iter));
}
2288
/* file_operations for /proc/net/fib_trie */
static const struct file_operations fib_trie_fops = {
	.owner  = THIS_MODULE,
	.open   = fib_trie_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
	.release = seq_release_net,
};
2296
/* Iterator state for /proc/net/route over the main table's trie. */
struct fib_route_iter {
	struct seq_net_private p;
	struct trie *main_trie;		/* RT_TABLE_MAIN trie */
	loff_t	pos;			/* leaf position of the cached key */
	t_key	key;			/* key of last leaf returned */
};
2303
2304 static struct tnode *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2305 {
2306 struct tnode *l = NULL;
2307 struct trie *t = iter->main_trie;
2308
2309 /* use cache location of last found key */
2310 if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2311 pos -= iter->pos;
2312 else {
2313 iter->pos = 0;
2314 l = trie_firstleaf(t);
2315 }
2316
2317 while (l && pos-- > 0) {
2318 iter->pos++;
2319 l = trie_nextleaf(l);
2320 }
2321
2322 if (l)
2323 iter->key = pos; /* remember it */
2324 else
2325 iter->pos = 0; /* forget it */
2326
2327 return l;
2328 }
2329
/*
 * seq_file start for /proc/net/route: lock RCU, locate the main table,
 * and return SEQ_START_TOKEN for the header line at position 0.
 */
static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct fib_route_iter *iter = seq->private;
	struct fib_table *tb;

	rcu_read_lock();
	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
	if (!tb)
		return NULL;

	iter->main_trie = (struct trie *) tb->tb_data;
	if (*pos == 0)
		return SEQ_START_TOKEN;
	else
		/* *pos - 1 because position 0 is the header token. */
		return fib_route_get_idx(iter, *pos - 1);
}
2347
/*
 * seq_file next for /proc/net/route: step to the following leaf and
 * refresh the (pos, key) resume cache used by fib_route_get_idx().
 */
static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct fib_route_iter *iter = seq->private;
	struct tnode *l = v;

	++*pos;
	if (v == SEQ_START_TOKEN) {
		iter->pos = 0;
		l = trie_firstleaf(iter->main_trie);
	} else {
		iter->pos++;
		l = trie_nextleaf(l);
	}

	if (l)
		iter->key = l->key;
	else
		iter->pos = 0;
	return l;
}
2368
/* seq_file stop: release the RCU read lock from fib_route_seq_start. */
static void fib_route_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}
2374
2375 static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2376 {
2377 unsigned int flags = 0;
2378
2379 if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
2380 flags = RTF_REJECT;
2381 if (fi && fi->fib_nh->nh_gw)
2382 flags |= RTF_GATEWAY;
2383 if (mask == htonl(0xFFFFFFFF))
2384 flags |= RTF_HOST;
2385 flags |= RTF_UP;
2386 return flags;
2387 }
2388
/*
 *	This outputs /proc/net/route.
 *
 *	The format of the file is not supposed to be changed
 *	and needs to be same as fib_hash output to avoid breaking
 *	legacy utilities
 */
static int fib_route_seq_show(struct seq_file *seq, void *v)
{
	struct tnode *l = v;
	struct leaf_info *li;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
			   "\tWindow\tIRTT");
		return 0;
	}

	hlist_for_each_entry_rcu(li, &l->list, hlist) {
		struct fib_alias *fa;
		__be32 mask, prefix;

		mask = inet_make_mask(li->plen);
		prefix = htonl(l->key);

		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
			const struct fib_info *fi = fa->fa_info;
			unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);

			/* Broadcast/multicast entries are not part of the
			 * legacy route table format.
			 */
			if (fa->fa_type == RTN_BROADCAST
			    || fa->fa_type == RTN_MULTICAST)
				continue;

			/* Fixed-width 127-char records for legacy parsers. */
			seq_setwidth(seq, 127);

			if (fi)
				seq_printf(seq,
					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
					 "%d\t%08X\t%d\t%u\t%u",
					 fi->fib_dev ? fi->fib_dev->name : "*",
					 prefix,
					 fi->fib_nh->nh_gw, flags, 0, 0,
					 fi->fib_priority,
					 mask,
					 (fi->fib_advmss ?
					  fi->fib_advmss + 40 : 0),
					 fi->fib_window,
					 fi->fib_rtt >> 3);
			else
				seq_printf(seq,
					 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
					 "%d\t%08X\t%d\t%u\t%u",
					 prefix, 0, flags, 0, 0, 0,
					 mask, 0, 0, 0);

			seq_pad(seq, '\n');
		}
	}

	return 0;
}
2450
/* seq_file iterator callbacks backing /proc/net/route */
static const struct seq_operations fib_route_seq_ops = {
	.start  = fib_route_seq_start,
	.next   = fib_route_seq_next,
	.stop   = fib_route_seq_stop,
	.show   = fib_route_seq_show,
};
2457
/* Open /proc/net/route: attach the seq iterator with per-open,
 * per-netns private iterator state (struct fib_route_iter).
 */
static int fib_route_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &fib_route_seq_ops,
			    sizeof(struct fib_route_iter));
}
2463
/* file_operations for /proc/net/route (read-only seq_file) */
static const struct file_operations fib_route_fops = {
	.owner  = THIS_MODULE,
	.open   = fib_route_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
	.release = seq_release_net,
};
2471
2472 int __net_init fib_proc_init(struct net *net)
2473 {
2474 if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops))
2475 goto out1;
2476
2477 if (!proc_create("fib_triestat", S_IRUGO, net->proc_net,
2478 &fib_triestat_fops))
2479 goto out2;
2480
2481 if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops))
2482 goto out3;
2483
2484 return 0;
2485
2486 out3:
2487 remove_proc_entry("fib_triestat", net->proc_net);
2488 out2:
2489 remove_proc_entry("fib_trie", net->proc_net);
2490 out1:
2491 return -ENOMEM;
2492 }
2493
/* Tear down the per-netns FIB proc entries created by fib_proc_init() */
void __net_exit fib_proc_exit(struct net *net)
{
	remove_proc_entry("fib_trie", net->proc_net);
	remove_proc_entry("fib_triestat", net->proc_net);
	remove_proc_entry("route", net->proc_net);
}
2500
2501 #endif /* CONFIG_PROC_FS */