fib_trie: Optimize fib_find_node
[deliverable/linux.git] / net / ipv4 / fib_trie.c
1 /*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
6 *
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
9 *
10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
11 * Agricultural Sciences.
12 *
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 *
15 * This work is based on the LPC-trie which is originally described in:
16 *
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
25 *
26 * Code from fib_hash has been reused which includes the following header:
27 *
28 *
29 * INET An implementation of the TCP/IP protocol suite for the LINUX
30 * operating system. INET is implemented using the BSD Socket
31 * interface as the means of communication with the user level.
32 *
33 * IPv4 FIB: lookup engine and maintenance routines.
34 *
35 *
36 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
37 *
38 * This program is free software; you can redistribute it and/or
39 * modify it under the terms of the GNU General Public License
40 * as published by the Free Software Foundation; either version
41 * 2 of the License, or (at your option) any later version.
42 *
43 * Substantial contributions to this work comes from:
44 *
45 * David S. Miller, <davem@davemloft.net>
46 * Stephen Hemminger <shemminger@osdl.org>
47 * Paul E. McKenney <paulmck@us.ibm.com>
48 * Patrick McHardy <kaber@trash.net>
49 */
50
51 #define VERSION "0.409"
52
53 #include <asm/uaccess.h>
54 #include <linux/bitops.h>
55 #include <linux/types.h>
56 #include <linux/kernel.h>
57 #include <linux/mm.h>
58 #include <linux/string.h>
59 #include <linux/socket.h>
60 #include <linux/sockios.h>
61 #include <linux/errno.h>
62 #include <linux/in.h>
63 #include <linux/inet.h>
64 #include <linux/inetdevice.h>
65 #include <linux/netdevice.h>
66 #include <linux/if_arp.h>
67 #include <linux/proc_fs.h>
68 #include <linux/rcupdate.h>
69 #include <linux/skbuff.h>
70 #include <linux/netlink.h>
71 #include <linux/init.h>
72 #include <linux/list.h>
73 #include <linux/slab.h>
74 #include <linux/export.h>
75 #include <net/net_namespace.h>
76 #include <net/ip.h>
77 #include <net/protocol.h>
78 #include <net/route.h>
79 #include <net/tcp.h>
80 #include <net/sock.h>
81 #include <net/ip_fib.h>
82 #include "fib_lookup.h"
83
/* Maximum trie depth tracked by the /proc statistics code */
#define MAX_STAT_DEPTH 32

/* Number of bits in a key: 32 for an IPv4 address */
#define KEYLENGTH (8*sizeof(t_key))

typedef unsigned int t_key;

/* A node with bits == 0 is a leaf; anything else is an internal tnode */
#define IS_TNODE(n) ((n)->bits)
#define IS_LEAF(n) (!(n)->bits)

/* Right shift that brings node _kv's child-index bits to the bottom */
#define get_shift(_kv) (KEYLENGTH - (_kv)->pos - (_kv)->bits)
/* Child index of _key within node _kv; any set bits above 'bits' after
 * the shift indicate a prefix mismatch in the skipped bits
 * (see fib_find_node for how this double duty is exploited)
 */
#define get_index(_key, _kv) (((_key) ^ (_kv)->key) >> get_shift(_kv))
95
/* One trie node.  The same structure serves both internal nodes
 * (bits > 0) and leaves (bits == 0); the trailing union holds either
 * the child pointer array or the leaf's per-prefix info list.
 */
struct tnode {
	t_key key;			/* significant prefix bits of the key */
	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
	struct tnode __rcu *parent;
	struct rcu_head rcu;
	union {
		/* The fields in this struct are valid if bits > 0 (TNODE) */
		struct {
			unsigned int full_children;  /* KEYLENGTH bits needed */
			unsigned int empty_children; /* KEYLENGTH bits needed */
			struct tnode __rcu *child[0];
		};
		/* This list pointer is valid if bits == 0 (LEAF) */
		struct hlist_head list;
	};
};
113
/* Per-prefix-length data hung off a leaf's list; one entry for each
 * distinct prefix length that shares the leaf's key.
 */
struct leaf_info {
	struct hlist_node hlist;
	int plen;
	u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
	struct list_head falh;	/* fib_alias entries for this prefix */
	struct rcu_head rcu;
};
121
#ifdef CONFIG_IP_FIB_TRIE_STATS
/* Per-cpu lookup counters, accumulated on the read side */
struct trie_use_stats {
	unsigned int gets;			/* total lookups */
	unsigned int backtrack;			/* lookup had to back up */
	unsigned int semantic_match_passed;
	unsigned int semantic_match_miss;
	unsigned int null_node_hit;		/* empty child slot hit */
	unsigned int resize_node_skipped;	/* resize aborted on ENOMEM */
};
#endif

/* Snapshot of the trie's shape, gathered when dumping via /proc */
struct trie_stat {
	unsigned int totdepth;
	unsigned int maxdepth;
	unsigned int tnodes;
	unsigned int leaves;
	unsigned int nullpointers;
	unsigned int prefixes;
	unsigned int nodesizes[MAX_STAT_DEPTH];
};
142
/* One routing table; 'trie' is the root node (NULL when empty) */
struct trie {
	struct tnode __rcu *trie;
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie_use_stats __percpu *stats;
#endif
};
149
static void tnode_put_child_reorg(struct tnode *tn, int i, struct tnode *n,
				  int wasfull);
static struct tnode *resize(struct trie *t, struct tnode *tn);
static struct tnode *inflate(struct trie *t, struct tnode *tn);
static struct tnode *halve(struct trie *t, struct tnode *tn);
/* tnodes to free after resize(); protected by RTNL */
static struct callback_head *tnode_free_head;
static size_t tnode_free_size;

/*
 * synchronize_rcu after call_rcu for that many pages; it should be especially
 * useful before resizing the root node with PREEMPT_NONE configs; the value was
 * obtained experimentally, aiming to avoid visible slowdown.
 */
static const int sync_pages = 128;
165
/* slab caches for fib_alias entries and leaf tnodes */
static struct kmem_cache *fn_alias_kmem __read_mostly;
static struct kmem_cache *trie_leaf_kmem __read_mostly;

/* caller must hold RTNL */
#define node_parent(n) rtnl_dereference((n)->parent)

/* caller must hold RCU read lock or RTNL */
#define node_parent_rcu(n) rcu_dereference_rtnl((n)->parent)

/* wrapper for rcu_assign_pointer; tolerates n == NULL so callers can
 * reparent a possibly-absent child without checking first
 */
static inline void node_set_parent(struct tnode *n, struct tnode *tp)
{
	if (n)
		rcu_assign_pointer(n->parent, tp);
}

#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER((n)->parent, p)
183
/* This provides us with the number of children in this node, in the case of a
 * leaf this will return 0 meaning none of the children are accessible.
 * For a leaf bits == 0, so (1ul << 0) & ~1ul == 0; for a tnode
 * (bits >= 1) the mask leaves 1ul << bits unchanged.
 */
static inline int tnode_child_length(const struct tnode *tn)
{
	return (1ul << tn->bits) & ~(1ul);
}
191
/*
 * Fetch child i of tn; caller must hold RTNL
 */
static inline struct tnode *tnode_get_child(const struct tnode *tn, unsigned int i)
{
	BUG_ON(i >= tnode_child_length(tn));

	return rtnl_dereference(tn->child[i]);
}
201
/*
 * Fetch child i of tn; caller must hold RCU read lock or RTNL
 */
static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
{
	BUG_ON(i >= tnode_child_length(tn));

	return rcu_dereference_rtnl(tn->child[i]);
}
211
212 static inline t_key mask_pfx(t_key k, unsigned int l)
213 {
214 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
215 }
216
217 static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
218 {
219 if (offset < KEYLENGTH)
220 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
221 else
222 return 0;
223 }
224
225 static inline int tkey_equals(t_key a, t_key b)
226 {
227 return a == b;
228 }
229
230 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
231 {
232 if (bits == 0 || offset >= KEYLENGTH)
233 return 1;
234 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
235 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
236 }
237
238 static inline int tkey_mismatch(t_key a, int offset, t_key b)
239 {
240 t_key diff = a ^ b;
241 int i = offset;
242
243 if (!diff)
244 return 0;
245 while ((diff << i) >> (KEYLENGTH-1) == 0)
246 i++;
247 return i;
248 }
249
250 /*
251 To understand this stuff, an understanding of keys and all their bits is
252 necessary. Every node in the trie has a key associated with it, but not
253 all of the bits in that key are significant.
254
255 Consider a node 'n' and its parent 'tp'.
256
257 If n is a leaf, every bit in its key is significant. Its presence is
258 necessitated by path compression, since during a tree traversal (when
259 searching for a leaf - unless we are doing an insertion) we will completely
260 ignore all skipped bits we encounter. Thus we need to verify, at the end of
261 a potentially successful search, that we have indeed been walking the
262 correct key path.
263
264 Note that we can never "miss" the correct key in the tree if present by
265 following the wrong path. Path compression ensures that segments of the key
266 that are the same for all keys with a given prefix are skipped, but the
267 skipped part *is* identical for each node in the subtrie below the skipped
268 bit! trie_insert() in this implementation takes care of that - note the
269 call to tkey_sub_equals() in trie_insert().
270
271 if n is an internal node - a 'tnode' here, the various parts of its key
272 have many different meanings.
273
274 Example:
275 _________________________________________________________________
276 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
277 -----------------------------------------------------------------
278 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
279
280 _________________________________________________________________
281 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
282 -----------------------------------------------------------------
283 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
284
285 tp->pos = 7
286 tp->bits = 3
287 n->pos = 15
288 n->bits = 4
289
290 First, let's just ignore the bits that come before the parent tp, that is
291 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
292 not use them for anything.
293
294 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
295 index into the parent's child array. That is, they will be used to find
296 'n' among tp's children.
297
298 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
299 for the node n.
300
301 All the bits we have seen so far are significant to the node n. The rest
302 of the bits are really not needed or indeed known in n->key.
303
304 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
305 n's child array, and will of course be different for each child.
306
307
308 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
309 at this point.
310
311 */
312
/* resize() thresholds, in percent of the (would-be) child array that
 * must be non-empty; the root gets laxer values so it stays large.
 */
static const int halve_threshold = 25;
static const int inflate_threshold = 50;
static const int halve_threshold_root = 15;
static const int inflate_threshold_root = 30;
317
/* RCU callback: actually release a fib_alias after the grace period */
static void __alias_free_mem(struct rcu_head *head)
{
	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
	kmem_cache_free(fn_alias_kmem, fa);
}

/* Defer freeing of a fib_alias until concurrent readers are done */
static inline void alias_free_mem_rcu(struct fib_alias *fa)
{
	call_rcu(&fa->rcu, __alias_free_mem);
}
328
/* Largest tnode 'bits' for which the whole node still fits in a page
 * (and therefore came from kmalloc rather than vmalloc; see tnode_alloc)
 */
#define TNODE_KMALLOC_MAX \
	ilog2((PAGE_SIZE - sizeof(struct tnode)) / sizeof(struct tnode *))

/* RCU callback: release a node via the allocator that produced it */
static void __node_free_rcu(struct rcu_head *head)
{
	struct tnode *n = container_of(head, struct tnode, rcu);

	if (IS_LEAF(n))
		kmem_cache_free(trie_leaf_kmem, n);
	else if (n->bits <= TNODE_KMALLOC_MAX)
		kfree(n);
	else
		vfree(n);
}

#define node_free(n) call_rcu(&n->rcu, __node_free_rcu)
345
/* Defer freeing of a leaf_info until after an RCU grace period */
static inline void free_leaf_info(struct leaf_info *leaf)
{
	kfree_rcu(leaf, rcu);
}
350
351 static struct tnode *tnode_alloc(size_t size)
352 {
353 if (size <= PAGE_SIZE)
354 return kzalloc(size, GFP_KERNEL);
355 else
356 return vzalloc(size);
357 }
358
/* Queue a tnode for freeing once the current resize pass finishes;
 * the embedded rcu_head doubles as the pending-list linkage, which is
 * safe because the list is only touched under RTNL.
 */
static void tnode_free_safe(struct tnode *tn)
{
	BUG_ON(IS_LEAF(tn));
	tn->rcu.next = tnode_free_head;
	tnode_free_head = &tn->rcu;
}

/* Hand every queued tnode to RCU; after sync_pages worth of memory has
 * been queued, force a grace period so freeing cannot outrun the
 * callbacks (important with PREEMPT_NONE, see sync_pages).
 */
static void tnode_free_flush(void)
{
	struct callback_head *head;

	while ((head = tnode_free_head)) {
		struct tnode *tn = container_of(head, struct tnode, rcu);

		tnode_free_head = head->next;
		tnode_free_size += offsetof(struct tnode, child[1 << tn->bits]);

		node_free(tn);
	}

	if (tnode_free_size >= PAGE_SIZE * sync_pages) {
		tnode_free_size = 0;
		synchronize_rcu();
	}
}
384
/* Allocate and initialise a new leaf for 'key'; returns NULL on OOM */
static struct tnode *leaf_new(t_key key)
{
	struct tnode *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
	if (l) {
		l->parent = NULL;
		/* set key and pos to reflect full key value
		 * any trailing zeros in the key should be ignored
		 * as the nodes are searched
		 */
		l->key = key;
		l->pos = KEYLENGTH;
		/* set bits to 0 indicating we are not a tnode */
		l->bits = 0;

		INIT_HLIST_HEAD(&l->list);
	}
	return l;
}
403
/* Allocate a leaf_info for prefix length 'plen' with an empty alias
 * list; returns NULL on OOM.
 */
static struct leaf_info *leaf_info_new(int plen)
{
	struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
	if (li) {
		li->plen = plen;
		li->mask_plen = ntohl(inet_make_mask(plen));
		INIT_LIST_HEAD(&li->falh);
	}
	return li;
}
414
/* Allocate an internal node whose key prefix covers 'pos' bits and
 * whose child array is indexed by the following 'bits' bits; all child
 * slots start empty.  Returns NULL on OOM.
 */
static struct tnode *tnode_new(t_key key, int pos, int bits)
{
	size_t sz = offsetof(struct tnode, child[1 << bits]);
	struct tnode *tn = tnode_alloc(sz);
	unsigned int shift = pos + bits;

	/* verify bits and pos their msb bits clear and values are valid */
	BUG_ON(!bits || (shift > KEYLENGTH));

	if (tn) {
		tn->parent = NULL;
		tn->pos = pos;
		tn->bits = bits;
		/* only the top 'pos' bits of the key are significant here */
		tn->key = mask_pfx(key, pos);
		tn->full_children = 0;
		tn->empty_children = 1<<bits;
	}

	pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
		 sizeof(struct tnode *) << bits);
	return tn;
}
437
438 /*
439 * Check whether a tnode 'n' is "full", i.e. it is an internal node
440 * and no bits are skipped. See discussion in dyntree paper p. 6
441 */
442
443 static inline int tnode_full(const struct tnode *tn, const struct tnode *n)
444 {
445 return n && IS_TNODE(n) && (n->pos == (tn->pos + tn->bits));
446 }
447
/* Set child i of tn, letting tnode_put_child_reorg recompute the old
 * slot's fullness itself (wasfull == -1).
 */
static inline void put_child(struct tnode *tn, int i,
		     struct tnode *n)
{
	tnode_put_child_reorg(tn, i, n, -1);
}
453
/*
 * Add a child at position i overwriting the old value.
 * Update the value of full_children and empty_children.
 * wasfull == -1 means "sample the old child's fullness here";
 * otherwise the caller sampled it earlier (trie_rebalance does so
 * before resize() may have replaced the child).
 */
static void tnode_put_child_reorg(struct tnode *tn, int i, struct tnode *n,
				  int wasfull)
{
	struct tnode *chi = rtnl_dereference(tn->child[i]);
	int isfull;

	BUG_ON(i >= 1<<tn->bits);

	/* update emptyChildren */
	if (n == NULL && chi != NULL)
		tn->empty_children++;
	else if (n != NULL && chi == NULL)
		tn->empty_children--;

	/* update fullChildren */
	if (wasfull == -1)
		wasfull = tnode_full(tn, chi);

	isfull = tnode_full(tn, n);
	if (wasfull && !isfull)
		tn->full_children--;
	else if (!wasfull && isfull)
		tn->full_children++;

	/* set the parent before publishing the child pointer so readers
	 * never see a reachable node with a stale parent
	 */
	node_set_parent(n, tn);

	rcu_assign_pointer(tn->child[i], n);
}
487
/* Upper bound on inflate()/halve() passes per resize() call */
#define MAX_WORK 10

/* Rebalance a single tnode: inflate() while the doubled node would
 * stay dense enough, else halve() while it is too sparse, each bounded
 * by MAX_WORK passes.  A node left with one child (or none) is
 * collapsed away.  Returns the replacement node; an allocation failure
 * simply leaves the node as it was.
 */
static struct tnode *resize(struct trie *t, struct tnode *tn)
{
	struct tnode *old_tn, *n = NULL;
	int inflate_threshold_use;
	int halve_threshold_use;
	int max_work;

	if (!tn)
		return NULL;

	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
		 tn, inflate_threshold, halve_threshold);

	/* No children */
	if (tn->empty_children > (tnode_child_length(tn) - 1))
		goto no_children;

	/* One child */
	if (tn->empty_children == (tnode_child_length(tn) - 1))
		goto one_child;
	/*
	 * Double as long as the resulting node has a number of
	 * nonempty nodes that are above the threshold.
	 *
	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson
	 * of the Helsinki University of Technology and Matti Tikkanen of
	 * Nokia Telecommunications, page 6:
	 * "A node is doubled if the ratio of non-empty children to all
	 * children in the *doubled* node is at least 'high'."
	 *
	 * 'high' is the percentage 'inflate_threshold'.  The doubled node
	 * has 2 * tnode_child_length(tn) slots; the non-empty slots are
	 * the current non-empty children plus one extra slot for each
	 * "full" child, since only full children are split in two by
	 * inflate().  So the condition
	 *
	 *   100 * (nonempty + full) / (2 * tnode_child_length(tn))
	 *						>= inflate_threshold
	 *
	 * becomes, after clearing the division and halving the constant:
	 *
	 *   50 * (tn->full_children + tnode_child_length(tn)
	 *	   - tn->empty_children)
	 *				>= inflate_threshold *
	 *				   tnode_child_length(tn)
	 */

	/* Keep root node larger */

	if (!node_parent(tn)) {
		inflate_threshold_use = inflate_threshold_root;
		halve_threshold_use = halve_threshold_root;
	} else {
		inflate_threshold_use = inflate_threshold;
		halve_threshold_use = halve_threshold;
	}

	max_work = MAX_WORK;
	while ((tn->full_children > 0 &&  max_work-- &&
		50 * (tn->full_children + tnode_child_length(tn)
		      - tn->empty_children)
		>= inflate_threshold_use * tnode_child_length(tn))) {

		old_tn = tn;
		tn = inflate(t, tn);

		if (IS_ERR(tn)) {
			/* keep the old node; just skip this resize */
			tn = old_tn;
#ifdef CONFIG_IP_FIB_TRIE_STATS
			this_cpu_inc(t->stats->resize_node_skipped);
#endif
			break;
		}
	}

	/* Return if at least one inflate is run */
	if (max_work != MAX_WORK)
		return tn;

	/*
	 * Halve as long as the number of empty children in this
	 * node is above threshold.
	 */

	max_work = MAX_WORK;
	while (tn->bits > 1 &&  max_work-- &&
	       100 * (tnode_child_length(tn) - tn->empty_children) <
	       halve_threshold_use * tnode_child_length(tn)) {

		old_tn = tn;
		tn = halve(t, tn);
		if (IS_ERR(tn)) {
			/* keep the old node; just skip this resize */
			tn = old_tn;
#ifdef CONFIG_IP_FIB_TRIE_STATS
			this_cpu_inc(t->stats->resize_node_skipped);
#endif
			break;
		}
	}


	/* Only one child remains */
	if (tn->empty_children == (tnode_child_length(tn) - 1)) {
		unsigned long i;
one_child:
		/* find the single remaining child (n stays NULL if none) */
		for (i = tnode_child_length(tn); !n && i;)
			n = tnode_get_child(tn, --i);
no_children:
		/* compress one level */
		node_set_parent(n, NULL);
		tnode_free_safe(tn);
		return n;
	}
	return tn;
}
641
642
/* Free a half-built tnode together with any children already attached;
 * only used on the error paths of inflate()/halve(), before the new
 * node was ever published to readers.
 */
static void tnode_clean_free(struct tnode *tn)
{
	struct tnode *tofree;
	int i;

	for (i = 0; i < tnode_child_length(tn); i++) {
		tofree = rtnl_dereference(tn->child[i]);
		if (tofree)
			node_free(tofree);
	}
	node_free(tn);
}
655
/* Replace oldtnode with a node using one more index bit (double the
 * children).  Full children with bits > 1 are themselves split into a
 * left/right pair one bit further down; leaves and skip-bit children
 * drop straight into their new slot.  Returns the new (unpublished)
 * node, or ERR_PTR(-ENOMEM) leaving oldtnode untouched.
 */
static struct tnode *inflate(struct trie *t, struct tnode *oldtnode)
{
	int olen = tnode_child_length(oldtnode);
	struct tnode *tn;
	int i;

	pr_debug("In inflate\n");

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);

	if (!tn)
		return ERR_PTR(-ENOMEM);

	/*
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and inflate
	 * of tnode is ignored.
	 */

	for (i = 0; i < olen; i++) {
		struct tnode *inode;

		inode = tnode_get_child(oldtnode, i);
		if (tnode_full(oldtnode, inode) && inode->bits > 1) {
			struct tnode *left, *right;
			/* single bit set at position inode->pos */
			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;

			left = tnode_new(inode->key&(~m), inode->pos + 1,
					 inode->bits - 1);
			if (!left)
				goto nomem;

			right = tnode_new(inode->key|m, inode->pos + 1,
					  inode->bits - 1);

			if (!right) {
				node_free(left);
				goto nomem;
			}

			put_child(tn, 2*i, left);
			put_child(tn, 2*i+1, right);
		}
	}

	for (i = 0; i < olen; i++) {
		struct tnode *inode = tnode_get_child(oldtnode, i);
		struct tnode *left, *right;
		int size, j;

		/* An empty child */
		if (inode == NULL)
			continue;

		/* A leaf or an internal node with skipped bits */
		if (!tnode_full(oldtnode, inode)) {
			put_child(tn,
				tkey_extract_bits(inode->key, tn->pos, tn->bits),
				inode);
			continue;
		}

		/* An internal node with two children */
		if (inode->bits == 1) {
			put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
			put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));

			tnode_free_safe(inode);
			continue;
		}

		/* An internal node with more than two children */

		/* We will replace this node 'inode' with two new
		 * ones, 'left' and 'right', each with half of the
		 * original children. The two new nodes will have
		 * a position one bit further down the key and this
		 * means that the "significant" part of their keys
		 * (see the discussion near the top of this file)
		 * will differ by one bit, which will be "0" in
		 * left's key and "1" in right's key. Since we are
		 * moving the key position by one step, the bit that
		 * we are moving away from - the bit at position
		 * (inode->pos) - is the one that will differ between
		 * left and right. So... we synthesize that bit in the
		 * two new keys.  The left/right pair was preallocated
		 * in the first pass above.
		 */

		left = tnode_get_child(tn, 2*i);
		put_child(tn, 2*i, NULL);

		BUG_ON(!left);

		right = tnode_get_child(tn, 2*i+1);
		put_child(tn, 2*i+1, NULL);

		BUG_ON(!right);

		/* first half of inode's children go left, second half right */
		size = tnode_child_length(left);
		for (j = 0; j < size; j++) {
			put_child(left, j, rtnl_dereference(inode->child[j]));
			put_child(right, j, rtnl_dereference(inode->child[j + size]));
		}
		put_child(tn, 2*i, resize(t, left));
		put_child(tn, 2*i+1, resize(t, right));

		tnode_free_safe(inode);
	}
	tnode_free_safe(oldtnode);
	return tn;
nomem:
	tnode_clean_free(tn);
	return ERR_PTR(-ENOMEM);
}
777
/* Replace oldtnode with a node using one fewer index bit (half the
 * children).  Sibling pairs that are both present are joined under a
 * new one-bit tnode; lone children drop straight into their new slot.
 * Returns the new (unpublished) node, or ERR_PTR(-ENOMEM) leaving
 * oldtnode untouched.
 */
static struct tnode *halve(struct trie *t, struct tnode *oldtnode)
{
	int olen = tnode_child_length(oldtnode);
	struct tnode *tn, *left, *right;
	int i;

	pr_debug("In halve\n");

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);

	if (!tn)
		return ERR_PTR(-ENOMEM);

	/*
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and halve
	 * of tnode is ignored.
	 */

	for (i = 0; i < olen; i += 2) {
		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);

		/* Two nonempty children */
		if (left && right) {
			struct tnode *newn;

			newn = tnode_new(left->key, tn->pos + tn->bits, 1);

			if (!newn)
				goto nomem;

			put_child(tn, i/2, newn);
		}

	}

	for (i = 0; i < olen; i += 2) {
		struct tnode *newBinNode;

		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);

		/* At least one of the children is empty */
		if (left == NULL) {
			if (right == NULL)    /* Both are empty */
				continue;
			put_child(tn, i/2, right);
			continue;
		}

		if (right == NULL) {
			put_child(tn, i/2, left);
			continue;
		}

		/* Two nonempty children: hang both under the binary
		 * tnode preallocated in the first pass
		 */
		newBinNode = tnode_get_child(tn, i/2);
		put_child(tn, i/2, NULL);
		put_child(newBinNode, 0, left);
		put_child(newBinNode, 1, right);
		put_child(tn, i/2, resize(t, newBinNode));
	}
	tnode_free_safe(oldtnode);
	return tn;
nomem:
	tnode_clean_free(tn);
	return ERR_PTR(-ENOMEM);
}
848
849 /* readside must use rcu_read_lock currently dump routines
850 via get_fa_head and dump */
851
852 static struct leaf_info *find_leaf_info(struct tnode *l, int plen)
853 {
854 struct hlist_head *head = &l->list;
855 struct leaf_info *li;
856
857 hlist_for_each_entry_rcu(li, head, hlist)
858 if (li->plen == plen)
859 return li;
860
861 return NULL;
862 }
863
864 static inline struct list_head *get_fa_head(struct tnode *l, int plen)
865 {
866 struct leaf_info *li = find_leaf_info(l, plen);
867
868 if (!li)
869 return NULL;
870
871 return &li->falh;
872 }
873
/* Insert 'new' into a leaf's info list, kept sorted by descending
 * prefix length so the longest prefix is encountered first.
 */
static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
{
	struct leaf_info *li = NULL, *last = NULL;

	if (hlist_empty(head)) {
		hlist_add_head_rcu(&new->hlist, head);
	} else {
		hlist_for_each_entry(li, head, hlist) {
			if (new->plen > li->plen)
				break;

			last = li;
		}
		/* last == NULL: 'new' has the largest plen, goes first */
		if (last)
			hlist_add_behind_rcu(&new->hlist, &last->hlist);
		else
			hlist_add_before_rcu(&new->hlist, &li->hlist);
	}
}
893
/* rcu_read_lock needs to be hold by caller from readside */
/* Walk the trie looking for the leaf whose key equals 'key' exactly;
 * returns that leaf or NULL.
 */
static struct tnode *fib_find_node(struct trie *t, u32 key)
{
	struct tnode *n = rcu_dereference_rtnl(t->trie);

	while (n) {
		unsigned long index = get_index(key, n);

		/* This bit of code is a bit tricky but it combines multiple
		 * checks into a single check.  The prefix consists of the
		 * prefix plus zeros for the bits in the cindex. The index
		 * is the difference between the key and this value.  From
		 * this we can actually derive several pieces of data.
		 *   if !(index >> bits)
		 *     we know the value is cindex
		 *   else
		 *     we have a mismatch in skip bits and failed
		 */
		if (index >> n->bits)
			return NULL;

		/* we have found a leaf. Prefixes have already been compared */
		if (IS_LEAF(n))
			break;

		n = rcu_dereference_rtnl(n->child[index]);
	}

	return n;
}
924
/* Walk from tn back up to the root, resize()-ing each node and
 * re-attaching the result to its parent; the root pointer itself is
 * republished with rcu_assign_pointer.  Queued tnodes are flushed to
 * RCU after every step.
 */
static void trie_rebalance(struct trie *t, struct tnode *tn)
{
	int wasfull;
	t_key cindex, key;
	struct tnode *tp;

	key = tn->key;

	while (tn != NULL && (tp = node_parent(tn)) != NULL) {
		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
		/* sample fullness before resize() may replace the child */
		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
		tn = resize(t, tn);

		tnode_put_child_reorg(tp, cindex, tn, wasfull);

		tp = node_parent(tn);
		if (!tp)
			rcu_assign_pointer(t->trie, tn);

		tnode_free_flush();
		if (!tp)
			break;
		tn = tp;
	}

	/* Handle last (top) tnode */
	if (IS_TNODE(tn))
		tn = resize(t, tn);

	rcu_assign_pointer(t->trie, tn);
	tnode_free_flush();
}
957
/* only used from updater-side */

/* Find or create the leaf for 'key', attach a fresh leaf_info for
 * 'plen' and return its alias list head (NULL on allocation failure).
 * May insert an intermediate tnode; the trie is rebalanced before
 * returning unless the key matched an existing leaf exactly.
 */
static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
{
	int pos, newpos;
	struct tnode *tp = NULL, *tn = NULL;
	struct tnode *n;
	struct tnode *l;
	int missbit;
	struct list_head *fa_head = NULL;
	struct leaf_info *li;
	t_key cindex;

	pos = 0;
	n = rtnl_dereference(t->trie);

	/* If we point to NULL, stop. Either the tree is empty and we should
	 * just put a new leaf in if, or we have reached an empty child slot,
	 * and we should just put our new leaf in that.
	 * If we point to a T_TNODE, check if it matches our key. Note that
	 * a T_TNODE might be skipping any number of bits - its 'pos' need
	 * not be the parent's 'pos'+'bits'!
	 *
	 * If it does match the current key, get pos/bits from it, extract
	 * the index from our key, push the T_TNODE and walk the tree.
	 *
	 * If it doesn't, we have to replace it with a new T_TNODE.
	 *
	 * If we point to a T_LEAF, it might or might not have the same key
	 * as we do. If it does, just change the value, update the T_LEAF's
	 * value, and return it.
	 * If it doesn't, we need to replace it with a T_TNODE.
	 */

	while (n && IS_TNODE(n)) {
		if (tkey_sub_equals(n->key, pos, n->pos-pos, key)) {
			tp = n;
			pos = n->pos + n->bits;
			n = tnode_get_child(n,
					    tkey_extract_bits(key,
							      n->pos,
							      n->bits));

			BUG_ON(n && node_parent(n) != tp);
		} else
			break;
	}

	/*
	 * n  ----> NULL, LEAF or TNODE
	 *
	 * tp is n's (parent) ----> NULL or TNODE
	 */

	BUG_ON(tp && IS_LEAF(tp));

	/* Case 1: n is a leaf. Compare prefixes */

	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
		li = leaf_info_new(plen);

		if (!li)
			return NULL;

		fa_head = &li->falh;
		insert_leaf_info(&n->list, li);
		goto done;
	}
	l = leaf_new(key);

	if (!l)
		return NULL;

	li = leaf_info_new(plen);

	if (!li) {
		node_free(l);
		return NULL;
	}

	fa_head = &li->falh;
	insert_leaf_info(&l->list, li);

	if (t->trie && n == NULL) {
		/* Case 2: n is NULL, and will just insert a new leaf */

		node_set_parent(l, tp);

		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
		put_child(tp, cindex, l);
	} else {
		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
		/*
		 *  Add a new tnode here
		 *  first tnode need some special handling
		 */

		if (n) {
			/* branch at the first bit where key and n->key differ */
			pos = tp ? tp->pos+tp->bits : 0;
			newpos = tkey_mismatch(key, pos, n->key);
			tn = tnode_new(n->key, newpos, 1);
		} else {
			newpos = 0;
			tn = tnode_new(key, newpos, 1); /* First tnode */
		}

		if (!tn) {
			free_leaf_info(li);
			node_free(l);
			return NULL;
		}

		node_set_parent(tn, tp);

		missbit = tkey_extract_bits(key, newpos, 1);
		put_child(tn, missbit, l);
		put_child(tn, 1-missbit, n);

		if (tp) {
			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
			put_child(tp, cindex, tn);
		} else {
			rcu_assign_pointer(t->trie, tn);
		}

		tp = tn;
	}

	if (tp && tp->pos + tp->bits > 32)
		pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
			tp, tp->pos, tp->bits, key, plen);

	/* Rebalance the trie */

	trie_rebalance(t, tp);
done:
	return fa_head;
}
1096
/*
 * Insert/replace/append a route in table @tb from netlink config @cfg.
 *
 * Caller must hold RTNL.  Returns 0 on success or a negative errno
 * (-EINVAL for bad prefix, -EEXIST with NLM_F_EXCL, -ENOENT without
 * NLM_F_CREATE, -ENOBUFS/-ENOMEM on allocation failure).
 */
int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
{
	struct trie *t = (struct trie *) tb->tb_data;
	struct fib_alias *fa, *new_fa;
	struct list_head *fa_head = NULL;
	struct fib_info *fi;
	int plen = cfg->fc_dst_len;
	u8 tos = cfg->fc_tos;
	u32 key, mask;
	int err;
	struct tnode *l;

	/* IPv4 prefixes are at most 32 bits. */
	if (plen > 32)
		return -EINVAL;

	key = ntohl(cfg->fc_dst);

	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);

	mask = ntohl(inet_make_mask(plen));

	/* Reject a destination with host bits set outside the prefix. */
	if (key & ~mask)
		return -EINVAL;

	key = key & mask;

	fi = fib_create_info(cfg);
	if (IS_ERR(fi)) {
		err = PTR_ERR(fi);
		goto err;
	}

	/* Locate existing leaf and first alias with matching tos/priority. */
	l = fib_find_node(t, key);
	fa = NULL;

	if (l) {
		fa_head = get_fa_head(l, plen);
		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
	}

	/* Now fa, if non-NULL, points to the first fib alias
	 * with the same keys [prefix,tos,priority], if such key already
	 * exists or to the node before which we will insert new one.
	 *
	 * If fa is NULL, we will need to allocate a new one and
	 * insert to the head of f.
	 *
	 * If f is NULL, no fib node matched the destination key
	 * and we need to allocate a new one of those as well.
	 */

	if (fa && fa->fa_tos == tos &&
	    fa->fa_info->fib_priority == fi->fib_priority) {
		struct fib_alias *fa_first, *fa_match;

		err = -EEXIST;
		if (cfg->fc_nlflags & NLM_F_EXCL)
			goto out;

		/* We have 2 goals:
		 * 1. Find exact match for type, scope, fib_info to avoid
		 * duplicate routes
		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
		 */
		fa_match = NULL;
		fa_first = fa;
		/* Step back one so the _continue loop starts at fa itself. */
		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
		list_for_each_entry_continue(fa, fa_head, fa_list) {
			if (fa->fa_tos != tos)
				break;
			if (fa->fa_info->fib_priority != fi->fib_priority)
				break;
			if (fa->fa_type == cfg->fc_type &&
			    fa->fa_info == fi) {
				fa_match = fa;
				break;
			}
		}

		if (cfg->fc_nlflags & NLM_F_REPLACE) {
			struct fib_info *fi_drop;
			u8 state;

			fa = fa_first;
			if (fa_match) {
				/* Replacing with an identical entry is a no-op. */
				if (fa == fa_match)
					err = 0;
				goto out;
			}
			err = -ENOBUFS;
			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
			if (new_fa == NULL)
				goto out;

			fi_drop = fa->fa_info;
			new_fa->fa_tos = fa->fa_tos;
			new_fa->fa_info = fi;
			new_fa->fa_type = cfg->fc_type;
			state = fa->fa_state;
			new_fa->fa_state = state & ~FA_S_ACCESSED;

			/* Swap in the replacement; readers see old or new. */
			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
			alias_free_mem_rcu(fa);

			fib_release_info(fi_drop);
			if (state & FA_S_ACCESSED)
				rt_cache_flush(cfg->fc_nlinfo.nl_net);
			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);

			goto succeeded;
		}
		/* Error if we find a perfect match which
		 * uses the same scope, type, and nexthop
		 * information.
		 */
		if (fa_match)
			goto out;

		if (!(cfg->fc_nlflags & NLM_F_APPEND))
			fa = fa_first;
	}
	err = -ENOENT;
	if (!(cfg->fc_nlflags & NLM_F_CREATE))
		goto out;

	err = -ENOBUFS;
	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
	if (new_fa == NULL)
		goto out;

	new_fa->fa_info = fi;
	new_fa->fa_tos = tos;
	new_fa->fa_type = cfg->fc_type;
	new_fa->fa_state = 0;
	/*
	 * Insert new entry to the list.
	 */

	if (!fa_head) {
		/* No leaf for this key/plen yet; create it. */
		fa_head = fib_insert_node(t, key, plen);
		if (unlikely(!fa_head)) {
			err = -ENOMEM;
			goto out_free_new_fa;
		}
	}

	if (!plen)
		tb->tb_num_default++;

	list_add_tail_rcu(&new_fa->fa_list,
			  (fa ? &fa->fa_list : fa_head));

	rt_cache_flush(cfg->fc_nlinfo.nl_net);
	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
		  &cfg->fc_nlinfo, 0);
succeeded:
	return 0;

out_free_new_fa:
	kmem_cache_free(fn_alias_kmem, new_fa);
out:
	fib_release_info(fi);
err:
	return err;
}
1266
/*
 * Semantic match: test leaf @l against flow @flp and fill in @res on a hit.
 *
 * Walks every leaf_info (prefix length) hung off the leaf, then every alias
 * on that prefix, applying tos/scope/oif filters and skipping dead entries.
 *
 * Returns 0 on match (res filled, fib_clntref taken unless
 * FIB_LOOKUP_NOREF), a negative fib_props error for reject-type routes,
 * or 1 when nothing matched (caller backtracks).
 *
 * should be called with rcu_read_lock
 */
static int check_leaf(struct fib_table *tb, struct trie *t, struct tnode *l,
		      t_key key, const struct flowi4 *flp,
		      struct fib_result *res, int fib_flags)
{
	struct leaf_info *li;
	struct hlist_head *hhead = &l->list;

	hlist_for_each_entry_rcu(li, hhead, hlist) {
		struct fib_alias *fa;

		/* Prefix check: key masked to this li's length must equal
		 * the leaf key.
		 */
		if (l->key != (key & li->mask_plen))
			continue;

		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
			struct fib_info *fi = fa->fa_info;
			int nhsel, err;

			if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
				continue;
			if (fi->fib_dead)
				continue;
			if (fa->fa_info->fib_scope < flp->flowi4_scope)
				continue;
			fib_alias_accessed(fa);
			err = fib_props[fa->fa_type].error;
			if (unlikely(err < 0)) {
				/* Reject-type route: report its error. */
#ifdef CONFIG_IP_FIB_TRIE_STATS
				this_cpu_inc(t->stats->semantic_match_passed);
#endif
				return err;
			}
			if (fi->fib_flags & RTNH_F_DEAD)
				continue;
			for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
				const struct fib_nh *nh = &fi->fib_nh[nhsel];

				if (nh->nh_flags & RTNH_F_DEAD)
					continue;
				if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
					continue;

#ifdef CONFIG_IP_FIB_TRIE_STATS
				this_cpu_inc(t->stats->semantic_match_passed);
#endif
				res->prefixlen = li->plen;
				res->nh_sel = nhsel;
				res->type = fa->fa_type;
				res->scope = fi->fib_scope;
				res->fi = fi;
				res->table = tb;
				res->fa_head = &li->falh;
				if (!(fib_flags & FIB_LOOKUP_NOREF))
					atomic_inc(&fi->fib_clntref);
				return 0;
			}
		}

#ifdef CONFIG_IP_FIB_TRIE_STATS
		this_cpu_inc(t->stats->semantic_match_miss);
#endif
	}

	return 1;
}
1332
/*
 * Return non-zero iff @key differs from node @n's prefix in any bit at or
 * above the prefix's least significant set bit.
 *
 * (prefix | -prefix) is a mask covering the lowest set bit of the prefix
 * and everything above it, so the result is zero exactly when key matches
 * the prefix over that region.
 */
static inline t_key prefix_mismatch(t_key key, struct tnode *n)
{
	t_key prefix = n->key;

	return (key ^ prefix) & (prefix | -prefix);
}
1339
/*
 * Longest-prefix-match lookup of flp->daddr in table @tb.
 *
 * Two phases: descend to the deepest candidate node, then backtrack,
 * stripping the least significant set bit of the child index at each
 * level, until check_leaf() accepts a leaf.
 *
 * Runs under rcu_read_lock.  Returns check_leaf()'s result (0 on match,
 * negative error for reject routes) or 1 when the trie is empty or no
 * prefix matches.
 */
int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
		     struct fib_result *res, int fib_flags)
{
	struct trie *t = (struct trie *)tb->tb_data;
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie_use_stats __percpu *stats = t->stats;
#endif
	const t_key key = ntohl(flp->daddr);
	struct tnode *n, *pn;
	t_key cindex;
	int ret = 1;

	rcu_read_lock();

	n = rcu_dereference(t->trie);
	if (!n)
		goto failed;

#ifdef CONFIG_IP_FIB_TRIE_STATS
	this_cpu_inc(stats->gets);
#endif

	pn = n;
	cindex = 0;

	/* Step 1: Travel to the longest prefix match in the trie */
	for (;;) {
		unsigned long index = get_index(key, n);

		/* This bit of code is a bit tricky but it combines multiple
		 * checks into a single check.  The prefix consists of the
		 * prefix plus zeros for the "bits" in the prefix. The index
		 * is the difference between the key and this value.  From
		 * this we can actually derive several pieces of data.
		 *   if !(index >> bits)
		 *     we know the value is child index
		 *   else
		 *     we have a mismatch in skip bits and failed
		 */
		if (index >> n->bits)
			break;

		/* we have found a leaf. Prefixes have already been compared */
		if (IS_LEAF(n))
			goto found;

		/* only record pn and cindex if we are going to be chopping
		 * bits later.  Otherwise we are just wasting cycles.
		 */
		if (index) {
			pn = n;
			cindex = index;
		}

		n = rcu_dereference(n->child[index]);
		if (unlikely(!n))
			goto backtrace;
	}

	/* Step 2: Sort out leaves and begin backtracing for longest prefix */
	for (;;) {
		/* record the pointer where our next node pointer is stored */
		struct tnode __rcu **cptr = n->child;

		/* This test verifies that none of the bits that differ
		 * between the key and the prefix exist in the region of
		 * the lsb and higher in the prefix.
		 */
		if (unlikely(prefix_mismatch(key, n)))
			goto backtrace;

		/* exit out and process leaf */
		if (unlikely(IS_LEAF(n)))
			break;

		/* Don't bother recording parent info.  Since we are in
		 * prefix match mode we will have to come back to wherever
		 * we started this traversal anyway
		 */

		/* NOTE: the backtrace label is *inside* this loop body so
		 * the descent paths above can jump straight into the
		 * bit-stripping machinery.
		 */
		while ((n = rcu_dereference(*cptr)) == NULL) {
backtrace:
#ifdef CONFIG_IP_FIB_TRIE_STATS
			if (!n)
				this_cpu_inc(stats->null_node_hit);
#endif
			/* If we are at cindex 0 there are no more bits for
			 * us to strip at this level so we must ascend back
			 * up one level to see if there are any more bits to
			 * be stripped there.
			 */
			while (!cindex) {
				t_key pkey = pn->key;

				pn = node_parent_rcu(pn);
				if (unlikely(!pn))
					goto failed;
#ifdef CONFIG_IP_FIB_TRIE_STATS
				this_cpu_inc(stats->backtrack);
#endif
				/* Get Child's index */
				cindex = get_index(pkey, pn);
			}

			/* strip the least significant bit from the cindex */
			cindex &= cindex - 1;

			/* grab pointer for next child node */
			cptr = &pn->child[cindex];
		}
	}

found:
	/* Step 3: Process the leaf, if that fails fall back to backtracing */
	ret = check_leaf(tb, t, n, key, flp, res, fib_flags);
	if (unlikely(ret > 0))
		goto backtrace;
failed:
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(fib_table_lookup);
1462
/*
 * Unlink leaf @l from the trie, rebalance its parent, and free the leaf.
 *
 * If the leaf is the root, the trie becomes empty.  Freeing is done via
 * node_free(); readers are protected by the RCU publish of the NULL/parent
 * pointer before the free.
 */
static void trie_leaf_remove(struct trie *t, struct tnode *l)
{
	struct tnode *tp = node_parent(l);

	pr_debug("entering trie_leaf_remove(%p)\n", l);

	if (tp) {
		t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
		put_child(tp, cindex, NULL);
		trie_rebalance(t, tp);
	} else
		RCU_INIT_POINTER(t->trie, NULL);

	node_free(l);
}
1481
/*
 * Delete the route described by @cfg from table @tb.
 *
 * Caller must hold RTNL.  Returns 0 on success, -EINVAL for a bad prefix,
 * or -ESRCH when no matching leaf/prefix/alias exists.
 */
int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
{
	struct trie *t = (struct trie *) tb->tb_data;
	u32 key, mask;
	int plen = cfg->fc_dst_len;
	u8 tos = cfg->fc_tos;
	struct fib_alias *fa, *fa_to_delete;
	struct list_head *fa_head;
	struct tnode *l;
	struct leaf_info *li;

	if (plen > 32)
		return -EINVAL;

	key = ntohl(cfg->fc_dst);
	mask = ntohl(inet_make_mask(plen));

	/* Host bits outside the prefix are invalid. */
	if (key & ~mask)
		return -EINVAL;

	key = key & mask;
	l = fib_find_node(t, key);

	if (!l)
		return -ESRCH;

	li = find_leaf_info(l, plen);

	if (!li)
		return -ESRCH;

	fa_head = &li->falh;
	fa = fib_find_alias(fa_head, tos, 0);

	if (!fa)
		return -ESRCH;

	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);

	fa_to_delete = NULL;
	/* Step back one so the _continue loop starts at fa itself. */
	fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
	list_for_each_entry_continue(fa, fa_head, fa_list) {
		struct fib_info *fi = fa->fa_info;

		if (fa->fa_tos != tos)
			break;

		/* Unspecified fields in cfg act as wildcards. */
		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
		     fa->fa_info->fib_scope == cfg->fc_scope) &&
		    (!cfg->fc_prefsrc ||
		     fi->fib_prefsrc == cfg->fc_prefsrc) &&
		    (!cfg->fc_protocol ||
		     fi->fib_protocol == cfg->fc_protocol) &&
		    fib_nh_match(cfg, fi) == 0) {
			fa_to_delete = fa;
			break;
		}
	}

	if (!fa_to_delete)
		return -ESRCH;

	fa = fa_to_delete;
	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
		  &cfg->fc_nlinfo, 0);

	list_del_rcu(&fa->fa_list);

	if (!plen)
		tb->tb_num_default--;

	/* Reap now-empty containers: leaf_info first, then the leaf. */
	if (list_empty(fa_head)) {
		hlist_del_rcu(&li->hlist);
		free_leaf_info(li);
	}

	if (hlist_empty(&l->list))
		trie_leaf_remove(t, l);

	if (fa->fa_state & FA_S_ACCESSED)
		rt_cache_flush(cfg->fc_nlinfo.nl_net);

	fib_release_info(fa->fa_info);
	alias_free_mem_rcu(fa);
	return 0;
}
1572
/*
 * Remove all aliases on @head whose fib_info is marked RTNH_F_DEAD.
 * Returns the number of aliases removed.
 */
static int trie_flush_list(struct list_head *head)
{
	struct fib_alias *fa, *fa_node;
	int found = 0;

	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
		struct fib_info *fi = fa->fa_info;

		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
			list_del_rcu(&fa->fa_list);
			fib_release_info(fa->fa_info);
			alias_free_mem_rcu(fa);
			found++;
		}
	}
	return found;
}
1590
/*
 * Flush dead aliases from every prefix on leaf @l, freeing any leaf_info
 * that becomes empty.  Returns the total number of aliases removed.
 * The leaf itself is not removed here; the caller handles empty leaves.
 */
static int trie_flush_leaf(struct tnode *l)
{
	int found = 0;
	struct hlist_head *lih = &l->list;
	struct hlist_node *tmp;
	struct leaf_info *li = NULL;

	hlist_for_each_entry_safe(li, tmp, lih, hlist) {
		found += trie_flush_list(&li->falh);

		if (list_empty(&li->falh)) {
			hlist_del_rcu(&li->hlist);
			free_leaf_info(li);
		}
	}
	return found;
}
1608
/*
 * Scan for the next right leaf starting at node p->child[idx]
 * Since we have back pointer, no recursion necessary.
 *
 * @p: tnode to start scanning from
 * @c: previously visited child of @p (NULL to start at child 0)
 *
 * Returns the next leaf in key order, or NULL when the walk has passed
 * the last leaf of the trie.
 */
static struct tnode *leaf_walk_rcu(struct tnode *p, struct tnode *c)
{
	do {
		t_key idx;

		/* Resume just after the child we came from. */
		if (c)
			idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
		else
			idx = 0;

		while (idx < 1u << p->bits) {
			c = tnode_get_child_rcu(p, idx++);
			if (!c)
				continue;

			if (IS_LEAF(c))
				return c;

			/* Rescan start scanning in new node */
			p = c;
			idx = 0;
		}

		/* Node empty, walk back up to parent */
		c = p;
	} while ((p = node_parent_rcu(c)) != NULL);

	return NULL; /* Root of trie */
}
1642
/* Return the first (lowest-key) leaf of trie @t, or NULL if empty. */
static struct tnode *trie_firstleaf(struct trie *t)
{
	struct tnode *n = rcu_dereference_rtnl(t->trie);

	if (!n)
		return NULL;

	if (IS_LEAF(n))		/* trie is just a leaf */
		return n;

	return leaf_walk_rcu(n, NULL);
}
1655
/* Return the leaf following @l in key order, or NULL if @l is the last. */
static struct tnode *trie_nextleaf(struct tnode *l)
{
	struct tnode *p = node_parent_rcu(l);

	if (!p)
		return NULL;	/* trie with just one leaf */

	return leaf_walk_rcu(p, l);
}
1665
/* Return the @index:th leaf (0-based, in key order), or NULL if the
 * trie has fewer than index+1 leaves.
 */
static struct tnode *trie_leafindex(struct trie *t, int index)
{
	struct tnode *leaf;

	for (leaf = trie_firstleaf(t); leaf && index > 0; index--)
		leaf = trie_nextleaf(leaf);

	return leaf;
}
1675
/*
 * Flush all routes whose fib_info is dead from table @tb.
 *
 * Caller must hold RTNL.  Leaf removal is deferred by one iteration
 * (via @ll) so trie_nextleaf() is never called on a leaf we just
 * unlinked.  Returns the number of aliases removed.
 */
int fib_table_flush(struct fib_table *tb)
{
	struct trie *t = (struct trie *) tb->tb_data;
	struct tnode *l, *ll = NULL;
	int found = 0;

	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
		found += trie_flush_leaf(l);

		/* Remove the *previous* leaf if flushing emptied it. */
		if (ll && hlist_empty(&ll->list))
			trie_leaf_remove(t, ll);
		ll = l;
	}

	/* Handle the final leaf, which the loop above never revisits. */
	if (ll && hlist_empty(&ll->list))
		trie_leaf_remove(t, ll);

	pr_debug("trie_flush found=%d\n", found);
	return found;
}
1700
/* Free table @tb and its embedded trie's per-cpu statistics. */
void fib_free_table(struct fib_table *tb)
{
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie *t = (struct trie *)tb->tb_data;

	free_percpu(t->stats);
#endif /* CONFIG_IP_FIB_TRIE_STATS */
	kfree(tb);
}
1710
/*
 * Dump the aliases of one prefix (@key/@plen) into a netlink skb.
 *
 * cb->args[5] holds the alias index to resume from after a partial dump.
 * Returns skb->len on completion or -1 when the skb filled up (the resume
 * index is saved so the next call continues where this one stopped).
 */
static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
			   struct fib_table *tb,
			   struct sk_buff *skb, struct netlink_callback *cb)
{
	int i, s_i;
	struct fib_alias *fa;
	__be32 xkey = htonl(key);

	s_i = cb->args[5];
	i = 0;

	/* rcu_read_lock is hold by caller */

	list_for_each_entry_rcu(fa, fah, fa_list) {
		/* Skip aliases already dumped in a previous call. */
		if (i < s_i) {
			i++;
			continue;
		}

		if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq,
				  RTM_NEWROUTE,
				  tb->tb_id,
				  fa->fa_type,
				  xkey,
				  plen,
				  fa->fa_tos,
				  fa->fa_info, NLM_F_MULTI) < 0) {
			cb->args[5] = i;
			return -1;
		}
		i++;
	}
	cb->args[5] = i;
	return skb->len;
}
1747
/*
 * Dump all prefixes hanging off leaf @l into a netlink skb.
 *
 * cb->args[4] holds the leaf_info index to resume from; cb->args[5]
 * (the per-prefix alias index) is reset when moving to a new prefix.
 * Returns skb->len on completion or -1 when the skb filled up.
 */
static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb,
			struct sk_buff *skb, struct netlink_callback *cb)
{
	struct leaf_info *li;
	int i, s_i;

	s_i = cb->args[4];
	i = 0;

	/* rcu_read_lock is hold by caller */
	hlist_for_each_entry_rcu(li, &l->list, hlist) {
		if (i < s_i) {
			i++;
			continue;
		}

		/* Fresh prefix: restart the alias cursor. */
		if (i > s_i)
			cb->args[5] = 0;

		if (list_empty(&li->falh))
			continue;

		if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
			cb->args[4] = i;
			return -1;
		}
		i++;
	}

	cb->args[4] = i;
	return skb->len;
}
1780
/*
 * Netlink dump of every route in table @tb.
 *
 * Continuation state across calls: cb->args[2] caches the last dumped
 * leaf key and cb->args[3] the leaf count, so a resumed dump can jump
 * straight back to the right leaf (falling back to a slow positional
 * rescan if that leaf has since been removed).
 */
int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
		   struct netlink_callback *cb)
{
	struct tnode *l;
	struct trie *t = (struct trie *) tb->tb_data;
	t_key key = cb->args[2];
	int count = cb->args[3];

	rcu_read_lock();
	/* Dump starting at last key.
	 * Note: 0.0.0.0/0 (ie default) is first key.
	 */
	if (count == 0)
		l = trie_firstleaf(t);
	else {
		/* Normally, continue from last key, but if that is missing
		 * fallback to using slow rescan
		 */
		l = fib_find_node(t, key);
		if (!l)
			l = trie_leafindex(t, count);
	}

	while (l) {
		cb->args[2] = l->key;
		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
			cb->args[3] = count;
			rcu_read_unlock();
			return -1;
		}

		++count;
		l = trie_nextleaf(l);
		/* Clear the per-leaf cursors (args[4], args[5]) before the
		 * next leaf so it starts from its first prefix/alias.
		 */
		memset(&cb->args[4], 0,
		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
	}
	cb->args[3] = count;
	rcu_read_unlock();

	return skb->len;
}
1822
/* Create the slab caches used for aliases and trie nodes (boot time). */
void __init fib_trie_init(void)
{
	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
					  sizeof(struct fib_alias),
					  0, SLAB_PANIC, NULL);

	/* One cache serves both leaves and leaf_info; size to the larger. */
	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
					   max(sizeof(struct tnode),
					       sizeof(struct leaf_info)),
					   0, SLAB_PANIC, NULL);
}
1834
1835
/*
 * Allocate a new fib_table with id @id backed by an empty trie.
 *
 * The struct trie lives in the flexible tb_data area appended to the
 * fib_table allocation.  Returns NULL on allocation failure.
 */
struct fib_table *fib_trie_table(u32 id)
{
	struct fib_table *tb;
	struct trie *t;

	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
		     GFP_KERNEL);
	if (tb == NULL)
		return NULL;

	tb->tb_id = id;
	tb->tb_default = -1;
	tb->tb_num_default = 0;

	t = (struct trie *) tb->tb_data;
	RCU_INIT_POINTER(t->trie, NULL);
#ifdef CONFIG_IP_FIB_TRIE_STATS
	/* Stats are mandatory when configured in; fail the whole table
	 * if the per-cpu allocation fails.
	 */
	t->stats = alloc_percpu(struct trie_use_stats);
	if (!t->stats) {
		kfree(tb);
		tb = NULL;
	}
#endif

	return tb;
}
1862
#ifdef CONFIG_PROC_FS
/* Depth first Trie walk iterator */
struct fib_trie_iter {
	struct seq_net_private p;
	struct fib_table *tb;		/* table currently being walked */
	struct tnode *tnode;		/* parent of the node last returned */
	unsigned int index;		/* next child index to visit in tnode */
	unsigned int depth;		/* current depth below the root */
};
1872
/*
 * Advance the depth-first walk and return the next node (tnode or leaf),
 * or NULL when the whole trie has been visited.  Iterator state in @iter
 * is updated so a subsequent call continues the walk.
 */
static struct tnode *fib_trie_get_next(struct fib_trie_iter *iter)
{
	struct tnode *tn = iter->tnode;
	unsigned int cindex = iter->index;
	struct tnode *p;

	/* A single entry routing table */
	if (!tn)
		return NULL;

	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
		 iter->tnode, iter->index, iter->depth);
rescan:
	while (cindex < (1<<tn->bits)) {
		struct tnode *n = tnode_get_child_rcu(tn, cindex);

		if (n) {
			if (IS_LEAF(n)) {
				/* Stay at this level; resume past the leaf. */
				iter->tnode = tn;
				iter->index = cindex + 1;
			} else {
				/* push down one level */
				iter->tnode = n;
				iter->index = 0;
				++iter->depth;
			}
			return n;
		}

		++cindex;
	}

	/* Current node exhausted, pop back up */
	p = node_parent_rcu(tn);
	if (p) {
		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
		tn = p;
		--iter->depth;
		goto rescan;
	}

	/* got root? */
	return NULL;
}
1917
/*
 * Initialize @iter for a walk of trie @t and return the root node, or
 * NULL for a missing/empty trie.  A lone-leaf trie leaves iter->tnode
 * NULL so fib_trie_get_next() terminates immediately.
 */
static struct tnode *fib_trie_get_first(struct fib_trie_iter *iter,
				       struct trie *t)
{
	struct tnode *n;

	if (!t)
		return NULL;

	n = rcu_dereference(t->trie);
	if (!n)
		return NULL;

	if (IS_TNODE(n)) {
		iter->tnode = n;
		iter->index = 0;
		iter->depth = 1;
	} else {
		iter->tnode = NULL;
		iter->index = 0;
		iter->depth = 0;
	}

	return n;
}
1942
/*
 * Walk trie @t once and aggregate structural statistics (leaf/tnode
 * counts, prefix count, depth, node size histogram, NULL child slots)
 * into @s for /proc/net/fib_triestats.
 */
static void trie_collect_stats(struct trie *t, struct trie_stat *s)
{
	struct tnode *n;
	struct fib_trie_iter iter;

	memset(s, 0, sizeof(*s));

	rcu_read_lock();
	for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
		if (IS_LEAF(n)) {
			struct leaf_info *li;

			s->leaves++;
			s->totdepth += iter.depth;
			if (iter.depth > s->maxdepth)
				s->maxdepth = iter.depth;

			/* Each leaf_info is one stored prefix. */
			hlist_for_each_entry_rcu(li, &n->list, hlist)
				++s->prefixes;
		} else {
			int i;

			s->tnodes++;
			if (n->bits < MAX_STAT_DEPTH)
				s->nodesizes[n->bits]++;

			for (i = 0; i < tnode_child_length(n); i++)
				if (!rcu_access_pointer(n->child[i]))
					s->nullpointers++;
		}
	}
	rcu_read_unlock();
}
1976
/*
 * This outputs /proc/net/fib_triestats
 *
 * Pretty-prints the collected trie_stat and an estimate of the memory
 * consumed by leaves, prefixes, tnodes and child pointer arrays.
 */
static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
{
	unsigned int i, max, pointers, bytes, avdepth;

	/* Average depth scaled by 100 to print two decimal places. */
	if (stat->leaves)
		avdepth = stat->totdepth*100 / stat->leaves;
	else
		avdepth = 0;

	seq_printf(seq, "\tAver depth:     %u.%02d\n",
		   avdepth / 100, avdepth % 100);
	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);

	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
	bytes = sizeof(struct tnode) * stat->leaves;

	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
	bytes += sizeof(struct leaf_info) * stat->prefixes;

	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
	bytes += sizeof(struct tnode) * stat->tnodes;

	/* Trim trailing empty buckets from the size histogram. */
	max = MAX_STAT_DEPTH;
	while (max > 0 && stat->nodesizes[max-1] == 0)
		max--;

	pointers = 0;
	for (i = 1; i < max; i++)
		if (stat->nodesizes[i] != 0) {
			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
			pointers += (1<<i) * stat->nodesizes[i];
		}
	seq_putc(seq, '\n');
	seq_printf(seq, "\tPointers: %u\n", pointers);

	bytes += sizeof(struct tnode *) * pointers;
	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
}
2019
#ifdef CONFIG_IP_FIB_TRIE_STATS
/*
 * Sum the per-cpu lookup counters and print them as the "Counters"
 * section of /proc/net/fib_triestats.
 */
static void trie_show_usage(struct seq_file *seq,
			    const struct trie_use_stats __percpu *stats)
{
	struct trie_use_stats s = { 0 };
	int cpu;

	/* loop through all of the CPUs and gather up the stats */
	for_each_possible_cpu(cpu) {
		const struct trie_use_stats *pcpu = per_cpu_ptr(stats, cpu);

		s.gets += pcpu->gets;
		s.backtrack += pcpu->backtrack;
		s.semantic_match_passed += pcpu->semantic_match_passed;
		s.semantic_match_miss += pcpu->semantic_match_miss;
		s.null_node_hit += pcpu->null_node_hit;
		s.resize_node_skipped += pcpu->resize_node_skipped;
	}

	seq_printf(seq, "\nCounters:\n---------\n");
	seq_printf(seq, "gets = %u\n", s.gets);
	seq_printf(seq, "backtracks = %u\n", s.backtrack);
	seq_printf(seq, "semantic match passed = %u\n",
		   s.semantic_match_passed);
	seq_printf(seq, "semantic match miss = %u\n", s.semantic_match_miss);
	seq_printf(seq, "null node hit= %u\n", s.null_node_hit);
	seq_printf(seq, "skipped node resize = %u\n\n", s.resize_node_skipped);
}
#endif /*  CONFIG_IP_FIB_TRIE_STATS */
2049
2050 static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2051 {
2052 if (tb->tb_id == RT_TABLE_LOCAL)
2053 seq_puts(seq, "Local:\n");
2054 else if (tb->tb_id == RT_TABLE_MAIN)
2055 seq_puts(seq, "Main:\n");
2056 else
2057 seq_printf(seq, "Id %d:\n", tb->tb_id);
2058 }
2059
2060
/*
 * seq_file show callback for /proc/net/fib_triestats: for every table in
 * every hash bucket, collect structural stats and (when configured)
 * usage counters, and print them.
 */
static int fib_triestat_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;
	unsigned int h;

	/* Leaves and tnodes share struct tnode in this implementation,
	 * hence the same sizeof twice.
	 */
	seq_printf(seq,
		   "Basic info: size of leaf:"
		   " %Zd bytes, size of tnode: %Zd bytes.\n",
		   sizeof(struct tnode), sizeof(struct tnode));

	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct fib_table *tb;

		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
			struct trie *t = (struct trie *) tb->tb_data;
			struct trie_stat stat;

			if (!t)
				continue;

			fib_table_print(seq, tb);

			trie_collect_stats(t, &stat);
			trie_show_stats(seq, &stat);
#ifdef CONFIG_IP_FIB_TRIE_STATS
			trie_show_usage(seq, t->stats);
#endif
		}
	}

	return 0;
}
2094
/* open() handler for /proc/net/fib_triestats (single-shot, per-netns). */
static int fib_triestat_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, fib_triestat_seq_show);
}
2099
/* file_operations for /proc/net/fib_triestats */
static const struct file_operations fib_triestat_fops = {
	.owner	= THIS_MODULE,
	.open	= fib_triestat_seq_open,
	.read	= seq_read,
	.llseek	= seq_lseek,
	.release = single_release_net,
};
2107
/*
 * Position the /proc/net/fib_trie iterator at global position @pos,
 * counting nodes across every table in every hash bucket.  Returns the
 * node at that position (iter->tb set to its table) or NULL past the end.
 */
static struct tnode *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
{
	struct fib_trie_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
	loff_t idx = 0;
	unsigned int h;

	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct fib_table *tb;

		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
			struct tnode *n;

			for (n = fib_trie_get_first(iter,
						    (struct trie *) tb->tb_data);
			     n; n = fib_trie_get_next(iter))
				if (pos == idx++) {
					iter->tb = tb;
					return n;
				}
		}
	}

	return NULL;
}
2134
/* seq_file start: take the RCU read lock (held until stop) and seek. */
static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	rcu_read_lock();
	return fib_trie_get_idx(seq, *pos);
}
2141
/*
 * seq_file next: advance to the next node, rolling over to the next
 * table on the same hash chain and then to subsequent hash chains.
 */
static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct fib_trie_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
	struct fib_table *tb = iter->tb;
	struct hlist_node *tb_node;
	unsigned int h;
	struct tnode *n;

	++*pos;
	/* next node in same table */
	n = fib_trie_get_next(iter);
	if (n)
		return n;

	/* walk rest of this hash chain */
	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
	while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
		if (n)
			goto found;
	}

	/* new hash chain */
	while (++h < FIB_TABLE_HASHSZ) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
			if (n)
				goto found;
		}
	}
	return NULL;

found:
	iter->tb = tb;
	return n;
}
2181
/* seq_file stop: drop the RCU read lock taken in fib_trie_seq_start. */
static void fib_trie_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}
2187
/* Write @n single-space characters to @seq for tree indentation. */
static void seq_indent(struct seq_file *seq, int n)
{
	for (; n > 0; n--)
		seq_puts(seq, "   ");
}
2193
/*
 * Return a printable name for route scope @s.  Unknown scopes are
 * formatted into the caller-supplied @buf, which must stay alive while
 * the returned pointer is used.
 */
static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
{
	switch (s) {
	case RT_SCOPE_UNIVERSE: return "universe";
	case RT_SCOPE_SITE:	return "site";
	case RT_SCOPE_LINK:	return "link";
	case RT_SCOPE_HOST:	return "host";
	case RT_SCOPE_NOWHERE:	return "nowhere";
	default:
		snprintf(buf, len, "scope=%d", s);
		return buf;
	}
}
2207
/* Printable names for RTN_* route types, indexed by type value. */
static const char *const rtn_type_names[__RTN_MAX] = {
	[RTN_UNSPEC] = "UNSPEC",
	[RTN_UNICAST] = "UNICAST",
	[RTN_LOCAL] = "LOCAL",
	[RTN_BROADCAST] = "BROADCAST",
	[RTN_ANYCAST] = "ANYCAST",
	[RTN_MULTICAST] = "MULTICAST",
	[RTN_BLACKHOLE] = "BLACKHOLE",
	[RTN_UNREACHABLE] = "UNREACHABLE",
	[RTN_PROHIBIT] = "PROHIBIT",
	[RTN_THROW] = "THROW",
	[RTN_NAT] = "NAT",
	[RTN_XRESOLVE] = "XRESOLVE",
};
2222
/*
 * Return a printable name for route type @t, formatting unknown types
 * into the caller-supplied @buf.
 */
static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
{
	if (t < __RTN_MAX && rtn_type_names[t])
		return rtn_type_names[t];
	snprintf(buf, len, "type %u", t);
	return buf;
}
2230
/*
 * Pretty print the trie
 *
 * seq_file show callback for /proc/net/fib_trie: one line per tnode
 * ("+--") or leaf ("|--"), with each leaf's prefixes/aliases indented
 * beneath it.
 */
static int fib_trie_seq_show(struct seq_file *seq, void *v)
{
	const struct fib_trie_iter *iter = seq->private;
	struct tnode *n = v;

	/* A parentless node is a trie root: print the table heading. */
	if (!node_parent_rcu(n))
		fib_table_print(seq, iter->tb);

	if (IS_TNODE(n)) {
		__be32 prf = htonl(n->key);

		seq_indent(seq, iter->depth-1);
		seq_printf(seq, "  +-- %pI4/%d %d %d %d\n",
			   &prf, n->pos, n->bits, n->full_children,
			   n->empty_children);
	} else {
		struct leaf_info *li;
		__be32 val = htonl(n->key);

		seq_indent(seq, iter->depth);
		seq_printf(seq, "  |-- %pI4\n", &val);

		hlist_for_each_entry_rcu(li, &n->list, hlist) {
			struct fib_alias *fa;

			list_for_each_entry_rcu(fa, &li->falh, fa_list) {
				char buf1[32], buf2[32];

				seq_indent(seq, iter->depth+1);
				seq_printf(seq, "  /%d %s %s", li->plen,
					   rtn_scope(buf1, sizeof(buf1),
						     fa->fa_info->fib_scope),
					   rtn_type(buf2, sizeof(buf2),
						    fa->fa_type));
				if (fa->fa_tos)
					seq_printf(seq, " tos=%d", fa->fa_tos);
				seq_putc(seq, '\n');
			}
		}
	}

	return 0;
}
2275
/* seq_operations for /proc/net/fib_trie */
static const struct seq_operations fib_trie_seq_ops = {
	.start  = fib_trie_seq_start,
	.next   = fib_trie_seq_next,
	.stop   = fib_trie_seq_stop,
	.show   = fib_trie_seq_show,
};
2282
/* open() handler for /proc/net/fib_trie (per-netns seq_file). */
static int fib_trie_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &fib_trie_seq_ops,
			    sizeof(struct fib_trie_iter));
}
2288
/* file_operations for /proc/net/fib_trie */
static const struct file_operations fib_trie_fops = {
	.owner  = THIS_MODULE,
	.open   = fib_trie_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
	.release = seq_release_net,
};
2296
/* Iterator state for /proc/net/route over the main table's trie. */
struct fib_route_iter {
	struct seq_net_private p;
	struct trie *main_trie;		/* RT_TABLE_MAIN trie */
	loff_t	pos;			/* leaf position of the cached key */
	t_key	key;			/* key of last leaf returned */
};
2303
2304 static struct tnode *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2305 {
2306 struct tnode *l = NULL;
2307 struct trie *t = iter->main_trie;
2308
2309 /* use cache location of last found key */
2310 if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2311 pos -= iter->pos;
2312 else {
2313 iter->pos = 0;
2314 l = trie_firstleaf(t);
2315 }
2316
2317 while (l && pos-- > 0) {
2318 iter->pos++;
2319 l = trie_nextleaf(l);
2320 }
2321
2322 if (l)
2323 iter->key = pos; /* remember it */
2324 else
2325 iter->pos = 0; /* forget it */
2326
2327 return l;
2328 }
2329
/*
 * seq_file start for /proc/net/route: lock RCU, locate the main table,
 * and return SEQ_START_TOKEN for the header line at position 0.
 */
static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct fib_route_iter *iter = seq->private;
	struct fib_table *tb;

	rcu_read_lock();
	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
	if (!tb)
		return NULL;

	iter->main_trie = (struct trie *) tb->tb_data;
	if (*pos == 0)
		return SEQ_START_TOKEN;
	else
		/* *pos - 1 because position 0 is the header token. */
		return fib_route_get_idx(iter, *pos - 1);
}
2347
/*
 * seq_file next for /proc/net/route: step to the following leaf and
 * refresh the (pos, key) resume cache used by fib_route_get_idx().
 */
static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct fib_route_iter *iter = seq->private;
	struct tnode *l = v;

	++*pos;
	if (v == SEQ_START_TOKEN) {
		iter->pos = 0;
		l = trie_firstleaf(iter->main_trie);
	} else {
		iter->pos++;
		l = trie_nextleaf(l);
	}

	if (l)
		iter->key = l->key;
	else
		iter->pos = 0;
	return l;
}
2368
/* seq_file stop: release the RCU read lock from fib_route_seq_start. */
static void fib_route_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}
2374
2375 static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2376 {
2377 unsigned int flags = 0;
2378
2379 if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
2380 flags = RTF_REJECT;
2381 if (fi && fi->fib_nh->nh_gw)
2382 flags |= RTF_GATEWAY;
2383 if (mask == htonl(0xFFFFFFFF))
2384 flags |= RTF_HOST;
2385 flags |= RTF_UP;
2386 return flags;
2387 }
2388
/*
 *	This outputs /proc/net/route.
 *
 *	The format of the file is not supposed to be changed
 *	and needs to be same as fib_hash output to avoid breaking
 *	legacy utilities
 */
static int fib_route_seq_show(struct seq_file *seq, void *v)
{
	struct tnode *l = v;
	struct leaf_info *li;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
			   "\tWindow\tIRTT");
		return 0;
	}

	hlist_for_each_entry_rcu(li, &l->list, hlist) {
		struct fib_alias *fa;
		__be32 mask, prefix;

		mask = inet_make_mask(li->plen);
		prefix = htonl(l->key);

		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
			const struct fib_info *fi = fa->fa_info;
			unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);

			/* Broadcast/multicast entries are not part of the
			 * legacy route table format.
			 */
			if (fa->fa_type == RTN_BROADCAST
			    || fa->fa_type == RTN_MULTICAST)
				continue;

			/* Fixed-width 127-char records for legacy parsers. */
			seq_setwidth(seq, 127);

			if (fi)
				seq_printf(seq,
					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
					 "%d\t%08X\t%d\t%u\t%u",
					 fi->fib_dev ? fi->fib_dev->name : "*",
					 prefix,
					 fi->fib_nh->nh_gw, flags, 0, 0,
					 fi->fib_priority,
					 mask,
					 (fi->fib_advmss ?
					  fi->fib_advmss + 40 : 0),
					 fi->fib_window,
					 fi->fib_rtt >> 3);
			else
				seq_printf(seq,
					 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
					 "%d\t%08X\t%d\t%u\t%u",
					 prefix, 0, flags, 0, 0, 0,
					 mask, 0, 0, 0);

			seq_pad(seq, '\n');
		}
	}

	return 0;
}
2450
/* seq_file iterator callbacks backing /proc/net/route */
static const struct seq_operations fib_route_seq_ops = {
	.start  = fib_route_seq_start,
	.next   = fib_route_seq_next,
	.stop   = fib_route_seq_stop,
	.show   = fib_route_seq_show,
};
2457
/* Open /proc/net/route: attach the seq iterator with per-open,
 * per-netns private iterator state (struct fib_route_iter).
 */
static int fib_route_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &fib_route_seq_ops,
			    sizeof(struct fib_route_iter));
}
2463
/* file_operations for /proc/net/route (read-only seq_file) */
static const struct file_operations fib_route_fops = {
	.owner  = THIS_MODULE,
	.open   = fib_route_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
	.release = seq_release_net,
};
2471
2472 int __net_init fib_proc_init(struct net *net)
2473 {
2474 if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops))
2475 goto out1;
2476
2477 if (!proc_create("fib_triestat", S_IRUGO, net->proc_net,
2478 &fib_triestat_fops))
2479 goto out2;
2480
2481 if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops))
2482 goto out3;
2483
2484 return 0;
2485
2486 out3:
2487 remove_proc_entry("fib_triestat", net->proc_net);
2488 out2:
2489 remove_proc_entry("fib_trie", net->proc_net);
2490 out1:
2491 return -ENOMEM;
2492 }
2493
/* Tear down the per-netns FIB proc entries created by fib_proc_init() */
void __net_exit fib_proc_exit(struct net *net)
{
	remove_proc_entry("fib_trie", net->proc_net);
	remove_proc_entry("fib_triestat", net->proc_net);
	remove_proc_entry("route", net->proc_net);
}
2500
2501 #endif /* CONFIG_PROC_FS */