linux/net/ipv4/fib_trie.c
<<
>>
Prefs
   1/*
   2 *   This program is free software; you can redistribute it and/or
   3 *   modify it under the terms of the GNU General Public License
   4 *   as published by the Free Software Foundation; either version
   5 *   2 of the License, or (at your option) any later version.
   6 *
   7 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
   8 *     & Swedish University of Agricultural Sciences.
   9 *
  10 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
  11 *     Agricultural Sciences.
  12 *
  13 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
  14 *
  15 * This work is based on the LPC-trie which is originally described in:
  16 *
  17 * An experimental study of compression methods for dynamic tries
  18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
  19 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
  20 *
  21 *
  22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
  23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
  24 *
  25 *
  26 * Code from fib_hash has been reused which includes the following header:
  27 *
  28 *
  29 * INET         An implementation of the TCP/IP protocol suite for the LINUX
  30 *              operating system.  INET is implemented using the  BSD Socket
  31 *              interface as the means of communication with the user level.
  32 *
  33 *              IPv4 FIB: lookup engine and maintenance routines.
  34 *
  35 *
  36 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  37 *
  38 *              This program is free software; you can redistribute it and/or
  39 *              modify it under the terms of the GNU General Public License
  40 *              as published by the Free Software Foundation; either version
  41 *              2 of the License, or (at your option) any later version.
  42 *
  43 * Substantial contributions to this work comes from:
  44 *
  45 *              David S. Miller, <davem@davemloft.net>
  46 *              Stephen Hemminger <shemminger@osdl.org>
  47 *              Paul E. McKenney <paulmck@us.ibm.com>
  48 *              Patrick McHardy <kaber@trash.net>
  49 */
  50
  51#define VERSION "0.409"
  52
  53#include <asm/uaccess.h>
  54#include <linux/bitops.h>
  55#include <linux/types.h>
  56#include <linux/kernel.h>
  57#include <linux/mm.h>
  58#include <linux/string.h>
  59#include <linux/socket.h>
  60#include <linux/sockios.h>
  61#include <linux/errno.h>
  62#include <linux/in.h>
  63#include <linux/inet.h>
  64#include <linux/inetdevice.h>
  65#include <linux/netdevice.h>
  66#include <linux/if_arp.h>
  67#include <linux/proc_fs.h>
  68#include <linux/rcupdate.h>
  69#include <linux/skbuff.h>
  70#include <linux/netlink.h>
  71#include <linux/init.h>
  72#include <linux/list.h>
  73#include <linux/slab.h>
  74#include <linux/export.h>
  75#include <net/net_namespace.h>
  76#include <net/ip.h>
  77#include <net/protocol.h>
  78#include <net/route.h>
  79#include <net/tcp.h>
  80#include <net/sock.h>
  81#include <net/ip_fib.h>
  82#include "fib_lookup.h"
  83
  84#define MAX_STAT_DEPTH 32
  85
  86#define KEYLENGTH (8*sizeof(t_key))
  87
  88typedef unsigned int t_key;
  89
  90#define T_TNODE 0
  91#define T_LEAF  1
  92#define NODE_TYPE_MASK  0x1UL
  93#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
  94
  95#define IS_TNODE(n) (!(n->parent & T_LEAF))
  96#define IS_LEAF(n) (n->parent & T_LEAF)
  97
  98struct rt_trie_node {
  99        unsigned long parent;
 100        t_key key;
 101};
 102
 103struct leaf {
 104        unsigned long parent;
 105        t_key key;
 106        struct hlist_head list;
 107        struct rcu_head rcu;
 108};
 109
 110struct leaf_info {
 111        struct hlist_node hlist;
 112        int plen;
 113        u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
 114        struct list_head falh;
 115        struct rcu_head rcu;
 116};
 117
 118struct tnode {
 119        unsigned long parent;
 120        t_key key;
 121        unsigned char pos;              /* 2log(KEYLENGTH) bits needed */
 122        unsigned char bits;             /* 2log(KEYLENGTH) bits needed */
 123        unsigned int full_children;     /* KEYLENGTH bits needed */
 124        unsigned int empty_children;    /* KEYLENGTH bits needed */
 125        union {
 126                struct rcu_head rcu;
 127                struct tnode *tnode_free;
 128        };
 129        struct rt_trie_node __rcu *child[0];
 130};
 131
 132#ifdef CONFIG_IP_FIB_TRIE_STATS
 133struct trie_use_stats {
 134        unsigned int gets;
 135        unsigned int backtrack;
 136        unsigned int semantic_match_passed;
 137        unsigned int semantic_match_miss;
 138        unsigned int null_node_hit;
 139        unsigned int resize_node_skipped;
 140};
 141#endif
 142
 143struct trie_stat {
 144        unsigned int totdepth;
 145        unsigned int maxdepth;
 146        unsigned int tnodes;
 147        unsigned int leaves;
 148        unsigned int nullpointers;
 149        unsigned int prefixes;
 150        unsigned int nodesizes[MAX_STAT_DEPTH];
 151};
 152
 153struct trie {
 154        struct rt_trie_node __rcu *trie;
 155#ifdef CONFIG_IP_FIB_TRIE_STATS
 156        struct trie_use_stats stats;
 157#endif
 158};
 159
 160static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
 161                                  int wasfull);
 162static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
 163static struct tnode *inflate(struct trie *t, struct tnode *tn);
 164static struct tnode *halve(struct trie *t, struct tnode *tn);
 165/* tnodes to free after resize(); protected by RTNL */
 166static struct tnode *tnode_free_head;
 167static size_t tnode_free_size;
 168
 169/*
 170 * synchronize_rcu after call_rcu for that many pages; it should be especially
 171 * useful before resizing the root node with PREEMPT_NONE configs; the value was
 172 * obtained experimentally, aiming to avoid visible slowdown.
 173 */
 174static const int sync_pages = 128;
 175
 176static struct kmem_cache *fn_alias_kmem __read_mostly;
 177static struct kmem_cache *trie_leaf_kmem __read_mostly;
 178
 179/*
 180 * caller must hold RTNL
 181 */
 182static inline struct tnode *node_parent(const struct rt_trie_node *node)
 183{
 184        unsigned long parent;
 185
 186        parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held());
 187
 188        return (struct tnode *)(parent & ~NODE_TYPE_MASK);
 189}
 190
 191/*
 192 * caller must hold RCU read lock or RTNL
 193 */
 194static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node)
 195{
 196        unsigned long parent;
 197
 198        parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() ||
 199                                                           lockdep_rtnl_is_held());
 200
 201        return (struct tnode *)(parent & ~NODE_TYPE_MASK);
 202}
 203
 204/* Same as rcu_assign_pointer
 205 * but that macro() assumes that value is a pointer.
 206 */
 207static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
 208{
 209        smp_wmb();
 210        node->parent = (unsigned long)ptr | NODE_TYPE(node);
 211}
 212
 213/*
 214 * caller must hold RTNL
 215 */
 216static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i)
 217{
 218        BUG_ON(i >= 1U << tn->bits);
 219
 220        return rtnl_dereference(tn->child[i]);
 221}
 222
 223/*
 224 * caller must hold RCU read lock or RTNL
 225 */
 226static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
 227{
 228        BUG_ON(i >= 1U << tn->bits);
 229
 230        return rcu_dereference_rtnl(tn->child[i]);
 231}
 232
 233static inline int tnode_child_length(const struct tnode *tn)
 234{
 235        return 1 << tn->bits;
 236}
 237
 238static inline t_key mask_pfx(t_key k, unsigned int l)
 239{
 240        return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
 241}
 242
 243static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
 244{
 245        if (offset < KEYLENGTH)
 246                return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
 247        else
 248                return 0;
 249}
 250
 251static inline int tkey_equals(t_key a, t_key b)
 252{
 253        return a == b;
 254}
 255
 256static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
 257{
 258        if (bits == 0 || offset >= KEYLENGTH)
 259                return 1;
 260        bits = bits > KEYLENGTH ? KEYLENGTH : bits;
 261        return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
 262}
 263
 264static inline int tkey_mismatch(t_key a, int offset, t_key b)
 265{
 266        t_key diff = a ^ b;
 267        int i = offset;
 268
 269        if (!diff)
 270                return 0;
 271        while ((diff << i) >> (KEYLENGTH-1) == 0)
 272                i++;
 273        return i;
 274}
 275
 276/*
 277  To understand this stuff, an understanding of keys and all their bits is
 278  necessary. Every node in the trie has a key associated with it, but not
 279  all of the bits in that key are significant.
 280
 281  Consider a node 'n' and its parent 'tp'.
 282
 283  If n is a leaf, every bit in its key is significant. Its presence is
 284  necessitated by path compression, since during a tree traversal (when
 285  searching for a leaf - unless we are doing an insertion) we will completely
 286  ignore all skipped bits we encounter. Thus we need to verify, at the end of
 287  a potentially successful search, that we have indeed been walking the
 288  correct key path.
 289
 290  Note that we can never "miss" the correct key in the tree if present by
 291  following the wrong path. Path compression ensures that segments of the key
 292  that are the same for all keys with a given prefix are skipped, but the
 293  skipped part *is* identical for each node in the subtrie below the skipped
 294  bit! trie_insert() in this implementation takes care of that - note the
 295  call to tkey_sub_equals() in trie_insert().
 296
 297  if n is an internal node - a 'tnode' here, the various parts of its key
 298  have many different meanings.
 299
 300  Example:
 301  _________________________________________________________________
 302  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
 303  -----------------------------------------------------------------
 304    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
 305
 306  _________________________________________________________________
 307  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
 308  -----------------------------------------------------------------
 309   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
 310
 311  tp->pos = 7
 312  tp->bits = 3
 313  n->pos = 15
 314  n->bits = 4
 315
 316  First, let's just ignore the bits that come before the parent tp, that is
 317  the bits from 0 to (tp->pos-1). They are *known* but at this point we do
 318  not use them for anything.
 319
 320  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
 321  index into the parent's child array. That is, they will be used to find
 322  'n' among tp's children.
 323
 324  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
 325  for the node n.
 326
 327  All the bits we have seen so far are significant to the node n. The rest
 328  of the bits are really not needed or indeed known in n->key.
 329
 330  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
 331  n's child array, and will of course be different for each child.
 332
 333
 334  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
 335  at this point.
 336
 337*/
 338
 339static inline void check_tnode(const struct tnode *tn)
 340{
 341        WARN_ON(tn && tn->pos+tn->bits > 32);
 342}
 343
 344static const int halve_threshold = 25;
 345static const int inflate_threshold = 50;
 346static const int halve_threshold_root = 15;
 347static const int inflate_threshold_root = 30;
 348
 349static void __alias_free_mem(struct rcu_head *head)
 350{
 351        struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
 352        kmem_cache_free(fn_alias_kmem, fa);
 353}
 354
 355static inline void alias_free_mem_rcu(struct fib_alias *fa)
 356{
 357        call_rcu(&fa->rcu, __alias_free_mem);
 358}
 359
 360static void __leaf_free_rcu(struct rcu_head *head)
 361{
 362        struct leaf *l = container_of(head, struct leaf, rcu);
 363        kmem_cache_free(trie_leaf_kmem, l);
 364}
 365
 366static inline void free_leaf(struct leaf *l)
 367{
 368        call_rcu(&l->rcu, __leaf_free_rcu);
 369}
 370
 371static inline void free_leaf_info(struct leaf_info *leaf)
 372{
 373        kfree_rcu(leaf, rcu);
 374}
 375
 376static struct tnode *tnode_alloc(size_t size)
 377{
 378        if (size <= PAGE_SIZE)
 379                return kzalloc(size, GFP_KERNEL);
 380        else
 381                return vzalloc(size);
 382}
 383
 384static void __tnode_free_rcu(struct rcu_head *head)
 385{
 386        struct tnode *tn = container_of(head, struct tnode, rcu);
 387        size_t size = sizeof(struct tnode) +
 388                      (sizeof(struct rt_trie_node *) << tn->bits);
 389
 390        if (size <= PAGE_SIZE)
 391                kfree(tn);
 392        else
 393                vfree(tn);
 394}
 395
 396static inline void tnode_free(struct tnode *tn)
 397{
 398        if (IS_LEAF(tn))
 399                free_leaf((struct leaf *) tn);
 400        else
 401                call_rcu(&tn->rcu, __tnode_free_rcu);
 402}
 403
 404static void tnode_free_safe(struct tnode *tn)
 405{
 406        BUG_ON(IS_LEAF(tn));
 407        tn->tnode_free = tnode_free_head;
 408        tnode_free_head = tn;
 409        tnode_free_size += sizeof(struct tnode) +
 410                           (sizeof(struct rt_trie_node *) << tn->bits);
 411}
 412
 413static void tnode_free_flush(void)
 414{
 415        struct tnode *tn;
 416
 417        while ((tn = tnode_free_head)) {
 418                tnode_free_head = tn->tnode_free;
 419                tn->tnode_free = NULL;
 420                tnode_free(tn);
 421        }
 422
 423        if (tnode_free_size >= PAGE_SIZE * sync_pages) {
 424                tnode_free_size = 0;
 425                synchronize_rcu();
 426        }
 427}
 428
 429static struct leaf *leaf_new(void)
 430{
 431        struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
 432        if (l) {
 433                l->parent = T_LEAF;
 434                INIT_HLIST_HEAD(&l->list);
 435        }
 436        return l;
 437}
 438
 439static struct leaf_info *leaf_info_new(int plen)
 440{
 441        struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
 442        if (li) {
 443                li->plen = plen;
 444                li->mask_plen = ntohl(inet_make_mask(plen));
 445                INIT_LIST_HEAD(&li->falh);
 446        }
 447        return li;
 448}
 449
 450static struct tnode *tnode_new(t_key key, int pos, int bits)
 451{
 452        size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
 453        struct tnode *tn = tnode_alloc(sz);
 454
 455        if (tn) {
 456                tn->parent = T_TNODE;
 457                tn->pos = pos;
 458                tn->bits = bits;
 459                tn->key = key;
 460                tn->full_children = 0;
 461                tn->empty_children = 1<<bits;
 462        }
 463
 464        pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
 465                 sizeof(struct rt_trie_node *) << bits);
 466        return tn;
 467}
 468
 469/*
 470 * Check whether a tnode 'n' is "full", i.e. it is an internal node
 471 * and no bits are skipped. See discussion in dyntree paper p. 6
 472 */
 473
 474static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
 475{
 476        if (n == NULL || IS_LEAF(n))
 477                return 0;
 478
 479        return ((struct tnode *) n)->pos == tn->pos + tn->bits;
 480}
 481
 482static inline void put_child(struct tnode *tn, int i,
 483                             struct rt_trie_node *n)
 484{
 485        tnode_put_child_reorg(tn, i, n, -1);
 486}
 487
 488 /*
 489  * Add a child at position i overwriting the old value.
 490  * Update the value of full_children and empty_children.
 491  */
 492
 493static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
 494                                  int wasfull)
 495{
 496        struct rt_trie_node *chi = rtnl_dereference(tn->child[i]);
 497        int isfull;
 498
 499        BUG_ON(i >= 1<<tn->bits);
 500
 501        /* update emptyChildren */
 502        if (n == NULL && chi != NULL)
 503                tn->empty_children++;
 504        else if (n != NULL && chi == NULL)
 505                tn->empty_children--;
 506
 507        /* update fullChildren */
 508        if (wasfull == -1)
 509                wasfull = tnode_full(tn, chi);
 510
 511        isfull = tnode_full(tn, n);
 512        if (wasfull && !isfull)
 513                tn->full_children--;
 514        else if (!wasfull && isfull)
 515                tn->full_children++;
 516
 517        if (n)
 518                node_set_parent(n, tn);
 519
 520        rcu_assign_pointer(tn->child[i], n);
 521}
 522
 523#define MAX_WORK 10
 524static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
 525{
 526        int i;
 527        struct tnode *old_tn;
 528        int inflate_threshold_use;
 529        int halve_threshold_use;
 530        int max_work;
 531
 532        if (!tn)
 533                return NULL;
 534
 535        pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
 536                 tn, inflate_threshold, halve_threshold);
 537
 538        /* No children */
 539        if (tn->empty_children == tnode_child_length(tn)) {
 540                tnode_free_safe(tn);
 541                return NULL;
 542        }
 543        /* One child */
 544        if (tn->empty_children == tnode_child_length(tn) - 1)
 545                goto one_child;
 546        /*
 547         * Double as long as the resulting node has a number of
 548         * nonempty nodes that are above the threshold.
 549         */
 550
 551        /*
 552         * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
 553         * the Helsinki University of Technology and Matti Tikkanen of Nokia
 554         * Telecommunications, page 6:
 555         * "A node is doubled if the ratio of non-empty children to all
 556         * children in the *doubled* node is at least 'high'."
 557         *
 558         * 'high' in this instance is the variable 'inflate_threshold'. It
 559         * is expressed as a percentage, so we multiply it with
 560         * tnode_child_length() and instead of multiplying by 2 (since the
 561         * child array will be doubled by inflate()) and multiplying
 562         * the left-hand side by 100 (to handle the percentage thing) we
 563         * multiply the left-hand side by 50.
 564         *
 565         * The left-hand side may look a bit weird: tnode_child_length(tn)
 566         * - tn->empty_children is of course the number of non-null children
 567         * in the current node. tn->full_children is the number of "full"
 568         * children, that is non-null tnodes with a skip value of 0.
 569         * All of those will be doubled in the resulting inflated tnode, so
 570         * we just count them one extra time here.
 571         *
 572         * A clearer way to write this would be:
 573         *
 574         * to_be_doubled = tn->full_children;
 575         * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
 576         *     tn->full_children;
 577         *
 578         * new_child_length = tnode_child_length(tn) * 2;
 579         *
 580         * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
 581         *      new_child_length;
 582         * if (new_fill_factor >= inflate_threshold)
 583         *
 584         * ...and so on, tho it would mess up the while () loop.
 585         *
 586         * anyway,
 587         * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
 588         *      inflate_threshold
 589         *
 590         * avoid a division:
 591         * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
 592         *      inflate_threshold * new_child_length
 593         *
 594         * expand not_to_be_doubled and to_be_doubled, and shorten:
 595         * 100 * (tnode_child_length(tn) - tn->empty_children +
 596         *    tn->full_children) >= inflate_threshold * new_child_length
 597         *
 598         * expand new_child_length:
 599         * 100 * (tnode_child_length(tn) - tn->empty_children +
 600         *    tn->full_children) >=
 601         *      inflate_threshold * tnode_child_length(tn) * 2
 602         *
 603         * shorten again:
 604         * 50 * (tn->full_children + tnode_child_length(tn) -
 605         *    tn->empty_children) >= inflate_threshold *
 606         *    tnode_child_length(tn)
 607         *
 608         */
 609
 610        check_tnode(tn);
 611
 612        /* Keep root node larger  */
 613
 614        if (!node_parent((struct rt_trie_node *)tn)) {
 615                inflate_threshold_use = inflate_threshold_root;
 616                halve_threshold_use = halve_threshold_root;
 617        } else {
 618                inflate_threshold_use = inflate_threshold;
 619                halve_threshold_use = halve_threshold;
 620        }
 621
 622        max_work = MAX_WORK;
 623        while ((tn->full_children > 0 &&  max_work-- &&
 624                50 * (tn->full_children + tnode_child_length(tn)
 625                      - tn->empty_children)
 626                >= inflate_threshold_use * tnode_child_length(tn))) {
 627
 628                old_tn = tn;
 629                tn = inflate(t, tn);
 630
 631                if (IS_ERR(tn)) {
 632                        tn = old_tn;
 633#ifdef CONFIG_IP_FIB_TRIE_STATS
 634                        t->stats.resize_node_skipped++;
 635#endif
 636                        break;
 637                }
 638        }
 639
 640        check_tnode(tn);
 641
 642        /* Return if at least one inflate is run */
 643        if (max_work != MAX_WORK)
 644                return (struct rt_trie_node *) tn;
 645
 646        /*
 647         * Halve as long as the number of empty children in this
 648         * node is above threshold.
 649         */
 650
 651        max_work = MAX_WORK;
 652        while (tn->bits > 1 &&  max_work-- &&
 653               100 * (tnode_child_length(tn) - tn->empty_children) <
 654               halve_threshold_use * tnode_child_length(tn)) {
 655
 656                old_tn = tn;
 657                tn = halve(t, tn);
 658                if (IS_ERR(tn)) {
 659                        tn = old_tn;
 660#ifdef CONFIG_IP_FIB_TRIE_STATS
 661                        t->stats.resize_node_skipped++;
 662#endif
 663                        break;
 664                }
 665        }
 666
 667
 668        /* Only one child remains */
 669        if (tn->empty_children == tnode_child_length(tn) - 1) {
 670one_child:
 671                for (i = 0; i < tnode_child_length(tn); i++) {
 672                        struct rt_trie_node *n;
 673
 674                        n = rtnl_dereference(tn->child[i]);
 675                        if (!n)
 676                                continue;
 677
 678                        /* compress one level */
 679
 680                        node_set_parent(n, NULL);
 681                        tnode_free_safe(tn);
 682                        return n;
 683                }
 684        }
 685        return (struct rt_trie_node *) tn;
 686}
 687
 688
 689static void tnode_clean_free(struct tnode *tn)
 690{
 691        int i;
 692        struct tnode *tofree;
 693
 694        for (i = 0; i < tnode_child_length(tn); i++) {
 695                tofree = (struct tnode *)rtnl_dereference(tn->child[i]);
 696                if (tofree)
 697                        tnode_free(tofree);
 698        }
 699        tnode_free(tn);
 700}
 701
 702static struct tnode *inflate(struct trie *t, struct tnode *tn)
 703{
 704        struct tnode *oldtnode = tn;
 705        int olen = tnode_child_length(tn);
 706        int i;
 707
 708        pr_debug("In inflate\n");
 709
 710        tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
 711
 712        if (!tn)
 713                return ERR_PTR(-ENOMEM);
 714
 715        /*
 716         * Preallocate and store tnodes before the actual work so we
 717         * don't get into an inconsistent state if memory allocation
 718         * fails. In case of failure we return the oldnode and  inflate
 719         * of tnode is ignored.
 720         */
 721
 722        for (i = 0; i < olen; i++) {
 723                struct tnode *inode;
 724
 725                inode = (struct tnode *) tnode_get_child(oldtnode, i);
 726                if (inode &&
 727                    IS_TNODE(inode) &&
 728                    inode->pos == oldtnode->pos + oldtnode->bits &&
 729                    inode->bits > 1) {
 730                        struct tnode *left, *right;
 731                        t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
 732
 733                        left = tnode_new(inode->key&(~m), inode->pos + 1,
 734                                         inode->bits - 1);
 735                        if (!left)
 736                                goto nomem;
 737
 738                        right = tnode_new(inode->key|m, inode->pos + 1,
 739                                          inode->bits - 1);
 740
 741                        if (!right) {
 742                                tnode_free(left);
 743                                goto nomem;
 744                        }
 745
 746                        put_child(tn, 2*i, (struct rt_trie_node *) left);
 747                        put_child(tn, 2*i+1, (struct rt_trie_node *) right);
 748                }
 749        }
 750
 751        for (i = 0; i < olen; i++) {
 752                struct tnode *inode;
 753                struct rt_trie_node *node = tnode_get_child(oldtnode, i);
 754                struct tnode *left, *right;
 755                int size, j;
 756
 757                /* An empty child */
 758                if (node == NULL)
 759                        continue;
 760
 761                /* A leaf or an internal node with skipped bits */
 762
 763                if (IS_LEAF(node) || ((struct tnode *) node)->pos >
 764                   tn->pos + tn->bits - 1) {
 765                        if (tkey_extract_bits(node->key,
 766                                              oldtnode->pos + oldtnode->bits,
 767                                              1) == 0)
 768                                put_child(tn, 2*i, node);
 769                        else
 770                                put_child(tn, 2*i+1, node);
 771                        continue;
 772                }
 773
 774                /* An internal node with two children */
 775                inode = (struct tnode *) node;
 776
 777                if (inode->bits == 1) {
 778                        put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
 779                        put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));
 780
 781                        tnode_free_safe(inode);
 782                        continue;
 783                }
 784
 785                /* An internal node with more than two children */
 786
 787                /* We will replace this node 'inode' with two new
 788                 * ones, 'left' and 'right', each with half of the
 789                 * original children. The two new nodes will have
 790                 * a position one bit further down the key and this
 791                 * means that the "significant" part of their keys
 792                 * (see the discussion near the top of this file)
 793                 * will differ by one bit, which will be "0" in
 794                 * left's key and "1" in right's key. Since we are
 795                 * moving the key position by one step, the bit that
 796                 * we are moving away from - the bit at position
 797                 * (inode->pos) - is the one that will differ between
 798                 * left and right. So... we synthesize that bit in the
 799                 * two  new keys.
 800                 * The mask 'm' below will be a single "one" bit at
 801                 * the position (inode->pos)
 802                 */
 803
 804                /* Use the old key, but set the new significant
 805                 *   bit to zero.
 806                 */
 807
 808                left = (struct tnode *) tnode_get_child(tn, 2*i);
 809                put_child(tn, 2*i, NULL);
 810
 811                BUG_ON(!left);
 812
 813                right = (struct tnode *) tnode_get_child(tn, 2*i+1);
 814                put_child(tn, 2*i+1, NULL);
 815
 816                BUG_ON(!right);
 817
 818                size = tnode_child_length(left);
 819                for (j = 0; j < size; j++) {
 820                        put_child(left, j, rtnl_dereference(inode->child[j]));
 821                        put_child(right, j, rtnl_dereference(inode->child[j + size]));
 822                }
 823                put_child(tn, 2*i, resize(t, left));
 824                put_child(tn, 2*i+1, resize(t, right));
 825
 826                tnode_free_safe(inode);
 827        }
 828        tnode_free_safe(oldtnode);
 829        return tn;
 830nomem:
 831        tnode_clean_free(tn);
 832        return ERR_PTR(-ENOMEM);
 833}
 834
 835static struct tnode *halve(struct trie *t, struct tnode *tn)
 836{
 837        struct tnode *oldtnode = tn;
 838        struct rt_trie_node *left, *right;
 839        int i;
 840        int olen = tnode_child_length(tn);
 841
 842        pr_debug("In halve\n");
 843
 844        tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
 845
 846        if (!tn)
 847                return ERR_PTR(-ENOMEM);
 848
 849        /*
 850         * Preallocate and store tnodes before the actual work so we
 851         * don't get into an inconsistent state if memory allocation
 852         * fails. In case of failure we return the oldnode and halve
 853         * of tnode is ignored.
 854         */
 855
 856        for (i = 0; i < olen; i += 2) {
 857                left = tnode_get_child(oldtnode, i);
 858                right = tnode_get_child(oldtnode, i+1);
 859
 860                /* Two nonempty children */
 861                if (left && right) {
 862                        struct tnode *newn;
 863
 864                        newn = tnode_new(left->key, tn->pos + tn->bits, 1);
 865
 866                        if (!newn)
 867                                goto nomem;
 868
 869                        put_child(tn, i/2, (struct rt_trie_node *)newn);
 870                }
 871
 872        }
 873
 874        for (i = 0; i < olen; i += 2) {
 875                struct tnode *newBinNode;
 876
 877                left = tnode_get_child(oldtnode, i);
 878                right = tnode_get_child(oldtnode, i+1);
 879
 880                /* At least one of the children is empty */
 881                if (left == NULL) {
 882                        if (right == NULL)    /* Both are empty */
 883                                continue;
 884                        put_child(tn, i/2, right);
 885                        continue;
 886                }
 887
 888                if (right == NULL) {
 889                        put_child(tn, i/2, left);
 890                        continue;
 891                }
 892
 893                /* Two nonempty children */
 894                newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
 895                put_child(tn, i/2, NULL);
 896                put_child(newBinNode, 0, left);
 897                put_child(newBinNode, 1, right);
 898                put_child(tn, i/2, resize(t, newBinNode));
 899        }
 900        tnode_free_safe(oldtnode);
 901        return tn;
 902nomem:
 903        tnode_clean_free(tn);
 904        return ERR_PTR(-ENOMEM);
 905}
 906
 907/* readside must use rcu_read_lock currently dump routines
 908 via get_fa_head and dump */
 909
 910static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
 911{
 912        struct hlist_head *head = &l->list;
 913        struct leaf_info *li;
 914
 915        hlist_for_each_entry_rcu(li, head, hlist)
 916                if (li->plen == plen)
 917                        return li;
 918
 919        return NULL;
 920}
 921
 922static inline struct list_head *get_fa_head(struct leaf *l, int plen)
 923{
 924        struct leaf_info *li = find_leaf_info(l, plen);
 925
 926        if (!li)
 927                return NULL;
 928
 929        return &li->falh;
 930}
 931
 932static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
 933{
 934        struct leaf_info *li = NULL, *last = NULL;
 935
 936        if (hlist_empty(head)) {
 937                hlist_add_head_rcu(&new->hlist, head);
 938        } else {
 939                hlist_for_each_entry(li, head, hlist) {
 940                        if (new->plen > li->plen)
 941                                break;
 942
 943                        last = li;
 944                }
 945                if (last)
 946                        hlist_add_after_rcu(&last->hlist, &new->hlist);
 947                else
 948                        hlist_add_before_rcu(&new->hlist, &li->hlist);
 949        }
 950}
 951
 952/* rcu_read_lock needs to be hold by caller from readside */
 953
 954static struct leaf *
 955fib_find_node(struct trie *t, u32 key)
 956{
 957        int pos;
 958        struct tnode *tn;
 959        struct rt_trie_node *n;
 960
 961        pos = 0;
 962        n = rcu_dereference_rtnl(t->trie);
 963
 964        while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
 965                tn = (struct tnode *) n;
 966
 967                check_tnode(tn);
 968
 969                if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
 970                        pos = tn->pos + tn->bits;
 971                        n = tnode_get_child_rcu(tn,
 972                                                tkey_extract_bits(key,
 973                                                                  tn->pos,
 974                                                                  tn->bits));
 975                } else
 976                        break;
 977        }
 978        /* Case we have found a leaf. Compare prefixes */
 979
 980        if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
 981                return (struct leaf *)n;
 982
 983        return NULL;
 984}
 985
 986static void trie_rebalance(struct trie *t, struct tnode *tn)
 987{
 988        int wasfull;
 989        t_key cindex, key;
 990        struct tnode *tp;
 991
 992        key = tn->key;
 993
 994        while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
 995                cindex = tkey_extract_bits(key, tp->pos, tp->bits);
 996                wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
 997                tn = (struct tnode *)resize(t, tn);
 998
 999                tnode_put_child_reorg(tp, cindex,
1000                                      (struct rt_trie_node *)tn, wasfull);
1001
1002                tp = node_parent((struct rt_trie_node *) tn);
1003                if (!tp)
1004                        rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1005
1006                tnode_free_flush();
1007                if (!tp)
1008                        break;
1009                tn = tp;
1010        }
1011
1012        /* Handle last (top) tnode */
1013        if (IS_TNODE(tn))
1014                tn = (struct tnode *)resize(t, tn);
1015
1016        rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1017        tnode_free_flush();
1018}
1019
1020/* only used from updater-side */
1021
1022static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1023{
1024        int pos, newpos;
1025        struct tnode *tp = NULL, *tn = NULL;
1026        struct rt_trie_node *n;
1027        struct leaf *l;
1028        int missbit;
1029        struct list_head *fa_head = NULL;
1030        struct leaf_info *li;
1031        t_key cindex;
1032
1033        pos = 0;
1034        n = rtnl_dereference(t->trie);
1035
1036        /* If we point to NULL, stop. Either the tree is empty and we should
1037         * just put a new leaf in if, or we have reached an empty child slot,
1038         * and we should just put our new leaf in that.
1039         * If we point to a T_TNODE, check if it matches our key. Note that
1040         * a T_TNODE might be skipping any number of bits - its 'pos' need
1041         * not be the parent's 'pos'+'bits'!
1042         *
1043         * If it does match the current key, get pos/bits from it, extract
1044         * the index from our key, push the T_TNODE and walk the tree.
1045         *
1046         * If it doesn't, we have to replace it with a new T_TNODE.
1047         *
1048         * If we point to a T_LEAF, it might or might not have the same key
1049         * as we do. If it does, just change the value, update the T_LEAF's
1050         * value, and return it.
1051         * If it doesn't, we need to replace it with a T_TNODE.
1052         */
1053
1054        while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
1055                tn = (struct tnode *) n;
1056
1057                check_tnode(tn);
1058
1059                if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1060                        tp = tn;
1061                        pos = tn->pos + tn->bits;
1062                        n = tnode_get_child(tn,
1063                                            tkey_extract_bits(key,
1064                                                              tn->pos,
1065                                                              tn->bits));
1066
1067                        BUG_ON(n && node_parent(n) != tn);
1068                } else
1069                        break;
1070        }
1071
1072        /*
1073         * n  ----> NULL, LEAF or TNODE
1074         *
1075         * tp is n's (parent) ----> NULL or TNODE
1076         */
1077
1078        BUG_ON(tp && IS_LEAF(tp));
1079
1080        /* Case 1: n is a leaf. Compare prefixes */
1081
1082        if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1083                l = (struct leaf *) n;
1084                li = leaf_info_new(plen);
1085
1086                if (!li)
1087                        return NULL;
1088
1089                fa_head = &li->falh;
1090                insert_leaf_info(&l->list, li);
1091                goto done;
1092        }
1093        l = leaf_new();
1094
1095        if (!l)
1096                return NULL;
1097
1098        l->key = key;
1099        li = leaf_info_new(plen);
1100
1101        if (!li) {
1102                free_leaf(l);
1103                return NULL;
1104        }
1105
1106        fa_head = &li->falh;
1107        insert_leaf_info(&l->list, li);
1108
1109        if (t->trie && n == NULL) {
1110                /* Case 2: n is NULL, and will just insert a new leaf */
1111
1112                node_set_parent((struct rt_trie_node *)l, tp);
1113
1114                cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1115                put_child(tp, cindex, (struct rt_trie_node *)l);
1116        } else {
1117                /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1118                /*
1119                 *  Add a new tnode here
1120                 *  first tnode need some special handling
1121                 */
1122
1123                if (tp)
1124                        pos = tp->pos+tp->bits;
1125                else
1126                        pos = 0;
1127
1128                if (n) {
1129                        newpos = tkey_mismatch(key, pos, n->key);
1130                        tn = tnode_new(n->key, newpos, 1);
1131                } else {
1132                        newpos = 0;
1133                        tn = tnode_new(key, newpos, 1); /* First tnode */
1134                }
1135
1136                if (!tn) {
1137                        free_leaf_info(li);
1138                        free_leaf(l);
1139                        return NULL;
1140                }
1141
1142                node_set_parent((struct rt_trie_node *)tn, tp);
1143
1144                missbit = tkey_extract_bits(key, newpos, 1);
1145                put_child(tn, missbit, (struct rt_trie_node *)l);
1146                put_child(tn, 1-missbit, n);
1147
1148                if (tp) {
1149                        cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1150                        put_child(tp, cindex, (struct rt_trie_node *)tn);
1151                } else {
1152                        rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1153                        tp = tn;
1154                }
1155        }
1156
1157        if (tp && tp->pos + tp->bits > 32)
1158                pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1159                        tp, tp->pos, tp->bits, key, plen);
1160
1161        /* Rebalance the trie */
1162
1163        trie_rebalance(t, tp);
1164done:
1165        return fa_head;
1166}
1167
1168/*
1169 * Caller must hold RTNL.
1170 */
1171int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1172{
1173        struct trie *t = (struct trie *) tb->tb_data;
1174        struct fib_alias *fa, *new_fa;
1175        struct list_head *fa_head = NULL;
1176        struct fib_info *fi;
1177        int plen = cfg->fc_dst_len;
1178        u8 tos = cfg->fc_tos;
1179        u32 key, mask;
1180        int err;
1181        struct leaf *l;
1182
1183        if (plen > 32)
1184                return -EINVAL;
1185
1186        key = ntohl(cfg->fc_dst);
1187
1188        pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1189
1190        mask = ntohl(inet_make_mask(plen));
1191
1192        if (key & ~mask)
1193                return -EINVAL;
1194
1195        key = key & mask;
1196
1197        fi = fib_create_info(cfg);
1198        if (IS_ERR(fi)) {
1199                err = PTR_ERR(fi);
1200                goto err;
1201        }
1202
1203        l = fib_find_node(t, key);
1204        fa = NULL;
1205
1206        if (l) {
1207                fa_head = get_fa_head(l, plen);
1208                fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1209        }
1210
1211        /* Now fa, if non-NULL, points to the first fib alias
1212         * with the same keys [prefix,tos,priority], if such key already
1213         * exists or to the node before which we will insert new one.
1214         *
1215         * If fa is NULL, we will need to allocate a new one and
1216         * insert to the head of f.
1217         *
1218         * If f is NULL, no fib node matched the destination key
1219         * and we need to allocate a new one of those as well.
1220         */
1221
1222        if (fa && fa->fa_tos == tos &&
1223            fa->fa_info->fib_priority == fi->fib_priority) {
1224                struct fib_alias *fa_first, *fa_match;
1225
1226                err = -EEXIST;
1227                if (cfg->fc_nlflags & NLM_F_EXCL)
1228                        goto out;
1229
1230                /* We have 2 goals:
1231                 * 1. Find exact match for type, scope, fib_info to avoid
1232                 * duplicate routes
1233                 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1234                 */
1235                fa_match = NULL;
1236                fa_first = fa;
1237                fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1238                list_for_each_entry_continue(fa, fa_head, fa_list) {
1239                        if (fa->fa_tos != tos)
1240                                break;
1241                        if (fa->fa_info->fib_priority != fi->fib_priority)
1242                                break;
1243                        if (fa->fa_type == cfg->fc_type &&
1244                            fa->fa_info == fi) {
1245                                fa_match = fa;
1246                                break;
1247                        }
1248                }
1249
1250                if (cfg->fc_nlflags & NLM_F_REPLACE) {
1251                        struct fib_info *fi_drop;
1252                        u8 state;
1253
1254                        fa = fa_first;
1255                        if (fa_match) {
1256                                if (fa == fa_match)
1257                                        err = 0;
1258                                goto out;
1259                        }
1260                        err = -ENOBUFS;
1261                        new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1262                        if (new_fa == NULL)
1263                                goto out;
1264
1265                        fi_drop = fa->fa_info;
1266                        new_fa->fa_tos = fa->fa_tos;
1267                        new_fa->fa_info = fi;
1268                        new_fa->fa_type = cfg->fc_type;
1269                        state = fa->fa_state;
1270                        new_fa->fa_state = state & ~FA_S_ACCESSED;
1271
1272                        list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1273                        alias_free_mem_rcu(fa);
1274
1275                        fib_release_info(fi_drop);
1276                        if (state & FA_S_ACCESSED)
1277                                rt_cache_flush(cfg->fc_nlinfo.nl_net);
1278                        rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1279                                tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1280
1281                        goto succeeded;
1282                }
1283                /* Error if we find a perfect match which
1284                 * uses the same scope, type, and nexthop
1285                 * information.
1286                 */
1287                if (fa_match)
1288                        goto out;
1289
1290                if (!(cfg->fc_nlflags & NLM_F_APPEND))
1291                        fa = fa_first;
1292        }
1293        err = -ENOENT;
1294        if (!(cfg->fc_nlflags & NLM_F_CREATE))
1295                goto out;
1296
1297        err = -ENOBUFS;
1298        new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1299        if (new_fa == NULL)
1300                goto out;
1301
1302        new_fa->fa_info = fi;
1303        new_fa->fa_tos = tos;
1304        new_fa->fa_type = cfg->fc_type;
1305        new_fa->fa_state = 0;
1306        /*
1307         * Insert new entry to the list.
1308         */
1309
1310        if (!fa_head) {
1311                fa_head = fib_insert_node(t, key, plen);
1312                if (unlikely(!fa_head)) {
1313                        err = -ENOMEM;
1314                        goto out_free_new_fa;
1315                }
1316        }
1317
1318        if (!plen)
1319                tb->tb_num_default++;
1320
1321        list_add_tail_rcu(&new_fa->fa_list,
1322                          (fa ? &fa->fa_list : fa_head));
1323
1324        rt_cache_flush(cfg->fc_nlinfo.nl_net);
1325        rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1326                  &cfg->fc_nlinfo, 0);
1327succeeded:
1328        return 0;
1329
1330out_free_new_fa:
1331        kmem_cache_free(fn_alias_kmem, new_fa);
1332out:
1333        fib_release_info(fi);
1334err:
1335        return err;
1336}
1337
1338/* should be called with rcu_read_lock */
1339static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
1340                      t_key key,  const struct flowi4 *flp,
1341                      struct fib_result *res, int fib_flags)
1342{
1343        struct leaf_info *li;
1344        struct hlist_head *hhead = &l->list;
1345
1346        hlist_for_each_entry_rcu(li, hhead, hlist) {
1347                struct fib_alias *fa;
1348
1349                if (l->key != (key & li->mask_plen))
1350                        continue;
1351
1352                list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1353                        struct fib_info *fi = fa->fa_info;
1354                        int nhsel, err;
1355
1356                        if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
1357                                continue;
1358                        if (fi->fib_dead)
1359                                continue;
1360                        if (fa->fa_info->fib_scope < flp->flowi4_scope)
1361                                continue;
1362                        fib_alias_accessed(fa);
1363                        err = fib_props[fa->fa_type].error;
1364                        if (err) {
1365#ifdef CONFIG_IP_FIB_TRIE_STATS
1366                                t->stats.semantic_match_passed++;
1367#endif
1368                                return err;
1369                        }
1370                        if (fi->fib_flags & RTNH_F_DEAD)
1371                                continue;
1372                        for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
1373                                const struct fib_nh *nh = &fi->fib_nh[nhsel];
1374
1375                                if (nh->nh_flags & RTNH_F_DEAD)
1376                                        continue;
1377                                if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
1378                                        continue;
1379
1380#ifdef CONFIG_IP_FIB_TRIE_STATS
1381                                t->stats.semantic_match_passed++;
1382#endif
1383                                res->prefixlen = li->plen;
1384                                res->nh_sel = nhsel;
1385                                res->type = fa->fa_type;
1386                                res->scope = fa->fa_info->fib_scope;
1387                                res->fi = fi;
1388                                res->table = tb;
1389                                res->fa_head = &li->falh;
1390                                if (!(fib_flags & FIB_LOOKUP_NOREF))
1391                                        atomic_inc(&fi->fib_clntref);
1392                                return 0;
1393                        }
1394                }
1395
1396#ifdef CONFIG_IP_FIB_TRIE_STATS
1397                t->stats.semantic_match_miss++;
1398#endif
1399        }
1400
1401        return 1;
1402}
1403
1404int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
1405                     struct fib_result *res, int fib_flags)
1406{
1407        struct trie *t = (struct trie *) tb->tb_data;
1408        int ret;
1409        struct rt_trie_node *n;
1410        struct tnode *pn;
1411        unsigned int pos, bits;
1412        t_key key = ntohl(flp->daddr);
1413        unsigned int chopped_off;
1414        t_key cindex = 0;
1415        unsigned int current_prefix_length = KEYLENGTH;
1416        struct tnode *cn;
1417        t_key pref_mismatch;
1418
1419        rcu_read_lock();
1420
1421        n = rcu_dereference(t->trie);
1422        if (!n)
1423                goto failed;
1424
1425#ifdef CONFIG_IP_FIB_TRIE_STATS
1426        t->stats.gets++;
1427#endif
1428
1429        /* Just a leaf? */
1430        if (IS_LEAF(n)) {
1431                ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1432                goto found;
1433        }
1434
1435        pn = (struct tnode *) n;
1436        chopped_off = 0;
1437
1438        while (pn) {
1439                pos = pn->pos;
1440                bits = pn->bits;
1441
1442                if (!chopped_off)
1443                        cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1444                                                   pos, bits);
1445
1446                n = tnode_get_child_rcu(pn, cindex);
1447
1448                if (n == NULL) {
1449#ifdef CONFIG_IP_FIB_TRIE_STATS
1450                        t->stats.null_node_hit++;
1451#endif
1452                        goto backtrace;
1453                }
1454
1455                if (IS_LEAF(n)) {
1456                        ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1457                        if (ret > 0)
1458                                goto backtrace;
1459                        goto found;
1460                }
1461
1462                cn = (struct tnode *)n;
1463
1464                /*
1465                 * It's a tnode, and we can do some extra checks here if we
1466                 * like, to avoid descending into a dead-end branch.
1467                 * This tnode is in the parent's child array at index
1468                 * key[p_pos..p_pos+p_bits] but potentially with some bits
1469                 * chopped off, so in reality the index may be just a
1470                 * subprefix, padded with zero at the end.
1471                 * We can also take a look at any skipped bits in this
1472                 * tnode - everything up to p_pos is supposed to be ok,
1473                 * and the non-chopped bits of the index (se previous
1474                 * paragraph) are also guaranteed ok, but the rest is
1475                 * considered unknown.
1476                 *
1477                 * The skipped bits are key[pos+bits..cn->pos].
1478                 */
1479
1480                /* If current_prefix_length < pos+bits, we are already doing
1481                 * actual prefix  matching, which means everything from
1482                 * pos+(bits-chopped_off) onward must be zero along some
1483                 * branch of this subtree - otherwise there is *no* valid
1484                 * prefix present. Here we can only check the skipped
1485                 * bits. Remember, since we have already indexed into the
1486                 * parent's child array, we know that the bits we chopped of
1487                 * *are* zero.
1488                 */
1489
1490                /* NOTA BENE: Checking only skipped bits
1491                   for the new node here */
1492
1493                if (current_prefix_length < pos+bits) {
1494                        if (tkey_extract_bits(cn->key, current_prefix_length,
1495                                                cn->pos - current_prefix_length)
1496                            || !(cn->child[0]))
1497                                goto backtrace;
1498                }
1499
1500                /*
1501                 * If chopped_off=0, the index is fully validated and we
1502                 * only need to look at the skipped bits for this, the new,
1503                 * tnode. What we actually want to do is to find out if
1504                 * these skipped bits match our key perfectly, or if we will
1505                 * have to count on finding a matching prefix further down,
1506                 * because if we do, we would like to have some way of
1507                 * verifying the existence of such a prefix at this point.
1508                 */
1509
1510                /* The only thing we can do at this point is to verify that
1511                 * any such matching prefix can indeed be a prefix to our
1512                 * key, and if the bits in the node we are inspecting that
1513                 * do not match our key are not ZERO, this cannot be true.
1514                 * Thus, find out where there is a mismatch (before cn->pos)
1515                 * and verify that all the mismatching bits are zero in the
1516                 * new tnode's key.
1517                 */
1518
1519                /*
1520                 * Note: We aren't very concerned about the piece of
1521                 * the key that precede pn->pos+pn->bits, since these
1522                 * have already been checked. The bits after cn->pos
1523                 * aren't checked since these are by definition
1524                 * "unknown" at this point. Thus, what we want to see
1525                 * is if we are about to enter the "prefix matching"
1526                 * state, and in that case verify that the skipped
1527                 * bits that will prevail throughout this subtree are
1528                 * zero, as they have to be if we are to find a
1529                 * matching prefix.
1530                 */
1531
1532                pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);
1533
1534                /*
1535                 * In short: If skipped bits in this node do not match
1536                 * the search key, enter the "prefix matching"
1537                 * state.directly.
1538                 */
1539                if (pref_mismatch) {
1540                        /* fls(x) = __fls(x) + 1 */
1541                        int mp = KEYLENGTH - __fls(pref_mismatch) - 1;
1542
1543                        if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
1544                                goto backtrace;
1545
1546                        if (current_prefix_length >= cn->pos)
1547                                current_prefix_length = mp;
1548                }
1549
1550                pn = (struct tnode *)n; /* Descend */
1551                chopped_off = 0;
1552                continue;
1553
1554backtrace:
1555                chopped_off++;
1556
1557                /* As zero don't change the child key (cindex) */
1558                while ((chopped_off <= pn->bits)
1559                       && !(cindex & (1<<(chopped_off-1))))
1560                        chopped_off++;
1561
1562                /* Decrease current_... with bits chopped off */
1563                if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1564                        current_prefix_length = pn->pos + pn->bits
1565                                - chopped_off;
1566
1567                /*
1568                 * Either we do the actual chop off according or if we have
1569                 * chopped off all bits in this tnode walk up to our parent.
1570                 */
1571
1572                if (chopped_off <= pn->bits) {
1573                        cindex &= ~(1 << (chopped_off-1));
1574                } else {
1575                        struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
1576                        if (!parent)
1577                                goto failed;
1578
1579                        /* Get Child's index */
1580                        cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1581                        pn = parent;
1582                        chopped_off = 0;
1583
1584#ifdef CONFIG_IP_FIB_TRIE_STATS
1585                        t->stats.backtrack++;
1586#endif
1587                        goto backtrace;
1588                }
1589        }
1590failed:
1591        ret = 1;
1592found:
1593        rcu_read_unlock();
1594        return ret;
1595}
1596EXPORT_SYMBOL_GPL(fib_table_lookup);
1597
1598/*
1599 * Remove the leaf and return parent.
1600 */
1601static void trie_leaf_remove(struct trie *t, struct leaf *l)
1602{
1603        struct tnode *tp = node_parent((struct rt_trie_node *) l);
1604
1605        pr_debug("entering trie_leaf_remove(%p)\n", l);
1606
1607        if (tp) {
1608                t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1609                put_child(tp, cindex, NULL);
1610                trie_rebalance(t, tp);
1611        } else
1612                RCU_INIT_POINTER(t->trie, NULL);
1613
1614        free_leaf(l);
1615}
1616
1617/*
1618 * Caller must hold RTNL.
1619 */
1620int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1621{
1622        struct trie *t = (struct trie *) tb->tb_data;
1623        u32 key, mask;
1624        int plen = cfg->fc_dst_len;
1625        u8 tos = cfg->fc_tos;
1626        struct fib_alias *fa, *fa_to_delete;
1627        struct list_head *fa_head;
1628        struct leaf *l;
1629        struct leaf_info *li;
1630
1631        if (plen > 32)
1632                return -EINVAL;
1633
1634        key = ntohl(cfg->fc_dst);
1635        mask = ntohl(inet_make_mask(plen));
1636
1637        if (key & ~mask)
1638                return -EINVAL;
1639
1640        key = key & mask;
1641        l = fib_find_node(t, key);
1642
1643        if (!l)
1644                return -ESRCH;
1645
1646        li = find_leaf_info(l, plen);
1647
1648        if (!li)
1649                return -ESRCH;
1650
1651        fa_head = &li->falh;
1652        fa = fib_find_alias(fa_head, tos, 0);
1653
1654        if (!fa)
1655                return -ESRCH;
1656
1657        pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1658
1659        fa_to_delete = NULL;
1660        fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1661        list_for_each_entry_continue(fa, fa_head, fa_list) {
1662                struct fib_info *fi = fa->fa_info;
1663
1664                if (fa->fa_tos != tos)
1665                        break;
1666
1667                if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1668                    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1669                     fa->fa_info->fib_scope == cfg->fc_scope) &&
1670                    (!cfg->fc_prefsrc ||
1671                     fi->fib_prefsrc == cfg->fc_prefsrc) &&
1672                    (!cfg->fc_protocol ||
1673                     fi->fib_protocol == cfg->fc_protocol) &&
1674                    fib_nh_match(cfg, fi) == 0) {
1675                        fa_to_delete = fa;
1676                        break;
1677                }
1678        }
1679
1680        if (!fa_to_delete)
1681                return -ESRCH;
1682
1683        fa = fa_to_delete;
1684        rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1685                  &cfg->fc_nlinfo, 0);
1686
1687        list_del_rcu(&fa->fa_list);
1688
1689        if (!plen)
1690                tb->tb_num_default--;
1691
1692        if (list_empty(fa_head)) {
1693                hlist_del_rcu(&li->hlist);
1694                free_leaf_info(li);
1695        }
1696
1697        if (hlist_empty(&l->list))
1698                trie_leaf_remove(t, l);
1699
1700        if (fa->fa_state & FA_S_ACCESSED)
1701                rt_cache_flush(cfg->fc_nlinfo.nl_net);
1702
1703        fib_release_info(fa->fa_info);
1704        alias_free_mem_rcu(fa);
1705        return 0;
1706}
1707
1708static int trie_flush_list(struct list_head *head)
1709{
1710        struct fib_alias *fa, *fa_node;
1711        int found = 0;
1712
1713        list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1714                struct fib_info *fi = fa->fa_info;
1715
1716                if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1717                        list_del_rcu(&fa->fa_list);
1718                        fib_release_info(fa->fa_info);
1719                        alias_free_mem_rcu(fa);
1720                        found++;
1721                }
1722        }
1723        return found;
1724}
1725
1726static int trie_flush_leaf(struct leaf *l)
1727{
1728        int found = 0;
1729        struct hlist_head *lih = &l->list;
1730        struct hlist_node *tmp;
1731        struct leaf_info *li = NULL;
1732
1733        hlist_for_each_entry_safe(li, tmp, lih, hlist) {
1734                found += trie_flush_list(&li->falh);
1735
1736                if (list_empty(&li->falh)) {
1737                        hlist_del_rcu(&li->hlist);
1738                        free_leaf_info(li);
1739                }
1740        }
1741        return found;
1742}
1743
1744/*
1745 * Scan for the next right leaf starting at node p->child[idx]
1746 * Since we have back pointer, no recursion necessary.
1747 */
1748static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
1749{
1750        do {
1751                t_key idx;
1752
1753                if (c)
1754                        idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1755                else
1756                        idx = 0;
1757
1758                while (idx < 1u << p->bits) {
1759                        c = tnode_get_child_rcu(p, idx++);
1760                        if (!c)
1761                                continue;
1762
1763                        if (IS_LEAF(c))
1764                                return (struct leaf *) c;
1765
1766                        /* Rescan start scanning in new node */
1767                        p = (struct tnode *) c;
1768                        idx = 0;
1769                }
1770
1771                /* Node empty, walk back up to parent */
1772                c = (struct rt_trie_node *) p;
1773        } while ((p = node_parent_rcu(c)) != NULL);
1774
1775        return NULL; /* Root of trie */
1776}
1777
1778static struct leaf *trie_firstleaf(struct trie *t)
1779{
1780        struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie);
1781
1782        if (!n)
1783                return NULL;
1784
1785        if (IS_LEAF(n))          /* trie is just a leaf */
1786                return (struct leaf *) n;
1787
1788        return leaf_walk_rcu(n, NULL);
1789}
1790
1791static struct leaf *trie_nextleaf(struct leaf *l)
1792{
1793        struct rt_trie_node *c = (struct rt_trie_node *) l;
1794        struct tnode *p = node_parent_rcu(c);
1795
1796        if (!p)
1797                return NULL;    /* trie with just one leaf */
1798
1799        return leaf_walk_rcu(p, c);
1800}
1801
1802static struct leaf *trie_leafindex(struct trie *t, int index)
1803{
1804        struct leaf *l = trie_firstleaf(t);
1805
1806        while (l && index-- > 0)
1807                l = trie_nextleaf(l);
1808
1809        return l;
1810}
1811
1812
1813/*
1814 * Caller must hold RTNL.
1815 */
1816int fib_table_flush(struct fib_table *tb)
1817{
1818        struct trie *t = (struct trie *) tb->tb_data;
1819        struct leaf *l, *ll = NULL;
1820        int found = 0;
1821
1822        for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1823                found += trie_flush_leaf(l);
1824
1825                if (ll && hlist_empty(&ll->list))
1826                        trie_leaf_remove(t, ll);
1827                ll = l;
1828        }
1829
1830        if (ll && hlist_empty(&ll->list))
1831                trie_leaf_remove(t, ll);
1832
1833        pr_debug("trie_flush found=%d\n", found);
1834        return found;
1835}
1836
1837void fib_free_table(struct fib_table *tb)
1838{
1839        kfree(tb);
1840}
1841
1842static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1843                           struct fib_table *tb,
1844                           struct sk_buff *skb, struct netlink_callback *cb)
1845{
1846        int i, s_i;
1847        struct fib_alias *fa;
1848        __be32 xkey = htonl(key);
1849
1850        s_i = cb->args[5];
1851        i = 0;
1852
1853        /* rcu_read_lock is hold by caller */
1854
1855        list_for_each_entry_rcu(fa, fah, fa_list) {
1856                if (i < s_i) {
1857                        i++;
1858                        continue;
1859                }
1860
1861                if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
1862                                  cb->nlh->nlmsg_seq,
1863                                  RTM_NEWROUTE,
1864                                  tb->tb_id,
1865                                  fa->fa_type,
1866                                  xkey,
1867                                  plen,
1868                                  fa->fa_tos,
1869                                  fa->fa_info, NLM_F_MULTI) < 0) {
1870                        cb->args[5] = i;
1871                        return -1;
1872                }
1873                i++;
1874        }
1875        cb->args[5] = i;
1876        return skb->len;
1877}
1878
1879static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1880                        struct sk_buff *skb, struct netlink_callback *cb)
1881{
1882        struct leaf_info *li;
1883        int i, s_i;
1884
1885        s_i = cb->args[4];
1886        i = 0;
1887
1888        /* rcu_read_lock is hold by caller */
1889        hlist_for_each_entry_rcu(li, &l->list, hlist) {
1890                if (i < s_i) {
1891                        i++;
1892                        continue;
1893                }
1894
1895                if (i > s_i)
1896                        cb->args[5] = 0;
1897
1898                if (list_empty(&li->falh))
1899                        continue;
1900
1901                if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1902                        cb->args[4] = i;
1903                        return -1;
1904                }
1905                i++;
1906        }
1907
1908        cb->args[4] = i;
1909        return skb->len;
1910}
1911
1912int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
1913                   struct netlink_callback *cb)
1914{
1915        struct leaf *l;
1916        struct trie *t = (struct trie *) tb->tb_data;
1917        t_key key = cb->args[2];
1918        int count = cb->args[3];
1919
1920        rcu_read_lock();
1921        /* Dump starting at last key.
1922         * Note: 0.0.0.0/0 (ie default) is first key.
1923         */
1924        if (count == 0)
1925                l = trie_firstleaf(t);
1926        else {
1927                /* Normally, continue from last key, but if that is missing
1928                 * fallback to using slow rescan
1929                 */
1930                l = fib_find_node(t, key);
1931                if (!l)
1932                        l = trie_leafindex(t, count);
1933        }
1934
1935        while (l) {
1936                cb->args[2] = l->key;
1937                if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1938                        cb->args[3] = count;
1939                        rcu_read_unlock();
1940                        return -1;
1941                }
1942
1943                ++count;
1944                l = trie_nextleaf(l);
1945                memset(&cb->args[4], 0,
1946                       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1947        }
1948        cb->args[3] = count;
1949        rcu_read_unlock();
1950
1951        return skb->len;
1952}
1953
1954void __init fib_trie_init(void)
1955{
1956        fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1957                                          sizeof(struct fib_alias),
1958                                          0, SLAB_PANIC, NULL);
1959
1960        trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
1961                                           max(sizeof(struct leaf),
1962                                               sizeof(struct leaf_info)),
1963                                           0, SLAB_PANIC, NULL);
1964}
1965
1966
1967struct fib_table *fib_trie_table(u32 id)
1968{
1969        struct fib_table *tb;
1970        struct trie *t;
1971
1972        tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1973                     GFP_KERNEL);
1974        if (tb == NULL)
1975                return NULL;
1976
1977        tb->tb_id = id;
1978        tb->tb_default = -1;
1979        tb->tb_num_default = 0;
1980
1981        t = (struct trie *) tb->tb_data;
1982        memset(t, 0, sizeof(*t));
1983
1984        return tb;
1985}
1986
1987#ifdef CONFIG_PROC_FS
1988/* Depth first Trie walk iterator */
1989struct fib_trie_iter {
1990        struct seq_net_private p;
1991        struct fib_table *tb;
1992        struct tnode *tnode;
1993        unsigned int index;
1994        unsigned int depth;
1995};
1996
1997static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
1998{
1999        struct tnode *tn = iter->tnode;
2000        unsigned int cindex = iter->index;
2001        struct tnode *p;
2002
2003        /* A single entry routing table */
2004        if (!tn)
2005                return NULL;
2006
2007        pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2008                 iter->tnode, iter->index, iter->depth);
2009rescan:
2010        while (cindex < (1<<tn->bits)) {
2011                struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex);
2012
2013                if (n) {
2014                        if (IS_LEAF(n)) {
2015                                iter->tnode = tn;
2016                                iter->index = cindex + 1;
2017                        } else {
2018                                /* push down one level */
2019                                iter->tnode = (struct tnode *) n;
2020                                iter->index = 0;
2021                                ++iter->depth;
2022                        }
2023                        return n;
2024                }
2025
2026                ++cindex;
2027        }
2028
2029        /* Current node exhausted, pop back up */
2030        p = node_parent_rcu((struct rt_trie_node *)tn);
2031        if (p) {
2032                cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2033                tn = p;
2034                --iter->depth;
2035                goto rescan;
2036        }
2037
2038        /* got root? */
2039        return NULL;
2040}
2041
2042static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
2043                                       struct trie *t)
2044{
2045        struct rt_trie_node *n;
2046
2047        if (!t)
2048                return NULL;
2049
2050        n = rcu_dereference(t->trie);
2051        if (!n)
2052                return NULL;
2053
2054        if (IS_TNODE(n)) {
2055                iter->tnode = (struct tnode *) n;
2056                iter->index = 0;
2057                iter->depth = 1;
2058        } else {
2059                iter->tnode = NULL;
2060                iter->index = 0;
2061                iter->depth = 0;
2062        }
2063
2064        return n;
2065}
2066
2067static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2068{
2069        struct rt_trie_node *n;
2070        struct fib_trie_iter iter;
2071
2072        memset(s, 0, sizeof(*s));
2073
2074        rcu_read_lock();
2075        for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2076                if (IS_LEAF(n)) {
2077                        struct leaf *l = (struct leaf *)n;
2078                        struct leaf_info *li;
2079
2080                        s->leaves++;
2081                        s->totdepth += iter.depth;
2082                        if (iter.depth > s->maxdepth)
2083                                s->maxdepth = iter.depth;
2084
2085                        hlist_for_each_entry_rcu(li, &l->list, hlist)
2086                                ++s->prefixes;
2087                } else {
2088                        const struct tnode *tn = (const struct tnode *) n;
2089                        int i;
2090
2091                        s->tnodes++;
2092                        if (tn->bits < MAX_STAT_DEPTH)
2093                                s->nodesizes[tn->bits]++;
2094
2095                        for (i = 0; i < (1<<tn->bits); i++)
2096                                if (!tn->child[i])
2097                                        s->nullpointers++;
2098                }
2099        }
2100        rcu_read_unlock();
2101}
2102
2103/*
2104 *      This outputs /proc/net/fib_triestats
2105 */
2106static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2107{
2108        unsigned int i, max, pointers, bytes, avdepth;
2109
2110        if (stat->leaves)
2111                avdepth = stat->totdepth*100 / stat->leaves;
2112        else
2113                avdepth = 0;
2114
2115        seq_printf(seq, "\tAver depth:     %u.%02d\n",
2116                   avdepth / 100, avdepth % 100);
2117        seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2118
2119        seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2120        bytes = sizeof(struct leaf) * stat->leaves;
2121
2122        seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
2123        bytes += sizeof(struct leaf_info) * stat->prefixes;
2124
2125        seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2126        bytes += sizeof(struct tnode) * stat->tnodes;
2127
2128        max = MAX_STAT_DEPTH;
2129        while (max > 0 && stat->nodesizes[max-1] == 0)
2130                max--;
2131
2132        pointers = 0;
2133        for (i = 1; i < max; i++)
2134                if (stat->nodesizes[i] != 0) {
2135                        seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2136                        pointers += (1<<i) * stat->nodesizes[i];
2137                }
2138        seq_putc(seq, '\n');
2139        seq_printf(seq, "\tPointers: %u\n", pointers);
2140
2141        bytes += sizeof(struct rt_trie_node *) * pointers;
2142        seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2143        seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2144}
2145
2146#ifdef CONFIG_IP_FIB_TRIE_STATS
2147static void trie_show_usage(struct seq_file *seq,
2148                            const struct trie_use_stats *stats)
2149{
2150        seq_printf(seq, "\nCounters:\n---------\n");
2151        seq_printf(seq, "gets = %u\n", stats->gets);
2152        seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2153        seq_printf(seq, "semantic match passed = %u\n",
2154                   stats->semantic_match_passed);
2155        seq_printf(seq, "semantic match miss = %u\n",
2156                   stats->semantic_match_miss);
2157        seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
2158        seq_printf(seq, "skipped node resize = %u\n\n",
2159                   stats->resize_node_skipped);
2160}
2161#endif /*  CONFIG_IP_FIB_TRIE_STATS */
2162
2163static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2164{
2165        if (tb->tb_id == RT_TABLE_LOCAL)
2166                seq_puts(seq, "Local:\n");
2167        else if (tb->tb_id == RT_TABLE_MAIN)
2168                seq_puts(seq, "Main:\n");
2169        else
2170                seq_printf(seq, "Id %d:\n", tb->tb_id);
2171}
2172
2173
2174static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2175{
2176        struct net *net = (struct net *)seq->private;
2177        unsigned int h;
2178
2179        seq_printf(seq,
2180                   "Basic info: size of leaf:"
2181                   " %Zd bytes, size of tnode: %Zd bytes.\n",
2182                   sizeof(struct leaf), sizeof(struct tnode));
2183
2184        for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2185                struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2186                struct fib_table *tb;
2187
2188                hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2189                        struct trie *t = (struct trie *) tb->tb_data;
2190                        struct trie_stat stat;
2191
2192                        if (!t)
2193                                continue;
2194
2195                        fib_table_print(seq, tb);
2196
2197                        trie_collect_stats(t, &stat);
2198                        trie_show_stats(seq, &stat);
2199#ifdef CONFIG_IP_FIB_TRIE_STATS
2200                        trie_show_usage(seq, &t->stats);
2201#endif
2202                }
2203        }
2204
2205        return 0;
2206}
2207
2208static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2209{
2210        return single_open_net(inode, file, fib_triestat_seq_show);
2211}
2212
2213static const struct file_operations fib_triestat_fops = {
2214        .owner  = THIS_MODULE,
2215        .open   = fib_triestat_seq_open,
2216        .read   = seq_read,
2217        .llseek = seq_lseek,
2218        .release = single_release_net,
2219};
2220
2221static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2222{
2223        struct fib_trie_iter *iter = seq->private;
2224        struct net *net = seq_file_net(seq);
2225        loff_t idx = 0;
2226        unsigned int h;
2227
2228        for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2229                struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2230                struct fib_table *tb;
2231
2232                hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2233                        struct rt_trie_node *n;
2234
2235                        for (n = fib_trie_get_first(iter,
2236                                                    (struct trie *) tb->tb_data);
2237                             n; n = fib_trie_get_next(iter))
2238                                if (pos == idx++) {
2239                                        iter->tb = tb;
2240                                        return n;
2241                                }
2242                }
2243        }
2244
2245        return NULL;
2246}
2247
2248static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2249        __acquires(RCU)
2250{
2251        rcu_read_lock();
2252        return fib_trie_get_idx(seq, *pos);
2253}
2254
2255static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2256{
2257        struct fib_trie_iter *iter = seq->private;
2258        struct net *net = seq_file_net(seq);
2259        struct fib_table *tb = iter->tb;
2260        struct hlist_node *tb_node;
2261        unsigned int h;
2262        struct rt_trie_node *n;
2263
2264        ++*pos;
2265        /* next node in same table */
2266        n = fib_trie_get_next(iter);
2267        if (n)
2268                return n;
2269
2270        /* walk rest of this hash chain */
2271        h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2272        while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
2273                tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2274                n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2275                if (n)
2276                        goto found;
2277        }
2278
2279        /* new hash chain */
2280        while (++h < FIB_TABLE_HASHSZ) {
2281                struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2282                hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2283                        n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2284                        if (n)
2285                                goto found;
2286                }
2287        }
2288        return NULL;
2289
2290found:
2291        iter->tb = tb;
2292        return n;
2293}
2294
2295static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2296        __releases(RCU)
2297{
2298        rcu_read_unlock();
2299}
2300
2301static void seq_indent(struct seq_file *seq, int n)
2302{
2303        while (n-- > 0)
2304                seq_puts(seq, "   ");
2305}
2306
2307static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2308{
2309        switch (s) {
2310        case RT_SCOPE_UNIVERSE: return "universe";
2311        case RT_SCOPE_SITE:     return "site";
2312        case RT_SCOPE_LINK:     return "link";
2313        case RT_SCOPE_HOST:     return "host";
2314        case RT_SCOPE_NOWHERE:  return "nowhere";
2315        default:
2316                snprintf(buf, len, "scope=%d", s);
2317                return buf;
2318        }
2319}
2320
2321static const char *const rtn_type_names[__RTN_MAX] = {
2322        [RTN_UNSPEC] = "UNSPEC",
2323        [RTN_UNICAST] = "UNICAST",
2324        [RTN_LOCAL] = "LOCAL",
2325        [RTN_BROADCAST] = "BROADCAST",
2326        [RTN_ANYCAST] = "ANYCAST",
2327        [RTN_MULTICAST] = "MULTICAST",
2328        [RTN_BLACKHOLE] = "BLACKHOLE",
2329        [RTN_UNREACHABLE] = "UNREACHABLE",
2330        [RTN_PROHIBIT] = "PROHIBIT",
2331        [RTN_THROW] = "THROW",
2332        [RTN_NAT] = "NAT",
2333        [RTN_XRESOLVE] = "XRESOLVE",
2334};
2335
2336static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2337{
2338        if (t < __RTN_MAX && rtn_type_names[t])
2339                return rtn_type_names[t];
2340        snprintf(buf, len, "type %u", t);
2341        return buf;
2342}
2343
2344/* Pretty print the trie */
2345static int fib_trie_seq_show(struct seq_file *seq, void *v)
2346{
2347        const struct fib_trie_iter *iter = seq->private;
2348        struct rt_trie_node *n = v;
2349
2350        if (!node_parent_rcu(n))
2351                fib_table_print(seq, iter->tb);
2352
2353        if (IS_TNODE(n)) {
2354                struct tnode *tn = (struct tnode *) n;
2355                __be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2356
2357                seq_indent(seq, iter->depth-1);
2358                seq_printf(seq, "  +-- %pI4/%d %d %d %d\n",
2359                           &prf, tn->pos, tn->bits, tn->full_children,
2360                           tn->empty_children);
2361
2362        } else {
2363                struct leaf *l = (struct leaf *) n;
2364                struct leaf_info *li;
2365                __be32 val = htonl(l->key);
2366
2367                seq_indent(seq, iter->depth);
2368                seq_printf(seq, "  |-- %pI4\n", &val);
2369
2370                hlist_for_each_entry_rcu(li, &l->list, hlist) {
2371                        struct fib_alias *fa;
2372
2373                        list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2374                                char buf1[32], buf2[32];
2375
2376                                seq_indent(seq, iter->depth+1);
2377                                seq_printf(seq, "  /%d %s %s", li->plen,
2378                                           rtn_scope(buf1, sizeof(buf1),
2379                                                     fa->fa_info->fib_scope),
2380                                           rtn_type(buf2, sizeof(buf2),
2381                                                    fa->fa_type));
2382                                if (fa->fa_tos)
2383                                        seq_printf(seq, " tos=%d", fa->fa_tos);
2384                                seq_putc(seq, '\n');
2385                        }
2386                }
2387        }
2388
2389        return 0;
2390}
2391
2392static const struct seq_operations fib_trie_seq_ops = {
2393        .start  = fib_trie_seq_start,
2394        .next   = fib_trie_seq_next,
2395        .stop   = fib_trie_seq_stop,
2396        .show   = fib_trie_seq_show,
2397};
2398
2399static int fib_trie_seq_open(struct inode *inode, struct file *file)
2400{
2401        return seq_open_net(inode, file, &fib_trie_seq_ops,
2402                            sizeof(struct fib_trie_iter));
2403}
2404
2405static const struct file_operations fib_trie_fops = {
2406        .owner  = THIS_MODULE,
2407        .open   = fib_trie_seq_open,
2408        .read   = seq_read,
2409        .llseek = seq_lseek,
2410        .release = seq_release_net,
2411};
2412
2413struct fib_route_iter {
2414        struct seq_net_private p;
2415        struct trie *main_trie;
2416        loff_t  pos;
2417        t_key   key;
2418};
2419
2420static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2421{
2422        struct leaf *l = NULL;
2423        struct trie *t = iter->main_trie;
2424
2425        /* use cache location of last found key */
2426        if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2427                pos -= iter->pos;
2428        else {
2429                iter->pos = 0;
2430                l = trie_firstleaf(t);
2431        }
2432
2433        while (l && pos-- > 0) {
2434                iter->pos++;
2435                l = trie_nextleaf(l);
2436        }
2437
2438        if (l)
2439                iter->key = pos;        /* remember it */
2440        else
2441                iter->pos = 0;          /* forget it */
2442
2443        return l;
2444}
2445
2446static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2447        __acquires(RCU)
2448{
2449        struct fib_route_iter *iter = seq->private;
2450        struct fib_table *tb;
2451
2452        rcu_read_lock();
2453        tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2454        if (!tb)
2455                return NULL;
2456
2457        iter->main_trie = (struct trie *) tb->tb_data;
2458        if (*pos == 0)
2459                return SEQ_START_TOKEN;
2460        else
2461                return fib_route_get_idx(iter, *pos - 1);
2462}
2463
2464static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2465{
2466        struct fib_route_iter *iter = seq->private;
2467        struct leaf *l = v;
2468
2469        ++*pos;
2470        if (v == SEQ_START_TOKEN) {
2471                iter->pos = 0;
2472                l = trie_firstleaf(iter->main_trie);
2473        } else {
2474                iter->pos++;
2475                l = trie_nextleaf(l);
2476        }
2477
2478        if (l)
2479                iter->key = l->key;
2480        else
2481                iter->pos = 0;
2482        return l;
2483}
2484
2485static void fib_route_seq_stop(struct seq_file *seq, void *v)
2486        __releases(RCU)
2487{
2488        rcu_read_unlock();
2489}
2490
2491static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2492{
2493        unsigned int flags = 0;
2494
2495        if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
2496                flags = RTF_REJECT;
2497        if (fi && fi->fib_nh->nh_gw)
2498                flags |= RTF_GATEWAY;
2499        if (mask == htonl(0xFFFFFFFF))
2500                flags |= RTF_HOST;
2501        flags |= RTF_UP;
2502        return flags;
2503}
2504
2505/*
2506 *      This outputs /proc/net/route.
2507 *      The format of the file is not supposed to be changed
2508 *      and needs to be same as fib_hash output to avoid breaking
2509 *      legacy utilities
2510 */
2511static int fib_route_seq_show(struct seq_file *seq, void *v)
2512{
2513        struct leaf *l = v;
2514        struct leaf_info *li;
2515
2516        if (v == SEQ_START_TOKEN) {
2517                seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2518                           "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2519                           "\tWindow\tIRTT");
2520                return 0;
2521        }
2522
2523        hlist_for_each_entry_rcu(li, &l->list, hlist) {
2524                struct fib_alias *fa;
2525                __be32 mask, prefix;
2526
2527                mask = inet_make_mask(li->plen);
2528                prefix = htonl(l->key);
2529
2530                list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2531                        const struct fib_info *fi = fa->fa_info;
2532                        unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
2533                        int len;
2534
2535                        if (fa->fa_type == RTN_BROADCAST
2536                            || fa->fa_type == RTN_MULTICAST)
2537                                continue;
2538
2539                        if (fi)
2540                                seq_printf(seq,
2541                                         "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2542                                         "%d\t%08X\t%d\t%u\t%u%n",
2543                                         fi->fib_dev ? fi->fib_dev->name : "*",
2544                                         prefix,
2545                                         fi->fib_nh->nh_gw, flags, 0, 0,
2546                                         fi->fib_priority,
2547                                         mask,
2548                                         (fi->fib_advmss ?
2549                                          fi->fib_advmss + 40 : 0),
2550                                         fi->fib_window,
2551                                         fi->fib_rtt >> 3, &len);
2552                        else
2553                                seq_printf(seq,
2554                                         "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2555                                         "%d\t%08X\t%d\t%u\t%u%n",
2556                                         prefix, 0, flags, 0, 0, 0,
2557                                         mask, 0, 0, 0, &len);
2558
2559                        seq_printf(seq, "%*s\n", 127 - len, "");
2560                }
2561        }
2562
2563        return 0;
2564}
2565
2566static const struct seq_operations fib_route_seq_ops = {
2567        .start  = fib_route_seq_start,
2568        .next   = fib_route_seq_next,
2569        .stop   = fib_route_seq_stop,
2570        .show   = fib_route_seq_show,
2571};
2572
2573static int fib_route_seq_open(struct inode *inode, struct file *file)
2574{
2575        return seq_open_net(inode, file, &fib_route_seq_ops,
2576                            sizeof(struct fib_route_iter));
2577}
2578
2579static const struct file_operations fib_route_fops = {
2580        .owner  = THIS_MODULE,
2581        .open   = fib_route_seq_open,
2582        .read   = seq_read,
2583        .llseek = seq_lseek,
2584        .release = seq_release_net,
2585};
2586
2587int __net_init fib_proc_init(struct net *net)
2588{
2589        if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops))
2590                goto out1;
2591
2592        if (!proc_create("fib_triestat", S_IRUGO, net->proc_net,
2593                         &fib_triestat_fops))
2594                goto out2;
2595
2596        if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops))
2597                goto out3;
2598
2599        return 0;
2600
2601out3:
2602        remove_proc_entry("fib_triestat", net->proc_net);
2603out2:
2604        remove_proc_entry("fib_trie", net->proc_net);
2605out1:
2606        return -ENOMEM;
2607}
2608
2609void __net_exit fib_proc_exit(struct net *net)
2610{
2611        remove_proc_entry("fib_trie", net->proc_net);
2612        remove_proc_entry("fib_triestat", net->proc_net);
2613        remove_proc_entry("route", net->proc_net);
2614}
2615
2616#endif /* CONFIG_PROC_FS */
2617