linux/net/ipv4/fib_trie.c
<<
>>
Prefs
   1/*
   2 *   This program is free software; you can redistribute it and/or
   3 *   modify it under the terms of the GNU General Public License
   4 *   as published by the Free Software Foundation; either version
   5 *   2 of the License, or (at your option) any later version.
   6 *
   7 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
   8 *     & Swedish University of Agricultural Sciences.
   9 *
  10 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
  11 *     Agricultural Sciences.
  12 *
  13 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
  14 *
  15 * This work is based on the LPC-trie which is originally described in:
  16 *
  17 * An experimental study of compression methods for dynamic tries
  18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
  19 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
  20 *
  21 *
  22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
  23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
  24 *
  25 *
  26 * Code from fib_hash has been reused which includes the following header:
  27 *
  28 *
  29 * INET         An implementation of the TCP/IP protocol suite for the LINUX
  30 *              operating system.  INET is implemented using the  BSD Socket
  31 *              interface as the means of communication with the user level.
  32 *
  33 *              IPv4 FIB: lookup engine and maintenance routines.
  34 *
  35 *
  36 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  37 *
  38 *              This program is free software; you can redistribute it and/or
  39 *              modify it under the terms of the GNU General Public License
  40 *              as published by the Free Software Foundation; either version
  41 *              2 of the License, or (at your option) any later version.
  42 *
  43 * Substantial contributions to this work comes from:
  44 *
  45 *              David S. Miller, <davem@davemloft.net>
  46 *              Stephen Hemminger <shemminger@osdl.org>
  47 *              Paul E. McKenney <paulmck@us.ibm.com>
  48 *              Patrick McHardy <kaber@trash.net>
  49 */
  50
  51#define VERSION "0.409"
  52
  53#include <asm/uaccess.h>
  54#include <linux/bitops.h>
  55#include <linux/types.h>
  56#include <linux/kernel.h>
  57#include <linux/mm.h>
  58#include <linux/string.h>
  59#include <linux/socket.h>
  60#include <linux/sockios.h>
  61#include <linux/errno.h>
  62#include <linux/in.h>
  63#include <linux/inet.h>
  64#include <linux/inetdevice.h>
  65#include <linux/netdevice.h>
  66#include <linux/if_arp.h>
  67#include <linux/proc_fs.h>
  68#include <linux/rcupdate.h>
  69#include <linux/skbuff.h>
  70#include <linux/netlink.h>
  71#include <linux/init.h>
  72#include <linux/list.h>
  73#include <linux/slab.h>
  74#include <linux/prefetch.h>
  75#include <linux/export.h>
  76#include <net/net_namespace.h>
  77#include <net/ip.h>
  78#include <net/protocol.h>
  79#include <net/route.h>
  80#include <net/tcp.h>
  81#include <net/sock.h>
  82#include <net/ip_fib.h>
  83#include "fib_lookup.h"
  84
  85#define MAX_STAT_DEPTH 32
  86
  87#define KEYLENGTH (8*sizeof(t_key))
  88
  89typedef unsigned int t_key;
  90
  91#define T_TNODE 0
  92#define T_LEAF  1
  93#define NODE_TYPE_MASK  0x1UL
  94#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
  95
  96#define IS_TNODE(n) (!(n->parent & T_LEAF))
  97#define IS_LEAF(n) (n->parent & T_LEAF)
  98
  99struct rt_trie_node {
 100        unsigned long parent;
 101        t_key key;
 102};
 103
 104struct leaf {
 105        unsigned long parent;
 106        t_key key;
 107        struct hlist_head list;
 108        struct rcu_head rcu;
 109};
 110
 111struct leaf_info {
 112        struct hlist_node hlist;
 113        int plen;
 114        u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
 115        struct list_head falh;
 116        struct rcu_head rcu;
 117};
 118
 119struct tnode {
 120        unsigned long parent;
 121        t_key key;
 122        unsigned char pos;              /* 2log(KEYLENGTH) bits needed */
 123        unsigned char bits;             /* 2log(KEYLENGTH) bits needed */
 124        unsigned int full_children;     /* KEYLENGTH bits needed */
 125        unsigned int empty_children;    /* KEYLENGTH bits needed */
 126        union {
 127                struct rcu_head rcu;
 128                struct work_struct work;
 129                struct tnode *tnode_free;
 130        };
 131        struct rt_trie_node __rcu *child[0];
 132};
 133
 134#ifdef CONFIG_IP_FIB_TRIE_STATS
 135struct trie_use_stats {
 136        unsigned int gets;
 137        unsigned int backtrack;
 138        unsigned int semantic_match_passed;
 139        unsigned int semantic_match_miss;
 140        unsigned int null_node_hit;
 141        unsigned int resize_node_skipped;
 142};
 143#endif
 144
 145struct trie_stat {
 146        unsigned int totdepth;
 147        unsigned int maxdepth;
 148        unsigned int tnodes;
 149        unsigned int leaves;
 150        unsigned int nullpointers;
 151        unsigned int prefixes;
 152        unsigned int nodesizes[MAX_STAT_DEPTH];
 153};
 154
 155struct trie {
 156        struct rt_trie_node __rcu *trie;
 157#ifdef CONFIG_IP_FIB_TRIE_STATS
 158        struct trie_use_stats stats;
 159#endif
 160};
 161
 162static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
 163                                  int wasfull);
 164static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
 165static struct tnode *inflate(struct trie *t, struct tnode *tn);
 166static struct tnode *halve(struct trie *t, struct tnode *tn);
 167/* tnodes to free after resize(); protected by RTNL */
 168static struct tnode *tnode_free_head;
 169static size_t tnode_free_size;
 170
 171/*
 172 * synchronize_rcu after call_rcu for that many pages; it should be especially
 173 * useful before resizing the root node with PREEMPT_NONE configs; the value was
 174 * obtained experimentally, aiming to avoid visible slowdown.
 175 */
 176static const int sync_pages = 128;
 177
 178static struct kmem_cache *fn_alias_kmem __read_mostly;
 179static struct kmem_cache *trie_leaf_kmem __read_mostly;
 180
 181/*
 182 * caller must hold RTNL
 183 */
 184static inline struct tnode *node_parent(const struct rt_trie_node *node)
 185{
 186        unsigned long parent;
 187
 188        parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held());
 189
 190        return (struct tnode *)(parent & ~NODE_TYPE_MASK);
 191}
 192
 193/*
 194 * caller must hold RCU read lock or RTNL
 195 */
 196static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node)
 197{
 198        unsigned long parent;
 199
 200        parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() ||
 201                                                           lockdep_rtnl_is_held());
 202
 203        return (struct tnode *)(parent & ~NODE_TYPE_MASK);
 204}
 205
 206/* Same as rcu_assign_pointer
 207 * but that macro() assumes that value is a pointer.
 208 */
 209static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
 210{
 211        smp_wmb();
 212        node->parent = (unsigned long)ptr | NODE_TYPE(node);
 213}
 214
 215/*
 216 * caller must hold RTNL
 217 */
 218static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i)
 219{
 220        BUG_ON(i >= 1U << tn->bits);
 221
 222        return rtnl_dereference(tn->child[i]);
 223}
 224
 225/*
 226 * caller must hold RCU read lock or RTNL
 227 */
 228static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
 229{
 230        BUG_ON(i >= 1U << tn->bits);
 231
 232        return rcu_dereference_rtnl(tn->child[i]);
 233}
 234
 235static inline int tnode_child_length(const struct tnode *tn)
 236{
 237        return 1 << tn->bits;
 238}
 239
 240static inline t_key mask_pfx(t_key k, unsigned int l)
 241{
 242        return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
 243}
 244
 245static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
 246{
 247        if (offset < KEYLENGTH)
 248                return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
 249        else
 250                return 0;
 251}
 252
 253static inline int tkey_equals(t_key a, t_key b)
 254{
 255        return a == b;
 256}
 257
 258static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
 259{
 260        if (bits == 0 || offset >= KEYLENGTH)
 261                return 1;
 262        bits = bits > KEYLENGTH ? KEYLENGTH : bits;
 263        return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
 264}
 265
 266static inline int tkey_mismatch(t_key a, int offset, t_key b)
 267{
 268        t_key diff = a ^ b;
 269        int i = offset;
 270
 271        if (!diff)
 272                return 0;
 273        while ((diff << i) >> (KEYLENGTH-1) == 0)
 274                i++;
 275        return i;
 276}
 277
 278/*
 279  To understand this stuff, an understanding of keys and all their bits is
 280  necessary. Every node in the trie has a key associated with it, but not
 281  all of the bits in that key are significant.
 282
 283  Consider a node 'n' and its parent 'tp'.
 284
 285  If n is a leaf, every bit in its key is significant. Its presence is
 286  necessitated by path compression, since during a tree traversal (when
 287  searching for a leaf - unless we are doing an insertion) we will completely
 288  ignore all skipped bits we encounter. Thus we need to verify, at the end of
 289  a potentially successful search, that we have indeed been walking the
 290  correct key path.
 291
 292  Note that we can never "miss" the correct key in the tree if present by
 293  following the wrong path. Path compression ensures that segments of the key
 294  that are the same for all keys with a given prefix are skipped, but the
 295  skipped part *is* identical for each node in the subtrie below the skipped
 296  bit! trie_insert() in this implementation takes care of that - note the
 297  call to tkey_sub_equals() in trie_insert().
 298
 299  if n is an internal node - a 'tnode' here, the various parts of its key
 300  have many different meanings.
 301
 302  Example:
 303  _________________________________________________________________
 304  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
 305  -----------------------------------------------------------------
 306    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
 307
 308  _________________________________________________________________
 309  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
 310  -----------------------------------------------------------------
 311   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
 312
 313  tp->pos = 7
 314  tp->bits = 3
 315  n->pos = 15
 316  n->bits = 4
 317
 318  First, let's just ignore the bits that come before the parent tp, that is
 319  the bits from 0 to (tp->pos-1). They are *known* but at this point we do
 320  not use them for anything.
 321
 322  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
 323  index into the parent's child array. That is, they will be used to find
 324  'n' among tp's children.
 325
 326  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
 327  for the node n.
 328
 329  All the bits we have seen so far are significant to the node n. The rest
 330  of the bits are really not needed or indeed known in n->key.
 331
 332  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
 333  n's child array, and will of course be different for each child.
 334
 335
 336  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
 337  at this point.
 338
 339*/
 340
 341static inline void check_tnode(const struct tnode *tn)
 342{
 343        WARN_ON(tn && tn->pos+tn->bits > 32);
 344}
 345
 346static const int halve_threshold = 25;
 347static const int inflate_threshold = 50;
 348static const int halve_threshold_root = 15;
 349static const int inflate_threshold_root = 30;
 350
 351static void __alias_free_mem(struct rcu_head *head)
 352{
 353        struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
 354        kmem_cache_free(fn_alias_kmem, fa);
 355}
 356
 357static inline void alias_free_mem_rcu(struct fib_alias *fa)
 358{
 359        call_rcu(&fa->rcu, __alias_free_mem);
 360}
 361
 362static void __leaf_free_rcu(struct rcu_head *head)
 363{
 364        struct leaf *l = container_of(head, struct leaf, rcu);
 365        kmem_cache_free(trie_leaf_kmem, l);
 366}
 367
 368static inline void free_leaf(struct leaf *l)
 369{
 370        call_rcu(&l->rcu, __leaf_free_rcu);
 371}
 372
 373static inline void free_leaf_info(struct leaf_info *leaf)
 374{
 375        kfree_rcu(leaf, rcu);
 376}
 377
 378static struct tnode *tnode_alloc(size_t size)
 379{
 380        if (size <= PAGE_SIZE)
 381                return kzalloc(size, GFP_KERNEL);
 382        else
 383                return vzalloc(size);
 384}
 385
 386static void __tnode_vfree(struct work_struct *arg)
 387{
 388        struct tnode *tn = container_of(arg, struct tnode, work);
 389        vfree(tn);
 390}
 391
 392static void __tnode_free_rcu(struct rcu_head *head)
 393{
 394        struct tnode *tn = container_of(head, struct tnode, rcu);
 395        size_t size = sizeof(struct tnode) +
 396                      (sizeof(struct rt_trie_node *) << tn->bits);
 397
 398        if (size <= PAGE_SIZE)
 399                kfree(tn);
 400        else {
 401                INIT_WORK(&tn->work, __tnode_vfree);
 402                schedule_work(&tn->work);
 403        }
 404}
 405
 406static inline void tnode_free(struct tnode *tn)
 407{
 408        if (IS_LEAF(tn))
 409                free_leaf((struct leaf *) tn);
 410        else
 411                call_rcu(&tn->rcu, __tnode_free_rcu);
 412}
 413
 414static void tnode_free_safe(struct tnode *tn)
 415{
 416        BUG_ON(IS_LEAF(tn));
 417        tn->tnode_free = tnode_free_head;
 418        tnode_free_head = tn;
 419        tnode_free_size += sizeof(struct tnode) +
 420                           (sizeof(struct rt_trie_node *) << tn->bits);
 421}
 422
 423static void tnode_free_flush(void)
 424{
 425        struct tnode *tn;
 426
 427        while ((tn = tnode_free_head)) {
 428                tnode_free_head = tn->tnode_free;
 429                tn->tnode_free = NULL;
 430                tnode_free(tn);
 431        }
 432
 433        if (tnode_free_size >= PAGE_SIZE * sync_pages) {
 434                tnode_free_size = 0;
 435                synchronize_rcu();
 436        }
 437}
 438
 439static struct leaf *leaf_new(void)
 440{
 441        struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
 442        if (l) {
 443                l->parent = T_LEAF;
 444                INIT_HLIST_HEAD(&l->list);
 445        }
 446        return l;
 447}
 448
 449static struct leaf_info *leaf_info_new(int plen)
 450{
 451        struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
 452        if (li) {
 453                li->plen = plen;
 454                li->mask_plen = ntohl(inet_make_mask(plen));
 455                INIT_LIST_HEAD(&li->falh);
 456        }
 457        return li;
 458}
 459
 460static struct tnode *tnode_new(t_key key, int pos, int bits)
 461{
 462        size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
 463        struct tnode *tn = tnode_alloc(sz);
 464
 465        if (tn) {
 466                tn->parent = T_TNODE;
 467                tn->pos = pos;
 468                tn->bits = bits;
 469                tn->key = key;
 470                tn->full_children = 0;
 471                tn->empty_children = 1<<bits;
 472        }
 473
 474        pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
 475                 sizeof(struct rt_trie_node *) << bits);
 476        return tn;
 477}
 478
 479/*
 480 * Check whether a tnode 'n' is "full", i.e. it is an internal node
 481 * and no bits are skipped. See discussion in dyntree paper p. 6
 482 */
 483
 484static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
 485{
 486        if (n == NULL || IS_LEAF(n))
 487                return 0;
 488
 489        return ((struct tnode *) n)->pos == tn->pos + tn->bits;
 490}
 491
 492static inline void put_child(struct tnode *tn, int i,
 493                             struct rt_trie_node *n)
 494{
 495        tnode_put_child_reorg(tn, i, n, -1);
 496}
 497
 498 /*
 499  * Add a child at position i overwriting the old value.
 500  * Update the value of full_children and empty_children.
 501  */
 502
 503static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
 504                                  int wasfull)
 505{
 506        struct rt_trie_node *chi = rtnl_dereference(tn->child[i]);
 507        int isfull;
 508
 509        BUG_ON(i >= 1<<tn->bits);
 510
 511        /* update emptyChildren */
 512        if (n == NULL && chi != NULL)
 513                tn->empty_children++;
 514        else if (n != NULL && chi == NULL)
 515                tn->empty_children--;
 516
 517        /* update fullChildren */
 518        if (wasfull == -1)
 519                wasfull = tnode_full(tn, chi);
 520
 521        isfull = tnode_full(tn, n);
 522        if (wasfull && !isfull)
 523                tn->full_children--;
 524        else if (!wasfull && isfull)
 525                tn->full_children++;
 526
 527        if (n)
 528                node_set_parent(n, tn);
 529
 530        rcu_assign_pointer(tn->child[i], n);
 531}
 532
 533#define MAX_WORK 10
 534static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
 535{
 536        int i;
 537        struct tnode *old_tn;
 538        int inflate_threshold_use;
 539        int halve_threshold_use;
 540        int max_work;
 541
 542        if (!tn)
 543                return NULL;
 544
 545        pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
 546                 tn, inflate_threshold, halve_threshold);
 547
 548        /* No children */
 549        if (tn->empty_children == tnode_child_length(tn)) {
 550                tnode_free_safe(tn);
 551                return NULL;
 552        }
 553        /* One child */
 554        if (tn->empty_children == tnode_child_length(tn) - 1)
 555                goto one_child;
 556        /*
 557         * Double as long as the resulting node has a number of
 558         * nonempty nodes that are above the threshold.
 559         */
 560
 561        /*
 562         * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
 563         * the Helsinki University of Technology and Matti Tikkanen of Nokia
 564         * Telecommunications, page 6:
 565         * "A node is doubled if the ratio of non-empty children to all
 566         * children in the *doubled* node is at least 'high'."
 567         *
 568         * 'high' in this instance is the variable 'inflate_threshold'. It
 569         * is expressed as a percentage, so we multiply it with
 570         * tnode_child_length() and instead of multiplying by 2 (since the
 571         * child array will be doubled by inflate()) and multiplying
 572         * the left-hand side by 100 (to handle the percentage thing) we
 573         * multiply the left-hand side by 50.
 574         *
 575         * The left-hand side may look a bit weird: tnode_child_length(tn)
 576         * - tn->empty_children is of course the number of non-null children
 577         * in the current node. tn->full_children is the number of "full"
 578         * children, that is non-null tnodes with a skip value of 0.
 579         * All of those will be doubled in the resulting inflated tnode, so
 580         * we just count them one extra time here.
 581         *
 582         * A clearer way to write this would be:
 583         *
 584         * to_be_doubled = tn->full_children;
 585         * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
 586         *     tn->full_children;
 587         *
 588         * new_child_length = tnode_child_length(tn) * 2;
 589         *
 590         * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
 591         *      new_child_length;
 592         * if (new_fill_factor >= inflate_threshold)
 593         *
 594         * ...and so on, tho it would mess up the while () loop.
 595         *
 596         * anyway,
 597         * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
 598         *      inflate_threshold
 599         *
 600         * avoid a division:
 601         * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
 602         *      inflate_threshold * new_child_length
 603         *
 604         * expand not_to_be_doubled and to_be_doubled, and shorten:
 605         * 100 * (tnode_child_length(tn) - tn->empty_children +
 606         *    tn->full_children) >= inflate_threshold * new_child_length
 607         *
 608         * expand new_child_length:
 609         * 100 * (tnode_child_length(tn) - tn->empty_children +
 610         *    tn->full_children) >=
 611         *      inflate_threshold * tnode_child_length(tn) * 2
 612         *
 613         * shorten again:
 614         * 50 * (tn->full_children + tnode_child_length(tn) -
 615         *    tn->empty_children) >= inflate_threshold *
 616         *    tnode_child_length(tn)
 617         *
 618         */
 619
 620        check_tnode(tn);
 621
 622        /* Keep root node larger  */
 623
 624        if (!node_parent((struct rt_trie_node *)tn)) {
 625                inflate_threshold_use = inflate_threshold_root;
 626                halve_threshold_use = halve_threshold_root;
 627        } else {
 628                inflate_threshold_use = inflate_threshold;
 629                halve_threshold_use = halve_threshold;
 630        }
 631
 632        max_work = MAX_WORK;
 633        while ((tn->full_children > 0 &&  max_work-- &&
 634                50 * (tn->full_children + tnode_child_length(tn)
 635                      - tn->empty_children)
 636                >= inflate_threshold_use * tnode_child_length(tn))) {
 637
 638                old_tn = tn;
 639                tn = inflate(t, tn);
 640
 641                if (IS_ERR(tn)) {
 642                        tn = old_tn;
 643#ifdef CONFIG_IP_FIB_TRIE_STATS
 644                        t->stats.resize_node_skipped++;
 645#endif
 646                        break;
 647                }
 648        }
 649
 650        check_tnode(tn);
 651
 652        /* Return if at least one inflate is run */
 653        if (max_work != MAX_WORK)
 654                return (struct rt_trie_node *) tn;
 655
 656        /*
 657         * Halve as long as the number of empty children in this
 658         * node is above threshold.
 659         */
 660
 661        max_work = MAX_WORK;
 662        while (tn->bits > 1 &&  max_work-- &&
 663               100 * (tnode_child_length(tn) - tn->empty_children) <
 664               halve_threshold_use * tnode_child_length(tn)) {
 665
 666                old_tn = tn;
 667                tn = halve(t, tn);
 668                if (IS_ERR(tn)) {
 669                        tn = old_tn;
 670#ifdef CONFIG_IP_FIB_TRIE_STATS
 671                        t->stats.resize_node_skipped++;
 672#endif
 673                        break;
 674                }
 675        }
 676
 677
 678        /* Only one child remains */
 679        if (tn->empty_children == tnode_child_length(tn) - 1) {
 680one_child:
 681                for (i = 0; i < tnode_child_length(tn); i++) {
 682                        struct rt_trie_node *n;
 683
 684                        n = rtnl_dereference(tn->child[i]);
 685                        if (!n)
 686                                continue;
 687
 688                        /* compress one level */
 689
 690                        node_set_parent(n, NULL);
 691                        tnode_free_safe(tn);
 692                        return n;
 693                }
 694        }
 695        return (struct rt_trie_node *) tn;
 696}
 697
 698
 699static void tnode_clean_free(struct tnode *tn)
 700{
 701        int i;
 702        struct tnode *tofree;
 703
 704        for (i = 0; i < tnode_child_length(tn); i++) {
 705                tofree = (struct tnode *)rtnl_dereference(tn->child[i]);
 706                if (tofree)
 707                        tnode_free(tofree);
 708        }
 709        tnode_free(tn);
 710}
 711
 712static struct tnode *inflate(struct trie *t, struct tnode *tn)
 713{
 714        struct tnode *oldtnode = tn;
 715        int olen = tnode_child_length(tn);
 716        int i;
 717
 718        pr_debug("In inflate\n");
 719
 720        tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
 721
 722        if (!tn)
 723                return ERR_PTR(-ENOMEM);
 724
 725        /*
 726         * Preallocate and store tnodes before the actual work so we
 727         * don't get into an inconsistent state if memory allocation
 728         * fails. In case of failure we return the oldnode and  inflate
 729         * of tnode is ignored.
 730         */
 731
 732        for (i = 0; i < olen; i++) {
 733                struct tnode *inode;
 734
 735                inode = (struct tnode *) tnode_get_child(oldtnode, i);
 736                if (inode &&
 737                    IS_TNODE(inode) &&
 738                    inode->pos == oldtnode->pos + oldtnode->bits &&
 739                    inode->bits > 1) {
 740                        struct tnode *left, *right;
 741                        t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
 742
 743                        left = tnode_new(inode->key&(~m), inode->pos + 1,
 744                                         inode->bits - 1);
 745                        if (!left)
 746                                goto nomem;
 747
 748                        right = tnode_new(inode->key|m, inode->pos + 1,
 749                                          inode->bits - 1);
 750
 751                        if (!right) {
 752                                tnode_free(left);
 753                                goto nomem;
 754                        }
 755
 756                        put_child(tn, 2*i, (struct rt_trie_node *) left);
 757                        put_child(tn, 2*i+1, (struct rt_trie_node *) right);
 758                }
 759        }
 760
 761        for (i = 0; i < olen; i++) {
 762                struct tnode *inode;
 763                struct rt_trie_node *node = tnode_get_child(oldtnode, i);
 764                struct tnode *left, *right;
 765                int size, j;
 766
 767                /* An empty child */
 768                if (node == NULL)
 769                        continue;
 770
 771                /* A leaf or an internal node with skipped bits */
 772
 773                if (IS_LEAF(node) || ((struct tnode *) node)->pos >
 774                   tn->pos + tn->bits - 1) {
 775                        if (tkey_extract_bits(node->key,
 776                                              oldtnode->pos + oldtnode->bits,
 777                                              1) == 0)
 778                                put_child(tn, 2*i, node);
 779                        else
 780                                put_child(tn, 2*i+1, node);
 781                        continue;
 782                }
 783
 784                /* An internal node with two children */
 785                inode = (struct tnode *) node;
 786
 787                if (inode->bits == 1) {
 788                        put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
 789                        put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));
 790
 791                        tnode_free_safe(inode);
 792                        continue;
 793                }
 794
 795                /* An internal node with more than two children */
 796
 797                /* We will replace this node 'inode' with two new
 798                 * ones, 'left' and 'right', each with half of the
 799                 * original children. The two new nodes will have
 800                 * a position one bit further down the key and this
 801                 * means that the "significant" part of their keys
 802                 * (see the discussion near the top of this file)
 803                 * will differ by one bit, which will be "0" in
 804                 * left's key and "1" in right's key. Since we are
 805                 * moving the key position by one step, the bit that
 806                 * we are moving away from - the bit at position
 807                 * (inode->pos) - is the one that will differ between
 808                 * left and right. So... we synthesize that bit in the
 809                 * two  new keys.
 810                 * The mask 'm' below will be a single "one" bit at
 811                 * the position (inode->pos)
 812                 */
 813
 814                /* Use the old key, but set the new significant
 815                 *   bit to zero.
 816                 */
 817
 818                left = (struct tnode *) tnode_get_child(tn, 2*i);
 819                put_child(tn, 2*i, NULL);
 820
 821                BUG_ON(!left);
 822
 823                right = (struct tnode *) tnode_get_child(tn, 2*i+1);
 824                put_child(tn, 2*i+1, NULL);
 825
 826                BUG_ON(!right);
 827
 828                size = tnode_child_length(left);
 829                for (j = 0; j < size; j++) {
 830                        put_child(left, j, rtnl_dereference(inode->child[j]));
 831                        put_child(right, j, rtnl_dereference(inode->child[j + size]));
 832                }
 833                put_child(tn, 2*i, resize(t, left));
 834                put_child(tn, 2*i+1, resize(t, right));
 835
 836                tnode_free_safe(inode);
 837        }
 838        tnode_free_safe(oldtnode);
 839        return tn;
 840nomem:
 841        tnode_clean_free(tn);
 842        return ERR_PTR(-ENOMEM);
 843}
 844
 845static struct tnode *halve(struct trie *t, struct tnode *tn)
 846{
 847        struct tnode *oldtnode = tn;
 848        struct rt_trie_node *left, *right;
 849        int i;
 850        int olen = tnode_child_length(tn);
 851
 852        pr_debug("In halve\n");
 853
 854        tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
 855
 856        if (!tn)
 857                return ERR_PTR(-ENOMEM);
 858
 859        /*
 860         * Preallocate and store tnodes before the actual work so we
 861         * don't get into an inconsistent state if memory allocation
 862         * fails. In case of failure we return the oldnode and halve
 863         * of tnode is ignored.
 864         */
 865
 866        for (i = 0; i < olen; i += 2) {
 867                left = tnode_get_child(oldtnode, i);
 868                right = tnode_get_child(oldtnode, i+1);
 869
 870                /* Two nonempty children */
 871                if (left && right) {
 872                        struct tnode *newn;
 873
 874                        newn = tnode_new(left->key, tn->pos + tn->bits, 1);
 875
 876                        if (!newn)
 877                                goto nomem;
 878
 879                        put_child(tn, i/2, (struct rt_trie_node *)newn);
 880                }
 881
 882        }
 883
 884        for (i = 0; i < olen; i += 2) {
 885                struct tnode *newBinNode;
 886
 887                left = tnode_get_child(oldtnode, i);
 888                right = tnode_get_child(oldtnode, i+1);
 889
 890                /* At least one of the children is empty */
 891                if (left == NULL) {
 892                        if (right == NULL)    /* Both are empty */
 893                                continue;
 894                        put_child(tn, i/2, right);
 895                        continue;
 896                }
 897
 898                if (right == NULL) {
 899                        put_child(tn, i/2, left);
 900                        continue;
 901                }
 902
 903                /* Two nonempty children */
 904                newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
 905                put_child(tn, i/2, NULL);
 906                put_child(newBinNode, 0, left);
 907                put_child(newBinNode, 1, right);
 908                put_child(tn, i/2, resize(t, newBinNode));
 909        }
 910        tnode_free_safe(oldtnode);
 911        return tn;
 912nomem:
 913        tnode_clean_free(tn);
 914        return ERR_PTR(-ENOMEM);
 915}
 916
 917/* readside must use rcu_read_lock currently dump routines
 918 via get_fa_head and dump */
 919
 920static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
 921{
 922        struct hlist_head *head = &l->list;
 923        struct leaf_info *li;
 924
 925        hlist_for_each_entry_rcu(li, head, hlist)
 926                if (li->plen == plen)
 927                        return li;
 928
 929        return NULL;
 930}
 931
 932static inline struct list_head *get_fa_head(struct leaf *l, int plen)
 933{
 934        struct leaf_info *li = find_leaf_info(l, plen);
 935
 936        if (!li)
 937                return NULL;
 938
 939        return &li->falh;
 940}
 941
 942static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
 943{
 944        struct leaf_info *li = NULL, *last = NULL;
 945
 946        if (hlist_empty(head)) {
 947                hlist_add_head_rcu(&new->hlist, head);
 948        } else {
 949                hlist_for_each_entry(li, head, hlist) {
 950                        if (new->plen > li->plen)
 951                                break;
 952
 953                        last = li;
 954                }
 955                if (last)
 956                        hlist_add_after_rcu(&last->hlist, &new->hlist);
 957                else
 958                        hlist_add_before_rcu(&new->hlist, &li->hlist);
 959        }
 960}
 961
 962/* rcu_read_lock needs to be hold by caller from readside */
 963
 964static struct leaf *
 965fib_find_node(struct trie *t, u32 key)
 966{
 967        int pos;
 968        struct tnode *tn;
 969        struct rt_trie_node *n;
 970
 971        pos = 0;
 972        n = rcu_dereference_rtnl(t->trie);
 973
 974        while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
 975                tn = (struct tnode *) n;
 976
 977                check_tnode(tn);
 978
 979                if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
 980                        pos = tn->pos + tn->bits;
 981                        n = tnode_get_child_rcu(tn,
 982                                                tkey_extract_bits(key,
 983                                                                  tn->pos,
 984                                                                  tn->bits));
 985                } else
 986                        break;
 987        }
 988        /* Case we have found a leaf. Compare prefixes */
 989
 990        if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
 991                return (struct leaf *)n;
 992
 993        return NULL;
 994}
 995
 996static void trie_rebalance(struct trie *t, struct tnode *tn)
 997{
 998        int wasfull;
 999        t_key cindex, key;
1000        struct tnode *tp;
1001
1002        key = tn->key;
1003
1004        while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
1005                cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1006                wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
1007                tn = (struct tnode *)resize(t, tn);
1008
1009                tnode_put_child_reorg(tp, cindex,
1010                                      (struct rt_trie_node *)tn, wasfull);
1011
1012                tp = node_parent((struct rt_trie_node *) tn);
1013                if (!tp)
1014                        rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1015
1016                tnode_free_flush();
1017                if (!tp)
1018                        break;
1019                tn = tp;
1020        }
1021
1022        /* Handle last (top) tnode */
1023        if (IS_TNODE(tn))
1024                tn = (struct tnode *)resize(t, tn);
1025
1026        rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1027        tnode_free_flush();
1028}
1029
1030/* only used from updater-side */
1031
1032static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1033{
1034        int pos, newpos;
1035        struct tnode *tp = NULL, *tn = NULL;
1036        struct rt_trie_node *n;
1037        struct leaf *l;
1038        int missbit;
1039        struct list_head *fa_head = NULL;
1040        struct leaf_info *li;
1041        t_key cindex;
1042
1043        pos = 0;
1044        n = rtnl_dereference(t->trie);
1045
1046        /* If we point to NULL, stop. Either the tree is empty and we should
1047         * just put a new leaf in if, or we have reached an empty child slot,
1048         * and we should just put our new leaf in that.
1049         * If we point to a T_TNODE, check if it matches our key. Note that
1050         * a T_TNODE might be skipping any number of bits - its 'pos' need
1051         * not be the parent's 'pos'+'bits'!
1052         *
1053         * If it does match the current key, get pos/bits from it, extract
1054         * the index from our key, push the T_TNODE and walk the tree.
1055         *
1056         * If it doesn't, we have to replace it with a new T_TNODE.
1057         *
1058         * If we point to a T_LEAF, it might or might not have the same key
1059         * as we do. If it does, just change the value, update the T_LEAF's
1060         * value, and return it.
1061         * If it doesn't, we need to replace it with a T_TNODE.
1062         */
1063
1064        while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
1065                tn = (struct tnode *) n;
1066
1067                check_tnode(tn);
1068
1069                if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1070                        tp = tn;
1071                        pos = tn->pos + tn->bits;
1072                        n = tnode_get_child(tn,
1073                                            tkey_extract_bits(key,
1074                                                              tn->pos,
1075                                                              tn->bits));
1076
1077                        BUG_ON(n && node_parent(n) != tn);
1078                } else
1079                        break;
1080        }
1081
1082        /*
1083         * n  ----> NULL, LEAF or TNODE
1084         *
1085         * tp is n's (parent) ----> NULL or TNODE
1086         */
1087
1088        BUG_ON(tp && IS_LEAF(tp));
1089
1090        /* Case 1: n is a leaf. Compare prefixes */
1091
1092        if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1093                l = (struct leaf *) n;
1094                li = leaf_info_new(plen);
1095
1096                if (!li)
1097                        return NULL;
1098
1099                fa_head = &li->falh;
1100                insert_leaf_info(&l->list, li);
1101                goto done;
1102        }
1103        l = leaf_new();
1104
1105        if (!l)
1106                return NULL;
1107
1108        l->key = key;
1109        li = leaf_info_new(plen);
1110
1111        if (!li) {
1112                free_leaf(l);
1113                return NULL;
1114        }
1115
1116        fa_head = &li->falh;
1117        insert_leaf_info(&l->list, li);
1118
1119        if (t->trie && n == NULL) {
1120                /* Case 2: n is NULL, and will just insert a new leaf */
1121
1122                node_set_parent((struct rt_trie_node *)l, tp);
1123
1124                cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1125                put_child(tp, cindex, (struct rt_trie_node *)l);
1126        } else {
1127                /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1128                /*
1129                 *  Add a new tnode here
1130                 *  first tnode need some special handling
1131                 */
1132
1133                if (tp)
1134                        pos = tp->pos+tp->bits;
1135                else
1136                        pos = 0;
1137
1138                if (n) {
1139                        newpos = tkey_mismatch(key, pos, n->key);
1140                        tn = tnode_new(n->key, newpos, 1);
1141                } else {
1142                        newpos = 0;
1143                        tn = tnode_new(key, newpos, 1); /* First tnode */
1144                }
1145
1146                if (!tn) {
1147                        free_leaf_info(li);
1148                        free_leaf(l);
1149                        return NULL;
1150                }
1151
1152                node_set_parent((struct rt_trie_node *)tn, tp);
1153
1154                missbit = tkey_extract_bits(key, newpos, 1);
1155                put_child(tn, missbit, (struct rt_trie_node *)l);
1156                put_child(tn, 1-missbit, n);
1157
1158                if (tp) {
1159                        cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1160                        put_child(tp, cindex, (struct rt_trie_node *)tn);
1161                } else {
1162                        rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1163                        tp = tn;
1164                }
1165        }
1166
1167        if (tp && tp->pos + tp->bits > 32)
1168                pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1169                        tp, tp->pos, tp->bits, key, plen);
1170
1171        /* Rebalance the trie */
1172
1173        trie_rebalance(t, tp);
1174done:
1175        return fa_head;
1176}
1177
1178/*
1179 * Caller must hold RTNL.
1180 */
1181int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1182{
1183        struct trie *t = (struct trie *) tb->tb_data;
1184        struct fib_alias *fa, *new_fa;
1185        struct list_head *fa_head = NULL;
1186        struct fib_info *fi;
1187        int plen = cfg->fc_dst_len;
1188        u8 tos = cfg->fc_tos;
1189        u32 key, mask;
1190        int err;
1191        struct leaf *l;
1192
1193        if (plen > 32)
1194                return -EINVAL;
1195
1196        key = ntohl(cfg->fc_dst);
1197
1198        pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1199
1200        mask = ntohl(inet_make_mask(plen));
1201
1202        if (key & ~mask)
1203                return -EINVAL;
1204
1205        key = key & mask;
1206
1207        fi = fib_create_info(cfg);
1208        if (IS_ERR(fi)) {
1209                err = PTR_ERR(fi);
1210                goto err;
1211        }
1212
1213        l = fib_find_node(t, key);
1214        fa = NULL;
1215
1216        if (l) {
1217                fa_head = get_fa_head(l, plen);
1218                fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1219        }
1220
1221        /* Now fa, if non-NULL, points to the first fib alias
1222         * with the same keys [prefix,tos,priority], if such key already
1223         * exists or to the node before which we will insert new one.
1224         *
1225         * If fa is NULL, we will need to allocate a new one and
1226         * insert to the head of f.
1227         *
1228         * If f is NULL, no fib node matched the destination key
1229         * and we need to allocate a new one of those as well.
1230         */
1231
1232        if (fa && fa->fa_tos == tos &&
1233            fa->fa_info->fib_priority == fi->fib_priority) {
1234                struct fib_alias *fa_first, *fa_match;
1235
1236                err = -EEXIST;
1237                if (cfg->fc_nlflags & NLM_F_EXCL)
1238                        goto out;
1239
1240                /* We have 2 goals:
1241                 * 1. Find exact match for type, scope, fib_info to avoid
1242                 * duplicate routes
1243                 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1244                 */
1245                fa_match = NULL;
1246                fa_first = fa;
1247                fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1248                list_for_each_entry_continue(fa, fa_head, fa_list) {
1249                        if (fa->fa_tos != tos)
1250                                break;
1251                        if (fa->fa_info->fib_priority != fi->fib_priority)
1252                                break;
1253                        if (fa->fa_type == cfg->fc_type &&
1254                            fa->fa_info == fi) {
1255                                fa_match = fa;
1256                                break;
1257                        }
1258                }
1259
1260                if (cfg->fc_nlflags & NLM_F_REPLACE) {
1261                        struct fib_info *fi_drop;
1262                        u8 state;
1263
1264                        fa = fa_first;
1265                        if (fa_match) {
1266                                if (fa == fa_match)
1267                                        err = 0;
1268                                goto out;
1269                        }
1270                        err = -ENOBUFS;
1271                        new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1272                        if (new_fa == NULL)
1273                                goto out;
1274
1275                        fi_drop = fa->fa_info;
1276                        new_fa->fa_tos = fa->fa_tos;
1277                        new_fa->fa_info = fi;
1278                        new_fa->fa_type = cfg->fc_type;
1279                        state = fa->fa_state;
1280                        new_fa->fa_state = state & ~FA_S_ACCESSED;
1281
1282                        list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1283                        alias_free_mem_rcu(fa);
1284
1285                        fib_release_info(fi_drop);
1286                        if (state & FA_S_ACCESSED)
1287                                rt_cache_flush(cfg->fc_nlinfo.nl_net);
1288                        rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1289                                tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1290
1291                        goto succeeded;
1292                }
1293                /* Error if we find a perfect match which
1294                 * uses the same scope, type, and nexthop
1295                 * information.
1296                 */
1297                if (fa_match)
1298                        goto out;
1299
1300                if (!(cfg->fc_nlflags & NLM_F_APPEND))
1301                        fa = fa_first;
1302        }
1303        err = -ENOENT;
1304        if (!(cfg->fc_nlflags & NLM_F_CREATE))
1305                goto out;
1306
1307        err = -ENOBUFS;
1308        new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1309        if (new_fa == NULL)
1310                goto out;
1311
1312        new_fa->fa_info = fi;
1313        new_fa->fa_tos = tos;
1314        new_fa->fa_type = cfg->fc_type;
1315        new_fa->fa_state = 0;
1316        /*
1317         * Insert new entry to the list.
1318         */
1319
1320        if (!fa_head) {
1321                fa_head = fib_insert_node(t, key, plen);
1322                if (unlikely(!fa_head)) {
1323                        err = -ENOMEM;
1324                        goto out_free_new_fa;
1325                }
1326        }
1327
1328        if (!plen)
1329                tb->tb_num_default++;
1330
1331        list_add_tail_rcu(&new_fa->fa_list,
1332                          (fa ? &fa->fa_list : fa_head));
1333
1334        rt_cache_flush(cfg->fc_nlinfo.nl_net);
1335        rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1336                  &cfg->fc_nlinfo, 0);
1337succeeded:
1338        return 0;
1339
1340out_free_new_fa:
1341        kmem_cache_free(fn_alias_kmem, new_fa);
1342out:
1343        fib_release_info(fi);
1344err:
1345        return err;
1346}
1347
1348/* should be called with rcu_read_lock */
1349static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
1350                      t_key key,  const struct flowi4 *flp,
1351                      struct fib_result *res, int fib_flags)
1352{
1353        struct leaf_info *li;
1354        struct hlist_head *hhead = &l->list;
1355
1356        hlist_for_each_entry_rcu(li, hhead, hlist) {
1357                struct fib_alias *fa;
1358
1359                if (l->key != (key & li->mask_plen))
1360                        continue;
1361
1362                list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1363                        struct fib_info *fi = fa->fa_info;
1364                        int nhsel, err;
1365
1366                        if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
1367                                continue;
1368                        if (fi->fib_dead)
1369                                continue;
1370                        if (fa->fa_info->fib_scope < flp->flowi4_scope)
1371                                continue;
1372                        fib_alias_accessed(fa);
1373                        err = fib_props[fa->fa_type].error;
1374                        if (err) {
1375#ifdef CONFIG_IP_FIB_TRIE_STATS
1376                                t->stats.semantic_match_passed++;
1377#endif
1378                                return err;
1379                        }
1380                        if (fi->fib_flags & RTNH_F_DEAD)
1381                                continue;
1382                        for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
1383                                const struct fib_nh *nh = &fi->fib_nh[nhsel];
1384
1385                                if (nh->nh_flags & RTNH_F_DEAD)
1386                                        continue;
1387                                if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
1388                                        continue;
1389
1390#ifdef CONFIG_IP_FIB_TRIE_STATS
1391                                t->stats.semantic_match_passed++;
1392#endif
1393                                res->prefixlen = li->plen;
1394                                res->nh_sel = nhsel;
1395                                res->type = fa->fa_type;
1396                                res->scope = fa->fa_info->fib_scope;
1397                                res->fi = fi;
1398                                res->table = tb;
1399                                res->fa_head = &li->falh;
1400                                if (!(fib_flags & FIB_LOOKUP_NOREF))
1401                                        atomic_inc(&fi->fib_clntref);
1402                                return 0;
1403                        }
1404                }
1405
1406#ifdef CONFIG_IP_FIB_TRIE_STATS
1407                t->stats.semantic_match_miss++;
1408#endif
1409        }
1410
1411        return 1;
1412}
1413
1414int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
1415                     struct fib_result *res, int fib_flags)
1416{
1417        struct trie *t = (struct trie *) tb->tb_data;
1418        int ret;
1419        struct rt_trie_node *n;
1420        struct tnode *pn;
1421        unsigned int pos, bits;
1422        t_key key = ntohl(flp->daddr);
1423        unsigned int chopped_off;
1424        t_key cindex = 0;
1425        unsigned int current_prefix_length = KEYLENGTH;
1426        struct tnode *cn;
1427        t_key pref_mismatch;
1428
1429        rcu_read_lock();
1430
1431        n = rcu_dereference(t->trie);
1432        if (!n)
1433                goto failed;
1434
1435#ifdef CONFIG_IP_FIB_TRIE_STATS
1436        t->stats.gets++;
1437#endif
1438
1439        /* Just a leaf? */
1440        if (IS_LEAF(n)) {
1441                ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1442                goto found;
1443        }
1444
1445        pn = (struct tnode *) n;
1446        chopped_off = 0;
1447
1448        while (pn) {
1449                pos = pn->pos;
1450                bits = pn->bits;
1451
1452                if (!chopped_off)
1453                        cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1454                                                   pos, bits);
1455
1456                n = tnode_get_child_rcu(pn, cindex);
1457
1458                if (n == NULL) {
1459#ifdef CONFIG_IP_FIB_TRIE_STATS
1460                        t->stats.null_node_hit++;
1461#endif
1462                        goto backtrace;
1463                }
1464
1465                if (IS_LEAF(n)) {
1466                        ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1467                        if (ret > 0)
1468                                goto backtrace;
1469                        goto found;
1470                }
1471
1472                cn = (struct tnode *)n;
1473
1474                /*
1475                 * It's a tnode, and we can do some extra checks here if we
1476                 * like, to avoid descending into a dead-end branch.
1477                 * This tnode is in the parent's child array at index
1478                 * key[p_pos..p_pos+p_bits] but potentially with some bits
1479                 * chopped off, so in reality the index may be just a
1480                 * subprefix, padded with zero at the end.
1481                 * We can also take a look at any skipped bits in this
1482                 * tnode - everything up to p_pos is supposed to be ok,
1483                 * and the non-chopped bits of the index (se previous
1484                 * paragraph) are also guaranteed ok, but the rest is
1485                 * considered unknown.
1486                 *
1487                 * The skipped bits are key[pos+bits..cn->pos].
1488                 */
1489
1490                /* If current_prefix_length < pos+bits, we are already doing
1491                 * actual prefix  matching, which means everything from
1492                 * pos+(bits-chopped_off) onward must be zero along some
1493                 * branch of this subtree - otherwise there is *no* valid
1494                 * prefix present. Here we can only check the skipped
1495                 * bits. Remember, since we have already indexed into the
1496                 * parent's child array, we know that the bits we chopped of
1497                 * *are* zero.
1498                 */
1499
1500                /* NOTA BENE: Checking only skipped bits
1501                   for the new node here */
1502
1503                if (current_prefix_length < pos+bits) {
1504                        if (tkey_extract_bits(cn->key, current_prefix_length,
1505                                                cn->pos - current_prefix_length)
1506                            || !(cn->child[0]))
1507                                goto backtrace;
1508                }
1509
1510                /*
1511                 * If chopped_off=0, the index is fully validated and we
1512                 * only need to look at the skipped bits for this, the new,
1513                 * tnode. What we actually want to do is to find out if
1514                 * these skipped bits match our key perfectly, or if we will
1515                 * have to count on finding a matching prefix further down,
1516                 * because if we do, we would like to have some way of
1517                 * verifying the existence of such a prefix at this point.
1518                 */
1519
1520                /* The only thing we can do at this point is to verify that
1521                 * any such matching prefix can indeed be a prefix to our
1522                 * key, and if the bits in the node we are inspecting that
1523                 * do not match our key are not ZERO, this cannot be true.
1524                 * Thus, find out where there is a mismatch (before cn->pos)
1525                 * and verify that all the mismatching bits are zero in the
1526                 * new tnode's key.
1527                 */
1528
1529                /*
1530                 * Note: We aren't very concerned about the piece of
1531                 * the key that precede pn->pos+pn->bits, since these
1532                 * have already been checked. The bits after cn->pos
1533                 * aren't checked since these are by definition
1534                 * "unknown" at this point. Thus, what we want to see
1535                 * is if we are about to enter the "prefix matching"
1536                 * state, and in that case verify that the skipped
1537                 * bits that will prevail throughout this subtree are
1538                 * zero, as they have to be if we are to find a
1539                 * matching prefix.
1540                 */
1541
1542                pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);
1543
1544                /*
1545                 * In short: If skipped bits in this node do not match
1546                 * the search key, enter the "prefix matching"
1547                 * state.directly.
1548                 */
1549                if (pref_mismatch) {
1550                        /* fls(x) = __fls(x) + 1 */
1551                        int mp = KEYLENGTH - __fls(pref_mismatch) - 1;
1552
1553                        if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
1554                                goto backtrace;
1555
1556                        if (current_prefix_length >= cn->pos)
1557                                current_prefix_length = mp;
1558                }
1559
1560                pn = (struct tnode *)n; /* Descend */
1561                chopped_off = 0;
1562                continue;
1563
1564backtrace:
1565                chopped_off++;
1566
1567                /* As zero don't change the child key (cindex) */
1568                while ((chopped_off <= pn->bits)
1569                       && !(cindex & (1<<(chopped_off-1))))
1570                        chopped_off++;
1571
1572                /* Decrease current_... with bits chopped off */
1573                if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1574                        current_prefix_length = pn->pos + pn->bits
1575                                - chopped_off;
1576
1577                /*
1578                 * Either we do the actual chop off according or if we have
1579                 * chopped off all bits in this tnode walk up to our parent.
1580                 */
1581
1582                if (chopped_off <= pn->bits) {
1583                        cindex &= ~(1 << (chopped_off-1));
1584                } else {
1585                        struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
1586                        if (!parent)
1587                                goto failed;
1588
1589                        /* Get Child's index */
1590                        cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1591                        pn = parent;
1592                        chopped_off = 0;
1593
1594#ifdef CONFIG_IP_FIB_TRIE_STATS
1595                        t->stats.backtrack++;
1596#endif
1597                        goto backtrace;
1598                }
1599        }
1600failed:
1601        ret = 1;
1602found:
1603        rcu_read_unlock();
1604        return ret;
1605}
1606EXPORT_SYMBOL_GPL(fib_table_lookup);
1607
1608/*
1609 * Remove the leaf and return parent.
1610 */
1611static void trie_leaf_remove(struct trie *t, struct leaf *l)
1612{
1613        struct tnode *tp = node_parent((struct rt_trie_node *) l);
1614
1615        pr_debug("entering trie_leaf_remove(%p)\n", l);
1616
1617        if (tp) {
1618                t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1619                put_child(tp, cindex, NULL);
1620                trie_rebalance(t, tp);
1621        } else
1622                RCU_INIT_POINTER(t->trie, NULL);
1623
1624        free_leaf(l);
1625}
1626
1627/*
1628 * Caller must hold RTNL.
1629 */
1630int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1631{
1632        struct trie *t = (struct trie *) tb->tb_data;
1633        u32 key, mask;
1634        int plen = cfg->fc_dst_len;
1635        u8 tos = cfg->fc_tos;
1636        struct fib_alias *fa, *fa_to_delete;
1637        struct list_head *fa_head;
1638        struct leaf *l;
1639        struct leaf_info *li;
1640
1641        if (plen > 32)
1642                return -EINVAL;
1643
1644        key = ntohl(cfg->fc_dst);
1645        mask = ntohl(inet_make_mask(plen));
1646
1647        if (key & ~mask)
1648                return -EINVAL;
1649
1650        key = key & mask;
1651        l = fib_find_node(t, key);
1652
1653        if (!l)
1654                return -ESRCH;
1655
1656        li = find_leaf_info(l, plen);
1657
1658        if (!li)
1659                return -ESRCH;
1660
1661        fa_head = &li->falh;
1662        fa = fib_find_alias(fa_head, tos, 0);
1663
1664        if (!fa)
1665                return -ESRCH;
1666
1667        pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1668
1669        fa_to_delete = NULL;
1670        fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1671        list_for_each_entry_continue(fa, fa_head, fa_list) {
1672                struct fib_info *fi = fa->fa_info;
1673
1674                if (fa->fa_tos != tos)
1675                        break;
1676
1677                if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1678                    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1679                     fa->fa_info->fib_scope == cfg->fc_scope) &&
1680                    (!cfg->fc_prefsrc ||
1681                     fi->fib_prefsrc == cfg->fc_prefsrc) &&
1682                    (!cfg->fc_protocol ||
1683                     fi->fib_protocol == cfg->fc_protocol) &&
1684                    fib_nh_match(cfg, fi) == 0) {
1685                        fa_to_delete = fa;
1686                        break;
1687                }
1688        }
1689
1690        if (!fa_to_delete)
1691                return -ESRCH;
1692
1693        fa = fa_to_delete;
1694        rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1695                  &cfg->fc_nlinfo, 0);
1696
1697        list_del_rcu(&fa->fa_list);
1698
1699        if (!plen)
1700                tb->tb_num_default--;
1701
1702        if (list_empty(fa_head)) {
1703                hlist_del_rcu(&li->hlist);
1704                free_leaf_info(li);
1705        }
1706
1707        if (hlist_empty(&l->list))
1708                trie_leaf_remove(t, l);
1709
1710        if (fa->fa_state & FA_S_ACCESSED)
1711                rt_cache_flush(cfg->fc_nlinfo.nl_net);
1712
1713        fib_release_info(fa->fa_info);
1714        alias_free_mem_rcu(fa);
1715        return 0;
1716}
1717
1718static int trie_flush_list(struct list_head *head)
1719{
1720        struct fib_alias *fa, *fa_node;
1721        int found = 0;
1722
1723        list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1724                struct fib_info *fi = fa->fa_info;
1725
1726                if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1727                        list_del_rcu(&fa->fa_list);
1728                        fib_release_info(fa->fa_info);
1729                        alias_free_mem_rcu(fa);
1730                        found++;
1731                }
1732        }
1733        return found;
1734}
1735
1736static int trie_flush_leaf(struct leaf *l)
1737{
1738        int found = 0;
1739        struct hlist_head *lih = &l->list;
1740        struct hlist_node *tmp;
1741        struct leaf_info *li = NULL;
1742
1743        hlist_for_each_entry_safe(li, tmp, lih, hlist) {
1744                found += trie_flush_list(&li->falh);
1745
1746                if (list_empty(&li->falh)) {
1747                        hlist_del_rcu(&li->hlist);
1748                        free_leaf_info(li);
1749                }
1750        }
1751        return found;
1752}
1753
1754/*
1755 * Scan for the next right leaf starting at node p->child[idx]
1756 * Since we have back pointer, no recursion necessary.
1757 */
1758static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
1759{
1760        do {
1761                t_key idx;
1762
1763                if (c)
1764                        idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1765                else
1766                        idx = 0;
1767
1768                while (idx < 1u << p->bits) {
1769                        c = tnode_get_child_rcu(p, idx++);
1770                        if (!c)
1771                                continue;
1772
1773                        if (IS_LEAF(c)) {
1774                                prefetch(rcu_dereference_rtnl(p->child[idx]));
1775                                return (struct leaf *) c;
1776                        }
1777
1778                        /* Rescan start scanning in new node */
1779                        p = (struct tnode *) c;
1780                        idx = 0;
1781                }
1782
1783                /* Node empty, walk back up to parent */
1784                c = (struct rt_trie_node *) p;
1785        } while ((p = node_parent_rcu(c)) != NULL);
1786
1787        return NULL; /* Root of trie */
1788}
1789
1790static struct leaf *trie_firstleaf(struct trie *t)
1791{
1792        struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie);
1793
1794        if (!n)
1795                return NULL;
1796
1797        if (IS_LEAF(n))          /* trie is just a leaf */
1798                return (struct leaf *) n;
1799
1800        return leaf_walk_rcu(n, NULL);
1801}
1802
1803static struct leaf *trie_nextleaf(struct leaf *l)
1804{
1805        struct rt_trie_node *c = (struct rt_trie_node *) l;
1806        struct tnode *p = node_parent_rcu(c);
1807
1808        if (!p)
1809                return NULL;    /* trie with just one leaf */
1810
1811        return leaf_walk_rcu(p, c);
1812}
1813
1814static struct leaf *trie_leafindex(struct trie *t, int index)
1815{
1816        struct leaf *l = trie_firstleaf(t);
1817
1818        while (l && index-- > 0)
1819                l = trie_nextleaf(l);
1820
1821        return l;
1822}
1823
1824
1825/*
1826 * Caller must hold RTNL.
1827 */
1828int fib_table_flush(struct fib_table *tb)
1829{
1830        struct trie *t = (struct trie *) tb->tb_data;
1831        struct leaf *l, *ll = NULL;
1832        int found = 0;
1833
1834        for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1835                found += trie_flush_leaf(l);
1836
1837                if (ll && hlist_empty(&ll->list))
1838                        trie_leaf_remove(t, ll);
1839                ll = l;
1840        }
1841
1842        if (ll && hlist_empty(&ll->list))
1843                trie_leaf_remove(t, ll);
1844
1845        pr_debug("trie_flush found=%d\n", found);
1846        return found;
1847}
1848
1849void fib_free_table(struct fib_table *tb)
1850{
1851        kfree(tb);
1852}
1853
1854static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1855                           struct fib_table *tb,
1856                           struct sk_buff *skb, struct netlink_callback *cb)
1857{
1858        int i, s_i;
1859        struct fib_alias *fa;
1860        __be32 xkey = htonl(key);
1861
1862        s_i = cb->args[5];
1863        i = 0;
1864
1865        /* rcu_read_lock is hold by caller */
1866
1867        list_for_each_entry_rcu(fa, fah, fa_list) {
1868                if (i < s_i) {
1869                        i++;
1870                        continue;
1871                }
1872
1873                if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
1874                                  cb->nlh->nlmsg_seq,
1875                                  RTM_NEWROUTE,
1876                                  tb->tb_id,
1877                                  fa->fa_type,
1878                                  xkey,
1879                                  plen,
1880                                  fa->fa_tos,
1881                                  fa->fa_info, NLM_F_MULTI) < 0) {
1882                        cb->args[5] = i;
1883                        return -1;
1884                }
1885                i++;
1886        }
1887        cb->args[5] = i;
1888        return skb->len;
1889}
1890
1891static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1892                        struct sk_buff *skb, struct netlink_callback *cb)
1893{
1894        struct leaf_info *li;
1895        int i, s_i;
1896
1897        s_i = cb->args[4];
1898        i = 0;
1899
1900        /* rcu_read_lock is hold by caller */
1901        hlist_for_each_entry_rcu(li, &l->list, hlist) {
1902                if (i < s_i) {
1903                        i++;
1904                        continue;
1905                }
1906
1907                if (i > s_i)
1908                        cb->args[5] = 0;
1909
1910                if (list_empty(&li->falh))
1911                        continue;
1912
1913                if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1914                        cb->args[4] = i;
1915                        return -1;
1916                }
1917                i++;
1918        }
1919
1920        cb->args[4] = i;
1921        return skb->len;
1922}
1923
1924int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
1925                   struct netlink_callback *cb)
1926{
1927        struct leaf *l;
1928        struct trie *t = (struct trie *) tb->tb_data;
1929        t_key key = cb->args[2];
1930        int count = cb->args[3];
1931
1932        rcu_read_lock();
1933        /* Dump starting at last key.
1934         * Note: 0.0.0.0/0 (ie default) is first key.
1935         */
1936        if (count == 0)
1937                l = trie_firstleaf(t);
1938        else {
1939                /* Normally, continue from last key, but if that is missing
1940                 * fallback to using slow rescan
1941                 */
1942                l = fib_find_node(t, key);
1943                if (!l)
1944                        l = trie_leafindex(t, count);
1945        }
1946
1947        while (l) {
1948                cb->args[2] = l->key;
1949                if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1950                        cb->args[3] = count;
1951                        rcu_read_unlock();
1952                        return -1;
1953                }
1954
1955                ++count;
1956                l = trie_nextleaf(l);
1957                memset(&cb->args[4], 0,
1958                       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1959        }
1960        cb->args[3] = count;
1961        rcu_read_unlock();
1962
1963        return skb->len;
1964}
1965
1966void __init fib_trie_init(void)
1967{
1968        fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1969                                          sizeof(struct fib_alias),
1970                                          0, SLAB_PANIC, NULL);
1971
1972        trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
1973                                           max(sizeof(struct leaf),
1974                                               sizeof(struct leaf_info)),
1975                                           0, SLAB_PANIC, NULL);
1976}
1977
1978
1979struct fib_table *fib_trie_table(u32 id)
1980{
1981        struct fib_table *tb;
1982        struct trie *t;
1983
1984        tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1985                     GFP_KERNEL);
1986        if (tb == NULL)
1987                return NULL;
1988
1989        tb->tb_id = id;
1990        tb->tb_default = -1;
1991        tb->tb_num_default = 0;
1992
1993        t = (struct trie *) tb->tb_data;
1994        memset(t, 0, sizeof(*t));
1995
1996        return tb;
1997}
1998
1999#ifdef CONFIG_PROC_FS
2000/* Depth first Trie walk iterator */
2001struct fib_trie_iter {
2002        struct seq_net_private p;
2003        struct fib_table *tb;
2004        struct tnode *tnode;
2005        unsigned int index;
2006        unsigned int depth;
2007};
2008
2009static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
2010{
2011        struct tnode *tn = iter->tnode;
2012        unsigned int cindex = iter->index;
2013        struct tnode *p;
2014
2015        /* A single entry routing table */
2016        if (!tn)
2017                return NULL;
2018
2019        pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2020                 iter->tnode, iter->index, iter->depth);
2021rescan:
2022        while (cindex < (1<<tn->bits)) {
2023                struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex);
2024
2025                if (n) {
2026                        if (IS_LEAF(n)) {
2027                                iter->tnode = tn;
2028                                iter->index = cindex + 1;
2029                        } else {
2030                                /* push down one level */
2031                                iter->tnode = (struct tnode *) n;
2032                                iter->index = 0;
2033                                ++iter->depth;
2034                        }
2035                        return n;
2036                }
2037
2038                ++cindex;
2039        }
2040
2041        /* Current node exhausted, pop back up */
2042        p = node_parent_rcu((struct rt_trie_node *)tn);
2043        if (p) {
2044                cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2045                tn = p;
2046                --iter->depth;
2047                goto rescan;
2048        }
2049
2050        /* got root? */
2051        return NULL;
2052}
2053
2054static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
2055                                       struct trie *t)
2056{
2057        struct rt_trie_node *n;
2058
2059        if (!t)
2060                return NULL;
2061
2062        n = rcu_dereference(t->trie);
2063        if (!n)
2064                return NULL;
2065
2066        if (IS_TNODE(n)) {
2067                iter->tnode = (struct tnode *) n;
2068                iter->index = 0;
2069                iter->depth = 1;
2070        } else {
2071                iter->tnode = NULL;
2072                iter->index = 0;
2073                iter->depth = 0;
2074        }
2075
2076        return n;
2077}
2078
2079static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2080{
2081        struct rt_trie_node *n;
2082        struct fib_trie_iter iter;
2083
2084        memset(s, 0, sizeof(*s));
2085
2086        rcu_read_lock();
2087        for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2088                if (IS_LEAF(n)) {
2089                        struct leaf *l = (struct leaf *)n;
2090                        struct leaf_info *li;
2091
2092                        s->leaves++;
2093                        s->totdepth += iter.depth;
2094                        if (iter.depth > s->maxdepth)
2095                                s->maxdepth = iter.depth;
2096
2097                        hlist_for_each_entry_rcu(li, &l->list, hlist)
2098                                ++s->prefixes;
2099                } else {
2100                        const struct tnode *tn = (const struct tnode *) n;
2101                        int i;
2102
2103                        s->tnodes++;
2104                        if (tn->bits < MAX_STAT_DEPTH)
2105                                s->nodesizes[tn->bits]++;
2106
2107                        for (i = 0; i < (1<<tn->bits); i++)
2108                                if (!tn->child[i])
2109                                        s->nullpointers++;
2110                }
2111        }
2112        rcu_read_unlock();
2113}
2114
2115/*
2116 *      This outputs /proc/net/fib_triestats
2117 */
2118static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2119{
2120        unsigned int i, max, pointers, bytes, avdepth;
2121
2122        if (stat->leaves)
2123                avdepth = stat->totdepth*100 / stat->leaves;
2124        else
2125                avdepth = 0;
2126
2127        seq_printf(seq, "\tAver depth:     %u.%02d\n",
2128                   avdepth / 100, avdepth % 100);
2129        seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2130
2131        seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2132        bytes = sizeof(struct leaf) * stat->leaves;
2133
2134        seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
2135        bytes += sizeof(struct leaf_info) * stat->prefixes;
2136
2137        seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2138        bytes += sizeof(struct tnode) * stat->tnodes;
2139
2140        max = MAX_STAT_DEPTH;
2141        while (max > 0 && stat->nodesizes[max-1] == 0)
2142                max--;
2143
2144        pointers = 0;
2145        for (i = 1; i <= max; i++)
2146                if (stat->nodesizes[i] != 0) {
2147                        seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2148                        pointers += (1<<i) * stat->nodesizes[i];
2149                }
2150        seq_putc(seq, '\n');
2151        seq_printf(seq, "\tPointers: %u\n", pointers);
2152
2153        bytes += sizeof(struct rt_trie_node *) * pointers;
2154        seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2155        seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2156}
2157
2158#ifdef CONFIG_IP_FIB_TRIE_STATS
2159static void trie_show_usage(struct seq_file *seq,
2160                            const struct trie_use_stats *stats)
2161{
2162        seq_printf(seq, "\nCounters:\n---------\n");
2163        seq_printf(seq, "gets = %u\n", stats->gets);
2164        seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2165        seq_printf(seq, "semantic match passed = %u\n",
2166                   stats->semantic_match_passed);
2167        seq_printf(seq, "semantic match miss = %u\n",
2168                   stats->semantic_match_miss);
2169        seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
2170        seq_printf(seq, "skipped node resize = %u\n\n",
2171                   stats->resize_node_skipped);
2172}
2173#endif /*  CONFIG_IP_FIB_TRIE_STATS */
2174
2175static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2176{
2177        if (tb->tb_id == RT_TABLE_LOCAL)
2178                seq_puts(seq, "Local:\n");
2179        else if (tb->tb_id == RT_TABLE_MAIN)
2180                seq_puts(seq, "Main:\n");
2181        else
2182                seq_printf(seq, "Id %d:\n", tb->tb_id);
2183}
2184
2185
2186static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2187{
2188        struct net *net = (struct net *)seq->private;
2189        unsigned int h;
2190
2191        seq_printf(seq,
2192                   "Basic info: size of leaf:"
2193                   " %Zd bytes, size of tnode: %Zd bytes.\n",
2194                   sizeof(struct leaf), sizeof(struct tnode));
2195
2196        for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2197                struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2198                struct fib_table *tb;
2199
2200                hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2201                        struct trie *t = (struct trie *) tb->tb_data;
2202                        struct trie_stat stat;
2203
2204                        if (!t)
2205                                continue;
2206
2207                        fib_table_print(seq, tb);
2208
2209                        trie_collect_stats(t, &stat);
2210                        trie_show_stats(seq, &stat);
2211#ifdef CONFIG_IP_FIB_TRIE_STATS
2212                        trie_show_usage(seq, &t->stats);
2213#endif
2214                }
2215        }
2216
2217        return 0;
2218}
2219
2220static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2221{
2222        return single_open_net(inode, file, fib_triestat_seq_show);
2223}
2224
2225static const struct file_operations fib_triestat_fops = {
2226        .owner  = THIS_MODULE,
2227        .open   = fib_triestat_seq_open,
2228        .read   = seq_read,
2229        .llseek = seq_lseek,
2230        .release = single_release_net,
2231};
2232
2233static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2234{
2235        struct fib_trie_iter *iter = seq->private;
2236        struct net *net = seq_file_net(seq);
2237        loff_t idx = 0;
2238        unsigned int h;
2239
2240        for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2241                struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2242                struct fib_table *tb;
2243
2244                hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2245                        struct rt_trie_node *n;
2246
2247                        for (n = fib_trie_get_first(iter,
2248                                                    (struct trie *) tb->tb_data);
2249                             n; n = fib_trie_get_next(iter))
2250                                if (pos == idx++) {
2251                                        iter->tb = tb;
2252                                        return n;
2253                                }
2254                }
2255        }
2256
2257        return NULL;
2258}
2259
2260static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2261        __acquires(RCU)
2262{
2263        rcu_read_lock();
2264        return fib_trie_get_idx(seq, *pos);
2265}
2266
2267static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2268{
2269        struct fib_trie_iter *iter = seq->private;
2270        struct net *net = seq_file_net(seq);
2271        struct fib_table *tb = iter->tb;
2272        struct hlist_node *tb_node;
2273        unsigned int h;
2274        struct rt_trie_node *n;
2275
2276        ++*pos;
2277        /* next node in same table */
2278        n = fib_trie_get_next(iter);
2279        if (n)
2280                return n;
2281
2282        /* walk rest of this hash chain */
2283        h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2284        while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
2285                tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2286                n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2287                if (n)
2288                        goto found;
2289        }
2290
2291        /* new hash chain */
2292        while (++h < FIB_TABLE_HASHSZ) {
2293                struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2294                hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2295                        n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2296                        if (n)
2297                                goto found;
2298                }
2299        }
2300        return NULL;
2301
2302found:
2303        iter->tb = tb;
2304        return n;
2305}
2306
2307static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2308        __releases(RCU)
2309{
2310        rcu_read_unlock();
2311}
2312
2313static void seq_indent(struct seq_file *seq, int n)
2314{
2315        while (n-- > 0)
2316                seq_puts(seq, "   ");
2317}
2318
2319static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2320{
2321        switch (s) {
2322        case RT_SCOPE_UNIVERSE: return "universe";
2323        case RT_SCOPE_SITE:     return "site";
2324        case RT_SCOPE_LINK:     return "link";
2325        case RT_SCOPE_HOST:     return "host";
2326        case RT_SCOPE_NOWHERE:  return "nowhere";
2327        default:
2328                snprintf(buf, len, "scope=%d", s);
2329                return buf;
2330        }
2331}
2332
2333static const char *const rtn_type_names[__RTN_MAX] = {
2334        [RTN_UNSPEC] = "UNSPEC",
2335        [RTN_UNICAST] = "UNICAST",
2336        [RTN_LOCAL] = "LOCAL",
2337        [RTN_BROADCAST] = "BROADCAST",
2338        [RTN_ANYCAST] = "ANYCAST",
2339        [RTN_MULTICAST] = "MULTICAST",
2340        [RTN_BLACKHOLE] = "BLACKHOLE",
2341        [RTN_UNREACHABLE] = "UNREACHABLE",
2342        [RTN_PROHIBIT] = "PROHIBIT",
2343        [RTN_THROW] = "THROW",
2344        [RTN_NAT] = "NAT",
2345        [RTN_XRESOLVE] = "XRESOLVE",
2346};
2347
2348static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2349{
2350        if (t < __RTN_MAX && rtn_type_names[t])
2351                return rtn_type_names[t];
2352        snprintf(buf, len, "type %u", t);
2353        return buf;
2354}
2355
2356/* Pretty print the trie */
2357static int fib_trie_seq_show(struct seq_file *seq, void *v)
2358{
2359        const struct fib_trie_iter *iter = seq->private;
2360        struct rt_trie_node *n = v;
2361
2362        if (!node_parent_rcu(n))
2363                fib_table_print(seq, iter->tb);
2364
2365        if (IS_TNODE(n)) {
2366                struct tnode *tn = (struct tnode *) n;
2367                __be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2368
2369                seq_indent(seq, iter->depth-1);
2370                seq_printf(seq, "  +-- %pI4/%d %d %d %d\n",
2371                           &prf, tn->pos, tn->bits, tn->full_children,
2372                           tn->empty_children);
2373
2374        } else {
2375                struct leaf *l = (struct leaf *) n;
2376                struct leaf_info *li;
2377                __be32 val = htonl(l->key);
2378
2379                seq_indent(seq, iter->depth);
2380                seq_printf(seq, "  |-- %pI4\n", &val);
2381
2382                hlist_for_each_entry_rcu(li, &l->list, hlist) {
2383                        struct fib_alias *fa;
2384
2385                        list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2386                                char buf1[32], buf2[32];
2387
2388                                seq_indent(seq, iter->depth+1);
2389                                seq_printf(seq, "  /%d %s %s", li->plen,
2390                                           rtn_scope(buf1, sizeof(buf1),
2391                                                     fa->fa_info->fib_scope),
2392                                           rtn_type(buf2, sizeof(buf2),
2393                                                    fa->fa_type));
2394                                if (fa->fa_tos)
2395                                        seq_printf(seq, " tos=%d", fa->fa_tos);
2396                                seq_putc(seq, '\n');
2397                        }
2398                }
2399        }
2400
2401        return 0;
2402}
2403
2404static const struct seq_operations fib_trie_seq_ops = {
2405        .start  = fib_trie_seq_start,
2406        .next   = fib_trie_seq_next,
2407        .stop   = fib_trie_seq_stop,
2408        .show   = fib_trie_seq_show,
2409};
2410
2411static int fib_trie_seq_open(struct inode *inode, struct file *file)
2412{
2413        return seq_open_net(inode, file, &fib_trie_seq_ops,
2414                            sizeof(struct fib_trie_iter));
2415}
2416
2417static const struct file_operations fib_trie_fops = {
2418        .owner  = THIS_MODULE,
2419        .open   = fib_trie_seq_open,
2420        .read   = seq_read,
2421        .llseek = seq_lseek,
2422        .release = seq_release_net,
2423};
2424
2425struct fib_route_iter {
2426        struct seq_net_private p;
2427        struct trie *main_trie;
2428        loff_t  pos;
2429        t_key   key;
2430};
2431
2432static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2433{
2434        struct leaf *l = NULL;
2435        struct trie *t = iter->main_trie;
2436
2437        /* use cache location of last found key */
2438        if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2439                pos -= iter->pos;
2440        else {
2441                iter->pos = 0;
2442                l = trie_firstleaf(t);
2443        }
2444
2445        while (l && pos-- > 0) {
2446                iter->pos++;
2447                l = trie_nextleaf(l);
2448        }
2449
2450        if (l)
2451                iter->key = pos;        /* remember it */
2452        else
2453                iter->pos = 0;          /* forget it */
2454
2455        return l;
2456}
2457
2458static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2459        __acquires(RCU)
2460{
2461        struct fib_route_iter *iter = seq->private;
2462        struct fib_table *tb;
2463
2464        rcu_read_lock();
2465        tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2466        if (!tb)
2467                return NULL;
2468
2469        iter->main_trie = (struct trie *) tb->tb_data;
2470        if (*pos == 0)
2471                return SEQ_START_TOKEN;
2472        else
2473                return fib_route_get_idx(iter, *pos - 1);
2474}
2475
2476static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2477{
2478        struct fib_route_iter *iter = seq->private;
2479        struct leaf *l = v;
2480
2481        ++*pos;
2482        if (v == SEQ_START_TOKEN) {
2483                iter->pos = 0;
2484                l = trie_firstleaf(iter->main_trie);
2485        } else {
2486                iter->pos++;
2487                l = trie_nextleaf(l);
2488        }
2489
2490        if (l)
2491                iter->key = l->key;
2492        else
2493                iter->pos = 0;
2494        return l;
2495}
2496
2497static void fib_route_seq_stop(struct seq_file *seq, void *v)
2498        __releases(RCU)
2499{
2500        rcu_read_unlock();
2501}
2502
2503static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2504{
2505        unsigned int flags = 0;
2506
2507        if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
2508                flags = RTF_REJECT;
2509        if (fi && fi->fib_nh->nh_gw)
2510                flags |= RTF_GATEWAY;
2511        if (mask == htonl(0xFFFFFFFF))
2512                flags |= RTF_HOST;
2513        flags |= RTF_UP;
2514        return flags;
2515}
2516
2517/*
2518 *      This outputs /proc/net/route.
2519 *      The format of the file is not supposed to be changed
2520 *      and needs to be same as fib_hash output to avoid breaking
2521 *      legacy utilities
2522 */
2523static int fib_route_seq_show(struct seq_file *seq, void *v)
2524{
2525        struct leaf *l = v;
2526        struct leaf_info *li;
2527
2528        if (v == SEQ_START_TOKEN) {
2529                seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2530                           "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2531                           "\tWindow\tIRTT");
2532                return 0;
2533        }
2534
2535        hlist_for_each_entry_rcu(li, &l->list, hlist) {
2536                struct fib_alias *fa;
2537                __be32 mask, prefix;
2538
2539                mask = inet_make_mask(li->plen);
2540                prefix = htonl(l->key);
2541
2542                list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2543                        const struct fib_info *fi = fa->fa_info;
2544                        unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
2545                        int len;
2546
2547                        if (fa->fa_type == RTN_BROADCAST
2548                            || fa->fa_type == RTN_MULTICAST)
2549                                continue;
2550
2551                        if (fi)
2552                                seq_printf(seq,
2553                                         "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2554                                         "%d\t%08X\t%d\t%u\t%u%n",
2555                                         fi->fib_dev ? fi->fib_dev->name : "*",
2556                                         prefix,
2557                                         fi->fib_nh->nh_gw, flags, 0, 0,
2558                                         fi->fib_priority,
2559                                         mask,
2560                                         (fi->fib_advmss ?
2561                                          fi->fib_advmss + 40 : 0),
2562                                         fi->fib_window,
2563                                         fi->fib_rtt >> 3, &len);
2564                        else
2565                                seq_printf(seq,
2566                                         "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2567                                         "%d\t%08X\t%d\t%u\t%u%n",
2568                                         prefix, 0, flags, 0, 0, 0,
2569                                         mask, 0, 0, 0, &len);
2570
2571                        seq_printf(seq, "%*s\n", 127 - len, "");
2572                }
2573        }
2574
2575        return 0;
2576}
2577
2578static const struct seq_operations fib_route_seq_ops = {
2579        .start  = fib_route_seq_start,
2580        .next   = fib_route_seq_next,
2581        .stop   = fib_route_seq_stop,
2582        .show   = fib_route_seq_show,
2583};
2584
2585static int fib_route_seq_open(struct inode *inode, struct file *file)
2586{
2587        return seq_open_net(inode, file, &fib_route_seq_ops,
2588                            sizeof(struct fib_route_iter));
2589}
2590
2591static const struct file_operations fib_route_fops = {
2592        .owner  = THIS_MODULE,
2593        .open   = fib_route_seq_open,
2594        .read   = seq_read,
2595        .llseek = seq_lseek,
2596        .release = seq_release_net,
2597};
2598
2599int __net_init fib_proc_init(struct net *net)
2600{
2601        if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops))
2602                goto out1;
2603
2604        if (!proc_create("fib_triestat", S_IRUGO, net->proc_net,
2605                         &fib_triestat_fops))
2606                goto out2;
2607
2608        if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops))
2609                goto out3;
2610
2611        return 0;
2612
2613out3:
2614        remove_proc_entry("fib_triestat", net->proc_net);
2615out2:
2616        remove_proc_entry("fib_trie", net->proc_net);
2617out1:
2618        return -ENOMEM;
2619}
2620
2621void __net_exit fib_proc_exit(struct net *net)
2622{
2623        remove_proc_entry("fib_trie", net->proc_net);
2624        remove_proc_entry("fib_triestat", net->proc_net);
2625        remove_proc_entry("route", net->proc_net);
2626}
2627
2628#endif /* CONFIG_PROC_FS */
2629