linux/net/ipv4/fib_trie.c
   1/*
   2 *   This program is free software; you can redistribute it and/or
   3 *   modify it under the terms of the GNU General Public License
   4 *   as published by the Free Software Foundation; either version
   5 *   2 of the License, or (at your option) any later version.
   6 *
   7 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
   8 *     & Swedish University of Agricultural Sciences.
   9 *
  10 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
  11 *     Agricultural Sciences.
  12 *
  13 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
  14 *
  15 * This work is based on the LPC-trie which is originally described in:
  16 *
  17 * An experimental study of compression methods for dynamic tries
  18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
  19 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
  20 *
  21 *
  22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
  23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
  24 *
  25 *
  26 * Code from fib_hash has been reused which includes the following header:
  27 *
  28 *
  29 * INET         An implementation of the TCP/IP protocol suite for the LINUX
  30 *              operating system.  INET is implemented using the  BSD Socket
  31 *              interface as the means of communication with the user level.
  32 *
  33 *              IPv4 FIB: lookup engine and maintenance routines.
  34 *
  35 *
  36 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  37 *
  38 *              This program is free software; you can redistribute it and/or
  39 *              modify it under the terms of the GNU General Public License
  40 *              as published by the Free Software Foundation; either version
  41 *              2 of the License, or (at your option) any later version.
  42 *
  43 * Substantial contributions to this work come from:
  44 *
  45 *              David S. Miller, <davem@davemloft.net>
  46 *              Stephen Hemminger <shemminger@osdl.org>
  47 *              Paul E. McKenney <paulmck@us.ibm.com>
  48 *              Patrick McHardy <kaber@trash.net>
  49 */
  50
  51#define VERSION "0.409"
  52
  53#include <asm/uaccess.h>
  54#include <asm/system.h>
  55#include <linux/bitops.h>
  56#include <linux/types.h>
  57#include <linux/kernel.h>
  58#include <linux/mm.h>
  59#include <linux/string.h>
  60#include <linux/socket.h>
  61#include <linux/sockios.h>
  62#include <linux/errno.h>
  63#include <linux/in.h>
  64#include <linux/inet.h>
  65#include <linux/inetdevice.h>
  66#include <linux/netdevice.h>
  67#include <linux/if_arp.h>
  68#include <linux/proc_fs.h>
  69#include <linux/rcupdate.h>
  70#include <linux/skbuff.h>
  71#include <linux/netlink.h>
  72#include <linux/init.h>
  73#include <linux/list.h>
  74#include <linux/slab.h>
  75#include <net/net_namespace.h>
  76#include <net/ip.h>
  77#include <net/protocol.h>
  78#include <net/route.h>
  79#include <net/tcp.h>
  80#include <net/sock.h>
  81#include <net/ip_fib.h>
  82#include "fib_lookup.h"
  83
  84#define MAX_STAT_DEPTH 32
  85
  86#define KEYLENGTH (8*sizeof(t_key))
  87
  88typedef unsigned int t_key;
  89
  90#define T_TNODE 0
  91#define T_LEAF  1
  92#define NODE_TYPE_MASK  0x1UL
  93#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
  94
  95#define IS_TNODE(n) (!(n->parent & T_LEAF))
  96#define IS_LEAF(n) (n->parent & T_LEAF)
  97
  98struct node {
  99        unsigned long parent;
 100        t_key key;
 101};
 102
 103struct leaf {
 104        unsigned long parent;
 105        t_key key;
 106        struct hlist_head list;
 107        struct rcu_head rcu;
 108};
 109
 110struct leaf_info {
 111        struct hlist_node hlist;
 112        struct rcu_head rcu;
 113        int plen;
 114        struct list_head falh;
 115};
 116
 117struct tnode {
 118        unsigned long parent;
 119        t_key key;
 120        unsigned char pos;              /* log2(KEYLENGTH) bits needed */
 121        unsigned char bits;             /* log2(KEYLENGTH) bits needed */
 122        unsigned int full_children;     /* KEYLENGTH bits needed */
 123        unsigned int empty_children;    /* KEYLENGTH bits needed */
 124        union {
 125                struct rcu_head rcu;
 126                struct work_struct work;
 127                struct tnode *tnode_free;
 128        };
 129        struct node *child[0];
 130};
 131
 132#ifdef CONFIG_IP_FIB_TRIE_STATS
 133struct trie_use_stats {
 134        unsigned int gets;
 135        unsigned int backtrack;
 136        unsigned int semantic_match_passed;
 137        unsigned int semantic_match_miss;
 138        unsigned int null_node_hit;
 139        unsigned int resize_node_skipped;
 140};
 141#endif
 142
 143struct trie_stat {
 144        unsigned int totdepth;
 145        unsigned int maxdepth;
 146        unsigned int tnodes;
 147        unsigned int leaves;
 148        unsigned int nullpointers;
 149        unsigned int prefixes;
 150        unsigned int nodesizes[MAX_STAT_DEPTH];
 151};
 152
 153struct trie {
 154        struct node *trie;
 155#ifdef CONFIG_IP_FIB_TRIE_STATS
 156        struct trie_use_stats stats;
 157#endif
 158};
 159
 160static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
 161static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
 162                                  int wasfull);
 163static struct node *resize(struct trie *t, struct tnode *tn);
 164static struct tnode *inflate(struct trie *t, struct tnode *tn);
 165static struct tnode *halve(struct trie *t, struct tnode *tn);
 166/* tnodes to free after resize(); protected by RTNL */
 167static struct tnode *tnode_free_head;
 168static size_t tnode_free_size;
 169
 170/*
 171 * synchronize_rcu after call_rcu for that many pages; it should be especially
 172 * useful before resizing the root node with PREEMPT_NONE configs; the value was
 173 * obtained experimentally, aiming to avoid visible slowdown.
 174 */
 175static const int sync_pages = 128;
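/*
 * With 4 KB pages this works out to roughly one synchronize_rcu() per
 * 512 KB of tnode memory queued for freeing; see tnode_free_flush().
 */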
 176
 177static struct kmem_cache *fn_alias_kmem __read_mostly;
 178static struct kmem_cache *trie_leaf_kmem __read_mostly;
 179
 180static inline struct tnode *node_parent(struct node *node)
 181{
 182        return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
 183}
 184
 185static inline struct tnode *node_parent_rcu(struct node *node)
 186{
 187        struct tnode *ret = node_parent(node);
 188
 189        return rcu_dereference_rtnl(ret);
 190}
 191
 192/* Same as rcu_assign_pointer,
 193 * but that macro assumes the value is a pointer.
 194 */
 195static inline void node_set_parent(struct node *node, struct tnode *ptr)
 196{
 197        smp_wmb();
 198        node->parent = (unsigned long)ptr | NODE_TYPE(node);
 199}
 200
 201static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i)
 202{
 203        BUG_ON(i >= 1U << tn->bits);
 204
 205        return tn->child[i];
 206}
 207
 208static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
 209{
 210        struct node *ret = tnode_get_child(tn, i);
 211
 212        return rcu_dereference_rtnl(ret);
 213}
 214
 215static inline int tnode_child_length(const struct tnode *tn)
 216{
 217        return 1 << tn->bits;
 218}
 219
 220static inline t_key mask_pfx(t_key k, unsigned short l)
 221{
 222        return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
 223}
 224
 225static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
 226{
 227        if (offset < KEYLENGTH)
 228                return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
 229        else
 230                return 0;
 231}
 232
 233static inline int tkey_equals(t_key a, t_key b)
 234{
 235        return a == b;
 236}
 237
 238static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
 239{
 240        if (bits == 0 || offset >= KEYLENGTH)
 241                return 1;
 242        bits = bits > KEYLENGTH ? KEYLENGTH : bits;
 243        return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
 244}
 245
 246static inline int tkey_mismatch(t_key a, int offset, t_key b)
 247{
 248        t_key diff = a ^ b;
 249        int i = offset;
 250
 251        if (!diff)
 252                return 0;
 253        while ((diff << i) >> (KEYLENGTH-1) == 0)
 254                i++;
 255        return i;
 256}
 257
 258/*
 259  To understand this stuff, an understanding of keys and all their bits is
 260  necessary. Every node in the trie has a key associated with it, but not
 261  all of the bits in that key are significant.
 262
 263  Consider a node 'n' and its parent 'tp'.
 264
 265  If n is a leaf, every bit in its key is significant. Its presence is
 266  necessitated by path compression, since during a tree traversal (when
 267  searching for a leaf - unless we are doing an insertion) we will completely
 268  ignore all skipped bits we encounter. Thus we need to verify, at the end of
 269  a potentially successful search, that we have indeed been walking the
 270  correct key path.
 271
 272  Note that we can never "miss" the correct key in the tree if present by
 273  following the wrong path. Path compression ensures that segments of the key
 274  that are the same for all keys with a given prefix are skipped, but the
 275  skipped part *is* identical for each node in the subtrie below the skipped
 276  bit! trie_insert() in this implementation takes care of that - note the
 277  call to tkey_sub_equals() in trie_insert().
 278
 279  if n is an internal node - a 'tnode' here, the various parts of its key
 280  have many different meanings.
 281
 282  Example:
 283  _________________________________________________________________
 284  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
 285  -----------------------------------------------------------------
 286    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
 287
 288  _________________________________________________________________
 289  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
 290  -----------------------------------------------------------------
 291   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
 292
 293  tp->pos = 7
 294  tp->bits = 3
 295  n->pos = 15
 296  n->bits = 4
 297
 298  First, let's just ignore the bits that come before the parent tp, that is
 299  the bits from 0 to (tp->pos-1). They are *known* but at this point we do
 300  not use them for anything.
 301
 302  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
 303  index into the parent's child array. That is, they will be used to find
 304  'n' among tp's children.
 305
 306  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
 307  for the node n.
 308
 309  All the bits we have seen so far are significant to the node n. The rest
 310  of the bits are really not needed or indeed known in n->key.
 311
 312  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
 313  n's child array, and will of course be different for each child.
 314
 315
 316  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
 317  at this point.
 318
 319*/
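
/*
  A worked example with the numbers above: to go from tp to n we use the
  "N" bits as the index into tp's child array,

      tkey_extract_bits(n->key, tp->pos, tp->bits)   == bits 7..9 of the key

  so n sits at that slot in tp->child[].  The "S" bits 10..14 are skipped on
  the way down and are only re-checked via tkey_sub_equals() when we need to
  confirm that we followed the right path.  To step below n we take the "C"
  bits,

      tkey_extract_bits(key, n->pos, n->bits)        == bits 15..18

  and everything from bit 19 onward ("u") is still unknown at n.
*/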
 320
 321static inline void check_tnode(const struct tnode *tn)
 322{
 323        WARN_ON(tn && tn->pos+tn->bits > 32);
 324}
 325
 326static const int halve_threshold = 25;
 327static const int inflate_threshold = 50;
 328static const int halve_threshold_root = 15;
 329static const int inflate_threshold_root = 30;
 330
 331static void __alias_free_mem(struct rcu_head *head)
 332{
 333        struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
 334        kmem_cache_free(fn_alias_kmem, fa);
 335}
 336
 337static inline void alias_free_mem_rcu(struct fib_alias *fa)
 338{
 339        call_rcu(&fa->rcu, __alias_free_mem);
 340}
 341
 342static void __leaf_free_rcu(struct rcu_head *head)
 343{
 344        struct leaf *l = container_of(head, struct leaf, rcu);
 345        kmem_cache_free(trie_leaf_kmem, l);
 346}
 347
 348static inline void free_leaf(struct leaf *l)
 349{
 350        call_rcu_bh(&l->rcu, __leaf_free_rcu);
 351}
 352
 353static void __leaf_info_free_rcu(struct rcu_head *head)
 354{
 355        kfree(container_of(head, struct leaf_info, rcu));
 356}
 357
 358static inline void free_leaf_info(struct leaf_info *leaf)
 359{
 360        call_rcu(&leaf->rcu, __leaf_info_free_rcu);
 361}
 362
 363static struct tnode *tnode_alloc(size_t size)
 364{
 365        if (size <= PAGE_SIZE)
 366                return kzalloc(size, GFP_KERNEL);
 367        else
 368                return vzalloc(size);
 369}
 370
 371static void __tnode_vfree(struct work_struct *arg)
 372{
 373        struct tnode *tn = container_of(arg, struct tnode, work);
 374        vfree(tn);
 375}
 376
 377static void __tnode_free_rcu(struct rcu_head *head)
 378{
 379        struct tnode *tn = container_of(head, struct tnode, rcu);
 380        size_t size = sizeof(struct tnode) +
 381                      (sizeof(struct node *) << tn->bits);
 382
 383        if (size <= PAGE_SIZE)
 384                kfree(tn);
 385        else {
 386                INIT_WORK(&tn->work, __tnode_vfree);
 387                schedule_work(&tn->work);
 388        }
 389}
 390
 391static inline void tnode_free(struct tnode *tn)
 392{
 393        if (IS_LEAF(tn))
 394                free_leaf((struct leaf *) tn);
 395        else
 396                call_rcu(&tn->rcu, __tnode_free_rcu);
 397}
 398
 399static void tnode_free_safe(struct tnode *tn)
 400{
 401        BUG_ON(IS_LEAF(tn));
 402        tn->tnode_free = tnode_free_head;
 403        tnode_free_head = tn;
 404        tnode_free_size += sizeof(struct tnode) +
 405                           (sizeof(struct node *) << tn->bits);
 406}
 407
 408static void tnode_free_flush(void)
 409{
 410        struct tnode *tn;
 411
 412        while ((tn = tnode_free_head)) {
 413                tnode_free_head = tn->tnode_free;
 414                tn->tnode_free = NULL;
 415                tnode_free(tn);
 416        }
 417
 418        if (tnode_free_size >= PAGE_SIZE * sync_pages) {
 419                tnode_free_size = 0;
 420                synchronize_rcu();
 421        }
 422}
 423
 424static struct leaf *leaf_new(void)
 425{
 426        struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
 427        if (l) {
 428                l->parent = T_LEAF;
 429                INIT_HLIST_HEAD(&l->list);
 430        }
 431        return l;
 432}
 433
 434static struct leaf_info *leaf_info_new(int plen)
 435{
 436        struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
 437        if (li) {
 438                li->plen = plen;
 439                INIT_LIST_HEAD(&li->falh);
 440        }
 441        return li;
 442}
 443
 444static struct tnode *tnode_new(t_key key, int pos, int bits)
 445{
 446        size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
 447        struct tnode *tn = tnode_alloc(sz);
 448
 449        if (tn) {
 450                tn->parent = T_TNODE;
 451                tn->pos = pos;
 452                tn->bits = bits;
 453                tn->key = key;
 454                tn->full_children = 0;
 455                tn->empty_children = 1<<bits;
 456        }
 457
 458        pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
 459                 sizeof(struct node *) << bits);
 460        return tn;
 461}
 462
 463/*
 464 * Check whether a tnode 'n' is "full", i.e. it is an internal node
 465 * and no bits are skipped. See discussion in dyntree paper p. 6
 466 */
 467
 468static inline int tnode_full(const struct tnode *tn, const struct node *n)
 469{
 470        if (n == NULL || IS_LEAF(n))
 471                return 0;
 472
 473        return ((struct tnode *) n)->pos == tn->pos + tn->bits;
 474}
 475
 476static inline void put_child(struct trie *t, struct tnode *tn, int i,
 477                             struct node *n)
 478{
 479        tnode_put_child_reorg(tn, i, n, -1);
 480}
 481
 482 /*
 483  * Add a child at position i overwriting the old value.
 484  * Update the value of full_children and empty_children.
 485  */
 486
 487static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
 488                                  int wasfull)
 489{
 490        struct node *chi = tn->child[i];
 491        int isfull;
 492
 493        BUG_ON(i >= 1<<tn->bits);
 494
 495        /* update emptyChildren */
 496        if (n == NULL && chi != NULL)
 497                tn->empty_children++;
 498        else if (n != NULL && chi == NULL)
 499                tn->empty_children--;
 500
 501        /* update fullChildren */
 502        if (wasfull == -1)
 503                wasfull = tnode_full(tn, chi);
 504
 505        isfull = tnode_full(tn, n);
 506        if (wasfull && !isfull)
 507                tn->full_children--;
 508        else if (!wasfull && isfull)
 509                tn->full_children++;
 510
 511        if (n)
 512                node_set_parent(n, tn);
 513
 514        rcu_assign_pointer(tn->child[i], n);
 515}
 516
 517#define MAX_WORK 10
 518static struct node *resize(struct trie *t, struct tnode *tn)
 519{
 520        int i;
 521        struct tnode *old_tn;
 522        int inflate_threshold_use;
 523        int halve_threshold_use;
 524        int max_work;
 525
 526        if (!tn)
 527                return NULL;
 528
 529        pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
 530                 tn, inflate_threshold, halve_threshold);
 531
 532        /* No children */
 533        if (tn->empty_children == tnode_child_length(tn)) {
 534                tnode_free_safe(tn);
 535                return NULL;
 536        }
 537        /* One child */
 538        if (tn->empty_children == tnode_child_length(tn) - 1)
 539                goto one_child;
 540        /*
 541         * Double as long as the resulting node's ratio of
 542         * non-empty children stays above the threshold.
 543         */
 544
 545        /*
 546         * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
 547         * the Helsinki University of Technology and Matti Tikkanen of Nokia
 548         * Telecommunications, page 6:
 549         * "A node is doubled if the ratio of non-empty children to all
 550         * children in the *doubled* node is at least 'high'."
 551         *
 552         * 'high' in this instance is the variable 'inflate_threshold'. It
 553         * is expressed as a percentage, so we multiply it with
 554         * tnode_child_length() and instead of multiplying by 2 (since the
 555         * child array will be doubled by inflate()) and multiplying
 556         * the left-hand side by 100 (to handle the percentage thing) we
 557         * multiply the left-hand side by 50.
 558         *
 559         * The left-hand side may look a bit weird: tnode_child_length(tn)
 560         * - tn->empty_children is of course the number of non-null children
 561         * in the current node. tn->full_children is the number of "full"
 562         * children, that is non-null tnodes with a skip value of 0.
 563         * All of those will be doubled in the resulting inflated tnode, so
 564         * we just count them one extra time here.
 565         *
 566         * A clearer way to write this would be:
 567         *
 568         * to_be_doubled = tn->full_children;
 569         * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
 570         *     tn->full_children;
 571         *
 572         * new_child_length = tnode_child_length(tn) * 2;
 573         *
 574         * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
 575         *      new_child_length;
 576         * if (new_fill_factor >= inflate_threshold)
 577         *
 578         * ...and so on, though it would mess up the while () loop.
 579         *
 580         * anyway,
 581         * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
 582         *      inflate_threshold
 583         *
 584         * avoid a division:
 585         * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
 586         *      inflate_threshold * new_child_length
 587         *
 588         * expand not_to_be_doubled and to_be_doubled, and shorten:
 589         * 100 * (tnode_child_length(tn) - tn->empty_children +
 590         *    tn->full_children) >= inflate_threshold * new_child_length
 591         *
 592         * expand new_child_length:
 593         * 100 * (tnode_child_length(tn) - tn->empty_children +
 594         *    tn->full_children) >=
 595         *      inflate_threshold * tnode_child_length(tn) * 2
 596         *
 597         * shorten again:
 598         * 50 * (tn->full_children + tnode_child_length(tn) -
 599         *    tn->empty_children) >= inflate_threshold *
 600         *    tnode_child_length(tn)
 601         *
 602         */
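        /*
         * A worked example of the check below: with tn->bits = 4 we have
         * tnode_child_length(tn) = 16; say tn->empty_children = 4 and
         * tn->full_children = 6.  Then, for a non-root node,
         *
         *   50 * (6 + 16 - 4) = 900 >= inflate_threshold(50) * 16 = 800
         *
         * so we inflate; the doubled node would be (6 + 2*6)/32 = 56% full,
         * which is indeed >= 50%.  The halve check further down is the same
         * idea in reverse: with only 3 of 16 children non-empty,
         * 100 * 3 = 300 < halve_threshold(25) * 16 = 400, so we halve.
         */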
 603
 604        check_tnode(tn);
 605
 606        /* Keep root node larger  */
 607
 608        if (!node_parent((struct node *)tn)) {
 609                inflate_threshold_use = inflate_threshold_root;
 610                halve_threshold_use = halve_threshold_root;
 611        } else {
 612                inflate_threshold_use = inflate_threshold;
 613                halve_threshold_use = halve_threshold;
 614        }
 615
 616        max_work = MAX_WORK;
 617        while ((tn->full_children > 0 &&  max_work-- &&
 618                50 * (tn->full_children + tnode_child_length(tn)
 619                      - tn->empty_children)
 620                >= inflate_threshold_use * tnode_child_length(tn))) {
 621
 622                old_tn = tn;
 623                tn = inflate(t, tn);
 624
 625                if (IS_ERR(tn)) {
 626                        tn = old_tn;
 627#ifdef CONFIG_IP_FIB_TRIE_STATS
 628                        t->stats.resize_node_skipped++;
 629#endif
 630                        break;
 631                }
 632        }
 633
 634        check_tnode(tn);
 635
 636        /* Return if at least one inflate has run */
 637        if (max_work != MAX_WORK)
 638                return (struct node *) tn;
 639
 640        /*
 641         * Halve as long as the number of empty children in this
 642         * node is above threshold.
 643         */
 644
 645        max_work = MAX_WORK;
 646        while (tn->bits > 1 &&  max_work-- &&
 647               100 * (tnode_child_length(tn) - tn->empty_children) <
 648               halve_threshold_use * tnode_child_length(tn)) {
 649
 650                old_tn = tn;
 651                tn = halve(t, tn);
 652                if (IS_ERR(tn)) {
 653                        tn = old_tn;
 654#ifdef CONFIG_IP_FIB_TRIE_STATS
 655                        t->stats.resize_node_skipped++;
 656#endif
 657                        break;
 658                }
 659        }
 660
 661
 662        /* Only one child remains */
 663        if (tn->empty_children == tnode_child_length(tn) - 1) {
 664one_child:
 665                for (i = 0; i < tnode_child_length(tn); i++) {
 666                        struct node *n;
 667
 668                        n = tn->child[i];
 669                        if (!n)
 670                                continue;
 671
 672                        /* compress one level */
 673
 674                        node_set_parent(n, NULL);
 675                        tnode_free_safe(tn);
 676                        return n;
 677                }
 678        }
 679        return (struct node *) tn;
 680}
 681
 682static struct tnode *inflate(struct trie *t, struct tnode *tn)
 683{
 684        struct tnode *oldtnode = tn;
 685        int olen = tnode_child_length(tn);
 686        int i;
 687
 688        pr_debug("In inflate\n");
 689
 690        tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
 691
 692        if (!tn)
 693                return ERR_PTR(-ENOMEM);
 694
 695        /*
 696         * Preallocate and store tnodes before the actual work so we
 697         * don't get into an inconsistent state if memory allocation
 698         * fails. In case of failure we return the old node and the
 699         * inflate of the tnode is ignored.
 700         */
 701
 702        for (i = 0; i < olen; i++) {
 703                struct tnode *inode;
 704
 705                inode = (struct tnode *) tnode_get_child(oldtnode, i);
 706                if (inode &&
 707                    IS_TNODE(inode) &&
 708                    inode->pos == oldtnode->pos + oldtnode->bits &&
 709                    inode->bits > 1) {
 710                        struct tnode *left, *right;
 711                        t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
 712
 713                        left = tnode_new(inode->key&(~m), inode->pos + 1,
 714                                         inode->bits - 1);
 715                        if (!left)
 716                                goto nomem;
 717
 718                        right = tnode_new(inode->key|m, inode->pos + 1,
 719                                          inode->bits - 1);
 720
 721                        if (!right) {
 722                                tnode_free(left);
 723                                goto nomem;
 724                        }
 725
 726                        put_child(t, tn, 2*i, (struct node *) left);
 727                        put_child(t, tn, 2*i+1, (struct node *) right);
 728                }
 729        }
 730
 731        for (i = 0; i < olen; i++) {
 732                struct tnode *inode;
 733                struct node *node = tnode_get_child(oldtnode, i);
 734                struct tnode *left, *right;
 735                int size, j;
 736
 737                /* An empty child */
 738                if (node == NULL)
 739                        continue;
 740
 741                /* A leaf or an internal node with skipped bits */
 742
 743                if (IS_LEAF(node) || ((struct tnode *) node)->pos >
 744                   tn->pos + tn->bits - 1) {
 745                        if (tkey_extract_bits(node->key,
 746                                              oldtnode->pos + oldtnode->bits,
 747                                              1) == 0)
 748                                put_child(t, tn, 2*i, node);
 749                        else
 750                                put_child(t, tn, 2*i+1, node);
 751                        continue;
 752                }
 753
 754                /* An internal node with two children */
 755                inode = (struct tnode *) node;
 756
 757                if (inode->bits == 1) {
 758                        put_child(t, tn, 2*i, inode->child[0]);
 759                        put_child(t, tn, 2*i+1, inode->child[1]);
 760
 761                        tnode_free_safe(inode);
 762                        continue;
 763                }
 764
 765                /* An internal node with more than two children */
 766
 767                /* We will replace this node 'inode' with two new
 768                 * ones, 'left' and 'right', each with half of the
 769                 * original children. The two new nodes will have
 770                 * a position one bit further down the key and this
 771                 * means that the "significant" part of their keys
 772                 * (see the discussion near the top of this file)
 773                 * will differ by one bit, which will be "0" in
 774                 * left's key and "1" in right's key. Since we are
 775                 * moving the key position by one step, the bit that
 776                 * we are moving away from - the bit at position
 777                 * (inode->pos) - is the one that will differ between
 778                 * left and right. So... we synthesize that bit in the
 779                 * two  new keys.
 780                 * The mask 'm' below will be a single "one" bit at
 781                 * the position (inode->pos)
 782                 */
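                /*
                 * Concretely: if inode->pos == 10, the mask computed in
                 * the first pass above was
                 *
                 *   m = ~0U << (KEYLENGTH - 1) >> 10 == 0x00200000
                 *
                 * a single one-bit at position 10 (counting from the MSB),
                 * so 'left' was created with key inode->key & ~m (bit 10
                 * cleared) and 'right' with key inode->key | m (bit 10 set).
                 */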
 783
 784                /* Use the old key, but set the new significant
 785                 *   bit to zero.
 786                 */
 787
 788                left = (struct tnode *) tnode_get_child(tn, 2*i);
 789                put_child(t, tn, 2*i, NULL);
 790
 791                BUG_ON(!left);
 792
 793                right = (struct tnode *) tnode_get_child(tn, 2*i+1);
 794                put_child(t, tn, 2*i+1, NULL);
 795
 796                BUG_ON(!right);
 797
 798                size = tnode_child_length(left);
 799                for (j = 0; j < size; j++) {
 800                        put_child(t, left, j, inode->child[j]);
 801                        put_child(t, right, j, inode->child[j + size]);
 802                }
 803                put_child(t, tn, 2*i, resize(t, left));
 804                put_child(t, tn, 2*i+1, resize(t, right));
 805
 806                tnode_free_safe(inode);
 807        }
 808        tnode_free_safe(oldtnode);
 809        return tn;
 810nomem:
 811        {
 812                int size = tnode_child_length(tn);
 813                int j;
 814
 815                for (j = 0; j < size; j++)
 816                        if (tn->child[j])
 817                                tnode_free((struct tnode *)tn->child[j]);
 818
 819                tnode_free(tn);
 820
 821                return ERR_PTR(-ENOMEM);
 822        }
 823}
 824
 825static struct tnode *halve(struct trie *t, struct tnode *tn)
 826{
 827        struct tnode *oldtnode = tn;
 828        struct node *left, *right;
 829        int i;
 830        int olen = tnode_child_length(tn);
 831
 832        pr_debug("In halve\n");
 833
 834        tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
 835
 836        if (!tn)
 837                return ERR_PTR(-ENOMEM);
 838
 839        /*
 840         * Preallocate and store tnodes before the actual work so we
 841         * don't get into an inconsistent state if memory allocation
 842         * fails. In case of failure we return the old node and the
 843         * halve of the tnode is ignored.
 844         */
 845
 846        for (i = 0; i < olen; i += 2) {
 847                left = tnode_get_child(oldtnode, i);
 848                right = tnode_get_child(oldtnode, i+1);
 849
 850                /* Two nonempty children */
 851                if (left && right) {
 852                        struct tnode *newn;
 853
 854                        newn = tnode_new(left->key, tn->pos + tn->bits, 1);
 855
 856                        if (!newn)
 857                                goto nomem;
 858
 859                        put_child(t, tn, i/2, (struct node *)newn);
 860                }
 861
 862        }
 863
 864        for (i = 0; i < olen; i += 2) {
 865                struct tnode *newBinNode;
 866
 867                left = tnode_get_child(oldtnode, i);
 868                right = tnode_get_child(oldtnode, i+1);
 869
 870                /* At least one of the children is empty */
 871                if (left == NULL) {
 872                        if (right == NULL)    /* Both are empty */
 873                                continue;
 874                        put_child(t, tn, i/2, right);
 875                        continue;
 876                }
 877
 878                if (right == NULL) {
 879                        put_child(t, tn, i/2, left);
 880                        continue;
 881                }
 882
 883                /* Two nonempty children */
 884                newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
 885                put_child(t, tn, i/2, NULL);
 886                put_child(t, newBinNode, 0, left);
 887                put_child(t, newBinNode, 1, right);
 888                put_child(t, tn, i/2, resize(t, newBinNode));
 889        }
 890        tnode_free_safe(oldtnode);
 891        return tn;
 892nomem:
 893        {
 894                int size = tnode_child_length(tn);
 895                int j;
 896
 897                for (j = 0; j < size; j++)
 898                        if (tn->child[j])
 899                                tnode_free((struct tnode *)tn->child[j]);
 900
 901                tnode_free(tn);
 902
 903                return ERR_PTR(-ENOMEM);
 904        }
 905}
 906
 907/* The read side must hold rcu_read_lock; currently this means
 908 * the dump routines, via get_fa_head and dump */
 909
 910static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
 911{
 912        struct hlist_head *head = &l->list;
 913        struct hlist_node *node;
 914        struct leaf_info *li;
 915
 916        hlist_for_each_entry_rcu(li, node, head, hlist)
 917                if (li->plen == plen)
 918                        return li;
 919
 920        return NULL;
 921}
 922
 923static inline struct list_head *get_fa_head(struct leaf *l, int plen)
 924{
 925        struct leaf_info *li = find_leaf_info(l, plen);
 926
 927        if (!li)
 928                return NULL;
 929
 930        return &li->falh;
 931}
 932
 933static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
 934{
 935        struct leaf_info *li = NULL, *last = NULL;
 936        struct hlist_node *node;
 937
 938        if (hlist_empty(head)) {
 939                hlist_add_head_rcu(&new->hlist, head);
 940        } else {
 941                hlist_for_each_entry(li, node, head, hlist) {
 942                        if (new->plen > li->plen)
 943                                break;
 944
 945                        last = li;
 946                }
 947                if (last)
 948                        hlist_add_after_rcu(&last->hlist, &new->hlist);
 949                else
 950                        hlist_add_before_rcu(&new->hlist, &li->hlist);
 951        }
 952}
 953
 954/* rcu_read_lock needs to be held by the caller on the read side */
 955
 956static struct leaf *
 957fib_find_node(struct trie *t, u32 key)
 958{
 959        int pos;
 960        struct tnode *tn;
 961        struct node *n;
 962
 963        pos = 0;
 964        n = rcu_dereference_rtnl(t->trie);
 965
 966        while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
 967                tn = (struct tnode *) n;
 968
 969                check_tnode(tn);
 970
 971                if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
 972                        pos = tn->pos + tn->bits;
 973                        n = tnode_get_child_rcu(tn,
 974                                                tkey_extract_bits(key,
 975                                                                  tn->pos,
 976                                                                  tn->bits));
 977                } else
 978                        break;
 979        }
 980        /* If we have found a leaf, compare prefixes */
 981
 982        if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
 983                return (struct leaf *)n;
 984
 985        return NULL;
 986}
 987
 988static void trie_rebalance(struct trie *t, struct tnode *tn)
 989{
 990        int wasfull;
 991        t_key cindex, key;
 992        struct tnode *tp;
 993
 994        key = tn->key;
 995
 996        while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
 997                cindex = tkey_extract_bits(key, tp->pos, tp->bits);
 998                wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
 999                tn = (struct tnode *) resize(t, (struct tnode *)tn);
1000
1001                tnode_put_child_reorg((struct tnode *)tp, cindex,
1002                                      (struct node *)tn, wasfull);
1003
1004                tp = node_parent((struct node *) tn);
1005                if (!tp)
1006                        rcu_assign_pointer(t->trie, (struct node *)tn);
1007
1008                tnode_free_flush();
1009                if (!tp)
1010                        break;
1011                tn = tp;
1012        }
1013
1014        /* Handle last (top) tnode */
1015        if (IS_TNODE(tn))
1016                tn = (struct tnode *)resize(t, (struct tnode *)tn);
1017
1018        rcu_assign_pointer(t->trie, (struct node *)tn);
1019        tnode_free_flush();
1020}
1021
1022/* only used from updater-side */
1023
1024static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1025{
1026        int pos, newpos;
1027        struct tnode *tp = NULL, *tn = NULL;
1028        struct node *n;
1029        struct leaf *l;
1030        int missbit;
1031        struct list_head *fa_head = NULL;
1032        struct leaf_info *li;
1033        t_key cindex;
1034
1035        pos = 0;
1036        n = t->trie;
1037
1038        /* If we point to NULL, stop. Either the tree is empty and we should
1039         * just put a new leaf in it, or we have reached an empty child slot,
1040         * and we should just put our new leaf in that.
1041         * If we point to a T_TNODE, check if it matches our key. Note that
1042         * a T_TNODE might be skipping any number of bits - its 'pos' need
1043         * not be the parent's 'pos'+'bits'!
1044         *
1045         * If it does match the current key, get pos/bits from it, extract
1046         * the index from our key, push the T_TNODE and walk the tree.
1047         *
1048         * If it doesn't, we have to replace it with a new T_TNODE.
1049         *
1050         * If we point to a T_LEAF, it might or might not have the same key
1051         * as we do. If it does, just update the leaf's list of prefixes
1052         * and return it.
1053         * If it doesn't, we need to replace it with a T_TNODE.
1054         */
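        /*
         * Usage sketch (this is what fib_table_insert() below does): the
         * caller masks the destination down to its prefix,
         *
         *   key = ntohl(cfg->fc_dst) & ntohl(inet_make_mask(plen));
         *   fa_head = fib_insert_node(t, key, plen);
         *
         * and then links the new struct fib_alias onto the returned
         * fa_head with list_add_tail_rcu().
         */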
1055
1056        while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
1057                tn = (struct tnode *) n;
1058
1059                check_tnode(tn);
1060
1061                if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1062                        tp = tn;
1063                        pos = tn->pos + tn->bits;
1064                        n = tnode_get_child(tn,
1065                                            tkey_extract_bits(key,
1066                                                              tn->pos,
1067                                                              tn->bits));
1068
1069                        BUG_ON(n && node_parent(n) != tn);
1070                } else
1071                        break;
1072        }
1073
1074        /*
1075         * n  ----> NULL, LEAF or TNODE
1076         *
1077         * tp is n's (parent) ----> NULL or TNODE
1078         */
1079
1080        BUG_ON(tp && IS_LEAF(tp));
1081
1082        /* Case 1: n is a leaf. Compare prefixes */
1083
1084        if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1085                l = (struct leaf *) n;
1086                li = leaf_info_new(plen);
1087
1088                if (!li)
1089                        return NULL;
1090
1091                fa_head = &li->falh;
1092                insert_leaf_info(&l->list, li);
1093                goto done;
1094        }
1095        l = leaf_new();
1096
1097        if (!l)
1098                return NULL;
1099
1100        l->key = key;
1101        li = leaf_info_new(plen);
1102
1103        if (!li) {
1104                free_leaf(l);
1105                return NULL;
1106        }
1107
1108        fa_head = &li->falh;
1109        insert_leaf_info(&l->list, li);
1110
1111        if (t->trie && n == NULL) {
1112                /* Case 2: n is NULL, and we will just insert a new leaf */
1113
1114                node_set_parent((struct node *)l, tp);
1115
1116                cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1117                put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1118        } else {
1119                /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1120                /*
1121                 *  Add a new tnode here; the first tnode
1122                 *  needs some special handling
1123                 */
1124
1125                if (tp)
1126                        pos = tp->pos+tp->bits;
1127                else
1128                        pos = 0;
1129
1130                if (n) {
1131                        newpos = tkey_mismatch(key, pos, n->key);
1132                        tn = tnode_new(n->key, newpos, 1);
1133                } else {
1134                        newpos = 0;
1135                        tn = tnode_new(key, newpos, 1); /* First tnode */
1136                }
1137
1138                if (!tn) {
1139                        free_leaf_info(li);
1140                        free_leaf(l);
1141                        return NULL;
1142                }
1143
1144                node_set_parent((struct node *)tn, tp);
1145
1146                missbit = tkey_extract_bits(key, newpos, 1);
1147                put_child(t, tn, missbit, (struct node *)l);
1148                put_child(t, tn, 1-missbit, n);
1149
1150                if (tp) {
1151                        cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1152                        put_child(t, (struct tnode *)tp, cindex,
1153                                  (struct node *)tn);
1154                } else {
1155                        rcu_assign_pointer(t->trie, (struct node *)tn);
1156                        tp = tn;
1157                }
1158        }
1159
1160        if (tp && tp->pos + tp->bits > 32)
1161                pr_warning("fib_trie"
1162                           " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1163                           tp, tp->pos, tp->bits, key, plen);
1164
1165        /* Rebalance the trie */
1166
1167        trie_rebalance(t, tp);
1168done:
1169        return fa_head;
1170}
1171
1172/*
1173 * Caller must hold RTNL.
1174 */
1175int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1176{
1177        struct trie *t = (struct trie *) tb->tb_data;
1178        struct fib_alias *fa, *new_fa;
1179        struct list_head *fa_head = NULL;
1180        struct fib_info *fi;
1181        int plen = cfg->fc_dst_len;
1182        u8 tos = cfg->fc_tos;
1183        u32 key, mask;
1184        int err;
1185        struct leaf *l;
1186
1187        if (plen > 32)
1188                return -EINVAL;
1189
1190        key = ntohl(cfg->fc_dst);
1191
1192        pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1193
1194        mask = ntohl(inet_make_mask(plen));
1195
1196        if (key & ~mask)
1197                return -EINVAL;
1198
1199        key = key & mask;
1200
1201        fi = fib_create_info(cfg);
1202        if (IS_ERR(fi)) {
1203                err = PTR_ERR(fi);
1204                goto err;
1205        }
1206
1207        l = fib_find_node(t, key);
1208        fa = NULL;
1209
1210        if (l) {
1211                fa_head = get_fa_head(l, plen);
1212                fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1213        }
1214
1215        /* Now fa, if non-NULL, points to the first fib alias
1216         * with the same keys [prefix,tos,priority], if such a key already
1217         * exists, or to the node before which we will insert the new one.
1218         *
1219         * If fa is NULL, we will need to allocate a new one and
1220         * insert it at the head of the alias list (fa_head).
1221         *
1222         * If fa_head is NULL, no fib node matched the destination key
1223         * and we need to allocate a new one of those as well.
1224         */
1225
1226        if (fa && fa->fa_tos == tos &&
1227            fa->fa_info->fib_priority == fi->fib_priority) {
1228                struct fib_alias *fa_first, *fa_match;
1229
1230                err = -EEXIST;
1231                if (cfg->fc_nlflags & NLM_F_EXCL)
1232                        goto out;
1233
1234                /* We have 2 goals:
1235                 * 1. Find exact match for type, scope, fib_info to avoid
1236                 * duplicate routes
1237                 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1238                 */
1239                fa_match = NULL;
1240                fa_first = fa;
1241                fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1242                list_for_each_entry_continue(fa, fa_head, fa_list) {
1243                        if (fa->fa_tos != tos)
1244                                break;
1245                        if (fa->fa_info->fib_priority != fi->fib_priority)
1246                                break;
1247                        if (fa->fa_type == cfg->fc_type &&
1248                            fa->fa_scope == cfg->fc_scope &&
1249                            fa->fa_info == fi) {
1250                                fa_match = fa;
1251                                break;
1252                        }
1253                }
1254
1255                if (cfg->fc_nlflags & NLM_F_REPLACE) {
1256                        struct fib_info *fi_drop;
1257                        u8 state;
1258
1259                        fa = fa_first;
1260                        if (fa_match) {
1261                                if (fa == fa_match)
1262                                        err = 0;
1263                                goto out;
1264                        }
1265                        err = -ENOBUFS;
1266                        new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1267                        if (new_fa == NULL)
1268                                goto out;
1269
1270                        fi_drop = fa->fa_info;
1271                        new_fa->fa_tos = fa->fa_tos;
1272                        new_fa->fa_info = fi;
1273                        new_fa->fa_type = cfg->fc_type;
1274                        new_fa->fa_scope = cfg->fc_scope;
1275                        state = fa->fa_state;
1276                        new_fa->fa_state = state & ~FA_S_ACCESSED;
1277
1278                        list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1279                        alias_free_mem_rcu(fa);
1280
1281                        fib_release_info(fi_drop);
1282                        if (state & FA_S_ACCESSED)
1283                                rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1284                        rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1285                                tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1286
1287                        goto succeeded;
1288                }
1289                /* Error if we find a perfect match which
1290                 * uses the same scope, type, and nexthop
1291                 * information.
1292                 */
1293                if (fa_match)
1294                        goto out;
1295
1296                if (!(cfg->fc_nlflags & NLM_F_APPEND))
1297                        fa = fa_first;
1298        }
1299        err = -ENOENT;
1300        if (!(cfg->fc_nlflags & NLM_F_CREATE))
1301                goto out;
1302
1303        err = -ENOBUFS;
1304        new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1305        if (new_fa == NULL)
1306                goto out;
1307
1308        new_fa->fa_info = fi;
1309        new_fa->fa_tos = tos;
1310        new_fa->fa_type = cfg->fc_type;
1311        new_fa->fa_scope = cfg->fc_scope;
1312        new_fa->fa_state = 0;
1313        /*
1314         * Insert new entry to the list.
1315         */
1316
1317        if (!fa_head) {
1318                fa_head = fib_insert_node(t, key, plen);
1319                if (unlikely(!fa_head)) {
1320                        err = -ENOMEM;
1321                        goto out_free_new_fa;
1322                }
1323        }
1324
1325        list_add_tail_rcu(&new_fa->fa_list,
1326                          (fa ? &fa->fa_list : fa_head));
1327
1328        rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1329        rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1330                  &cfg->fc_nlinfo, 0);
1331succeeded:
1332        return 0;
1333
1334out_free_new_fa:
1335        kmem_cache_free(fn_alias_kmem, new_fa);
1336out:
1337        fib_release_info(fi);
1338err:
1339        return err;
1340}
1341
1342/* should be called with rcu_read_lock */
1343static int check_leaf(struct trie *t, struct leaf *l,
1344                      t_key key,  const struct flowi *flp,
1345                      struct fib_result *res, int fib_flags)
1346{
1347        struct leaf_info *li;
1348        struct hlist_head *hhead = &l->list;
1349        struct hlist_node *node;
1350
1351        hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1352                int err;
1353                int plen = li->plen;
1354                __be32 mask = inet_make_mask(plen);
1355
1356                if (l->key != (key & ntohl(mask)))
1357                        continue;
1358
1359                err = fib_semantic_match(&li->falh, flp, res, plen, fib_flags);
1360
1361#ifdef CONFIG_IP_FIB_TRIE_STATS
1362                if (err <= 0)
1363                        t->stats.semantic_match_passed++;
1364                else
1365                        t->stats.semantic_match_miss++;
1366#endif
1367                if (err <= 0)
1368                        return err;
1369        }
1370
1371        return 1;
1372}
1373
1374int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
1375                     struct fib_result *res, int fib_flags)
1376{
1377        struct trie *t = (struct trie *) tb->tb_data;
1378        int ret;
1379        struct node *n;
1380        struct tnode *pn;
1381        int pos, bits;
1382        t_key key = ntohl(flp->fl4_dst);
1383        int chopped_off;
1384        t_key cindex = 0;
1385        int current_prefix_length = KEYLENGTH;
1386        struct tnode *cn;
1387        t_key pref_mismatch;
1388
1389        rcu_read_lock();
1390
1391        n = rcu_dereference(t->trie);
1392        if (!n)
1393                goto failed;
1394
1395#ifdef CONFIG_IP_FIB_TRIE_STATS
1396        t->stats.gets++;
1397#endif
1398
1399        /* Just a leaf? */
1400        if (IS_LEAF(n)) {
1401                ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags);
1402                goto found;
1403        }
1404
1405        pn = (struct tnode *) n;
1406        chopped_off = 0;
1407
1408        while (pn) {
1409                pos = pn->pos;
1410                bits = pn->bits;
1411
1412                if (!chopped_off)
1413                        cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1414                                                   pos, bits);
1415
1416                n = tnode_get_child_rcu(pn, cindex);
1417
1418                if (n == NULL) {
1419#ifdef CONFIG_IP_FIB_TRIE_STATS
1420                        t->stats.null_node_hit++;
1421#endif
1422                        goto backtrace;
1423                }
1424
1425                if (IS_LEAF(n)) {
1426                        ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags);
1427                        if (ret > 0)
1428                                goto backtrace;
1429                        goto found;
1430                }
1431
1432                cn = (struct tnode *)n;
1433
1434                /*
1435                 * It's a tnode, and we can do some extra checks here if we
1436                 * like, to avoid descending into a dead-end branch.
1437                 * This tnode is in the parent's child array at index
1438                 * key[p_pos..p_pos+p_bits] but potentially with some bits
1439                 * chopped off, so in reality the index may be just a
1440                 * subprefix, padded with zero at the end.
1441                 * We can also take a look at any skipped bits in this
1442                 * tnode - everything up to p_pos is supposed to be ok,
1443                 * and the non-chopped bits of the index (see the previous
1444                 * paragraph) are also guaranteed ok, but the rest is
1445                 * considered unknown.
1446                 *
1447                 * The skipped bits are key[pos+bits..cn->pos].
1448                 */
1449
1450                /* If current_prefix_length < pos+bits, we are already doing
1451                 * actual prefix  matching, which means everything from
1452                 * pos+(bits-chopped_off) onward must be zero along some
1453                 * branch of this subtree - otherwise there is *no* valid
1454                 * prefix present. Here we can only check the skipped
1455                 * bits. Remember, since we have already indexed into the
1456                 * parent's child array, we know that the bits we chopped off
1457                 * *are* zero.
1458                 */
1459
1460                /* NOTA BENE: Checking only skipped bits
1461                   for the new node here */
1462
1463                if (current_prefix_length < pos+bits) {
1464                        if (tkey_extract_bits(cn->key, current_prefix_length,
1465                                                cn->pos - current_prefix_length)
1466                            || !(cn->child[0]))
1467                                goto backtrace;
1468                }
1469
1470                /*
1471                 * If chopped_off=0, the index is fully validated and we
1472                 * only need to look at the skipped bits for this, the new,
1473                 * tnode. What we actually want to do is to find out if
1474                 * these skipped bits match our key perfectly, or if we will
1475                 * have to count on finding a matching prefix further down,
1476                 * because if we do, we would like to have some way of
1477                 * verifying the existence of such a prefix at this point.
1478                 */
1479
1480                /* The only thing we can do at this point is to verify that
1481                 * any such matching prefix can indeed be a prefix to our
1482                 * key, and if the bits in the node we are inspecting that
1483                 * do not match our key are not ZERO, this cannot be true.
1484                 * Thus, find out where there is a mismatch (before cn->pos)
1485                 * and verify that all the mismatching bits are zero in the
1486                 * new tnode's key.
1487                 */
1488
1489                /*
1490                 * Note: We aren't very concerned about the bits of
1491                 * the key that precede pn->pos+pn->bits, since these
1492                 * have already been checked. The bits after cn->pos
1493                 * aren't checked since these are by definition
1494                 * "unknown" at this point. Thus, what we want to see
1495                 * is if we are about to enter the "prefix matching"
1496                 * state, and in that case verify that the skipped
1497                 * bits that will prevail throughout this subtree are
1498                 * zero, as they have to be if we are to find a
1499                 * matching prefix.
1500                 */
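                /*
                 * A worked example of the check below: say cn->pos == 16,
                 * cn->key == 0xC0A80100 and key == 0xD0A80100.  Then
                 *
                 *   pref_mismatch = mask_pfx(0x10000000, 16) = 0x10000000
                 *   mp = KEYLENGTH - fls(pref_mismatch) = 32 - 29 = 3
                 *
                 * so the first mismatch is at bit 3.  Unless bits 3..15 of
                 * cn->key are all zero this branch cannot hold a prefix of
                 * our key; here tkey_extract_bits(cn->key, 3, 13) = 0x0a8,
                 * which is non-zero, so we backtrack.
                 */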
1501
1502                pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);
1503
1504                /*
1505                 * In short: If skipped bits in this node do not match
1506                 * the search key, enter the "prefix matching"
1507                 * state directly.
1508                 */
1509                if (pref_mismatch) {
1510                        int mp = KEYLENGTH - fls(pref_mismatch);
1511
1512                        if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
1513                                goto backtrace;
1514
1515                        if (current_prefix_length >= cn->pos)
1516                                current_prefix_length = mp;
1517                }
1518
1519                pn = (struct tnode *)n; /* Descend */
1520                chopped_off = 0;
1521                continue;
1522
1523backtrace:
1524                chopped_off++;
1525
1526                /* Chopped-off bits that are zero do not change the child index (cindex) */
1527                while ((chopped_off <= pn->bits)
1528                       && !(cindex & (1<<(chopped_off-1))))
1529                        chopped_off++;
1530
1531                /* Decrease current_prefix_length by the bits chopped off */
1532                if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1533                        current_prefix_length = pn->pos + pn->bits
1534                                - chopped_off;
1535
1536                /*
1537                 * Either we do the actual chop off, or, if we have
1538                 * chopped off all bits in this tnode, walk up to our parent.
1539                 */
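                /*
                 * For example, with pn->bits == 3 and cindex == 6 (binary
                 * 110), successive backtracks visit cindex 6 -> 4 -> 0:
                 * the lowest set bit is cleared each time, i.e. we retry
                 * ever shorter prefixes of the index before finally
                 * walking up to the parent.
                 */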
1540
1541                if (chopped_off <= pn->bits) {
1542                        cindex &= ~(1 << (chopped_off-1));
1543                } else {
1544                        struct tnode *parent = node_parent_rcu((struct node *) pn);
1545                        if (!parent)
1546                                goto failed;
1547
1548                        /* Get Child's index */
1549                        cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1550                        pn = parent;
1551                        chopped_off = 0;
1552
1553#ifdef CONFIG_IP_FIB_TRIE_STATS
1554                        t->stats.backtrack++;
1555#endif
1556                        goto backtrace;
1557                }
1558        }
1559failed:
1560        ret = 1;
1561found:
1562        rcu_read_unlock();
1563        return ret;
1564}
1565
1566/*
1567 * Remove the leaf from the trie and rebalance from its parent.
1568 */
1569static void trie_leaf_remove(struct trie *t, struct leaf *l)
1570{
1571        struct tnode *tp = node_parent((struct node *) l);
1572
1573        pr_debug("entering trie_leaf_remove(%p)\n", l);
1574
1575        if (tp) {
1576                t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1577                put_child(t, (struct tnode *)tp, cindex, NULL);
1578                trie_rebalance(t, tp);
1579        } else
1580                rcu_assign_pointer(t->trie, NULL);
1581
1582        free_leaf(l);
1583}
1584
1585/*
1586 * Caller must hold RTNL.
1587 */
1588int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1589{
1590        struct trie *t = (struct trie *) tb->tb_data;
1591        u32 key, mask;
1592        int plen = cfg->fc_dst_len;
1593        u8 tos = cfg->fc_tos;
1594        struct fib_alias *fa, *fa_to_delete;
1595        struct list_head *fa_head;
1596        struct leaf *l;
1597        struct leaf_info *li;
1598
1599        if (plen > 32)
1600                return -EINVAL;
1601
1602        key = ntohl(cfg->fc_dst);
1603        mask = ntohl(inet_make_mask(plen));
1604
1605        if (key & ~mask)
1606                return -EINVAL;
1607
1608        key = key & mask;
1609        l = fib_find_node(t, key);
1610
1611        if (!l)
1612                return -ESRCH;
1613
1614        fa_head = get_fa_head(l, plen);
1615        fa = fib_find_alias(fa_head, tos, 0);
1616
1617        if (!fa)
1618                return -ESRCH;
1619
1620        pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1621
1622        fa_to_delete = NULL;
1623        fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1624        list_for_each_entry_continue(fa, fa_head, fa_list) {
1625                struct fib_info *fi = fa->fa_info;
1626
1627                if (fa->fa_tos != tos)
1628                        break;
1629
1630                if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1631                    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1632                     fa->fa_scope == cfg->fc_scope) &&
1633                    (!cfg->fc_protocol ||
1634                     fi->fib_protocol == cfg->fc_protocol) &&
1635                    fib_nh_match(cfg, fi) == 0) {
1636                        fa_to_delete = fa;
1637                        break;
1638                }
1639        }
1640
1641        if (!fa_to_delete)
1642                return -ESRCH;
1643
1644        fa = fa_to_delete;
1645        rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1646                  &cfg->fc_nlinfo, 0);
1647
1648        l = fib_find_node(t, key);
1649        li = find_leaf_info(l, plen);
1650
1651        list_del_rcu(&fa->fa_list);
1652
1653        if (list_empty(fa_head)) {
1654                hlist_del_rcu(&li->hlist);
1655                free_leaf_info(li);
1656        }
1657
1658        if (hlist_empty(&l->list))
1659                trie_leaf_remove(t, l);
1660
1661        if (fa->fa_state & FA_S_ACCESSED)
1662                rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1663
1664        fib_release_info(fa->fa_info);
1665        alias_free_mem_rcu(fa);
1666        return 0;
1667}
1668
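    /*
     * Remove aliases whose fib_info is marked RTNH_F_DEAD from the list
     * and free them.  Returns the number of aliases removed.
     */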
1669static int trie_flush_list(struct list_head *head)
1670{
1671        struct fib_alias *fa, *fa_node;
1672        int found = 0;
1673
1674        list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1675                struct fib_info *fi = fa->fa_info;
1676
1677                if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1678                        list_del_rcu(&fa->fa_list);
1679                        fib_release_info(fa->fa_info);
1680                        alias_free_mem_rcu(fa);
1681                        found++;
1682                }
1683        }
1684        return found;
1685}
1686
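    /*
     * Flush dead aliases from every leaf_info on this leaf, freeing any
     * leaf_info whose alias list becomes empty.  Returns the total number
     * of aliases removed.
     */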
1687static int trie_flush_leaf(struct leaf *l)
1688{
1689        int found = 0;
1690        struct hlist_head *lih = &l->list;
1691        struct hlist_node *node, *tmp;
1692        struct leaf_info *li = NULL;
1693
1694        hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1695                found += trie_flush_list(&li->falh);
1696
1697                if (list_empty(&li->falh)) {
1698                        hlist_del_rcu(&li->hlist);
1699                        free_leaf_info(li);
1700                }
1701        }
1702        return found;
1703}
1704
1705/*
1706 * Scan for the next leaf to the right, starting at node p->child[idx].
1707 * Since we have back pointers, no recursion is necessary.
1708 */
1709static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
1710{
1711        do {
1712                t_key idx;
1713
1714                if (c)
1715                        idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1716                else
1717                        idx = 0;
1718
1719                while (idx < 1u << p->bits) {
1720                        c = tnode_get_child_rcu(p, idx++);
1721                        if (!c)
1722                                continue;
1723
1724                        if (IS_LEAF(c)) {
1725                                prefetch(p->child[idx]);
1726                                return (struct leaf *) c;
1727                        }
1728
1729                        /* Descend and restart the scan in the new node */
1730                        p = (struct tnode *) c;
1731                        idx = 0;
1732                }
1733
1734                /* Node empty, walk back up to parent */
1735                c = (struct node *) p;
1736        } while ((p = node_parent_rcu(c)) != NULL);
1737
1738        return NULL; /* Root of trie */
1739}
1740
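    /* Return the leftmost leaf of the trie, or NULL if the trie is empty */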
1741static struct leaf *trie_firstleaf(struct trie *t)
1742{
1743        struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie);
1744
1745        if (!n)
1746                return NULL;
1747
1748        if (IS_LEAF(n))          /* trie is just a leaf */
1749                return (struct leaf *) n;
1750
1751        return leaf_walk_rcu(n, NULL);
1752}
1753
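    /* Return the leaf following l in the left-to-right walk, or NULL */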
1754static struct leaf *trie_nextleaf(struct leaf *l)
1755{
1756        struct node *c = (struct node *) l;
1757        struct tnode *p = node_parent_rcu(c);
1758
1759        if (!p)
1760                return NULL;    /* trie with just one leaf */
1761
1762        return leaf_walk_rcu(p, c);
1763}
1764
1765static struct leaf *trie_leafindex(struct trie *t, int index)
1766{
1767        struct leaf *l = trie_firstleaf(t);
1768
1769        while (l && index-- > 0)
1770                l = trie_nextleaf(l);
1771
1772        return l;
1773}
1774
1775
1776/*
1777 * Caller must hold RTNL.
1778 */
1779int fib_table_flush(struct fib_table *tb)
1780{
1781        struct trie *t = (struct trie *) tb->tb_data;
1782        struct leaf *l, *ll = NULL;
1783        int found = 0;
1784
1785        for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1786                found += trie_flush_leaf(l);
1787
1788                if (ll && hlist_empty(&ll->list))
1789                        trie_leaf_remove(t, ll);
1790                ll = l;
1791        }
1792
1793        if (ll && hlist_empty(&ll->list))
1794                trie_leaf_remove(t, ll);
1795
1796        pr_debug("trie_flush found=%d\n", found);
1797        return found;
1798}
1799
1800void fib_free_table(struct fib_table *tb)
1801{
1802        kfree(tb);
1803}
1804
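    /*
     * Select a default route: scan the aliases attached to the 0.0.0.0/0
     * leaf for a usable RTN_UNICAST entry with a link-scope gateway and,
     * with the help of fib_detect_death(), fall back to the last-resort
     * nexthop if the preferred one looks dead.
     */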
1805void fib_table_select_default(struct fib_table *tb,
1806                              const struct flowi *flp,
1807                              struct fib_result *res)
1808{
1809        struct trie *t = (struct trie *) tb->tb_data;
1810        int order, last_idx;
1811        struct fib_info *fi = NULL;
1812        struct fib_info *last_resort;
1813        struct fib_alias *fa = NULL;
1814        struct list_head *fa_head;
1815        struct leaf *l;
1816
1817        last_idx = -1;
1818        last_resort = NULL;
1819        order = -1;
1820
1821        rcu_read_lock();
1822
1823        l = fib_find_node(t, 0);
1824        if (!l)
1825                goto out;
1826
1827        fa_head = get_fa_head(l, 0);
1828        if (!fa_head)
1829                goto out;
1830
1831        if (list_empty(fa_head))
1832                goto out;
1833
1834        list_for_each_entry_rcu(fa, fa_head, fa_list) {
1835                struct fib_info *next_fi = fa->fa_info;
1836
1837                if (fa->fa_scope != res->scope ||
1838                    fa->fa_type != RTN_UNICAST)
1839                        continue;
1840
1841                if (next_fi->fib_priority > res->fi->fib_priority)
1842                        break;
1843                if (!next_fi->fib_nh[0].nh_gw ||
1844                    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1845                        continue;
1846
1847                fib_alias_accessed(fa);
1848
1849                if (fi == NULL) {
1850                        if (next_fi != res->fi)
1851                                break;
1852                } else if (!fib_detect_death(fi, order, &last_resort,
1853                                             &last_idx, tb->tb_default)) {
1854                        fib_result_assign(res, fi);
1855                        tb->tb_default = order;
1856                        goto out;
1857                }
1858                fi = next_fi;
1859                order++;
1860        }
1861        if (order <= 0 || fi == NULL) {
1862                tb->tb_default = -1;
1863                goto out;
1864        }
1865
1866        if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1867                                tb->tb_default)) {
1868                fib_result_assign(res, fi);
1869                tb->tb_default = order;
1870                goto out;
1871        }
1872        if (last_idx >= 0)
1873                fib_result_assign(res, last_resort);
1874        tb->tb_default = last_idx;
1875out:
1876        rcu_read_unlock();
1877}
1878
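    /*
     * Dump the fib_alias entries for one prefix to a netlink message.
     * cb->args[5] records how many aliases have been dumped so that an
     * interrupted dump can be resumed at the right entry.
     */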
1879static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1880                           struct fib_table *tb,
1881                           struct sk_buff *skb, struct netlink_callback *cb)
1882{
1883        int i, s_i;
1884        struct fib_alias *fa;
1885        __be32 xkey = htonl(key);
1886
1887        s_i = cb->args[5];
1888        i = 0;
1889
1890        /* rcu_read_lock is held by caller */
1891
1892        list_for_each_entry_rcu(fa, fah, fa_list) {
1893                if (i < s_i) {
1894                        i++;
1895                        continue;
1896                }
1897
1898                if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1899                                  cb->nlh->nlmsg_seq,
1900                                  RTM_NEWROUTE,
1901                                  tb->tb_id,
1902                                  fa->fa_type,
1903                                  fa->fa_scope,
1904                                  xkey,
1905                                  plen,
1906                                  fa->fa_tos,
1907                                  fa->fa_info, NLM_F_MULTI) < 0) {
1908                        cb->args[5] = i;
1909                        return -1;
1910                }
1911                i++;
1912        }
1913        cb->args[5] = i;
1914        return skb->len;
1915}
1916
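    /*
     * Dump every prefix (leaf_info) attached to a leaf; cb->args[4]
     * records how many have been dumped so a partial dump can resume.
     */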
1917static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1918                        struct sk_buff *skb, struct netlink_callback *cb)
1919{
1920        struct leaf_info *li;
1921        struct hlist_node *node;
1922        int i, s_i;
1923
1924        s_i = cb->args[4];
1925        i = 0;
1926
1927        /* rcu_read_lock is held by caller */
1928        hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
1929                if (i < s_i) {
1930                        i++;
1931                        continue;
1932                }
1933
1934                if (i > s_i)
1935                        cb->args[5] = 0;
1936
1937                if (list_empty(&li->falh))
1938                        continue;
1939
1940                if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1941                        cb->args[4] = i;
1942                        return -1;
1943                }
1944                i++;
1945        }
1946
1947        cb->args[4] = i;
1948        return skb->len;
1949}
1950
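    /*
     * Dump the whole table via netlink.  cb->args[2] holds the key of the
     * last leaf dumped and cb->args[3] the number of leaves dumped so far,
     * so a resumed dump can continue from that leaf (or, if it has since
     * been removed, fall back to walking by leaf index).
     */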
1951int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
1952                   struct netlink_callback *cb)
1953{
1954        struct leaf *l;
1955        struct trie *t = (struct trie *) tb->tb_data;
1956        t_key key = cb->args[2];
1957        int count = cb->args[3];
1958
1959        rcu_read_lock();
1960        /* Dump starting at last key.
1961         * Note: 0.0.0.0/0 (i.e. the default route) is the first key.
1962         */
1963        if (count == 0)
1964                l = trie_firstleaf(t);
1965        else {
1966                /* Normally, continue from last key, but if that is missing
1967                 * fall back to a slow rescan by leaf index
1968                 */
1969                l = fib_find_node(t, key);
1970                if (!l)
1971                        l = trie_leafindex(t, count);
1972        }
1973
1974        while (l) {
1975                cb->args[2] = l->key;
1976                if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1977                        cb->args[3] = count;
1978                        rcu_read_unlock();
1979                        return -1;
1980                }
1981
1982                ++count;
1983                l = trie_nextleaf(l);
1984                memset(&cb->args[4], 0,
1985                       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1986        }
1987        cb->args[3] = count;
1988        rcu_read_unlock();
1989
1990        return skb->len;
1991}
1992
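    /* Create the slab caches for fib_alias and leaf/leaf_info objects */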
1993void __init fib_hash_init(void)
1994{
1995        fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1996                                          sizeof(struct fib_alias),
1997                                          0, SLAB_PANIC, NULL);
1998
1999        trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
2000                                           max(sizeof(struct leaf),
2001                                               sizeof(struct leaf_info)),
2002                                           0, SLAB_PANIC, NULL);
2003}
2004
2005
2006/* TODO: switch to more generic FIB names for init later */
2007struct fib_table *fib_hash_table(u32 id)
2008{
2009        struct fib_table *tb;
2010        struct trie *t;
2011
2012        tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
2013                     GFP_KERNEL);
2014        if (tb == NULL)
2015                return NULL;
2016
2017        tb->tb_id = id;
2018        tb->tb_default = -1;
2019
2020        t = (struct trie *) tb->tb_data;
2021        memset(t, 0, sizeof(*t));
2022
2023        if (id == RT_TABLE_LOCAL)
2024                pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION);
2025
2026        return tb;
2027}
2028
2029#ifdef CONFIG_PROC_FS
2030/* Depth first Trie walk iterator */
2031struct fib_trie_iter {
2032        struct seq_net_private p;
2033        struct fib_table *tb;
2034        struct tnode *tnode;
2035        unsigned int index;
2036        unsigned int depth;
2037};
2038
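    /*
     * Advance the depth-first walk by one node: scan the current tnode's
     * children from iter->index, descending into child tnodes as they are
     * found; when a tnode is exhausted, pop back up to its parent.
     * Returns the next node, or NULL once the whole trie has been visited.
     */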
2039static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
2040{
2041        struct tnode *tn = iter->tnode;
2042        unsigned int cindex = iter->index;
2043        struct tnode *p;
2044
2045        /* A single entry routing table */
2046        if (!tn)
2047                return NULL;
2048
2049        pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2050                 iter->tnode, iter->index, iter->depth);
2051rescan:
2052        while (cindex < (1<<tn->bits)) {
2053                struct node *n = tnode_get_child_rcu(tn, cindex);
2054
2055                if (n) {
2056                        if (IS_LEAF(n)) {
2057                                iter->tnode = tn;
2058                                iter->index = cindex + 1;
2059                        } else {
2060                                /* push down one level */
2061                                iter->tnode = (struct tnode *) n;
2062                                iter->index = 0;
2063                                ++iter->depth;
2064                        }
2065                        return n;
2066                }
2067
2068                ++cindex;
2069        }
2070
2071        /* Current node exhausted, pop back up */
2072        p = node_parent_rcu((struct node *)tn);
2073        if (p) {
2074                cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2075                tn = p;
2076                --iter->depth;
2077                goto rescan;
2078        }
2079
2080        /* got root? */
2081        return NULL;
2082}
2083
2084static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2085                                       struct trie *t)
2086{
2087        struct node *n;
2088
2089        if (!t)
2090                return NULL;
2091
2092        n = rcu_dereference(t->trie);
2093        if (!n)
2094                return NULL;
2095
2096        if (IS_TNODE(n)) {
2097                iter->tnode = (struct tnode *) n;
2098                iter->index = 0;
2099                iter->depth = 1;
2100        } else {
2101                iter->tnode = NULL;
2102                iter->index = 0;
2103                iter->depth = 0;
2104        }
2105
2106        return n;
2107}
2108
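    /*
     * Walk the whole trie and fill in *s: leaf, prefix and tnode counts,
     * depth statistics, the tnode size histogram and the number of NULL
     * child pointers.
     */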
2109static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2110{
2111        struct node *n;
2112        struct fib_trie_iter iter;
2113
2114        memset(s, 0, sizeof(*s));
2115
2116        rcu_read_lock();
2117        for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2118                if (IS_LEAF(n)) {
2119                        struct leaf *l = (struct leaf *)n;
2120                        struct leaf_info *li;
2121                        struct hlist_node *tmp;
2122
2123                        s->leaves++;
2124                        s->totdepth += iter.depth;
2125                        if (iter.depth > s->maxdepth)
2126                                s->maxdepth = iter.depth;
2127
2128                        hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
2129                                ++s->prefixes;
2130                } else {
2131                        const struct tnode *tn = (const struct tnode *) n;
2132                        int i;
2133
2134                        s->tnodes++;
2135                        if (tn->bits < MAX_STAT_DEPTH)
2136                                s->nodesizes[tn->bits]++;
2137
2138                        for (i = 0; i < (1<<tn->bits); i++)
2139                                if (!tn->child[i])
2140                                        s->nullpointers++;
2141                }
2142        }
2143        rcu_read_unlock();
2144}
2145
2146/*
2147 *      This outputs /proc/net/fib_triestat.
2148 */
2149static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2150{
2151        unsigned int i, max, pointers, bytes, avdepth;
2152
2153        if (stat->leaves)
2154                avdepth = stat->totdepth*100 / stat->leaves;
2155        else
2156                avdepth = 0;
2157
2158        seq_printf(seq, "\tAver depth:     %u.%02d\n",
2159                   avdepth / 100, avdepth % 100);
2160        seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2161
2162        seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2163        bytes = sizeof(struct leaf) * stat->leaves;
2164
2165        seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
2166        bytes += sizeof(struct leaf_info) * stat->prefixes;
2167
2168        seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2169        bytes += sizeof(struct tnode) * stat->tnodes;
2170
2171        max = MAX_STAT_DEPTH;
2172        while (max > 0 && stat->nodesizes[max-1] == 0)
2173                max--;
2174
2175        pointers = 0;
2176        for (i = 1; i <= max; i++)
2177                if (stat->nodesizes[i] != 0) {
2178                        seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2179                        pointers += (1<<i) * stat->nodesizes[i];
2180                }
2181        seq_putc(seq, '\n');
2182        seq_printf(seq, "\tPointers: %u\n", pointers);
2183
2184        bytes += sizeof(struct node *) * pointers;
2185        seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2186        seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2187}
2188
2189#ifdef CONFIG_IP_FIB_TRIE_STATS
2190static void trie_show_usage(struct seq_file *seq,
2191                            const struct trie_use_stats *stats)
2192{
2193        seq_printf(seq, "\nCounters:\n---------\n");
2194        seq_printf(seq, "gets = %u\n", stats->gets);
2195        seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2196        seq_printf(seq, "semantic match passed = %u\n",
2197                   stats->semantic_match_passed);
2198        seq_printf(seq, "semantic match miss = %u\n",
2199                   stats->semantic_match_miss);
2200        seq_printf(seq, "null node hit = %u\n", stats->null_node_hit);
2201        seq_printf(seq, "skipped node resize = %u\n\n",
2202                   stats->resize_node_skipped);
2203}
2204#endif /*  CONFIG_IP_FIB_TRIE_STATS */
2205
2206static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2207{
2208        if (tb->tb_id == RT_TABLE_LOCAL)
2209                seq_puts(seq, "Local:\n");
2210        else if (tb->tb_id == RT_TABLE_MAIN)
2211                seq_puts(seq, "Main:\n");
2212        else
2213                seq_printf(seq, "Id %d:\n", tb->tb_id);
2214}
2215
2216
2217static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2218{
2219        struct net *net = (struct net *)seq->private;
2220        unsigned int h;
2221
2222        seq_printf(seq,
2223                   "Basic info: size of leaf:"
2224                   " %Zd bytes, size of tnode: %Zd bytes.\n",
2225                   sizeof(struct leaf), sizeof(struct tnode));
2226
2227        for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2228                struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2229                struct hlist_node *node;
2230                struct fib_table *tb;
2231
2232                hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2233                        struct trie *t = (struct trie *) tb->tb_data;
2234                        struct trie_stat stat;
2235
2236                        if (!t)
2237                                continue;
2238
2239                        fib_table_print(seq, tb);
2240
2241                        trie_collect_stats(t, &stat);
2242                        trie_show_stats(seq, &stat);
2243#ifdef CONFIG_IP_FIB_TRIE_STATS
2244                        trie_show_usage(seq, &t->stats);
2245#endif
2246                }
2247        }
2248
2249        return 0;
2250}
2251
2252static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2253{
2254        return single_open_net(inode, file, fib_triestat_seq_show);
2255}
2256
2257static const struct file_operations fib_triestat_fops = {
2258        .owner  = THIS_MODULE,
2259        .open   = fib_triestat_seq_open,
2260        .read   = seq_read,
2261        .llseek = seq_lseek,
2262        .release = single_release_net,
2263};
2264
2265static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2266{
2267        struct fib_trie_iter *iter = seq->private;
2268        struct net *net = seq_file_net(seq);
2269        loff_t idx = 0;
2270        unsigned int h;
2271
2272        for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2273                struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2274                struct hlist_node *node;
2275                struct fib_table *tb;
2276
2277                hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2278                        struct node *n;
2279
2280                        for (n = fib_trie_get_first(iter,
2281                                                    (struct trie *) tb->tb_data);
2282                             n; n = fib_trie_get_next(iter))
2283                                if (pos == idx++) {
2284                                        iter->tb = tb;
2285                                        return n;
2286                                }
2287                }
2288        }
2289
2290        return NULL;
2291}
2292
2293static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2294        __acquires(RCU)
2295{
2296        rcu_read_lock();
2297        return fib_trie_get_idx(seq, *pos);
2298}
2299
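    /*
     * Move to the next node: first within the current table, then through
     * the remaining tables on the same hash chain, and finally through the
     * remaining hash chains.
     */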
2300static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2301{
2302        struct fib_trie_iter *iter = seq->private;
2303        struct net *net = seq_file_net(seq);
2304        struct fib_table *tb = iter->tb;
2305        struct hlist_node *tb_node;
2306        unsigned int h;
2307        struct node *n;
2308
2309        ++*pos;
2310        /* next node in same table */
2311        n = fib_trie_get_next(iter);
2312        if (n)
2313                return n;
2314
2315        /* walk rest of this hash chain */
2316        h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2317        while ((tb_node = rcu_dereference(tb->tb_hlist.next))) {
2318                tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2319                n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2320                if (n)
2321                        goto found;
2322        }
2323
2324        /* new hash chain */
2325        while (++h < FIB_TABLE_HASHSZ) {
2326                struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2327                hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
2328                        n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2329                        if (n)
2330                                goto found;
2331                }
2332        }
2333        return NULL;
2334
2335found:
2336        iter->tb = tb;
2337        return n;
2338}
2339
2340static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2341        __releases(RCU)
2342{
2343        rcu_read_unlock();
2344}
2345
2346static void seq_indent(struct seq_file *seq, int n)
2347{
2348        while (n-- > 0)
2349                seq_puts(seq, "   ");
2350}
2351
2352static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2353{
2354        switch (s) {
2355        case RT_SCOPE_UNIVERSE: return "universe";
2356        case RT_SCOPE_SITE:     return "site";
2357        case RT_SCOPE_LINK:     return "link";
2358        case RT_SCOPE_HOST:     return "host";
2359        case RT_SCOPE_NOWHERE:  return "nowhere";
2360        default:
2361                snprintf(buf, len, "scope=%d", s);
2362                return buf;
2363        }
2364}
2365
2366static const char *const rtn_type_names[__RTN_MAX] = {
2367        [RTN_UNSPEC] = "UNSPEC",
2368        [RTN_UNICAST] = "UNICAST",
2369        [RTN_LOCAL] = "LOCAL",
2370        [RTN_BROADCAST] = "BROADCAST",
2371        [RTN_ANYCAST] = "ANYCAST",
2372        [RTN_MULTICAST] = "MULTICAST",
2373        [RTN_BLACKHOLE] = "BLACKHOLE",
2374        [RTN_UNREACHABLE] = "UNREACHABLE",
2375        [RTN_PROHIBIT] = "PROHIBIT",
2376        [RTN_THROW] = "THROW",
2377        [RTN_NAT] = "NAT",
2378        [RTN_XRESOLVE] = "XRESOLVE",
2379};
2380
2381static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2382{
2383        if (t < __RTN_MAX && rtn_type_names[t])
2384                return rtn_type_names[t];
2385        snprintf(buf, len, "type %u", t);
2386        return buf;
2387}
2388
2389/* Pretty print the trie */
2390static int fib_trie_seq_show(struct seq_file *seq, void *v)
2391{
2392        const struct fib_trie_iter *iter = seq->private;
2393        struct node *n = v;
2394
2395        if (!node_parent_rcu(n))
2396                fib_table_print(seq, iter->tb);
2397
2398        if (IS_TNODE(n)) {
2399                struct tnode *tn = (struct tnode *) n;
2400                __be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2401
2402                seq_indent(seq, iter->depth-1);
2403                seq_printf(seq, "  +-- %pI4/%d %d %d %d\n",
2404                           &prf, tn->pos, tn->bits, tn->full_children,
2405                           tn->empty_children);
2406
2407        } else {
2408                struct leaf *l = (struct leaf *) n;
2409                struct leaf_info *li;
2410                struct hlist_node *node;
2411                __be32 val = htonl(l->key);
2412
2413                seq_indent(seq, iter->depth);
2414                seq_printf(seq, "  |-- %pI4\n", &val);
2415
2416                hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2417                        struct fib_alias *fa;
2418
2419                        list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2420                                char buf1[32], buf2[32];
2421
2422                                seq_indent(seq, iter->depth+1);
2423                                seq_printf(seq, "  /%d %s %s", li->plen,
2424                                           rtn_scope(buf1, sizeof(buf1),
2425                                                     fa->fa_scope),
2426                                           rtn_type(buf2, sizeof(buf2),
2427                                                    fa->fa_type));
2428                                if (fa->fa_tos)
2429                                        seq_printf(seq, " tos=%d", fa->fa_tos);
2430                                seq_putc(seq, '\n');
2431                        }
2432                }
2433        }
2434
2435        return 0;
2436}
2437
2438static const struct seq_operations fib_trie_seq_ops = {
2439        .start  = fib_trie_seq_start,
2440        .next   = fib_trie_seq_next,
2441        .stop   = fib_trie_seq_stop,
2442        .show   = fib_trie_seq_show,
2443};
2444
2445static int fib_trie_seq_open(struct inode *inode, struct file *file)
2446{
2447        return seq_open_net(inode, file, &fib_trie_seq_ops,
2448                            sizeof(struct fib_trie_iter));
2449}
2450
2451static const struct file_operations fib_trie_fops = {
2452        .owner  = THIS_MODULE,
2453        .open   = fib_trie_seq_open,
2454        .read   = seq_read,
2455        .llseek = seq_lseek,
2456        .release = seq_release_net,
2457};
2458
2459struct fib_route_iter {
2460        struct seq_net_private p;
2461        struct trie *main_trie;
2462        loff_t  pos;
2463        t_key   key;
2464};
2465
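    /*
     * Return the pos'th leaf of the main trie.  When possible, resume from
     * the leaf cached in iter->key/iter->pos instead of walking from the
     * first leaf.
     */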
2466static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2467{
2468        struct leaf *l = NULL;
2469        struct trie *t = iter->main_trie;
2470
2471        /* use cache location of last found key */
2472        if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2473                pos -= iter->pos;
2474        else {
2475                iter->pos = 0;
2476                l = trie_firstleaf(t);
2477        }
2478
2479        while (l && pos-- > 0) {
2480                iter->pos++;
2481                l = trie_nextleaf(l);
2482        }
2483
2484        if (l)
2485                iter->key = l->key;     /* remember it */
2486        else
2487                iter->pos = 0;          /* forget it */
2488
2489        return l;
2490}
2491
2492static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2493        __acquires(RCU)
2494{
2495        struct fib_route_iter *iter = seq->private;
2496        struct fib_table *tb;
2497
2498        rcu_read_lock();
2499        tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2500        if (!tb)
2501                return NULL;
2502
2503        iter->main_trie = (struct trie *) tb->tb_data;
2504        if (*pos == 0)
2505                return SEQ_START_TOKEN;
2506        else
2507                return fib_route_get_idx(iter, *pos - 1);
2508}
2509
2510static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2511{
2512        struct fib_route_iter *iter = seq->private;
2513        struct leaf *l = v;
2514
2515        ++*pos;
2516        if (v == SEQ_START_TOKEN) {
2517                iter->pos = 0;
2518                l = trie_firstleaf(iter->main_trie);
2519        } else {
2520                iter->pos++;
2521                l = trie_nextleaf(l);
2522        }
2523
2524        if (l)
2525                iter->key = l->key;
2526        else
2527                iter->pos = 0;
2528        return l;
2529}
2530
2531static void fib_route_seq_stop(struct seq_file *seq, void *v)
2532        __releases(RCU)
2533{
2534        rcu_read_unlock();
2535}
2536
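    /*
     * Translate a route's type, mask and fib_info into the legacy RTF_*
     * flags reported in /proc/net/route.
     */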
2537static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2538{
2539        unsigned int flags = 0;
2540
2541        if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
2542                flags = RTF_REJECT;
2543        if (fi && fi->fib_nh->nh_gw)
2544                flags |= RTF_GATEWAY;
2545        if (mask == htonl(0xFFFFFFFF))
2546                flags |= RTF_HOST;
2547        flags |= RTF_UP;
2548        return flags;
2549}
2550
2551/*
2552 *      This outputs /proc/net/route.
2553 *      The format of the file is not supposed to be changed
2554 *      and needs to be the same as the fib_hash output to avoid breaking
2555 *      legacy utilities
2556 */
2557static int fib_route_seq_show(struct seq_file *seq, void *v)
2558{
2559        struct leaf *l = v;
2560        struct leaf_info *li;
2561        struct hlist_node *node;
2562
2563        if (v == SEQ_START_TOKEN) {
2564                seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2565                           "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2566                           "\tWindow\tIRTT");
2567                return 0;
2568        }
2569
2570        hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2571                struct fib_alias *fa;
2572                __be32 mask, prefix;
2573
2574                mask = inet_make_mask(li->plen);
2575                prefix = htonl(l->key);
2576
2577                list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2578                        const struct fib_info *fi = fa->fa_info;
2579                        unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
2580                        int len;
2581
2582                        if (fa->fa_type == RTN_BROADCAST
2583                            || fa->fa_type == RTN_MULTICAST)
2584                                continue;
2585
2586                        if (fi)
2587                                seq_printf(seq,
2588                                         "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2589                                         "%d\t%08X\t%d\t%u\t%u%n",
2590                                         fi->fib_dev ? fi->fib_dev->name : "*",
2591                                         prefix,
2592                                         fi->fib_nh->nh_gw, flags, 0, 0,
2593                                         fi->fib_priority,
2594                                         mask,
2595                                         (fi->fib_advmss ?
2596                                          fi->fib_advmss + 40 : 0),
2597                                         fi->fib_window,
2598                                         fi->fib_rtt >> 3, &len);
2599                        else
2600                                seq_printf(seq,
2601                                         "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2602                                         "%d\t%08X\t%d\t%u\t%u%n",
2603                                         prefix, 0, flags, 0, 0, 0,
2604                                         mask, 0, 0, 0, &len);
2605
2606                        seq_printf(seq, "%*s\n", 127 - len, "");
2607                }
2608        }
2609
2610        return 0;
2611}
2612
2613static const struct seq_operations fib_route_seq_ops = {
2614        .start  = fib_route_seq_start,
2615        .next   = fib_route_seq_next,
2616        .stop   = fib_route_seq_stop,
2617        .show   = fib_route_seq_show,
2618};
2619
2620static int fib_route_seq_open(struct inode *inode, struct file *file)
2621{
2622        return seq_open_net(inode, file, &fib_route_seq_ops,
2623                            sizeof(struct fib_route_iter));
2624}
2625
2626static const struct file_operations fib_route_fops = {
2627        .owner  = THIS_MODULE,
2628        .open   = fib_route_seq_open,
2629        .read   = seq_read,
2630        .llseek = seq_lseek,
2631        .release = seq_release_net,
2632};
2633
2634int __net_init fib_proc_init(struct net *net)
2635{
2636        if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
2637                goto out1;
2638
2639        if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
2640                                  &fib_triestat_fops))
2641                goto out2;
2642
2643        if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
2644                goto out3;
2645
2646        return 0;
2647
2648out3:
2649        proc_net_remove(net, "fib_triestat");
2650out2:
2651        proc_net_remove(net, "fib_trie");
2652out1:
2653        return -ENOMEM;
2654}
2655
2656void __net_exit fib_proc_exit(struct net *net)
2657{
2658        proc_net_remove(net, "fib_trie");
2659        proc_net_remove(net, "fib_triestat");
2660        proc_net_remove(net, "route");
2661}
2662
2663#endif /* CONFIG_PROC_FS */
2664