linux/net/ipv6/ip6_fib.c
<<
>>
Prefs
   1/*
   2 *      Linux INET6 implementation
   3 *      Forwarding Information Database
   4 *
   5 *      Authors:
   6 *      Pedro Roque             <roque@di.fc.ul.pt>
   7 *
   8 *      This program is free software; you can redistribute it and/or
   9 *      modify it under the terms of the GNU General Public License
  10 *      as published by the Free Software Foundation; either version
  11 *      2 of the License, or (at your option) any later version.
  12 *
  13 *      Changes:
  14 *      Yuji SEKIYA @USAGI:     Support default route on router node;
  15 *                              remove ip6_null_entry from the top of
  16 *                              routing table.
  17 *      Ville Nuorvala:         Fixed routing subtrees.
  18 */
  19
  20#define pr_fmt(fmt) "IPv6: " fmt
  21
  22#include <linux/errno.h>
  23#include <linux/types.h>
  24#include <linux/net.h>
  25#include <linux/route.h>
  26#include <linux/netdevice.h>
  27#include <linux/in6.h>
  28#include <linux/init.h>
  29#include <linux/list.h>
  30#include <linux/slab.h>
  31
  32#include <net/ipv6.h>
  33#include <net/ndisc.h>
  34#include <net/addrconf.h>
  35
  36#include <net/ip6_fib.h>
  37#include <net/ip6_route.h>
  38
  39#define RT6_DEBUG 2
  40
  41#if RT6_DEBUG >= 3
  42#define RT6_TRACE(x...) pr_debug(x)
  43#else
  44#define RT6_TRACE(x...) do { ; } while (0)
  45#endif
  46
  47static struct kmem_cache *fib6_node_kmem __read_mostly;
  48
  49struct fib6_cleaner {
  50        struct fib6_walker w;
  51        struct net *net;
  52        int (*func)(struct rt6_info *, void *arg);
  53        int sernum;
  54        void *arg;
  55};
  56
  57static DEFINE_RWLOCK(fib6_walker_lock);
  58
  59#ifdef CONFIG_IPV6_SUBTREES
  60#define FWS_INIT FWS_S
  61#else
  62#define FWS_INIT FWS_L
  63#endif
  64
  65static void fib6_prune_clones(struct net *net, struct fib6_node *fn);
  66static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn);
  67static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_node *fn);
  68static int fib6_walk(struct fib6_walker *w);
  69static int fib6_walk_continue(struct fib6_walker *w);
  70
  71/*
  72 *      A routing update causes an increase of the serial number on the
  73 *      affected subtree. This allows for cached routes to be asynchronously
  74 *      tested when modifications are made to the destination cache as a
  75 *      result of redirects, path MTU changes, etc.
  76 */
  77
  78static void fib6_gc_timer_cb(unsigned long arg);
  79
  80static LIST_HEAD(fib6_walkers);
  81#define FOR_WALKERS(w) list_for_each_entry(w, &fib6_walkers, lh)
  82
  83static void fib6_walker_link(struct fib6_walker *w)
  84{
  85        write_lock_bh(&fib6_walker_lock);
  86        list_add(&w->lh, &fib6_walkers);
  87        write_unlock_bh(&fib6_walker_lock);
  88}
  89
  90static void fib6_walker_unlink(struct fib6_walker *w)
  91{
  92        write_lock_bh(&fib6_walker_lock);
  93        list_del(&w->lh);
  94        write_unlock_bh(&fib6_walker_lock);
  95}
  96
  97static int fib6_new_sernum(struct net *net)
  98{
  99        int new, old;
 100
 101        do {
 102                old = atomic_read(&net->ipv6.fib6_sernum);
 103                new = old < INT_MAX ? old + 1 : 1;
 104        } while (atomic_cmpxchg(&net->ipv6.fib6_sernum,
 105                                old, new) != old);
 106        return new;
 107}
 108
 109enum {
 110        FIB6_NO_SERNUM_CHANGE = 0,
 111};
 112
 113/*
 114 *      Auxiliary address test functions for the radix tree.
 115 *
 116 *      These assume a 32bit processor (although it will work on
 117 *      64bit processors)
 118 */
 119
 120/*
 121 *      test bit
 122 */
 123#if defined(__LITTLE_ENDIAN)
 124# define BITOP_BE32_SWIZZLE     (0x1F & ~7)
 125#else
 126# define BITOP_BE32_SWIZZLE     0
 127#endif
 128
 129static __be32 addr_bit_set(const void *token, int fn_bit)
 130{
 131        const __be32 *addr = token;
 132        /*
 133         * Here,
 134         *      1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)
 135         * is optimized version of
 136         *      htonl(1 << ((~fn_bit)&0x1F))
 137         * See include/asm-generic/bitops/le.h.
 138         */
 139        return (__force __be32)(1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)) &
 140               addr[fn_bit >> 5];
 141}
 142
 143static struct fib6_node *node_alloc(void)
 144{
 145        struct fib6_node *fn;
 146
 147        fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC);
 148
 149        return fn;
 150}
 151
 152static void node_free(struct fib6_node *fn)
 153{
 154        kmem_cache_free(fib6_node_kmem, fn);
 155}
 156
 157static void rt6_release(struct rt6_info *rt)
 158{
 159        if (atomic_dec_and_test(&rt->rt6i_ref))
 160                dst_free(&rt->dst);
 161}
 162
 163static void fib6_link_table(struct net *net, struct fib6_table *tb)
 164{
 165        unsigned int h;
 166
 167        /*
 168         * Initialize table lock at a single place to give lockdep a key,
 169         * tables aren't visible prior to being linked to the list.
 170         */
 171        rwlock_init(&tb->tb6_lock);
 172
 173        h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1);
 174
 175        /*
 176         * No protection necessary, this is the only list mutatation
 177         * operation, tables never disappear once they exist.
 178         */
 179        hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]);
 180}
 181
 182#ifdef CONFIG_IPV6_MULTIPLE_TABLES
 183
 184static struct fib6_table *fib6_alloc_table(struct net *net, u32 id)
 185{
 186        struct fib6_table *table;
 187
 188        table = kzalloc(sizeof(*table), GFP_ATOMIC);
 189        if (table) {
 190                table->tb6_id = id;
 191                table->tb6_root.leaf = net->ipv6.ip6_null_entry;
 192                table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
 193                inet_peer_base_init(&table->tb6_peers);
 194        }
 195
 196        return table;
 197}
 198
 199struct fib6_table *fib6_new_table(struct net *net, u32 id)
 200{
 201        struct fib6_table *tb;
 202
 203        if (id == 0)
 204                id = RT6_TABLE_MAIN;
 205        tb = fib6_get_table(net, id);
 206        if (tb)
 207                return tb;
 208
 209        tb = fib6_alloc_table(net, id);
 210        if (tb)
 211                fib6_link_table(net, tb);
 212
 213        return tb;
 214}
 215
 216struct fib6_table *fib6_get_table(struct net *net, u32 id)
 217{
 218        struct fib6_table *tb;
 219        struct hlist_head *head;
 220        unsigned int h;
 221
 222        if (id == 0)
 223                id = RT6_TABLE_MAIN;
 224        h = id & (FIB6_TABLE_HASHSZ - 1);
 225        rcu_read_lock();
 226        head = &net->ipv6.fib_table_hash[h];
 227        hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
 228                if (tb->tb6_id == id) {
 229                        rcu_read_unlock();
 230                        return tb;
 231                }
 232        }
 233        rcu_read_unlock();
 234
 235        return NULL;
 236}
 237
 238static void __net_init fib6_tables_init(struct net *net)
 239{
 240        fib6_link_table(net, net->ipv6.fib6_main_tbl);
 241        fib6_link_table(net, net->ipv6.fib6_local_tbl);
 242}
 243#else
 244
 245struct fib6_table *fib6_new_table(struct net *net, u32 id)
 246{
 247        return fib6_get_table(net, id);
 248}
 249
 250struct fib6_table *fib6_get_table(struct net *net, u32 id)
 251{
 252          return net->ipv6.fib6_main_tbl;
 253}
 254
 255struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 256                                   int flags, pol_lookup_t lookup)
 257{
 258        return (struct dst_entry *) lookup(net, net->ipv6.fib6_main_tbl, fl6, flags);
 259}
 260
 261static void __net_init fib6_tables_init(struct net *net)
 262{
 263        fib6_link_table(net, net->ipv6.fib6_main_tbl);
 264}
 265
 266#endif
 267
 268static int fib6_dump_node(struct fib6_walker *w)
 269{
 270        int res;
 271        struct rt6_info *rt;
 272
 273        for (rt = w->leaf; rt; rt = rt->dst.rt6_next) {
 274                res = rt6_dump_route(rt, w->args);
 275                if (res < 0) {
 276                        /* Frame is full, suspend walking */
 277                        w->leaf = rt;
 278                        return 1;
 279                }
 280                WARN_ON(res == 0);
 281        }
 282        w->leaf = NULL;
 283        return 0;
 284}
 285
 286static void fib6_dump_end(struct netlink_callback *cb)
 287{
 288        struct fib6_walker *w = (void *)cb->args[2];
 289
 290        if (w) {
 291                if (cb->args[4]) {
 292                        cb->args[4] = 0;
 293                        fib6_walker_unlink(w);
 294                }
 295                cb->args[2] = 0;
 296                kfree(w);
 297        }
 298        cb->done = (void *)cb->args[3];
 299        cb->args[1] = 3;
 300}
 301
 302static int fib6_dump_done(struct netlink_callback *cb)
 303{
 304        fib6_dump_end(cb);
 305        return cb->done ? cb->done(cb) : 0;
 306}
 307
 308static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
 309                           struct netlink_callback *cb)
 310{
 311        struct fib6_walker *w;
 312        int res;
 313
 314        w = (void *)cb->args[2];
 315        w->root = &table->tb6_root;
 316
 317        if (cb->args[4] == 0) {
 318                w->count = 0;
 319                w->skip = 0;
 320
 321                read_lock_bh(&table->tb6_lock);
 322                res = fib6_walk(w);
 323                read_unlock_bh(&table->tb6_lock);
 324                if (res > 0) {
 325                        cb->args[4] = 1;
 326                        cb->args[5] = w->root->fn_sernum;
 327                }
 328        } else {
 329                if (cb->args[5] != w->root->fn_sernum) {
 330                        /* Begin at the root if the tree changed */
 331                        cb->args[5] = w->root->fn_sernum;
 332                        w->state = FWS_INIT;
 333                        w->node = w->root;
 334                        w->skip = w->count;
 335                } else
 336                        w->skip = 0;
 337
 338                read_lock_bh(&table->tb6_lock);
 339                res = fib6_walk_continue(w);
 340                read_unlock_bh(&table->tb6_lock);
 341                if (res <= 0) {
 342                        fib6_walker_unlink(w);
 343                        cb->args[4] = 0;
 344                }
 345        }
 346
 347        return res;
 348}
 349
 350static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 351{
 352        struct net *net = sock_net(skb->sk);
 353        unsigned int h, s_h;
 354        unsigned int e = 0, s_e;
 355        struct rt6_rtnl_dump_arg arg;
 356        struct fib6_walker *w;
 357        struct fib6_table *tb;
 358        struct hlist_head *head;
 359        int res = 0;
 360
 361        s_h = cb->args[0];
 362        s_e = cb->args[1];
 363
 364        w = (void *)cb->args[2];
 365        if (!w) {
 366                /* New dump:
 367                 *
 368                 * 1. hook callback destructor.
 369                 */
 370                cb->args[3] = (long)cb->done;
 371                cb->done = fib6_dump_done;
 372
 373                /*
 374                 * 2. allocate and initialize walker.
 375                 */
 376                w = kzalloc(sizeof(*w), GFP_ATOMIC);
 377                if (!w)
 378                        return -ENOMEM;
 379                w->func = fib6_dump_node;
 380                cb->args[2] = (long)w;
 381        }
 382
 383        arg.skb = skb;
 384        arg.cb = cb;
 385        arg.net = net;
 386        w->args = &arg;
 387
 388        rcu_read_lock();
 389        for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) {
 390                e = 0;
 391                head = &net->ipv6.fib_table_hash[h];
 392                hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
 393                        if (e < s_e)
 394                                goto next;
 395                        res = fib6_dump_table(tb, skb, cb);
 396                        if (res != 0)
 397                                goto out;
 398next:
 399                        e++;
 400                }
 401        }
 402out:
 403        rcu_read_unlock();
 404        cb->args[1] = e;
 405        cb->args[0] = h;
 406
 407        res = res < 0 ? res : skb->len;
 408        if (res <= 0)
 409                fib6_dump_end(cb);
 410        return res;
 411}
 412
 413/*
 414 *      Routing Table
 415 *
 416 *      return the appropriate node for a routing tree "add" operation
 417 *      by either creating and inserting or by returning an existing
 418 *      node.
 419 */
 420
 421static struct fib6_node *fib6_add_1(struct fib6_node *root,
 422                                     struct in6_addr *addr, int plen,
 423                                     int offset, int allow_create,
 424                                     int replace_required, int sernum)
 425{
 426        struct fib6_node *fn, *in, *ln;
 427        struct fib6_node *pn = NULL;
 428        struct rt6key *key;
 429        int     bit;
 430        __be32  dir = 0;
 431
 432        RT6_TRACE("fib6_add_1\n");
 433
 434        /* insert node in tree */
 435
 436        fn = root;
 437
 438        do {
 439                key = (struct rt6key *)((u8 *)fn->leaf + offset);
 440
 441                /*
 442                 *      Prefix match
 443                 */
 444                if (plen < fn->fn_bit ||
 445                    !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) {
 446                        if (!allow_create) {
 447                                if (replace_required) {
 448                                        pr_warn("Can't replace route, no match found\n");
 449                                        return ERR_PTR(-ENOENT);
 450                                }
 451                                pr_warn("NLM_F_CREATE should be set when creating new route\n");
 452                        }
 453                        goto insert_above;
 454                }
 455
 456                /*
 457                 *      Exact match ?
 458                 */
 459
 460                if (plen == fn->fn_bit) {
 461                        /* clean up an intermediate node */
 462                        if (!(fn->fn_flags & RTN_RTINFO)) {
 463                                rt6_release(fn->leaf);
 464                                fn->leaf = NULL;
 465                        }
 466
 467                        fn->fn_sernum = sernum;
 468
 469                        return fn;
 470                }
 471
 472                /*
 473                 *      We have more bits to go
 474                 */
 475
 476                /* Try to walk down on tree. */
 477                fn->fn_sernum = sernum;
 478                dir = addr_bit_set(addr, fn->fn_bit);
 479                pn = fn;
 480                fn = dir ? fn->right : fn->left;
 481        } while (fn);
 482
 483        if (!allow_create) {
 484                /* We should not create new node because
 485                 * NLM_F_REPLACE was specified without NLM_F_CREATE
 486                 * I assume it is safe to require NLM_F_CREATE when
 487                 * REPLACE flag is used! Later we may want to remove the
 488                 * check for replace_required, because according
 489                 * to netlink specification, NLM_F_CREATE
 490                 * MUST be specified if new route is created.
 491                 * That would keep IPv6 consistent with IPv4
 492                 */
 493                if (replace_required) {
 494                        pr_warn("Can't replace route, no match found\n");
 495                        return ERR_PTR(-ENOENT);
 496                }
 497                pr_warn("NLM_F_CREATE should be set when creating new route\n");
 498        }
 499        /*
 500         *      We walked to the bottom of tree.
 501         *      Create new leaf node without children.
 502         */
 503
 504        ln = node_alloc();
 505
 506        if (!ln)
 507                return ERR_PTR(-ENOMEM);
 508        ln->fn_bit = plen;
 509
 510        ln->parent = pn;
 511        ln->fn_sernum = sernum;
 512
 513        if (dir)
 514                pn->right = ln;
 515        else
 516                pn->left  = ln;
 517
 518        return ln;
 519
 520
 521insert_above:
 522        /*
 523         * split since we don't have a common prefix anymore or
 524         * we have a less significant route.
 525         * we've to insert an intermediate node on the list
 526         * this new node will point to the one we need to create
 527         * and the current
 528         */
 529
 530        pn = fn->parent;
 531
 532        /* find 1st bit in difference between the 2 addrs.
 533
 534           See comment in __ipv6_addr_diff: bit may be an invalid value,
 535           but if it is >= plen, the value is ignored in any case.
 536         */
 537
 538        bit = __ipv6_addr_diff(addr, &key->addr, sizeof(*addr));
 539
 540        /*
 541         *              (intermediate)[in]
 542         *                /        \
 543         *      (new leaf node)[ln] (old node)[fn]
 544         */
 545        if (plen > bit) {
 546                in = node_alloc();
 547                ln = node_alloc();
 548
 549                if (!in || !ln) {
 550                        if (in)
 551                                node_free(in);
 552                        if (ln)
 553                                node_free(ln);
 554                        return ERR_PTR(-ENOMEM);
 555                }
 556
 557                /*
 558                 * new intermediate node.
 559                 * RTN_RTINFO will
 560                 * be off since that an address that chooses one of
 561                 * the branches would not match less specific routes
 562                 * in the other branch
 563                 */
 564
 565                in->fn_bit = bit;
 566
 567                in->parent = pn;
 568                in->leaf = fn->leaf;
 569                atomic_inc(&in->leaf->rt6i_ref);
 570
 571                in->fn_sernum = sernum;
 572
 573                /* update parent pointer */
 574                if (dir)
 575                        pn->right = in;
 576                else
 577                        pn->left  = in;
 578
 579                ln->fn_bit = plen;
 580
 581                ln->parent = in;
 582                fn->parent = in;
 583
 584                ln->fn_sernum = sernum;
 585
 586                if (addr_bit_set(addr, bit)) {
 587                        in->right = ln;
 588                        in->left  = fn;
 589                } else {
 590                        in->left  = ln;
 591                        in->right = fn;
 592                }
 593        } else { /* plen <= bit */
 594
 595                /*
 596                 *              (new leaf node)[ln]
 597                 *                /        \
 598                 *           (old node)[fn] NULL
 599                 */
 600
 601                ln = node_alloc();
 602
 603                if (!ln)
 604                        return ERR_PTR(-ENOMEM);
 605
 606                ln->fn_bit = plen;
 607
 608                ln->parent = pn;
 609
 610                ln->fn_sernum = sernum;
 611
 612                if (dir)
 613                        pn->right = ln;
 614                else
 615                        pn->left  = ln;
 616
 617                if (addr_bit_set(&key->addr, plen))
 618                        ln->right = fn;
 619                else
 620                        ln->left  = fn;
 621
 622                fn->parent = ln;
 623        }
 624        return ln;
 625}
 626
 627static bool rt6_qualify_for_ecmp(struct rt6_info *rt)
 628{
 629        return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) ==
 630               RTF_GATEWAY;
 631}
 632
 633static int fib6_commit_metrics(struct dst_entry *dst,
 634                               struct nlattr *mx, int mx_len)
 635{
 636        struct nlattr *nla;
 637        int remaining;
 638        u32 *mp;
 639
 640        if (dst->flags & DST_HOST) {
 641                mp = dst_metrics_write_ptr(dst);
 642        } else {
 643                mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC);
 644                if (!mp)
 645                        return -ENOMEM;
 646                dst_init_metrics(dst, mp, 0);
 647        }
 648
 649        nla_for_each_attr(nla, mx, mx_len, remaining) {
 650                int type = nla_type(nla);
 651
 652                if (type) {
 653                        if (type > RTAX_MAX)
 654                                return -EINVAL;
 655
 656                        mp[type - 1] = nla_get_u32(nla);
 657                }
 658        }
 659        return 0;
 660}
 661
 662static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn,
 663                          struct net *net)
 664{
 665        if (atomic_read(&rt->rt6i_ref) != 1) {
 666                /* This route is used as dummy address holder in some split
 667                 * nodes. It is not leaked, but it still holds other resources,
 668                 * which must be released in time. So, scan ascendant nodes
 669                 * and replace dummy references to this route with references
 670                 * to still alive ones.
 671                 */
 672                while (fn) {
 673                        if (!(fn->fn_flags & RTN_RTINFO) && fn->leaf == rt) {
 674                                fn->leaf = fib6_find_prefix(net, fn);
 675                                atomic_inc(&fn->leaf->rt6i_ref);
 676                                rt6_release(rt);
 677                        }
 678                        fn = fn->parent;
 679                }
 680                /* No more references are possible at this point. */
 681                BUG_ON(atomic_read(&rt->rt6i_ref) != 1);
 682        }
 683}
 684
 685/*
 686 *      Insert routing information in a node.
 687 */
 688
 689static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
 690                            struct nl_info *info, struct nlattr *mx, int mx_len)
 691{
 692        struct rt6_info *iter = NULL;
 693        struct rt6_info **ins;
 694        int replace = (info->nlh &&
 695                       (info->nlh->nlmsg_flags & NLM_F_REPLACE));
 696        int add = (!info->nlh ||
 697                   (info->nlh->nlmsg_flags & NLM_F_CREATE));
 698        int found = 0;
 699        bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
 700        int err;
 701
 702        ins = &fn->leaf;
 703
 704        for (iter = fn->leaf; iter; iter = iter->dst.rt6_next) {
 705                /*
 706                 *      Search for duplicates
 707                 */
 708
 709                if (iter->rt6i_metric == rt->rt6i_metric) {
 710                        /*
 711                         *      Same priority level
 712                         */
 713                        if (info->nlh &&
 714                            (info->nlh->nlmsg_flags & NLM_F_EXCL))
 715                                return -EEXIST;
 716                        if (replace) {
 717                                found++;
 718                                break;
 719                        }
 720
 721                        if (iter->dst.dev == rt->dst.dev &&
 722                            iter->rt6i_idev == rt->rt6i_idev &&
 723                            ipv6_addr_equal(&iter->rt6i_gateway,
 724                                            &rt->rt6i_gateway)) {
 725                                if (rt->rt6i_nsiblings)
 726                                        rt->rt6i_nsiblings = 0;
 727                                if (!(iter->rt6i_flags & RTF_EXPIRES))
 728                                        return -EEXIST;
 729                                if (!(rt->rt6i_flags & RTF_EXPIRES))
 730                                        rt6_clean_expires(iter);
 731                                else
 732                                        rt6_set_expires(iter, rt->dst.expires);
 733                                return -EEXIST;
 734                        }
 735                        /* If we have the same destination and the same metric,
 736                         * but not the same gateway, then the route we try to
 737                         * add is sibling to this route, increment our counter
 738                         * of siblings, and later we will add our route to the
 739                         * list.
 740                         * Only static routes (which don't have flag
 741                         * RTF_EXPIRES) are used for ECMPv6.
 742                         *
 743                         * To avoid long list, we only had siblings if the
 744                         * route have a gateway.
 745                         */
 746                        if (rt_can_ecmp &&
 747                            rt6_qualify_for_ecmp(iter))
 748                                rt->rt6i_nsiblings++;
 749                }
 750
 751                if (iter->rt6i_metric > rt->rt6i_metric)
 752                        break;
 753
 754                ins = &iter->dst.rt6_next;
 755        }
 756
 757        /* Reset round-robin state, if necessary */
 758        if (ins == &fn->leaf)
 759                fn->rr_ptr = NULL;
 760
 761        /* Link this route to others same route. */
 762        if (rt->rt6i_nsiblings) {
 763                unsigned int rt6i_nsiblings;
 764                struct rt6_info *sibling, *temp_sibling;
 765
 766                /* Find the first route that have the same metric */
 767                sibling = fn->leaf;
 768                while (sibling) {
 769                        if (sibling->rt6i_metric == rt->rt6i_metric &&
 770                            rt6_qualify_for_ecmp(sibling)) {
 771                                list_add_tail(&rt->rt6i_siblings,
 772                                              &sibling->rt6i_siblings);
 773                                break;
 774                        }
 775                        sibling = sibling->dst.rt6_next;
 776                }
 777                /* For each sibling in the list, increment the counter of
 778                 * siblings. BUG() if counters does not match, list of siblings
 779                 * is broken!
 780                 */
 781                rt6i_nsiblings = 0;
 782                list_for_each_entry_safe(sibling, temp_sibling,
 783                                         &rt->rt6i_siblings, rt6i_siblings) {
 784                        sibling->rt6i_nsiblings++;
 785                        BUG_ON(sibling->rt6i_nsiblings != rt->rt6i_nsiblings);
 786                        rt6i_nsiblings++;
 787                }
 788                BUG_ON(rt6i_nsiblings != rt->rt6i_nsiblings);
 789        }
 790
 791        /*
 792         *      insert node
 793         */
 794        if (!replace) {
 795                if (!add)
 796                        pr_warn("NLM_F_CREATE should be set when creating new route\n");
 797
 798add:
 799                if (mx) {
 800                        err = fib6_commit_metrics(&rt->dst, mx, mx_len);
 801                        if (err)
 802                                return err;
 803                }
 804                rt->dst.rt6_next = iter;
 805                *ins = rt;
 806                rt->rt6i_node = fn;
 807                atomic_inc(&rt->rt6i_ref);
 808                inet6_rt_notify(RTM_NEWROUTE, rt, info);
 809                info->nl_net->ipv6.rt6_stats->fib_rt_entries++;
 810
 811                if (!(fn->fn_flags & RTN_RTINFO)) {
 812                        info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
 813                        fn->fn_flags |= RTN_RTINFO;
 814                }
 815
 816        } else {
 817                if (!found) {
 818                        if (add)
 819                                goto add;
 820                        pr_warn("NLM_F_REPLACE set, but no existing node found!\n");
 821                        return -ENOENT;
 822                }
 823                if (mx) {
 824                        err = fib6_commit_metrics(&rt->dst, mx, mx_len);
 825                        if (err)
 826                                return err;
 827                }
 828                *ins = rt;
 829                rt->rt6i_node = fn;
 830                rt->dst.rt6_next = iter->dst.rt6_next;
 831                atomic_inc(&rt->rt6i_ref);
 832                inet6_rt_notify(RTM_NEWROUTE, rt, info);
 833                if (!(fn->fn_flags & RTN_RTINFO)) {
 834                        info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
 835                        fn->fn_flags |= RTN_RTINFO;
 836                }
 837                fib6_purge_rt(iter, fn, info->nl_net);
 838                rt6_release(iter);
 839        }
 840
 841        return 0;
 842}
 843
 844static void fib6_start_gc(struct net *net, struct rt6_info *rt)
 845{
 846        if (!timer_pending(&net->ipv6.ip6_fib_timer) &&
 847            (rt->rt6i_flags & (RTF_EXPIRES | RTF_CACHE)))
 848                mod_timer(&net->ipv6.ip6_fib_timer,
 849                          jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
 850}
 851
 852void fib6_force_start_gc(struct net *net)
 853{
 854        if (!timer_pending(&net->ipv6.ip6_fib_timer))
 855                mod_timer(&net->ipv6.ip6_fib_timer,
 856                          jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
 857}
 858
 859/*
 860 *      Add routing information to the routing tree.
 861 *      <destination addr>/<source addr>
 862 *      with source addr info in sub-trees
 863 */
 864
 865int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info,
 866             struct nlattr *mx, int mx_len)
 867{
 868        struct fib6_node *fn, *pn = NULL;
 869        int err = -ENOMEM;
 870        int allow_create = 1;
 871        int replace_required = 0;
 872        int sernum = fib6_new_sernum(info->nl_net);
 873
 874        if (info->nlh) {
 875                if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
 876                        allow_create = 0;
 877                if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
 878                        replace_required = 1;
 879        }
 880        if (!allow_create && !replace_required)
 881                pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n");
 882
 883        fn = fib6_add_1(root, &rt->rt6i_dst.addr, rt->rt6i_dst.plen,
 884                        offsetof(struct rt6_info, rt6i_dst), allow_create,
 885                        replace_required, sernum);
 886        if (IS_ERR(fn)) {
 887                err = PTR_ERR(fn);
 888                fn = NULL;
 889                goto out;
 890        }
 891
 892        pn = fn;
 893
 894#ifdef CONFIG_IPV6_SUBTREES
 895        if (rt->rt6i_src.plen) {
 896                struct fib6_node *sn;
 897
 898                if (!fn->subtree) {
 899                        struct fib6_node *sfn;
 900
 901                        /*
 902                         * Create subtree.
 903                         *
 904                         *              fn[main tree]
 905                         *              |
 906                         *              sfn[subtree root]
 907                         *                 \
 908                         *                  sn[new leaf node]
 909                         */
 910
 911                        /* Create subtree root node */
 912                        sfn = node_alloc();
 913                        if (!sfn)
 914                                goto st_failure;
 915
 916                        sfn->leaf = info->nl_net->ipv6.ip6_null_entry;
 917                        atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref);
 918                        sfn->fn_flags = RTN_ROOT;
 919                        sfn->fn_sernum = sernum;
 920
 921                        /* Now add the first leaf node to new subtree */
 922
 923                        sn = fib6_add_1(sfn, &rt->rt6i_src.addr,
 924                                        rt->rt6i_src.plen,
 925                                        offsetof(struct rt6_info, rt6i_src),
 926                                        allow_create, replace_required, sernum);
 927
 928                        if (IS_ERR(sn)) {
 929                                /* If it is failed, discard just allocated
 930                                   root, and then (in st_failure) stale node
 931                                   in main tree.
 932                                 */
 933                                node_free(sfn);
 934                                err = PTR_ERR(sn);
 935                                goto st_failure;
 936                        }
 937
 938                        /* Now link new subtree to main tree */
 939                        sfn->parent = fn;
 940                        fn->subtree = sfn;
 941                } else {
 942                        sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr,
 943                                        rt->rt6i_src.plen,
 944                                        offsetof(struct rt6_info, rt6i_src),
 945                                        allow_create, replace_required, sernum);
 946
 947                        if (IS_ERR(sn)) {
 948                                err = PTR_ERR(sn);
 949                                goto st_failure;
 950                        }
 951                }
 952
 953                if (!fn->leaf) {
 954                        fn->leaf = rt;
 955                        atomic_inc(&rt->rt6i_ref);
 956                }
 957                fn = sn;
 958        }
 959#endif
 960
 961        err = fib6_add_rt2node(fn, rt, info, mx, mx_len);
 962        if (!err) {
 963                fib6_start_gc(info->nl_net, rt);
 964                if (!(rt->rt6i_flags & RTF_CACHE))
 965                        fib6_prune_clones(info->nl_net, pn);
 966        }
 967
 968out:
 969        if (err) {
 970#ifdef CONFIG_IPV6_SUBTREES
 971                /*
 972                 * If fib6_add_1 has cleared the old leaf pointer in the
 973                 * super-tree leaf node we have to find a new one for it.
 974                 */
 975                if (pn != fn && pn->leaf == rt) {
 976                        pn->leaf = NULL;
 977                        atomic_dec(&rt->rt6i_ref);
 978                }
 979                if (pn != fn && !pn->leaf && !(pn->fn_flags & RTN_RTINFO)) {
 980                        pn->leaf = fib6_find_prefix(info->nl_net, pn);
 981#if RT6_DEBUG >= 2
 982                        if (!pn->leaf) {
 983                                WARN_ON(pn->leaf == NULL);
 984                                pn->leaf = info->nl_net->ipv6.ip6_null_entry;
 985                        }
 986#endif
 987                        atomic_inc(&pn->leaf->rt6i_ref);
 988                }
 989#endif
 990                dst_free(&rt->dst);
 991        }
 992        return err;
 993
 994#ifdef CONFIG_IPV6_SUBTREES
 995        /* Subtree creation failed, probably main tree node
 996           is orphan. If it is, shoot it.
 997         */
 998st_failure:
 999        if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)))
1000                fib6_repair_tree(info->nl_net, fn);
1001        dst_free(&rt->dst);
1002        return err;
1003#endif
1004}
1005
1006/*
1007 *      Routing tree lookup
1008 *
1009 */
1010
1011struct lookup_args {
1012        int                     offset;         /* key offset on rt6_info       */
1013        const struct in6_addr   *addr;          /* search key                   */
1014};
1015
1016static struct fib6_node *fib6_lookup_1(struct fib6_node *root,
1017                                       struct lookup_args *args)
1018{
1019        struct fib6_node *fn;
1020        __be32 dir;
1021
1022        if (unlikely(args->offset == 0))
1023                return NULL;
1024
1025        /*
1026         *      Descend on a tree
1027         */
1028
1029        fn = root;
1030
1031        for (;;) {
1032                struct fib6_node *next;
1033
1034                dir = addr_bit_set(args->addr, fn->fn_bit);
1035
1036                next = dir ? fn->right : fn->left;
1037
1038                if (next) {
1039                        fn = next;
1040                        continue;
1041                }
1042                break;
1043        }
1044
1045        while (fn) {
1046                if (FIB6_SUBTREE(fn) || fn->fn_flags & RTN_RTINFO) {
1047                        struct rt6key *key;
1048
1049                        key = (struct rt6key *) ((u8 *) fn->leaf +
1050                                                 args->offset);
1051
1052                        if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) {
1053#ifdef CONFIG_IPV6_SUBTREES
1054                                if (fn->subtree) {
1055                                        struct fib6_node *sfn;
1056                                        sfn = fib6_lookup_1(fn->subtree,
1057                                                            args + 1);
1058                                        if (!sfn)
1059                                                goto backtrack;
1060                                        fn = sfn;
1061                                }
1062#endif
1063                                if (fn->fn_flags & RTN_RTINFO)
1064                                        return fn;
1065                        }
1066                }
1067#ifdef CONFIG_IPV6_SUBTREES
1068backtrack:
1069#endif
1070                if (fn->fn_flags & RTN_ROOT)
1071                        break;
1072
1073                fn = fn->parent;
1074        }
1075
1076        return NULL;
1077}
1078
1079struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *daddr,
1080                              const struct in6_addr *saddr)
1081{
1082        struct fib6_node *fn;
1083        struct lookup_args args[] = {
1084                {
1085                        .offset = offsetof(struct rt6_info, rt6i_dst),
1086                        .addr = daddr,
1087                },
1088#ifdef CONFIG_IPV6_SUBTREES
1089                {
1090                        .offset = offsetof(struct rt6_info, rt6i_src),
1091                        .addr = saddr,
1092                },
1093#endif
1094                {
1095                        .offset = 0,    /* sentinel */
1096                }
1097        };
1098
1099        fn = fib6_lookup_1(root, daddr ? args : args + 1);
1100        if (!fn || fn->fn_flags & RTN_TL_ROOT)
1101                fn = root;
1102
1103        return fn;
1104}
1105
1106/*
1107 *      Get node with specified destination prefix (and source prefix,
1108 *      if subtrees are used)
1109 */
1110
1111
1112static struct fib6_node *fib6_locate_1(struct fib6_node *root,
1113                                       const struct in6_addr *addr,
1114                                       int plen, int offset)
1115{
1116        struct fib6_node *fn;
1117
1118        for (fn = root; fn ; ) {
1119                struct rt6key *key = (struct rt6key *)((u8 *)fn->leaf + offset);
1120
1121                /*
1122                 *      Prefix match
1123                 */
1124                if (plen < fn->fn_bit ||
1125                    !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit))
1126                        return NULL;
1127
1128                if (plen == fn->fn_bit)
1129                        return fn;
1130
1131                /*
1132                 *      We have more bits to go
1133                 */
1134                if (addr_bit_set(addr, fn->fn_bit))
1135                        fn = fn->right;
1136                else
1137                        fn = fn->left;
1138        }
1139        return NULL;
1140}
1141
1142struct fib6_node *fib6_locate(struct fib6_node *root,
1143                              const struct in6_addr *daddr, int dst_len,
1144                              const struct in6_addr *saddr, int src_len)
1145{
1146        struct fib6_node *fn;
1147
1148        fn = fib6_locate_1(root, daddr, dst_len,
1149                           offsetof(struct rt6_info, rt6i_dst));
1150
1151#ifdef CONFIG_IPV6_SUBTREES
1152        if (src_len) {
1153                WARN_ON(saddr == NULL);
1154                if (fn && fn->subtree)
1155                        fn = fib6_locate_1(fn->subtree, saddr, src_len,
1156                                           offsetof(struct rt6_info, rt6i_src));
1157        }
1158#endif
1159
1160        if (fn && fn->fn_flags & RTN_RTINFO)
1161                return fn;
1162
1163        return NULL;
1164}
1165
1166
1167/*
1168 *      Deletion
1169 *
1170 */
1171
1172static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn)
1173{
1174        if (fn->fn_flags & RTN_ROOT)
1175                return net->ipv6.ip6_null_entry;
1176
1177        while (fn) {
1178                if (fn->left)
1179                        return fn->left->leaf;
1180                if (fn->right)
1181                        return fn->right->leaf;
1182
1183                fn = FIB6_SUBTREE(fn);
1184        }
1185        return NULL;
1186}
1187
1188/*
1189 *      Called to trim the tree of intermediate nodes when possible. "fn"
1190 *      is the node we want to try and remove.
1191 */
1192
1193static struct fib6_node *fib6_repair_tree(struct net *net,
1194                                           struct fib6_node *fn)
1195{
1196        int children;
1197        int nstate;
1198        struct fib6_node *child, *pn;
1199        struct fib6_walker *w;
1200        int iter = 0;
1201
1202        for (;;) {
1203                RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter);
1204                iter++;
1205
1206                WARN_ON(fn->fn_flags & RTN_RTINFO);
1207                WARN_ON(fn->fn_flags & RTN_TL_ROOT);
1208                WARN_ON(fn->leaf != NULL);
1209
1210                children = 0;
1211                child = NULL;
1212                if (fn->right)
1213                        child = fn->right, children |= 1;
1214                if (fn->left)
1215                        child = fn->left, children |= 2;
1216
1217                if (children == 3 || FIB6_SUBTREE(fn)
1218#ifdef CONFIG_IPV6_SUBTREES
1219                    /* Subtree root (i.e. fn) may have one child */
1220                    || (children && fn->fn_flags & RTN_ROOT)
1221#endif
1222                    ) {
1223                        fn->leaf = fib6_find_prefix(net, fn);
1224#if RT6_DEBUG >= 2
1225                        if (!fn->leaf) {
1226                                WARN_ON(!fn->leaf);
1227                                fn->leaf = net->ipv6.ip6_null_entry;
1228                        }
1229#endif
1230                        atomic_inc(&fn->leaf->rt6i_ref);
1231                        return fn->parent;
1232                }
1233
1234                pn = fn->parent;
1235#ifdef CONFIG_IPV6_SUBTREES
1236                if (FIB6_SUBTREE(pn) == fn) {
1237                        WARN_ON(!(fn->fn_flags & RTN_ROOT));
1238                        FIB6_SUBTREE(pn) = NULL;
1239                        nstate = FWS_L;
1240                } else {
1241                        WARN_ON(fn->fn_flags & RTN_ROOT);
1242#endif
1243                        if (pn->right == fn)
1244                                pn->right = child;
1245                        else if (pn->left == fn)
1246                                pn->left = child;
1247#if RT6_DEBUG >= 2
1248                        else
1249                                WARN_ON(1);
1250#endif
1251                        if (child)
1252                                child->parent = pn;
1253                        nstate = FWS_R;
1254#ifdef CONFIG_IPV6_SUBTREES
1255                }
1256#endif
1257
1258                read_lock(&fib6_walker_lock);
1259                FOR_WALKERS(w) {
1260                        if (!child) {
1261                                if (w->root == fn) {
1262                                        w->root = w->node = NULL;
1263                                        RT6_TRACE("W %p adjusted by delroot 1\n", w);
1264                                } else if (w->node == fn) {
1265                                        RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate);
1266                                        w->node = pn;
1267                                        w->state = nstate;
1268                                }
1269                        } else {
1270                                if (w->root == fn) {
1271                                        w->root = child;
1272                                        RT6_TRACE("W %p adjusted by delroot 2\n", w);
1273                                }
1274                                if (w->node == fn) {
1275                                        w->node = child;
1276                                        if (children&2) {
1277                                                RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state);
1278                                                w->state = w->state >= FWS_R ? FWS_U : FWS_INIT;
1279                                        } else {
1280                                                RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state);
1281                                                w->state = w->state >= FWS_C ? FWS_U : FWS_INIT;
1282                                        }
1283                                }
1284                        }
1285                }
1286                read_unlock(&fib6_walker_lock);
1287
1288                node_free(fn);
1289                if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn))
1290                        return pn;
1291
1292                rt6_release(pn->leaf);
1293                pn->leaf = NULL;
1294                fn = pn;
1295        }
1296}
1297
1298static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
1299                           struct nl_info *info)
1300{
1301        struct fib6_walker *w;
1302        struct rt6_info *rt = *rtp;
1303        struct net *net = info->nl_net;
1304
1305        RT6_TRACE("fib6_del_route\n");
1306
1307        /* Unlink it */
1308        *rtp = rt->dst.rt6_next;
1309        rt->rt6i_node = NULL;
1310        net->ipv6.rt6_stats->fib_rt_entries--;
1311        net->ipv6.rt6_stats->fib_discarded_routes++;
1312
1313        /* Reset round-robin state, if necessary */
1314        if (fn->rr_ptr == rt)
1315                fn->rr_ptr = NULL;
1316
1317        /* Remove this entry from other siblings */
1318        if (rt->rt6i_nsiblings) {
1319                struct rt6_info *sibling, *next_sibling;
1320
1321                list_for_each_entry_safe(sibling, next_sibling,
1322                                         &rt->rt6i_siblings, rt6i_siblings)
1323                        sibling->rt6i_nsiblings--;
1324                rt->rt6i_nsiblings = 0;
1325                list_del_init(&rt->rt6i_siblings);
1326        }
1327
1328        /* Adjust walkers */
1329        read_lock(&fib6_walker_lock);
1330        FOR_WALKERS(w) {
1331                if (w->state == FWS_C && w->leaf == rt) {
1332                        RT6_TRACE("walker %p adjusted by delroute\n", w);
1333                        w->leaf = rt->dst.rt6_next;
1334                        if (!w->leaf)
1335                                w->state = FWS_U;
1336                }
1337        }
1338        read_unlock(&fib6_walker_lock);
1339
1340        rt->dst.rt6_next = NULL;
1341
1342        /* If it was last route, expunge its radix tree node */
1343        if (!fn->leaf) {
1344                fn->fn_flags &= ~RTN_RTINFO;
1345                net->ipv6.rt6_stats->fib_route_nodes--;
1346                fn = fib6_repair_tree(net, fn);
1347        }
1348
1349        fib6_purge_rt(rt, fn, net);
1350
1351        inet6_rt_notify(RTM_DELROUTE, rt, info);
1352        rt6_release(rt);
1353}
1354
1355int fib6_del(struct rt6_info *rt, struct nl_info *info)
1356{
1357        struct net *net = info->nl_net;
1358        struct fib6_node *fn = rt->rt6i_node;
1359        struct rt6_info **rtp;
1360
1361#if RT6_DEBUG >= 2
1362        if (rt->dst.obsolete > 0) {
1363                WARN_ON(fn != NULL);
1364                return -ENOENT;
1365        }
1366#endif
1367        if (!fn || rt == net->ipv6.ip6_null_entry)
1368                return -ENOENT;
1369
1370        WARN_ON(!(fn->fn_flags & RTN_RTINFO));
1371
1372        if (!(rt->rt6i_flags & RTF_CACHE)) {
1373                struct fib6_node *pn = fn;
1374#ifdef CONFIG_IPV6_SUBTREES
1375                /* clones of this route might be in another subtree */
1376                if (rt->rt6i_src.plen) {
1377                        while (!(pn->fn_flags & RTN_ROOT))
1378                                pn = pn->parent;
1379                        pn = pn->parent;
1380                }
1381#endif
1382                fib6_prune_clones(info->nl_net, pn);
1383        }
1384
1385        /*
1386         *      Walk the leaf entries looking for ourself
1387         */
1388
1389        for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->dst.rt6_next) {
1390                if (*rtp == rt) {
1391                        fib6_del_route(fn, rtp, info);
1392                        return 0;
1393                }
1394        }
1395        return -ENOENT;
1396}
1397
1398/*
1399 *      Tree traversal function.
1400 *
1401 *      Certainly, it is not interrupt safe.
1402 *      However, it is internally reenterable wrt itself and fib6_add/fib6_del.
1403 *      It means, that we can modify tree during walking
1404 *      and use this function for garbage collection, clone pruning,
1405 *      cleaning tree when a device goes down etc. etc.
1406 *
1407 *      It guarantees that every node will be traversed,
1408 *      and that it will be traversed only once.
1409 *
1410 *      Callback function w->func may return:
1411 *      0 -> continue walking.
1412 *      positive value -> walking is suspended (used by tree dumps,
1413 *      and probably by gc, if it will be split to several slices)
1414 *      negative value -> terminate walking.
1415 *
1416 *      The function itself returns:
1417 *      0   -> walk is complete.
1418 *      >0  -> walk is incomplete (i.e. suspended)
1419 *      <0  -> walk is terminated by an error.
1420 */
1421
1422static int fib6_walk_continue(struct fib6_walker *w)
1423{
1424        struct fib6_node *fn, *pn;
1425
1426        for (;;) {
1427                fn = w->node;
1428                if (!fn)
1429                        return 0;
1430
1431                if (w->prune && fn != w->root &&
1432                    fn->fn_flags & RTN_RTINFO && w->state < FWS_C) {
1433                        w->state = FWS_C;
1434                        w->leaf = fn->leaf;
1435                }
1436                switch (w->state) {
1437#ifdef CONFIG_IPV6_SUBTREES
1438                case FWS_S:
1439                        if (FIB6_SUBTREE(fn)) {
1440                                w->node = FIB6_SUBTREE(fn);
1441                                continue;
1442                        }
1443                        w->state = FWS_L;
1444#endif
1445                case FWS_L:
1446                        if (fn->left) {
1447                                w->node = fn->left;
1448                                w->state = FWS_INIT;
1449                                continue;
1450                        }
1451                        w->state = FWS_R;
1452                case FWS_R:
1453                        if (fn->right) {
1454                                w->node = fn->right;
1455                                w->state = FWS_INIT;
1456                                continue;
1457                        }
1458                        w->state = FWS_C;
1459                        w->leaf = fn->leaf;
1460                case FWS_C:
1461                        if (w->leaf && fn->fn_flags & RTN_RTINFO) {
1462                                int err;
1463
1464                                if (w->skip) {
1465                                        w->skip--;
1466                                        goto skip;
1467                                }
1468
1469                                err = w->func(w);
1470                                if (err)
1471                                        return err;
1472
1473                                w->count++;
1474                                continue;
1475                        }
1476skip:
1477                        w->state = FWS_U;
1478                case FWS_U:
1479                        if (fn == w->root)
1480                                return 0;
1481                        pn = fn->parent;
1482                        w->node = pn;
1483#ifdef CONFIG_IPV6_SUBTREES
1484                        if (FIB6_SUBTREE(pn) == fn) {
1485                                WARN_ON(!(fn->fn_flags & RTN_ROOT));
1486                                w->state = FWS_L;
1487                                continue;
1488                        }
1489#endif
1490                        if (pn->left == fn) {
1491                                w->state = FWS_R;
1492                                continue;
1493                        }
1494                        if (pn->right == fn) {
1495                                w->state = FWS_C;
1496                                w->leaf = w->node->leaf;
1497                                continue;
1498                        }
1499#if RT6_DEBUG >= 2
1500                        WARN_ON(1);
1501#endif
1502                }
1503        }
1504}
1505
1506static int fib6_walk(struct fib6_walker *w)
1507{
1508        int res;
1509
1510        w->state = FWS_INIT;
1511        w->node = w->root;
1512
1513        fib6_walker_link(w);
1514        res = fib6_walk_continue(w);
1515        if (res <= 0)
1516                fib6_walker_unlink(w);
1517        return res;
1518}
1519
1520static int fib6_clean_node(struct fib6_walker *w)
1521{
1522        int res;
1523        struct rt6_info *rt;
1524        struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w);
1525        struct nl_info info = {
1526                .nl_net = c->net,
1527        };
1528
1529        if (c->sernum != FIB6_NO_SERNUM_CHANGE &&
1530            w->node->fn_sernum != c->sernum)
1531                w->node->fn_sernum = c->sernum;
1532
1533        if (!c->func) {
1534                WARN_ON_ONCE(c->sernum == FIB6_NO_SERNUM_CHANGE);
1535                w->leaf = NULL;
1536                return 0;
1537        }
1538
1539        for (rt = w->leaf; rt; rt = rt->dst.rt6_next) {
1540                res = c->func(rt, c->arg);
1541                if (res < 0) {
1542                        w->leaf = rt;
1543                        res = fib6_del(rt, &info);
1544                        if (res) {
1545#if RT6_DEBUG >= 2
1546                                pr_debug("%s: del failed: rt=%p@%p err=%d\n",
1547                                         __func__, rt, rt->rt6i_node, res);
1548#endif
1549                                continue;
1550                        }
1551                        return 0;
1552                }
1553                WARN_ON(res != 0);
1554        }
1555        w->leaf = rt;
1556        return 0;
1557}
1558
1559/*
1560 *      Convenient frontend to tree walker.
1561 *
1562 *      func is called on each route.
1563 *              It may return -1 -> delete this route.
1564 *                            0  -> continue walking
1565 *
1566 *      prune==1 -> only immediate children of node (certainly,
1567 *      ignoring pure split nodes) will be scanned.
1568 */
1569
1570static void fib6_clean_tree(struct net *net, struct fib6_node *root,
1571                            int (*func)(struct rt6_info *, void *arg),
1572                            bool prune, int sernum, void *arg)
1573{
1574        struct fib6_cleaner c;
1575
1576        c.w.root = root;
1577        c.w.func = fib6_clean_node;
1578        c.w.prune = prune;
1579        c.w.count = 0;
1580        c.w.skip = 0;
1581        c.func = func;
1582        c.sernum = sernum;
1583        c.arg = arg;
1584        c.net = net;
1585
1586        fib6_walk(&c.w);
1587}
1588
1589static void __fib6_clean_all(struct net *net,
1590                             int (*func)(struct rt6_info *, void *),
1591                             int sernum, void *arg)
1592{
1593        struct fib6_table *table;
1594        struct hlist_head *head;
1595        unsigned int h;
1596
1597        rcu_read_lock();
1598        for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
1599                head = &net->ipv6.fib_table_hash[h];
1600                hlist_for_each_entry_rcu(table, head, tb6_hlist) {
1601                        write_lock_bh(&table->tb6_lock);
1602                        fib6_clean_tree(net, &table->tb6_root,
1603                                        func, false, sernum, arg);
1604                        write_unlock_bh(&table->tb6_lock);
1605                }
1606        }
1607        rcu_read_unlock();
1608}
1609
1610void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *),
1611                    void *arg)
1612{
1613        __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg);
1614}
1615
1616static int fib6_prune_clone(struct rt6_info *rt, void *arg)
1617{
1618        if (rt->rt6i_flags & RTF_CACHE) {
1619                RT6_TRACE("pruning clone %p\n", rt);
1620                return -1;
1621        }
1622
1623        return 0;
1624}
1625
1626static void fib6_prune_clones(struct net *net, struct fib6_node *fn)
1627{
1628        fib6_clean_tree(net, fn, fib6_prune_clone, true,
1629                        FIB6_NO_SERNUM_CHANGE, NULL);
1630}
1631
1632static void fib6_flush_trees(struct net *net)
1633{
1634        int new_sernum = fib6_new_sernum(net);
1635
1636        __fib6_clean_all(net, NULL, new_sernum, NULL);
1637}
1638
1639/*
1640 *      Garbage collection
1641 */
1642
1643static struct fib6_gc_args
1644{
1645        int                     timeout;
1646        int                     more;
1647} gc_args;
1648
1649static int fib6_age(struct rt6_info *rt, void *arg)
1650{
1651        unsigned long now = jiffies;
1652
1653        /*
1654         *      check addrconf expiration here.
1655         *      Routes are expired even if they are in use.
1656         *
1657         *      Also age clones. Note, that clones are aged out
1658         *      only if they are not in use now.
1659         */
1660
1661        if (rt->rt6i_flags & RTF_EXPIRES && rt->dst.expires) {
1662                if (time_after(now, rt->dst.expires)) {
1663                        RT6_TRACE("expiring %p\n", rt);
1664                        return -1;
1665                }
1666                gc_args.more++;
1667        } else if (rt->rt6i_flags & RTF_CACHE) {
1668                if (atomic_read(&rt->dst.__refcnt) == 0 &&
1669                    time_after_eq(now, rt->dst.lastuse + gc_args.timeout)) {
1670                        RT6_TRACE("aging clone %p\n", rt);
1671                        return -1;
1672                } else if (rt->rt6i_flags & RTF_GATEWAY) {
1673                        struct neighbour *neigh;
1674                        __u8 neigh_flags = 0;
1675
1676                        neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
1677                        if (neigh) {
1678                                neigh_flags = neigh->flags;
1679                                neigh_release(neigh);
1680                        }
1681                        if (!(neigh_flags & NTF_ROUTER)) {
1682                                RT6_TRACE("purging route %p via non-router but gateway\n",
1683                                          rt);
1684                                return -1;
1685                        }
1686                }
1687                gc_args.more++;
1688        }
1689
1690        return 0;
1691}
1692
1693static DEFINE_SPINLOCK(fib6_gc_lock);
1694
1695void fib6_run_gc(unsigned long expires, struct net *net, bool force)
1696{
1697        unsigned long now;
1698
1699        if (force) {
1700                spin_lock_bh(&fib6_gc_lock);
1701        } else if (!spin_trylock_bh(&fib6_gc_lock)) {
1702                mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ);
1703                return;
1704        }
1705        gc_args.timeout = expires ? (int)expires :
1706                          net->ipv6.sysctl.ip6_rt_gc_interval;
1707
1708        gc_args.more = icmp6_dst_gc();
1709
1710        fib6_clean_all(net, fib6_age, NULL);
1711        now = jiffies;
1712        net->ipv6.ip6_rt_last_gc = now;
1713
1714        if (gc_args.more)
1715                mod_timer(&net->ipv6.ip6_fib_timer,
1716                          round_jiffies(now
1717                                        + net->ipv6.sysctl.ip6_rt_gc_interval));
1718        else
1719                del_timer(&net->ipv6.ip6_fib_timer);
1720        spin_unlock_bh(&fib6_gc_lock);
1721}
1722
1723static void fib6_gc_timer_cb(unsigned long arg)
1724{
1725        fib6_run_gc(0, (struct net *)arg, true);
1726}
1727
1728static int __net_init fib6_net_init(struct net *net)
1729{
1730        size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ;
1731
1732        setup_timer(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, (unsigned long)net);
1733
1734        net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL);
1735        if (!net->ipv6.rt6_stats)
1736                goto out_timer;
1737
1738        /* Avoid false sharing : Use at least a full cache line */
1739        size = max_t(size_t, size, L1_CACHE_BYTES);
1740
1741        net->ipv6.fib_table_hash = kzalloc(size, GFP_KERNEL);
1742        if (!net->ipv6.fib_table_hash)
1743                goto out_rt6_stats;
1744
1745        net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl),
1746                                          GFP_KERNEL);
1747        if (!net->ipv6.fib6_main_tbl)
1748                goto out_fib_table_hash;
1749
1750        net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN;
1751        net->ipv6.fib6_main_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry;
1752        net->ipv6.fib6_main_tbl->tb6_root.fn_flags =
1753                RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
1754        inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers);
1755
1756#ifdef CONFIG_IPV6_MULTIPLE_TABLES
1757        net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl),
1758                                           GFP_KERNEL);
1759        if (!net->ipv6.fib6_local_tbl)
1760                goto out_fib6_main_tbl;
1761        net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL;
1762        net->ipv6.fib6_local_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry;
1763        net->ipv6.fib6_local_tbl->tb6_root.fn_flags =
1764                RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
1765        inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers);
1766#endif
1767        fib6_tables_init(net);
1768
1769        return 0;
1770
1771#ifdef CONFIG_IPV6_MULTIPLE_TABLES
1772out_fib6_main_tbl:
1773        kfree(net->ipv6.fib6_main_tbl);
1774#endif
1775out_fib_table_hash:
1776        kfree(net->ipv6.fib_table_hash);
1777out_rt6_stats:
1778        kfree(net->ipv6.rt6_stats);
1779out_timer:
1780        return -ENOMEM;
1781}
1782
1783static void fib6_net_exit(struct net *net)
1784{
1785        rt6_ifdown(net, NULL);
1786        del_timer_sync(&net->ipv6.ip6_fib_timer);
1787
1788#ifdef CONFIG_IPV6_MULTIPLE_TABLES
1789        inetpeer_invalidate_tree(&net->ipv6.fib6_local_tbl->tb6_peers);
1790        kfree(net->ipv6.fib6_local_tbl);
1791#endif
1792        inetpeer_invalidate_tree(&net->ipv6.fib6_main_tbl->tb6_peers);
1793        kfree(net->ipv6.fib6_main_tbl);
1794        kfree(net->ipv6.fib_table_hash);
1795        kfree(net->ipv6.rt6_stats);
1796}
1797
1798static struct pernet_operations fib6_net_ops = {
1799        .init = fib6_net_init,
1800        .exit = fib6_net_exit,
1801};
1802
1803int __init fib6_init(void)
1804{
1805        int ret = -ENOMEM;
1806
1807        fib6_node_kmem = kmem_cache_create("fib6_nodes",
1808                                           sizeof(struct fib6_node),
1809                                           0, SLAB_HWCACHE_ALIGN,
1810                                           NULL);
1811        if (!fib6_node_kmem)
1812                goto out;
1813
1814        ret = register_pernet_subsys(&fib6_net_ops);
1815        if (ret)
1816                goto out_kmem_cache_create;
1817
1818        ret = __rtnl_register(PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib,
1819                              NULL);
1820        if (ret)
1821                goto out_unregister_subsys;
1822
1823        __fib6_flush_trees = fib6_flush_trees;
1824out:
1825        return ret;
1826
1827out_unregister_subsys:
1828        unregister_pernet_subsys(&fib6_net_ops);
1829out_kmem_cache_create:
1830        kmem_cache_destroy(fib6_node_kmem);
1831        goto out;
1832}
1833
1834void fib6_gc_cleanup(void)
1835{
1836        unregister_pernet_subsys(&fib6_net_ops);
1837        kmem_cache_destroy(fib6_node_kmem);
1838}
1839
1840#ifdef CONFIG_PROC_FS
1841
1842struct ipv6_route_iter {
1843        struct seq_net_private p;
1844        struct fib6_walker w;
1845        loff_t skip;
1846        struct fib6_table *tbl;
1847        int sernum;
1848};
1849
1850static int ipv6_route_seq_show(struct seq_file *seq, void *v)
1851{
1852        struct rt6_info *rt = v;
1853        struct ipv6_route_iter *iter = seq->private;
1854
1855        seq_printf(seq, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
1856
1857#ifdef CONFIG_IPV6_SUBTREES
1858        seq_printf(seq, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
1859#else
1860        seq_puts(seq, "00000000000000000000000000000000 00 ");
1861#endif
1862        if (rt->rt6i_flags & RTF_GATEWAY)
1863                seq_printf(seq, "%pi6", &rt->rt6i_gateway);
1864        else
1865                seq_puts(seq, "00000000000000000000000000000000");
1866
1867        seq_printf(seq, " %08x %08x %08x %08x %8s\n",
1868                   rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
1869                   rt->dst.__use, rt->rt6i_flags,
1870                   rt->dst.dev ? rt->dst.dev->name : "");
1871        iter->w.leaf = NULL;
1872        return 0;
1873}
1874
1875static int ipv6_route_yield(struct fib6_walker *w)
1876{
1877        struct ipv6_route_iter *iter = w->args;
1878
1879        if (!iter->skip)
1880                return 1;
1881
1882        do {
1883                iter->w.leaf = iter->w.leaf->dst.rt6_next;
1884                iter->skip--;
1885                if (!iter->skip && iter->w.leaf)
1886                        return 1;
1887        } while (iter->w.leaf);
1888
1889        return 0;
1890}
1891
1892static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter)
1893{
1894        memset(&iter->w, 0, sizeof(iter->w));
1895        iter->w.func = ipv6_route_yield;
1896        iter->w.root = &iter->tbl->tb6_root;
1897        iter->w.state = FWS_INIT;
1898        iter->w.node = iter->w.root;
1899        iter->w.args = iter;
1900        iter->sernum = iter->w.root->fn_sernum;
1901        INIT_LIST_HEAD(&iter->w.lh);
1902        fib6_walker_link(&iter->w);
1903}
1904
1905static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl,
1906                                                    struct net *net)
1907{
1908        unsigned int h;
1909        struct hlist_node *node;
1910
1911        if (tbl) {
1912                h = (tbl->tb6_id & (FIB6_TABLE_HASHSZ - 1)) + 1;
1913                node = rcu_dereference_bh(hlist_next_rcu(&tbl->tb6_hlist));
1914        } else {
1915                h = 0;
1916                node = NULL;
1917        }
1918
1919        while (!node && h < FIB6_TABLE_HASHSZ) {
1920                node = rcu_dereference_bh(
1921                        hlist_first_rcu(&net->ipv6.fib_table_hash[h++]));
1922        }
1923        return hlist_entry_safe(node, struct fib6_table, tb6_hlist);
1924}
1925
1926static void ipv6_route_check_sernum(struct ipv6_route_iter *iter)
1927{
1928        if (iter->sernum != iter->w.root->fn_sernum) {
1929                iter->sernum = iter->w.root->fn_sernum;
1930                iter->w.state = FWS_INIT;
1931                iter->w.node = iter->w.root;
1932                WARN_ON(iter->w.skip);
1933                iter->w.skip = iter->w.count;
1934        }
1935}
1936
1937static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1938{
1939        int r;
1940        struct rt6_info *n;
1941        struct net *net = seq_file_net(seq);
1942        struct ipv6_route_iter *iter = seq->private;
1943
1944        if (!v)
1945                goto iter_table;
1946
1947        n = ((struct rt6_info *)v)->dst.rt6_next;
1948        if (n) {
1949                ++*pos;
1950                return n;
1951        }
1952
1953iter_table:
1954        ipv6_route_check_sernum(iter);
1955        read_lock(&iter->tbl->tb6_lock);
1956        r = fib6_walk_continue(&iter->w);
1957        read_unlock(&iter->tbl->tb6_lock);
1958        if (r > 0) {
1959                if (v)
1960                        ++*pos;
1961                return iter->w.leaf;
1962        } else if (r < 0) {
1963                fib6_walker_unlink(&iter->w);
1964                return NULL;
1965        }
1966        fib6_walker_unlink(&iter->w);
1967
1968        iter->tbl = ipv6_route_seq_next_table(iter->tbl, net);
1969        if (!iter->tbl)
1970                return NULL;
1971
1972        ipv6_route_seq_setup_walk(iter);
1973        goto iter_table;
1974}
1975
1976static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos)
1977        __acquires(RCU_BH)
1978{
1979        struct net *net = seq_file_net(seq);
1980        struct ipv6_route_iter *iter = seq->private;
1981
1982        rcu_read_lock_bh();
1983        iter->tbl = ipv6_route_seq_next_table(NULL, net);
1984        iter->skip = *pos;
1985
1986        if (iter->tbl) {
1987                ipv6_route_seq_setup_walk(iter);
1988                return ipv6_route_seq_next(seq, NULL, pos);
1989        } else {
1990                return NULL;
1991        }
1992}
1993
1994static bool ipv6_route_iter_active(struct ipv6_route_iter *iter)
1995{
1996        struct fib6_walker *w = &iter->w;
1997        return w->node && !(w->state == FWS_U && w->node == w->root);
1998}
1999
2000static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
2001        __releases(RCU_BH)
2002{
2003        struct ipv6_route_iter *iter = seq->private;
2004
2005        if (ipv6_route_iter_active(iter))
2006                fib6_walker_unlink(&iter->w);
2007
2008        rcu_read_unlock_bh();
2009}
2010
2011static const struct seq_operations ipv6_route_seq_ops = {
2012        .start  = ipv6_route_seq_start,
2013        .next   = ipv6_route_seq_next,
2014        .stop   = ipv6_route_seq_stop,
2015        .show   = ipv6_route_seq_show
2016};
2017
2018int ipv6_route_open(struct inode *inode, struct file *file)
2019{
2020        return seq_open_net(inode, file, &ipv6_route_seq_ops,
2021                            sizeof(struct ipv6_route_iter));
2022}
2023
2024#endif /* CONFIG_PROC_FS */
2025