linux/net/ipv6/ip6_fib.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 *      Linux INET6 implementation
   4 *      Forwarding Information Database
   5 *
   6 *      Authors:
   7 *      Pedro Roque             <roque@di.fc.ul.pt>
   8 *
   9 *      Changes:
  10 *      Yuji SEKIYA @USAGI:     Support default route on router node;
  11 *                              remove ip6_null_entry from the top of
  12 *                              routing table.
  13 *      Ville Nuorvala:         Fixed routing subtrees.
  14 */
  15
  16#define pr_fmt(fmt) "IPv6: " fmt
  17
  18#include <linux/errno.h>
  19#include <linux/types.h>
  20#include <linux/net.h>
  21#include <linux/route.h>
  22#include <linux/netdevice.h>
  23#include <linux/in6.h>
  24#include <linux/init.h>
  25#include <linux/list.h>
  26#include <linux/slab.h>
  27
  28#include <net/ip.h>
  29#include <net/ipv6.h>
  30#include <net/ndisc.h>
  31#include <net/addrconf.h>
  32#include <net/lwtunnel.h>
  33#include <net/fib_notifier.h>
  34
  35#include <net/ip6_fib.h>
  36#include <net/ip6_route.h>
  37
  38static struct kmem_cache *fib6_node_kmem __read_mostly;
  39
  40struct fib6_cleaner {
  41        struct fib6_walker w;
  42        struct net *net;
  43        int (*func)(struct fib6_info *, void *arg);
  44        int sernum;
  45        void *arg;
  46        bool skip_notify;
  47};
  48
  49#ifdef CONFIG_IPV6_SUBTREES
  50#define FWS_INIT FWS_S
  51#else
  52#define FWS_INIT FWS_L
  53#endif
  54
  55static struct fib6_info *fib6_find_prefix(struct net *net,
  56                                         struct fib6_table *table,
  57                                         struct fib6_node *fn);
  58static struct fib6_node *fib6_repair_tree(struct net *net,
  59                                          struct fib6_table *table,
  60                                          struct fib6_node *fn);
  61static int fib6_walk(struct net *net, struct fib6_walker *w);
  62static int fib6_walk_continue(struct fib6_walker *w);
  63
  64/*
  65 *      A routing update causes an increase of the serial number on the
  66 *      affected subtree. This allows for cached routes to be asynchronously
  67 *      tested when modifications are made to the destination cache as a
  68 *      result of redirects, path MTU changes, etc.
  69 */
  70
  71static void fib6_gc_timer_cb(struct timer_list *t);
  72
  73#define FOR_WALKERS(net, w) \
  74        list_for_each_entry(w, &(net)->ipv6.fib6_walkers, lh)
  75
  76static void fib6_walker_link(struct net *net, struct fib6_walker *w)
  77{
  78        write_lock_bh(&net->ipv6.fib6_walker_lock);
  79        list_add(&w->lh, &net->ipv6.fib6_walkers);
  80        write_unlock_bh(&net->ipv6.fib6_walker_lock);
  81}
  82
  83static void fib6_walker_unlink(struct net *net, struct fib6_walker *w)
  84{
  85        write_lock_bh(&net->ipv6.fib6_walker_lock);
  86        list_del(&w->lh);
  87        write_unlock_bh(&net->ipv6.fib6_walker_lock);
  88}
  89
  90static int fib6_new_sernum(struct net *net)
  91{
  92        int new, old;
  93
  94        do {
  95                old = atomic_read(&net->ipv6.fib6_sernum);
  96                new = old < INT_MAX ? old + 1 : 1;
  97        } while (atomic_cmpxchg(&net->ipv6.fib6_sernum,
  98                                old, new) != old);
  99        return new;
 100}
 101
 102enum {
 103        FIB6_NO_SERNUM_CHANGE = 0,
 104};
 105
 106void fib6_update_sernum(struct net *net, struct fib6_info *f6i)
 107{
 108        struct fib6_node *fn;
 109
 110        fn = rcu_dereference_protected(f6i->fib6_node,
 111                        lockdep_is_held(&f6i->fib6_table->tb6_lock));
 112        if (fn)
 113                fn->fn_sernum = fib6_new_sernum(net);
 114}
 115
 116/*
 117 *      Auxiliary address test functions for the radix tree.
 118 *
 119 *      These assume a 32bit processor (although it will work on
 120 *      64bit processors)
 121 */
 122
 123/*
 124 *      test bit
 125 */
 126#if defined(__LITTLE_ENDIAN)
 127# define BITOP_BE32_SWIZZLE     (0x1F & ~7)
 128#else
 129# define BITOP_BE32_SWIZZLE     0
 130#endif
 131
 132static __be32 addr_bit_set(const void *token, int fn_bit)
 133{
 134        const __be32 *addr = token;
 135        /*
 136         * Here,
 137         *      1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)
 138         * is optimized version of
 139         *      htonl(1 << ((~fn_bit)&0x1F))
 140         * See include/asm-generic/bitops/le.h.
 141         */
 142        return (__force __be32)(1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)) &
 143               addr[fn_bit >> 5];
 144}
 145
 146struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh)
 147{
 148        struct fib6_info *f6i;
 149        size_t sz = sizeof(*f6i);
 150
 151        if (with_fib6_nh)
 152                sz += sizeof(struct fib6_nh);
 153
 154        f6i = kzalloc(sz, gfp_flags);
 155        if (!f6i)
 156                return NULL;
 157
 158        /* fib6_siblings is a union with nh_list, so this initializes both */
 159        INIT_LIST_HEAD(&f6i->fib6_siblings);
 160        refcount_set(&f6i->fib6_ref, 1);
 161
 162        return f6i;
 163}
 164
 165void fib6_info_destroy_rcu(struct rcu_head *head)
 166{
 167        struct fib6_info *f6i = container_of(head, struct fib6_info, rcu);
 168
 169        WARN_ON(f6i->fib6_node);
 170
 171        if (f6i->nh)
 172                nexthop_put(f6i->nh);
 173        else
 174                fib6_nh_release(f6i->fib6_nh);
 175
 176        ip_fib_metrics_put(f6i->fib6_metrics);
 177        kfree(f6i);
 178}
 179EXPORT_SYMBOL_GPL(fib6_info_destroy_rcu);
 180
 181static struct fib6_node *node_alloc(struct net *net)
 182{
 183        struct fib6_node *fn;
 184
 185        fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC);
 186        if (fn)
 187                net->ipv6.rt6_stats->fib_nodes++;
 188
 189        return fn;
 190}
 191
 192static void node_free_immediate(struct net *net, struct fib6_node *fn)
 193{
 194        kmem_cache_free(fib6_node_kmem, fn);
 195        net->ipv6.rt6_stats->fib_nodes--;
 196}
 197
 198static void node_free_rcu(struct rcu_head *head)
 199{
 200        struct fib6_node *fn = container_of(head, struct fib6_node, rcu);
 201
 202        kmem_cache_free(fib6_node_kmem, fn);
 203}
 204
 205static void node_free(struct net *net, struct fib6_node *fn)
 206{
 207        call_rcu(&fn->rcu, node_free_rcu);
 208        net->ipv6.rt6_stats->fib_nodes--;
 209}
 210
 211static void fib6_free_table(struct fib6_table *table)
 212{
 213        inetpeer_invalidate_tree(&table->tb6_peers);
 214        kfree(table);
 215}
 216
 217static void fib6_link_table(struct net *net, struct fib6_table *tb)
 218{
 219        unsigned int h;
 220
 221        /*
 222         * Initialize table lock at a single place to give lockdep a key,
 223         * tables aren't visible prior to being linked to the list.
 224         */
 225        spin_lock_init(&tb->tb6_lock);
 226        h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1);
 227
 228        /*
 229         * No protection necessary, this is the only list mutatation
 230         * operation, tables never disappear once they exist.
 231         */
 232        hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]);
 233}
 234
 235#ifdef CONFIG_IPV6_MULTIPLE_TABLES
 236
 237static struct fib6_table *fib6_alloc_table(struct net *net, u32 id)
 238{
 239        struct fib6_table *table;
 240
 241        table = kzalloc(sizeof(*table), GFP_ATOMIC);
 242        if (table) {
 243                table->tb6_id = id;
 244                rcu_assign_pointer(table->tb6_root.leaf,
 245                                   net->ipv6.fib6_null_entry);
 246                table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
 247                inet_peer_base_init(&table->tb6_peers);
 248        }
 249
 250        return table;
 251}
 252
 253struct fib6_table *fib6_new_table(struct net *net, u32 id)
 254{
 255        struct fib6_table *tb;
 256
 257        if (id == 0)
 258                id = RT6_TABLE_MAIN;
 259        tb = fib6_get_table(net, id);
 260        if (tb)
 261                return tb;
 262
 263        tb = fib6_alloc_table(net, id);
 264        if (tb)
 265                fib6_link_table(net, tb);
 266
 267        return tb;
 268}
 269EXPORT_SYMBOL_GPL(fib6_new_table);
 270
 271struct fib6_table *fib6_get_table(struct net *net, u32 id)
 272{
 273        struct fib6_table *tb;
 274        struct hlist_head *head;
 275        unsigned int h;
 276
 277        if (id == 0)
 278                id = RT6_TABLE_MAIN;
 279        h = id & (FIB6_TABLE_HASHSZ - 1);
 280        rcu_read_lock();
 281        head = &net->ipv6.fib_table_hash[h];
 282        hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
 283                if (tb->tb6_id == id) {
 284                        rcu_read_unlock();
 285                        return tb;
 286                }
 287        }
 288        rcu_read_unlock();
 289
 290        return NULL;
 291}
 292EXPORT_SYMBOL_GPL(fib6_get_table);
 293
 294static void __net_init fib6_tables_init(struct net *net)
 295{
 296        fib6_link_table(net, net->ipv6.fib6_main_tbl);
 297        fib6_link_table(net, net->ipv6.fib6_local_tbl);
 298}
 299#else
 300
 301struct fib6_table *fib6_new_table(struct net *net, u32 id)
 302{
 303        return fib6_get_table(net, id);
 304}
 305
 306struct fib6_table *fib6_get_table(struct net *net, u32 id)
 307{
 308          return net->ipv6.fib6_main_tbl;
 309}
 310
 311struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 312                                   const struct sk_buff *skb,
 313                                   int flags, pol_lookup_t lookup)
 314{
 315        struct rt6_info *rt;
 316
 317        rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags);
 318        if (rt->dst.error == -EAGAIN) {
 319                ip6_rt_put_flags(rt, flags);
 320                rt = net->ipv6.ip6_null_entry;
 321                if (!(flags & RT6_LOOKUP_F_DST_NOREF))
 322                        dst_hold(&rt->dst);
 323        }
 324
 325        return &rt->dst;
 326}
 327
 328/* called with rcu lock held; no reference taken on fib6_info */
 329int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
 330                struct fib6_result *res, int flags)
 331{
 332        return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6,
 333                                 res, flags);
 334}
 335
 336static void __net_init fib6_tables_init(struct net *net)
 337{
 338        fib6_link_table(net, net->ipv6.fib6_main_tbl);
 339}
 340
 341#endif
 342
 343unsigned int fib6_tables_seq_read(struct net *net)
 344{
 345        unsigned int h, fib_seq = 0;
 346
 347        rcu_read_lock();
 348        for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
 349                struct hlist_head *head = &net->ipv6.fib_table_hash[h];
 350                struct fib6_table *tb;
 351
 352                hlist_for_each_entry_rcu(tb, head, tb6_hlist)
 353                        fib_seq += tb->fib_seq;
 354        }
 355        rcu_read_unlock();
 356
 357        return fib_seq;
 358}
 359
 360static int call_fib6_entry_notifier(struct notifier_block *nb,
 361                                    enum fib_event_type event_type,
 362                                    struct fib6_info *rt,
 363                                    struct netlink_ext_ack *extack)
 364{
 365        struct fib6_entry_notifier_info info = {
 366                .info.extack = extack,
 367                .rt = rt,
 368        };
 369
 370        return call_fib6_notifier(nb, event_type, &info.info);
 371}
 372
 373static int call_fib6_multipath_entry_notifier(struct notifier_block *nb,
 374                                              enum fib_event_type event_type,
 375                                              struct fib6_info *rt,
 376                                              unsigned int nsiblings,
 377                                              struct netlink_ext_ack *extack)
 378{
 379        struct fib6_entry_notifier_info info = {
 380                .info.extack = extack,
 381                .rt = rt,
 382                .nsiblings = nsiblings,
 383        };
 384
 385        return call_fib6_notifier(nb, event_type, &info.info);
 386}
 387
 388int call_fib6_entry_notifiers(struct net *net,
 389                              enum fib_event_type event_type,
 390                              struct fib6_info *rt,
 391                              struct netlink_ext_ack *extack)
 392{
 393        struct fib6_entry_notifier_info info = {
 394                .info.extack = extack,
 395                .rt = rt,
 396        };
 397
 398        rt->fib6_table->fib_seq++;
 399        return call_fib6_notifiers(net, event_type, &info.info);
 400}
 401
 402int call_fib6_multipath_entry_notifiers(struct net *net,
 403                                        enum fib_event_type event_type,
 404                                        struct fib6_info *rt,
 405                                        unsigned int nsiblings,
 406                                        struct netlink_ext_ack *extack)
 407{
 408        struct fib6_entry_notifier_info info = {
 409                .info.extack = extack,
 410                .rt = rt,
 411                .nsiblings = nsiblings,
 412        };
 413
 414        rt->fib6_table->fib_seq++;
 415        return call_fib6_notifiers(net, event_type, &info.info);
 416}
 417
 418int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt)
 419{
 420        struct fib6_entry_notifier_info info = {
 421                .rt = rt,
 422                .nsiblings = rt->fib6_nsiblings,
 423        };
 424
 425        rt->fib6_table->fib_seq++;
 426        return call_fib6_notifiers(net, FIB_EVENT_ENTRY_REPLACE, &info.info);
 427}
 428
 429struct fib6_dump_arg {
 430        struct net *net;
 431        struct notifier_block *nb;
 432        struct netlink_ext_ack *extack;
 433};
 434
 435static int fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg)
 436{
 437        enum fib_event_type fib_event = FIB_EVENT_ENTRY_REPLACE;
 438        int err;
 439
 440        if (!rt || rt == arg->net->ipv6.fib6_null_entry)
 441                return 0;
 442
 443        if (rt->fib6_nsiblings)
 444                err = call_fib6_multipath_entry_notifier(arg->nb, fib_event,
 445                                                         rt,
 446                                                         rt->fib6_nsiblings,
 447                                                         arg->extack);
 448        else
 449                err = call_fib6_entry_notifier(arg->nb, fib_event, rt,
 450                                               arg->extack);
 451
 452        return err;
 453}
 454
 455static int fib6_node_dump(struct fib6_walker *w)
 456{
 457        int err;
 458
 459        err = fib6_rt_dump(w->leaf, w->args);
 460        w->leaf = NULL;
 461        return err;
 462}
 463
 464static int fib6_table_dump(struct net *net, struct fib6_table *tb,
 465                           struct fib6_walker *w)
 466{
 467        int err;
 468
 469        w->root = &tb->tb6_root;
 470        spin_lock_bh(&tb->tb6_lock);
 471        err = fib6_walk(net, w);
 472        spin_unlock_bh(&tb->tb6_lock);
 473        return err;
 474}
 475
 476/* Called with rcu_read_lock() */
 477int fib6_tables_dump(struct net *net, struct notifier_block *nb,
 478                     struct netlink_ext_ack *extack)
 479{
 480        struct fib6_dump_arg arg;
 481        struct fib6_walker *w;
 482        unsigned int h;
 483        int err = 0;
 484
 485        w = kzalloc(sizeof(*w), GFP_ATOMIC);
 486        if (!w)
 487                return -ENOMEM;
 488
 489        w->func = fib6_node_dump;
 490        arg.net = net;
 491        arg.nb = nb;
 492        arg.extack = extack;
 493        w->args = &arg;
 494
 495        for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
 496                struct hlist_head *head = &net->ipv6.fib_table_hash[h];
 497                struct fib6_table *tb;
 498
 499                hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
 500                        err = fib6_table_dump(net, tb, w);
 501                        if (err < 0)
 502                                goto out;
 503                }
 504        }
 505
 506out:
 507        kfree(w);
 508
 509        return err;
 510}
 511
 512static int fib6_dump_node(struct fib6_walker *w)
 513{
 514        int res;
 515        struct fib6_info *rt;
 516
 517        for_each_fib6_walker_rt(w) {
 518                res = rt6_dump_route(rt, w->args, w->skip_in_node);
 519                if (res >= 0) {
 520                        /* Frame is full, suspend walking */
 521                        w->leaf = rt;
 522
 523                        /* We'll restart from this node, so if some routes were
 524                         * already dumped, skip them next time.
 525                         */
 526                        w->skip_in_node += res;
 527
 528                        return 1;
 529                }
 530                w->skip_in_node = 0;
 531
 532                /* Multipath routes are dumped in one route with the
 533                 * RTA_MULTIPATH attribute. Jump 'rt' to point to the
 534                 * last sibling of this route (no need to dump the
 535                 * sibling routes again)
 536                 */
 537                if (rt->fib6_nsiblings)
 538                        rt = list_last_entry(&rt->fib6_siblings,
 539                                             struct fib6_info,
 540                                             fib6_siblings);
 541        }
 542        w->leaf = NULL;
 543        return 0;
 544}
 545
 546static void fib6_dump_end(struct netlink_callback *cb)
 547{
 548        struct net *net = sock_net(cb->skb->sk);
 549        struct fib6_walker *w = (void *)cb->args[2];
 550
 551        if (w) {
 552                if (cb->args[4]) {
 553                        cb->args[4] = 0;
 554                        fib6_walker_unlink(net, w);
 555                }
 556                cb->args[2] = 0;
 557                kfree(w);
 558        }
 559        cb->done = (void *)cb->args[3];
 560        cb->args[1] = 3;
 561}
 562
 563static int fib6_dump_done(struct netlink_callback *cb)
 564{
 565        fib6_dump_end(cb);
 566        return cb->done ? cb->done(cb) : 0;
 567}
 568
 569static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
 570                           struct netlink_callback *cb)
 571{
 572        struct net *net = sock_net(skb->sk);
 573        struct fib6_walker *w;
 574        int res;
 575
 576        w = (void *)cb->args[2];
 577        w->root = &table->tb6_root;
 578
 579        if (cb->args[4] == 0) {
 580                w->count = 0;
 581                w->skip = 0;
 582                w->skip_in_node = 0;
 583
 584                spin_lock_bh(&table->tb6_lock);
 585                res = fib6_walk(net, w);
 586                spin_unlock_bh(&table->tb6_lock);
 587                if (res > 0) {
 588                        cb->args[4] = 1;
 589                        cb->args[5] = w->root->fn_sernum;
 590                }
 591        } else {
 592                if (cb->args[5] != w->root->fn_sernum) {
 593                        /* Begin at the root if the tree changed */
 594                        cb->args[5] = w->root->fn_sernum;
 595                        w->state = FWS_INIT;
 596                        w->node = w->root;
 597                        w->skip = w->count;
 598                        w->skip_in_node = 0;
 599                } else
 600                        w->skip = 0;
 601
 602                spin_lock_bh(&table->tb6_lock);
 603                res = fib6_walk_continue(w);
 604                spin_unlock_bh(&table->tb6_lock);
 605                if (res <= 0) {
 606                        fib6_walker_unlink(net, w);
 607                        cb->args[4] = 0;
 608                }
 609        }
 610
 611        return res;
 612}
 613
 614static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 615{
 616        struct rt6_rtnl_dump_arg arg = { .filter.dump_exceptions = true,
 617                                         .filter.dump_routes = true };
 618        const struct nlmsghdr *nlh = cb->nlh;
 619        struct net *net = sock_net(skb->sk);
 620        unsigned int h, s_h;
 621        unsigned int e = 0, s_e;
 622        struct fib6_walker *w;
 623        struct fib6_table *tb;
 624        struct hlist_head *head;
 625        int res = 0;
 626
 627        if (cb->strict_check) {
 628                int err;
 629
 630                err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb);
 631                if (err < 0)
 632                        return err;
 633        } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
 634                struct rtmsg *rtm = nlmsg_data(nlh);
 635
 636                if (rtm->rtm_flags & RTM_F_PREFIX)
 637                        arg.filter.flags = RTM_F_PREFIX;
 638        }
 639
 640        w = (void *)cb->args[2];
 641        if (!w) {
 642                /* New dump:
 643                 *
 644                 * 1. hook callback destructor.
 645                 */
 646                cb->args[3] = (long)cb->done;
 647                cb->done = fib6_dump_done;
 648
 649                /*
 650                 * 2. allocate and initialize walker.
 651                 */
 652                w = kzalloc(sizeof(*w), GFP_ATOMIC);
 653                if (!w)
 654                        return -ENOMEM;
 655                w->func = fib6_dump_node;
 656                cb->args[2] = (long)w;
 657        }
 658
 659        arg.skb = skb;
 660        arg.cb = cb;
 661        arg.net = net;
 662        w->args = &arg;
 663
 664        if (arg.filter.table_id) {
 665                tb = fib6_get_table(net, arg.filter.table_id);
 666                if (!tb) {
 667                        if (arg.filter.dump_all_families)
 668                                goto out;
 669
 670                        NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist");
 671                        return -ENOENT;
 672                }
 673
 674                if (!cb->args[0]) {
 675                        res = fib6_dump_table(tb, skb, cb);
 676                        if (!res)
 677                                cb->args[0] = 1;
 678                }
 679                goto out;
 680        }
 681
 682        s_h = cb->args[0];
 683        s_e = cb->args[1];
 684
 685        rcu_read_lock();
 686        for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) {
 687                e = 0;
 688                head = &net->ipv6.fib_table_hash[h];
 689                hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
 690                        if (e < s_e)
 691                                goto next;
 692                        res = fib6_dump_table(tb, skb, cb);
 693                        if (res != 0)
 694                                goto out_unlock;
 695next:
 696                        e++;
 697                }
 698        }
 699out_unlock:
 700        rcu_read_unlock();
 701        cb->args[1] = e;
 702        cb->args[0] = h;
 703out:
 704        res = res < 0 ? res : skb->len;
 705        if (res <= 0)
 706                fib6_dump_end(cb);
 707        return res;
 708}
 709
 710void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val)
 711{
 712        if (!f6i)
 713                return;
 714
 715        if (f6i->fib6_metrics == &dst_default_metrics) {
 716                struct dst_metrics *p = kzalloc(sizeof(*p), GFP_ATOMIC);
 717
 718                if (!p)
 719                        return;
 720
 721                refcount_set(&p->refcnt, 1);
 722                f6i->fib6_metrics = p;
 723        }
 724
 725        f6i->fib6_metrics->metrics[metric - 1] = val;
 726}
 727
 728/*
 729 *      Routing Table
 730 *
 731 *      return the appropriate node for a routing tree "add" operation
 732 *      by either creating and inserting or by returning an existing
 733 *      node.
 734 */
 735
 736static struct fib6_node *fib6_add_1(struct net *net,
 737                                    struct fib6_table *table,
 738                                    struct fib6_node *root,
 739                                    struct in6_addr *addr, int plen,
 740                                    int offset, int allow_create,
 741                                    int replace_required,
 742                                    struct netlink_ext_ack *extack)
 743{
 744        struct fib6_node *fn, *in, *ln;
 745        struct fib6_node *pn = NULL;
 746        struct rt6key *key;
 747        int     bit;
 748        __be32  dir = 0;
 749
 750        RT6_TRACE("fib6_add_1\n");
 751
 752        /* insert node in tree */
 753
 754        fn = root;
 755
 756        do {
 757                struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
 758                                            lockdep_is_held(&table->tb6_lock));
 759                key = (struct rt6key *)((u8 *)leaf + offset);
 760
 761                /*
 762                 *      Prefix match
 763                 */
 764                if (plen < fn->fn_bit ||
 765                    !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) {
 766                        if (!allow_create) {
 767                                if (replace_required) {
 768                                        NL_SET_ERR_MSG(extack,
 769                                                       "Can not replace route - no match found");
 770                                        pr_warn("Can't replace route, no match found\n");
 771                                        return ERR_PTR(-ENOENT);
 772                                }
 773                                pr_warn("NLM_F_CREATE should be set when creating new route\n");
 774                        }
 775                        goto insert_above;
 776                }
 777
 778                /*
 779                 *      Exact match ?
 780                 */
 781
 782                if (plen == fn->fn_bit) {
 783                        /* clean up an intermediate node */
 784                        if (!(fn->fn_flags & RTN_RTINFO)) {
 785                                RCU_INIT_POINTER(fn->leaf, NULL);
 786                                fib6_info_release(leaf);
 787                        /* remove null_entry in the root node */
 788                        } else if (fn->fn_flags & RTN_TL_ROOT &&
 789                                   rcu_access_pointer(fn->leaf) ==
 790                                   net->ipv6.fib6_null_entry) {
 791                                RCU_INIT_POINTER(fn->leaf, NULL);
 792                        }
 793
 794                        return fn;
 795                }
 796
 797                /*
 798                 *      We have more bits to go
 799                 */
 800
 801                /* Try to walk down on tree. */
 802                dir = addr_bit_set(addr, fn->fn_bit);
 803                pn = fn;
 804                fn = dir ?
 805                     rcu_dereference_protected(fn->right,
 806                                        lockdep_is_held(&table->tb6_lock)) :
 807                     rcu_dereference_protected(fn->left,
 808                                        lockdep_is_held(&table->tb6_lock));
 809        } while (fn);
 810
 811        if (!allow_create) {
 812                /* We should not create new node because
 813                 * NLM_F_REPLACE was specified without NLM_F_CREATE
 814                 * I assume it is safe to require NLM_F_CREATE when
 815                 * REPLACE flag is used! Later we may want to remove the
 816                 * check for replace_required, because according
 817                 * to netlink specification, NLM_F_CREATE
 818                 * MUST be specified if new route is created.
 819                 * That would keep IPv6 consistent with IPv4
 820                 */
 821                if (replace_required) {
 822                        NL_SET_ERR_MSG(extack,
 823                                       "Can not replace route - no match found");
 824                        pr_warn("Can't replace route, no match found\n");
 825                        return ERR_PTR(-ENOENT);
 826                }
 827                pr_warn("NLM_F_CREATE should be set when creating new route\n");
 828        }
 829        /*
 830         *      We walked to the bottom of tree.
 831         *      Create new leaf node without children.
 832         */
 833
 834        ln = node_alloc(net);
 835
 836        if (!ln)
 837                return ERR_PTR(-ENOMEM);
 838        ln->fn_bit = plen;
 839        RCU_INIT_POINTER(ln->parent, pn);
 840
 841        if (dir)
 842                rcu_assign_pointer(pn->right, ln);
 843        else
 844                rcu_assign_pointer(pn->left, ln);
 845
 846        return ln;
 847
 848
 849insert_above:
 850        /*
 851         * split since we don't have a common prefix anymore or
 852         * we have a less significant route.
 853         * we've to insert an intermediate node on the list
 854         * this new node will point to the one we need to create
 855         * and the current
 856         */
 857
 858        pn = rcu_dereference_protected(fn->parent,
 859                                       lockdep_is_held(&table->tb6_lock));
 860
 861        /* find 1st bit in difference between the 2 addrs.
 862
 863           See comment in __ipv6_addr_diff: bit may be an invalid value,
 864           but if it is >= plen, the value is ignored in any case.
 865         */
 866
 867        bit = __ipv6_addr_diff(addr, &key->addr, sizeof(*addr));
 868
 869        /*
 870         *              (intermediate)[in]
 871         *                /        \
 872         *      (new leaf node)[ln] (old node)[fn]
 873         */
 874        if (plen > bit) {
 875                in = node_alloc(net);
 876                ln = node_alloc(net);
 877
 878                if (!in || !ln) {
 879                        if (in)
 880                                node_free_immediate(net, in);
 881                        if (ln)
 882                                node_free_immediate(net, ln);
 883                        return ERR_PTR(-ENOMEM);
 884                }
 885
 886                /*
 887                 * new intermediate node.
 888                 * RTN_RTINFO will
 889                 * be off since that an address that chooses one of
 890                 * the branches would not match less specific routes
 891                 * in the other branch
 892                 */
 893
 894                in->fn_bit = bit;
 895
 896                RCU_INIT_POINTER(in->parent, pn);
 897                in->leaf = fn->leaf;
 898                fib6_info_hold(rcu_dereference_protected(in->leaf,
 899                                lockdep_is_held(&table->tb6_lock)));
 900
 901                /* update parent pointer */
 902                if (dir)
 903                        rcu_assign_pointer(pn->right, in);
 904                else
 905                        rcu_assign_pointer(pn->left, in);
 906
 907                ln->fn_bit = plen;
 908
 909                RCU_INIT_POINTER(ln->parent, in);
 910                rcu_assign_pointer(fn->parent, in);
 911
 912                if (addr_bit_set(addr, bit)) {
 913                        rcu_assign_pointer(in->right, ln);
 914                        rcu_assign_pointer(in->left, fn);
 915                } else {
 916                        rcu_assign_pointer(in->left, ln);
 917                        rcu_assign_pointer(in->right, fn);
 918                }
 919        } else { /* plen <= bit */
 920
 921                /*
 922                 *              (new leaf node)[ln]
 923                 *                /        \
 924                 *           (old node)[fn] NULL
 925                 */
 926
 927                ln = node_alloc(net);
 928
 929                if (!ln)
 930                        return ERR_PTR(-ENOMEM);
 931
 932                ln->fn_bit = plen;
 933
 934                RCU_INIT_POINTER(ln->parent, pn);
 935
 936                if (addr_bit_set(&key->addr, plen))
 937                        RCU_INIT_POINTER(ln->right, fn);
 938                else
 939                        RCU_INIT_POINTER(ln->left, fn);
 940
 941                rcu_assign_pointer(fn->parent, ln);
 942
 943                if (dir)
 944                        rcu_assign_pointer(pn->right, ln);
 945                else
 946                        rcu_assign_pointer(pn->left, ln);
 947        }
 948        return ln;
 949}
 950
 951static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh,
 952                                  const struct fib6_info *match,
 953                                  const struct fib6_table *table)
 954{
 955        int cpu;
 956
 957        if (!fib6_nh->rt6i_pcpu)
 958                return;
 959
 960        /* release the reference to this fib entry from
 961         * all of its cached pcpu routes
 962         */
 963        for_each_possible_cpu(cpu) {
 964                struct rt6_info **ppcpu_rt;
 965                struct rt6_info *pcpu_rt;
 966
 967                ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);
 968                pcpu_rt = *ppcpu_rt;
 969
 970                /* only dropping the 'from' reference if the cached route
 971                 * is using 'match'. The cached pcpu_rt->from only changes
 972                 * from a fib6_info to NULL (ip6_dst_destroy); it can never
 973                 * change from one fib6_info reference to another
 974                 */
 975                if (pcpu_rt && rcu_access_pointer(pcpu_rt->from) == match) {
 976                        struct fib6_info *from;
 977
 978                        from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL);
 979                        fib6_info_release(from);
 980                }
 981        }
 982}
 983
 984struct fib6_nh_pcpu_arg {
 985        struct fib6_info        *from;
 986        const struct fib6_table *table;
 987};
 988
 989static int fib6_nh_drop_pcpu_from(struct fib6_nh *nh, void *_arg)
 990{
 991        struct fib6_nh_pcpu_arg *arg = _arg;
 992
 993        __fib6_drop_pcpu_from(nh, arg->from, arg->table);
 994        return 0;
 995}
 996
 997static void fib6_drop_pcpu_from(struct fib6_info *f6i,
 998                                const struct fib6_table *table)
 999{
1000        /* Make sure rt6_make_pcpu_route() wont add other percpu routes
1001         * while we are cleaning them here.
1002         */
1003        f6i->fib6_destroying = 1;
1004        mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */
1005
1006        if (f6i->nh) {
1007                struct fib6_nh_pcpu_arg arg = {
1008                        .from = f6i,
1009                        .table = table
1010                };
1011
1012                nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from,
1013                                         &arg);
1014        } else {
1015                struct fib6_nh *fib6_nh;
1016
1017                fib6_nh = f6i->fib6_nh;
1018                __fib6_drop_pcpu_from(fib6_nh, f6i, table);
1019        }
1020}
1021
1022static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
1023                          struct net *net)
1024{
1025        struct fib6_table *table = rt->fib6_table;
1026
1027        fib6_drop_pcpu_from(rt, table);
1028
1029        if (rt->nh && !list_empty(&rt->nh_list))
1030                list_del_init(&rt->nh_list);
1031
1032        if (refcount_read(&rt->fib6_ref) != 1) {
1033                /* This route is used as dummy address holder in some split
1034                 * nodes. It is not leaked, but it still holds other resources,
1035                 * which must be released in time. So, scan ascendant nodes
1036                 * and replace dummy references to this route with references
1037                 * to still alive ones.
1038                 */
1039                while (fn) {
1040                        struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
1041                                            lockdep_is_held(&table->tb6_lock));
1042                        struct fib6_info *new_leaf;
1043                        if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) {
1044                                new_leaf = fib6_find_prefix(net, table, fn);
1045                                fib6_info_hold(new_leaf);
1046
1047                                rcu_assign_pointer(fn->leaf, new_leaf);
1048                                fib6_info_release(rt);
1049                        }
1050                        fn = rcu_dereference_protected(fn->parent,
1051                                    lockdep_is_held(&table->tb6_lock));
1052                }
1053        }
1054}
1055
1056/*
1057 *      Insert routing information in a node.
1058 */
1059
1060static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
1061                            struct nl_info *info,
1062                            struct netlink_ext_ack *extack)
1063{
1064        struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
1065                                    lockdep_is_held(&rt->fib6_table->tb6_lock));
1066        struct fib6_info *iter = NULL;
1067        struct fib6_info __rcu **ins;
1068        struct fib6_info __rcu **fallback_ins = NULL;
1069        int replace = (info->nlh &&
1070                       (info->nlh->nlmsg_flags & NLM_F_REPLACE));
1071        int add = (!info->nlh ||
1072                   (info->nlh->nlmsg_flags & NLM_F_CREATE));
1073        int found = 0;
1074        bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
1075        bool notify_sibling_rt = false;
1076        u16 nlflags = NLM_F_EXCL;
1077        int err;
1078
1079        if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND))
1080                nlflags |= NLM_F_APPEND;
1081
1082        ins = &fn->leaf;
1083
1084        for (iter = leaf; iter;
1085             iter = rcu_dereference_protected(iter->fib6_next,
1086                                lockdep_is_held(&rt->fib6_table->tb6_lock))) {
1087                /*
1088                 *      Search for duplicates
1089                 */
1090
1091                if (iter->fib6_metric == rt->fib6_metric) {
1092                        /*
1093                         *      Same priority level
1094                         */
1095                        if (info->nlh &&
1096                            (info->nlh->nlmsg_flags & NLM_F_EXCL))
1097                                return -EEXIST;
1098
1099                        nlflags &= ~NLM_F_EXCL;
1100                        if (replace) {
1101                                if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) {
1102                                        found++;
1103                                        break;
1104                                }
1105                                fallback_ins = fallback_ins ?: ins;
1106                                goto next_iter;
1107                        }
1108
1109                        if (rt6_duplicate_nexthop(iter, rt)) {
1110                                if (rt->fib6_nsiblings)
1111                                        rt->fib6_nsiblings = 0;
1112                                if (!(iter->fib6_flags & RTF_EXPIRES))
1113                                        return -EEXIST;
1114                                if (!(rt->fib6_flags & RTF_EXPIRES))
1115                                        fib6_clean_expires(iter);
1116                                else
1117                                        fib6_set_expires(iter, rt->expires);
1118
1119                                if (rt->fib6_pmtu)
1120                                        fib6_metric_set(iter, RTAX_MTU,
1121                                                        rt->fib6_pmtu);
1122                                return -EEXIST;
1123                        }
1124                        /* If we have the same destination and the same metric,
1125                         * but not the same gateway, then the route we try to
1126                         * add is sibling to this route, increment our counter
1127                         * of siblings, and later we will add our route to the
1128                         * list.
1129                         * Only static routes (which don't have flag
1130                         * RTF_EXPIRES) are used for ECMPv6.
1131                         *
1132                         * To avoid long list, we only had siblings if the
1133                         * route have a gateway.
1134                         */
1135                        if (rt_can_ecmp &&
1136                            rt6_qualify_for_ecmp(iter))
1137                                rt->fib6_nsiblings++;
1138                }
1139
1140                if (iter->fib6_metric > rt->fib6_metric)
1141                        break;
1142
1143next_iter:
1144                ins = &iter->fib6_next;
1145        }
1146
1147        if (fallback_ins && !found) {
1148                /* No matching route with same ecmp-able-ness found, replace
1149                 * first matching route
1150                 */
1151                ins = fallback_ins;
1152                iter = rcu_dereference_protected(*ins,
1153                                    lockdep_is_held(&rt->fib6_table->tb6_lock));
1154                found++;
1155        }
1156
1157        /* Reset round-robin state, if necessary */
1158        if (ins == &fn->leaf)
1159                fn->rr_ptr = NULL;
1160
1161        /* Link this route to others same route. */
1162        if (rt->fib6_nsiblings) {
1163                unsigned int fib6_nsiblings;
1164                struct fib6_info *sibling, *temp_sibling;
1165
1166                /* Find the first route that have the same metric */
1167                sibling = leaf;
1168                notify_sibling_rt = true;
1169                while (sibling) {
1170                        if (sibling->fib6_metric == rt->fib6_metric &&
1171                            rt6_qualify_for_ecmp(sibling)) {
1172                                list_add_tail(&rt->fib6_siblings,
1173                                              &sibling->fib6_siblings);
1174                                break;
1175                        }
1176                        sibling = rcu_dereference_protected(sibling->fib6_next,
1177                                    lockdep_is_held(&rt->fib6_table->tb6_lock));
1178                        notify_sibling_rt = false;
1179                }
1180                /* For each sibling in the list, increment the counter of
1181                 * siblings. BUG() if counters does not match, list of siblings
1182                 * is broken!
1183                 */
1184                fib6_nsiblings = 0;
1185                list_for_each_entry_safe(sibling, temp_sibling,
1186                                         &rt->fib6_siblings, fib6_siblings) {
1187                        sibling->fib6_nsiblings++;
1188                        BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings);
1189                        fib6_nsiblings++;
1190                }
1191                BUG_ON(fib6_nsiblings != rt->fib6_nsiblings);
1192                rt6_multipath_rebalance(temp_sibling);
1193        }
1194
1195        /*
1196         *      insert node
1197         */
1198        if (!replace) {
1199                if (!add)
1200                        pr_warn("NLM_F_CREATE should be set when creating new route\n");
1201
1202add:
1203                nlflags |= NLM_F_CREATE;
1204
1205                /* The route should only be notified if it is the first
1206                 * route in the node or if it is added as a sibling
1207                 * route to the first route in the node.
1208                 */
1209                if (!info->skip_notify_kernel &&
1210                    (notify_sibling_rt || ins == &fn->leaf)) {
1211                        enum fib_event_type fib_event;
1212
1213                        if (notify_sibling_rt)
1214                                fib_event = FIB_EVENT_ENTRY_APPEND;
1215                        else
1216                                fib_event = FIB_EVENT_ENTRY_REPLACE;
1217                        err = call_fib6_entry_notifiers(info->nl_net,
1218                                                        fib_event, rt,
1219                                                        extack);
1220                        if (err) {
1221                                struct fib6_info *sibling, *next_sibling;
1222
1223                                /* If the route has siblings, then it first
1224                                 * needs to be unlinked from them.
1225                                 */
1226                                if (!rt->fib6_nsiblings)
1227                                        return err;
1228
1229                                list_for_each_entry_safe(sibling, next_sibling,
1230                                                         &rt->fib6_siblings,
1231                                                         fib6_siblings)
1232                                        sibling->fib6_nsiblings--;
1233                                rt->fib6_nsiblings = 0;
1234                                list_del_init(&rt->fib6_siblings);
1235                                rt6_multipath_rebalance(next_sibling);
1236                                return err;
1237                        }
1238                }
1239
1240                rcu_assign_pointer(rt->fib6_next, iter);
1241                fib6_info_hold(rt);
1242                rcu_assign_pointer(rt->fib6_node, fn);
1243                rcu_assign_pointer(*ins, rt);
1244                if (!info->skip_notify)
1245                        inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
1246                info->nl_net->ipv6.rt6_stats->fib_rt_entries++;
1247
1248                if (!(fn->fn_flags & RTN_RTINFO)) {
1249                        info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
1250                        fn->fn_flags |= RTN_RTINFO;
1251                }
1252
1253        } else {
1254                int nsiblings;
1255
1256                if (!found) {
1257                        if (add)
1258                                goto add;
1259                        pr_warn("NLM_F_REPLACE set, but no existing node found!\n");
1260                        return -ENOENT;
1261                }
1262
1263                if (!info->skip_notify_kernel && ins == &fn->leaf) {
1264                        err = call_fib6_entry_notifiers(info->nl_net,
1265                                                        FIB_EVENT_ENTRY_REPLACE,
1266                                                        rt, extack);
1267                        if (err)
1268                                return err;
1269                }
1270
1271                fib6_info_hold(rt);
1272                rcu_assign_pointer(rt->fib6_node, fn);
1273                rt->fib6_next = iter->fib6_next;
1274                rcu_assign_pointer(*ins, rt);
1275                if (!info->skip_notify)
1276                        inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
1277                if (!(fn->fn_flags & RTN_RTINFO)) {
1278                        info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
1279                        fn->fn_flags |= RTN_RTINFO;
1280                }
1281                nsiblings = iter->fib6_nsiblings;
1282                iter->fib6_node = NULL;
1283                fib6_purge_rt(iter, fn, info->nl_net);
1284                if (rcu_access_pointer(fn->rr_ptr) == iter)
1285                        fn->rr_ptr = NULL;
1286                fib6_info_release(iter);
1287
1288                if (nsiblings) {
1289                        /* Replacing an ECMP route, remove all siblings */
1290                        ins = &rt->fib6_next;
1291                        iter = rcu_dereference_protected(*ins,
1292                                    lockdep_is_held(&rt->fib6_table->tb6_lock));
1293                        while (iter) {
1294                                if (iter->fib6_metric > rt->fib6_metric)
1295                                        break;
1296                                if (rt6_qualify_for_ecmp(iter)) {
1297                                        *ins = iter->fib6_next;
1298                                        iter->fib6_node = NULL;
1299                                        fib6_purge_rt(iter, fn, info->nl_net);
1300                                        if (rcu_access_pointer(fn->rr_ptr) == iter)
1301                                                fn->rr_ptr = NULL;
1302                                        fib6_info_release(iter);
1303                                        nsiblings--;
1304                                        info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
1305                                } else {
1306                                        ins = &iter->fib6_next;
1307                                }
1308                                iter = rcu_dereference_protected(*ins,
1309                                        lockdep_is_held(&rt->fib6_table->tb6_lock));
1310                        }
1311                        WARN_ON(nsiblings != 0);
1312                }
1313        }
1314
1315        return 0;
1316}
1317
1318static void fib6_start_gc(struct net *net, struct fib6_info *rt)
1319{
1320        if (!timer_pending(&net->ipv6.ip6_fib_timer) &&
1321            (rt->fib6_flags & RTF_EXPIRES))
1322                mod_timer(&net->ipv6.ip6_fib_timer,
1323                          jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
1324}
1325
1326void fib6_force_start_gc(struct net *net)
1327{
1328        if (!timer_pending(&net->ipv6.ip6_fib_timer))
1329                mod_timer(&net->ipv6.ip6_fib_timer,
1330                          jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
1331}
1332
1333static void __fib6_update_sernum_upto_root(struct fib6_info *rt,
1334                                           int sernum)
1335{
1336        struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node,
1337                                lockdep_is_held(&rt->fib6_table->tb6_lock));
1338
1339        /* paired with smp_rmb() in rt6_get_cookie_safe() */
1340        smp_wmb();
1341        while (fn) {
1342                fn->fn_sernum = sernum;
1343                fn = rcu_dereference_protected(fn->parent,
1344                                lockdep_is_held(&rt->fib6_table->tb6_lock));
1345        }
1346}
1347
1348void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt)
1349{
1350        __fib6_update_sernum_upto_root(rt, fib6_new_sernum(net));
1351}
1352
1353/* allow ipv4 to update sernum via ipv6_stub */
1354void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i)
1355{
1356        spin_lock_bh(&f6i->fib6_table->tb6_lock);
1357        fib6_update_sernum_upto_root(net, f6i);
1358        spin_unlock_bh(&f6i->fib6_table->tb6_lock);
1359}
1360
1361/*
1362 *      Add routing information to the routing tree.
1363 *      <destination addr>/<source addr>
1364 *      with source addr info in sub-trees
1365 *      Need to own table->tb6_lock
1366 */
1367
1368int fib6_add(struct fib6_node *root, struct fib6_info *rt,
1369             struct nl_info *info, struct netlink_ext_ack *extack)
1370{
1371        struct fib6_table *table = rt->fib6_table;
1372        struct fib6_node *fn, *pn = NULL;
1373        int err = -ENOMEM;
1374        int allow_create = 1;
1375        int replace_required = 0;
1376        int sernum = fib6_new_sernum(info->nl_net);
1377
1378        if (info->nlh) {
1379                if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
1380                        allow_create = 0;
1381                if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
1382                        replace_required = 1;
1383        }
1384        if (!allow_create && !replace_required)
1385                pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n");
1386
1387        fn = fib6_add_1(info->nl_net, table, root,
1388                        &rt->fib6_dst.addr, rt->fib6_dst.plen,
1389                        offsetof(struct fib6_info, fib6_dst), allow_create,
1390                        replace_required, extack);
1391        if (IS_ERR(fn)) {
1392                err = PTR_ERR(fn);
1393                fn = NULL;
1394                goto out;
1395        }
1396
1397        pn = fn;
1398
1399#ifdef CONFIG_IPV6_SUBTREES
1400        if (rt->fib6_src.plen) {
1401                struct fib6_node *sn;
1402
1403                if (!rcu_access_pointer(fn->subtree)) {
1404                        struct fib6_node *sfn;
1405
1406                        /*
1407                         * Create subtree.
1408                         *
1409                         *              fn[main tree]
1410                         *              |
1411                         *              sfn[subtree root]
1412                         *                 \
1413                         *                  sn[new leaf node]
1414                         */
1415
1416                        /* Create subtree root node */
1417                        sfn = node_alloc(info->nl_net);
1418                        if (!sfn)
1419                                goto failure;
1420
1421                        fib6_info_hold(info->nl_net->ipv6.fib6_null_entry);
1422                        rcu_assign_pointer(sfn->leaf,
1423                                           info->nl_net->ipv6.fib6_null_entry);
1424                        sfn->fn_flags = RTN_ROOT;
1425
1426                        /* Now add the first leaf node to new subtree */
1427
1428                        sn = fib6_add_1(info->nl_net, table, sfn,
1429                                        &rt->fib6_src.addr, rt->fib6_src.plen,
1430                                        offsetof(struct fib6_info, fib6_src),
1431                                        allow_create, replace_required, extack);
1432
1433                        if (IS_ERR(sn)) {
1434                                /* If it is failed, discard just allocated
1435                                   root, and then (in failure) stale node
1436                                   in main tree.
1437                                 */
1438                                node_free_immediate(info->nl_net, sfn);
1439                                err = PTR_ERR(sn);
1440                                goto failure;
1441                        }
1442
1443                        /* Now link new subtree to main tree */
1444                        rcu_assign_pointer(sfn->parent, fn);
1445                        rcu_assign_pointer(fn->subtree, sfn);
1446                } else {
1447                        sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn),
1448                                        &rt->fib6_src.addr, rt->fib6_src.plen,
1449                                        offsetof(struct fib6_info, fib6_src),
1450                                        allow_create, replace_required, extack);
1451
1452                        if (IS_ERR(sn)) {
1453                                err = PTR_ERR(sn);
1454                                goto failure;
1455                        }
1456                }
1457
1458                if (!rcu_access_pointer(fn->leaf)) {
1459                        if (fn->fn_flags & RTN_TL_ROOT) {
1460                                /* put back null_entry for root node */
1461                                rcu_assign_pointer(fn->leaf,
1462                                            info->nl_net->ipv6.fib6_null_entry);
1463                        } else {
1464                                fib6_info_hold(rt);
1465                                rcu_assign_pointer(fn->leaf, rt);
1466                        }
1467                }
1468                fn = sn;
1469        }
1470#endif
1471
1472        err = fib6_add_rt2node(fn, rt, info, extack);
1473        if (!err) {
1474                if (rt->nh)
1475                        list_add(&rt->nh_list, &rt->nh->f6i_list);
1476                __fib6_update_sernum_upto_root(rt, sernum);
1477                fib6_start_gc(info->nl_net, rt);
1478        }
1479
1480out:
1481        if (err) {
1482#ifdef CONFIG_IPV6_SUBTREES
1483                /*
1484                 * If fib6_add_1 has cleared the old leaf pointer in the
1485                 * super-tree leaf node we have to find a new one for it.
1486                 */
1487                if (pn != fn) {
1488                        struct fib6_info *pn_leaf =
1489                                rcu_dereference_protected(pn->leaf,
1490                                    lockdep_is_held(&table->tb6_lock));
1491                        if (pn_leaf == rt) {
1492                                pn_leaf = NULL;
1493                                RCU_INIT_POINTER(pn->leaf, NULL);
1494                                fib6_info_release(rt);
1495                        }
1496                        if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) {
1497                                pn_leaf = fib6_find_prefix(info->nl_net, table,
1498                                                           pn);
1499#if RT6_DEBUG >= 2
1500                                if (!pn_leaf) {
1501                                        WARN_ON(!pn_leaf);
1502                                        pn_leaf =
1503                                            info->nl_net->ipv6.fib6_null_entry;
1504                                }
1505#endif
1506                                fib6_info_hold(pn_leaf);
1507                                rcu_assign_pointer(pn->leaf, pn_leaf);
1508                        }
1509                }
1510#endif
1511                goto failure;
1512        } else if (fib6_requires_src(rt)) {
1513                fib6_routes_require_src_inc(info->nl_net);
1514        }
1515        return err;
1516
1517failure:
1518        /* fn->leaf could be NULL and fib6_repair_tree() needs to be called if:
1519         * 1. fn is an intermediate node and we failed to add the new
1520         * route to it in both subtree creation failure and fib6_add_rt2node()
1521         * failure case.
1522         * 2. fn is the root node in the table and we fail to add the first
1523         * default route to it.
1524         */
1525        if (fn &&
1526            (!(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)) ||
1527             (fn->fn_flags & RTN_TL_ROOT &&
1528              !rcu_access_pointer(fn->leaf))))
1529                fib6_repair_tree(info->nl_net, table, fn);
1530        return err;
1531}
1532
1533/*
1534 *      Routing tree lookup
1535 *
1536 */
1537
1538struct lookup_args {
1539        int                     offset;         /* key offset on fib6_info */
1540        const struct in6_addr   *addr;          /* search key                   */
1541};
1542
1543static struct fib6_node *fib6_node_lookup_1(struct fib6_node *root,
1544                                            struct lookup_args *args)
1545{
1546        struct fib6_node *fn;
1547        __be32 dir;
1548
1549        if (unlikely(args->offset == 0))
1550                return NULL;
1551
1552        /*
1553         *      Descend on a tree
1554         */
1555
1556        fn = root;
1557
1558        for (;;) {
1559                struct fib6_node *next;
1560
1561                dir = addr_bit_set(args->addr, fn->fn_bit);
1562
1563                next = dir ? rcu_dereference(fn->right) :
1564                             rcu_dereference(fn->left);
1565
1566                if (next) {
1567                        fn = next;
1568                        continue;
1569                }
1570                break;
1571        }
1572
1573        while (fn) {
1574                struct fib6_node *subtree = FIB6_SUBTREE(fn);
1575
1576                if (subtree || fn->fn_flags & RTN_RTINFO) {
1577                        struct fib6_info *leaf = rcu_dereference(fn->leaf);
1578                        struct rt6key *key;
1579
1580                        if (!leaf)
1581                                goto backtrack;
1582
1583                        key = (struct rt6key *) ((u8 *)leaf + args->offset);
1584
1585                        if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) {
1586#ifdef CONFIG_IPV6_SUBTREES
1587                                if (subtree) {
1588                                        struct fib6_node *sfn;
1589                                        sfn = fib6_node_lookup_1(subtree,
1590                                                                 args + 1);
1591                                        if (!sfn)
1592                                                goto backtrack;
1593                                        fn = sfn;
1594                                }
1595#endif
1596                                if (fn->fn_flags & RTN_RTINFO)
1597                                        return fn;
1598                        }
1599                }
1600backtrack:
1601                if (fn->fn_flags & RTN_ROOT)
1602                        break;
1603
1604                fn = rcu_dereference(fn->parent);
1605        }
1606
1607        return NULL;
1608}
1609
1610/* called with rcu_read_lock() held
1611 */
1612struct fib6_node *fib6_node_lookup(struct fib6_node *root,
1613                                   const struct in6_addr *daddr,
1614                                   const struct in6_addr *saddr)
1615{
1616        struct fib6_node *fn;
1617        struct lookup_args args[] = {
1618                {
1619                        .offset = offsetof(struct fib6_info, fib6_dst),
1620                        .addr = daddr,
1621                },
1622#ifdef CONFIG_IPV6_SUBTREES
1623                {
1624                        .offset = offsetof(struct fib6_info, fib6_src),
1625                        .addr = saddr,
1626                },
1627#endif
1628                {
1629                        .offset = 0,    /* sentinel */
1630                }
1631        };
1632
1633        fn = fib6_node_lookup_1(root, daddr ? args : args + 1);
1634        if (!fn || fn->fn_flags & RTN_TL_ROOT)
1635                fn = root;
1636
1637        return fn;
1638}
1639
1640/*
1641 *      Get node with specified destination prefix (and source prefix,
1642 *      if subtrees are used)
1643 *      exact_match == true means we try to find fn with exact match of
1644 *      the passed in prefix addr
1645 *      exact_match == false means we try to find fn with longest prefix
1646 *      match of the passed in prefix addr. This is useful for finding fn
1647 *      for cached route as it will be stored in the exception table under
1648 *      the node with longest prefix length.
1649 */
1650
1651
1652static struct fib6_node *fib6_locate_1(struct fib6_node *root,
1653                                       const struct in6_addr *addr,
1654                                       int plen, int offset,
1655                                       bool exact_match)
1656{
1657        struct fib6_node *fn, *prev = NULL;
1658
1659        for (fn = root; fn ; ) {
1660                struct fib6_info *leaf = rcu_dereference(fn->leaf);
1661                struct rt6key *key;
1662
1663                /* This node is being deleted */
1664                if (!leaf) {
1665                        if (plen <= fn->fn_bit)
1666                                goto out;
1667                        else
1668                                goto next;
1669                }
1670
1671                key = (struct rt6key *)((u8 *)leaf + offset);
1672
1673                /*
1674                 *      Prefix match
1675                 */
1676                if (plen < fn->fn_bit ||
1677                    !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit))
1678                        goto out;
1679
1680                if (plen == fn->fn_bit)
1681                        return fn;
1682
1683                if (fn->fn_flags & RTN_RTINFO)
1684                        prev = fn;
1685
1686next:
1687                /*
1688                 *      We have more bits to go
1689                 */
1690                if (addr_bit_set(addr, fn->fn_bit))
1691                        fn = rcu_dereference(fn->right);
1692                else
1693                        fn = rcu_dereference(fn->left);
1694        }
1695out:
1696        if (exact_match)
1697                return NULL;
1698        else
1699                return prev;
1700}
1701
1702struct fib6_node *fib6_locate(struct fib6_node *root,
1703                              const struct in6_addr *daddr, int dst_len,
1704                              const struct in6_addr *saddr, int src_len,
1705                              bool exact_match)
1706{
1707        struct fib6_node *fn;
1708
1709        fn = fib6_locate_1(root, daddr, dst_len,
1710                           offsetof(struct fib6_info, fib6_dst),
1711                           exact_match);
1712
1713#ifdef CONFIG_IPV6_SUBTREES
1714        if (src_len) {
1715                WARN_ON(saddr == NULL);
1716                if (fn) {
1717                        struct fib6_node *subtree = FIB6_SUBTREE(fn);
1718
1719                        if (subtree) {
1720                                fn = fib6_locate_1(subtree, saddr, src_len,
1721                                           offsetof(struct fib6_info, fib6_src),
1722                                           exact_match);
1723                        }
1724                }
1725        }
1726#endif
1727
1728        if (fn && fn->fn_flags & RTN_RTINFO)
1729                return fn;
1730
1731        return NULL;
1732}
1733
1734
1735/*
1736 *      Deletion
1737 *
1738 */
1739
1740static struct fib6_info *fib6_find_prefix(struct net *net,
1741                                         struct fib6_table *table,
1742                                         struct fib6_node *fn)
1743{
1744        struct fib6_node *child_left, *child_right;
1745
1746        if (fn->fn_flags & RTN_ROOT)
1747                return net->ipv6.fib6_null_entry;
1748
1749        while (fn) {
1750                child_left = rcu_dereference_protected(fn->left,
1751                                    lockdep_is_held(&table->tb6_lock));
1752                child_right = rcu_dereference_protected(fn->right,
1753                                    lockdep_is_held(&table->tb6_lock));
1754                if (child_left)
1755                        return rcu_dereference_protected(child_left->leaf,
1756                                        lockdep_is_held(&table->tb6_lock));
1757                if (child_right)
1758                        return rcu_dereference_protected(child_right->leaf,
1759                                        lockdep_is_held(&table->tb6_lock));
1760
1761                fn = FIB6_SUBTREE(fn);
1762        }
1763        return NULL;
1764}
1765
1766/*
1767 *      Called to trim the tree of intermediate nodes when possible. "fn"
1768 *      is the node we want to try and remove.
1769 *      Need to own table->tb6_lock
1770 */
1771
1772static struct fib6_node *fib6_repair_tree(struct net *net,
1773                                          struct fib6_table *table,
1774                                          struct fib6_node *fn)
1775{
1776        int children;
1777        int nstate;
1778        struct fib6_node *child;
1779        struct fib6_walker *w;
1780        int iter = 0;
1781
1782        /* Set fn->leaf to null_entry for root node. */
1783        if (fn->fn_flags & RTN_TL_ROOT) {
1784                rcu_assign_pointer(fn->leaf, net->ipv6.fib6_null_entry);
1785                return fn;
1786        }
1787
1788        for (;;) {
1789                struct fib6_node *fn_r = rcu_dereference_protected(fn->right,
1790                                            lockdep_is_held(&table->tb6_lock));
1791                struct fib6_node *fn_l = rcu_dereference_protected(fn->left,
1792                                            lockdep_is_held(&table->tb6_lock));
1793                struct fib6_node *pn = rcu_dereference_protected(fn->parent,
1794                                            lockdep_is_held(&table->tb6_lock));
1795                struct fib6_node *pn_r = rcu_dereference_protected(pn->right,
1796                                            lockdep_is_held(&table->tb6_lock));
1797                struct fib6_node *pn_l = rcu_dereference_protected(pn->left,
1798                                            lockdep_is_held(&table->tb6_lock));
1799                struct fib6_info *fn_leaf = rcu_dereference_protected(fn->leaf,
1800                                            lockdep_is_held(&table->tb6_lock));
1801                struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf,
1802                                            lockdep_is_held(&table->tb6_lock));
1803                struct fib6_info *new_fn_leaf;
1804
1805                RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter);
1806                iter++;
1807
1808                WARN_ON(fn->fn_flags & RTN_RTINFO);
1809                WARN_ON(fn->fn_flags & RTN_TL_ROOT);
1810                WARN_ON(fn_leaf);
1811
1812                children = 0;
1813                child = NULL;
1814                if (fn_r)
1815                        child = fn_r, children |= 1;
1816                if (fn_l)
1817                        child = fn_l, children |= 2;
1818
1819                if (children == 3 || FIB6_SUBTREE(fn)
1820#ifdef CONFIG_IPV6_SUBTREES
1821                    /* Subtree root (i.e. fn) may have one child */
1822                    || (children && fn->fn_flags & RTN_ROOT)
1823#endif
1824                    ) {
1825                        new_fn_leaf = fib6_find_prefix(net, table, fn);
1826#if RT6_DEBUG >= 2
1827                        if (!new_fn_leaf) {
1828                                WARN_ON(!new_fn_leaf);
1829                                new_fn_leaf = net->ipv6.fib6_null_entry;
1830                        }
1831#endif
1832                        fib6_info_hold(new_fn_leaf);
1833                        rcu_assign_pointer(fn->leaf, new_fn_leaf);
1834                        return pn;
1835                }
1836
1837#ifdef CONFIG_IPV6_SUBTREES
1838                if (FIB6_SUBTREE(pn) == fn) {
1839                        WARN_ON(!(fn->fn_flags & RTN_ROOT));
1840                        RCU_INIT_POINTER(pn->subtree, NULL);
1841                        nstate = FWS_L;
1842                } else {
1843                        WARN_ON(fn->fn_flags & RTN_ROOT);
1844#endif
1845                        if (pn_r == fn)
1846                                rcu_assign_pointer(pn->right, child);
1847                        else if (pn_l == fn)
1848                                rcu_assign_pointer(pn->left, child);
1849#if RT6_DEBUG >= 2
1850                        else
1851                                WARN_ON(1);
1852#endif
1853                        if (child)
1854                                rcu_assign_pointer(child->parent, pn);
1855                        nstate = FWS_R;
1856#ifdef CONFIG_IPV6_SUBTREES
1857                }
1858#endif
1859
1860                read_lock(&net->ipv6.fib6_walker_lock);
1861                FOR_WALKERS(net, w) {
1862                        if (!child) {
1863                                if (w->node == fn) {
1864                                        RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate);
1865                                        w->node = pn;
1866                                        w->state = nstate;
1867                                }
1868                        } else {
1869                                if (w->node == fn) {
1870                                        w->node = child;
1871                                        if (children&2) {
1872                                                RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state);
1873                                                w->state = w->state >= FWS_R ? FWS_U : FWS_INIT;
1874                                        } else {
1875                                                RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state);
1876                                                w->state = w->state >= FWS_C ? FWS_U : FWS_INIT;
1877                                        }
1878                                }
1879                        }
1880                }
1881                read_unlock(&net->ipv6.fib6_walker_lock);
1882
1883                node_free(net, fn);
1884                if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn))
1885                        return pn;
1886
1887                RCU_INIT_POINTER(pn->leaf, NULL);
1888                fib6_info_release(pn_leaf);
1889                fn = pn;
1890        }
1891}
1892
1893static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
1894                           struct fib6_info __rcu **rtp, struct nl_info *info)
1895{
1896        struct fib6_info *leaf, *replace_rt = NULL;
1897        struct fib6_walker *w;
1898        struct fib6_info *rt = rcu_dereference_protected(*rtp,
1899                                    lockdep_is_held(&table->tb6_lock));
1900        struct net *net = info->nl_net;
1901        bool notify_del = false;
1902
1903        RT6_TRACE("fib6_del_route\n");
1904
1905        /* If the deleted route is the first in the node and it is not part of
1906         * a multipath route, then we need to replace it with the next route
1907         * in the node, if exists.
1908         */
1909        leaf = rcu_dereference_protected(fn->leaf,
1910                                         lockdep_is_held(&table->tb6_lock));
1911        if (leaf == rt && !rt->fib6_nsiblings) {
1912                if (rcu_access_pointer(rt->fib6_next))
1913                        replace_rt = rcu_dereference_protected(rt->fib6_next,
1914                                            lockdep_is_held(&table->tb6_lock));
1915                else
1916                        notify_del = true;
1917        }
1918
1919        /* Unlink it */
1920        *rtp = rt->fib6_next;
1921        rt->fib6_node = NULL;
1922        net->ipv6.rt6_stats->fib_rt_entries--;
1923        net->ipv6.rt6_stats->fib_discarded_routes++;
1924
1925        /* Flush all cached dst in exception table */
1926        rt6_flush_exceptions(rt);
1927
1928        /* Reset round-robin state, if necessary */
1929        if (rcu_access_pointer(fn->rr_ptr) == rt)
1930                fn->rr_ptr = NULL;
1931
1932        /* Remove this entry from other siblings */
1933        if (rt->fib6_nsiblings) {
1934                struct fib6_info *sibling, *next_sibling;
1935
1936                /* The route is deleted from a multipath route. If this
1937                 * multipath route is the first route in the node, then we need
1938                 * to emit a delete notification. Otherwise, we need to skip
1939                 * the notification.
1940                 */
1941                if (rt->fib6_metric == leaf->fib6_metric &&
1942                    rt6_qualify_for_ecmp(leaf))
1943                        notify_del = true;
1944                list_for_each_entry_safe(sibling, next_sibling,
1945                                         &rt->fib6_siblings, fib6_siblings)
1946                        sibling->fib6_nsiblings--;
1947                rt->fib6_nsiblings = 0;
1948                list_del_init(&rt->fib6_siblings);
1949                rt6_multipath_rebalance(next_sibling);
1950        }
1951
1952        /* Adjust walkers */
1953        read_lock(&net->ipv6.fib6_walker_lock);
1954        FOR_WALKERS(net, w) {
1955                if (w->state == FWS_C && w->leaf == rt) {
1956                        RT6_TRACE("walker %p adjusted by delroute\n", w);
1957                        w->leaf = rcu_dereference_protected(rt->fib6_next,
1958                                            lockdep_is_held(&table->tb6_lock));
1959                        if (!w->leaf)
1960                                w->state = FWS_U;
1961                }
1962        }
1963        read_unlock(&net->ipv6.fib6_walker_lock);
1964
1965        /* If it was last route, call fib6_repair_tree() to:
1966         * 1. For root node, put back null_entry as how the table was created.
1967         * 2. For other nodes, expunge its radix tree node.
1968         */
1969        if (!rcu_access_pointer(fn->leaf)) {
1970                if (!(fn->fn_flags & RTN_TL_ROOT)) {
1971                        fn->fn_flags &= ~RTN_RTINFO;
1972                        net->ipv6.rt6_stats->fib_route_nodes--;
1973                }
1974                fn = fib6_repair_tree(net, table, fn);
1975        }
1976
1977        fib6_purge_rt(rt, fn, net);
1978
1979        if (!info->skip_notify_kernel) {
1980                if (notify_del)
1981                        call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL,
1982                                                  rt, NULL);
1983                else if (replace_rt)
1984                        call_fib6_entry_notifiers_replace(net, replace_rt);
1985        }
1986        if (!info->skip_notify)
1987                inet6_rt_notify(RTM_DELROUTE, rt, info, 0);
1988
1989        fib6_info_release(rt);
1990}
1991
1992/* Need to own table->tb6_lock */
1993int fib6_del(struct fib6_info *rt, struct nl_info *info)
1994{
1995        struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node,
1996                                    lockdep_is_held(&rt->fib6_table->tb6_lock));
1997        struct fib6_table *table = rt->fib6_table;
1998        struct net *net = info->nl_net;
1999        struct fib6_info __rcu **rtp;
2000        struct fib6_info __rcu **rtp_next;
2001
2002        if (!fn || rt == net->ipv6.fib6_null_entry)
2003                return -ENOENT;
2004
2005        WARN_ON(!(fn->fn_flags & RTN_RTINFO));
2006
2007        /*
2008         *      Walk the leaf entries looking for ourself
2009         */
2010
2011        for (rtp = &fn->leaf; *rtp; rtp = rtp_next) {
2012                struct fib6_info *cur = rcu_dereference_protected(*rtp,
2013                                        lockdep_is_held(&table->tb6_lock));
2014                if (rt == cur) {
2015                        if (fib6_requires_src(cur))
2016                                fib6_routes_require_src_dec(info->nl_net);
2017                        fib6_del_route(table, fn, rtp, info);
2018                        return 0;
2019                }
2020                rtp_next = &cur->fib6_next;
2021        }
2022        return -ENOENT;
2023}
2024
2025/*
2026 *      Tree traversal function.
2027 *
2028 *      Certainly, it is not interrupt safe.
2029 *      However, it is internally reenterable wrt itself and fib6_add/fib6_del.
2030 *      It means, that we can modify tree during walking
2031 *      and use this function for garbage collection, clone pruning,
2032 *      cleaning tree when a device goes down etc. etc.
2033 *
2034 *      It guarantees that every node will be traversed,
2035 *      and that it will be traversed only once.
2036 *
2037 *      Callback function w->func may return:
2038 *      0 -> continue walking.
2039 *      positive value -> walking is suspended (used by tree dumps,
2040 *      and probably by gc, if it will be split to several slices)
2041 *      negative value -> terminate walking.
2042 *
2043 *      The function itself returns:
2044 *      0   -> walk is complete.
2045 *      >0  -> walk is incomplete (i.e. suspended)
2046 *      <0  -> walk is terminated by an error.
2047 *
2048 *      This function is called with tb6_lock held.
2049 */
2050
2051static int fib6_walk_continue(struct fib6_walker *w)
2052{
2053        struct fib6_node *fn, *pn, *left, *right;
2054
2055        /* w->root should always be table->tb6_root */
2056        WARN_ON_ONCE(!(w->root->fn_flags & RTN_TL_ROOT));
2057
2058        for (;;) {
2059                fn = w->node;
2060                if (!fn)
2061                        return 0;
2062
2063                switch (w->state) {
2064#ifdef CONFIG_IPV6_SUBTREES
2065                case FWS_S:
2066                        if (FIB6_SUBTREE(fn)) {
2067                                w->node = FIB6_SUBTREE(fn);
2068                                continue;
2069                        }
2070                        w->state = FWS_L;
2071#endif
2072                        /* fall through */
2073                case FWS_L:
2074                        left = rcu_dereference_protected(fn->left, 1);
2075                        if (left) {
2076                                w->node = left;
2077                                w->state = FWS_INIT;
2078                                continue;
2079                        }
2080                        w->state = FWS_R;
2081                        /* fall through */
2082                case FWS_R:
2083                        right = rcu_dereference_protected(fn->right, 1);
2084                        if (right) {
2085                                w->node = right;
2086                                w->state = FWS_INIT;
2087                                continue;
2088                        }
2089                        w->state = FWS_C;
2090                        w->leaf = rcu_dereference_protected(fn->leaf, 1);
2091                        /* fall through */
2092                case FWS_C:
2093                        if (w->leaf && fn->fn_flags & RTN_RTINFO) {
2094                                int err;
2095
2096                                if (w->skip) {
2097                                        w->skip--;
2098                                        goto skip;
2099                                }
2100
2101                                err = w->func(w);
2102                                if (err)
2103                                        return err;
2104
2105                                w->count++;
2106                                continue;
2107                        }
2108skip:
2109                        w->state = FWS_U;
2110                        /* fall through */
2111                case FWS_U:
2112                        if (fn == w->root)
2113                                return 0;
2114                        pn = rcu_dereference_protected(fn->parent, 1);
2115                        left = rcu_dereference_protected(pn->left, 1);
2116                        right = rcu_dereference_protected(pn->right, 1);
2117                        w->node = pn;
2118#ifdef CONFIG_IPV6_SUBTREES
2119                        if (FIB6_SUBTREE(pn) == fn) {
2120                                WARN_ON(!(fn->fn_flags & RTN_ROOT));
2121                                w->state = FWS_L;
2122                                continue;
2123                        }
2124#endif
2125                        if (left == fn) {
2126                                w->state = FWS_R;
2127                                continue;
2128                        }
2129                        if (right == fn) {
2130                                w->state = FWS_C;
2131                                w->leaf = rcu_dereference_protected(w->node->leaf, 1);
2132                                continue;
2133                        }
2134#if RT6_DEBUG >= 2
2135                        WARN_ON(1);
2136#endif
2137                }
2138        }
2139}
2140
2141static int fib6_walk(struct net *net, struct fib6_walker *w)
2142{
2143        int res;
2144
2145        w->state = FWS_INIT;
2146        w->node = w->root;
2147
2148        fib6_walker_link(net, w);
2149        res = fib6_walk_continue(w);
2150        if (res <= 0)
2151                fib6_walker_unlink(net, w);
2152        return res;
2153}
2154
2155static int fib6_clean_node(struct fib6_walker *w)
2156{
2157        int res;
2158        struct fib6_info *rt;
2159        struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w);
2160        struct nl_info info = {
2161                .nl_net = c->net,
2162                .skip_notify = c->skip_notify,
2163        };
2164
2165        if (c->sernum != FIB6_NO_SERNUM_CHANGE &&
2166            w->node->fn_sernum != c->sernum)
2167                w->node->fn_sernum = c->sernum;
2168
2169        if (!c->func) {
2170                WARN_ON_ONCE(c->sernum == FIB6_NO_SERNUM_CHANGE);
2171                w->leaf = NULL;
2172                return 0;
2173        }
2174
2175        for_each_fib6_walker_rt(w) {
2176                res = c->func(rt, c->arg);
2177                if (res == -1) {
2178                        w->leaf = rt;
2179                        res = fib6_del(rt, &info);
2180                        if (res) {
2181#if RT6_DEBUG >= 2
2182                                pr_debug("%s: del failed: rt=%p@%p err=%d\n",
2183                                         __func__, rt,
2184                                         rcu_access_pointer(rt->fib6_node),
2185                                         res);
2186#endif
2187                                continue;
2188                        }
2189                        return 0;
2190                } else if (res == -2) {
2191                        if (WARN_ON(!rt->fib6_nsiblings))
2192                                continue;
2193                        rt = list_last_entry(&rt->fib6_siblings,
2194                                             struct fib6_info, fib6_siblings);
2195                        continue;
2196                }
2197                WARN_ON(res != 0);
2198        }
2199        w->leaf = rt;
2200        return 0;
2201}
2202
2203/*
2204 *      Convenient frontend to tree walker.
2205 *
2206 *      func is called on each route.
2207 *              It may return -2 -> skip multipath route.
2208 *                            -1 -> delete this route.
2209 *                            0  -> continue walking
2210 */
2211
2212static void fib6_clean_tree(struct net *net, struct fib6_node *root,
2213                            int (*func)(struct fib6_info *, void *arg),
2214                            int sernum, void *arg, bool skip_notify)
2215{
2216        struct fib6_cleaner c;
2217
2218        c.w.root = root;
2219        c.w.func = fib6_clean_node;
2220        c.w.count = 0;
2221        c.w.skip = 0;
2222        c.w.skip_in_node = 0;
2223        c.func = func;
2224        c.sernum = sernum;
2225        c.arg = arg;
2226        c.net = net;
2227        c.skip_notify = skip_notify;
2228
2229        fib6_walk(net, &c.w);
2230}
2231
2232static void __fib6_clean_all(struct net *net,
2233                             int (*func)(struct fib6_info *, void *),
2234                             int sernum, void *arg, bool skip_notify)
2235{
2236        struct fib6_table *table;
2237        struct hlist_head *head;
2238        unsigned int h;
2239
2240        rcu_read_lock();
2241        for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
2242                head = &net->ipv6.fib_table_hash[h];
2243                hlist_for_each_entry_rcu(table, head, tb6_hlist) {
2244                        spin_lock_bh(&table->tb6_lock);
2245                        fib6_clean_tree(net, &table->tb6_root,
2246                                        func, sernum, arg, skip_notify);
2247                        spin_unlock_bh(&table->tb6_lock);
2248                }
2249        }
2250        rcu_read_unlock();
2251}
2252
2253void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *),
2254                    void *arg)
2255{
2256        __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, false);
2257}
2258
2259void fib6_clean_all_skip_notify(struct net *net,
2260                                int (*func)(struct fib6_info *, void *),
2261                                void *arg)
2262{
2263        __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, true);
2264}
2265
2266static void fib6_flush_trees(struct net *net)
2267{
2268        int new_sernum = fib6_new_sernum(net);
2269
2270        __fib6_clean_all(net, NULL, new_sernum, NULL, false);
2271}
2272
2273/*
2274 *      Garbage collection
2275 */
2276
2277static int fib6_age(struct fib6_info *rt, void *arg)
2278{
2279        struct fib6_gc_args *gc_args = arg;
2280        unsigned long now = jiffies;
2281
2282        /*
2283         *      check addrconf expiration here.
2284         *      Routes are expired even if they are in use.
2285         */
2286
2287        if (rt->fib6_flags & RTF_EXPIRES && rt->expires) {
2288                if (time_after(now, rt->expires)) {
2289                        RT6_TRACE("expiring %p\n", rt);
2290                        return -1;
2291                }
2292                gc_args->more++;
2293        }
2294
2295        /*      Also age clones in the exception table.
2296         *      Note, that clones are aged out
2297         *      only if they are not in use now.
2298         */
2299        rt6_age_exceptions(rt, gc_args, now);
2300
2301        return 0;
2302}
2303
2304void fib6_run_gc(unsigned long expires, struct net *net, bool force)
2305{
2306        struct fib6_gc_args gc_args;
2307        unsigned long now;
2308
2309        if (force) {
2310                spin_lock_bh(&net->ipv6.fib6_gc_lock);
2311        } else if (!spin_trylock_bh(&net->ipv6.fib6_gc_lock)) {
2312                mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ);
2313                return;
2314        }
2315        gc_args.timeout = expires ? (int)expires :
2316                          net->ipv6.sysctl.ip6_rt_gc_interval;
2317        gc_args.more = 0;
2318
2319        fib6_clean_all(net, fib6_age, &gc_args);
2320        now = jiffies;
2321        net->ipv6.ip6_rt_last_gc = now;
2322
2323        if (gc_args.more)
2324                mod_timer(&net->ipv6.ip6_fib_timer,
2325                          round_jiffies(now
2326                                        + net->ipv6.sysctl.ip6_rt_gc_interval));
2327        else
2328                del_timer(&net->ipv6.ip6_fib_timer);
2329        spin_unlock_bh(&net->ipv6.fib6_gc_lock);
2330}
2331
2332static void fib6_gc_timer_cb(struct timer_list *t)
2333{
2334        struct net *arg = from_timer(arg, t, ipv6.ip6_fib_timer);
2335
2336        fib6_run_gc(0, arg, true);
2337}
2338
2339static int __net_init fib6_net_init(struct net *net)
2340{
2341        size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ;
2342        int err;
2343
2344        err = fib6_notifier_init(net);
2345        if (err)
2346                return err;
2347
2348        spin_lock_init(&net->ipv6.fib6_gc_lock);
2349        rwlock_init(&net->ipv6.fib6_walker_lock);
2350        INIT_LIST_HEAD(&net->ipv6.fib6_walkers);
2351        timer_setup(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, 0);
2352
2353        net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL);
2354        if (!net->ipv6.rt6_stats)
2355                goto out_timer;
2356
2357        /* Avoid false sharing : Use at least a full cache line */
2358        size = max_t(size_t, size, L1_CACHE_BYTES);
2359
2360        net->ipv6.fib_table_hash = kzalloc(size, GFP_KERNEL);
2361        if (!net->ipv6.fib_table_hash)
2362                goto out_rt6_stats;
2363
2364        net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl),
2365                                          GFP_KERNEL);
2366        if (!net->ipv6.fib6_main_tbl)
2367                goto out_fib_table_hash;
2368
2369        net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN;
2370        rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf,
2371                           net->ipv6.fib6_null_entry);
2372        net->ipv6.fib6_main_tbl->tb6_root.fn_flags =
2373                RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
2374        inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers);
2375
2376#ifdef CONFIG_IPV6_MULTIPLE_TABLES
2377        net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl),
2378                                           GFP_KERNEL);
2379        if (!net->ipv6.fib6_local_tbl)
2380                goto out_fib6_main_tbl;
2381        net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL;
2382        rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf,
2383                           net->ipv6.fib6_null_entry);
2384        net->ipv6.fib6_local_tbl->tb6_root.fn_flags =
2385                RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
2386        inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers);
2387#endif
2388        fib6_tables_init(net);
2389
2390        return 0;
2391
2392#ifdef CONFIG_IPV6_MULTIPLE_TABLES
2393out_fib6_main_tbl:
2394        kfree(net->ipv6.fib6_main_tbl);
2395#endif
2396out_fib_table_hash:
2397        kfree(net->ipv6.fib_table_hash);
2398out_rt6_stats:
2399        kfree(net->ipv6.rt6_stats);
2400out_timer:
2401        fib6_notifier_exit(net);
2402        return -ENOMEM;
2403}
2404
2405static void fib6_net_exit(struct net *net)
2406{
2407        unsigned int i;
2408
2409        del_timer_sync(&net->ipv6.ip6_fib_timer);
2410
2411        for (i = 0; i < FIB6_TABLE_HASHSZ; i++) {
2412                struct hlist_head *head = &net->ipv6.fib_table_hash[i];
2413                struct hlist_node *tmp;
2414                struct fib6_table *tb;
2415
2416                hlist_for_each_entry_safe(tb, tmp, head, tb6_hlist) {
2417                        hlist_del(&tb->tb6_hlist);
2418                        fib6_free_table(tb);
2419                }
2420        }
2421
2422        kfree(net->ipv6.fib_table_hash);
2423        kfree(net->ipv6.rt6_stats);
2424        fib6_notifier_exit(net);
2425}
2426
2427static struct pernet_operations fib6_net_ops = {
2428        .init = fib6_net_init,
2429        .exit = fib6_net_exit,
2430};
2431
2432int __init fib6_init(void)
2433{
2434        int ret = -ENOMEM;
2435
2436        fib6_node_kmem = kmem_cache_create("fib6_nodes",
2437                                           sizeof(struct fib6_node),
2438                                           0, SLAB_HWCACHE_ALIGN,
2439                                           NULL);
2440        if (!fib6_node_kmem)
2441                goto out;
2442
2443        ret = register_pernet_subsys(&fib6_net_ops);
2444        if (ret)
2445                goto out_kmem_cache_create;
2446
2447        ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, NULL,
2448                                   inet6_dump_fib, 0);
2449        if (ret)
2450                goto out_unregister_subsys;
2451
2452        __fib6_flush_trees = fib6_flush_trees;
2453out:
2454        return ret;
2455
2456out_unregister_subsys:
2457        unregister_pernet_subsys(&fib6_net_ops);
2458out_kmem_cache_create:
2459        kmem_cache_destroy(fib6_node_kmem);
2460        goto out;
2461}
2462
2463void fib6_gc_cleanup(void)
2464{
2465        unregister_pernet_subsys(&fib6_net_ops);
2466        kmem_cache_destroy(fib6_node_kmem);
2467}
2468
2469#ifdef CONFIG_PROC_FS
2470static int ipv6_route_seq_show(struct seq_file *seq, void *v)
2471{
2472        struct fib6_info *rt = v;
2473        struct ipv6_route_iter *iter = seq->private;
2474        struct fib6_nh *fib6_nh = rt->fib6_nh;
2475        unsigned int flags = rt->fib6_flags;
2476        const struct net_device *dev;
2477
2478        if (rt->nh)
2479                fib6_nh = nexthop_fib6_nh(rt->nh);
2480
2481        seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen);
2482
2483#ifdef CONFIG_IPV6_SUBTREES
2484        seq_printf(seq, "%pi6 %02x ", &rt->fib6_src.addr, rt->fib6_src.plen);
2485#else
2486        seq_puts(seq, "00000000000000000000000000000000 00 ");
2487#endif
2488        if (fib6_nh->fib_nh_gw_family) {
2489                flags |= RTF_GATEWAY;
2490                seq_printf(seq, "%pi6", &fib6_nh->fib_nh_gw6);
2491        } else {
2492                seq_puts(seq, "00000000000000000000000000000000");
2493        }
2494
2495        dev = fib6_nh->fib_nh_dev;
2496        seq_printf(seq, " %08x %08x %08x %08x %8s\n",
2497                   rt->fib6_metric, refcount_read(&rt->fib6_ref), 0,
2498                   flags, dev ? dev->name : "");
2499        iter->w.leaf = NULL;
2500        return 0;
2501}
2502
2503static int ipv6_route_yield(struct fib6_walker *w)
2504{
2505        struct ipv6_route_iter *iter = w->args;
2506
2507        if (!iter->skip)
2508                return 1;
2509
2510        do {
2511                iter->w.leaf = rcu_dereference_protected(
2512                                iter->w.leaf->fib6_next,
2513                                lockdep_is_held(&iter->tbl->tb6_lock));
2514                iter->skip--;
2515                if (!iter->skip && iter->w.leaf)
2516                        return 1;
2517        } while (iter->w.leaf);
2518
2519        return 0;
2520}
2521
2522static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter,
2523                                      struct net *net)
2524{
2525        memset(&iter->w, 0, sizeof(iter->w));
2526        iter->w.func = ipv6_route_yield;
2527        iter->w.root = &iter->tbl->tb6_root;
2528        iter->w.state = FWS_INIT;
2529        iter->w.node = iter->w.root;
2530        iter->w.args = iter;
2531        iter->sernum = iter->w.root->fn_sernum;
2532        INIT_LIST_HEAD(&iter->w.lh);
2533        fib6_walker_link(net, &iter->w);
2534}
2535
2536static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl,
2537                                                    struct net *net)
2538{
2539        unsigned int h;
2540        struct hlist_node *node;
2541
2542        if (tbl) {
2543                h = (tbl->tb6_id & (FIB6_TABLE_HASHSZ - 1)) + 1;
2544                node = rcu_dereference_bh(hlist_next_rcu(&tbl->tb6_hlist));
2545        } else {
2546                h = 0;
2547                node = NULL;
2548        }
2549
2550        while (!node && h < FIB6_TABLE_HASHSZ) {
2551                node = rcu_dereference_bh(
2552                        hlist_first_rcu(&net->ipv6.fib_table_hash[h++]));
2553        }
2554        return hlist_entry_safe(node, struct fib6_table, tb6_hlist);
2555}
2556
2557static void ipv6_route_check_sernum(struct ipv6_route_iter *iter)
2558{
2559        if (iter->sernum != iter->w.root->fn_sernum) {
2560                iter->sernum = iter->w.root->fn_sernum;
2561                iter->w.state = FWS_INIT;
2562                iter->w.node = iter->w.root;
2563                WARN_ON(iter->w.skip);
2564                iter->w.skip = iter->w.count;
2565        }
2566}
2567
2568static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2569{
2570        int r;
2571        struct fib6_info *n;
2572        struct net *net = seq_file_net(seq);
2573        struct ipv6_route_iter *iter = seq->private;
2574
2575        ++(*pos);
2576        if (!v)
2577                goto iter_table;
2578
2579        n = rcu_dereference_bh(((struct fib6_info *)v)->fib6_next);
2580        if (n)
2581                return n;
2582
2583iter_table:
2584        ipv6_route_check_sernum(iter);
2585        spin_lock_bh(&iter->tbl->tb6_lock);
2586        r = fib6_walk_continue(&iter->w);
2587        spin_unlock_bh(&iter->tbl->tb6_lock);
2588        if (r > 0) {
2589                return iter->w.leaf;
2590        } else if (r < 0) {
2591                fib6_walker_unlink(net, &iter->w);
2592                return NULL;
2593        }
2594        fib6_walker_unlink(net, &iter->w);
2595
2596        iter->tbl = ipv6_route_seq_next_table(iter->tbl, net);
2597        if (!iter->tbl)
2598                return NULL;
2599
2600        ipv6_route_seq_setup_walk(iter, net);
2601        goto iter_table;
2602}
2603
2604static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos)
2605        __acquires(RCU_BH)
2606{
2607        struct net *net = seq_file_net(seq);
2608        struct ipv6_route_iter *iter = seq->private;
2609
2610        rcu_read_lock_bh();
2611        iter->tbl = ipv6_route_seq_next_table(NULL, net);
2612        iter->skip = *pos;
2613
2614        if (iter->tbl) {
2615                ipv6_route_seq_setup_walk(iter, net);
2616                return ipv6_route_seq_next(seq, NULL, pos);
2617        } else {
2618                return NULL;
2619        }
2620}
2621
2622static bool ipv6_route_iter_active(struct ipv6_route_iter *iter)
2623{
2624        struct fib6_walker *w = &iter->w;
2625        return w->node && !(w->state == FWS_U && w->node == w->root);
2626}
2627
2628static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
2629        __releases(RCU_BH)
2630{
2631        struct net *net = seq_file_net(seq);
2632        struct ipv6_route_iter *iter = seq->private;
2633
2634        if (ipv6_route_iter_active(iter))
2635                fib6_walker_unlink(net, &iter->w);
2636
2637        rcu_read_unlock_bh();
2638}
2639
2640const struct seq_operations ipv6_route_seq_ops = {
2641        .start  = ipv6_route_seq_start,
2642        .next   = ipv6_route_seq_next,
2643        .stop   = ipv6_route_seq_stop,
2644        .show   = ipv6_route_seq_show
2645};
2646#endif /* CONFIG_PROC_FS */
2647