linux/net/ipv4/fib_hash.c
<<
>>
Prefs
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              IPv4 FIB: lookup engine and maintenance routines.
   7 *
   8 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
   9 *
  10 *              This program is free software; you can redistribute it and/or
  11 *              modify it under the terms of the GNU General Public License
  12 *              as published by the Free Software Foundation; either version
  13 *              2 of the License, or (at your option) any later version.
  14 */
  15
  16#include <asm/uaccess.h>
  17#include <asm/system.h>
  18#include <linux/bitops.h>
  19#include <linux/types.h>
  20#include <linux/kernel.h>
  21#include <linux/mm.h>
  22#include <linux/string.h>
  23#include <linux/socket.h>
  24#include <linux/sockios.h>
  25#include <linux/errno.h>
  26#include <linux/in.h>
  27#include <linux/inet.h>
  28#include <linux/inetdevice.h>
  29#include <linux/netdevice.h>
  30#include <linux/if_arp.h>
  31#include <linux/proc_fs.h>
  32#include <linux/skbuff.h>
  33#include <linux/netlink.h>
  34#include <linux/init.h>
  35#include <linux/slab.h>
  36
  37#include <net/net_namespace.h>
  38#include <net/ip.h>
  39#include <net/protocol.h>
  40#include <net/route.h>
  41#include <net/tcp.h>
  42#include <net/sock.h>
  43#include <net/ip_fib.h>
  44
  45#include "fib_lookup.h"
  46
  47static struct kmem_cache *fn_hash_kmem __read_mostly;
  48static struct kmem_cache *fn_alias_kmem __read_mostly;
  49
  50struct fib_node {
  51        struct hlist_node       fn_hash;
  52        struct list_head        fn_alias;
  53        __be32                  fn_key;
  54        struct fib_alias        fn_embedded_alias;
  55};
  56
  57#define EMBEDDED_HASH_SIZE (L1_CACHE_BYTES / sizeof(struct hlist_head))
  58
  59struct fn_zone {
  60        struct fn_zone __rcu    *fz_next;       /* Next not empty zone  */
  61        struct hlist_head __rcu *fz_hash;       /* Hash table pointer   */
  62        seqlock_t               fz_lock;
  63        u32                     fz_hashmask;    /* (fz_divisor - 1)     */
  64
  65        u8                      fz_order;       /* Zone order (0..32)   */
  66        u8                      fz_revorder;    /* 32 - fz_order        */
  67        __be32                  fz_mask;        /* inet_make_mask(order) */
  68#define FZ_MASK(fz)             ((fz)->fz_mask)
  69
  70        struct hlist_head       fz_embedded_hash[EMBEDDED_HASH_SIZE];
  71
  72        int                     fz_nent;        /* Number of entries    */
  73        int                     fz_divisor;     /* Hash size (mask+1)   */
  74};
  75
  76struct fn_hash {
  77        struct fn_zone          *fn_zones[33];
  78        struct fn_zone __rcu    *fn_zone_list;
  79};
  80
  81static inline u32 fn_hash(__be32 key, struct fn_zone *fz)
  82{
  83        u32 h = ntohl(key) >> fz->fz_revorder;
  84        h ^= (h>>20);
  85        h ^= (h>>10);
  86        h ^= (h>>5);
  87        h &= fz->fz_hashmask;
  88        return h;
  89}
  90
  91static inline __be32 fz_key(__be32 dst, struct fn_zone *fz)
  92{
  93        return dst & FZ_MASK(fz);
  94}
  95
  96static unsigned int fib_hash_genid;
  97
  98#define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head))
  99
 100static struct hlist_head *fz_hash_alloc(int divisor)
 101{
 102        unsigned long size = divisor * sizeof(struct hlist_head);
 103
 104        if (size <= PAGE_SIZE)
 105                return kzalloc(size, GFP_KERNEL);
 106
 107        return (struct hlist_head *)
 108                __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size));
 109}
 110
 111/* The fib hash lock must be held when this is called. */
 112static inline void fn_rebuild_zone(struct fn_zone *fz,
 113                                   struct hlist_head *old_ht,
 114                                   int old_divisor)
 115{
 116        int i;
 117
 118        for (i = 0; i < old_divisor; i++) {
 119                struct hlist_node *node, *n;
 120                struct fib_node *f;
 121
 122                hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) {
 123                        struct hlist_head *new_head;
 124
 125                        hlist_del_rcu(&f->fn_hash);
 126
 127                        new_head = rcu_dereference_protected(fz->fz_hash, 1) +
 128                                   fn_hash(f->fn_key, fz);
 129                        hlist_add_head_rcu(&f->fn_hash, new_head);
 130                }
 131        }
 132}
 133
 134static void fz_hash_free(struct hlist_head *hash, int divisor)
 135{
 136        unsigned long size = divisor * sizeof(struct hlist_head);
 137
 138        if (size <= PAGE_SIZE)
 139                kfree(hash);
 140        else
 141                free_pages((unsigned long)hash, get_order(size));
 142}
 143
 144static void fn_rehash_zone(struct fn_zone *fz)
 145{
 146        struct hlist_head *ht, *old_ht;
 147        int old_divisor, new_divisor;
 148        u32 new_hashmask;
 149
 150        new_divisor = old_divisor = fz->fz_divisor;
 151
 152        switch (old_divisor) {
 153        case EMBEDDED_HASH_SIZE:
 154                new_divisor *= EMBEDDED_HASH_SIZE;
 155                break;
 156        case EMBEDDED_HASH_SIZE*EMBEDDED_HASH_SIZE:
 157                new_divisor *= (EMBEDDED_HASH_SIZE/2);
 158                break;
 159        default:
 160                if ((old_divisor << 1) > FZ_MAX_DIVISOR) {
 161                        printk(KERN_CRIT "route.c: bad divisor %d!\n", old_divisor);
 162                        return;
 163                }
 164                new_divisor = (old_divisor << 1);
 165                break;
 166        }
 167
 168        new_hashmask = (new_divisor - 1);
 169
 170#if RT_CACHE_DEBUG >= 2
 171        printk(KERN_DEBUG "fn_rehash_zone: hash for zone %d grows from %d\n",
 172               fz->fz_order, old_divisor);
 173#endif
 174
 175        ht = fz_hash_alloc(new_divisor);
 176
 177        if (ht) {
 178                struct fn_zone nfz;
 179
 180                memcpy(&nfz, fz, sizeof(nfz));
 181
 182                write_seqlock_bh(&fz->fz_lock);
 183                old_ht = rcu_dereference_protected(fz->fz_hash, 1);
 184                RCU_INIT_POINTER(nfz.fz_hash, ht);
 185                nfz.fz_hashmask = new_hashmask;
 186                nfz.fz_divisor = new_divisor;
 187                fn_rebuild_zone(&nfz, old_ht, old_divisor);
 188                fib_hash_genid++;
 189                rcu_assign_pointer(fz->fz_hash, ht);
 190                fz->fz_hashmask = new_hashmask;
 191                fz->fz_divisor = new_divisor;
 192                write_sequnlock_bh(&fz->fz_lock);
 193
 194                if (old_ht != fz->fz_embedded_hash) {
 195                        synchronize_rcu();
 196                        fz_hash_free(old_ht, old_divisor);
 197                }
 198        }
 199}
 200
 201static void fn_free_node_rcu(struct rcu_head *head)
 202{
 203        struct fib_node *f = container_of(head, struct fib_node, fn_embedded_alias.rcu);
 204
 205        kmem_cache_free(fn_hash_kmem, f);
 206}
 207
 208static inline void fn_free_node(struct fib_node *f)
 209{
 210        call_rcu(&f->fn_embedded_alias.rcu, fn_free_node_rcu);
 211}
 212
 213static void fn_free_alias_rcu(struct rcu_head *head)
 214{
 215        struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
 216
 217        kmem_cache_free(fn_alias_kmem, fa);
 218}
 219
 220static inline void fn_free_alias(struct fib_alias *fa, struct fib_node *f)
 221{
 222        fib_release_info(fa->fa_info);
 223        if (fa == &f->fn_embedded_alias)
 224                fa->fa_info = NULL;
 225        else
 226                call_rcu(&fa->rcu, fn_free_alias_rcu);
 227}
 228
 229static struct fn_zone *
 230fn_new_zone(struct fn_hash *table, int z)
 231{
 232        int i;
 233        struct fn_zone *fz = kzalloc(sizeof(struct fn_zone), GFP_KERNEL);
 234        if (!fz)
 235                return NULL;
 236
 237        seqlock_init(&fz->fz_lock);
 238        fz->fz_divisor = z ? EMBEDDED_HASH_SIZE : 1;
 239        fz->fz_hashmask = fz->fz_divisor - 1;
 240        RCU_INIT_POINTER(fz->fz_hash, fz->fz_embedded_hash);
 241        fz->fz_order = z;
 242        fz->fz_revorder = 32 - z;
 243        fz->fz_mask = inet_make_mask(z);
 244
 245        /* Find the first not empty zone with more specific mask */
 246        for (i = z + 1; i <= 32; i++)
 247                if (table->fn_zones[i])
 248                        break;
 249        if (i > 32) {
 250                /* No more specific masks, we are the first. */
 251                rcu_assign_pointer(fz->fz_next,
 252                                   rtnl_dereference(table->fn_zone_list));
 253                rcu_assign_pointer(table->fn_zone_list, fz);
 254        } else {
 255                rcu_assign_pointer(fz->fz_next,
 256                                   rtnl_dereference(table->fn_zones[i]->fz_next));
 257                rcu_assign_pointer(table->fn_zones[i]->fz_next, fz);
 258        }
 259        table->fn_zones[z] = fz;
 260        fib_hash_genid++;
 261        return fz;
 262}
 263
 264int fib_table_lookup(struct fib_table *tb,
 265                     const struct flowi *flp, struct fib_result *res,
 266                     int fib_flags)
 267{
 268        int err;
 269        struct fn_zone *fz;
 270        struct fn_hash *t = (struct fn_hash *)tb->tb_data;
 271
 272        rcu_read_lock();
 273        for (fz = rcu_dereference(t->fn_zone_list);
 274             fz != NULL;
 275             fz = rcu_dereference(fz->fz_next)) {
 276                struct hlist_head *head;
 277                struct hlist_node *node;
 278                struct fib_node *f;
 279                __be32 k;
 280                unsigned int seq;
 281
 282                do {
 283                        seq = read_seqbegin(&fz->fz_lock);
 284                        k = fz_key(flp->fl4_dst, fz);
 285
 286                        head = rcu_dereference(fz->fz_hash) + fn_hash(k, fz);
 287                        hlist_for_each_entry_rcu(f, node, head, fn_hash) {
 288                                if (f->fn_key != k)
 289                                        continue;
 290
 291                                err = fib_semantic_match(&f->fn_alias,
 292                                                 flp, res,
 293                                                 fz->fz_order, fib_flags);
 294                                if (err <= 0)
 295                                        goto out;
 296                        }
 297                } while (read_seqretry(&fz->fz_lock, seq));
 298        }
 299        err = 1;
 300out:
 301        rcu_read_unlock();
 302        return err;
 303}
 304
 305void fib_table_select_default(struct fib_table *tb,
 306                              const struct flowi *flp, struct fib_result *res)
 307{
 308        int order, last_idx;
 309        struct hlist_node *node;
 310        struct fib_node *f;
 311        struct fib_info *fi = NULL;
 312        struct fib_info *last_resort;
 313        struct fn_hash *t = (struct fn_hash *)tb->tb_data;
 314        struct fn_zone *fz = t->fn_zones[0];
 315        struct hlist_head *head;
 316
 317        if (fz == NULL)
 318                return;
 319
 320        last_idx = -1;
 321        last_resort = NULL;
 322        order = -1;
 323
 324        rcu_read_lock();
 325        head = rcu_dereference(fz->fz_hash);
 326        hlist_for_each_entry_rcu(f, node, head, fn_hash) {
 327                struct fib_alias *fa;
 328
 329                list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) {
 330                        struct fib_info *next_fi = fa->fa_info;
 331
 332                        if (fa->fa_scope != res->scope ||
 333                            fa->fa_type != RTN_UNICAST)
 334                                continue;
 335
 336                        if (next_fi->fib_priority > res->fi->fib_priority)
 337                                break;
 338                        if (!next_fi->fib_nh[0].nh_gw ||
 339                            next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
 340                                continue;
 341
 342                        fib_alias_accessed(fa);
 343
 344                        if (fi == NULL) {
 345                                if (next_fi != res->fi)
 346                                        break;
 347                        } else if (!fib_detect_death(fi, order, &last_resort,
 348                                                &last_idx, tb->tb_default)) {
 349                                fib_result_assign(res, fi);
 350                                tb->tb_default = order;
 351                                goto out;
 352                        }
 353                        fi = next_fi;
 354                        order++;
 355                }
 356        }
 357
 358        if (order <= 0 || fi == NULL) {
 359                tb->tb_default = -1;
 360                goto out;
 361        }
 362
 363        if (!fib_detect_death(fi, order, &last_resort, &last_idx,
 364                                tb->tb_default)) {
 365                fib_result_assign(res, fi);
 366                tb->tb_default = order;
 367                goto out;
 368        }
 369
 370        if (last_idx >= 0)
 371                fib_result_assign(res, last_resort);
 372        tb->tb_default = last_idx;
 373out:
 374        rcu_read_unlock();
 375}
 376
 377/* Insert node F to FZ. */
 378static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f)
 379{
 380        struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(f->fn_key, fz);
 381
 382        hlist_add_head_rcu(&f->fn_hash, head);
 383}
 384
 385/* Return the node in FZ matching KEY. */
 386static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key)
 387{
 388        struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(key, fz);
 389        struct hlist_node *node;
 390        struct fib_node *f;
 391
 392        hlist_for_each_entry_rcu(f, node, head, fn_hash) {
 393                if (f->fn_key == key)
 394                        return f;
 395        }
 396
 397        return NULL;
 398}
 399
 400
 401static struct fib_alias *fib_fast_alloc(struct fib_node *f)
 402{
 403        struct fib_alias *fa = &f->fn_embedded_alias;
 404
 405        if (fa->fa_info != NULL)
 406                fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
 407        return fa;
 408}
 409
 410/* Caller must hold RTNL. */
 411int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 412{
 413        struct fn_hash *table = (struct fn_hash *) tb->tb_data;
 414        struct fib_node *new_f = NULL;
 415        struct fib_node *f;
 416        struct fib_alias *fa, *new_fa;
 417        struct fn_zone *fz;
 418        struct fib_info *fi;
 419        u8 tos = cfg->fc_tos;
 420        __be32 key;
 421        int err;
 422
 423        if (cfg->fc_dst_len > 32)
 424                return -EINVAL;
 425
 426        fz = table->fn_zones[cfg->fc_dst_len];
 427        if (!fz && !(fz = fn_new_zone(table, cfg->fc_dst_len)))
 428                return -ENOBUFS;
 429
 430        key = 0;
 431        if (cfg->fc_dst) {
 432                if (cfg->fc_dst & ~FZ_MASK(fz))
 433                        return -EINVAL;
 434                key = fz_key(cfg->fc_dst, fz);
 435        }
 436
 437        fi = fib_create_info(cfg);
 438        if (IS_ERR(fi))
 439                return PTR_ERR(fi);
 440
 441        if (fz->fz_nent > (fz->fz_divisor<<1) &&
 442            fz->fz_divisor < FZ_MAX_DIVISOR &&
 443            (cfg->fc_dst_len == 32 ||
 444             (1 << cfg->fc_dst_len) > fz->fz_divisor))
 445                fn_rehash_zone(fz);
 446
 447        f = fib_find_node(fz, key);
 448
 449        if (!f)
 450                fa = NULL;
 451        else
 452                fa = fib_find_alias(&f->fn_alias, tos, fi->fib_priority);
 453
 454        /* Now fa, if non-NULL, points to the first fib alias
 455         * with the same keys [prefix,tos,priority], if such key already
 456         * exists or to the node before which we will insert new one.
 457         *
 458         * If fa is NULL, we will need to allocate a new one and
 459         * insert to the head of f.
 460         *
 461         * If f is NULL, no fib node matched the destination key
 462         * and we need to allocate a new one of those as well.
 463         */
 464
 465        if (fa && fa->fa_tos == tos &&
 466            fa->fa_info->fib_priority == fi->fib_priority) {
 467                struct fib_alias *fa_first, *fa_match;
 468
 469                err = -EEXIST;
 470                if (cfg->fc_nlflags & NLM_F_EXCL)
 471                        goto out;
 472
 473                /* We have 2 goals:
 474                 * 1. Find exact match for type, scope, fib_info to avoid
 475                 * duplicate routes
 476                 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
 477                 */
 478                fa_match = NULL;
 479                fa_first = fa;
 480                fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
 481                list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
 482                        if (fa->fa_tos != tos)
 483                                break;
 484                        if (fa->fa_info->fib_priority != fi->fib_priority)
 485                                break;
 486                        if (fa->fa_type == cfg->fc_type &&
 487                            fa->fa_scope == cfg->fc_scope &&
 488                            fa->fa_info == fi) {
 489                                fa_match = fa;
 490                                break;
 491                        }
 492                }
 493
 494                if (cfg->fc_nlflags & NLM_F_REPLACE) {
 495                        u8 state;
 496
 497                        fa = fa_first;
 498                        if (fa_match) {
 499                                if (fa == fa_match)
 500                                        err = 0;
 501                                goto out;
 502                        }
 503                        err = -ENOBUFS;
 504                        new_fa = fib_fast_alloc(f);
 505                        if (new_fa == NULL)
 506                                goto out;
 507
 508                        new_fa->fa_tos = fa->fa_tos;
 509                        new_fa->fa_info = fi;
 510                        new_fa->fa_type = cfg->fc_type;
 511                        new_fa->fa_scope = cfg->fc_scope;
 512                        state = fa->fa_state;
 513                        new_fa->fa_state = state & ~FA_S_ACCESSED;
 514                        fib_hash_genid++;
 515                        list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
 516
 517                        fn_free_alias(fa, f);
 518                        if (state & FA_S_ACCESSED)
 519                                rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
 520                        rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len,
 521                                  tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
 522                        return 0;
 523                }
 524
 525                /* Error if we find a perfect match which
 526                 * uses the same scope, type, and nexthop
 527                 * information.
 528                 */
 529                if (fa_match)
 530                        goto out;
 531
 532                if (!(cfg->fc_nlflags & NLM_F_APPEND))
 533                        fa = fa_first;
 534        }
 535
 536        err = -ENOENT;
 537        if (!(cfg->fc_nlflags & NLM_F_CREATE))
 538                goto out;
 539
 540        err = -ENOBUFS;
 541
 542        if (!f) {
 543                new_f = kmem_cache_zalloc(fn_hash_kmem, GFP_KERNEL);
 544                if (new_f == NULL)
 545                        goto out;
 546
 547                INIT_HLIST_NODE(&new_f->fn_hash);
 548                INIT_LIST_HEAD(&new_f->fn_alias);
 549                new_f->fn_key = key;
 550                f = new_f;
 551        }
 552
 553        new_fa = fib_fast_alloc(f);
 554        if (new_fa == NULL)
 555                goto out;
 556
 557        new_fa->fa_info = fi;
 558        new_fa->fa_tos = tos;
 559        new_fa->fa_type = cfg->fc_type;
 560        new_fa->fa_scope = cfg->fc_scope;
 561        new_fa->fa_state = 0;
 562
 563        /*
 564         * Insert new entry to the list.
 565         */
 566
 567        if (new_f)
 568                fib_insert_node(fz, new_f);
 569        list_add_tail_rcu(&new_fa->fa_list,
 570                 (fa ? &fa->fa_list : &f->fn_alias));
 571        fib_hash_genid++;
 572
 573        if (new_f)
 574                fz->fz_nent++;
 575        rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
 576
 577        rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len, tb->tb_id,
 578                  &cfg->fc_nlinfo, 0);
 579        return 0;
 580
 581out:
 582        if (new_f)
 583                kmem_cache_free(fn_hash_kmem, new_f);
 584        fib_release_info(fi);
 585        return err;
 586}
 587
 588int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
 589{
 590        struct fn_hash *table = (struct fn_hash *)tb->tb_data;
 591        struct fib_node *f;
 592        struct fib_alias *fa, *fa_to_delete;
 593        struct fn_zone *fz;
 594        __be32 key;
 595
 596        if (cfg->fc_dst_len > 32)
 597                return -EINVAL;
 598
 599        if ((fz  = table->fn_zones[cfg->fc_dst_len]) == NULL)
 600                return -ESRCH;
 601
 602        key = 0;
 603        if (cfg->fc_dst) {
 604                if (cfg->fc_dst & ~FZ_MASK(fz))
 605                        return -EINVAL;
 606                key = fz_key(cfg->fc_dst, fz);
 607        }
 608
 609        f = fib_find_node(fz, key);
 610
 611        if (!f)
 612                fa = NULL;
 613        else
 614                fa = fib_find_alias(&f->fn_alias, cfg->fc_tos, 0);
 615        if (!fa)
 616                return -ESRCH;
 617
 618        fa_to_delete = NULL;
 619        fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
 620        list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
 621                struct fib_info *fi = fa->fa_info;
 622
 623                if (fa->fa_tos != cfg->fc_tos)
 624                        break;
 625
 626                if ((!cfg->fc_type ||
 627                     fa->fa_type == cfg->fc_type) &&
 628                    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
 629                     fa->fa_scope == cfg->fc_scope) &&
 630                    (!cfg->fc_protocol ||
 631                     fi->fib_protocol == cfg->fc_protocol) &&
 632                    fib_nh_match(cfg, fi) == 0) {
 633                        fa_to_delete = fa;
 634                        break;
 635                }
 636        }
 637
 638        if (fa_to_delete) {
 639                int kill_fn;
 640
 641                fa = fa_to_delete;
 642                rtmsg_fib(RTM_DELROUTE, key, fa, cfg->fc_dst_len,
 643                          tb->tb_id, &cfg->fc_nlinfo, 0);
 644
 645                kill_fn = 0;
 646                list_del_rcu(&fa->fa_list);
 647                if (list_empty(&f->fn_alias)) {
 648                        hlist_del_rcu(&f->fn_hash);
 649                        kill_fn = 1;
 650                }
 651                fib_hash_genid++;
 652
 653                if (fa->fa_state & FA_S_ACCESSED)
 654                        rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
 655                fn_free_alias(fa, f);
 656                if (kill_fn) {
 657                        fn_free_node(f);
 658                        fz->fz_nent--;
 659                }
 660
 661                return 0;
 662        }
 663        return -ESRCH;
 664}
 665
 666static int fn_flush_list(struct fn_zone *fz, int idx)
 667{
 668        struct hlist_head *head = rtnl_dereference(fz->fz_hash) + idx;
 669        struct hlist_node *node, *n;
 670        struct fib_node *f;
 671        int found = 0;
 672
 673        hlist_for_each_entry_safe(f, node, n, head, fn_hash) {
 674                struct fib_alias *fa, *fa_node;
 675                int kill_f;
 676
 677                kill_f = 0;
 678                list_for_each_entry_safe(fa, fa_node, &f->fn_alias, fa_list) {
 679                        struct fib_info *fi = fa->fa_info;
 680
 681                        if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
 682                                list_del_rcu(&fa->fa_list);
 683                                if (list_empty(&f->fn_alias)) {
 684                                        hlist_del_rcu(&f->fn_hash);
 685                                        kill_f = 1;
 686                                }
 687                                fib_hash_genid++;
 688
 689                                fn_free_alias(fa, f);
 690                                found++;
 691                        }
 692                }
 693                if (kill_f) {
 694                        fn_free_node(f);
 695                        fz->fz_nent--;
 696                }
 697        }
 698        return found;
 699}
 700
 701/* caller must hold RTNL. */
 702int fib_table_flush(struct fib_table *tb)
 703{
 704        struct fn_hash *table = (struct fn_hash *) tb->tb_data;
 705        struct fn_zone *fz;
 706        int found = 0;
 707
 708        for (fz = rtnl_dereference(table->fn_zone_list);
 709             fz != NULL;
 710             fz = rtnl_dereference(fz->fz_next)) {
 711                int i;
 712
 713                for (i = fz->fz_divisor - 1; i >= 0; i--)
 714                        found += fn_flush_list(fz, i);
 715        }
 716        return found;
 717}
 718
 719void fib_free_table(struct fib_table *tb)
 720{
 721        struct fn_hash *table = (struct fn_hash *) tb->tb_data;
 722        struct fn_zone *fz, *next;
 723
 724        next = table->fn_zone_list;
 725        while (next != NULL) {
 726                fz = next;
 727                next = fz->fz_next;
 728
 729                if (fz->fz_hash != fz->fz_embedded_hash)
 730                        fz_hash_free(fz->fz_hash, fz->fz_divisor);
 731
 732                kfree(fz);
 733        }
 734
 735        kfree(tb);
 736}
 737
 738static inline int
 739fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
 740                     struct fib_table *tb,
 741                     struct fn_zone *fz,
 742                     struct hlist_head *head)
 743{
 744        struct hlist_node *node;
 745        struct fib_node *f;
 746        int i, s_i;
 747
 748        s_i = cb->args[4];
 749        i = 0;
 750        hlist_for_each_entry_rcu(f, node, head, fn_hash) {
 751                struct fib_alias *fa;
 752
 753                list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) {
 754                        if (i < s_i)
 755                                goto next;
 756
 757                        if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
 758                                          cb->nlh->nlmsg_seq,
 759                                          RTM_NEWROUTE,
 760                                          tb->tb_id,
 761                                          fa->fa_type,
 762                                          fa->fa_scope,
 763                                          f->fn_key,
 764                                          fz->fz_order,
 765                                          fa->fa_tos,
 766                                          fa->fa_info,
 767                                          NLM_F_MULTI) < 0) {
 768                                cb->args[4] = i;
 769                                return -1;
 770                        }
 771next:
 772                        i++;
 773                }
 774        }
 775        cb->args[4] = i;
 776        return skb->len;
 777}
 778
 779static inline int
 780fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
 781                   struct fib_table *tb,
 782                   struct fn_zone *fz)
 783{
 784        int h, s_h;
 785        struct hlist_head *head = rcu_dereference(fz->fz_hash);
 786
 787        if (head == NULL)
 788                return skb->len;
 789        s_h = cb->args[3];
 790        for (h = s_h; h < fz->fz_divisor; h++) {
 791                if (hlist_empty(head + h))
 792                        continue;
 793                if (fn_hash_dump_bucket(skb, cb, tb, fz, head + h) < 0) {
 794                        cb->args[3] = h;
 795                        return -1;
 796                }
 797                memset(&cb->args[4], 0,
 798                       sizeof(cb->args) - 4*sizeof(cb->args[0]));
 799        }
 800        cb->args[3] = h;
 801        return skb->len;
 802}
 803
 804int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
 805                   struct netlink_callback *cb)
 806{
 807        int m = 0, s_m;
 808        struct fn_zone *fz;
 809        struct fn_hash *table = (struct fn_hash *)tb->tb_data;
 810
 811        s_m = cb->args[2];
 812        rcu_read_lock();
 813        for (fz = rcu_dereference(table->fn_zone_list);
 814             fz != NULL;
 815             fz = rcu_dereference(fz->fz_next), m++) {
 816                if (m < s_m)
 817                        continue;
 818                if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) {
 819                        cb->args[2] = m;
 820                        rcu_read_unlock();
 821                        return -1;
 822                }
 823                memset(&cb->args[3], 0,
 824                       sizeof(cb->args) - 3*sizeof(cb->args[0]));
 825        }
 826        rcu_read_unlock();
 827        cb->args[2] = m;
 828        return skb->len;
 829}
 830
 831void __init fib_hash_init(void)
 832{
 833        fn_hash_kmem = kmem_cache_create("ip_fib_hash", sizeof(struct fib_node),
 834                                         0, SLAB_PANIC, NULL);
 835
 836        fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias),
 837                                          0, SLAB_PANIC, NULL);
 838
 839}
 840
 841struct fib_table *fib_hash_table(u32 id)
 842{
 843        struct fib_table *tb;
 844
 845        tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash),
 846                     GFP_KERNEL);
 847        if (tb == NULL)
 848                return NULL;
 849
 850        tb->tb_id = id;
 851        tb->tb_default = -1;
 852
 853        memset(tb->tb_data, 0, sizeof(struct fn_hash));
 854        return tb;
 855}
 856
 857/* ------------------------------------------------------------------------ */
 858#ifdef CONFIG_PROC_FS
 859
 860struct fib_iter_state {
 861        struct seq_net_private p;
 862        struct fn_zone  *zone;
 863        int             bucket;
 864        struct hlist_head *hash_head;
 865        struct fib_node *fn;
 866        struct fib_alias *fa;
 867        loff_t pos;
 868        unsigned int genid;
 869        int valid;
 870};
 871
 872static struct fib_alias *fib_get_first(struct seq_file *seq)
 873{
 874        struct fib_iter_state *iter = seq->private;
 875        struct fib_table *main_table;
 876        struct fn_hash *table;
 877
 878        main_table = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
 879        table = (struct fn_hash *)main_table->tb_data;
 880
 881        iter->bucket    = 0;
 882        iter->hash_head = NULL;
 883        iter->fn        = NULL;
 884        iter->fa        = NULL;
 885        iter->pos       = 0;
 886        iter->genid     = fib_hash_genid;
 887        iter->valid     = 1;
 888
 889        for (iter->zone = rcu_dereference(table->fn_zone_list);
 890             iter->zone != NULL;
 891             iter->zone = rcu_dereference(iter->zone->fz_next)) {
 892                int maxslot;
 893
 894                if (!iter->zone->fz_nent)
 895                        continue;
 896
 897                iter->hash_head = rcu_dereference(iter->zone->fz_hash);
 898                maxslot = iter->zone->fz_divisor;
 899
 900                for (iter->bucket = 0; iter->bucket < maxslot;
 901                     ++iter->bucket, ++iter->hash_head) {
 902                        struct hlist_node *node;
 903                        struct fib_node *fn;
 904
 905                        hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
 906                                struct fib_alias *fa;
 907
 908                                list_for_each_entry(fa, &fn->fn_alias, fa_list) {
 909                                        iter->fn = fn;
 910                                        iter->fa = fa;
 911                                        goto out;
 912                                }
 913                        }
 914                }
 915        }
 916out:
 917        return iter->fa;
 918}
 919
 920static struct fib_alias *fib_get_next(struct seq_file *seq)
 921{
 922        struct fib_iter_state *iter = seq->private;
 923        struct fib_node *fn;
 924        struct fib_alias *fa;
 925
 926        /* Advance FA, if any. */
 927        fn = iter->fn;
 928        fa = iter->fa;
 929        if (fa) {
 930                BUG_ON(!fn);
 931                list_for_each_entry_continue(fa, &fn->fn_alias, fa_list) {
 932                        iter->fa = fa;
 933                        goto out;
 934                }
 935        }
 936
 937        fa = iter->fa = NULL;
 938
 939        /* Advance FN. */
 940        if (fn) {
 941                struct hlist_node *node = &fn->fn_hash;
 942                hlist_for_each_entry_continue(fn, node, fn_hash) {
 943                        iter->fn = fn;
 944
 945                        list_for_each_entry(fa, &fn->fn_alias, fa_list) {
 946                                iter->fa = fa;
 947                                goto out;
 948                        }
 949                }
 950        }
 951
 952        fn = iter->fn = NULL;
 953
 954        /* Advance hash chain. */
 955        if (!iter->zone)
 956                goto out;
 957
 958        for (;;) {
 959                struct hlist_node *node;
 960                int maxslot;
 961
 962                maxslot = iter->zone->fz_divisor;
 963
 964                while (++iter->bucket < maxslot) {
 965                        iter->hash_head++;
 966
 967                        hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
 968                                list_for_each_entry(fa, &fn->fn_alias, fa_list) {
 969                                        iter->fn = fn;
 970                                        iter->fa = fa;
 971                                        goto out;
 972                                }
 973                        }
 974                }
 975
 976                iter->zone = rcu_dereference(iter->zone->fz_next);
 977
 978                if (!iter->zone)
 979                        goto out;
 980
 981                iter->bucket = 0;
 982                iter->hash_head = rcu_dereference(iter->zone->fz_hash);
 983
 984                hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
 985                        list_for_each_entry(fa, &fn->fn_alias, fa_list) {
 986                                iter->fn = fn;
 987                                iter->fa = fa;
 988                                goto out;
 989                        }
 990                }
 991        }
 992out:
 993        iter->pos++;
 994        return fa;
 995}
 996
 997static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos)
 998{
 999        struct fib_iter_state *iter = seq->private;
1000        struct fib_alias *fa;
1001
1002        if (iter->valid && pos >= iter->pos && iter->genid == fib_hash_genid) {
1003                fa   = iter->fa;
1004                pos -= iter->pos;
1005        } else
1006                fa = fib_get_first(seq);
1007
1008        if (fa)
1009                while (pos && (fa = fib_get_next(seq)))
1010                        --pos;
1011        return pos ? NULL : fa;
1012}
1013
1014static void *fib_seq_start(struct seq_file *seq, loff_t *pos)
1015        __acquires(RCU)
1016{
1017        void *v = NULL;
1018
1019        rcu_read_lock();
1020        if (fib_get_table(seq_file_net(seq), RT_TABLE_MAIN))
1021                v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
1022        return v;
1023}
1024
1025static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1026{
1027        ++*pos;
1028        return v == SEQ_START_TOKEN ? fib_get_first(seq) : fib_get_next(seq);
1029}
1030
1031static void fib_seq_stop(struct seq_file *seq, void *v)
1032        __releases(RCU)
1033{
1034        rcu_read_unlock();
1035}
1036
1037static unsigned fib_flag_trans(int type, __be32 mask, struct fib_info *fi)
1038{
1039        static const unsigned type2flags[RTN_MAX + 1] = {
1040                [7] = RTF_REJECT,
1041                [8] = RTF_REJECT,
1042        };
1043        unsigned flags = type2flags[type];
1044
1045        if (fi && fi->fib_nh->nh_gw)
1046                flags |= RTF_GATEWAY;
1047        if (mask == htonl(0xFFFFFFFF))
1048                flags |= RTF_HOST;
1049        flags |= RTF_UP;
1050        return flags;
1051}
1052
1053/*
1054 *      This outputs /proc/net/route.
1055 *
1056 *      It always works in backward compatibility mode.
1057 *      The format of the file is not supposed to be changed.
1058 */
1059static int fib_seq_show(struct seq_file *seq, void *v)
1060{
1061        struct fib_iter_state *iter;
1062        int len;
1063        __be32 prefix, mask;
1064        unsigned flags;
1065        struct fib_node *f;
1066        struct fib_alias *fa;
1067        struct fib_info *fi;
1068
1069        if (v == SEQ_START_TOKEN) {
1070                seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
1071                           "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
1072                           "\tWindow\tIRTT");
1073                goto out;
1074        }
1075
1076        iter    = seq->private;
1077        f       = iter->fn;
1078        fa      = iter->fa;
1079        fi      = fa->fa_info;
1080        prefix  = f->fn_key;
1081        mask    = FZ_MASK(iter->zone);
1082        flags   = fib_flag_trans(fa->fa_type, mask, fi);
1083        if (fi)
1084                seq_printf(seq,
1085                         "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u%n",
1086                         fi->fib_dev ? fi->fib_dev->name : "*", prefix,
1087                         fi->fib_nh->nh_gw, flags, 0, 0, fi->fib_priority,
1088                         mask, (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
1089                         fi->fib_window,
1090                         fi->fib_rtt >> 3, &len);
1091        else
1092                seq_printf(seq,
1093                         "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u%n",
1094                         prefix, 0, flags, 0, 0, 0, mask, 0, 0, 0, &len);
1095
1096        seq_printf(seq, "%*s\n", 127 - len, "");
1097out:
1098        return 0;
1099}
1100
1101static const struct seq_operations fib_seq_ops = {
1102        .start  = fib_seq_start,
1103        .next   = fib_seq_next,
1104        .stop   = fib_seq_stop,
1105        .show   = fib_seq_show,
1106};
1107
1108static int fib_seq_open(struct inode *inode, struct file *file)
1109{
1110        return seq_open_net(inode, file, &fib_seq_ops,
1111                            sizeof(struct fib_iter_state));
1112}
1113
1114static const struct file_operations fib_seq_fops = {
1115        .owner          = THIS_MODULE,
1116        .open           = fib_seq_open,
1117        .read           = seq_read,
1118        .llseek         = seq_lseek,
1119        .release        = seq_release_net,
1120};
1121
1122int __net_init fib_proc_init(struct net *net)
1123{
1124        if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_seq_fops))
1125                return -ENOMEM;
1126        return 0;
1127}
1128
1129void __net_exit fib_proc_exit(struct net *net)
1130{
1131        proc_net_remove(net, "route");
1132}
1133#endif /* CONFIG_PROC_FS */
1134