linux/net/ipv4/fib_semantics.c
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              IPv4 Forwarding Information Base: semantics.
   7 *
   8 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
   9 *
  10 *              This program is free software; you can redistribute it and/or
  11 *              modify it under the terms of the GNU General Public License
  12 *              as published by the Free Software Foundation; either version
  13 *              2 of the License, or (at your option) any later version.
  14 */
  15
  16#include <asm/uaccess.h>
  17#include <asm/system.h>
  18#include <linux/bitops.h>
  19#include <linux/types.h>
  20#include <linux/kernel.h>
  21#include <linux/jiffies.h>
  22#include <linux/mm.h>
  23#include <linux/string.h>
  24#include <linux/socket.h>
  25#include <linux/sockios.h>
  26#include <linux/errno.h>
  27#include <linux/in.h>
  28#include <linux/inet.h>
  29#include <linux/inetdevice.h>
  30#include <linux/netdevice.h>
  31#include <linux/if_arp.h>
  32#include <linux/proc_fs.h>
  33#include <linux/skbuff.h>
  34#include <linux/init.h>
  35
  36#include <net/arp.h>
  37#include <net/ip.h>
  38#include <net/protocol.h>
  39#include <net/route.h>
  40#include <net/tcp.h>
  41#include <net/sock.h>
  42#include <net/ip_fib.h>
  43#include <net/netlink.h>
  44#include <net/nexthop.h>
  45
  46#include "fib_lookup.h"
  47
  48static DEFINE_SPINLOCK(fib_info_lock);
  49static struct hlist_head *fib_info_hash;
  50static struct hlist_head *fib_info_laddrhash;
  51static unsigned int fib_hash_size;
  52static unsigned int fib_info_cnt;
  53
  54#define DEVINDEX_HASHBITS 8
  55#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
  56static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
  57
  58#ifdef CONFIG_IP_ROUTE_MULTIPATH
  59
  60static DEFINE_SPINLOCK(fib_multipath_lock);
  61
  62#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
  63for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
  64
  65#define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \
  66for (nhsel=0, nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)
  67
  68#else /* CONFIG_IP_ROUTE_MULTIPATH */
  69
   70/* Hope that gcc will optimize away the dummy loop */
  71
  72#define for_nexthops(fi) { int nhsel = 0; const struct fib_nh * nh = (fi)->fib_nh; \
  73for (nhsel=0; nhsel < 1; nhsel++)
  74
  75#define change_nexthops(fi) { int nhsel = 0; struct fib_nh * nh = (struct fib_nh *)((fi)->fib_nh); \
  76for (nhsel=0; nhsel < 1; nhsel++)
  77
  78#endif /* CONFIG_IP_ROUTE_MULTIPATH */
  79
  80#define endfor_nexthops(fi) }
  81
  82
  83static const struct
  84{
  85        int     error;
  86        u8      scope;
  87} fib_props[RTN_MAX + 1] = {
  88        {
  89                .error  = 0,
  90                .scope  = RT_SCOPE_NOWHERE,
  91        },      /* RTN_UNSPEC */
  92        {
  93                .error  = 0,
  94                .scope  = RT_SCOPE_UNIVERSE,
  95        },      /* RTN_UNICAST */
  96        {
  97                .error  = 0,
  98                .scope  = RT_SCOPE_HOST,
  99        },      /* RTN_LOCAL */
 100        {
 101                .error  = 0,
 102                .scope  = RT_SCOPE_LINK,
 103        },      /* RTN_BROADCAST */
 104        {
 105                .error  = 0,
 106                .scope  = RT_SCOPE_LINK,
 107        },      /* RTN_ANYCAST */
 108        {
 109                .error  = 0,
 110                .scope  = RT_SCOPE_UNIVERSE,
 111        },      /* RTN_MULTICAST */
 112        {
 113                .error  = -EINVAL,
 114                .scope  = RT_SCOPE_UNIVERSE,
 115        },      /* RTN_BLACKHOLE */
 116        {
 117                .error  = -EHOSTUNREACH,
 118                .scope  = RT_SCOPE_UNIVERSE,
 119        },      /* RTN_UNREACHABLE */
 120        {
 121                .error  = -EACCES,
 122                .scope  = RT_SCOPE_UNIVERSE,
 123        },      /* RTN_PROHIBIT */
 124        {
 125                .error  = -EAGAIN,
 126                .scope  = RT_SCOPE_UNIVERSE,
 127        },      /* RTN_THROW */
 128        {
 129                .error  = -EINVAL,
 130                .scope  = RT_SCOPE_NOWHERE,
 131        },      /* RTN_NAT */
 132        {
 133                .error  = -EINVAL,
 134                .scope  = RT_SCOPE_NOWHERE,
 135        },      /* RTN_XRESOLVE */
 136};
 137
 138
 139/* Release a nexthop info record */
 140
 141void free_fib_info(struct fib_info *fi)
 142{
 143        if (fi->fib_dead == 0) {
 144                printk(KERN_WARNING "Freeing alive fib_info %p\n", fi);
 145                return;
 146        }
 147        change_nexthops(fi) {
 148                if (nh->nh_dev)
 149                        dev_put(nh->nh_dev);
 150                nh->nh_dev = NULL;
 151        } endfor_nexthops(fi);
 152        fib_info_cnt--;
 153        release_net(fi->fib_net);
 154        kfree(fi);
 155}
 156
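     /*
      * Drop one tree reference on a fib_info.  When the last tree
      * reference goes away the entry is unhashed, marked dead and its
      * refcount dropped, which eventually frees it.
      */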
 157void fib_release_info(struct fib_info *fi)
 158{
 159        spin_lock_bh(&fib_info_lock);
 160        if (fi && --fi->fib_treeref == 0) {
 161                hlist_del(&fi->fib_hash);
 162                if (fi->fib_prefsrc)
 163                        hlist_del(&fi->fib_lhash);
 164                change_nexthops(fi) {
 165                        if (!nh->nh_dev)
 166                                continue;
 167                        hlist_del(&nh->nh_hash);
 168                } endfor_nexthops(fi)
 169                fi->fib_dead = 1;
 170                fib_info_put(fi);
 171        }
 172        spin_unlock_bh(&fib_info_lock);
 173}
 174
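     /*
      * Compare the nexthop arrays of two fib_infos.  Returns 0 when every
      * nexthop matches (oif, gateway, scope, weight, classid and all flags
      * except RTNH_F_DEAD), -1 otherwise.
      */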
 175static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
 176{
 177        const struct fib_nh *onh = ofi->fib_nh;
 178
 179        for_nexthops(fi) {
 180                if (nh->nh_oif != onh->nh_oif ||
 181                    nh->nh_gw  != onh->nh_gw ||
 182                    nh->nh_scope != onh->nh_scope ||
 183#ifdef CONFIG_IP_ROUTE_MULTIPATH
 184                    nh->nh_weight != onh->nh_weight ||
 185#endif
 186#ifdef CONFIG_NET_CLS_ROUTE
 187                    nh->nh_tclassid != onh->nh_tclassid ||
 188#endif
 189                    ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
 190                        return -1;
 191                onh++;
 192        } endfor_nexthops(fi);
 193        return 0;
 194}
 195
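     /* Hash a device ifindex into a DEVINDEX_HASHSIZE-sized bucket. */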
 196static inline unsigned int fib_devindex_hashfn(unsigned int val)
 197{
 198        unsigned int mask = DEVINDEX_HASHSIZE - 1;
 199
 200        return (val ^
 201                (val >> DEVINDEX_HASHBITS) ^
 202                (val >> (DEVINDEX_HASHBITS * 2))) & mask;
 203}
 204
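     /* Hash the identifying fields of a fib_info into fib_info_hash. */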
 205static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
 206{
 207        unsigned int mask = (fib_hash_size - 1);
 208        unsigned int val = fi->fib_nhs;
 209
 210        val ^= fi->fib_protocol;
 211        val ^= (__force u32)fi->fib_prefsrc;
 212        val ^= fi->fib_priority;
 213        for_nexthops(fi) {
 214                val ^= fib_devindex_hashfn(nh->nh_oif);
 215        } endfor_nexthops(fi)
 216
 217        return (val ^ (val >> 7) ^ (val >> 12)) & mask;
 218}
 219
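     /*
      * Look for an existing fib_info that is semantically identical to
      * @nfi (same namespace, protocol, preferred source, priority, metrics,
      * flags and nexthops).  Returns it, or NULL if none exists.
      */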
 220static struct fib_info *fib_find_info(const struct fib_info *nfi)
 221{
 222        struct hlist_head *head;
 223        struct hlist_node *node;
 224        struct fib_info *fi;
 225        unsigned int hash;
 226
 227        hash = fib_info_hashfn(nfi);
 228        head = &fib_info_hash[hash];
 229
 230        hlist_for_each_entry(fi, node, head, fib_hash) {
 231                if (fi->fib_net != nfi->fib_net)
 232                        continue;
 233                if (fi->fib_nhs != nfi->fib_nhs)
 234                        continue;
 235                if (nfi->fib_protocol == fi->fib_protocol &&
 236                    nfi->fib_prefsrc == fi->fib_prefsrc &&
 237                    nfi->fib_priority == fi->fib_priority &&
 238                    memcmp(nfi->fib_metrics, fi->fib_metrics,
 239                           sizeof(fi->fib_metrics)) == 0 &&
 240                    ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
 241                    (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
 242                        return fi;
 243        }
 244
 245        return NULL;
 246}
 247
  248/* Check that the gateway is already configured.
  249   Used only by the redirect accept routine.
  250 */
 251
 252int ip_fib_check_default(__be32 gw, struct net_device *dev)
 253{
 254        struct hlist_head *head;
 255        struct hlist_node *node;
 256        struct fib_nh *nh;
 257        unsigned int hash;
 258
 259        spin_lock(&fib_info_lock);
 260
 261        hash = fib_devindex_hashfn(dev->ifindex);
 262        head = &fib_info_devhash[hash];
 263        hlist_for_each_entry(nh, node, head, nh_hash) {
 264                if (nh->nh_dev == dev &&
 265                    nh->nh_gw == gw &&
 266                    !(nh->nh_flags&RTNH_F_DEAD)) {
 267                        spin_unlock(&fib_info_lock);
 268                        return 0;
 269                }
 270        }
 271
 272        spin_unlock(&fib_info_lock);
 273
 274        return -1;
 275}
 276
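     /*
      * Upper bound on the netlink message size needed to dump this
      * fib_info; used to size the skb allocated in rtmsg_fib().
      */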
 277static inline size_t fib_nlmsg_size(struct fib_info *fi)
 278{
 279        size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
 280                         + nla_total_size(4) /* RTA_TABLE */
 281                         + nla_total_size(4) /* RTA_DST */
 282                         + nla_total_size(4) /* RTA_PRIORITY */
 283                         + nla_total_size(4); /* RTA_PREFSRC */
 284
 285        /* space for nested metrics */
 286        payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
 287
 288        if (fi->fib_nhs) {
 289                /* Also handles the special case fib_nhs == 1 */
 290
 291                /* each nexthop is packed in an attribute */
 292                size_t nhsize = nla_total_size(sizeof(struct rtnexthop));
 293
 294                /* may contain flow and gateway attribute */
 295                nhsize += 2 * nla_total_size(4);
 296
 297                /* all nexthops are packed in a nested attribute */
 298                payload += nla_total_size(fi->fib_nhs * nhsize);
 299        }
 300
 301        return payload;
 302}
 303
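     /*
      * Notify listeners on the RTNLGRP_IPV4_ROUTE netlink group that the
      * route described by @fa was added or deleted.
      */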
 304void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
 305               int dst_len, u32 tb_id, struct nl_info *info,
 306               unsigned int nlm_flags)
 307{
 308        struct sk_buff *skb;
 309        u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
 310        int err = -ENOBUFS;
 311
 312        skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
 313        if (skb == NULL)
 314                goto errout;
 315
 316        err = fib_dump_info(skb, info->pid, seq, event, tb_id,
 317                            fa->fa_type, fa->fa_scope, key, dst_len,
 318                            fa->fa_tos, fa->fa_info, nlm_flags);
 319        if (err < 0) {
 320                /* -EMSGSIZE implies BUG in fib_nlmsg_size() */
 321                WARN_ON(err == -EMSGSIZE);
 322                kfree_skb(skb);
 323                goto errout;
 324        }
 325        rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
 326                    info->nlh, GFP_KERNEL);
 327        return;
 328errout:
 329        if (err < 0)
 330                rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
 331}
 332
  333/* Return the first fib alias matching TOS whose priority is
  334 * greater than or equal to PRIO, or the first alias with a
  335 * smaller TOS. */
 336struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
 337{
 338        if (fah) {
 339                struct fib_alias *fa;
 340                list_for_each_entry(fa, fah, fa_list) {
 341                        if (fa->fa_tos > tos)
 342                                continue;
 343                        if (fa->fa_info->fib_priority >= prio ||
 344                            fa->fa_tos < tos)
 345                                return fa;
 346                }
 347        }
 348        return NULL;
 349}
 350
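     /*
      * Probe the ARP state of this route's gateway when choosing a default
      * route.  Returns 0 if the neighbour entry looks usable, 1 otherwise;
      * in the latter case the route may be remembered as a last resort in
      * *last_resort / *last_idx.
      */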
 351int fib_detect_death(struct fib_info *fi, int order,
 352                     struct fib_info **last_resort, int *last_idx, int dflt)
 353{
 354        struct neighbour *n;
 355        int state = NUD_NONE;
 356
 357        n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
 358        if (n) {
 359                state = n->nud_state;
 360                neigh_release(n);
 361        }
 362        if (state == NUD_REACHABLE)
 363                return 0;
 364        if ((state&NUD_VALID) && order != dflt)
 365                return 0;
 366        if ((state&NUD_VALID) ||
 367            (*last_idx<0 && order > dflt)) {
 368                *last_resort = fi;
 369                *last_idx = order;
 370        }
 371        return 1;
 372}
 373
 374#ifdef CONFIG_IP_ROUTE_MULTIPATH
 375
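     /* Count the nexthops in an RTA_MULTIPATH attribute; 0 means malformed. */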
 376static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
 377{
 378        int nhs = 0;
 379
 380        while (rtnh_ok(rtnh, remaining)) {
 381                nhs++;
 382                rtnh = rtnh_next(rtnh, &remaining);
 383        }
 384
 385        /* leftover implies invalid nexthop configuration, discard it */
 386        return remaining > 0 ? 0 : nhs;
 387}
 388
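     /*
      * Fill fi->fib_nh[] from the RTA_MULTIPATH nexthop list: flags, oif,
      * weight, and the optional per-nexthop RTA_GATEWAY / RTA_FLOW.
      */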
 389static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
 390                       int remaining, struct fib_config *cfg)
 391{
 392        change_nexthops(fi) {
 393                int attrlen;
 394
 395                if (!rtnh_ok(rtnh, remaining))
 396                        return -EINVAL;
 397
 398                nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
 399                nh->nh_oif = rtnh->rtnh_ifindex;
 400                nh->nh_weight = rtnh->rtnh_hops + 1;
 401
 402                attrlen = rtnh_attrlen(rtnh);
 403                if (attrlen > 0) {
 404                        struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
 405
 406                        nla = nla_find(attrs, attrlen, RTA_GATEWAY);
 407                        nh->nh_gw = nla ? nla_get_be32(nla) : 0;
 408#ifdef CONFIG_NET_CLS_ROUTE
 409                        nla = nla_find(attrs, attrlen, RTA_FLOW);
 410                        nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
 411#endif
 412                }
 413
 414                rtnh = rtnh_next(rtnh, &remaining);
 415        } endfor_nexthops(fi);
 416
 417        return 0;
 418}
 419
 420#endif
 421
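     /*
      * Check whether the nexthop(s) described by @cfg match those of @fi.
      * Returns 0 on a match, 1 on a mismatch, and -EINVAL on a malformed
      * multipath request.
      */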
 422int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
 423{
 424#ifdef CONFIG_IP_ROUTE_MULTIPATH
 425        struct rtnexthop *rtnh;
 426        int remaining;
 427#endif
 428
 429        if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
 430                return 1;
 431
 432        if (cfg->fc_oif || cfg->fc_gw) {
 433                if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
 434                    (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
 435                        return 0;
 436                return 1;
 437        }
 438
 439#ifdef CONFIG_IP_ROUTE_MULTIPATH
 440        if (cfg->fc_mp == NULL)
 441                return 0;
 442
 443        rtnh = cfg->fc_mp;
 444        remaining = cfg->fc_mp_len;
 445
 446        for_nexthops(fi) {
 447                int attrlen;
 448
 449                if (!rtnh_ok(rtnh, remaining))
 450                        return -EINVAL;
 451
 452                if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
 453                        return 1;
 454
 455                attrlen = rtnh_attrlen(rtnh);
  456                if (attrlen > 0) {
 457                        struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
 458
 459                        nla = nla_find(attrs, attrlen, RTA_GATEWAY);
 460                        if (nla && nla_get_be32(nla) != nh->nh_gw)
 461                                return 1;
 462#ifdef CONFIG_NET_CLS_ROUTE
 463                        nla = nla_find(attrs, attrlen, RTA_FLOW);
 464                        if (nla && nla_get_u32(nla) != nh->nh_tclassid)
 465                                return 1;
 466#endif
 467                }
 468
 469                rtnh = rtnh_next(rtnh, &remaining);
 470        } endfor_nexthops(fi);
 471#endif
 472        return 0;
 473}
 474
 475
 476/*
 477   Picture
 478   -------
 479
  480   The semantics of nexthops are messy for historical reasons.
  481   We have to take into account that:
  482   a) the gateway can actually be a local interface address,
  483      so that a gatewayed route is really a direct one.
  484   b) the gateway must be an on-link address, possibly
  485      described not by an ifaddr but by a direct route.
  486   c) if both a gateway and an interface are specified, they must not
  487      contradict each other.
  488   d) with tunnel routes, the gateway may not be on-link at all.
  489
  490   Attempting to reconcile all of these (alas, self-contradictory) conditions
  491   results in pretty ugly and hairy code with obscure logic.
  492
  493   I chose to generalize it instead, so that the amount of code
  494   barely increases, but the result becomes
  495   much more general.
  496   Every prefix is assigned a "scope" value: "host" is a local address,
  497   "link" is a direct route,
  498   [ ... "site" ... "interior" ... ]
  499   and "universe" is a true gateway route with global meaning.
  500
  501   Every prefix refers to a set of "nexthop"s (gw, oif),
  502   where gw must have a narrower scope. This recursion stops
  503   when gw has LOCAL scope or when the "nexthop" is declared ONLINK,
  504   which forces gw to be on-link.
  505
  506   The code is still hairy, but now it is apparently logically
  507   consistent and very flexible. E.g. as a by-product it allows
  508   independent exterior and interior routing processes to
  509   coexist in peace.
  510
  511   Normally it looks like this:
 512
 513   {universe prefix}  -> (gw, oif) [scope link]
 514                          |
 515                          |-> {link prefix} -> (gw, oif) [scope local]
 516                                                |
 517                                                |-> {local prefix} (terminal node)
 518 */
 519
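     /*
      * Resolve one nexthop: find the output device and scope for nh->nh_gw
      * (via a fib_lookup unless ONLINK was requested), or for a device-only
      * route take the device directly.  Grabs a reference on the resolved
      * nh->nh_dev.
      */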
 520static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
 521                        struct fib_nh *nh)
 522{
 523        int err;
 524        struct net *net;
 525
 526        net = cfg->fc_nlinfo.nl_net;
 527        if (nh->nh_gw) {
 528                struct fib_result res;
 529
 530#ifdef CONFIG_IP_ROUTE_PERVASIVE
 531                if (nh->nh_flags&RTNH_F_PERVASIVE)
 532                        return 0;
 533#endif
 534                if (nh->nh_flags&RTNH_F_ONLINK) {
 535                        struct net_device *dev;
 536
 537                        if (cfg->fc_scope >= RT_SCOPE_LINK)
 538                                return -EINVAL;
 539                        if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
 540                                return -EINVAL;
 541                        if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL)
 542                                return -ENODEV;
 543                        if (!(dev->flags&IFF_UP))
 544                                return -ENETDOWN;
 545                        nh->nh_dev = dev;
 546                        dev_hold(dev);
 547                        nh->nh_scope = RT_SCOPE_LINK;
 548                        return 0;
 549                }
 550                {
 551                        struct flowi fl = {
 552                                .nl_u = {
 553                                        .ip4_u = {
 554                                                .daddr = nh->nh_gw,
 555                                                .scope = cfg->fc_scope + 1,
 556                                        },
 557                                },
 558                                .oif = nh->nh_oif,
 559                        };
 560
  561                        /* Not strictly necessary, but seeing why takes a bit of thinking */
 562                        if (fl.fl4_scope < RT_SCOPE_LINK)
 563                                fl.fl4_scope = RT_SCOPE_LINK;
 564                        if ((err = fib_lookup(net, &fl, &res)) != 0)
 565                                return err;
 566                }
 567                err = -EINVAL;
 568                if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
 569                        goto out;
 570                nh->nh_scope = res.scope;
 571                nh->nh_oif = FIB_RES_OIF(res);
 572                if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
 573                        goto out;
 574                dev_hold(nh->nh_dev);
 575                err = -ENETDOWN;
 576                if (!(nh->nh_dev->flags & IFF_UP))
 577                        goto out;
 578                err = 0;
 579out:
 580                fib_res_put(&res);
 581                return err;
 582        } else {
 583                struct in_device *in_dev;
 584
 585                if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
 586                        return -EINVAL;
 587
 588                in_dev = inetdev_by_index(net, nh->nh_oif);
 589                if (in_dev == NULL)
 590                        return -ENODEV;
 591                if (!(in_dev->dev->flags&IFF_UP)) {
 592                        in_dev_put(in_dev);
 593                        return -ENETDOWN;
 594                }
 595                nh->nh_dev = in_dev->dev;
 596                dev_hold(nh->nh_dev);
 597                nh->nh_scope = RT_SCOPE_HOST;
 598                in_dev_put(in_dev);
 599        }
 600        return 0;
 601}
 602
 603static inline unsigned int fib_laddr_hashfn(__be32 val)
 604{
 605        unsigned int mask = (fib_hash_size - 1);
 606
 607        return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask;
 608}
 609
 610static struct hlist_head *fib_hash_alloc(int bytes)
 611{
 612        if (bytes <= PAGE_SIZE)
 613                return kzalloc(bytes, GFP_KERNEL);
 614        else
 615                return (struct hlist_head *)
 616                        __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(bytes));
 617}
 618
 619static void fib_hash_free(struct hlist_head *hash, int bytes)
 620{
 621        if (!hash)
 622                return;
 623
 624        if (bytes <= PAGE_SIZE)
 625                kfree(hash);
 626        else
 627                free_pages((unsigned long) hash, get_order(bytes));
 628}
 629
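     /*
      * Move every fib_info into the newly allocated, larger hash tables
      * and free the old ones.  Called when fib_info_cnt outgrows the
      * current table size.
      */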
 630static void fib_hash_move(struct hlist_head *new_info_hash,
 631                          struct hlist_head *new_laddrhash,
 632                          unsigned int new_size)
 633{
 634        struct hlist_head *old_info_hash, *old_laddrhash;
 635        unsigned int old_size = fib_hash_size;
 636        unsigned int i, bytes;
 637
 638        spin_lock_bh(&fib_info_lock);
 639        old_info_hash = fib_info_hash;
 640        old_laddrhash = fib_info_laddrhash;
 641        fib_hash_size = new_size;
 642
 643        for (i = 0; i < old_size; i++) {
 644                struct hlist_head *head = &fib_info_hash[i];
 645                struct hlist_node *node, *n;
 646                struct fib_info *fi;
 647
 648                hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
 649                        struct hlist_head *dest;
 650                        unsigned int new_hash;
 651
 652                        hlist_del(&fi->fib_hash);
 653
 654                        new_hash = fib_info_hashfn(fi);
 655                        dest = &new_info_hash[new_hash];
 656                        hlist_add_head(&fi->fib_hash, dest);
 657                }
 658        }
 659        fib_info_hash = new_info_hash;
 660
 661        for (i = 0; i < old_size; i++) {
 662                struct hlist_head *lhead = &fib_info_laddrhash[i];
 663                struct hlist_node *node, *n;
 664                struct fib_info *fi;
 665
 666                hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
 667                        struct hlist_head *ldest;
 668                        unsigned int new_hash;
 669
 670                        hlist_del(&fi->fib_lhash);
 671
 672                        new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
 673                        ldest = &new_laddrhash[new_hash];
 674                        hlist_add_head(&fi->fib_lhash, ldest);
 675                }
 676        }
 677        fib_info_laddrhash = new_laddrhash;
 678
 679        spin_unlock_bh(&fib_info_lock);
 680
 681        bytes = old_size * sizeof(struct hlist_head *);
 682        fib_hash_free(old_info_hash, bytes);
 683        fib_hash_free(old_laddrhash, bytes);
 684}
 685
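     /*
      * Build a fib_info (and its nexthops) from a route configuration.
      * If an identical fib_info already exists it is reused and its tree
      * refcount bumped; otherwise the new one is hashed and returned.
      */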
 686struct fib_info *fib_create_info(struct fib_config *cfg)
 687{
 688        int err;
 689        struct fib_info *fi = NULL;
 690        struct fib_info *ofi;
 691        int nhs = 1;
 692        struct net *net = cfg->fc_nlinfo.nl_net;
 693
  694        /* Fast check to catch the weirdest cases */
 695        if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
 696                goto err_inval;
 697
 698#ifdef CONFIG_IP_ROUTE_MULTIPATH
 699        if (cfg->fc_mp) {
 700                nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
 701                if (nhs == 0)
 702                        goto err_inval;
 703        }
 704#endif
 705
 706        err = -ENOBUFS;
 707        if (fib_info_cnt >= fib_hash_size) {
 708                unsigned int new_size = fib_hash_size << 1;
 709                struct hlist_head *new_info_hash;
 710                struct hlist_head *new_laddrhash;
 711                unsigned int bytes;
 712
 713                if (!new_size)
 714                        new_size = 1;
 715                bytes = new_size * sizeof(struct hlist_head *);
 716                new_info_hash = fib_hash_alloc(bytes);
 717                new_laddrhash = fib_hash_alloc(bytes);
 718                if (!new_info_hash || !new_laddrhash) {
 719                        fib_hash_free(new_info_hash, bytes);
 720                        fib_hash_free(new_laddrhash, bytes);
 721                } else
 722                        fib_hash_move(new_info_hash, new_laddrhash, new_size);
 723
 724                if (!fib_hash_size)
 725                        goto failure;
 726        }
 727
 728        fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
 729        if (fi == NULL)
 730                goto failure;
 731        fib_info_cnt++;
 732
 733        fi->fib_net = hold_net(net);
 734        fi->fib_protocol = cfg->fc_protocol;
 735        fi->fib_flags = cfg->fc_flags;
 736        fi->fib_priority = cfg->fc_priority;
 737        fi->fib_prefsrc = cfg->fc_prefsrc;
 738
 739        fi->fib_nhs = nhs;
 740        change_nexthops(fi) {
 741                nh->nh_parent = fi;
 742        } endfor_nexthops(fi)
 743
 744        if (cfg->fc_mx) {
 745                struct nlattr *nla;
 746                int remaining;
 747
 748                nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
 749                        int type = nla_type(nla);
 750
 751                        if (type) {
 752                                if (type > RTAX_MAX)
 753                                        goto err_inval;
 754                                fi->fib_metrics[type - 1] = nla_get_u32(nla);
 755                        }
 756                }
 757        }
 758
 759        if (cfg->fc_mp) {
 760#ifdef CONFIG_IP_ROUTE_MULTIPATH
 761                err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
 762                if (err != 0)
 763                        goto failure;
 764                if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
 765                        goto err_inval;
 766                if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
 767                        goto err_inval;
 768#ifdef CONFIG_NET_CLS_ROUTE
 769                if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
 770                        goto err_inval;
 771#endif
 772#else
 773                goto err_inval;
 774#endif
 775        } else {
 776                struct fib_nh *nh = fi->fib_nh;
 777
 778                nh->nh_oif = cfg->fc_oif;
 779                nh->nh_gw = cfg->fc_gw;
 780                nh->nh_flags = cfg->fc_flags;
 781#ifdef CONFIG_NET_CLS_ROUTE
 782                nh->nh_tclassid = cfg->fc_flow;
 783#endif
 784#ifdef CONFIG_IP_ROUTE_MULTIPATH
 785                nh->nh_weight = 1;
 786#endif
 787        }
 788
 789        if (fib_props[cfg->fc_type].error) {
 790                if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
 791                        goto err_inval;
 792                goto link_it;
 793        }
 794
 795        if (cfg->fc_scope > RT_SCOPE_HOST)
 796                goto err_inval;
 797
 798        if (cfg->fc_scope == RT_SCOPE_HOST) {
 799                struct fib_nh *nh = fi->fib_nh;
 800
 801                /* Local address is added. */
 802                if (nhs != 1 || nh->nh_gw)
 803                        goto err_inval;
 804                nh->nh_scope = RT_SCOPE_NOWHERE;
 805                nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif);
 806                err = -ENODEV;
 807                if (nh->nh_dev == NULL)
 808                        goto failure;
 809        } else {
 810                change_nexthops(fi) {
 811                        if ((err = fib_check_nh(cfg, fi, nh)) != 0)
 812                                goto failure;
 813                } endfor_nexthops(fi)
 814        }
 815
 816        if (fi->fib_prefsrc) {
 817                if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
 818                    fi->fib_prefsrc != cfg->fc_dst)
 819                        if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL)
 820                                goto err_inval;
 821        }
 822
 823link_it:
 824        if ((ofi = fib_find_info(fi)) != NULL) {
 825                fi->fib_dead = 1;
 826                free_fib_info(fi);
 827                ofi->fib_treeref++;
 828                return ofi;
 829        }
 830
 831        fi->fib_treeref++;
 832        atomic_inc(&fi->fib_clntref);
 833        spin_lock_bh(&fib_info_lock);
 834        hlist_add_head(&fi->fib_hash,
 835                       &fib_info_hash[fib_info_hashfn(fi)]);
 836        if (fi->fib_prefsrc) {
 837                struct hlist_head *head;
 838
 839                head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
 840                hlist_add_head(&fi->fib_lhash, head);
 841        }
 842        change_nexthops(fi) {
 843                struct hlist_head *head;
 844                unsigned int hash;
 845
 846                if (!nh->nh_dev)
 847                        continue;
 848                hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
 849                head = &fib_info_devhash[hash];
 850                hlist_add_head(&nh->nh_hash, head);
 851        } endfor_nexthops(fi)
 852        spin_unlock_bh(&fib_info_lock);
 853        return fi;
 854
 855err_inval:
 856        err = -EINVAL;
 857
 858failure:
 859        if (fi) {
 860                fi->fib_dead = 1;
 861                free_fib_info(fi);
 862        }
 863
 864        return ERR_PTR(err);
 865}
 866
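     /*
      * Walk the alias list of a matching prefix, pick the first alias whose
      * TOS and scope fit the flow, then a live nexthop within it.  Returns 0
      * and fills *res on success, 1 if nothing matched, or the route type's
      * error (e.g. -EHOSTUNREACH for unreachable routes).
      */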
  867/* Note! fib_semantic_match intentionally uses RCU list functions. */
 868int fib_semantic_match(struct list_head *head, const struct flowi *flp,
 869                       struct fib_result *res, int prefixlen)
 870{
 871        struct fib_alias *fa;
 872        int nh_sel = 0;
 873
 874        list_for_each_entry_rcu(fa, head, fa_list) {
 875                int err;
 876
 877                if (fa->fa_tos &&
 878                    fa->fa_tos != flp->fl4_tos)
 879                        continue;
 880
 881                if (fa->fa_scope < flp->fl4_scope)
 882                        continue;
 883
 884                fa->fa_state |= FA_S_ACCESSED;
 885
 886                err = fib_props[fa->fa_type].error;
 887                if (err == 0) {
 888                        struct fib_info *fi = fa->fa_info;
 889
 890                        if (fi->fib_flags & RTNH_F_DEAD)
 891                                continue;
 892
 893                        switch (fa->fa_type) {
 894                        case RTN_UNICAST:
 895                        case RTN_LOCAL:
 896                        case RTN_BROADCAST:
 897                        case RTN_ANYCAST:
 898                        case RTN_MULTICAST:
 899                                for_nexthops(fi) {
 900                                        if (nh->nh_flags&RTNH_F_DEAD)
 901                                                continue;
 902                                        if (!flp->oif || flp->oif == nh->nh_oif)
 903                                                break;
 904                                }
 905#ifdef CONFIG_IP_ROUTE_MULTIPATH
 906                                if (nhsel < fi->fib_nhs) {
 907                                        nh_sel = nhsel;
 908                                        goto out_fill_res;
 909                                }
 910#else
 911                                if (nhsel < 1) {
 912                                        goto out_fill_res;
 913                                }
 914#endif
 915                                endfor_nexthops(fi);
 916                                continue;
 917
 918                        default:
 919                                printk(KERN_WARNING "fib_semantic_match bad type %#x\n",
 920                                        fa->fa_type);
 921                                return -EINVAL;
 922                        }
 923                }
 924                return err;
 925        }
 926        return 1;
 927
 928out_fill_res:
 929        res->prefixlen = prefixlen;
 930        res->nh_sel = nh_sel;
 931        res->type = fa->fa_type;
 932        res->scope = fa->fa_scope;
 933        res->fi = fa->fa_info;
 934        atomic_inc(&res->fi->fib_clntref);
 935        return 0;
 936}
 937
  938/* Find an appropriate source address for this destination */
 939
 940__be32 __fib_res_prefsrc(struct fib_result *res)
 941{
 942        return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
 943}
 944
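     /*
      * Fill a netlink route message describing one route, including its
      * metrics and either the single-nexthop attributes or a nested
      * RTA_MULTIPATH block.
      */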
 945int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
 946                  u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos,
 947                  struct fib_info *fi, unsigned int flags)
 948{
 949        struct nlmsghdr *nlh;
 950        struct rtmsg *rtm;
 951
 952        nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
 953        if (nlh == NULL)
 954                return -EMSGSIZE;
 955
 956        rtm = nlmsg_data(nlh);
 957        rtm->rtm_family = AF_INET;
 958        rtm->rtm_dst_len = dst_len;
 959        rtm->rtm_src_len = 0;
 960        rtm->rtm_tos = tos;
 961        if (tb_id < 256)
 962                rtm->rtm_table = tb_id;
 963        else
 964                rtm->rtm_table = RT_TABLE_COMPAT;
 965        NLA_PUT_U32(skb, RTA_TABLE, tb_id);
 966        rtm->rtm_type = type;
 967        rtm->rtm_flags = fi->fib_flags;
 968        rtm->rtm_scope = scope;
 969        rtm->rtm_protocol = fi->fib_protocol;
 970
 971        if (rtm->rtm_dst_len)
 972                NLA_PUT_BE32(skb, RTA_DST, dst);
 973
 974        if (fi->fib_priority)
 975                NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);
 976
 977        if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
 978                goto nla_put_failure;
 979
 980        if (fi->fib_prefsrc)
 981                NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);
 982
 983        if (fi->fib_nhs == 1) {
 984                if (fi->fib_nh->nh_gw)
 985                        NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw);
 986
 987                if (fi->fib_nh->nh_oif)
 988                        NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
 989#ifdef CONFIG_NET_CLS_ROUTE
 990                if (fi->fib_nh[0].nh_tclassid)
 991                        NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
 992#endif
 993        }
 994#ifdef CONFIG_IP_ROUTE_MULTIPATH
 995        if (fi->fib_nhs > 1) {
 996                struct rtnexthop *rtnh;
 997                struct nlattr *mp;
 998
 999                mp = nla_nest_start(skb, RTA_MULTIPATH);
1000                if (mp == NULL)
1001                        goto nla_put_failure;
1002
1003                for_nexthops(fi) {
1004                        rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
1005                        if (rtnh == NULL)
1006                                goto nla_put_failure;
1007
1008                        rtnh->rtnh_flags = nh->nh_flags & 0xFF;
1009                        rtnh->rtnh_hops = nh->nh_weight - 1;
1010                        rtnh->rtnh_ifindex = nh->nh_oif;
1011
1012                        if (nh->nh_gw)
1013                                NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
1014#ifdef CONFIG_NET_CLS_ROUTE
1015                        if (nh->nh_tclassid)
1016                                NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
1017#endif
1018                        /* length of rtnetlink header + attributes */
1019                        rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
1020                } endfor_nexthops(fi);
1021
1022                nla_nest_end(skb, mp);
1023        }
1024#endif
1025        return nlmsg_end(skb, nlh);
1026
1027nla_put_failure:
1028        nlmsg_cancel(skb, nlh);
1029        return -EMSGSIZE;
1030}
1031
 1032/*
 1033   Update the FIB if:
 1034   - a local address disappeared -> we must delete all the entries
 1035     referring to it.
 1036   - a device went down -> we must shut down all nexthops going via it.
 1037 */
1038int fib_sync_down_addr(struct net *net, __be32 local)
1039{
1040        int ret = 0;
1041        unsigned int hash = fib_laddr_hashfn(local);
1042        struct hlist_head *head = &fib_info_laddrhash[hash];
1043        struct hlist_node *node;
1044        struct fib_info *fi;
1045
1046        if (fib_info_laddrhash == NULL || local == 0)
1047                return 0;
1048
1049        hlist_for_each_entry(fi, node, head, fib_lhash) {
1050                if (fi->fib_net != net)
1051                        continue;
1052                if (fi->fib_prefsrc == local) {
1053                        fi->fib_flags |= RTNH_F_DEAD;
1054                        ret++;
1055                }
1056        }
1057        return ret;
1058}
1059
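     /*
      * Mark as dead every nexthop going through @dev.  When @force is set
      * even nexthops with scope RT_SCOPE_NOWHERE are killed, and with
      * force > 1 an affected fib_info loses all of its nexthops.  Returns
      * the number of fib_infos that became dead.
      */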
1060int fib_sync_down_dev(struct net_device *dev, int force)
1061{
1062        int ret = 0;
1063        int scope = RT_SCOPE_NOWHERE;
1064        struct fib_info *prev_fi = NULL;
1065        unsigned int hash = fib_devindex_hashfn(dev->ifindex);
1066        struct hlist_head *head = &fib_info_devhash[hash];
1067        struct hlist_node *node;
1068        struct fib_nh *nh;
1069
1070        if (force)
1071                scope = -1;
1072
1073        hlist_for_each_entry(nh, node, head, nh_hash) {
1074                struct fib_info *fi = nh->nh_parent;
1075                int dead;
1076
1077                BUG_ON(!fi->fib_nhs);
1078                if (nh->nh_dev != dev || fi == prev_fi)
1079                        continue;
1080                prev_fi = fi;
1081                dead = 0;
1082                change_nexthops(fi) {
1083                        if (nh->nh_flags&RTNH_F_DEAD)
1084                                dead++;
1085                        else if (nh->nh_dev == dev &&
1086                                        nh->nh_scope != scope) {
1087                                nh->nh_flags |= RTNH_F_DEAD;
1088#ifdef CONFIG_IP_ROUTE_MULTIPATH
1089                                spin_lock_bh(&fib_multipath_lock);
1090                                fi->fib_power -= nh->nh_power;
1091                                nh->nh_power = 0;
1092                                spin_unlock_bh(&fib_multipath_lock);
1093#endif
1094                                dead++;
1095                        }
1096#ifdef CONFIG_IP_ROUTE_MULTIPATH
1097                        if (force > 1 && nh->nh_dev == dev) {
1098                                dead = fi->fib_nhs;
1099                                break;
1100                        }
1101#endif
1102                } endfor_nexthops(fi)
1103                if (dead == fi->fib_nhs) {
1104                        fi->fib_flags |= RTNH_F_DEAD;
1105                        ret++;
1106                }
1107        }
1108
1109        return ret;
1110}
1111
1112#ifdef CONFIG_IP_ROUTE_MULTIPATH
1113
 1114/*
 1115   A dead device comes back up. We wake up its dead nexthops.
 1116   This only makes sense for multipath routes.
 1117 */
1118
1119int fib_sync_up(struct net_device *dev)
1120{
1121        struct fib_info *prev_fi;
1122        unsigned int hash;
1123        struct hlist_head *head;
1124        struct hlist_node *node;
1125        struct fib_nh *nh;
1126        int ret;
1127
1128        if (!(dev->flags&IFF_UP))
1129                return 0;
1130
1131        prev_fi = NULL;
1132        hash = fib_devindex_hashfn(dev->ifindex);
1133        head = &fib_info_devhash[hash];
1134        ret = 0;
1135
1136        hlist_for_each_entry(nh, node, head, nh_hash) {
1137                struct fib_info *fi = nh->nh_parent;
1138                int alive;
1139
1140                BUG_ON(!fi->fib_nhs);
1141                if (nh->nh_dev != dev || fi == prev_fi)
1142                        continue;
1143
1144                prev_fi = fi;
1145                alive = 0;
1146                change_nexthops(fi) {
1147                        if (!(nh->nh_flags&RTNH_F_DEAD)) {
1148                                alive++;
1149                                continue;
1150                        }
1151                        if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
1152                                continue;
1153                        if (nh->nh_dev != dev || !__in_dev_get_rtnl(dev))
1154                                continue;
1155                        alive++;
1156                        spin_lock_bh(&fib_multipath_lock);
1157                        nh->nh_power = 0;
1158                        nh->nh_flags &= ~RTNH_F_DEAD;
1159                        spin_unlock_bh(&fib_multipath_lock);
1160                } endfor_nexthops(fi)
1161
1162                if (alive > 0) {
1163                        fi->fib_flags &= ~RTNH_F_DEAD;
1164                        ret++;
1165                }
1166        }
1167
1168        return ret;
1169}
1170
1171/*
1172   The algorithm is suboptimal, but it provides really
1173   fair weighted route distribution.
1174 */
1175
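     /*
      * Weighted round-robin nexthop selection: fib_power is periodically
      * recharged from the nexthop weights, and each pick spends one unit
      * of the chosen nexthop's nh_power.
      */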
1176void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1177{
1178        struct fib_info *fi = res->fi;
1179        int w;
1180
1181        spin_lock_bh(&fib_multipath_lock);
1182        if (fi->fib_power <= 0) {
1183                int power = 0;
1184                change_nexthops(fi) {
1185                        if (!(nh->nh_flags&RTNH_F_DEAD)) {
1186                                power += nh->nh_weight;
1187                                nh->nh_power = nh->nh_weight;
1188                        }
1189                } endfor_nexthops(fi);
1190                fi->fib_power = power;
1191                if (power <= 0) {
1192                        spin_unlock_bh(&fib_multipath_lock);
1193                        /* Race condition: route has just become dead. */
1194                        res->nh_sel = 0;
1195                        return;
1196                }
1197        }
1198
1199
 1200        /* w should be a random number in [0..fi->fib_power-1];
 1201           using jiffies for it is a pretty bad approximation.
 1202         */
1203
1204        w = jiffies % fi->fib_power;
1205
1206        change_nexthops(fi) {
1207                if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
1208                        if ((w -= nh->nh_power) <= 0) {
1209                                nh->nh_power--;
1210                                fi->fib_power--;
1211                                res->nh_sel = nhsel;
1212                                spin_unlock_bh(&fib_multipath_lock);
1213                                return;
1214                        }
1215                }
1216        } endfor_nexthops(fi);
1217
1218        /* Race condition: route has just become dead. */
1219        res->nh_sel = 0;
1220        spin_unlock_bh(&fib_multipath_lock);
1221}
1222#endif
1223