linux/net/ipv4/fib_frontend.c
<<
>>
Prefs
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              IPv4 Forwarding Information Base: FIB frontend.
   7 *
   8 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
   9 *
  10 *              This program is free software; you can redistribute it and/or
  11 *              modify it under the terms of the GNU General Public License
  12 *              as published by the Free Software Foundation; either version
  13 *              2 of the License, or (at your option) any later version.
  14 */
  15
  16#include <linux/module.h>
  17#include <asm/uaccess.h>
  18#include <linux/bitops.h>
  19#include <linux/capability.h>
  20#include <linux/types.h>
  21#include <linux/kernel.h>
  22#include <linux/mm.h>
  23#include <linux/string.h>
  24#include <linux/socket.h>
  25#include <linux/sockios.h>
  26#include <linux/errno.h>
  27#include <linux/in.h>
  28#include <linux/inet.h>
  29#include <linux/inetdevice.h>
  30#include <linux/netdevice.h>
  31#include <linux/if_addr.h>
  32#include <linux/if_arp.h>
  33#include <linux/skbuff.h>
  34#include <linux/cache.h>
  35#include <linux/init.h>
  36#include <linux/list.h>
  37#include <linux/slab.h>
  38
  39#include <net/ip.h>
  40#include <net/protocol.h>
  41#include <net/route.h>
  42#include <net/tcp.h>
  43#include <net/sock.h>
  44#include <net/arp.h>
  45#include <net/ip_fib.h>
  46#include <net/rtnetlink.h>
  47#include <net/xfrm.h>
  48
  49#ifndef CONFIG_IP_MULTIPLE_TABLES
  50
  51static int __net_init fib4_rules_init(struct net *net)
  52{
  53        struct fib_table *local_table, *main_table;
  54
  55        local_table = fib_trie_table(RT_TABLE_LOCAL);
  56        if (local_table == NULL)
  57                return -ENOMEM;
  58
  59        main_table  = fib_trie_table(RT_TABLE_MAIN);
  60        if (main_table == NULL)
  61                goto fail;
  62
  63        hlist_add_head_rcu(&local_table->tb_hlist,
  64                                &net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX]);
  65        hlist_add_head_rcu(&main_table->tb_hlist,
  66                                &net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]);
  67        return 0;
  68
  69fail:
  70        kfree(local_table);
  71        return -ENOMEM;
  72}
  73#else
  74
  75struct fib_table *fib_new_table(struct net *net, u32 id)
  76{
  77        struct fib_table *tb;
  78        unsigned int h;
  79
  80        if (id == 0)
  81                id = RT_TABLE_MAIN;
  82        tb = fib_get_table(net, id);
  83        if (tb)
  84                return tb;
  85
  86        tb = fib_trie_table(id);
  87        if (!tb)
  88                return NULL;
  89
  90        switch (id) {
  91        case RT_TABLE_LOCAL:
  92                net->ipv4.fib_local = tb;
  93                break;
  94
  95        case RT_TABLE_MAIN:
  96                net->ipv4.fib_main = tb;
  97                break;
  98
  99        case RT_TABLE_DEFAULT:
 100                net->ipv4.fib_default = tb;
 101                break;
 102
 103        default:
 104                break;
 105        }
 106
 107        h = id & (FIB_TABLE_HASHSZ - 1);
 108        hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]);
 109        return tb;
 110}
 111
 112struct fib_table *fib_get_table(struct net *net, u32 id)
 113{
 114        struct fib_table *tb;
 115        struct hlist_head *head;
 116        unsigned int h;
 117
 118        if (id == 0)
 119                id = RT_TABLE_MAIN;
 120        h = id & (FIB_TABLE_HASHSZ - 1);
 121
 122        rcu_read_lock();
 123        head = &net->ipv4.fib_table_hash[h];
 124        hlist_for_each_entry_rcu(tb, head, tb_hlist) {
 125                if (tb->tb_id == id) {
 126                        rcu_read_unlock();
 127                        return tb;
 128                }
 129        }
 130        rcu_read_unlock();
 131        return NULL;
 132}
 133#endif /* CONFIG_IP_MULTIPLE_TABLES */
 134
 135static void fib_flush(struct net *net)
 136{
 137        int flushed = 0;
 138        struct fib_table *tb;
 139        struct hlist_head *head;
 140        unsigned int h;
 141
 142        for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
 143                head = &net->ipv4.fib_table_hash[h];
 144                hlist_for_each_entry(tb, head, tb_hlist)
 145                        flushed += fib_table_flush(tb);
 146        }
 147
 148        if (flushed)
 149                rt_cache_flush(net);
 150}
 151
 152/*
 153 * Find address type as if only "dev" was present in the system. If
 154 * on_dev is NULL then all interfaces are taken into consideration.
 155 */
 156static inline unsigned int __inet_dev_addr_type(struct net *net,
 157                                                const struct net_device *dev,
 158                                                __be32 addr)
 159{
 160        struct flowi4           fl4 = { .daddr = addr };
 161        struct fib_result       res;
 162        unsigned int ret = RTN_BROADCAST;
 163        struct fib_table *local_table;
 164
 165        if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
 166                return RTN_BROADCAST;
 167        if (ipv4_is_multicast(addr))
 168                return RTN_MULTICAST;
 169
 170        local_table = fib_get_table(net, RT_TABLE_LOCAL);
 171        if (local_table) {
 172                ret = RTN_UNICAST;
 173                rcu_read_lock();
 174                if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) {
 175                        if (!dev || dev == res.fi->fib_dev)
 176                                ret = res.type;
 177                }
 178                rcu_read_unlock();
 179        }
 180        return ret;
 181}
 182
 183unsigned int inet_addr_type(struct net *net, __be32 addr)
 184{
 185        return __inet_dev_addr_type(net, NULL, addr);
 186}
 187EXPORT_SYMBOL(inet_addr_type);
 188
 189unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
 190                                __be32 addr)
 191{
 192        return __inet_dev_addr_type(net, dev, addr);
 193}
 194EXPORT_SYMBOL(inet_dev_addr_type);
 195
 196__be32 fib_compute_spec_dst(struct sk_buff *skb)
 197{
 198        struct net_device *dev = skb->dev;
 199        struct in_device *in_dev;
 200        struct fib_result res;
 201        struct rtable *rt;
 202        struct flowi4 fl4;
 203        struct net *net;
 204        int scope;
 205
 206        rt = skb_rtable(skb);
 207        if ((rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL)) ==
 208            RTCF_LOCAL)
 209                return ip_hdr(skb)->daddr;
 210
 211        in_dev = __in_dev_get_rcu(dev);
 212        BUG_ON(!in_dev);
 213
 214        net = dev_net(dev);
 215
 216        scope = RT_SCOPE_UNIVERSE;
 217        if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
 218                fl4.flowi4_oif = 0;
 219                fl4.flowi4_iif = LOOPBACK_IFINDEX;
 220                fl4.daddr = ip_hdr(skb)->saddr;
 221                fl4.saddr = 0;
 222                fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
 223                fl4.flowi4_scope = scope;
 224                fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
 225                if (!fib_lookup(net, &fl4, &res))
 226                        return FIB_RES_PREFSRC(net, res);
 227        } else {
 228                scope = RT_SCOPE_LINK;
 229        }
 230
 231        return inet_select_addr(dev, ip_hdr(skb)->saddr, scope);
 232}
 233
 234/* Given (packet source, input interface) and optional (dst, oif, tos):
 235 * - (main) check, that source is valid i.e. not broadcast or our local
 236 *   address.
 237 * - figure out what "logical" interface this packet arrived
 238 *   and calculate "specific destination" address.
 239 * - check, that packet arrived from expected physical interface.
 240 * called with rcu_read_lock()
 241 */
 242static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 243                                 u8 tos, int oif, struct net_device *dev,
 244                                 int rpf, struct in_device *idev, u32 *itag)
 245{
 246        int ret, no_addr, accept_local;
 247        struct fib_result res;
 248        struct flowi4 fl4;
 249        struct net *net;
 250        bool dev_match;
 251
 252        fl4.flowi4_oif = 0;
 253        fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX;
 254        fl4.daddr = src;
 255        fl4.saddr = dst;
 256        fl4.flowi4_tos = tos;
 257        fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
 258
 259        no_addr = idev->ifa_list == NULL;
 260
 261        accept_local = IN_DEV_ACCEPT_LOCAL(idev);
 262        fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0;
 263
 264        net = dev_net(dev);
 265        if (fib_lookup(net, &fl4, &res))
 266                goto last_resort;
 267        if (res.type != RTN_UNICAST) {
 268                if (res.type != RTN_LOCAL || !accept_local)
 269                        goto e_inval;
 270        }
 271        fib_combine_itag(itag, &res);
 272        dev_match = false;
 273
 274#ifdef CONFIG_IP_ROUTE_MULTIPATH
 275        for (ret = 0; ret < res.fi->fib_nhs; ret++) {
 276                struct fib_nh *nh = &res.fi->fib_nh[ret];
 277
 278                if (nh->nh_dev == dev) {
 279                        dev_match = true;
 280                        break;
 281                }
 282        }
 283#else
 284        if (FIB_RES_DEV(res) == dev)
 285                dev_match = true;
 286#endif
 287        if (dev_match) {
 288                ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
 289                return ret;
 290        }
 291        if (no_addr)
 292                goto last_resort;
 293        if (rpf == 1)
 294                goto e_rpf;
 295        fl4.flowi4_oif = dev->ifindex;
 296
 297        ret = 0;
 298        if (fib_lookup(net, &fl4, &res) == 0) {
 299                if (res.type == RTN_UNICAST)
 300                        ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
 301        }
 302        return ret;
 303
 304last_resort:
 305        if (rpf)
 306                goto e_rpf;
 307        *itag = 0;
 308        return 0;
 309
 310e_inval:
 311        return -EINVAL;
 312e_rpf:
 313        return -EXDEV;
 314}
 315
 316/* Ignore rp_filter for packets protected by IPsec. */
 317int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 318                        u8 tos, int oif, struct net_device *dev,
 319                        struct in_device *idev, u32 *itag)
 320{
 321        int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev);
 322
 323        if (!r && !fib_num_tclassid_users(dev_net(dev)) &&
 324            (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) {
 325                *itag = 0;
 326                return 0;
 327        }
 328        return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag);
 329}
 330
 331static inline __be32 sk_extract_addr(struct sockaddr *addr)
 332{
 333        return ((struct sockaddr_in *) addr)->sin_addr.s_addr;
 334}
 335
 336static int put_rtax(struct nlattr *mx, int len, int type, u32 value)
 337{
 338        struct nlattr *nla;
 339
 340        nla = (struct nlattr *) ((char *) mx + len);
 341        nla->nla_type = type;
 342        nla->nla_len = nla_attr_size(4);
 343        *(u32 *) nla_data(nla) = value;
 344
 345        return len + nla_total_size(4);
 346}
 347
 348static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
 349                                 struct fib_config *cfg)
 350{
 351        __be32 addr;
 352        int plen;
 353
 354        memset(cfg, 0, sizeof(*cfg));
 355        cfg->fc_nlinfo.nl_net = net;
 356
 357        if (rt->rt_dst.sa_family != AF_INET)
 358                return -EAFNOSUPPORT;
 359
 360        /*
 361         * Check mask for validity:
 362         * a) it must be contiguous.
 363         * b) destination must have all host bits clear.
 364         * c) if application forgot to set correct family (AF_INET),
 365         *    reject request unless it is absolutely clear i.e.
 366         *    both family and mask are zero.
 367         */
 368        plen = 32;
 369        addr = sk_extract_addr(&rt->rt_dst);
 370        if (!(rt->rt_flags & RTF_HOST)) {
 371                __be32 mask = sk_extract_addr(&rt->rt_genmask);
 372
 373                if (rt->rt_genmask.sa_family != AF_INET) {
 374                        if (mask || rt->rt_genmask.sa_family)
 375                                return -EAFNOSUPPORT;
 376                }
 377
 378                if (bad_mask(mask, addr))
 379                        return -EINVAL;
 380
 381                plen = inet_mask_len(mask);
 382        }
 383
 384        cfg->fc_dst_len = plen;
 385        cfg->fc_dst = addr;
 386
 387        if (cmd != SIOCDELRT) {
 388                cfg->fc_nlflags = NLM_F_CREATE;
 389                cfg->fc_protocol = RTPROT_BOOT;
 390        }
 391
 392        if (rt->rt_metric)
 393                cfg->fc_priority = rt->rt_metric - 1;
 394
 395        if (rt->rt_flags & RTF_REJECT) {
 396                cfg->fc_scope = RT_SCOPE_HOST;
 397                cfg->fc_type = RTN_UNREACHABLE;
 398                return 0;
 399        }
 400
 401        cfg->fc_scope = RT_SCOPE_NOWHERE;
 402        cfg->fc_type = RTN_UNICAST;
 403
 404        if (rt->rt_dev) {
 405                char *colon;
 406                struct net_device *dev;
 407                char devname[IFNAMSIZ];
 408
 409                if (copy_from_user(devname, rt->rt_dev, IFNAMSIZ-1))
 410                        return -EFAULT;
 411
 412                devname[IFNAMSIZ-1] = 0;
 413                colon = strchr(devname, ':');
 414                if (colon)
 415                        *colon = 0;
 416                dev = __dev_get_by_name(net, devname);
 417                if (!dev)
 418                        return -ENODEV;
 419                cfg->fc_oif = dev->ifindex;
 420                if (colon) {
 421                        struct in_ifaddr *ifa;
 422                        struct in_device *in_dev = __in_dev_get_rtnl(dev);
 423                        if (!in_dev)
 424                                return -ENODEV;
 425                        *colon = ':';
 426                        for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
 427                                if (strcmp(ifa->ifa_label, devname) == 0)
 428                                        break;
 429                        if (ifa == NULL)
 430                                return -ENODEV;
 431                        cfg->fc_prefsrc = ifa->ifa_local;
 432                }
 433        }
 434
 435        addr = sk_extract_addr(&rt->rt_gateway);
 436        if (rt->rt_gateway.sa_family == AF_INET && addr) {
 437                cfg->fc_gw = addr;
 438                if (rt->rt_flags & RTF_GATEWAY &&
 439                    inet_addr_type(net, addr) == RTN_UNICAST)
 440                        cfg->fc_scope = RT_SCOPE_UNIVERSE;
 441        }
 442
 443        if (cmd == SIOCDELRT)
 444                return 0;
 445
 446        if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw)
 447                return -EINVAL;
 448
 449        if (cfg->fc_scope == RT_SCOPE_NOWHERE)
 450                cfg->fc_scope = RT_SCOPE_LINK;
 451
 452        if (rt->rt_flags & (RTF_MTU | RTF_WINDOW | RTF_IRTT)) {
 453                struct nlattr *mx;
 454                int len = 0;
 455
 456                mx = kzalloc(3 * nla_total_size(4), GFP_KERNEL);
 457                if (mx == NULL)
 458                        return -ENOMEM;
 459
 460                if (rt->rt_flags & RTF_MTU)
 461                        len = put_rtax(mx, len, RTAX_ADVMSS, rt->rt_mtu - 40);
 462
 463                if (rt->rt_flags & RTF_WINDOW)
 464                        len = put_rtax(mx, len, RTAX_WINDOW, rt->rt_window);
 465
 466                if (rt->rt_flags & RTF_IRTT)
 467                        len = put_rtax(mx, len, RTAX_RTT, rt->rt_irtt << 3);
 468
 469                cfg->fc_mx = mx;
 470                cfg->fc_mx_len = len;
 471        }
 472
 473        return 0;
 474}
 475
 476/*
 477 * Handle IP routing ioctl calls.
 478 * These are used to manipulate the routing tables
 479 */
 480int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 481{
 482        struct fib_config cfg;
 483        struct rtentry rt;
 484        int err;
 485
 486        switch (cmd) {
 487        case SIOCADDRT:         /* Add a route */
 488        case SIOCDELRT:         /* Delete a route */
 489                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 490                        return -EPERM;
 491
 492                if (copy_from_user(&rt, arg, sizeof(rt)))
 493                        return -EFAULT;
 494
 495                rtnl_lock();
 496                err = rtentry_to_fib_config(net, cmd, &rt, &cfg);
 497                if (err == 0) {
 498                        struct fib_table *tb;
 499
 500                        if (cmd == SIOCDELRT) {
 501                                tb = fib_get_table(net, cfg.fc_table);
 502                                if (tb)
 503                                        err = fib_table_delete(tb, &cfg);
 504                                else
 505                                        err = -ESRCH;
 506                        } else {
 507                                tb = fib_new_table(net, cfg.fc_table);
 508                                if (tb)
 509                                        err = fib_table_insert(tb, &cfg);
 510                                else
 511                                        err = -ENOBUFS;
 512                        }
 513
 514                        /* allocated by rtentry_to_fib_config() */
 515                        kfree(cfg.fc_mx);
 516                }
 517                rtnl_unlock();
 518                return err;
 519        }
 520        return -EINVAL;
 521}
 522
 523const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
 524        [RTA_DST]               = { .type = NLA_U32 },
 525        [RTA_SRC]               = { .type = NLA_U32 },
 526        [RTA_IIF]               = { .type = NLA_U32 },
 527        [RTA_OIF]               = { .type = NLA_U32 },
 528        [RTA_GATEWAY]           = { .type = NLA_U32 },
 529        [RTA_PRIORITY]          = { .type = NLA_U32 },
 530        [RTA_PREFSRC]           = { .type = NLA_U32 },
 531        [RTA_METRICS]           = { .type = NLA_NESTED },
 532        [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
 533        [RTA_FLOW]              = { .type = NLA_U32 },
 534};
 535
 536static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
 537                             struct nlmsghdr *nlh, struct fib_config *cfg)
 538{
 539        struct nlattr *attr;
 540        int err, remaining;
 541        struct rtmsg *rtm;
 542
 543        err = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy);
 544        if (err < 0)
 545                goto errout;
 546
 547        memset(cfg, 0, sizeof(*cfg));
 548
 549        rtm = nlmsg_data(nlh);
 550        cfg->fc_dst_len = rtm->rtm_dst_len;
 551        cfg->fc_tos = rtm->rtm_tos;
 552        cfg->fc_table = rtm->rtm_table;
 553        cfg->fc_protocol = rtm->rtm_protocol;
 554        cfg->fc_scope = rtm->rtm_scope;
 555        cfg->fc_type = rtm->rtm_type;
 556        cfg->fc_flags = rtm->rtm_flags;
 557        cfg->fc_nlflags = nlh->nlmsg_flags;
 558
 559        cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
 560        cfg->fc_nlinfo.nlh = nlh;
 561        cfg->fc_nlinfo.nl_net = net;
 562
 563        if (cfg->fc_type > RTN_MAX) {
 564                err = -EINVAL;
 565                goto errout;
 566        }
 567
 568        nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), remaining) {
 569                switch (nla_type(attr)) {
 570                case RTA_DST:
 571                        cfg->fc_dst = nla_get_be32(attr);
 572                        break;
 573                case RTA_OIF:
 574                        cfg->fc_oif = nla_get_u32(attr);
 575                        break;
 576                case RTA_GATEWAY:
 577                        cfg->fc_gw = nla_get_be32(attr);
 578                        break;
 579                case RTA_PRIORITY:
 580                        cfg->fc_priority = nla_get_u32(attr);
 581                        break;
 582                case RTA_PREFSRC:
 583                        cfg->fc_prefsrc = nla_get_be32(attr);
 584                        break;
 585                case RTA_METRICS:
 586                        cfg->fc_mx = nla_data(attr);
 587                        cfg->fc_mx_len = nla_len(attr);
 588                        break;
 589                case RTA_MULTIPATH:
 590                        cfg->fc_mp = nla_data(attr);
 591                        cfg->fc_mp_len = nla_len(attr);
 592                        break;
 593                case RTA_FLOW:
 594                        cfg->fc_flow = nla_get_u32(attr);
 595                        break;
 596                case RTA_TABLE:
 597                        cfg->fc_table = nla_get_u32(attr);
 598                        break;
 599                }
 600        }
 601
 602        return 0;
 603errout:
 604        return err;
 605}
 606
 607static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
 608{
 609        struct net *net = sock_net(skb->sk);
 610        struct fib_config cfg;
 611        struct fib_table *tb;
 612        int err;
 613
 614        err = rtm_to_fib_config(net, skb, nlh, &cfg);
 615        if (err < 0)
 616                goto errout;
 617
 618        tb = fib_get_table(net, cfg.fc_table);
 619        if (tb == NULL) {
 620                err = -ESRCH;
 621                goto errout;
 622        }
 623
 624        err = fib_table_delete(tb, &cfg);
 625errout:
 626        return err;
 627}
 628
 629static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
 630{
 631        struct net *net = sock_net(skb->sk);
 632        struct fib_config cfg;
 633        struct fib_table *tb;
 634        int err;
 635
 636        err = rtm_to_fib_config(net, skb, nlh, &cfg);
 637        if (err < 0)
 638                goto errout;
 639
 640        tb = fib_new_table(net, cfg.fc_table);
 641        if (tb == NULL) {
 642                err = -ENOBUFS;
 643                goto errout;
 644        }
 645
 646        err = fib_table_insert(tb, &cfg);
 647errout:
 648        return err;
 649}
 650
 651static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 652{
 653        struct net *net = sock_net(skb->sk);
 654        unsigned int h, s_h;
 655        unsigned int e = 0, s_e;
 656        struct fib_table *tb;
 657        struct hlist_head *head;
 658        int dumped = 0;
 659
 660        if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) &&
 661            ((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED)
 662                return skb->len;
 663
 664        s_h = cb->args[0];
 665        s_e = cb->args[1];
 666
 667        for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
 668                e = 0;
 669                head = &net->ipv4.fib_table_hash[h];
 670                hlist_for_each_entry(tb, head, tb_hlist) {
 671                        if (e < s_e)
 672                                goto next;
 673                        if (dumped)
 674                                memset(&cb->args[2], 0, sizeof(cb->args) -
 675                                                 2 * sizeof(cb->args[0]));
 676                        if (fib_table_dump(tb, skb, cb) < 0)
 677                                goto out;
 678                        dumped = 1;
 679next:
 680                        e++;
 681                }
 682        }
 683out:
 684        cb->args[1] = e;
 685        cb->args[0] = h;
 686
 687        return skb->len;
 688}
 689
 690/* Prepare and feed intra-kernel routing request.
 691 * Really, it should be netlink message, but :-( netlink
 692 * can be not configured, so that we feed it directly
 693 * to fib engine. It is legal, because all events occur
 694 * only when netlink is already locked.
 695 */
 696static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
 697{
 698        struct net *net = dev_net(ifa->ifa_dev->dev);
 699        struct fib_table *tb;
 700        struct fib_config cfg = {
 701                .fc_protocol = RTPROT_KERNEL,
 702                .fc_type = type,
 703                .fc_dst = dst,
 704                .fc_dst_len = dst_len,
 705                .fc_prefsrc = ifa->ifa_local,
 706                .fc_oif = ifa->ifa_dev->dev->ifindex,
 707                .fc_nlflags = NLM_F_CREATE | NLM_F_APPEND,
 708                .fc_nlinfo = {
 709                        .nl_net = net,
 710                },
 711        };
 712
 713        if (type == RTN_UNICAST)
 714                tb = fib_new_table(net, RT_TABLE_MAIN);
 715        else
 716                tb = fib_new_table(net, RT_TABLE_LOCAL);
 717
 718        if (tb == NULL)
 719                return;
 720
 721        cfg.fc_table = tb->tb_id;
 722
 723        if (type != RTN_LOCAL)
 724                cfg.fc_scope = RT_SCOPE_LINK;
 725        else
 726                cfg.fc_scope = RT_SCOPE_HOST;
 727
 728        if (cmd == RTM_NEWROUTE)
 729                fib_table_insert(tb, &cfg);
 730        else
 731                fib_table_delete(tb, &cfg);
 732}
 733
 734void fib_add_ifaddr(struct in_ifaddr *ifa)
 735{
 736        struct in_device *in_dev = ifa->ifa_dev;
 737        struct net_device *dev = in_dev->dev;
 738        struct in_ifaddr *prim = ifa;
 739        __be32 mask = ifa->ifa_mask;
 740        __be32 addr = ifa->ifa_local;
 741        __be32 prefix = ifa->ifa_address & mask;
 742
 743        if (ifa->ifa_flags & IFA_F_SECONDARY) {
 744                prim = inet_ifa_byprefix(in_dev, prefix, mask);
 745                if (prim == NULL) {
 746                        pr_warn("%s: bug: prim == NULL\n", __func__);
 747                        return;
 748                }
 749        }
 750
 751        fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);
 752
 753        if (!(dev->flags & IFF_UP))
 754                return;
 755
 756        /* Add broadcast address, if it is explicitly assigned. */
 757        if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
 758                fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
 759
 760        if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
 761            (prefix != addr || ifa->ifa_prefixlen < 32)) {
 762                fib_magic(RTM_NEWROUTE,
 763                          dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
 764                          prefix, ifa->ifa_prefixlen, prim);
 765
 766                /* Add network specific broadcasts, when it takes a sense */
 767                if (ifa->ifa_prefixlen < 31) {
 768                        fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
 769                        fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
 770                                  32, prim);
 771                }
 772        }
 773}
 774
 775/* Delete primary or secondary address.
 776 * Optionally, on secondary address promotion consider the addresses
 777 * from subnet iprim as deleted, even if they are in device list.
 778 * In this case the secondary ifa can be in device list.
 779 */
 780void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
 781{
 782        struct in_device *in_dev = ifa->ifa_dev;
 783        struct net_device *dev = in_dev->dev;
 784        struct in_ifaddr *ifa1;
 785        struct in_ifaddr *prim = ifa, *prim1 = NULL;
 786        __be32 brd = ifa->ifa_address | ~ifa->ifa_mask;
 787        __be32 any = ifa->ifa_address & ifa->ifa_mask;
 788#define LOCAL_OK        1
 789#define BRD_OK          2
 790#define BRD0_OK         4
 791#define BRD1_OK         8
 792        unsigned int ok = 0;
 793        int subnet = 0;         /* Primary network */
 794        int gone = 1;           /* Address is missing */
 795        int same_prefsrc = 0;   /* Another primary with same IP */
 796
 797        if (ifa->ifa_flags & IFA_F_SECONDARY) {
 798                prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
 799                if (prim == NULL) {
 800                        pr_warn("%s: bug: prim == NULL\n", __func__);
 801                        return;
 802                }
 803                if (iprim && iprim != prim) {
 804                        pr_warn("%s: bug: iprim != prim\n", __func__);
 805                        return;
 806                }
 807        } else if (!ipv4_is_zeronet(any) &&
 808                   (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) {
 809                fib_magic(RTM_DELROUTE,
 810                          dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
 811                          any, ifa->ifa_prefixlen, prim);
 812                subnet = 1;
 813        }
 814
 815        /* Deletion is more complicated than add.
 816         * We should take care of not to delete too much :-)
 817         *
 818         * Scan address list to be sure that addresses are really gone.
 819         */
 820
 821        for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
 822                if (ifa1 == ifa) {
 823                        /* promotion, keep the IP */
 824                        gone = 0;
 825                        continue;
 826                }
 827                /* Ignore IFAs from our subnet */
 828                if (iprim && ifa1->ifa_mask == iprim->ifa_mask &&
 829                    inet_ifa_match(ifa1->ifa_address, iprim))
 830                        continue;
 831
 832                /* Ignore ifa1 if it uses different primary IP (prefsrc) */
 833                if (ifa1->ifa_flags & IFA_F_SECONDARY) {
 834                        /* Another address from our subnet? */
 835                        if (ifa1->ifa_mask == prim->ifa_mask &&
 836                            inet_ifa_match(ifa1->ifa_address, prim))
 837                                prim1 = prim;
 838                        else {
 839                                /* We reached the secondaries, so
 840                                 * same_prefsrc should be determined.
 841                                 */
 842                                if (!same_prefsrc)
 843                                        continue;
 844                                /* Search new prim1 if ifa1 is not
 845                                 * using the current prim1
 846                                 */
 847                                if (!prim1 ||
 848                                    ifa1->ifa_mask != prim1->ifa_mask ||
 849                                    !inet_ifa_match(ifa1->ifa_address, prim1))
 850                                        prim1 = inet_ifa_byprefix(in_dev,
 851                                                        ifa1->ifa_address,
 852                                                        ifa1->ifa_mask);
 853                                if (!prim1)
 854                                        continue;
 855                                if (prim1->ifa_local != prim->ifa_local)
 856                                        continue;
 857                        }
 858                } else {
 859                        if (prim->ifa_local != ifa1->ifa_local)
 860                                continue;
 861                        prim1 = ifa1;
 862                        if (prim != prim1)
 863                                same_prefsrc = 1;
 864                }
 865                if (ifa->ifa_local == ifa1->ifa_local)
 866                        ok |= LOCAL_OK;
 867                if (ifa->ifa_broadcast == ifa1->ifa_broadcast)
 868                        ok |= BRD_OK;
 869                if (brd == ifa1->ifa_broadcast)
 870                        ok |= BRD1_OK;
 871                if (any == ifa1->ifa_broadcast)
 872                        ok |= BRD0_OK;
 873                /* primary has network specific broadcasts */
 874                if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) {
 875                        __be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask;
 876                        __be32 any1 = ifa1->ifa_address & ifa1->ifa_mask;
 877
 878                        if (!ipv4_is_zeronet(any1)) {
 879                                if (ifa->ifa_broadcast == brd1 ||
 880                                    ifa->ifa_broadcast == any1)
 881                                        ok |= BRD_OK;
 882                                if (brd == brd1 || brd == any1)
 883                                        ok |= BRD1_OK;
 884                                if (any == brd1 || any == any1)
 885                                        ok |= BRD0_OK;
 886                        }
 887                }
 888        }
 889
 890        if (!(ok & BRD_OK))
 891                fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
 892        if (subnet && ifa->ifa_prefixlen < 31) {
 893                if (!(ok & BRD1_OK))
 894                        fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
 895                if (!(ok & BRD0_OK))
 896                        fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
 897        }
 898        if (!(ok & LOCAL_OK)) {
 899                fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
 900
 901                /* Check, that this local address finally disappeared. */
 902                if (gone &&
 903                    inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) {
 904                        /* And the last, but not the least thing.
 905                         * We must flush stray FIB entries.
 906                         *
 907                         * First of all, we scan fib_info list searching
 908                         * for stray nexthop entries, then ignite fib_flush.
 909                         */
 910                        if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local))
 911                                fib_flush(dev_net(dev));
 912                }
 913        }
 914#undef LOCAL_OK
 915#undef BRD_OK
 916#undef BRD0_OK
 917#undef BRD1_OK
 918}
 919
 920static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
 921{
 922
 923        struct fib_result       res;
 924        struct flowi4           fl4 = {
 925                .flowi4_mark = frn->fl_mark,
 926                .daddr = frn->fl_addr,
 927                .flowi4_tos = frn->fl_tos,
 928                .flowi4_scope = frn->fl_scope,
 929        };
 930
 931        frn->err = -ENOENT;
 932        if (tb) {
 933                local_bh_disable();
 934
 935                frn->tb_id = tb->tb_id;
 936                frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
 937
 938                if (!frn->err) {
 939                        frn->prefixlen = res.prefixlen;
 940                        frn->nh_sel = res.nh_sel;
 941                        frn->type = res.type;
 942                        frn->scope = res.scope;
 943                }
 944                local_bh_enable();
 945        }
 946}
 947
 948static void nl_fib_input(struct sk_buff *skb)
 949{
 950        struct net *net;
 951        struct fib_result_nl *frn;
 952        struct nlmsghdr *nlh;
 953        struct fib_table *tb;
 954        u32 portid;
 955
 956        net = sock_net(skb->sk);
 957        nlh = nlmsg_hdr(skb);
 958        if (skb->len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len ||
 959            nlmsg_len(nlh) < sizeof(*frn))
 960                return;
 961
 962        skb = netlink_skb_clone(skb, GFP_KERNEL);
 963        if (skb == NULL)
 964                return;
 965        nlh = nlmsg_hdr(skb);
 966
 967        frn = (struct fib_result_nl *) nlmsg_data(nlh);
 968        tb = fib_get_table(net, frn->tb_id_in);
 969
 970        nl_fib_lookup(frn, tb);
 971
 972        portid = NETLINK_CB(skb).portid;      /* netlink portid */
 973        NETLINK_CB(skb).portid = 0;        /* from kernel */
 974        NETLINK_CB(skb).dst_group = 0;  /* unicast */
 975        netlink_unicast(net->ipv4.fibnl, skb, portid, MSG_DONTWAIT);
 976}
 977
 978static int __net_init nl_fib_lookup_init(struct net *net)
 979{
 980        struct sock *sk;
 981        struct netlink_kernel_cfg cfg = {
 982                .input  = nl_fib_input,
 983        };
 984
 985        sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, &cfg);
 986        if (sk == NULL)
 987                return -EAFNOSUPPORT;
 988        net->ipv4.fibnl = sk;
 989        return 0;
 990}
 991
 992static void nl_fib_lookup_exit(struct net *net)
 993{
 994        netlink_kernel_release(net->ipv4.fibnl);
 995        net->ipv4.fibnl = NULL;
 996}
 997
 998static void fib_disable_ip(struct net_device *dev, int force)
 999{
1000        if (fib_sync_down_dev(dev, force))
1001                fib_flush(dev_net(dev));
1002        rt_cache_flush(dev_net(dev));
1003        arp_ifdown(dev);
1004}
1005
1006static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
1007{
1008        struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
1009        struct net_device *dev = ifa->ifa_dev->dev;
1010        struct net *net = dev_net(dev);
1011
1012        switch (event) {
1013        case NETDEV_UP:
1014                fib_add_ifaddr(ifa);
1015#ifdef CONFIG_IP_ROUTE_MULTIPATH
1016                fib_sync_up(dev);
1017#endif
1018                atomic_inc(&net->ipv4.dev_addr_genid);
1019                rt_cache_flush(dev_net(dev));
1020                break;
1021        case NETDEV_DOWN:
1022                fib_del_ifaddr(ifa, NULL);
1023                atomic_inc(&net->ipv4.dev_addr_genid);
1024                if (ifa->ifa_dev->ifa_list == NULL) {
1025                        /* Last address was deleted from this interface.
1026                         * Disable IP.
1027                         */
1028                        fib_disable_ip(dev, 1);
1029                } else {
1030                        rt_cache_flush(dev_net(dev));
1031                }
1032                break;
1033        }
1034        return NOTIFY_DONE;
1035}
1036
1037static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
1038{
1039        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1040        struct in_device *in_dev;
1041        struct net *net = dev_net(dev);
1042
1043        if (event == NETDEV_UNREGISTER) {
1044                fib_disable_ip(dev, 2);
1045                rt_flush_dev(dev);
1046                return NOTIFY_DONE;
1047        }
1048
1049        in_dev = __in_dev_get_rtnl(dev);
1050        if (!in_dev)
1051                return NOTIFY_DONE;
1052
1053        switch (event) {
1054        case NETDEV_UP:
1055                for_ifa(in_dev) {
1056                        fib_add_ifaddr(ifa);
1057                } endfor_ifa(in_dev);
1058#ifdef CONFIG_IP_ROUTE_MULTIPATH
1059                fib_sync_up(dev);
1060#endif
1061                atomic_inc(&net->ipv4.dev_addr_genid);
1062                rt_cache_flush(net);
1063                break;
1064        case NETDEV_DOWN:
1065                fib_disable_ip(dev, 0);
1066                break;
1067        case NETDEV_CHANGEMTU:
1068        case NETDEV_CHANGE:
1069                rt_cache_flush(net);
1070                break;
1071        }
1072        return NOTIFY_DONE;
1073}
1074
1075static struct notifier_block fib_inetaddr_notifier = {
1076        .notifier_call = fib_inetaddr_event,
1077};
1078
1079static struct notifier_block fib_netdev_notifier = {
1080        .notifier_call = fib_netdev_event,
1081};
1082
1083static int __net_init ip_fib_net_init(struct net *net)
1084{
1085        int err;
1086        size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;
1087
1088        /* Avoid false sharing : Use at least a full cache line */
1089        size = max_t(size_t, size, L1_CACHE_BYTES);
1090
1091        net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL);
1092        if (net->ipv4.fib_table_hash == NULL)
1093                return -ENOMEM;
1094
1095        err = fib4_rules_init(net);
1096        if (err < 0)
1097                goto fail;
1098        return 0;
1099
1100fail:
1101        kfree(net->ipv4.fib_table_hash);
1102        return err;
1103}
1104
1105static void ip_fib_net_exit(struct net *net)
1106{
1107        unsigned int i;
1108
1109#ifdef CONFIG_IP_MULTIPLE_TABLES
1110        fib4_rules_exit(net);
1111#endif
1112
1113        rtnl_lock();
1114        for (i = 0; i < FIB_TABLE_HASHSZ; i++) {
1115                struct fib_table *tb;
1116                struct hlist_head *head;
1117                struct hlist_node *tmp;
1118
1119                head = &net->ipv4.fib_table_hash[i];
1120                hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) {
1121                        hlist_del(&tb->tb_hlist);
1122                        fib_table_flush(tb);
1123                        fib_free_table(tb);
1124                }
1125        }
1126        rtnl_unlock();
1127        kfree(net->ipv4.fib_table_hash);
1128}
1129
1130static int __net_init fib_net_init(struct net *net)
1131{
1132        int error;
1133
1134#ifdef CONFIG_IP_ROUTE_CLASSID
1135        net->ipv4.fib_num_tclassid_users = 0;
1136#endif
1137        error = ip_fib_net_init(net);
1138        if (error < 0)
1139                goto out;
1140        error = nl_fib_lookup_init(net);
1141        if (error < 0)
1142                goto out_nlfl;
1143        error = fib_proc_init(net);
1144        if (error < 0)
1145                goto out_proc;
1146out:
1147        return error;
1148
1149out_proc:
1150        nl_fib_lookup_exit(net);
1151out_nlfl:
1152        ip_fib_net_exit(net);
1153        goto out;
1154}
1155
1156static void __net_exit fib_net_exit(struct net *net)
1157{
1158        fib_proc_exit(net);
1159        nl_fib_lookup_exit(net);
1160        ip_fib_net_exit(net);
1161}
1162
1163static struct pernet_operations fib_net_ops = {
1164        .init = fib_net_init,
1165        .exit = fib_net_exit,
1166};
1167
1168void __init ip_fib_init(void)
1169{
1170        rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL);
1171        rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL);
1172        rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL);
1173
1174        register_pernet_subsys(&fib_net_ops);
1175        register_netdevice_notifier(&fib_netdev_notifier);
1176        register_inetaddr_notifier(&fib_inetaddr_notifier);
1177
1178        fib_trie_init();
1179}
1180