linux/net/ipv4/fib_frontend.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4 *              operating system.  INET is implemented using the  BSD Socket
   5 *              interface as the means of communication with the user level.
   6 *
   7 *              IPv4 Forwarding Information Base: FIB frontend.
   8 *
   9 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10 */
  11
  12#include <linux/module.h>
  13#include <linux/uaccess.h>
  14#include <linux/bitops.h>
  15#include <linux/capability.h>
  16#include <linux/types.h>
  17#include <linux/kernel.h>
  18#include <linux/mm.h>
  19#include <linux/string.h>
  20#include <linux/socket.h>
  21#include <linux/sockios.h>
  22#include <linux/errno.h>
  23#include <linux/in.h>
  24#include <linux/inet.h>
  25#include <linux/inetdevice.h>
  26#include <linux/netdevice.h>
  27#include <linux/if_addr.h>
  28#include <linux/if_arp.h>
  29#include <linux/skbuff.h>
  30#include <linux/cache.h>
  31#include <linux/init.h>
  32#include <linux/list.h>
  33#include <linux/slab.h>
  34
  35#include <net/ip.h>
  36#include <net/protocol.h>
  37#include <net/route.h>
  38#include <net/tcp.h>
  39#include <net/sock.h>
  40#include <net/arp.h>
  41#include <net/ip_fib.h>
  42#include <net/rtnetlink.h>
  43#include <net/xfrm.h>
  44#include <net/l3mdev.h>
  45#include <net/lwtunnel.h>
  46#include <trace/events/fib.h>
  47
  48#ifndef CONFIG_IP_MULTIPLE_TABLES
  49
  50static int __net_init fib4_rules_init(struct net *net)
  51{
  52        struct fib_table *local_table, *main_table;
  53
  54        main_table  = fib_trie_table(RT_TABLE_MAIN, NULL);
  55        if (!main_table)
  56                return -ENOMEM;
  57
  58        local_table = fib_trie_table(RT_TABLE_LOCAL, main_table);
  59        if (!local_table)
  60                goto fail;
  61
  62        hlist_add_head_rcu(&local_table->tb_hlist,
  63                                &net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX]);
  64        hlist_add_head_rcu(&main_table->tb_hlist,
  65                                &net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]);
  66        return 0;
  67
  68fail:
  69        fib_free_table(main_table);
  70        return -ENOMEM;
  71}
  72
  73static bool fib4_has_custom_rules(struct net *net)
  74{
  75        return false;
  76}
  77#else
  78
  79struct fib_table *fib_new_table(struct net *net, u32 id)
  80{
  81        struct fib_table *tb, *alias = NULL;
  82        unsigned int h;
  83
  84        if (id == 0)
  85                id = RT_TABLE_MAIN;
  86        tb = fib_get_table(net, id);
  87        if (tb)
  88                return tb;
  89
  90        if (id == RT_TABLE_LOCAL && !net->ipv4.fib_has_custom_rules)
  91                alias = fib_new_table(net, RT_TABLE_MAIN);
  92
  93        tb = fib_trie_table(id, alias);
  94        if (!tb)
  95                return NULL;
  96
  97        switch (id) {
  98        case RT_TABLE_MAIN:
  99                rcu_assign_pointer(net->ipv4.fib_main, tb);
 100                break;
 101        case RT_TABLE_DEFAULT:
 102                rcu_assign_pointer(net->ipv4.fib_default, tb);
 103                break;
 104        default:
 105                break;
 106        }
 107
 108        h = id & (FIB_TABLE_HASHSZ - 1);
 109        hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]);
 110        return tb;
 111}
 112EXPORT_SYMBOL_GPL(fib_new_table);
 113
 114/* caller must hold either rtnl or rcu read lock */
 115struct fib_table *fib_get_table(struct net *net, u32 id)
 116{
 117        struct fib_table *tb;
 118        struct hlist_head *head;
 119        unsigned int h;
 120
 121        if (id == 0)
 122                id = RT_TABLE_MAIN;
 123        h = id & (FIB_TABLE_HASHSZ - 1);
 124
 125        head = &net->ipv4.fib_table_hash[h];
 126        hlist_for_each_entry_rcu(tb, head, tb_hlist) {
 127                if (tb->tb_id == id)
 128                        return tb;
 129        }
 130        return NULL;
 131}
 132
 133static bool fib4_has_custom_rules(struct net *net)
 134{
 135        return net->ipv4.fib_has_custom_rules;
 136}
 137#endif /* CONFIG_IP_MULTIPLE_TABLES */
 138
 139static void fib_replace_table(struct net *net, struct fib_table *old,
 140                              struct fib_table *new)
 141{
 142#ifdef CONFIG_IP_MULTIPLE_TABLES
 143        switch (new->tb_id) {
 144        case RT_TABLE_MAIN:
 145                rcu_assign_pointer(net->ipv4.fib_main, new);
 146                break;
 147        case RT_TABLE_DEFAULT:
 148                rcu_assign_pointer(net->ipv4.fib_default, new);
 149                break;
 150        default:
 151                break;
 152        }
 153
 154#endif
 155        /* replace the old table in the hlist */
 156        hlist_replace_rcu(&old->tb_hlist, &new->tb_hlist);
 157}
 158
 159int fib_unmerge(struct net *net)
 160{
 161        struct fib_table *old, *new, *main_table;
 162
 163        /* attempt to fetch local table if it has been allocated */
 164        old = fib_get_table(net, RT_TABLE_LOCAL);
 165        if (!old)
 166                return 0;
 167
 168        new = fib_trie_unmerge(old);
 169        if (!new)
 170                return -ENOMEM;
 171
 172        /* table is already unmerged */
 173        if (new == old)
 174                return 0;
 175
 176        /* replace merged table with clean table */
 177        fib_replace_table(net, old, new);
 178        fib_free_table(old);
 179
 180        /* attempt to fetch main table if it has been allocated */
 181        main_table = fib_get_table(net, RT_TABLE_MAIN);
 182        if (!main_table)
 183                return 0;
 184
 185        /* flush local entries from main table */
 186        fib_table_flush_external(main_table);
 187
 188        return 0;
 189}
 190
 191static void fib_flush(struct net *net)
 192{
 193        int flushed = 0;
 194        unsigned int h;
 195
 196        for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
 197                struct hlist_head *head = &net->ipv4.fib_table_hash[h];
 198                struct hlist_node *tmp;
 199                struct fib_table *tb;
 200
 201                hlist_for_each_entry_safe(tb, tmp, head, tb_hlist)
 202                        flushed += fib_table_flush(net, tb, false);
 203        }
 204
 205        if (flushed)
 206                rt_cache_flush(net);
 207}
 208
 209/*
 210 * Find address type as if only "dev" was present in the system. If
 211 * on_dev is NULL then all interfaces are taken into consideration.
 212 */
 213static inline unsigned int __inet_dev_addr_type(struct net *net,
 214                                                const struct net_device *dev,
 215                                                __be32 addr, u32 tb_id)
 216{
 217        struct flowi4           fl4 = { .daddr = addr };
 218        struct fib_result       res;
 219        unsigned int ret = RTN_BROADCAST;
 220        struct fib_table *table;
 221
 222        if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
 223                return RTN_BROADCAST;
 224        if (ipv4_is_multicast(addr))
 225                return RTN_MULTICAST;
 226
 227        rcu_read_lock();
 228
 229        table = fib_get_table(net, tb_id);
 230        if (table) {
 231                ret = RTN_UNICAST;
 232                if (!fib_table_lookup(table, &fl4, &res, FIB_LOOKUP_NOREF)) {
 233                        if (!dev || dev == res.fi->fib_dev)
 234                                ret = res.type;
 235                }
 236        }
 237
 238        rcu_read_unlock();
 239        return ret;
 240}
 241
 242unsigned int inet_addr_type_table(struct net *net, __be32 addr, u32 tb_id)
 243{
 244        return __inet_dev_addr_type(net, NULL, addr, tb_id);
 245}
 246EXPORT_SYMBOL(inet_addr_type_table);
 247
 248unsigned int inet_addr_type(struct net *net, __be32 addr)
 249{
 250        return __inet_dev_addr_type(net, NULL, addr, RT_TABLE_LOCAL);
 251}
 252EXPORT_SYMBOL(inet_addr_type);
 253
 254unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
 255                                __be32 addr)
 256{
 257        u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL;
 258
 259        return __inet_dev_addr_type(net, dev, addr, rt_table);
 260}
 261EXPORT_SYMBOL(inet_dev_addr_type);
 262
 263/* inet_addr_type with dev == NULL but using the table from a dev
 264 * if one is associated
 265 */
 266unsigned int inet_addr_type_dev_table(struct net *net,
 267                                      const struct net_device *dev,
 268                                      __be32 addr)
 269{
 270        u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL;
 271
 272        return __inet_dev_addr_type(net, NULL, addr, rt_table);
 273}
 274EXPORT_SYMBOL(inet_addr_type_dev_table);
 275
 276__be32 fib_compute_spec_dst(struct sk_buff *skb)
 277{
 278        struct net_device *dev = skb->dev;
 279        struct in_device *in_dev;
 280        struct fib_result res;
 281        struct rtable *rt;
 282        struct net *net;
 283        int scope;
 284
 285        rt = skb_rtable(skb);
 286        if ((rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL)) ==
 287            RTCF_LOCAL)
 288                return ip_hdr(skb)->daddr;
 289
 290        in_dev = __in_dev_get_rcu(dev);
 291
 292        net = dev_net(dev);
 293
 294        scope = RT_SCOPE_UNIVERSE;
 295        if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
 296                bool vmark = in_dev && IN_DEV_SRC_VMARK(in_dev);
 297                struct flowi4 fl4 = {
 298                        .flowi4_iif = LOOPBACK_IFINDEX,
 299                        .flowi4_oif = l3mdev_master_ifindex_rcu(dev),
 300                        .daddr = ip_hdr(skb)->saddr,
 301                        .flowi4_tos = RT_TOS(ip_hdr(skb)->tos),
 302                        .flowi4_scope = scope,
 303                        .flowi4_mark = vmark ? skb->mark : 0,
 304                };
 305                if (!fib_lookup(net, &fl4, &res, 0))
 306                        return fib_result_prefsrc(net, &res);
 307        } else {
 308                scope = RT_SCOPE_LINK;
 309        }
 310
 311        return inet_select_addr(dev, ip_hdr(skb)->saddr, scope);
 312}
 313
 314bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev)
 315{
 316        bool dev_match = false;
 317#ifdef CONFIG_IP_ROUTE_MULTIPATH
 318        int ret;
 319
 320        for (ret = 0; ret < fi->fib_nhs; ret++) {
 321                struct fib_nh *nh = &fi->fib_nh[ret];
 322
 323                if (nh->fib_nh_dev == dev) {
 324                        dev_match = true;
 325                        break;
 326                } else if (l3mdev_master_ifindex_rcu(nh->fib_nh_dev) == dev->ifindex) {
 327                        dev_match = true;
 328                        break;
 329                }
 330        }
 331#else
 332        if (fi->fib_nh[0].fib_nh_dev == dev)
 333                dev_match = true;
 334#endif
 335
 336        return dev_match;
 337}
 338EXPORT_SYMBOL_GPL(fib_info_nh_uses_dev);
 339
 340/* Given (packet source, input interface) and optional (dst, oif, tos):
 341 * - (main) check, that source is valid i.e. not broadcast or our local
 342 *   address.
 343 * - figure out what "logical" interface this packet arrived
 344 *   and calculate "specific destination" address.
 345 * - check, that packet arrived from expected physical interface.
 346 * called with rcu_read_lock()
 347 */
 348static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 349                                 u8 tos, int oif, struct net_device *dev,
 350                                 int rpf, struct in_device *idev, u32 *itag)
 351{
 352        struct net *net = dev_net(dev);
 353        struct flow_keys flkeys;
 354        int ret, no_addr;
 355        struct fib_result res;
 356        struct flowi4 fl4;
 357        bool dev_match;
 358
 359        fl4.flowi4_oif = 0;
 360        fl4.flowi4_iif = l3mdev_master_ifindex_rcu(dev);
 361        if (!fl4.flowi4_iif)
 362                fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX;
 363        fl4.daddr = src;
 364        fl4.saddr = dst;
 365        fl4.flowi4_tos = tos;
 366        fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
 367        fl4.flowi4_tun_key.tun_id = 0;
 368        fl4.flowi4_flags = 0;
 369        fl4.flowi4_uid = sock_net_uid(net, NULL);
 370
 371        no_addr = idev->ifa_list == NULL;
 372
 373        fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0;
 374        if (!fib4_rules_early_flow_dissect(net, skb, &fl4, &flkeys)) {
 375                fl4.flowi4_proto = 0;
 376                fl4.fl4_sport = 0;
 377                fl4.fl4_dport = 0;
 378        }
 379
 380        if (fib_lookup(net, &fl4, &res, 0))
 381                goto last_resort;
 382        if (res.type != RTN_UNICAST &&
 383            (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev)))
 384                goto e_inval;
 385        fib_combine_itag(itag, &res);
 386
 387        dev_match = fib_info_nh_uses_dev(res.fi, dev);
 388        if (dev_match) {
 389                ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST;
 390                return ret;
 391        }
 392        if (no_addr)
 393                goto last_resort;
 394        if (rpf == 1)
 395                goto e_rpf;
 396        fl4.flowi4_oif = dev->ifindex;
 397
 398        ret = 0;
 399        if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) {
 400                if (res.type == RTN_UNICAST)
 401                        ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST;
 402        }
 403        return ret;
 404
 405last_resort:
 406        if (rpf)
 407                goto e_rpf;
 408        *itag = 0;
 409        return 0;
 410
 411e_inval:
 412        return -EINVAL;
 413e_rpf:
 414        return -EXDEV;
 415}
 416
 417/* Ignore rp_filter for packets protected by IPsec. */
 418int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 419                        u8 tos, int oif, struct net_device *dev,
 420                        struct in_device *idev, u32 *itag)
 421{
 422        int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev);
 423        struct net *net = dev_net(dev);
 424
 425        if (!r && !fib_num_tclassid_users(net) &&
 426            (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) {
 427                if (IN_DEV_ACCEPT_LOCAL(idev))
 428                        goto ok;
 429                /* with custom local routes in place, checking local addresses
 430                 * only will be too optimistic, with custom rules, checking
 431                 * local addresses only can be too strict, e.g. due to vrf
 432                 */
 433                if (net->ipv4.fib_has_custom_local_routes ||
 434                    fib4_has_custom_rules(net))
 435                        goto full_check;
 436                if (inet_lookup_ifaddr_rcu(net, src))
 437                        return -EINVAL;
 438
 439ok:
 440                *itag = 0;
 441                return 0;
 442        }
 443
 444full_check:
 445        return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag);
 446}
 447
 448static inline __be32 sk_extract_addr(struct sockaddr *addr)
 449{
 450        return ((struct sockaddr_in *) addr)->sin_addr.s_addr;
 451}
 452
 453static int put_rtax(struct nlattr *mx, int len, int type, u32 value)
 454{
 455        struct nlattr *nla;
 456
 457        nla = (struct nlattr *) ((char *) mx + len);
 458        nla->nla_type = type;
 459        nla->nla_len = nla_attr_size(4);
 460        *(u32 *) nla_data(nla) = value;
 461
 462        return len + nla_total_size(4);
 463}
 464
 465static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
 466                                 struct fib_config *cfg)
 467{
 468        __be32 addr;
 469        int plen;
 470
 471        memset(cfg, 0, sizeof(*cfg));
 472        cfg->fc_nlinfo.nl_net = net;
 473
 474        if (rt->rt_dst.sa_family != AF_INET)
 475                return -EAFNOSUPPORT;
 476
 477        /*
 478         * Check mask for validity:
 479         * a) it must be contiguous.
 480         * b) destination must have all host bits clear.
 481         * c) if application forgot to set correct family (AF_INET),
 482         *    reject request unless it is absolutely clear i.e.
 483         *    both family and mask are zero.
 484         */
 485        plen = 32;
 486        addr = sk_extract_addr(&rt->rt_dst);
 487        if (!(rt->rt_flags & RTF_HOST)) {
 488                __be32 mask = sk_extract_addr(&rt->rt_genmask);
 489
 490                if (rt->rt_genmask.sa_family != AF_INET) {
 491                        if (mask || rt->rt_genmask.sa_family)
 492                                return -EAFNOSUPPORT;
 493                }
 494
 495                if (bad_mask(mask, addr))
 496                        return -EINVAL;
 497
 498                plen = inet_mask_len(mask);
 499        }
 500
 501        cfg->fc_dst_len = plen;
 502        cfg->fc_dst = addr;
 503
 504        if (cmd != SIOCDELRT) {
 505                cfg->fc_nlflags = NLM_F_CREATE;
 506                cfg->fc_protocol = RTPROT_BOOT;
 507        }
 508
 509        if (rt->rt_metric)
 510                cfg->fc_priority = rt->rt_metric - 1;
 511
 512        if (rt->rt_flags & RTF_REJECT) {
 513                cfg->fc_scope = RT_SCOPE_HOST;
 514                cfg->fc_type = RTN_UNREACHABLE;
 515                return 0;
 516        }
 517
 518        cfg->fc_scope = RT_SCOPE_NOWHERE;
 519        cfg->fc_type = RTN_UNICAST;
 520
 521        if (rt->rt_dev) {
 522                char *colon;
 523                struct net_device *dev;
 524                char devname[IFNAMSIZ];
 525
 526                if (copy_from_user(devname, rt->rt_dev, IFNAMSIZ-1))
 527                        return -EFAULT;
 528
 529                devname[IFNAMSIZ-1] = 0;
 530                colon = strchr(devname, ':');
 531                if (colon)
 532                        *colon = 0;
 533                dev = __dev_get_by_name(net, devname);
 534                if (!dev)
 535                        return -ENODEV;
 536                cfg->fc_oif = dev->ifindex;
 537                cfg->fc_table = l3mdev_fib_table(dev);
 538                if (colon) {
 539                        struct in_ifaddr *ifa;
 540                        struct in_device *in_dev = __in_dev_get_rtnl(dev);
 541                        if (!in_dev)
 542                                return -ENODEV;
 543                        *colon = ':';
 544                        for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
 545                                if (strcmp(ifa->ifa_label, devname) == 0)
 546                                        break;
 547                        if (!ifa)
 548                                return -ENODEV;
 549                        cfg->fc_prefsrc = ifa->ifa_local;
 550                }
 551        }
 552
 553        addr = sk_extract_addr(&rt->rt_gateway);
 554        if (rt->rt_gateway.sa_family == AF_INET && addr) {
 555                unsigned int addr_type;
 556
 557                cfg->fc_gw4 = addr;
 558                cfg->fc_gw_family = AF_INET;
 559                addr_type = inet_addr_type_table(net, addr, cfg->fc_table);
 560                if (rt->rt_flags & RTF_GATEWAY &&
 561                    addr_type == RTN_UNICAST)
 562                        cfg->fc_scope = RT_SCOPE_UNIVERSE;
 563        }
 564
 565        if (cmd == SIOCDELRT)
 566                return 0;
 567
 568        if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw_family)
 569                return -EINVAL;
 570
 571        if (cfg->fc_scope == RT_SCOPE_NOWHERE)
 572                cfg->fc_scope = RT_SCOPE_LINK;
 573
 574        if (rt->rt_flags & (RTF_MTU | RTF_WINDOW | RTF_IRTT)) {
 575                struct nlattr *mx;
 576                int len = 0;
 577
 578                mx = kcalloc(3, nla_total_size(4), GFP_KERNEL);
 579                if (!mx)
 580                        return -ENOMEM;
 581
 582                if (rt->rt_flags & RTF_MTU)
 583                        len = put_rtax(mx, len, RTAX_ADVMSS, rt->rt_mtu - 40);
 584
 585                if (rt->rt_flags & RTF_WINDOW)
 586                        len = put_rtax(mx, len, RTAX_WINDOW, rt->rt_window);
 587
 588                if (rt->rt_flags & RTF_IRTT)
 589                        len = put_rtax(mx, len, RTAX_RTT, rt->rt_irtt << 3);
 590
 591                cfg->fc_mx = mx;
 592                cfg->fc_mx_len = len;
 593        }
 594
 595        return 0;
 596}
 597
 598/*
 599 * Handle IP routing ioctl calls.
 600 * These are used to manipulate the routing tables
 601 */
 602int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt)
 603{
 604        struct fib_config cfg;
 605        int err;
 606
 607        switch (cmd) {
 608        case SIOCADDRT:         /* Add a route */
 609        case SIOCDELRT:         /* Delete a route */
 610                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 611                        return -EPERM;
 612
 613                rtnl_lock();
 614                err = rtentry_to_fib_config(net, cmd, rt, &cfg);
 615                if (err == 0) {
 616                        struct fib_table *tb;
 617
 618                        if (cmd == SIOCDELRT) {
 619                                tb = fib_get_table(net, cfg.fc_table);
 620                                if (tb)
 621                                        err = fib_table_delete(net, tb, &cfg,
 622                                                               NULL);
 623                                else
 624                                        err = -ESRCH;
 625                        } else {
 626                                tb = fib_new_table(net, cfg.fc_table);
 627                                if (tb)
 628                                        err = fib_table_insert(net, tb,
 629                                                               &cfg, NULL);
 630                                else
 631                                        err = -ENOBUFS;
 632                        }
 633
 634                        /* allocated by rtentry_to_fib_config() */
 635                        kfree(cfg.fc_mx);
 636                }
 637                rtnl_unlock();
 638                return err;
 639        }
 640        return -EINVAL;
 641}
 642
 643const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
 644        [RTA_DST]               = { .type = NLA_U32 },
 645        [RTA_SRC]               = { .type = NLA_U32 },
 646        [RTA_IIF]               = { .type = NLA_U32 },
 647        [RTA_OIF]               = { .type = NLA_U32 },
 648        [RTA_GATEWAY]           = { .type = NLA_U32 },
 649        [RTA_PRIORITY]          = { .type = NLA_U32 },
 650        [RTA_PREFSRC]           = { .type = NLA_U32 },
 651        [RTA_METRICS]           = { .type = NLA_NESTED },
 652        [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
 653        [RTA_FLOW]              = { .type = NLA_U32 },
 654        [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
 655        [RTA_ENCAP]             = { .type = NLA_NESTED },
 656        [RTA_UID]               = { .type = NLA_U32 },
 657        [RTA_MARK]              = { .type = NLA_U32 },
 658        [RTA_TABLE]             = { .type = NLA_U32 },
 659        [RTA_IP_PROTO]          = { .type = NLA_U8 },
 660        [RTA_SPORT]             = { .type = NLA_U16 },
 661        [RTA_DPORT]             = { .type = NLA_U16 },
 662};
 663
 664int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla,
 665                    struct netlink_ext_ack *extack)
 666{
 667        struct rtvia *via;
 668        int alen;
 669
 670        if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr)) {
 671                NL_SET_ERR_MSG(extack, "Invalid attribute length for RTA_VIA");
 672                return -EINVAL;
 673        }
 674
 675        via = nla_data(nla);
 676        alen = nla_len(nla) - offsetof(struct rtvia, rtvia_addr);
 677
 678        switch (via->rtvia_family) {
 679        case AF_INET:
 680                if (alen != sizeof(__be32)) {
 681                        NL_SET_ERR_MSG(extack, "Invalid IPv4 address in RTA_VIA");
 682                        return -EINVAL;
 683                }
 684                cfg->fc_gw_family = AF_INET;
 685                cfg->fc_gw4 = *((__be32 *)via->rtvia_addr);
 686                break;
 687        case AF_INET6:
 688#ifdef CONFIG_IPV6
 689                if (alen != sizeof(struct in6_addr)) {
 690                        NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_VIA");
 691                        return -EINVAL;
 692                }
 693                cfg->fc_gw_family = AF_INET6;
 694                cfg->fc_gw6 = *((struct in6_addr *)via->rtvia_addr);
 695#else
 696                NL_SET_ERR_MSG(extack, "IPv6 support not enabled in kernel");
 697                return -EINVAL;
 698#endif
 699                break;
 700        default:
 701                NL_SET_ERR_MSG(extack, "Unsupported address family in RTA_VIA");
 702                return -EINVAL;
 703        }
 704
 705        return 0;
 706}
 707
 708static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
 709                             struct nlmsghdr *nlh, struct fib_config *cfg,
 710                             struct netlink_ext_ack *extack)
 711{
 712        bool has_gw = false, has_via = false;
 713        struct nlattr *attr;
 714        int err, remaining;
 715        struct rtmsg *rtm;
 716
 717        err = nlmsg_validate_deprecated(nlh, sizeof(*rtm), RTA_MAX,
 718                                        rtm_ipv4_policy, extack);
 719        if (err < 0)
 720                goto errout;
 721
 722        memset(cfg, 0, sizeof(*cfg));
 723
 724        rtm = nlmsg_data(nlh);
 725        cfg->fc_dst_len = rtm->rtm_dst_len;
 726        cfg->fc_tos = rtm->rtm_tos;
 727        cfg->fc_table = rtm->rtm_table;
 728        cfg->fc_protocol = rtm->rtm_protocol;
 729        cfg->fc_scope = rtm->rtm_scope;
 730        cfg->fc_type = rtm->rtm_type;
 731        cfg->fc_flags = rtm->rtm_flags;
 732        cfg->fc_nlflags = nlh->nlmsg_flags;
 733
 734        cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
 735        cfg->fc_nlinfo.nlh = nlh;
 736        cfg->fc_nlinfo.nl_net = net;
 737
 738        if (cfg->fc_type > RTN_MAX) {
 739                NL_SET_ERR_MSG(extack, "Invalid route type");
 740                err = -EINVAL;
 741                goto errout;
 742        }
 743
 744        nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), remaining) {
 745                switch (nla_type(attr)) {
 746                case RTA_DST:
 747                        cfg->fc_dst = nla_get_be32(attr);
 748                        break;
 749                case RTA_OIF:
 750                        cfg->fc_oif = nla_get_u32(attr);
 751                        break;
 752                case RTA_GATEWAY:
 753                        has_gw = true;
 754                        cfg->fc_gw4 = nla_get_be32(attr);
 755                        if (cfg->fc_gw4)
 756                                cfg->fc_gw_family = AF_INET;
 757                        break;
 758                case RTA_VIA:
 759                        has_via = true;
 760                        err = fib_gw_from_via(cfg, attr, extack);
 761                        if (err)
 762                                goto errout;
 763                        break;
 764                case RTA_PRIORITY:
 765                        cfg->fc_priority = nla_get_u32(attr);
 766                        break;
 767                case RTA_PREFSRC:
 768                        cfg->fc_prefsrc = nla_get_be32(attr);
 769                        break;
 770                case RTA_METRICS:
 771                        cfg->fc_mx = nla_data(attr);
 772                        cfg->fc_mx_len = nla_len(attr);
 773                        break;
 774                case RTA_MULTIPATH:
 775                        err = lwtunnel_valid_encap_type_attr(nla_data(attr),
 776                                                             nla_len(attr),
 777                                                             extack);
 778                        if (err < 0)
 779                                goto errout;
 780                        cfg->fc_mp = nla_data(attr);
 781                        cfg->fc_mp_len = nla_len(attr);
 782                        break;
 783                case RTA_FLOW:
 784                        cfg->fc_flow = nla_get_u32(attr);
 785                        break;
 786                case RTA_TABLE:
 787                        cfg->fc_table = nla_get_u32(attr);
 788                        break;
 789                case RTA_ENCAP:
 790                        cfg->fc_encap = attr;
 791                        break;
 792                case RTA_ENCAP_TYPE:
 793                        cfg->fc_encap_type = nla_get_u16(attr);
 794                        err = lwtunnel_valid_encap_type(cfg->fc_encap_type,
 795                                                        extack);
 796                        if (err < 0)
 797                                goto errout;
 798                        break;
 799                }
 800        }
 801
 802        if (has_gw && has_via) {
 803                NL_SET_ERR_MSG(extack,
 804                               "Nexthop configuration can not contain both GATEWAY and VIA");
 805                goto errout;
 806        }
 807
 808        return 0;
 809errout:
 810        return err;
 811}
 812
 813static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
 814                             struct netlink_ext_ack *extack)
 815{
 816        struct net *net = sock_net(skb->sk);
 817        struct fib_config cfg;
 818        struct fib_table *tb;
 819        int err;
 820
 821        err = rtm_to_fib_config(net, skb, nlh, &cfg, extack);
 822        if (err < 0)
 823                goto errout;
 824
 825        tb = fib_get_table(net, cfg.fc_table);
 826        if (!tb) {
 827                NL_SET_ERR_MSG(extack, "FIB table does not exist");
 828                err = -ESRCH;
 829                goto errout;
 830        }
 831
 832        err = fib_table_delete(net, tb, &cfg, extack);
 833errout:
 834        return err;
 835}
 836
 837static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
 838                             struct netlink_ext_ack *extack)
 839{
 840        struct net *net = sock_net(skb->sk);
 841        struct fib_config cfg;
 842        struct fib_table *tb;
 843        int err;
 844
 845        err = rtm_to_fib_config(net, skb, nlh, &cfg, extack);
 846        if (err < 0)
 847                goto errout;
 848
 849        tb = fib_new_table(net, cfg.fc_table);
 850        if (!tb) {
 851                err = -ENOBUFS;
 852                goto errout;
 853        }
 854
 855        err = fib_table_insert(net, tb, &cfg, extack);
 856        if (!err && cfg.fc_type == RTN_LOCAL)
 857                net->ipv4.fib_has_custom_local_routes = true;
 858errout:
 859        return err;
 860}
 861
 862int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 863                          struct fib_dump_filter *filter,
 864                          struct netlink_callback *cb)
 865{
 866        struct netlink_ext_ack *extack = cb->extack;
 867        struct nlattr *tb[RTA_MAX + 1];
 868        struct rtmsg *rtm;
 869        int err, i;
 870
 871        ASSERT_RTNL();
 872
 873        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
 874                NL_SET_ERR_MSG(extack, "Invalid header for FIB dump request");
 875                return -EINVAL;
 876        }
 877
 878        rtm = nlmsg_data(nlh);
 879        if (rtm->rtm_dst_len || rtm->rtm_src_len  || rtm->rtm_tos   ||
 880            rtm->rtm_scope) {
 881                NL_SET_ERR_MSG(extack, "Invalid values in header for FIB dump request");
 882                return -EINVAL;
 883        }
 884        if (rtm->rtm_flags & ~(RTM_F_CLONED | RTM_F_PREFIX)) {
 885                NL_SET_ERR_MSG(extack, "Invalid flags for FIB dump request");
 886                return -EINVAL;
 887        }
 888
 889        filter->dump_all_families = (rtm->rtm_family == AF_UNSPEC);
 890        filter->flags    = rtm->rtm_flags;
 891        filter->protocol = rtm->rtm_protocol;
 892        filter->rt_type  = rtm->rtm_type;
 893        filter->table_id = rtm->rtm_table;
 894
 895        err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
 896                                            rtm_ipv4_policy, extack);
 897        if (err < 0)
 898                return err;
 899
 900        for (i = 0; i <= RTA_MAX; ++i) {
 901                int ifindex;
 902
 903                if (!tb[i])
 904                        continue;
 905
 906                switch (i) {
 907                case RTA_TABLE:
 908                        filter->table_id = nla_get_u32(tb[i]);
 909                        break;
 910                case RTA_OIF:
 911                        ifindex = nla_get_u32(tb[i]);
 912                        filter->dev = __dev_get_by_index(net, ifindex);
 913                        if (!filter->dev)
 914                                return -ENODEV;
 915                        break;
 916                default:
 917                        NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request");
 918                        return -EINVAL;
 919                }
 920        }
 921
 922        if (filter->flags || filter->protocol || filter->rt_type ||
 923            filter->table_id || filter->dev) {
 924                filter->filter_set = 1;
 925                cb->answer_flags = NLM_F_DUMP_FILTERED;
 926        }
 927
 928        return 0;
 929}
 930EXPORT_SYMBOL_GPL(ip_valid_fib_dump_req);
 931
 932static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 933{
 934        const struct nlmsghdr *nlh = cb->nlh;
 935        struct net *net = sock_net(skb->sk);
 936        struct fib_dump_filter filter = {};
 937        unsigned int h, s_h;
 938        unsigned int e = 0, s_e;
 939        struct fib_table *tb;
 940        struct hlist_head *head;
 941        int dumped = 0, err;
 942
 943        if (cb->strict_check) {
 944                err = ip_valid_fib_dump_req(net, nlh, &filter, cb);
 945                if (err < 0)
 946                        return err;
 947        } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
 948                struct rtmsg *rtm = nlmsg_data(nlh);
 949
 950                filter.flags = rtm->rtm_flags & (RTM_F_PREFIX | RTM_F_CLONED);
 951        }
 952
 953        /* fib entries are never clones and ipv4 does not use prefix flag */
 954        if (filter.flags & (RTM_F_PREFIX | RTM_F_CLONED))
 955                return skb->len;
 956
 957        if (filter.table_id) {
 958                tb = fib_get_table(net, filter.table_id);
 959                if (!tb) {
 960                        if (filter.dump_all_families)
 961                                return skb->len;
 962
 963                        NL_SET_ERR_MSG(cb->extack, "ipv4: FIB table does not exist");
 964                        return -ENOENT;
 965                }
 966
 967                err = fib_table_dump(tb, skb, cb, &filter);
 968                return skb->len ? : err;
 969        }
 970
 971        s_h = cb->args[0];
 972        s_e = cb->args[1];
 973
 974        rcu_read_lock();
 975
 976        for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
 977                e = 0;
 978                head = &net->ipv4.fib_table_hash[h];
 979                hlist_for_each_entry_rcu(tb, head, tb_hlist) {
 980                        if (e < s_e)
 981                                goto next;
 982                        if (dumped)
 983                                memset(&cb->args[2], 0, sizeof(cb->args) -
 984                                                 2 * sizeof(cb->args[0]));
 985                        err = fib_table_dump(tb, skb, cb, &filter);
 986                        if (err < 0) {
 987                                if (likely(skb->len))
 988                                        goto out;
 989
 990                                goto out_err;
 991                        }
 992                        dumped = 1;
 993next:
 994                        e++;
 995                }
 996        }
 997out:
 998        err = skb->len;
 999out_err:
1000        rcu_read_unlock();
1001
1002        cb->args[1] = e;
1003        cb->args[0] = h;
1004
1005        return err;
1006}
1007
1008/* Prepare and feed intra-kernel routing request.
1009 * Really, it should be netlink message, but :-( netlink
1010 * can be not configured, so that we feed it directly
1011 * to fib engine. It is legal, because all events occur
1012 * only when netlink is already locked.
1013 */
1014static void fib_magic(int cmd, int type, __be32 dst, int dst_len,
1015                      struct in_ifaddr *ifa, u32 rt_priority)
1016{
1017        struct net *net = dev_net(ifa->ifa_dev->dev);
1018        u32 tb_id = l3mdev_fib_table(ifa->ifa_dev->dev);
1019        struct fib_table *tb;
1020        struct fib_config cfg = {
1021                .fc_protocol = RTPROT_KERNEL,
1022                .fc_type = type,
1023                .fc_dst = dst,
1024                .fc_dst_len = dst_len,
1025                .fc_priority = rt_priority,
1026                .fc_prefsrc = ifa->ifa_local,
1027                .fc_oif = ifa->ifa_dev->dev->ifindex,
1028                .fc_nlflags = NLM_F_CREATE | NLM_F_APPEND,
1029                .fc_nlinfo = {
1030                        .nl_net = net,
1031                },
1032        };
1033
1034        if (!tb_id)
1035                tb_id = (type == RTN_UNICAST) ? RT_TABLE_MAIN : RT_TABLE_LOCAL;
1036
1037        tb = fib_new_table(net, tb_id);
1038        if (!tb)
1039                return;
1040
1041        cfg.fc_table = tb->tb_id;
1042
1043        if (type != RTN_LOCAL)
1044                cfg.fc_scope = RT_SCOPE_LINK;
1045        else
1046                cfg.fc_scope = RT_SCOPE_HOST;
1047
1048        if (cmd == RTM_NEWROUTE)
1049                fib_table_insert(net, tb, &cfg, NULL);
1050        else
1051                fib_table_delete(net, tb, &cfg, NULL);
1052}
1053
1054void fib_add_ifaddr(struct in_ifaddr *ifa)
1055{
1056        struct in_device *in_dev = ifa->ifa_dev;
1057        struct net_device *dev = in_dev->dev;
1058        struct in_ifaddr *prim = ifa;
1059        __be32 mask = ifa->ifa_mask;
1060        __be32 addr = ifa->ifa_local;
1061        __be32 prefix = ifa->ifa_address & mask;
1062
1063        if (ifa->ifa_flags & IFA_F_SECONDARY) {
1064                prim = inet_ifa_byprefix(in_dev, prefix, mask);
1065                if (!prim) {
1066                        pr_warn("%s: bug: prim == NULL\n", __func__);
1067                        return;
1068                }
1069        }
1070
1071        fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim, 0);
1072
1073        if (!(dev->flags & IFF_UP))
1074                return;
1075
1076        /* Add broadcast address, if it is explicitly assigned. */
1077        if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
1078                fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32,
1079                          prim, 0);
1080
1081        if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
1082            (prefix != addr || ifa->ifa_prefixlen < 32)) {
1083                if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE))
1084                        fib_magic(RTM_NEWROUTE,
1085                                  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
1086                                  prefix, ifa->ifa_prefixlen, prim,
1087                                  ifa->ifa_rt_priority);
1088
1089                /* Add network specific broadcasts, when it takes a sense */
1090                if (ifa->ifa_prefixlen < 31) {
1091                        fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32,
1092                                  prim, 0);
1093                        fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
1094                                  32, prim, 0);
1095                }
1096        }
1097}
1098
1099void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric)
1100{
1101        __be32 prefix = ifa->ifa_address & ifa->ifa_mask;
1102        struct in_device *in_dev = ifa->ifa_dev;
1103        struct net_device *dev = in_dev->dev;
1104
1105        if (!(dev->flags & IFF_UP) ||
1106            ifa->ifa_flags & (IFA_F_SECONDARY | IFA_F_NOPREFIXROUTE) ||
1107            ipv4_is_zeronet(prefix) ||
1108            prefix == ifa->ifa_local || ifa->ifa_prefixlen == 32)
1109                return;
1110
1111        /* add the new */
1112        fib_magic(RTM_NEWROUTE,
1113                  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
1114                  prefix, ifa->ifa_prefixlen, ifa, new_metric);
1115
1116        /* delete the old */
1117        fib_magic(RTM_DELROUTE,
1118                  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
1119                  prefix, ifa->ifa_prefixlen, ifa, ifa->ifa_rt_priority);
1120}
1121
/* fib_del_ifaddr - remove the kernel routes that belong to address @ifa.
 *
 * @ifa:   address whose routes are being removed
 * @iprim: when non-NULL, addresses on the device whose mask equals
 *         @iprim's mask and that match @iprim via inet_ifa_match() are
 *         skipped by the address scan
 *
 * For a non-secondary address the prefix route may be deleted up front.
 * Unless the in_device is marked dead, the device's address list is then
 * scanned and a bit is set in "ok" for each route (local, broadcast,
 * network broadcasts) that some other address still matches; only routes
 * whose bit stayed clear are deleted after the no_promotions label.
 */
1122/* Delete primary or secondary address.
1123 * Optionally, on secondary address promotion consider the addresses
1124 * from subnet iprim as deleted, even if they are in device list.
1125 * In this case the secondary ifa can be in device list.
1126 */
1127void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
1128{
1129        struct in_device *in_dev = ifa->ifa_dev;
1130        struct net_device *dev = in_dev->dev;
1131        struct in_ifaddr *ifa1;
1132        struct in_ifaddr *prim = ifa, *prim1 = NULL;
1133        __be32 brd = ifa->ifa_address | ~ifa->ifa_mask;
1134        __be32 any = ifa->ifa_address & ifa->ifa_mask;
/* Bits collected in "ok" during the scan; a set bit suppresses the
 * corresponding route deletion at the end of the function.
 */
1135#define LOCAL_OK        1
1136#define BRD_OK          2
1137#define BRD0_OK         4
1138#define BRD1_OK         8
1139        unsigned int ok = 0;
1140        int subnet = 0;         /* Primary network */
1141        int gone = 1;           /* Address is missing */
1142        int same_prefsrc = 0;   /* Another primary with same IP */
1143
        /* For a secondary address, "prim" becomes the address that
         * inet_ifa_byprefix() returns for its subnet; that is the address
         * handed to fib_magic() for all deletions below.
         */
1144        if (ifa->ifa_flags & IFA_F_SECONDARY) {
1145                prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
1146                if (!prim) {
1147                        /* if the device has been deleted, we don't perform
1148                         * address promotion
1149                         */
1150                        if (!in_dev->dead)
1151                                pr_warn("%s: bug: prim == NULL\n", __func__);
1152                        return;
1153                }
1154                if (iprim && iprim != prim) {
1155                        pr_warn("%s: bug: iprim != prim\n", __func__);
1156                        return;
1157                }
1158        } else if (!ipv4_is_zeronet(any) &&
1159                   (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) {
1160                if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE))
1161                        fib_magic(RTM_DELROUTE,
1162                                  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
1163                                  any, ifa->ifa_prefixlen, prim, 0);
1164                subnet = 1;
1165        }
1166
        /* in_dev is marked dead: skip the scan, so "ok" stays 0 and none
         * of the deletions below is suppressed.
         */
1167        if (in_dev->dead)
1168                goto no_promotions;
1169
1170        /* Deletion is more complicated than add.
1171         * We should take care of not to delete too much :-)
1172         *
1173         * Scan address list to be sure that addresses are really gone.
1174         */
1175
1176        for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
1177                if (ifa1 == ifa) {
1178                        /* promotion, keep the IP */
1179                        gone = 0;
1180                        continue;
1181                }
1182                /* Ignore IFAs from our subnet */
1183                if (iprim && ifa1->ifa_mask == iprim->ifa_mask &&
1184                    inet_ifa_match(ifa1->ifa_address, iprim))
1185                        continue;
1186
1187                /* Ignore ifa1 if it uses different primary IP (prefsrc) */
1188                if (ifa1->ifa_flags & IFA_F_SECONDARY) {
1189                        /* Another address from our subnet? */
1190                        if (ifa1->ifa_mask == prim->ifa_mask &&
1191                            inet_ifa_match(ifa1->ifa_address, prim))
1192                                prim1 = prim;
1193                        else {
1194                                /* We reached the secondaries, so
1195                                 * same_prefsrc should be determined.
1196                                 */
1197                                if (!same_prefsrc)
1198                                        continue;
1199                                /* Search new prim1 if ifa1 is not
1200                                 * using the current prim1
1201                                 */
1202                                if (!prim1 ||
1203                                    ifa1->ifa_mask != prim1->ifa_mask ||
1204                                    !inet_ifa_match(ifa1->ifa_address, prim1))
1205                                        prim1 = inet_ifa_byprefix(in_dev,
1206                                                        ifa1->ifa_address,
1207                                                        ifa1->ifa_mask);
1208                                if (!prim1)
1209                                        continue;
1210                                if (prim1->ifa_local != prim->ifa_local)
1211                                        continue;
1212                        }
1213                } else {
1214                        if (prim->ifa_local != ifa1->ifa_local)
1215                                continue;
1216                        prim1 = ifa1;
1217                        if (prim != prim1)
1218                                same_prefsrc = 1;
1219                }
                /* ifa1 passed the filters above: record every route of
                 * ifa that ifa1 matches as well.
                 */
1220                if (ifa->ifa_local == ifa1->ifa_local)
1221                        ok |= LOCAL_OK;
1222                if (ifa->ifa_broadcast == ifa1->ifa_broadcast)
1223                        ok |= BRD_OK;
1224                if (brd == ifa1->ifa_broadcast)
1225                        ok |= BRD1_OK;
1226                if (any == ifa1->ifa_broadcast)
1227                        ok |= BRD0_OK;
1228                /* primary has network specific broadcasts */
1229                if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) {
1230                        __be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask;
1231                        __be32 any1 = ifa1->ifa_address & ifa1->ifa_mask;
1232
1233                        if (!ipv4_is_zeronet(any1)) {
1234                                if (ifa->ifa_broadcast == brd1 ||
1235                                    ifa->ifa_broadcast == any1)
1236                                        ok |= BRD_OK;
1237                                if (brd == brd1 || brd == any1)
1238                                        ok |= BRD1_OK;
1239                                if (any == brd1 || any == any1)
1240                                        ok |= BRD0_OK;
1241                        }
1242                }
1243        }
1244
1245no_promotions:
        /* Delete each route whose "still in use" bit was not set. */
1246        if (!(ok & BRD_OK))
1247                fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32,
1248                          prim, 0);
1249        if (subnet && ifa->ifa_prefixlen < 31) {
1250                if (!(ok & BRD1_OK))
1251                        fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32,
1252                                  prim, 0);
1253                if (!(ok & BRD0_OK))
1254                        fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32,
1255                                  prim, 0);
1256        }
1257        if (!(ok & LOCAL_OK)) {
1258                unsigned int addr_type;
1259
1260                fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim, 0);
1261
1262                /* Check, that this local address finally disappeared. */
1263                addr_type = inet_addr_type_dev_table(dev_net(dev), dev,
1264                                                     ifa->ifa_local);
1265                if (gone && addr_type != RTN_LOCAL) {
1266                        /* And the last, but not the least thing.
1267                         * We must flush stray FIB entries.
1268                         *
1269                         * First of all, we scan fib_info list searching
1270                         * for stray nexthop entries, then ignite fib_flush.
1271                         */
1272                        if (fib_sync_down_addr(dev, ifa->ifa_local))
1273                                fib_flush(dev_net(dev));
1274                }
1275        }
1276#undef LOCAL_OK
1277#undef BRD_OK
1278#undef BRD0_OK
1279#undef BRD1_OK
1280}
1281
1282static void nl_fib_lookup(struct net *net, struct fib_result_nl *frn)
1283{
1284
1285        struct fib_result       res;
1286        struct flowi4           fl4 = {
1287                .flowi4_mark = frn->fl_mark,
1288                .daddr = frn->fl_addr,
1289                .flowi4_tos = frn->fl_tos,
1290                .flowi4_scope = frn->fl_scope,
1291        };
1292        struct fib_table *tb;
1293
1294        rcu_read_lock();
1295
1296        tb = fib_get_table(net, frn->tb_id_in);
1297
1298        frn->err = -ENOENT;
1299        if (tb) {
1300                local_bh_disable();
1301
1302                frn->tb_id = tb->tb_id;
1303                frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
1304
1305                if (!frn->err) {
1306                        frn->prefixlen = res.prefixlen;
1307                        frn->nh_sel = res.nh_sel;
1308                        frn->type = res.type;
1309                        frn->scope = res.scope;
1310                }
1311                local_bh_enable();
1312        }
1313
1314        rcu_read_unlock();
1315}
1316
1317static void nl_fib_input(struct sk_buff *skb)
1318{
1319        struct net *net;
1320        struct fib_result_nl *frn;
1321        struct nlmsghdr *nlh;
1322        u32 portid;
1323
1324        net = sock_net(skb->sk);
1325        nlh = nlmsg_hdr(skb);
1326        if (skb->len < nlmsg_total_size(sizeof(*frn)) ||
1327            skb->len < nlh->nlmsg_len ||
1328            nlmsg_len(nlh) < sizeof(*frn))
1329                return;
1330
1331        skb = netlink_skb_clone(skb, GFP_KERNEL);
1332        if (!skb)
1333                return;
1334        nlh = nlmsg_hdr(skb);
1335
1336        frn = (struct fib_result_nl *) nlmsg_data(nlh);
1337        nl_fib_lookup(net, frn);
1338
1339        portid = NETLINK_CB(skb).portid;      /* netlink portid */
1340        NETLINK_CB(skb).portid = 0;        /* from kernel */
1341        NETLINK_CB(skb).dst_group = 0;  /* unicast */
1342        netlink_unicast(net->ipv4.fibnl, skb, portid, MSG_DONTWAIT);
1343}
1344
1345static int __net_init nl_fib_lookup_init(struct net *net)
1346{
1347        struct sock *sk;
1348        struct netlink_kernel_cfg cfg = {
1349                .input  = nl_fib_input,
1350        };
1351
1352        sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, &cfg);
1353        if (!sk)
1354                return -EAFNOSUPPORT;
1355        net->ipv4.fibnl = sk;
1356        return 0;
1357}
1358
1359static void nl_fib_lookup_exit(struct net *net)
1360{
1361        netlink_kernel_release(net->ipv4.fibnl);
1362        net->ipv4.fibnl = NULL;
1363}
1364
1365static void fib_disable_ip(struct net_device *dev, unsigned long event,
1366                           bool force)
1367{
1368        if (fib_sync_down_dev(dev, event, force))
1369                fib_flush(dev_net(dev));
1370        else
1371                rt_cache_flush(dev_net(dev));
1372        arp_ifdown(dev);
1373}
1374
1375static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
1376{
1377        struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
1378        struct net_device *dev = ifa->ifa_dev->dev;
1379        struct net *net = dev_net(dev);
1380
1381        switch (event) {
1382        case NETDEV_UP:
1383                fib_add_ifaddr(ifa);
1384#ifdef CONFIG_IP_ROUTE_MULTIPATH
1385                fib_sync_up(dev, RTNH_F_DEAD);
1386#endif
1387                atomic_inc(&net->ipv4.dev_addr_genid);
1388                rt_cache_flush(dev_net(dev));
1389                break;
1390        case NETDEV_DOWN:
1391                fib_del_ifaddr(ifa, NULL);
1392                atomic_inc(&net->ipv4.dev_addr_genid);
1393                if (!ifa->ifa_dev->ifa_list) {
1394                        /* Last address was deleted from this interface.
1395                         * Disable IP.
1396                         */
1397                        fib_disable_ip(dev, event, true);
1398                } else {
1399                        rt_cache_flush(dev_net(dev));
1400                }
1401                break;
1402        }
1403        return NOTIFY_DONE;
1404}
1405
1406static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
1407{
1408        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1409        struct netdev_notifier_changeupper_info *upper_info = ptr;
1410        struct netdev_notifier_info_ext *info_ext = ptr;
1411        struct in_device *in_dev;
1412        struct net *net = dev_net(dev);
1413        unsigned int flags;
1414
1415        if (event == NETDEV_UNREGISTER) {
1416                fib_disable_ip(dev, event, true);
1417                rt_flush_dev(dev);
1418                return NOTIFY_DONE;
1419        }
1420
1421        in_dev = __in_dev_get_rtnl(dev);
1422        if (!in_dev)
1423                return NOTIFY_DONE;
1424
1425        switch (event) {
1426        case NETDEV_UP:
1427                for_ifa(in_dev) {
1428                        fib_add_ifaddr(ifa);
1429                } endfor_ifa(in_dev);
1430#ifdef CONFIG_IP_ROUTE_MULTIPATH
1431                fib_sync_up(dev, RTNH_F_DEAD);
1432#endif
1433                atomic_inc(&net->ipv4.dev_addr_genid);
1434                rt_cache_flush(net);
1435                break;
1436        case NETDEV_DOWN:
1437                fib_disable_ip(dev, event, false);
1438                break;
1439        case NETDEV_CHANGE:
1440                flags = dev_get_flags(dev);
1441                if (flags & (IFF_RUNNING | IFF_LOWER_UP))
1442                        fib_sync_up(dev, RTNH_F_LINKDOWN);
1443                else
1444                        fib_sync_down_dev(dev, event, false);
1445                rt_cache_flush(net);
1446                break;
1447        case NETDEV_CHANGEMTU:
1448                fib_sync_mtu(dev, info_ext->ext.mtu);
1449                rt_cache_flush(net);
1450                break;
1451        case NETDEV_CHANGEUPPER:
1452                upper_info = ptr;
1453                /* flush all routes if dev is linked to or unlinked from
1454                 * an L3 master device (e.g., VRF)
1455                 */
1456                if (upper_info->upper_dev &&
1457                    netif_is_l3_master(upper_info->upper_dev))
1458                        fib_disable_ip(dev, NETDEV_DOWN, true);
1459                break;
1460        }
1461        return NOTIFY_DONE;
1462}
1463
/* Notifier block that dispatches to fib_inetaddr_event(); it is passed to
 * register_inetaddr_notifier() in ip_fib_init().
 */
1464static struct notifier_block fib_inetaddr_notifier = {
1465        .notifier_call = fib_inetaddr_event,
1466};
1467
/* Notifier block that dispatches to fib_netdev_event(); it is passed to
 * register_netdevice_notifier() in ip_fib_init().
 */
1468static struct notifier_block fib_netdev_notifier = {
1469        .notifier_call = fib_netdev_event,
1470};
1471
1472static int __net_init ip_fib_net_init(struct net *net)
1473{
1474        int err;
1475        size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;
1476
1477        err = fib4_notifier_init(net);
1478        if (err)
1479                return err;
1480
1481        /* Avoid false sharing : Use at least a full cache line */
1482        size = max_t(size_t, size, L1_CACHE_BYTES);
1483
1484        net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL);
1485        if (!net->ipv4.fib_table_hash) {
1486                err = -ENOMEM;
1487                goto err_table_hash_alloc;
1488        }
1489
1490        err = fib4_rules_init(net);
1491        if (err < 0)
1492                goto err_rules_init;
1493        return 0;
1494
1495err_rules_init:
1496        kfree(net->ipv4.fib_table_hash);
1497err_table_hash_alloc:
1498        fib4_notifier_exit(net);
1499        return err;
1500}
1501
1502static void ip_fib_net_exit(struct net *net)
1503{
1504        int i;
1505
1506        rtnl_lock();
1507#ifdef CONFIG_IP_MULTIPLE_TABLES
1508        RCU_INIT_POINTER(net->ipv4.fib_main, NULL);
1509        RCU_INIT_POINTER(net->ipv4.fib_default, NULL);
1510#endif
1511        /* Destroy the tables in reverse order to guarantee that the
1512         * local table, ID 255, is destroyed before the main table, ID
1513         * 254. This is necessary as the local table may contain
1514         * references to data contained in the main table.
1515         */
1516        for (i = FIB_TABLE_HASHSZ - 1; i >= 0; i--) {
1517                struct hlist_head *head = &net->ipv4.fib_table_hash[i];
1518                struct hlist_node *tmp;
1519                struct fib_table *tb;
1520
1521                hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) {
1522                        hlist_del(&tb->tb_hlist);
1523                        fib_table_flush(net, tb, true);
1524                        fib_free_table(tb);
1525                }
1526        }
1527
1528#ifdef CONFIG_IP_MULTIPLE_TABLES
1529        fib4_rules_exit(net);
1530#endif
1531        rtnl_unlock();
1532        kfree(net->ipv4.fib_table_hash);
1533        fib4_notifier_exit(net);
1534}
1535
1536static int __net_init fib_net_init(struct net *net)
1537{
1538        int error;
1539
1540#ifdef CONFIG_IP_ROUTE_CLASSID
1541        net->ipv4.fib_num_tclassid_users = 0;
1542#endif
1543        error = ip_fib_net_init(net);
1544        if (error < 0)
1545                goto out;
1546        error = nl_fib_lookup_init(net);
1547        if (error < 0)
1548                goto out_nlfl;
1549        error = fib_proc_init(net);
1550        if (error < 0)
1551                goto out_proc;
1552out:
1553        return error;
1554
1555out_proc:
1556        nl_fib_lookup_exit(net);
1557out_nlfl:
1558        ip_fib_net_exit(net);
1559        goto out;
1560}
1561
1562static void __net_exit fib_net_exit(struct net *net)
1563{
1564        fib_proc_exit(net);
1565        nl_fib_lookup_exit(net);
1566        ip_fib_net_exit(net);
1567}
1568
/* Per-namespace hooks: fib_net_init() on init, fib_net_exit() on exit.
 * Registered with register_pernet_subsys() in ip_fib_init().
 */
1569static struct pernet_operations fib_net_ops = {
1570        .init = fib_net_init,
1571        .exit = fib_net_exit,
1572};
1573
1574void __init ip_fib_init(void)
1575{
1576        fib_trie_init();
1577
1578        register_pernet_subsys(&fib_net_ops);
1579
1580        register_netdevice_notifier(&fib_netdev_notifier);
1581        register_inetaddr_notifier(&fib_inetaddr_notifier);
1582
1583        rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, 0);
1584        rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, 0);
1585        rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, 0);
1586}
1587