linux/net/ipv4/fib_frontend.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4 *              operating system.  INET is implemented using the  BSD Socket
   5 *              interface as the means of communication with the user level.
   6 *
   7 *              IPv4 Forwarding Information Base: FIB frontend.
   8 *
   9 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10 */
  11
  12#include <linux/module.h>
  13#include <linux/uaccess.h>
  14#include <linux/bitops.h>
  15#include <linux/capability.h>
  16#include <linux/types.h>
  17#include <linux/kernel.h>
  18#include <linux/mm.h>
  19#include <linux/string.h>
  20#include <linux/socket.h>
  21#include <linux/sockios.h>
  22#include <linux/errno.h>
  23#include <linux/in.h>
  24#include <linux/inet.h>
  25#include <linux/inetdevice.h>
  26#include <linux/netdevice.h>
  27#include <linux/if_addr.h>
  28#include <linux/if_arp.h>
  29#include <linux/skbuff.h>
  30#include <linux/cache.h>
  31#include <linux/init.h>
  32#include <linux/list.h>
  33#include <linux/slab.h>
  34
  35#include <net/ip.h>
  36#include <net/protocol.h>
  37#include <net/route.h>
  38#include <net/tcp.h>
  39#include <net/sock.h>
  40#include <net/arp.h>
  41#include <net/ip_fib.h>
  42#include <net/nexthop.h>
  43#include <net/rtnetlink.h>
  44#include <net/xfrm.h>
  45#include <net/l3mdev.h>
  46#include <net/lwtunnel.h>
  47#include <trace/events/fib.h>
  48
  49#ifndef CONFIG_IP_MULTIPLE_TABLES
  50
  51static int __net_init fib4_rules_init(struct net *net)
  52{
  53        struct fib_table *local_table, *main_table;
  54
  55        main_table  = fib_trie_table(RT_TABLE_MAIN, NULL);
  56        if (!main_table)
  57                return -ENOMEM;
  58
  59        local_table = fib_trie_table(RT_TABLE_LOCAL, main_table);
  60        if (!local_table)
  61                goto fail;
  62
  63        hlist_add_head_rcu(&local_table->tb_hlist,
  64                                &net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX]);
  65        hlist_add_head_rcu(&main_table->tb_hlist,
  66                                &net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]);
  67        return 0;
  68
  69fail:
  70        fib_free_table(main_table);
  71        return -ENOMEM;
  72}
  73#else
  74
  75struct fib_table *fib_new_table(struct net *net, u32 id)
  76{
  77        struct fib_table *tb, *alias = NULL;
  78        unsigned int h;
  79
  80        if (id == 0)
  81                id = RT_TABLE_MAIN;
  82        tb = fib_get_table(net, id);
  83        if (tb)
  84                return tb;
  85
  86        if (id == RT_TABLE_LOCAL && !net->ipv4.fib_has_custom_rules)
  87                alias = fib_new_table(net, RT_TABLE_MAIN);
  88
  89        tb = fib_trie_table(id, alias);
  90        if (!tb)
  91                return NULL;
  92
  93        switch (id) {
  94        case RT_TABLE_MAIN:
  95                rcu_assign_pointer(net->ipv4.fib_main, tb);
  96                break;
  97        case RT_TABLE_DEFAULT:
  98                rcu_assign_pointer(net->ipv4.fib_default, tb);
  99                break;
 100        default:
 101                break;
 102        }
 103
 104        h = id & (FIB_TABLE_HASHSZ - 1);
 105        hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]);
 106        return tb;
 107}
 108EXPORT_SYMBOL_GPL(fib_new_table);
 109
 110/* caller must hold either rtnl or rcu read lock */
 111struct fib_table *fib_get_table(struct net *net, u32 id)
 112{
 113        struct fib_table *tb;
 114        struct hlist_head *head;
 115        unsigned int h;
 116
 117        if (id == 0)
 118                id = RT_TABLE_MAIN;
 119        h = id & (FIB_TABLE_HASHSZ - 1);
 120
 121        head = &net->ipv4.fib_table_hash[h];
 122        hlist_for_each_entry_rcu(tb, head, tb_hlist,
 123                                 lockdep_rtnl_is_held()) {
 124                if (tb->tb_id == id)
 125                        return tb;
 126        }
 127        return NULL;
 128}
 129#endif /* CONFIG_IP_MULTIPLE_TABLES */
 130
 131static void fib_replace_table(struct net *net, struct fib_table *old,
 132                              struct fib_table *new)
 133{
 134#ifdef CONFIG_IP_MULTIPLE_TABLES
 135        switch (new->tb_id) {
 136        case RT_TABLE_MAIN:
 137                rcu_assign_pointer(net->ipv4.fib_main, new);
 138                break;
 139        case RT_TABLE_DEFAULT:
 140                rcu_assign_pointer(net->ipv4.fib_default, new);
 141                break;
 142        default:
 143                break;
 144        }
 145
 146#endif
 147        /* replace the old table in the hlist */
 148        hlist_replace_rcu(&old->tb_hlist, &new->tb_hlist);
 149}
 150
 151int fib_unmerge(struct net *net)
 152{
 153        struct fib_table *old, *new, *main_table;
 154
 155        /* attempt to fetch local table if it has been allocated */
 156        old = fib_get_table(net, RT_TABLE_LOCAL);
 157        if (!old)
 158                return 0;
 159
 160        new = fib_trie_unmerge(old);
 161        if (!new)
 162                return -ENOMEM;
 163
 164        /* table is already unmerged */
 165        if (new == old)
 166                return 0;
 167
 168        /* replace merged table with clean table */
 169        fib_replace_table(net, old, new);
 170        fib_free_table(old);
 171
 172        /* attempt to fetch main table if it has been allocated */
 173        main_table = fib_get_table(net, RT_TABLE_MAIN);
 174        if (!main_table)
 175                return 0;
 176
 177        /* flush local entries from main table */
 178        fib_table_flush_external(main_table);
 179
 180        return 0;
 181}
 182
 183void fib_flush(struct net *net)
 184{
 185        int flushed = 0;
 186        unsigned int h;
 187
 188        for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
 189                struct hlist_head *head = &net->ipv4.fib_table_hash[h];
 190                struct hlist_node *tmp;
 191                struct fib_table *tb;
 192
 193                hlist_for_each_entry_safe(tb, tmp, head, tb_hlist)
 194                        flushed += fib_table_flush(net, tb, false);
 195        }
 196
 197        if (flushed)
 198                rt_cache_flush(net);
 199}
 200
 201/*
 202 * Find address type as if only "dev" was present in the system. If
 203 * on_dev is NULL then all interfaces are taken into consideration.
 204 */
 205static inline unsigned int __inet_dev_addr_type(struct net *net,
 206                                                const struct net_device *dev,
 207                                                __be32 addr, u32 tb_id)
 208{
 209        struct flowi4           fl4 = { .daddr = addr };
 210        struct fib_result       res;
 211        unsigned int ret = RTN_BROADCAST;
 212        struct fib_table *table;
 213
 214        if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
 215                return RTN_BROADCAST;
 216        if (ipv4_is_multicast(addr))
 217                return RTN_MULTICAST;
 218
 219        rcu_read_lock();
 220
 221        table = fib_get_table(net, tb_id);
 222        if (table) {
 223                ret = RTN_UNICAST;
 224                if (!fib_table_lookup(table, &fl4, &res, FIB_LOOKUP_NOREF)) {
 225                        struct fib_nh_common *nhc = fib_info_nhc(res.fi, 0);
 226
 227                        if (!dev || dev == nhc->nhc_dev)
 228                                ret = res.type;
 229                }
 230        }
 231
 232        rcu_read_unlock();
 233        return ret;
 234}
 235
 236unsigned int inet_addr_type_table(struct net *net, __be32 addr, u32 tb_id)
 237{
 238        return __inet_dev_addr_type(net, NULL, addr, tb_id);
 239}
 240EXPORT_SYMBOL(inet_addr_type_table);
 241
 242unsigned int inet_addr_type(struct net *net, __be32 addr)
 243{
 244        return __inet_dev_addr_type(net, NULL, addr, RT_TABLE_LOCAL);
 245}
 246EXPORT_SYMBOL(inet_addr_type);
 247
 248unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
 249                                __be32 addr)
 250{
 251        u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL;
 252
 253        return __inet_dev_addr_type(net, dev, addr, rt_table);
 254}
 255EXPORT_SYMBOL(inet_dev_addr_type);
 256
 257/* inet_addr_type with dev == NULL but using the table from a dev
 258 * if one is associated
 259 */
 260unsigned int inet_addr_type_dev_table(struct net *net,
 261                                      const struct net_device *dev,
 262                                      __be32 addr)
 263{
 264        u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL;
 265
 266        return __inet_dev_addr_type(net, NULL, addr, rt_table);
 267}
 268EXPORT_SYMBOL(inet_addr_type_dev_table);
 269
 270__be32 fib_compute_spec_dst(struct sk_buff *skb)
 271{
 272        struct net_device *dev = skb->dev;
 273        struct in_device *in_dev;
 274        struct fib_result res;
 275        struct rtable *rt;
 276        struct net *net;
 277        int scope;
 278
 279        rt = skb_rtable(skb);
 280        if ((rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL)) ==
 281            RTCF_LOCAL)
 282                return ip_hdr(skb)->daddr;
 283
 284        in_dev = __in_dev_get_rcu(dev);
 285
 286        net = dev_net(dev);
 287
 288        scope = RT_SCOPE_UNIVERSE;
 289        if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
 290                bool vmark = in_dev && IN_DEV_SRC_VMARK(in_dev);
 291                struct flowi4 fl4 = {
 292                        .flowi4_iif = LOOPBACK_IFINDEX,
 293                        .flowi4_oif = l3mdev_master_ifindex_rcu(dev),
 294                        .daddr = ip_hdr(skb)->saddr,
 295                        .flowi4_tos = RT_TOS(ip_hdr(skb)->tos),
 296                        .flowi4_scope = scope,
 297                        .flowi4_mark = vmark ? skb->mark : 0,
 298                };
 299                if (!fib_lookup(net, &fl4, &res, 0))
 300                        return fib_result_prefsrc(net, &res);
 301        } else {
 302                scope = RT_SCOPE_LINK;
 303        }
 304
 305        return inet_select_addr(dev, ip_hdr(skb)->saddr, scope);
 306}
 307
 308bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev)
 309{
 310        bool dev_match = false;
 311#ifdef CONFIG_IP_ROUTE_MULTIPATH
 312        if (unlikely(fi->nh)) {
 313                dev_match = nexthop_uses_dev(fi->nh, dev);
 314        } else {
 315                int ret;
 316
 317                for (ret = 0; ret < fib_info_num_path(fi); ret++) {
 318                        const struct fib_nh_common *nhc = fib_info_nhc(fi, ret);
 319
 320                        if (nhc_l3mdev_matches_dev(nhc, dev)) {
 321                                dev_match = true;
 322                                break;
 323                        }
 324                }
 325        }
 326#else
 327        if (fib_info_nhc(fi, 0)->nhc_dev == dev)
 328                dev_match = true;
 329#endif
 330
 331        return dev_match;
 332}
 333EXPORT_SYMBOL_GPL(fib_info_nh_uses_dev);
 334
 335/* Given (packet source, input interface) and optional (dst, oif, tos):
 336 * - (main) check, that source is valid i.e. not broadcast or our local
 337 *   address.
 338 * - figure out what "logical" interface this packet arrived
 339 *   and calculate "specific destination" address.
 340 * - check, that packet arrived from expected physical interface.
 341 * called with rcu_read_lock()
 342 */
 343static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 344                                 u8 tos, int oif, struct net_device *dev,
 345                                 int rpf, struct in_device *idev, u32 *itag)
 346{
 347        struct net *net = dev_net(dev);
 348        struct flow_keys flkeys;
 349        int ret, no_addr;
 350        struct fib_result res;
 351        struct flowi4 fl4;
 352        bool dev_match;
 353
 354        fl4.flowi4_oif = 0;
 355        fl4.flowi4_iif = l3mdev_master_ifindex_rcu(dev);
 356        if (!fl4.flowi4_iif)
 357                fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX;
 358        fl4.daddr = src;
 359        fl4.saddr = dst;
 360        fl4.flowi4_tos = tos;
 361        fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
 362        fl4.flowi4_tun_key.tun_id = 0;
 363        fl4.flowi4_flags = 0;
 364        fl4.flowi4_uid = sock_net_uid(net, NULL);
 365        fl4.flowi4_multipath_hash = 0;
 366
 367        no_addr = idev->ifa_list == NULL;
 368
 369        fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0;
 370        if (!fib4_rules_early_flow_dissect(net, skb, &fl4, &flkeys)) {
 371                fl4.flowi4_proto = 0;
 372                fl4.fl4_sport = 0;
 373                fl4.fl4_dport = 0;
 374        }
 375
 376        if (fib_lookup(net, &fl4, &res, 0))
 377                goto last_resort;
 378        if (res.type != RTN_UNICAST &&
 379            (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev)))
 380                goto e_inval;
 381        fib_combine_itag(itag, &res);
 382
 383        dev_match = fib_info_nh_uses_dev(res.fi, dev);
 384        /* This is not common, loopback packets retain skb_dst so normally they
 385         * would not even hit this slow path.
 386         */
 387        dev_match = dev_match || (res.type == RTN_LOCAL &&
 388                                  dev == net->loopback_dev);
 389        if (dev_match) {
 390                ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST;
 391                return ret;
 392        }
 393        if (no_addr)
 394                goto last_resort;
 395        if (rpf == 1)
 396                goto e_rpf;
 397        fl4.flowi4_oif = dev->ifindex;
 398
 399        ret = 0;
 400        if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) {
 401                if (res.type == RTN_UNICAST)
 402                        ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST;
 403        }
 404        return ret;
 405
 406last_resort:
 407        if (rpf)
 408                goto e_rpf;
 409        *itag = 0;
 410        return 0;
 411
 412e_inval:
 413        return -EINVAL;
 414e_rpf:
 415        return -EXDEV;
 416}
 417
 418/* Ignore rp_filter for packets protected by IPsec. */
 419int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 420                        u8 tos, int oif, struct net_device *dev,
 421                        struct in_device *idev, u32 *itag)
 422{
 423        int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev);
 424        struct net *net = dev_net(dev);
 425
 426        if (!r && !fib_num_tclassid_users(net) &&
 427            (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) {
 428                if (IN_DEV_ACCEPT_LOCAL(idev))
 429                        goto ok;
 430                /* with custom local routes in place, checking local addresses
 431                 * only will be too optimistic, with custom rules, checking
 432                 * local addresses only can be too strict, e.g. due to vrf
 433                 */
 434                if (net->ipv4.fib_has_custom_local_routes ||
 435                    fib4_has_custom_rules(net))
 436                        goto full_check;
 437                if (inet_lookup_ifaddr_rcu(net, src))
 438                        return -EINVAL;
 439
 440ok:
 441                *itag = 0;
 442                return 0;
 443        }
 444
 445full_check:
 446        return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag);
 447}
 448
 449static inline __be32 sk_extract_addr(struct sockaddr *addr)
 450{
 451        return ((struct sockaddr_in *) addr)->sin_addr.s_addr;
 452}
 453
 454static int put_rtax(struct nlattr *mx, int len, int type, u32 value)
 455{
 456        struct nlattr *nla;
 457
 458        nla = (struct nlattr *) ((char *) mx + len);
 459        nla->nla_type = type;
 460        nla->nla_len = nla_attr_size(4);
 461        *(u32 *) nla_data(nla) = value;
 462
 463        return len + nla_total_size(4);
 464}
 465
 466static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
 467                                 struct fib_config *cfg)
 468{
 469        __be32 addr;
 470        int plen;
 471
 472        memset(cfg, 0, sizeof(*cfg));
 473        cfg->fc_nlinfo.nl_net = net;
 474
 475        if (rt->rt_dst.sa_family != AF_INET)
 476                return -EAFNOSUPPORT;
 477
 478        /*
 479         * Check mask for validity:
 480         * a) it must be contiguous.
 481         * b) destination must have all host bits clear.
 482         * c) if application forgot to set correct family (AF_INET),
 483         *    reject request unless it is absolutely clear i.e.
 484         *    both family and mask are zero.
 485         */
 486        plen = 32;
 487        addr = sk_extract_addr(&rt->rt_dst);
 488        if (!(rt->rt_flags & RTF_HOST)) {
 489                __be32 mask = sk_extract_addr(&rt->rt_genmask);
 490
 491                if (rt->rt_genmask.sa_family != AF_INET) {
 492                        if (mask || rt->rt_genmask.sa_family)
 493                                return -EAFNOSUPPORT;
 494                }
 495
 496                if (bad_mask(mask, addr))
 497                        return -EINVAL;
 498
 499                plen = inet_mask_len(mask);
 500        }
 501
 502        cfg->fc_dst_len = plen;
 503        cfg->fc_dst = addr;
 504
 505        if (cmd != SIOCDELRT) {
 506                cfg->fc_nlflags = NLM_F_CREATE;
 507                cfg->fc_protocol = RTPROT_BOOT;
 508        }
 509
 510        if (rt->rt_metric)
 511                cfg->fc_priority = rt->rt_metric - 1;
 512
 513        if (rt->rt_flags & RTF_REJECT) {
 514                cfg->fc_scope = RT_SCOPE_HOST;
 515                cfg->fc_type = RTN_UNREACHABLE;
 516                return 0;
 517        }
 518
 519        cfg->fc_scope = RT_SCOPE_NOWHERE;
 520        cfg->fc_type = RTN_UNICAST;
 521
 522        if (rt->rt_dev) {
 523                char *colon;
 524                struct net_device *dev;
 525                char devname[IFNAMSIZ];
 526
 527                if (copy_from_user(devname, rt->rt_dev, IFNAMSIZ-1))
 528                        return -EFAULT;
 529
 530                devname[IFNAMSIZ-1] = 0;
 531                colon = strchr(devname, ':');
 532                if (colon)
 533                        *colon = 0;
 534                dev = __dev_get_by_name(net, devname);
 535                if (!dev)
 536                        return -ENODEV;
 537                cfg->fc_oif = dev->ifindex;
 538                cfg->fc_table = l3mdev_fib_table(dev);
 539                if (colon) {
 540                        const struct in_ifaddr *ifa;
 541                        struct in_device *in_dev;
 542
 543                        in_dev = __in_dev_get_rtnl(dev);
 544                        if (!in_dev)
 545                                return -ENODEV;
 546
 547                        *colon = ':';
 548
 549                        rcu_read_lock();
 550                        in_dev_for_each_ifa_rcu(ifa, in_dev) {
 551                                if (strcmp(ifa->ifa_label, devname) == 0)
 552                                        break;
 553                        }
 554                        rcu_read_unlock();
 555
 556                        if (!ifa)
 557                                return -ENODEV;
 558                        cfg->fc_prefsrc = ifa->ifa_local;
 559                }
 560        }
 561
 562        addr = sk_extract_addr(&rt->rt_gateway);
 563        if (rt->rt_gateway.sa_family == AF_INET && addr) {
 564                unsigned int addr_type;
 565
 566                cfg->fc_gw4 = addr;
 567                cfg->fc_gw_family = AF_INET;
 568                addr_type = inet_addr_type_table(net, addr, cfg->fc_table);
 569                if (rt->rt_flags & RTF_GATEWAY &&
 570                    addr_type == RTN_UNICAST)
 571                        cfg->fc_scope = RT_SCOPE_UNIVERSE;
 572        }
 573
 574        if (cmd == SIOCDELRT)
 575                return 0;
 576
 577        if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw_family)
 578                return -EINVAL;
 579
 580        if (cfg->fc_scope == RT_SCOPE_NOWHERE)
 581                cfg->fc_scope = RT_SCOPE_LINK;
 582
 583        if (rt->rt_flags & (RTF_MTU | RTF_WINDOW | RTF_IRTT)) {
 584                struct nlattr *mx;
 585                int len = 0;
 586
 587                mx = kcalloc(3, nla_total_size(4), GFP_KERNEL);
 588                if (!mx)
 589                        return -ENOMEM;
 590
 591                if (rt->rt_flags & RTF_MTU)
 592                        len = put_rtax(mx, len, RTAX_ADVMSS, rt->rt_mtu - 40);
 593
 594                if (rt->rt_flags & RTF_WINDOW)
 595                        len = put_rtax(mx, len, RTAX_WINDOW, rt->rt_window);
 596
 597                if (rt->rt_flags & RTF_IRTT)
 598                        len = put_rtax(mx, len, RTAX_RTT, rt->rt_irtt << 3);
 599
 600                cfg->fc_mx = mx;
 601                cfg->fc_mx_len = len;
 602        }
 603
 604        return 0;
 605}
 606
 607/*
 608 * Handle IP routing ioctl calls.
 609 * These are used to manipulate the routing tables
 610 */
 611int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt)
 612{
 613        struct fib_config cfg;
 614        int err;
 615
 616        switch (cmd) {
 617        case SIOCADDRT:         /* Add a route */
 618        case SIOCDELRT:         /* Delete a route */
 619                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 620                        return -EPERM;
 621
 622                rtnl_lock();
 623                err = rtentry_to_fib_config(net, cmd, rt, &cfg);
 624                if (err == 0) {
 625                        struct fib_table *tb;
 626
 627                        if (cmd == SIOCDELRT) {
 628                                tb = fib_get_table(net, cfg.fc_table);
 629                                if (tb)
 630                                        err = fib_table_delete(net, tb, &cfg,
 631                                                               NULL);
 632                                else
 633                                        err = -ESRCH;
 634                        } else {
 635                                tb = fib_new_table(net, cfg.fc_table);
 636                                if (tb)
 637                                        err = fib_table_insert(net, tb,
 638                                                               &cfg, NULL);
 639                                else
 640                                        err = -ENOBUFS;
 641                        }
 642
 643                        /* allocated by rtentry_to_fib_config() */
 644                        kfree(cfg.fc_mx);
 645                }
 646                rtnl_unlock();
 647                return err;
 648        }
 649        return -EINVAL;
 650}
 651
 652const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
 653        [RTA_UNSPEC]            = { .strict_start_type = RTA_DPORT + 1 },
 654        [RTA_DST]               = { .type = NLA_U32 },
 655        [RTA_SRC]               = { .type = NLA_U32 },
 656        [RTA_IIF]               = { .type = NLA_U32 },
 657        [RTA_OIF]               = { .type = NLA_U32 },
 658        [RTA_GATEWAY]           = { .type = NLA_U32 },
 659        [RTA_PRIORITY]          = { .type = NLA_U32 },
 660        [RTA_PREFSRC]           = { .type = NLA_U32 },
 661        [RTA_METRICS]           = { .type = NLA_NESTED },
 662        [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
 663        [RTA_FLOW]              = { .type = NLA_U32 },
 664        [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
 665        [RTA_ENCAP]             = { .type = NLA_NESTED },
 666        [RTA_UID]               = { .type = NLA_U32 },
 667        [RTA_MARK]              = { .type = NLA_U32 },
 668        [RTA_TABLE]             = { .type = NLA_U32 },
 669        [RTA_IP_PROTO]          = { .type = NLA_U8 },
 670        [RTA_SPORT]             = { .type = NLA_U16 },
 671        [RTA_DPORT]             = { .type = NLA_U16 },
 672        [RTA_NH_ID]             = { .type = NLA_U32 },
 673};
 674
 675int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla,
 676                    struct netlink_ext_ack *extack)
 677{
 678        struct rtvia *via;
 679        int alen;
 680
 681        if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr)) {
 682                NL_SET_ERR_MSG(extack, "Invalid attribute length for RTA_VIA");
 683                return -EINVAL;
 684        }
 685
 686        via = nla_data(nla);
 687        alen = nla_len(nla) - offsetof(struct rtvia, rtvia_addr);
 688
 689        switch (via->rtvia_family) {
 690        case AF_INET:
 691                if (alen != sizeof(__be32)) {
 692                        NL_SET_ERR_MSG(extack, "Invalid IPv4 address in RTA_VIA");
 693                        return -EINVAL;
 694                }
 695                cfg->fc_gw_family = AF_INET;
 696                cfg->fc_gw4 = *((__be32 *)via->rtvia_addr);
 697                break;
 698        case AF_INET6:
 699#if IS_ENABLED(CONFIG_IPV6)
 700                if (alen != sizeof(struct in6_addr)) {
 701                        NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_VIA");
 702                        return -EINVAL;
 703                }
 704                cfg->fc_gw_family = AF_INET6;
 705                cfg->fc_gw6 = *((struct in6_addr *)via->rtvia_addr);
 706#else
 707                NL_SET_ERR_MSG(extack, "IPv6 support not enabled in kernel");
 708                return -EINVAL;
 709#endif
 710                break;
 711        default:
 712                NL_SET_ERR_MSG(extack, "Unsupported address family in RTA_VIA");
 713                return -EINVAL;
 714        }
 715
 716        return 0;
 717}
 718
 719static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
 720                             struct nlmsghdr *nlh, struct fib_config *cfg,
 721                             struct netlink_ext_ack *extack)
 722{
 723        bool has_gw = false, has_via = false;
 724        struct nlattr *attr;
 725        int err, remaining;
 726        struct rtmsg *rtm;
 727
 728        err = nlmsg_validate_deprecated(nlh, sizeof(*rtm), RTA_MAX,
 729                                        rtm_ipv4_policy, extack);
 730        if (err < 0)
 731                goto errout;
 732
 733        memset(cfg, 0, sizeof(*cfg));
 734
 735        rtm = nlmsg_data(nlh);
 736        cfg->fc_dst_len = rtm->rtm_dst_len;
 737        cfg->fc_tos = rtm->rtm_tos;
 738        cfg->fc_table = rtm->rtm_table;
 739        cfg->fc_protocol = rtm->rtm_protocol;
 740        cfg->fc_scope = rtm->rtm_scope;
 741        cfg->fc_type = rtm->rtm_type;
 742        cfg->fc_flags = rtm->rtm_flags;
 743        cfg->fc_nlflags = nlh->nlmsg_flags;
 744
 745        cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
 746        cfg->fc_nlinfo.nlh = nlh;
 747        cfg->fc_nlinfo.nl_net = net;
 748
 749        if (cfg->fc_type > RTN_MAX) {
 750                NL_SET_ERR_MSG(extack, "Invalid route type");
 751                err = -EINVAL;
 752                goto errout;
 753        }
 754
 755        nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), remaining) {
 756                switch (nla_type(attr)) {
 757                case RTA_DST:
 758                        cfg->fc_dst = nla_get_be32(attr);
 759                        break;
 760                case RTA_OIF:
 761                        cfg->fc_oif = nla_get_u32(attr);
 762                        break;
 763                case RTA_GATEWAY:
 764                        has_gw = true;
 765                        cfg->fc_gw4 = nla_get_be32(attr);
 766                        if (cfg->fc_gw4)
 767                                cfg->fc_gw_family = AF_INET;
 768                        break;
 769                case RTA_VIA:
 770                        has_via = true;
 771                        err = fib_gw_from_via(cfg, attr, extack);
 772                        if (err)
 773                                goto errout;
 774                        break;
 775                case RTA_PRIORITY:
 776                        cfg->fc_priority = nla_get_u32(attr);
 777                        break;
 778                case RTA_PREFSRC:
 779                        cfg->fc_prefsrc = nla_get_be32(attr);
 780                        break;
 781                case RTA_METRICS:
 782                        cfg->fc_mx = nla_data(attr);
 783                        cfg->fc_mx_len = nla_len(attr);
 784                        break;
 785                case RTA_MULTIPATH:
 786                        err = lwtunnel_valid_encap_type_attr(nla_data(attr),
 787                                                             nla_len(attr),
 788                                                             extack);
 789                        if (err < 0)
 790                                goto errout;
 791                        cfg->fc_mp = nla_data(attr);
 792                        cfg->fc_mp_len = nla_len(attr);
 793                        break;
 794                case RTA_FLOW:
 795                        cfg->fc_flow = nla_get_u32(attr);
 796                        break;
 797                case RTA_TABLE:
 798                        cfg->fc_table = nla_get_u32(attr);
 799                        break;
 800                case RTA_ENCAP:
 801                        cfg->fc_encap = attr;
 802                        break;
 803                case RTA_ENCAP_TYPE:
 804                        cfg->fc_encap_type = nla_get_u16(attr);
 805                        err = lwtunnel_valid_encap_type(cfg->fc_encap_type,
 806                                                        extack);
 807                        if (err < 0)
 808                                goto errout;
 809                        break;
 810                case RTA_NH_ID:
 811                        cfg->fc_nh_id = nla_get_u32(attr);
 812                        break;
 813                }
 814        }
 815
 816        if (cfg->fc_nh_id) {
 817                if (cfg->fc_oif || cfg->fc_gw_family ||
 818                    cfg->fc_encap || cfg->fc_mp) {
 819                        NL_SET_ERR_MSG(extack,
 820                                       "Nexthop specification and nexthop id are mutually exclusive");
 821                        return -EINVAL;
 822                }
 823        }
 824
 825        if (has_gw && has_via) {
 826                NL_SET_ERR_MSG(extack,
 827                               "Nexthop configuration can not contain both GATEWAY and VIA");
 828                return -EINVAL;
 829        }
 830
 831        return 0;
 832errout:
 833        return err;
 834}
 835
 836static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
 837                             struct netlink_ext_ack *extack)
 838{
 839        struct net *net = sock_net(skb->sk);
 840        struct fib_config cfg;
 841        struct fib_table *tb;
 842        int err;
 843
 844        err = rtm_to_fib_config(net, skb, nlh, &cfg, extack);
 845        if (err < 0)
 846                goto errout;
 847
 848        if (cfg.fc_nh_id && !nexthop_find_by_id(net, cfg.fc_nh_id)) {
 849                NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
 850                err = -EINVAL;
 851                goto errout;
 852        }
 853
 854        tb = fib_get_table(net, cfg.fc_table);
 855        if (!tb) {
 856                NL_SET_ERR_MSG(extack, "FIB table does not exist");
 857                err = -ESRCH;
 858                goto errout;
 859        }
 860
 861        err = fib_table_delete(net, tb, &cfg, extack);
 862errout:
 863        return err;
 864}
 865
 866static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
 867                             struct netlink_ext_ack *extack)
 868{
 869        struct net *net = sock_net(skb->sk);
 870        struct fib_config cfg;
 871        struct fib_table *tb;
 872        int err;
 873
 874        err = rtm_to_fib_config(net, skb, nlh, &cfg, extack);
 875        if (err < 0)
 876                goto errout;
 877
 878        tb = fib_new_table(net, cfg.fc_table);
 879        if (!tb) {
 880                err = -ENOBUFS;
 881                goto errout;
 882        }
 883
 884        err = fib_table_insert(net, tb, &cfg, extack);
 885        if (!err && cfg.fc_type == RTN_LOCAL)
 886                net->ipv4.fib_has_custom_local_routes = true;
 887errout:
 888        return err;
 889}
 890
 891int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 892                          struct fib_dump_filter *filter,
 893                          struct netlink_callback *cb)
 894{
 895        struct netlink_ext_ack *extack = cb->extack;
 896        struct nlattr *tb[RTA_MAX + 1];
 897        struct rtmsg *rtm;
 898        int err, i;
 899
 900        ASSERT_RTNL();
 901
 902        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
 903                NL_SET_ERR_MSG(extack, "Invalid header for FIB dump request");
 904                return -EINVAL;
 905        }
 906
 907        rtm = nlmsg_data(nlh);
 908        if (rtm->rtm_dst_len || rtm->rtm_src_len  || rtm->rtm_tos   ||
 909            rtm->rtm_scope) {
 910                NL_SET_ERR_MSG(extack, "Invalid values in header for FIB dump request");
 911                return -EINVAL;
 912        }
 913
 914        if (rtm->rtm_flags & ~(RTM_F_CLONED | RTM_F_PREFIX)) {
 915                NL_SET_ERR_MSG(extack, "Invalid flags for FIB dump request");
 916                return -EINVAL;
 917        }
 918        if (rtm->rtm_flags & RTM_F_CLONED)
 919                filter->dump_routes = false;
 920        else
 921                filter->dump_exceptions = false;
 922
 923        filter->flags    = rtm->rtm_flags;
 924        filter->protocol = rtm->rtm_protocol;
 925        filter->rt_type  = rtm->rtm_type;
 926        filter->table_id = rtm->rtm_table;
 927
 928        err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
 929                                            rtm_ipv4_policy, extack);
 930        if (err < 0)
 931                return err;
 932
 933        for (i = 0; i <= RTA_MAX; ++i) {
 934                int ifindex;
 935
 936                if (!tb[i])
 937                        continue;
 938
 939                switch (i) {
 940                case RTA_TABLE:
 941                        filter->table_id = nla_get_u32(tb[i]);
 942                        break;
 943                case RTA_OIF:
 944                        ifindex = nla_get_u32(tb[i]);
 945                        filter->dev = __dev_get_by_index(net, ifindex);
 946                        if (!filter->dev)
 947                                return -ENODEV;
 948                        break;
 949                default:
 950                        NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request");
 951                        return -EINVAL;
 952                }
 953        }
 954
 955        if (filter->flags || filter->protocol || filter->rt_type ||
 956            filter->table_id || filter->dev) {
 957                filter->filter_set = 1;
 958                cb->answer_flags = NLM_F_DUMP_FILTERED;
 959        }
 960
 961        return 0;
 962}
 963EXPORT_SYMBOL_GPL(ip_valid_fib_dump_req);
 964
 965static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 966{
 967        struct fib_dump_filter filter = { .dump_routes = true,
 968                                          .dump_exceptions = true };
 969        const struct nlmsghdr *nlh = cb->nlh;
 970        struct net *net = sock_net(skb->sk);
 971        unsigned int h, s_h;
 972        unsigned int e = 0, s_e;
 973        struct fib_table *tb;
 974        struct hlist_head *head;
 975        int dumped = 0, err;
 976
 977        if (cb->strict_check) {
 978                err = ip_valid_fib_dump_req(net, nlh, &filter, cb);
 979                if (err < 0)
 980                        return err;
 981        } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
 982                struct rtmsg *rtm = nlmsg_data(nlh);
 983
 984                filter.flags = rtm->rtm_flags & (RTM_F_PREFIX | RTM_F_CLONED);
 985        }
 986
 987        /* ipv4 does not use prefix flag */
 988        if (filter.flags & RTM_F_PREFIX)
 989                return skb->len;
 990
 991        if (filter.table_id) {
 992                tb = fib_get_table(net, filter.table_id);
 993                if (!tb) {
 994                        if (rtnl_msg_family(cb->nlh) != PF_INET)
 995                                return skb->len;
 996
 997                        NL_SET_ERR_MSG(cb->extack, "ipv4: FIB table does not exist");
 998                        return -ENOENT;
 999                }
1000
1001                rcu_read_lock();
1002                err = fib_table_dump(tb, skb, cb, &filter);
1003                rcu_read_unlock();
1004                return skb->len ? : err;
1005        }
1006
1007        s_h = cb->args[0];
1008        s_e = cb->args[1];
1009
1010        rcu_read_lock();
1011
1012        for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
1013                e = 0;
1014                head = &net->ipv4.fib_table_hash[h];
1015                hlist_for_each_entry_rcu(tb, head, tb_hlist) {
1016                        if (e < s_e)
1017                                goto next;
1018                        if (dumped)
1019                                memset(&cb->args[2], 0, sizeof(cb->args) -
1020                                                 2 * sizeof(cb->args[0]));
1021                        err = fib_table_dump(tb, skb, cb, &filter);
1022                        if (err < 0) {
1023                                if (likely(skb->len))
1024                                        goto out;
1025
1026                                goto out_err;
1027                        }
1028                        dumped = 1;
1029next:
1030                        e++;
1031                }
1032        }
1033out:
1034        err = skb->len;
1035out_err:
1036        rcu_read_unlock();
1037
1038        cb->args[1] = e;
1039        cb->args[0] = h;
1040
1041        return err;
1042}
1043
1044/* Prepare and feed intra-kernel routing request.
1045 * Really, it should be netlink message, but :-( netlink
1046 * can be not configured, so that we feed it directly
1047 * to fib engine. It is legal, because all events occur
1048 * only when netlink is already locked.
1049 */
1050static void fib_magic(int cmd, int type, __be32 dst, int dst_len,
1051                      struct in_ifaddr *ifa, u32 rt_priority)
1052{
1053        struct net *net = dev_net(ifa->ifa_dev->dev);
1054        u32 tb_id = l3mdev_fib_table(ifa->ifa_dev->dev);
1055        struct fib_table *tb;
1056        struct fib_config cfg = {
1057                .fc_protocol = RTPROT_KERNEL,
1058                .fc_type = type,
1059                .fc_dst = dst,
1060                .fc_dst_len = dst_len,
1061                .fc_priority = rt_priority,
1062                .fc_prefsrc = ifa->ifa_local,
1063                .fc_oif = ifa->ifa_dev->dev->ifindex,
1064                .fc_nlflags = NLM_F_CREATE | NLM_F_APPEND,
1065                .fc_nlinfo = {
1066                        .nl_net = net,
1067                },
1068        };
1069
1070        if (!tb_id)
1071                tb_id = (type == RTN_UNICAST) ? RT_TABLE_MAIN : RT_TABLE_LOCAL;
1072
1073        tb = fib_new_table(net, tb_id);
1074        if (!tb)
1075                return;
1076
1077        cfg.fc_table = tb->tb_id;
1078
1079        if (type != RTN_LOCAL)
1080                cfg.fc_scope = RT_SCOPE_LINK;
1081        else
1082                cfg.fc_scope = RT_SCOPE_HOST;
1083
1084        if (cmd == RTM_NEWROUTE)
1085                fib_table_insert(net, tb, &cfg, NULL);
1086        else
1087                fib_table_delete(net, tb, &cfg, NULL);
1088}
1089
1090void fib_add_ifaddr(struct in_ifaddr *ifa)
1091{
1092        struct in_device *in_dev = ifa->ifa_dev;
1093        struct net_device *dev = in_dev->dev;
1094        struct in_ifaddr *prim = ifa;
1095        __be32 mask = ifa->ifa_mask;
1096        __be32 addr = ifa->ifa_local;
1097        __be32 prefix = ifa->ifa_address & mask;
1098
1099        if (ifa->ifa_flags & IFA_F_SECONDARY) {
1100                prim = inet_ifa_byprefix(in_dev, prefix, mask);
1101                if (!prim) {
1102                        pr_warn("%s: bug: prim == NULL\n", __func__);
1103                        return;
1104                }
1105        }
1106
1107        fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim, 0);
1108
1109        if (!(dev->flags & IFF_UP))
1110                return;
1111
1112        /* Add broadcast address, if it is explicitly assigned. */
1113        if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
1114                fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32,
1115                          prim, 0);
1116
1117        if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
1118            (prefix != addr || ifa->ifa_prefixlen < 32)) {
1119                if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE))
1120                        fib_magic(RTM_NEWROUTE,
1121                                  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
1122                                  prefix, ifa->ifa_prefixlen, prim,
1123                                  ifa->ifa_rt_priority);
1124
1125                /* Add network specific broadcasts, when it takes a sense */
1126                if (ifa->ifa_prefixlen < 31) {
1127                        fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32,
1128                                  prim, 0);
1129                        fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
1130                                  32, prim, 0);
1131                }
1132        }
1133}
1134
1135void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric)
1136{
1137        __be32 prefix = ifa->ifa_address & ifa->ifa_mask;
1138        struct in_device *in_dev = ifa->ifa_dev;
1139        struct net_device *dev = in_dev->dev;
1140
1141        if (!(dev->flags & IFF_UP) ||
1142            ifa->ifa_flags & (IFA_F_SECONDARY | IFA_F_NOPREFIXROUTE) ||
1143            ipv4_is_zeronet(prefix) ||
1144            (prefix == ifa->ifa_local && ifa->ifa_prefixlen == 32))
1145                return;
1146
1147        /* add the new */
1148        fib_magic(RTM_NEWROUTE,
1149                  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
1150                  prefix, ifa->ifa_prefixlen, ifa, new_metric);
1151
1152        /* delete the old */
1153        fib_magic(RTM_DELROUTE,
1154                  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
1155                  prefix, ifa->ifa_prefixlen, ifa, ifa->ifa_rt_priority);
1156}
1157
1158/* Delete primary or secondary address.
1159 * Optionally, on secondary address promotion consider the addresses
1160 * from subnet iprim as deleted, even if they are in device list.
1161 * In this case the secondary ifa can be in device list.
1162 */
1163void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
1164{
1165        struct in_device *in_dev = ifa->ifa_dev;
1166        struct net_device *dev = in_dev->dev;
1167        struct in_ifaddr *ifa1;
1168        struct in_ifaddr *prim = ifa, *prim1 = NULL;
            /* brd: ifa_address with every host bit set;
             * any: ifa_address with every host bit cleared.
             */
1169        __be32 brd = ifa->ifa_address | ~ifa->ifa_mask;
1170        __be32 any = ifa->ifa_address & ifa->ifa_mask;
            /* Bits collected in 'ok' during the address scan below.  A set
             * bit means some other address still matches the corresponding
             * route, so that route is NOT deleted after no_promotions:
             *   LOCAL_OK - local route for ifa->ifa_local
             *   BRD_OK   - broadcast route for ifa->ifa_broadcast
             *   BRD0_OK  - broadcast route for 'any'
             *   BRD1_OK  - broadcast route for 'brd'
             */
1171#define LOCAL_OK        1
1172#define BRD_OK          2
1173#define BRD0_OK         4
1174#define BRD1_OK         8
1175        unsigned int ok = 0;
1176        int subnet = 0;         /* Primary network */
1177        int gone = 1;           /* Address is missing */
1178        int same_prefsrc = 0;   /* Another primary with same IP */
1179
1180        if (ifa->ifa_flags & IFA_F_SECONDARY) {
                    /* Secondary address: look up the primary of its subnet and
                     * give up when there is none or when it differs from the
                     * primary the caller passed in.
                     */
1181                prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
1182                if (!prim) {
1183                        /* if the device has been deleted, we don't perform
1184                         * address promotion
1185                         */
1186                        if (!in_dev->dead)
1187                                pr_warn("%s: bug: prim == NULL\n", __func__);
1188                        return;
1189                }
1190                if (iprim && iprim != prim) {
1191                        pr_warn("%s: bug: iprim != prim\n", __func__);
1192                        return;
1193                }
1194        } else if (!ipv4_is_zeronet(any) &&
1195                   (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) {
                    /* Primary address with a non-zero prefix that is not a bare
                     * /32 host entry: drop the prefix route unless the address
                     * is flagged IFA_F_NOPREFIXROUTE, and note that the subnet
                     * broadcast routes are candidates for removal too.
                     */
1196                if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE))
1197                        fib_magic(RTM_DELROUTE,
1198                                  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
1199                                  any, ifa->ifa_prefixlen, prim, 0);
1200                subnet = 1;
1201        }
1202
            /* in_dev is marked dead: skip the scan, which leaves 'ok' at 0 so
             * every route considered after no_promotions is deleted.
             */
1203        if (in_dev->dead)
1204                goto no_promotions;
1205
1206        /* Deletion is more complicated than add.
1207         * We should take care of not to delete too much :-)
1208         *
1209         * Scan address list to be sure that addresses are really gone.
1210         */
1211        rcu_read_lock();
1212        in_dev_for_each_ifa_rcu(ifa1, in_dev) {
1213                if (ifa1 == ifa) {
1214                        /* promotion, keep the IP */
1215                        gone = 0;
1216                        continue;
1217                }
1218                /* Ignore IFAs from our subnet */
1219                if (iprim && ifa1->ifa_mask == iprim->ifa_mask &&
1220                    inet_ifa_match(ifa1->ifa_address, iprim))
1221                        continue;
1222
1223                /* Ignore ifa1 if it uses different primary IP (prefsrc) */
1224                if (ifa1->ifa_flags & IFA_F_SECONDARY) {
1225                        /* Another address from our subnet? */
1226                        if (ifa1->ifa_mask == prim->ifa_mask &&
1227                            inet_ifa_match(ifa1->ifa_address, prim))
1228                                prim1 = prim;
1229                        else {
1230                                /* We reached the secondaries, so
1231                                 * same_prefsrc should be determined.
1232                                 */
1233                                if (!same_prefsrc)
1234                                        continue;
1235                                /* Search new prim1 if ifa1 is not
1236                                 * using the current prim1
1237                                 */
1238                                if (!prim1 ||
1239                                    ifa1->ifa_mask != prim1->ifa_mask ||
1240                                    !inet_ifa_match(ifa1->ifa_address, prim1))
1241                                        prim1 = inet_ifa_byprefix(in_dev,
1242                                                        ifa1->ifa_address,
1243                                                        ifa1->ifa_mask);
1244                                if (!prim1)
1245                                        continue;
1246                                if (prim1->ifa_local != prim->ifa_local)
1247                                        continue;
1248                        }
1249                } else {
                            /* Another primary counts only when it has the same
                             * local address as our primary.
                             */
1250                        if (prim->ifa_local != ifa1->ifa_local)
1251                                continue;
1252                        prim1 = ifa1;
1253                        if (prim != prim1)
1254                                same_prefsrc = 1;
1255                }
                    /* ifa1 passed the filters above: record which of our routes
                     * it still needs.
                     */
1256                if (ifa->ifa_local == ifa1->ifa_local)
1257                        ok |= LOCAL_OK;
1258                if (ifa->ifa_broadcast == ifa1->ifa_broadcast)
1259                        ok |= BRD_OK;
1260                if (brd == ifa1->ifa_broadcast)
1261                        ok |= BRD1_OK;
1262                if (any == ifa1->ifa_broadcast)
1263                        ok |= BRD0_OK;
1264                /* primary has network specific broadcasts */
1265                if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) {
1266                        __be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask;
1267                        __be32 any1 = ifa1->ifa_address & ifa1->ifa_mask;
1268
1269                        if (!ipv4_is_zeronet(any1)) {
1270                                if (ifa->ifa_broadcast == brd1 ||
1271                                    ifa->ifa_broadcast == any1)
1272                                        ok |= BRD_OK;
1273                                if (brd == brd1 || brd == any1)
1274                                        ok |= BRD1_OK;
1275                                if (any == brd1 || any == any1)
1276                                        ok |= BRD0_OK;
1277                        }
1278                }
1279        }
1280        rcu_read_unlock();
1281
1282no_promotions:
            /* Delete each route whose bit was not set in 'ok'. */
1283        if (!(ok & BRD_OK))
1284                fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32,
1285                          prim, 0);
1286        if (subnet && ifa->ifa_prefixlen < 31) {
1287                if (!(ok & BRD1_OK))
1288                        fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32,
1289                                  prim, 0);
1290                if (!(ok & BRD0_OK))
1291                        fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32,
1292                                  prim, 0);
1293        }
1294        if (!(ok & LOCAL_OK)) {
1295                unsigned int addr_type;
1296
1297                fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim, 0);
1298
1299                /* Check, that this local address finally disappeared. */
1300                addr_type = inet_addr_type_dev_table(dev_net(dev), dev,
1301                                                     ifa->ifa_local);
1302                if (gone && addr_type != RTN_LOCAL) {
1303                        /* And the last, but not the least thing.
1304                         * We must flush stray FIB entries.
1305                         *
1306                         * First of all, we scan fib_info list searching
1307                         * for stray nexthop entries, then ignite fib_flush.
1308                         */
1309                        if (fib_sync_down_addr(dev, ifa->ifa_local))
1310                                fib_flush(dev_net(dev));
1311                }
1312        }
1313#undef LOCAL_OK
1314#undef BRD_OK
1315#undef BRD0_OK
1316#undef BRD1_OK
1317}
1318
1319static void nl_fib_lookup(struct net *net, struct fib_result_nl *frn)
1320{
1321
1322        struct fib_result       res;
1323        struct flowi4           fl4 = {
1324                .flowi4_mark = frn->fl_mark,
1325                .daddr = frn->fl_addr,
1326                .flowi4_tos = frn->fl_tos,
1327                .flowi4_scope = frn->fl_scope,
1328        };
1329        struct fib_table *tb;
1330
1331        rcu_read_lock();
1332
1333        tb = fib_get_table(net, frn->tb_id_in);
1334
1335        frn->err = -ENOENT;
1336        if (tb) {
1337                local_bh_disable();
1338
1339                frn->tb_id = tb->tb_id;
1340                frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
1341
1342                if (!frn->err) {
1343                        frn->prefixlen = res.prefixlen;
1344                        frn->nh_sel = res.nh_sel;
1345                        frn->type = res.type;
1346                        frn->scope = res.scope;
1347                }
1348                local_bh_enable();
1349        }
1350
1351        rcu_read_unlock();
1352}
1353
1354static void nl_fib_input(struct sk_buff *skb)
1355{
1356        struct net *net;
1357        struct fib_result_nl *frn;
1358        struct nlmsghdr *nlh;
1359        u32 portid;
1360
1361        net = sock_net(skb->sk);
1362        nlh = nlmsg_hdr(skb);
1363        if (skb->len < nlmsg_total_size(sizeof(*frn)) ||
1364            skb->len < nlh->nlmsg_len ||
1365            nlmsg_len(nlh) < sizeof(*frn))
1366                return;
1367
1368        skb = netlink_skb_clone(skb, GFP_KERNEL);
1369        if (!skb)
1370                return;
1371        nlh = nlmsg_hdr(skb);
1372
1373        frn = (struct fib_result_nl *) nlmsg_data(nlh);
1374        nl_fib_lookup(net, frn);
1375
1376        portid = NETLINK_CB(skb).portid;      /* netlink portid */
1377        NETLINK_CB(skb).portid = 0;        /* from kernel */
1378        NETLINK_CB(skb).dst_group = 0;  /* unicast */
1379        netlink_unicast(net->ipv4.fibnl, skb, portid, MSG_DONTWAIT);
1380}
1381
1382static int __net_init nl_fib_lookup_init(struct net *net)
1383{
1384        struct sock *sk;
1385        struct netlink_kernel_cfg cfg = {
1386                .input  = nl_fib_input,
1387        };
1388
1389        sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, &cfg);
1390        if (!sk)
1391                return -EAFNOSUPPORT;
1392        net->ipv4.fibnl = sk;
1393        return 0;
1394}
1395
1396static void nl_fib_lookup_exit(struct net *net)
1397{
1398        netlink_kernel_release(net->ipv4.fibnl);
1399        net->ipv4.fibnl = NULL;
1400}
1401
1402static void fib_disable_ip(struct net_device *dev, unsigned long event,
1403                           bool force)
1404{
1405        if (fib_sync_down_dev(dev, event, force))
1406                fib_flush(dev_net(dev));
1407        else
1408                rt_cache_flush(dev_net(dev));
1409        arp_ifdown(dev);
1410}
1411
1412static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
1413{
1414        struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
1415        struct net_device *dev = ifa->ifa_dev->dev;
1416        struct net *net = dev_net(dev);
1417
1418        switch (event) {
1419        case NETDEV_UP:
1420                fib_add_ifaddr(ifa);
1421#ifdef CONFIG_IP_ROUTE_MULTIPATH
1422                fib_sync_up(dev, RTNH_F_DEAD);
1423#endif
1424                atomic_inc(&net->ipv4.dev_addr_genid);
1425                rt_cache_flush(dev_net(dev));
1426                break;
1427        case NETDEV_DOWN:
1428                fib_del_ifaddr(ifa, NULL);
1429                atomic_inc(&net->ipv4.dev_addr_genid);
1430                if (!ifa->ifa_dev->ifa_list) {
1431                        /* Last address was deleted from this interface.
1432                         * Disable IP.
1433                         */
1434                        fib_disable_ip(dev, event, true);
1435                } else {
1436                        rt_cache_flush(dev_net(dev));
1437                }
1438                break;
1439        }
1440        return NOTIFY_DONE;
1441}
1442
1443static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
1444{
1445        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1446        struct netdev_notifier_changeupper_info *upper_info = ptr;
1447        struct netdev_notifier_info_ext *info_ext = ptr;
1448        struct in_device *in_dev;
1449        struct net *net = dev_net(dev);
1450        struct in_ifaddr *ifa;
1451        unsigned int flags;
1452
1453        if (event == NETDEV_UNREGISTER) {
1454                fib_disable_ip(dev, event, true);
1455                rt_flush_dev(dev);
1456                return NOTIFY_DONE;
1457        }
1458
1459        in_dev = __in_dev_get_rtnl(dev);
1460        if (!in_dev)
1461                return NOTIFY_DONE;
1462
1463        switch (event) {
1464        case NETDEV_UP:
1465                in_dev_for_each_ifa_rtnl(ifa, in_dev) {
1466                        fib_add_ifaddr(ifa);
1467                }
1468#ifdef CONFIG_IP_ROUTE_MULTIPATH
1469                fib_sync_up(dev, RTNH_F_DEAD);
1470#endif
1471                atomic_inc(&net->ipv4.dev_addr_genid);
1472                rt_cache_flush(net);
1473                break;
1474        case NETDEV_DOWN:
1475                fib_disable_ip(dev, event, false);
1476                break;
1477        case NETDEV_CHANGE:
1478                flags = dev_get_flags(dev);
1479                if (flags & (IFF_RUNNING | IFF_LOWER_UP))
1480                        fib_sync_up(dev, RTNH_F_LINKDOWN);
1481                else
1482                        fib_sync_down_dev(dev, event, false);
1483                rt_cache_flush(net);
1484                break;
1485        case NETDEV_CHANGEMTU:
1486                fib_sync_mtu(dev, info_ext->ext.mtu);
1487                rt_cache_flush(net);
1488                break;
1489        case NETDEV_CHANGEUPPER:
1490                upper_info = ptr;
1491                /* flush all routes if dev is linked to or unlinked from
1492                 * an L3 master device (e.g., VRF)
1493                 */
1494                if (upper_info->upper_dev &&
1495                    netif_is_l3_master(upper_info->upper_dev))
1496                        fib_disable_ip(dev, NETDEV_DOWN, true);
1497                break;
1498        }
1499        return NOTIFY_DONE;
1500}
1501
/* Notifier block that delivers inet address events to fib_inetaddr_event();
 * registered with register_inetaddr_notifier() in ip_fib_init().
 */
1502static struct notifier_block fib_inetaddr_notifier = {
1503        .notifier_call = fib_inetaddr_event,
1504};
1505
/* Notifier block that delivers netdevice events to fib_netdev_event();
 * registered with register_netdevice_notifier() in ip_fib_init().
 */
1506static struct notifier_block fib_netdev_notifier = {
1507        .notifier_call = fib_netdev_event,
1508};
1509
1510static int __net_init ip_fib_net_init(struct net *net)
1511{
1512        int err;
1513        size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;
1514
1515        err = fib4_notifier_init(net);
1516        if (err)
1517                return err;
1518
1519        /* Avoid false sharing : Use at least a full cache line */
1520        size = max_t(size_t, size, L1_CACHE_BYTES);
1521
1522        net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL);
1523        if (!net->ipv4.fib_table_hash) {
1524                err = -ENOMEM;
1525                goto err_table_hash_alloc;
1526        }
1527
1528        err = fib4_rules_init(net);
1529        if (err < 0)
1530                goto err_rules_init;
1531        return 0;
1532
1533err_rules_init:
1534        kfree(net->ipv4.fib_table_hash);
1535err_table_hash_alloc:
1536        fib4_notifier_exit(net);
1537        return err;
1538}
1539
1540static void ip_fib_net_exit(struct net *net)
1541{
1542        int i;
1543
1544        rtnl_lock();
1545#ifdef CONFIG_IP_MULTIPLE_TABLES
1546        RCU_INIT_POINTER(net->ipv4.fib_main, NULL);
1547        RCU_INIT_POINTER(net->ipv4.fib_default, NULL);
1548#endif
1549        /* Destroy the tables in reverse order to guarantee that the
1550         * local table, ID 255, is destroyed before the main table, ID
1551         * 254. This is necessary as the local table may contain
1552         * references to data contained in the main table.
1553         */
1554        for (i = FIB_TABLE_HASHSZ - 1; i >= 0; i--) {
1555                struct hlist_head *head = &net->ipv4.fib_table_hash[i];
1556                struct hlist_node *tmp;
1557                struct fib_table *tb;
1558
1559                hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) {
1560                        hlist_del(&tb->tb_hlist);
1561                        fib_table_flush(net, tb, true);
1562                        fib_free_table(tb);
1563                }
1564        }
1565
1566#ifdef CONFIG_IP_MULTIPLE_TABLES
1567        fib4_rules_exit(net);
1568#endif
1569        rtnl_unlock();
1570        kfree(net->ipv4.fib_table_hash);
1571        fib4_notifier_exit(net);
1572}
1573
1574static int __net_init fib_net_init(struct net *net)
1575{
1576        int error;
1577
1578#ifdef CONFIG_IP_ROUTE_CLASSID
1579        net->ipv4.fib_num_tclassid_users = 0;
1580#endif
1581        error = ip_fib_net_init(net);
1582        if (error < 0)
1583                goto out;
1584        error = nl_fib_lookup_init(net);
1585        if (error < 0)
1586                goto out_nlfl;
1587        error = fib_proc_init(net);
1588        if (error < 0)
1589                goto out_proc;
1590out:
1591        return error;
1592
1593out_proc:
1594        nl_fib_lookup_exit(net);
1595out_nlfl:
1596        ip_fib_net_exit(net);
1597        goto out;
1598}
1599
/* pernet exit hook: undo the three steps of fib_net_init() in reverse
 * order - proc entries, FIB lookup netlink socket, then the FIB tables.
 */
1600static void __net_exit fib_net_exit(struct net *net)
1601{
1602        fib_proc_exit(net);
1603        nl_fib_lookup_exit(net);
1604        ip_fib_net_exit(net);
1605}
1606
/* Per-network-namespace init/exit hooks of the FIB frontend; registered
 * with register_pernet_subsys() in ip_fib_init().
 */
1607static struct pernet_operations fib_net_ops = {
1608        .init = fib_net_init,
1609        .exit = fib_net_exit,
1610};
1611
/* One-time initialisation of the IPv4 FIB frontend: initialise the FIB
 * trie, register the per-namespace operations, hook the netdevice and
 * inet address notifiers, and register the PF_INET rtnetlink route
 * handlers.  Any status these calls return is discarded.
 */
1612void __init ip_fib_init(void)
1613{
1614        fib_trie_init();
1615
1616        register_pernet_subsys(&fib_net_ops);
1617
1618        register_netdevice_notifier(&fib_netdev_notifier);
1619        register_inetaddr_notifier(&fib_inetaddr_notifier);
1620
            /* RTM_NEWROUTE and RTM_DELROUTE are served by inet_rtm_newroute()
             * and inet_rtm_delroute(); RTM_GETROUTE by inet_dump_fib().
             */
1621        rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, 0);
1622        rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, 0);
1623        rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, 0);
1624}
1625