linux/net/ipv4/fib_frontend.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4 *              operating system.  INET is implemented using the  BSD Socket
   5 *              interface as the means of communication with the user level.
   6 *
   7 *              IPv4 Forwarding Information Base: FIB frontend.
   8 *
   9 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10 */
  11
  12#include <linux/module.h>
  13#include <linux/uaccess.h>
  14#include <linux/bitops.h>
  15#include <linux/capability.h>
  16#include <linux/types.h>
  17#include <linux/kernel.h>
  18#include <linux/mm.h>
  19#include <linux/string.h>
  20#include <linux/socket.h>
  21#include <linux/sockios.h>
  22#include <linux/errno.h>
  23#include <linux/in.h>
  24#include <linux/inet.h>
  25#include <linux/inetdevice.h>
  26#include <linux/netdevice.h>
  27#include <linux/if_addr.h>
  28#include <linux/if_arp.h>
  29#include <linux/skbuff.h>
  30#include <linux/cache.h>
  31#include <linux/init.h>
  32#include <linux/list.h>
  33#include <linux/slab.h>
  34
  35#include <net/ip.h>
  36#include <net/protocol.h>
  37#include <net/route.h>
  38#include <net/tcp.h>
  39#include <net/sock.h>
  40#include <net/arp.h>
  41#include <net/ip_fib.h>
  42#include <net/nexthop.h>
  43#include <net/rtnetlink.h>
  44#include <net/xfrm.h>
  45#include <net/l3mdev.h>
  46#include <net/lwtunnel.h>
  47#include <trace/events/fib.h>
  48
  49#ifndef CONFIG_IP_MULTIPLE_TABLES
  50
  51static int __net_init fib4_rules_init(struct net *net)
  52{
  53        struct fib_table *local_table, *main_table;
  54
  55        main_table  = fib_trie_table(RT_TABLE_MAIN, NULL);
  56        if (!main_table)
  57                return -ENOMEM;
  58
  59        local_table = fib_trie_table(RT_TABLE_LOCAL, main_table);
  60        if (!local_table)
  61                goto fail;
  62
  63        hlist_add_head_rcu(&local_table->tb_hlist,
  64                                &net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX]);
  65        hlist_add_head_rcu(&main_table->tb_hlist,
  66                                &net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]);
  67        return 0;
  68
  69fail:
  70        fib_free_table(main_table);
  71        return -ENOMEM;
  72}
  73#else
  74
  75struct fib_table *fib_new_table(struct net *net, u32 id)
  76{
  77        struct fib_table *tb, *alias = NULL;
  78        unsigned int h;
  79
  80        if (id == 0)
  81                id = RT_TABLE_MAIN;
  82        tb = fib_get_table(net, id);
  83        if (tb)
  84                return tb;
  85
  86        if (id == RT_TABLE_LOCAL && !net->ipv4.fib_has_custom_rules)
  87                alias = fib_new_table(net, RT_TABLE_MAIN);
  88
  89        tb = fib_trie_table(id, alias);
  90        if (!tb)
  91                return NULL;
  92
  93        switch (id) {
  94        case RT_TABLE_MAIN:
  95                rcu_assign_pointer(net->ipv4.fib_main, tb);
  96                break;
  97        case RT_TABLE_DEFAULT:
  98                rcu_assign_pointer(net->ipv4.fib_default, tb);
  99                break;
 100        default:
 101                break;
 102        }
 103
 104        h = id & (FIB_TABLE_HASHSZ - 1);
 105        hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]);
 106        return tb;
 107}
 108EXPORT_SYMBOL_GPL(fib_new_table);
 109
 110/* caller must hold either rtnl or rcu read lock */
 111struct fib_table *fib_get_table(struct net *net, u32 id)
 112{
 113        struct fib_table *tb;
 114        struct hlist_head *head;
 115        unsigned int h;
 116
 117        if (id == 0)
 118                id = RT_TABLE_MAIN;
 119        h = id & (FIB_TABLE_HASHSZ - 1);
 120
 121        head = &net->ipv4.fib_table_hash[h];
 122        hlist_for_each_entry_rcu(tb, head, tb_hlist,
 123                                 lockdep_rtnl_is_held()) {
 124                if (tb->tb_id == id)
 125                        return tb;
 126        }
 127        return NULL;
 128}
 129#endif /* CONFIG_IP_MULTIPLE_TABLES */
 130
 131static void fib_replace_table(struct net *net, struct fib_table *old,
 132                              struct fib_table *new)
 133{
 134#ifdef CONFIG_IP_MULTIPLE_TABLES
 135        switch (new->tb_id) {
 136        case RT_TABLE_MAIN:
 137                rcu_assign_pointer(net->ipv4.fib_main, new);
 138                break;
 139        case RT_TABLE_DEFAULT:
 140                rcu_assign_pointer(net->ipv4.fib_default, new);
 141                break;
 142        default:
 143                break;
 144        }
 145
 146#endif
 147        /* replace the old table in the hlist */
 148        hlist_replace_rcu(&old->tb_hlist, &new->tb_hlist);
 149}
 150
 151int fib_unmerge(struct net *net)
 152{
 153        struct fib_table *old, *new, *main_table;
 154
 155        /* attempt to fetch local table if it has been allocated */
 156        old = fib_get_table(net, RT_TABLE_LOCAL);
 157        if (!old)
 158                return 0;
 159
 160        new = fib_trie_unmerge(old);
 161        if (!new)
 162                return -ENOMEM;
 163
 164        /* table is already unmerged */
 165        if (new == old)
 166                return 0;
 167
 168        /* replace merged table with clean table */
 169        fib_replace_table(net, old, new);
 170        fib_free_table(old);
 171
 172        /* attempt to fetch main table if it has been allocated */
 173        main_table = fib_get_table(net, RT_TABLE_MAIN);
 174        if (!main_table)
 175                return 0;
 176
 177        /* flush local entries from main table */
 178        fib_table_flush_external(main_table);
 179
 180        return 0;
 181}
 182
 183void fib_flush(struct net *net)
 184{
 185        int flushed = 0;
 186        unsigned int h;
 187
 188        for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
 189                struct hlist_head *head = &net->ipv4.fib_table_hash[h];
 190                struct hlist_node *tmp;
 191                struct fib_table *tb;
 192
 193                hlist_for_each_entry_safe(tb, tmp, head, tb_hlist)
 194                        flushed += fib_table_flush(net, tb, false);
 195        }
 196
 197        if (flushed)
 198                rt_cache_flush(net);
 199}
 200
 201/*
 202 * Find address type as if only "dev" was present in the system. If
 203 * on_dev is NULL then all interfaces are taken into consideration.
 204 */
 205static inline unsigned int __inet_dev_addr_type(struct net *net,
 206                                                const struct net_device *dev,
 207                                                __be32 addr, u32 tb_id)
 208{
 209        struct flowi4           fl4 = { .daddr = addr };
 210        struct fib_result       res;
 211        unsigned int ret = RTN_BROADCAST;
 212        struct fib_table *table;
 213
 214        if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
 215                return RTN_BROADCAST;
 216        if (ipv4_is_multicast(addr))
 217                return RTN_MULTICAST;
 218
 219        rcu_read_lock();
 220
 221        table = fib_get_table(net, tb_id);
 222        if (table) {
 223                ret = RTN_UNICAST;
 224                if (!fib_table_lookup(table, &fl4, &res, FIB_LOOKUP_NOREF)) {
 225                        struct fib_nh_common *nhc = fib_info_nhc(res.fi, 0);
 226
 227                        if (!dev || dev == nhc->nhc_dev)
 228                                ret = res.type;
 229                }
 230        }
 231
 232        rcu_read_unlock();
 233        return ret;
 234}
 235
 236unsigned int inet_addr_type_table(struct net *net, __be32 addr, u32 tb_id)
 237{
 238        return __inet_dev_addr_type(net, NULL, addr, tb_id);
 239}
 240EXPORT_SYMBOL(inet_addr_type_table);
 241
 242unsigned int inet_addr_type(struct net *net, __be32 addr)
 243{
 244        return __inet_dev_addr_type(net, NULL, addr, RT_TABLE_LOCAL);
 245}
 246EXPORT_SYMBOL(inet_addr_type);
 247
 248unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
 249                                __be32 addr)
 250{
 251        u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL;
 252
 253        return __inet_dev_addr_type(net, dev, addr, rt_table);
 254}
 255EXPORT_SYMBOL(inet_dev_addr_type);
 256
 257/* inet_addr_type with dev == NULL but using the table from a dev
 258 * if one is associated
 259 */
 260unsigned int inet_addr_type_dev_table(struct net *net,
 261                                      const struct net_device *dev,
 262                                      __be32 addr)
 263{
 264        u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL;
 265
 266        return __inet_dev_addr_type(net, NULL, addr, rt_table);
 267}
 268EXPORT_SYMBOL(inet_addr_type_dev_table);
 269
 270__be32 fib_compute_spec_dst(struct sk_buff *skb)
 271{
 272        struct net_device *dev = skb->dev;
 273        struct in_device *in_dev;
 274        struct fib_result res;
 275        struct rtable *rt;
 276        struct net *net;
 277        int scope;
 278
 279        rt = skb_rtable(skb);
 280        if ((rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL)) ==
 281            RTCF_LOCAL)
 282                return ip_hdr(skb)->daddr;
 283
 284        in_dev = __in_dev_get_rcu(dev);
 285
 286        net = dev_net(dev);
 287
 288        scope = RT_SCOPE_UNIVERSE;
 289        if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
 290                bool vmark = in_dev && IN_DEV_SRC_VMARK(in_dev);
 291                struct flowi4 fl4 = {
 292                        .flowi4_iif = LOOPBACK_IFINDEX,
 293                        .flowi4_oif = l3mdev_master_ifindex_rcu(dev),
 294                        .daddr = ip_hdr(skb)->saddr,
 295                        .flowi4_tos = ip_hdr(skb)->tos & IPTOS_RT_MASK,
 296                        .flowi4_scope = scope,
 297                        .flowi4_mark = vmark ? skb->mark : 0,
 298                };
 299                if (!fib_lookup(net, &fl4, &res, 0))
 300                        return fib_result_prefsrc(net, &res);
 301        } else {
 302                scope = RT_SCOPE_LINK;
 303        }
 304
 305        return inet_select_addr(dev, ip_hdr(skb)->saddr, scope);
 306}
 307
 308bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev)
 309{
 310        bool dev_match = false;
 311#ifdef CONFIG_IP_ROUTE_MULTIPATH
 312        if (unlikely(fi->nh)) {
 313                dev_match = nexthop_uses_dev(fi->nh, dev);
 314        } else {
 315                int ret;
 316
 317                for (ret = 0; ret < fib_info_num_path(fi); ret++) {
 318                        const struct fib_nh_common *nhc = fib_info_nhc(fi, ret);
 319
 320                        if (nhc_l3mdev_matches_dev(nhc, dev)) {
 321                                dev_match = true;
 322                                break;
 323                        }
 324                }
 325        }
 326#else
 327        if (fib_info_nhc(fi, 0)->nhc_dev == dev)
 328                dev_match = true;
 329#endif
 330
 331        return dev_match;
 332}
 333EXPORT_SYMBOL_GPL(fib_info_nh_uses_dev);
 334
 335/* Given (packet source, input interface) and optional (dst, oif, tos):
 336 * - (main) check, that source is valid i.e. not broadcast or our local
 337 *   address.
 338 * - figure out what "logical" interface this packet arrived
 339 *   and calculate "specific destination" address.
 340 * - check, that packet arrived from expected physical interface.
 341 * called with rcu_read_lock()
 342 */
 343static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 344                                 u8 tos, int oif, struct net_device *dev,
 345                                 int rpf, struct in_device *idev, u32 *itag)
 346{
 347        struct net *net = dev_net(dev);
 348        struct flow_keys flkeys;
 349        int ret, no_addr;
 350        struct fib_result res;
 351        struct flowi4 fl4;
 352        bool dev_match;
 353
 354        fl4.flowi4_oif = 0;
 355        fl4.flowi4_iif = l3mdev_master_ifindex_rcu(dev);
 356        if (!fl4.flowi4_iif)
 357                fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX;
 358        fl4.daddr = src;
 359        fl4.saddr = dst;
 360        fl4.flowi4_tos = tos;
 361        fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
 362        fl4.flowi4_tun_key.tun_id = 0;
 363        fl4.flowi4_flags = 0;
 364        fl4.flowi4_uid = sock_net_uid(net, NULL);
 365        fl4.flowi4_multipath_hash = 0;
 366
 367        no_addr = idev->ifa_list == NULL;
 368
 369        fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0;
 370        if (!fib4_rules_early_flow_dissect(net, skb, &fl4, &flkeys)) {
 371                fl4.flowi4_proto = 0;
 372                fl4.fl4_sport = 0;
 373                fl4.fl4_dport = 0;
 374        } else {
 375                swap(fl4.fl4_sport, fl4.fl4_dport);
 376        }
 377
 378        if (fib_lookup(net, &fl4, &res, 0))
 379                goto last_resort;
 380        if (res.type != RTN_UNICAST &&
 381            (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev)))
 382                goto e_inval;
 383        fib_combine_itag(itag, &res);
 384
 385        dev_match = fib_info_nh_uses_dev(res.fi, dev);
 386        /* This is not common, loopback packets retain skb_dst so normally they
 387         * would not even hit this slow path.
 388         */
 389        dev_match = dev_match || (res.type == RTN_LOCAL &&
 390                                  dev == net->loopback_dev);
 391        if (dev_match) {
 392                ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST;
 393                return ret;
 394        }
 395        if (no_addr)
 396                goto last_resort;
 397        if (rpf == 1)
 398                goto e_rpf;
 399        fl4.flowi4_oif = dev->ifindex;
 400
 401        ret = 0;
 402        if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) {
 403                if (res.type == RTN_UNICAST)
 404                        ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST;
 405        }
 406        return ret;
 407
 408last_resort:
 409        if (rpf)
 410                goto e_rpf;
 411        *itag = 0;
 412        return 0;
 413
 414e_inval:
 415        return -EINVAL;
 416e_rpf:
 417        return -EXDEV;
 418}
 419
 420/* Ignore rp_filter for packets protected by IPsec. */
 421int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 422                        u8 tos, int oif, struct net_device *dev,
 423                        struct in_device *idev, u32 *itag)
 424{
 425        int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev);
 426        struct net *net = dev_net(dev);
 427
 428        if (!r && !fib_num_tclassid_users(net) &&
 429            (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) {
 430                if (IN_DEV_ACCEPT_LOCAL(idev))
 431                        goto ok;
 432                /* with custom local routes in place, checking local addresses
 433                 * only will be too optimistic, with custom rules, checking
 434                 * local addresses only can be too strict, e.g. due to vrf
 435                 */
 436                if (net->ipv4.fib_has_custom_local_routes ||
 437                    fib4_has_custom_rules(net))
 438                        goto full_check;
 439                if (inet_lookup_ifaddr_rcu(net, src))
 440                        return -EINVAL;
 441
 442ok:
 443                *itag = 0;
 444                return 0;
 445        }
 446
 447full_check:
 448        return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag);
 449}
 450
 451static inline __be32 sk_extract_addr(struct sockaddr *addr)
 452{
 453        return ((struct sockaddr_in *) addr)->sin_addr.s_addr;
 454}
 455
 456static int put_rtax(struct nlattr *mx, int len, int type, u32 value)
 457{
 458        struct nlattr *nla;
 459
 460        nla = (struct nlattr *) ((char *) mx + len);
 461        nla->nla_type = type;
 462        nla->nla_len = nla_attr_size(4);
 463        *(u32 *) nla_data(nla) = value;
 464
 465        return len + nla_total_size(4);
 466}
 467
 468static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
 469                                 struct fib_config *cfg)
 470{
 471        __be32 addr;
 472        int plen;
 473
 474        memset(cfg, 0, sizeof(*cfg));
 475        cfg->fc_nlinfo.nl_net = net;
 476
 477        if (rt->rt_dst.sa_family != AF_INET)
 478                return -EAFNOSUPPORT;
 479
 480        /*
 481         * Check mask for validity:
 482         * a) it must be contiguous.
 483         * b) destination must have all host bits clear.
 484         * c) if application forgot to set correct family (AF_INET),
 485         *    reject request unless it is absolutely clear i.e.
 486         *    both family and mask are zero.
 487         */
 488        plen = 32;
 489        addr = sk_extract_addr(&rt->rt_dst);
 490        if (!(rt->rt_flags & RTF_HOST)) {
 491                __be32 mask = sk_extract_addr(&rt->rt_genmask);
 492
 493                if (rt->rt_genmask.sa_family != AF_INET) {
 494                        if (mask || rt->rt_genmask.sa_family)
 495                                return -EAFNOSUPPORT;
 496                }
 497
 498                if (bad_mask(mask, addr))
 499                        return -EINVAL;
 500
 501                plen = inet_mask_len(mask);
 502        }
 503
 504        cfg->fc_dst_len = plen;
 505        cfg->fc_dst = addr;
 506
 507        if (cmd != SIOCDELRT) {
 508                cfg->fc_nlflags = NLM_F_CREATE;
 509                cfg->fc_protocol = RTPROT_BOOT;
 510        }
 511
 512        if (rt->rt_metric)
 513                cfg->fc_priority = rt->rt_metric - 1;
 514
 515        if (rt->rt_flags & RTF_REJECT) {
 516                cfg->fc_scope = RT_SCOPE_HOST;
 517                cfg->fc_type = RTN_UNREACHABLE;
 518                return 0;
 519        }
 520
 521        cfg->fc_scope = RT_SCOPE_NOWHERE;
 522        cfg->fc_type = RTN_UNICAST;
 523
 524        if (rt->rt_dev) {
 525                char *colon;
 526                struct net_device *dev;
 527                char devname[IFNAMSIZ];
 528
 529                if (copy_from_user(devname, rt->rt_dev, IFNAMSIZ-1))
 530                        return -EFAULT;
 531
 532                devname[IFNAMSIZ-1] = 0;
 533                colon = strchr(devname, ':');
 534                if (colon)
 535                        *colon = 0;
 536                dev = __dev_get_by_name(net, devname);
 537                if (!dev)
 538                        return -ENODEV;
 539                cfg->fc_oif = dev->ifindex;
 540                cfg->fc_table = l3mdev_fib_table(dev);
 541                if (colon) {
 542                        const struct in_ifaddr *ifa;
 543                        struct in_device *in_dev;
 544
 545                        in_dev = __in_dev_get_rtnl(dev);
 546                        if (!in_dev)
 547                                return -ENODEV;
 548
 549                        *colon = ':';
 550
 551                        rcu_read_lock();
 552                        in_dev_for_each_ifa_rcu(ifa, in_dev) {
 553                                if (strcmp(ifa->ifa_label, devname) == 0)
 554                                        break;
 555                        }
 556                        rcu_read_unlock();
 557
 558                        if (!ifa)
 559                                return -ENODEV;
 560                        cfg->fc_prefsrc = ifa->ifa_local;
 561                }
 562        }
 563
 564        addr = sk_extract_addr(&rt->rt_gateway);
 565        if (rt->rt_gateway.sa_family == AF_INET && addr) {
 566                unsigned int addr_type;
 567
 568                cfg->fc_gw4 = addr;
 569                cfg->fc_gw_family = AF_INET;
 570                addr_type = inet_addr_type_table(net, addr, cfg->fc_table);
 571                if (rt->rt_flags & RTF_GATEWAY &&
 572                    addr_type == RTN_UNICAST)
 573                        cfg->fc_scope = RT_SCOPE_UNIVERSE;
 574        }
 575
 576        if (cmd == SIOCDELRT)
 577                return 0;
 578
 579        if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw_family)
 580                return -EINVAL;
 581
 582        if (cfg->fc_scope == RT_SCOPE_NOWHERE)
 583                cfg->fc_scope = RT_SCOPE_LINK;
 584
 585        if (rt->rt_flags & (RTF_MTU | RTF_WINDOW | RTF_IRTT)) {
 586                struct nlattr *mx;
 587                int len = 0;
 588
 589                mx = kcalloc(3, nla_total_size(4), GFP_KERNEL);
 590                if (!mx)
 591                        return -ENOMEM;
 592
 593                if (rt->rt_flags & RTF_MTU)
 594                        len = put_rtax(mx, len, RTAX_ADVMSS, rt->rt_mtu - 40);
 595
 596                if (rt->rt_flags & RTF_WINDOW)
 597                        len = put_rtax(mx, len, RTAX_WINDOW, rt->rt_window);
 598
 599                if (rt->rt_flags & RTF_IRTT)
 600                        len = put_rtax(mx, len, RTAX_RTT, rt->rt_irtt << 3);
 601
 602                cfg->fc_mx = mx;
 603                cfg->fc_mx_len = len;
 604        }
 605
 606        return 0;
 607}
 608
 609/*
 610 * Handle IP routing ioctl calls.
 611 * These are used to manipulate the routing tables
 612 */
 613int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt)
 614{
 615        struct fib_config cfg;
 616        int err;
 617
 618        switch (cmd) {
 619        case SIOCADDRT:         /* Add a route */
 620        case SIOCDELRT:         /* Delete a route */
 621                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 622                        return -EPERM;
 623
 624                rtnl_lock();
 625                err = rtentry_to_fib_config(net, cmd, rt, &cfg);
 626                if (err == 0) {
 627                        struct fib_table *tb;
 628
 629                        if (cmd == SIOCDELRT) {
 630                                tb = fib_get_table(net, cfg.fc_table);
 631                                if (tb)
 632                                        err = fib_table_delete(net, tb, &cfg,
 633                                                               NULL);
 634                                else
 635                                        err = -ESRCH;
 636                        } else {
 637                                tb = fib_new_table(net, cfg.fc_table);
 638                                if (tb)
 639                                        err = fib_table_insert(net, tb,
 640                                                               &cfg, NULL);
 641                                else
 642                                        err = -ENOBUFS;
 643                        }
 644
 645                        /* allocated by rtentry_to_fib_config() */
 646                        kfree(cfg.fc_mx);
 647                }
 648                rtnl_unlock();
 649                return err;
 650        }
 651        return -EINVAL;
 652}
 653
 654const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
 655        [RTA_UNSPEC]            = { .strict_start_type = RTA_DPORT + 1 },
 656        [RTA_DST]               = { .type = NLA_U32 },
 657        [RTA_SRC]               = { .type = NLA_U32 },
 658        [RTA_IIF]               = { .type = NLA_U32 },
 659        [RTA_OIF]               = { .type = NLA_U32 },
 660        [RTA_GATEWAY]           = { .type = NLA_U32 },
 661        [RTA_PRIORITY]          = { .type = NLA_U32 },
 662        [RTA_PREFSRC]           = { .type = NLA_U32 },
 663        [RTA_METRICS]           = { .type = NLA_NESTED },
 664        [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
 665        [RTA_FLOW]              = { .type = NLA_U32 },
 666        [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
 667        [RTA_ENCAP]             = { .type = NLA_NESTED },
 668        [RTA_UID]               = { .type = NLA_U32 },
 669        [RTA_MARK]              = { .type = NLA_U32 },
 670        [RTA_TABLE]             = { .type = NLA_U32 },
 671        [RTA_IP_PROTO]          = { .type = NLA_U8 },
 672        [RTA_SPORT]             = { .type = NLA_U16 },
 673        [RTA_DPORT]             = { .type = NLA_U16 },
 674        [RTA_NH_ID]             = { .type = NLA_U32 },
 675};
 676
 677int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla,
 678                    struct netlink_ext_ack *extack)
 679{
 680        struct rtvia *via;
 681        int alen;
 682
 683        if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr)) {
 684                NL_SET_ERR_MSG(extack, "Invalid attribute length for RTA_VIA");
 685                return -EINVAL;
 686        }
 687
 688        via = nla_data(nla);
 689        alen = nla_len(nla) - offsetof(struct rtvia, rtvia_addr);
 690
 691        switch (via->rtvia_family) {
 692        case AF_INET:
 693                if (alen != sizeof(__be32)) {
 694                        NL_SET_ERR_MSG(extack, "Invalid IPv4 address in RTA_VIA");
 695                        return -EINVAL;
 696                }
 697                cfg->fc_gw_family = AF_INET;
 698                cfg->fc_gw4 = *((__be32 *)via->rtvia_addr);
 699                break;
 700        case AF_INET6:
 701#if IS_ENABLED(CONFIG_IPV6)
 702                if (alen != sizeof(struct in6_addr)) {
 703                        NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_VIA");
 704                        return -EINVAL;
 705                }
 706                cfg->fc_gw_family = AF_INET6;
 707                cfg->fc_gw6 = *((struct in6_addr *)via->rtvia_addr);
 708#else
 709                NL_SET_ERR_MSG(extack, "IPv6 support not enabled in kernel");
 710                return -EINVAL;
 711#endif
 712                break;
 713        default:
 714                NL_SET_ERR_MSG(extack, "Unsupported address family in RTA_VIA");
 715                return -EINVAL;
 716        }
 717
 718        return 0;
 719}
 720
 721static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
 722                             struct nlmsghdr *nlh, struct fib_config *cfg,
 723                             struct netlink_ext_ack *extack)
 724{
 725        bool has_gw = false, has_via = false;
 726        struct nlattr *attr;
 727        int err, remaining;
 728        struct rtmsg *rtm;
 729
 730        err = nlmsg_validate_deprecated(nlh, sizeof(*rtm), RTA_MAX,
 731                                        rtm_ipv4_policy, extack);
 732        if (err < 0)
 733                goto errout;
 734
 735        memset(cfg, 0, sizeof(*cfg));
 736
 737        rtm = nlmsg_data(nlh);
 738        cfg->fc_dst_len = rtm->rtm_dst_len;
 739        cfg->fc_tos = rtm->rtm_tos;
 740        cfg->fc_table = rtm->rtm_table;
 741        cfg->fc_protocol = rtm->rtm_protocol;
 742        cfg->fc_scope = rtm->rtm_scope;
 743        cfg->fc_type = rtm->rtm_type;
 744        cfg->fc_flags = rtm->rtm_flags;
 745        cfg->fc_nlflags = nlh->nlmsg_flags;
 746
 747        cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
 748        cfg->fc_nlinfo.nlh = nlh;
 749        cfg->fc_nlinfo.nl_net = net;
 750
 751        if (cfg->fc_type > RTN_MAX) {
 752                NL_SET_ERR_MSG(extack, "Invalid route type");
 753                err = -EINVAL;
 754                goto errout;
 755        }
 756
 757        nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), remaining) {
 758                switch (nla_type(attr)) {
 759                case RTA_DST:
 760                        cfg->fc_dst = nla_get_be32(attr);
 761                        break;
 762                case RTA_OIF:
 763                        cfg->fc_oif = nla_get_u32(attr);
 764                        break;
 765                case RTA_GATEWAY:
 766                        has_gw = true;
 767                        cfg->fc_gw4 = nla_get_be32(attr);
 768                        if (cfg->fc_gw4)
 769                                cfg->fc_gw_family = AF_INET;
 770                        break;
 771                case RTA_VIA:
 772                        has_via = true;
 773                        err = fib_gw_from_via(cfg, attr, extack);
 774                        if (err)
 775                                goto errout;
 776                        break;
 777                case RTA_PRIORITY:
 778                        cfg->fc_priority = nla_get_u32(attr);
 779                        break;
 780                case RTA_PREFSRC:
 781                        cfg->fc_prefsrc = nla_get_be32(attr);
 782                        break;
 783                case RTA_METRICS:
 784                        cfg->fc_mx = nla_data(attr);
 785                        cfg->fc_mx_len = nla_len(attr);
 786                        break;
 787                case RTA_MULTIPATH:
 788                        err = lwtunnel_valid_encap_type_attr(nla_data(attr),
 789                                                             nla_len(attr),
 790                                                             extack);
 791                        if (err < 0)
 792                                goto errout;
 793                        cfg->fc_mp = nla_data(attr);
 794                        cfg->fc_mp_len = nla_len(attr);
 795                        break;
 796                case RTA_FLOW:
 797                        cfg->fc_flow = nla_get_u32(attr);
 798                        break;
 799                case RTA_TABLE:
 800                        cfg->fc_table = nla_get_u32(attr);
 801                        break;
 802                case RTA_ENCAP:
 803                        cfg->fc_encap = attr;
 804                        break;
 805                case RTA_ENCAP_TYPE:
 806                        cfg->fc_encap_type = nla_get_u16(attr);
 807                        err = lwtunnel_valid_encap_type(cfg->fc_encap_type,
 808                                                        extack);
 809                        if (err < 0)
 810                                goto errout;
 811                        break;
 812                case RTA_NH_ID:
 813                        cfg->fc_nh_id = nla_get_u32(attr);
 814                        break;
 815                }
 816        }
 817
 818        if (cfg->fc_nh_id) {
 819                if (cfg->fc_oif || cfg->fc_gw_family ||
 820                    cfg->fc_encap || cfg->fc_mp) {
 821                        NL_SET_ERR_MSG(extack,
 822                                       "Nexthop specification and nexthop id are mutually exclusive");
 823                        return -EINVAL;
 824                }
 825        }
 826
 827        if (has_gw && has_via) {
 828                NL_SET_ERR_MSG(extack,
 829                               "Nexthop configuration can not contain both GATEWAY and VIA");
 830                return -EINVAL;
 831        }
 832
 833        return 0;
 834errout:
 835        return err;
 836}
 837
 838static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
 839                             struct netlink_ext_ack *extack)
 840{
 841        struct net *net = sock_net(skb->sk);
 842        struct fib_config cfg;
 843        struct fib_table *tb;
 844        int err;
 845
 846        err = rtm_to_fib_config(net, skb, nlh, &cfg, extack);
 847        if (err < 0)
 848                goto errout;
 849
 850        if (cfg.fc_nh_id && !nexthop_find_by_id(net, cfg.fc_nh_id)) {
 851                NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
 852                err = -EINVAL;
 853                goto errout;
 854        }
 855
 856        tb = fib_get_table(net, cfg.fc_table);
 857        if (!tb) {
 858                NL_SET_ERR_MSG(extack, "FIB table does not exist");
 859                err = -ESRCH;
 860                goto errout;
 861        }
 862
 863        err = fib_table_delete(net, tb, &cfg, extack);
 864errout:
 865        return err;
 866}
 867
 868static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
 869                             struct netlink_ext_ack *extack)
 870{
 871        struct net *net = sock_net(skb->sk);
 872        struct fib_config cfg;
 873        struct fib_table *tb;
 874        int err;
 875
 876        err = rtm_to_fib_config(net, skb, nlh, &cfg, extack);
 877        if (err < 0)
 878                goto errout;
 879
 880        tb = fib_new_table(net, cfg.fc_table);
 881        if (!tb) {
 882                err = -ENOBUFS;
 883                goto errout;
 884        }
 885
 886        err = fib_table_insert(net, tb, &cfg, extack);
 887        if (!err && cfg.fc_type == RTN_LOCAL)
 888                net->ipv4.fib_has_custom_local_routes = true;
 889errout:
 890        return err;
 891}
 892
 893int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 894                          struct fib_dump_filter *filter,
 895                          struct netlink_callback *cb)
 896{
 897        struct netlink_ext_ack *extack = cb->extack;
 898        struct nlattr *tb[RTA_MAX + 1];
 899        struct rtmsg *rtm;
 900        int err, i;
 901
 902        ASSERT_RTNL();
 903
 904        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
 905                NL_SET_ERR_MSG(extack, "Invalid header for FIB dump request");
 906                return -EINVAL;
 907        }
 908
 909        rtm = nlmsg_data(nlh);
 910        if (rtm->rtm_dst_len || rtm->rtm_src_len  || rtm->rtm_tos   ||
 911            rtm->rtm_scope) {
 912                NL_SET_ERR_MSG(extack, "Invalid values in header for FIB dump request");
 913                return -EINVAL;
 914        }
 915
 916        if (rtm->rtm_flags & ~(RTM_F_CLONED | RTM_F_PREFIX)) {
 917                NL_SET_ERR_MSG(extack, "Invalid flags for FIB dump request");
 918                return -EINVAL;
 919        }
 920        if (rtm->rtm_flags & RTM_F_CLONED)
 921                filter->dump_routes = false;
 922        else
 923                filter->dump_exceptions = false;
 924
 925        filter->flags    = rtm->rtm_flags;
 926        filter->protocol = rtm->rtm_protocol;
 927        filter->rt_type  = rtm->rtm_type;
 928        filter->table_id = rtm->rtm_table;
 929
 930        err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
 931                                            rtm_ipv4_policy, extack);
 932        if (err < 0)
 933                return err;
 934
 935        for (i = 0; i <= RTA_MAX; ++i) {
 936                int ifindex;
 937
 938                if (!tb[i])
 939                        continue;
 940
 941                switch (i) {
 942                case RTA_TABLE:
 943                        filter->table_id = nla_get_u32(tb[i]);
 944                        break;
 945                case RTA_OIF:
 946                        ifindex = nla_get_u32(tb[i]);
 947                        filter->dev = __dev_get_by_index(net, ifindex);
 948                        if (!filter->dev)
 949                                return -ENODEV;
 950                        break;
 951                default:
 952                        NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request");
 953                        return -EINVAL;
 954                }
 955        }
 956
 957        if (filter->flags || filter->protocol || filter->rt_type ||
 958            filter->table_id || filter->dev) {
 959                filter->filter_set = 1;
 960                cb->answer_flags = NLM_F_DUMP_FILTERED;
 961        }
 962
 963        return 0;
 964}
 965EXPORT_SYMBOL_GPL(ip_valid_fib_dump_req);
 966
 967static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 968{
 969        struct fib_dump_filter filter = { .dump_routes = true,
 970                                          .dump_exceptions = true };
 971        const struct nlmsghdr *nlh = cb->nlh;
 972        struct net *net = sock_net(skb->sk);
 973        unsigned int h, s_h;
 974        unsigned int e = 0, s_e;
 975        struct fib_table *tb;
 976        struct hlist_head *head;
 977        int dumped = 0, err;
 978
 979        if (cb->strict_check) {
 980                err = ip_valid_fib_dump_req(net, nlh, &filter, cb);
 981                if (err < 0)
 982                        return err;
 983        } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
 984                struct rtmsg *rtm = nlmsg_data(nlh);
 985
 986                filter.flags = rtm->rtm_flags & (RTM_F_PREFIX | RTM_F_CLONED);
 987        }
 988
 989        /* ipv4 does not use prefix flag */
 990        if (filter.flags & RTM_F_PREFIX)
 991                return skb->len;
 992
 993        if (filter.table_id) {
 994                tb = fib_get_table(net, filter.table_id);
 995                if (!tb) {
 996                        if (rtnl_msg_family(cb->nlh) != PF_INET)
 997                                return skb->len;
 998
 999                        NL_SET_ERR_MSG(cb->extack, "ipv4: FIB table does not exist");
1000                        return -ENOENT;
1001                }
1002
1003                rcu_read_lock();
1004                err = fib_table_dump(tb, skb, cb, &filter);
1005                rcu_read_unlock();
1006                return skb->len ? : err;
1007        }
1008
1009        s_h = cb->args[0];
1010        s_e = cb->args[1];
1011
1012        rcu_read_lock();
1013
1014        for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
1015                e = 0;
1016                head = &net->ipv4.fib_table_hash[h];
1017                hlist_for_each_entry_rcu(tb, head, tb_hlist) {
1018                        if (e < s_e)
1019                                goto next;
1020                        if (dumped)
1021                                memset(&cb->args[2], 0, sizeof(cb->args) -
1022                                                 2 * sizeof(cb->args[0]));
1023                        err = fib_table_dump(tb, skb, cb, &filter);
1024                        if (err < 0) {
1025                                if (likely(skb->len))
1026                                        goto out;
1027
1028                                goto out_err;
1029                        }
1030                        dumped = 1;
1031next:
1032                        e++;
1033                }
1034        }
1035out:
1036        err = skb->len;
1037out_err:
1038        rcu_read_unlock();
1039
1040        cb->args[1] = e;
1041        cb->args[0] = h;
1042
1043        return err;
1044}
1045
1046/* Prepare and feed intra-kernel routing request.
1047 * Really, it should be netlink message, but :-( netlink
1048 * can be not configured, so that we feed it directly
1049 * to fib engine. It is legal, because all events occur
1050 * only when netlink is already locked.
1051 */
1052static void fib_magic(int cmd, int type, __be32 dst, int dst_len,
1053                      struct in_ifaddr *ifa, u32 rt_priority)
1054{
1055        struct net *net = dev_net(ifa->ifa_dev->dev);
1056        u32 tb_id = l3mdev_fib_table(ifa->ifa_dev->dev);
1057        struct fib_table *tb;
1058        struct fib_config cfg = {
1059                .fc_protocol = RTPROT_KERNEL,
1060                .fc_type = type,
1061                .fc_dst = dst,
1062                .fc_dst_len = dst_len,
1063                .fc_priority = rt_priority,
1064                .fc_prefsrc = ifa->ifa_local,
1065                .fc_oif = ifa->ifa_dev->dev->ifindex,
1066                .fc_nlflags = NLM_F_CREATE | NLM_F_APPEND,
1067                .fc_nlinfo = {
1068                        .nl_net = net,
1069                },
1070        };
1071
1072        if (!tb_id)
1073                tb_id = (type == RTN_UNICAST) ? RT_TABLE_MAIN : RT_TABLE_LOCAL;
1074
1075        tb = fib_new_table(net, tb_id);
1076        if (!tb)
1077                return;
1078
1079        cfg.fc_table = tb->tb_id;
1080
1081        if (type != RTN_LOCAL)
1082                cfg.fc_scope = RT_SCOPE_LINK;
1083        else
1084                cfg.fc_scope = RT_SCOPE_HOST;
1085
1086        if (cmd == RTM_NEWROUTE)
1087                fib_table_insert(net, tb, &cfg, NULL);
1088        else
1089                fib_table_delete(net, tb, &cfg, NULL);
1090}
1091
1092void fib_add_ifaddr(struct in_ifaddr *ifa)
1093{
1094        struct in_device *in_dev = ifa->ifa_dev;
1095        struct net_device *dev = in_dev->dev;
1096        struct in_ifaddr *prim = ifa;
1097        __be32 mask = ifa->ifa_mask;
1098        __be32 addr = ifa->ifa_local;
1099        __be32 prefix = ifa->ifa_address & mask;
1100
1101        if (ifa->ifa_flags & IFA_F_SECONDARY) {
1102                prim = inet_ifa_byprefix(in_dev, prefix, mask);
1103                if (!prim) {
1104                        pr_warn("%s: bug: prim == NULL\n", __func__);
1105                        return;
1106                }
1107        }
1108
1109        fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim, 0);
1110
1111        if (!(dev->flags & IFF_UP))
1112                return;
1113
1114        /* Add broadcast address, if it is explicitly assigned. */
1115        if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
1116                fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32,
1117                          prim, 0);
1118
1119        if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
1120            (prefix != addr || ifa->ifa_prefixlen < 32)) {
1121                if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE))
1122                        fib_magic(RTM_NEWROUTE,
1123                                  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
1124                                  prefix, ifa->ifa_prefixlen, prim,
1125                                  ifa->ifa_rt_priority);
1126
1127                /* Add the network broadcast address, when it makes sense */
1128                if (ifa->ifa_prefixlen < 31) {
1129                        fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
1130                                  32, prim, 0);
1131                }
1132        }
1133}
1134
1135void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric)
1136{
1137        __be32 prefix = ifa->ifa_address & ifa->ifa_mask;
1138        struct in_device *in_dev = ifa->ifa_dev;
1139        struct net_device *dev = in_dev->dev;
1140
1141        if (!(dev->flags & IFF_UP) ||
1142            ifa->ifa_flags & (IFA_F_SECONDARY | IFA_F_NOPREFIXROUTE) ||
1143            ipv4_is_zeronet(prefix) ||
1144            (prefix == ifa->ifa_local && ifa->ifa_prefixlen == 32))
1145                return;
1146
1147        /* add the new */
1148        fib_magic(RTM_NEWROUTE,
1149                  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
1150                  prefix, ifa->ifa_prefixlen, ifa, new_metric);
1151
1152        /* delete the old */
1153        fib_magic(RTM_DELROUTE,
1154                  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
1155                  prefix, ifa->ifa_prefixlen, ifa, ifa->ifa_rt_priority);
1156}
1157
1158/* Delete primary or secondary address.
1159 * Optionally, on secondary address promotion consider the addresses
1160 * from subnet iprim as deleted, even if they are in device list.
1161 * In this case the secondary ifa can be in device list.
1162 */
1163void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
1164{
1165        struct in_device *in_dev = ifa->ifa_dev;
1166        struct net_device *dev = in_dev->dev;
1167        struct in_ifaddr *ifa1;
1168        struct in_ifaddr *prim = ifa, *prim1 = NULL;
1169        __be32 brd = ifa->ifa_address | ~ifa->ifa_mask;
1170        __be32 any = ifa->ifa_address & ifa->ifa_mask;
1171#define LOCAL_OK        1
1172#define BRD_OK          2
1173#define BRD0_OK         4
1174#define BRD1_OK         8
1175        unsigned int ok = 0;
1176        int subnet = 0;         /* Primary network */
1177        int gone = 1;           /* Address is missing */
1178        int same_prefsrc = 0;   /* Another primary with same IP */
1179
1180        if (ifa->ifa_flags & IFA_F_SECONDARY) {
1181                prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
1182                if (!prim) {
1183                        /* if the device has been deleted, we don't perform
1184                         * address promotion
1185                         */
1186                        if (!in_dev->dead)
1187                                pr_warn("%s: bug: prim == NULL\n", __func__);
1188                        return;
1189                }
1190                if (iprim && iprim != prim) {
1191                        pr_warn("%s: bug: iprim != prim\n", __func__);
1192                        return;
1193                }
1194        } else if (!ipv4_is_zeronet(any) &&
1195                   (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) {
1196                if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE))
1197                        fib_magic(RTM_DELROUTE,
1198                                  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
1199                                  any, ifa->ifa_prefixlen, prim, 0);
1200                subnet = 1;
1201        }
1202
1203        if (in_dev->dead)
1204                goto no_promotions;
1205
1206        /* Deletion is more complicated than add.
1207         * We should take care of not to delete too much :-)
1208         *
1209         * Scan address list to be sure that addresses are really gone.
1210         */
1211        rcu_read_lock();
1212        in_dev_for_each_ifa_rcu(ifa1, in_dev) {
1213                if (ifa1 == ifa) {
1214                        /* promotion, keep the IP */
1215                        gone = 0;
1216                        continue;
1217                }
1218                /* Ignore IFAs from our subnet */
1219                if (iprim && ifa1->ifa_mask == iprim->ifa_mask &&
1220                    inet_ifa_match(ifa1->ifa_address, iprim))
1221                        continue;
1222
1223                /* Ignore ifa1 if it uses different primary IP (prefsrc) */
1224                if (ifa1->ifa_flags & IFA_F_SECONDARY) {
1225                        /* Another address from our subnet? */
1226                        if (ifa1->ifa_mask == prim->ifa_mask &&
1227                            inet_ifa_match(ifa1->ifa_address, prim))
1228                                prim1 = prim;
1229                        else {
1230                                /* We reached the secondaries, so
1231                                 * same_prefsrc should be determined.
1232                                 */
1233                                if (!same_prefsrc)
1234                                        continue;
1235                                /* Search new prim1 if ifa1 is not
1236                                 * using the current prim1
1237                                 */
1238                                if (!prim1 ||
1239                                    ifa1->ifa_mask != prim1->ifa_mask ||
1240                                    !inet_ifa_match(ifa1->ifa_address, prim1))
1241                                        prim1 = inet_ifa_byprefix(in_dev,
1242                                                        ifa1->ifa_address,
1243                                                        ifa1->ifa_mask);
1244                                if (!prim1)
1245                                        continue;
1246                                if (prim1->ifa_local != prim->ifa_local)
1247                                        continue;
1248                        }
1249                } else {
1250                        if (prim->ifa_local != ifa1->ifa_local)
1251                                continue;
1252                        prim1 = ifa1;
1253                        if (prim != prim1)
1254                                same_prefsrc = 1;
1255                }
1256                if (ifa->ifa_local == ifa1->ifa_local)
1257                        ok |= LOCAL_OK;
1258                if (ifa->ifa_broadcast == ifa1->ifa_broadcast)
1259                        ok |= BRD_OK;
1260                if (brd == ifa1->ifa_broadcast)
1261                        ok |= BRD1_OK;
1262                if (any == ifa1->ifa_broadcast)
1263                        ok |= BRD0_OK;
1264                /* primary has network specific broadcasts */
1265                if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) {
1266                        __be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask;
1267                        __be32 any1 = ifa1->ifa_address & ifa1->ifa_mask;
1268
1269                        if (!ipv4_is_zeronet(any1)) {
1270                                if (ifa->ifa_broadcast == brd1 ||
1271                                    ifa->ifa_broadcast == any1)
1272                                        ok |= BRD_OK;
1273                                if (brd == brd1 || brd == any1)
1274                                        ok |= BRD1_OK;
1275                                if (any == brd1 || any == any1)
1276                                        ok |= BRD0_OK;
1277                        }
1278                }
1279        }
1280        rcu_read_unlock();
1281
1282no_promotions:
1283        if (!(ok & BRD_OK))
1284                fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32,
1285                          prim, 0);
1286        if (subnet && ifa->ifa_prefixlen < 31) {
1287                if (!(ok & BRD1_OK))
1288                        fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32,
1289                                  prim, 0);
1290                if (!(ok & BRD0_OK))
1291                        fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32,
1292                                  prim, 0);
1293        }
1294        if (!(ok & LOCAL_OK)) {
1295                unsigned int addr_type;
1296
1297                fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim, 0);
1298
1299                /* Check, that this local address finally disappeared. */
1300                addr_type = inet_addr_type_dev_table(dev_net(dev), dev,
1301                                                     ifa->ifa_local);
1302                if (gone && addr_type != RTN_LOCAL) {
1303                        /* And the last, but not the least thing.
1304                         * We must flush stray FIB entries.
1305                         *
1306                         * First of all, we scan fib_info list searching
1307                         * for stray nexthop entries, then ignite fib_flush.
1308                         */
1309                        if (fib_sync_down_addr(dev, ifa->ifa_local))
1310                                fib_flush(dev_net(dev));
1311                }
1312        }
1313#undef LOCAL_OK
1314#undef BRD_OK
1315#undef BRD0_OK
1316#undef BRD1_OK
1317}
1318
1319static void nl_fib_lookup(struct net *net, struct fib_result_nl *frn)
1320{
1321
1322        struct fib_result       res;
1323        struct flowi4           fl4 = {
1324                .flowi4_mark = frn->fl_mark,
1325                .daddr = frn->fl_addr,
1326                .flowi4_tos = frn->fl_tos,
1327                .flowi4_scope = frn->fl_scope,
1328        };
1329        struct fib_table *tb;
1330
1331        rcu_read_lock();
1332
1333        tb = fib_get_table(net, frn->tb_id_in);
1334
1335        frn->err = -ENOENT;
1336        if (tb) {
1337                local_bh_disable();
1338
1339                frn->tb_id = tb->tb_id;
1340                frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
1341
1342                if (!frn->err) {
1343                        frn->prefixlen = res.prefixlen;
1344                        frn->nh_sel = res.nh_sel;
1345                        frn->type = res.type;
1346                        frn->scope = res.scope;
1347                }
1348                local_bh_enable();
1349        }
1350
1351        rcu_read_unlock();
1352}
1353
1354static void nl_fib_input(struct sk_buff *skb)
1355{
1356        struct net *net;
1357        struct fib_result_nl *frn;
1358        struct nlmsghdr *nlh;
1359        u32 portid;
1360
1361        net = sock_net(skb->sk);
1362        nlh = nlmsg_hdr(skb);
1363        if (skb->len < nlmsg_total_size(sizeof(*frn)) ||
1364            skb->len < nlh->nlmsg_len ||
1365            nlmsg_len(nlh) < sizeof(*frn))
1366                return;
1367
1368        skb = netlink_skb_clone(skb, GFP_KERNEL);
1369        if (!skb)
1370                return;
1371        nlh = nlmsg_hdr(skb);
1372
1373        frn = (struct fib_result_nl *) nlmsg_data(nlh);
1374        nl_fib_lookup(net, frn);
1375
1376        portid = NETLINK_CB(skb).portid;      /* netlink portid */
1377        NETLINK_CB(skb).portid = 0;        /* from kernel */
1378        NETLINK_CB(skb).dst_group = 0;  /* unicast */
1379        nlmsg_unicast(net->ipv4.fibnl, skb, portid);
1380}
1381
1382static int __net_init nl_fib_lookup_init(struct net *net)
1383{
1384        struct sock *sk;
1385        struct netlink_kernel_cfg cfg = {
1386                .input  = nl_fib_input,
1387        };
1388
1389        sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, &cfg);
1390        if (!sk)
1391                return -EAFNOSUPPORT;
1392        net->ipv4.fibnl = sk;
1393        return 0;
1394}
1395
1396static void nl_fib_lookup_exit(struct net *net)
1397{
1398        netlink_kernel_release(net->ipv4.fibnl);
1399        net->ipv4.fibnl = NULL;
1400}
1401
1402static void fib_disable_ip(struct net_device *dev, unsigned long event,
1403                           bool force)
1404{
1405        if (fib_sync_down_dev(dev, event, force))
1406                fib_flush(dev_net(dev));
1407        else
1408                rt_cache_flush(dev_net(dev));
1409        arp_ifdown(dev);
1410}
1411
1412static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
1413{
1414        struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
1415        struct net_device *dev = ifa->ifa_dev->dev;
1416        struct net *net = dev_net(dev);
1417
1418        switch (event) {
1419        case NETDEV_UP:
1420                fib_add_ifaddr(ifa);
1421#ifdef CONFIG_IP_ROUTE_MULTIPATH
1422                fib_sync_up(dev, RTNH_F_DEAD);
1423#endif
1424                atomic_inc(&net->ipv4.dev_addr_genid);
1425                rt_cache_flush(dev_net(dev));
1426                break;
1427        case NETDEV_DOWN:
1428                fib_del_ifaddr(ifa, NULL);
1429                atomic_inc(&net->ipv4.dev_addr_genid);
1430                if (!ifa->ifa_dev->ifa_list) {
1431                        /* Last address was deleted from this interface.
1432                         * Disable IP.
1433                         */
1434                        fib_disable_ip(dev, event, true);
1435                } else {
1436                        rt_cache_flush(dev_net(dev));
1437                }
1438                break;
1439        }
1440        return NOTIFY_DONE;
1441}
1442
1443static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
1444{
1445        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1446        struct netdev_notifier_changeupper_info *upper_info = ptr;
1447        struct netdev_notifier_info_ext *info_ext = ptr;
1448        struct in_device *in_dev;
1449        struct net *net = dev_net(dev);
1450        struct in_ifaddr *ifa;
1451        unsigned int flags;
1452
1453        if (event == NETDEV_UNREGISTER) {
1454                fib_disable_ip(dev, event, true);
1455                rt_flush_dev(dev);
1456                return NOTIFY_DONE;
1457        }
1458
1459        in_dev = __in_dev_get_rtnl(dev);
1460        if (!in_dev)
1461                return NOTIFY_DONE;
1462
1463        switch (event) {
1464        case NETDEV_UP:
1465                in_dev_for_each_ifa_rtnl(ifa, in_dev) {
1466                        fib_add_ifaddr(ifa);
1467                }
1468#ifdef CONFIG_IP_ROUTE_MULTIPATH
1469                fib_sync_up(dev, RTNH_F_DEAD);
1470#endif
1471                atomic_inc(&net->ipv4.dev_addr_genid);
1472                rt_cache_flush(net);
1473                break;
1474        case NETDEV_DOWN:
1475                fib_disable_ip(dev, event, false);
1476                break;
1477        case NETDEV_CHANGE:
1478                flags = dev_get_flags(dev);
1479                if (flags & (IFF_RUNNING | IFF_LOWER_UP))
1480                        fib_sync_up(dev, RTNH_F_LINKDOWN);
1481                else
1482                        fib_sync_down_dev(dev, event, false);
1483                rt_cache_flush(net);
1484                break;
1485        case NETDEV_CHANGEMTU:
1486                fib_sync_mtu(dev, info_ext->ext.mtu);
1487                rt_cache_flush(net);
1488                break;
1489        case NETDEV_CHANGEUPPER:
1490                upper_info = ptr;
1491                /* flush all routes if dev is linked to or unlinked from
1492                 * an L3 master device (e.g., VRF)
1493                 */
1494                if (upper_info->upper_dev &&
1495                    netif_is_l3_master(upper_info->upper_dev))
1496                        fib_disable_ip(dev, NETDEV_DOWN, true);
1497                break;
1498        }
1499        return NOTIFY_DONE;
1500}
1501
1502static struct notifier_block fib_inetaddr_notifier = {
1503        .notifier_call = fib_inetaddr_event,
1504};
1505
1506static struct notifier_block fib_netdev_notifier = {
1507        .notifier_call = fib_netdev_event,
1508};
1509
1510static int __net_init ip_fib_net_init(struct net *net)
1511{
1512        int err;
1513        size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;
1514
1515        err = fib4_notifier_init(net);
1516        if (err)
1517                return err;
1518
1519#ifdef CONFIG_IP_ROUTE_MULTIPATH
1520        /* Default to 3-tuple */
1521        net->ipv4.sysctl_fib_multipath_hash_fields =
1522                FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK;
1523#endif
1524
1525        /* Avoid false sharing : Use at least a full cache line */
1526        size = max_t(size_t, size, L1_CACHE_BYTES);
1527
1528        net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL);
1529        if (!net->ipv4.fib_table_hash) {
1530                err = -ENOMEM;
1531                goto err_table_hash_alloc;
1532        }
1533
1534        err = fib4_rules_init(net);
1535        if (err < 0)
1536                goto err_rules_init;
1537        return 0;
1538
1539err_rules_init:
1540        kfree(net->ipv4.fib_table_hash);
1541err_table_hash_alloc:
1542        fib4_notifier_exit(net);
1543        return err;
1544}
1545
1546static void ip_fib_net_exit(struct net *net)
1547{
1548        int i;
1549
1550        rtnl_lock();
1551#ifdef CONFIG_IP_MULTIPLE_TABLES
1552        RCU_INIT_POINTER(net->ipv4.fib_main, NULL);
1553        RCU_INIT_POINTER(net->ipv4.fib_default, NULL);
1554#endif
1555        /* Destroy the tables in reverse order to guarantee that the
1556         * local table, ID 255, is destroyed before the main table, ID
1557         * 254. This is necessary as the local table may contain
1558         * references to data contained in the main table.
1559         */
1560        for (i = FIB_TABLE_HASHSZ - 1; i >= 0; i--) {
1561                struct hlist_head *head = &net->ipv4.fib_table_hash[i];
1562                struct hlist_node *tmp;
1563                struct fib_table *tb;
1564
1565                hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) {
1566                        hlist_del(&tb->tb_hlist);
1567                        fib_table_flush(net, tb, true);
1568                        fib_free_table(tb);
1569                }
1570        }
1571
1572#ifdef CONFIG_IP_MULTIPLE_TABLES
1573        fib4_rules_exit(net);
1574#endif
1575        rtnl_unlock();
1576        kfree(net->ipv4.fib_table_hash);
1577        fib4_notifier_exit(net);
1578}
1579
1580static int __net_init fib_net_init(struct net *net)
1581{
1582        int error;
1583
1584#ifdef CONFIG_IP_ROUTE_CLASSID
1585        net->ipv4.fib_num_tclassid_users = 0;
1586#endif
1587        error = ip_fib_net_init(net);
1588        if (error < 0)
1589                goto out;
1590        error = nl_fib_lookup_init(net);
1591        if (error < 0)
1592                goto out_nlfl;
1593        error = fib_proc_init(net);
1594        if (error < 0)
1595                goto out_proc;
1596out:
1597        return error;
1598
1599out_proc:
1600        nl_fib_lookup_exit(net);
1601out_nlfl:
1602        ip_fib_net_exit(net);
1603        goto out;
1604}
1605
1606static void __net_exit fib_net_exit(struct net *net)
1607{
1608        fib_proc_exit(net);
1609        nl_fib_lookup_exit(net);
1610        ip_fib_net_exit(net);
1611}
1612
1613static struct pernet_operations fib_net_ops = {
1614        .init = fib_net_init,
1615        .exit = fib_net_exit,
1616};
1617
1618void __init ip_fib_init(void)
1619{
1620        fib_trie_init();
1621
1622        register_pernet_subsys(&fib_net_ops);
1623
1624        register_netdevice_notifier(&fib_netdev_notifier);
1625        register_inetaddr_notifier(&fib_inetaddr_notifier);
1626
1627        rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, 0);
1628        rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, 0);
1629        rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, 0);
1630}
1631