linux/net/ipv4/fib_frontend.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4 *              operating system.  INET is implemented using the  BSD Socket
   5 *              interface as the means of communication with the user level.
   6 *
   7 *              IPv4 Forwarding Information Base: FIB frontend.
   8 *
   9 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10 */
  11
  12#include <linux/module.h>
  13#include <linux/uaccess.h>
  14#include <linux/bitops.h>
  15#include <linux/capability.h>
  16#include <linux/types.h>
  17#include <linux/kernel.h>
  18#include <linux/mm.h>
  19#include <linux/string.h>
  20#include <linux/socket.h>
  21#include <linux/sockios.h>
  22#include <linux/errno.h>
  23#include <linux/in.h>
  24#include <linux/inet.h>
  25#include <linux/inetdevice.h>
  26#include <linux/netdevice.h>
  27#include <linux/if_addr.h>
  28#include <linux/if_arp.h>
  29#include <linux/skbuff.h>
  30#include <linux/cache.h>
  31#include <linux/init.h>
  32#include <linux/list.h>
  33#include <linux/slab.h>
  34
  35#include <net/inet_dscp.h>
  36#include <net/ip.h>
  37#include <net/protocol.h>
  38#include <net/route.h>
  39#include <net/tcp.h>
  40#include <net/sock.h>
  41#include <net/arp.h>
  42#include <net/ip_fib.h>
  43#include <net/nexthop.h>
  44#include <net/rtnetlink.h>
  45#include <net/xfrm.h>
  46#include <net/l3mdev.h>
  47#include <net/lwtunnel.h>
  48#include <trace/events/fib.h>
  49
  50#ifndef CONFIG_IP_MULTIPLE_TABLES
  51
  52static int __net_init fib4_rules_init(struct net *net)
  53{
  54        struct fib_table *local_table, *main_table;
  55
  56        main_table  = fib_trie_table(RT_TABLE_MAIN, NULL);
  57        if (!main_table)
  58                return -ENOMEM;
  59
  60        local_table = fib_trie_table(RT_TABLE_LOCAL, main_table);
  61        if (!local_table)
  62                goto fail;
  63
  64        hlist_add_head_rcu(&local_table->tb_hlist,
  65                                &net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX]);
  66        hlist_add_head_rcu(&main_table->tb_hlist,
  67                                &net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]);
  68        return 0;
  69
  70fail:
  71        fib_free_table(main_table);
  72        return -ENOMEM;
  73}
  74#else
  75
  76struct fib_table *fib_new_table(struct net *net, u32 id)
  77{
  78        struct fib_table *tb, *alias = NULL;
  79        unsigned int h;
  80
  81        if (id == 0)
  82                id = RT_TABLE_MAIN;
  83        tb = fib_get_table(net, id);
  84        if (tb)
  85                return tb;
  86
  87        if (id == RT_TABLE_LOCAL && !net->ipv4.fib_has_custom_rules)
  88                alias = fib_new_table(net, RT_TABLE_MAIN);
  89
  90        tb = fib_trie_table(id, alias);
  91        if (!tb)
  92                return NULL;
  93
  94        switch (id) {
  95        case RT_TABLE_MAIN:
  96                rcu_assign_pointer(net->ipv4.fib_main, tb);
  97                break;
  98        case RT_TABLE_DEFAULT:
  99                rcu_assign_pointer(net->ipv4.fib_default, tb);
 100                break;
 101        default:
 102                break;
 103        }
 104
 105        h = id & (FIB_TABLE_HASHSZ - 1);
 106        hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]);
 107        return tb;
 108}
 109EXPORT_SYMBOL_GPL(fib_new_table);
 110
 111/* caller must hold either rtnl or rcu read lock */
 112struct fib_table *fib_get_table(struct net *net, u32 id)
 113{
 114        struct fib_table *tb;
 115        struct hlist_head *head;
 116        unsigned int h;
 117
 118        if (id == 0)
 119                id = RT_TABLE_MAIN;
 120        h = id & (FIB_TABLE_HASHSZ - 1);
 121
 122        head = &net->ipv4.fib_table_hash[h];
 123        hlist_for_each_entry_rcu(tb, head, tb_hlist,
 124                                 lockdep_rtnl_is_held()) {
 125                if (tb->tb_id == id)
 126                        return tb;
 127        }
 128        return NULL;
 129}
 130#endif /* CONFIG_IP_MULTIPLE_TABLES */
 131
 132static void fib_replace_table(struct net *net, struct fib_table *old,
 133                              struct fib_table *new)
 134{
 135#ifdef CONFIG_IP_MULTIPLE_TABLES
 136        switch (new->tb_id) {
 137        case RT_TABLE_MAIN:
 138                rcu_assign_pointer(net->ipv4.fib_main, new);
 139                break;
 140        case RT_TABLE_DEFAULT:
 141                rcu_assign_pointer(net->ipv4.fib_default, new);
 142                break;
 143        default:
 144                break;
 145        }
 146
 147#endif
 148        /* replace the old table in the hlist */
 149        hlist_replace_rcu(&old->tb_hlist, &new->tb_hlist);
 150}
 151
 152int fib_unmerge(struct net *net)
 153{
 154        struct fib_table *old, *new, *main_table;
 155
 156        /* attempt to fetch local table if it has been allocated */
 157        old = fib_get_table(net, RT_TABLE_LOCAL);
 158        if (!old)
 159                return 0;
 160
 161        new = fib_trie_unmerge(old);
 162        if (!new)
 163                return -ENOMEM;
 164
 165        /* table is already unmerged */
 166        if (new == old)
 167                return 0;
 168
 169        /* replace merged table with clean table */
 170        fib_replace_table(net, old, new);
 171        fib_free_table(old);
 172
 173        /* attempt to fetch main table if it has been allocated */
 174        main_table = fib_get_table(net, RT_TABLE_MAIN);
 175        if (!main_table)
 176                return 0;
 177
 178        /* flush local entries from main table */
 179        fib_table_flush_external(main_table);
 180
 181        return 0;
 182}
 183
 184void fib_flush(struct net *net)
 185{
 186        int flushed = 0;
 187        unsigned int h;
 188
 189        for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
 190                struct hlist_head *head = &net->ipv4.fib_table_hash[h];
 191                struct hlist_node *tmp;
 192                struct fib_table *tb;
 193
 194                hlist_for_each_entry_safe(tb, tmp, head, tb_hlist)
 195                        flushed += fib_table_flush(net, tb, false);
 196        }
 197
 198        if (flushed)
 199                rt_cache_flush(net);
 200}
 201
 202/*
 203 * Find address type as if only "dev" was present in the system. If
 204 * on_dev is NULL then all interfaces are taken into consideration.
 205 */
 206static inline unsigned int __inet_dev_addr_type(struct net *net,
 207                                                const struct net_device *dev,
 208                                                __be32 addr, u32 tb_id)
 209{
 210        struct flowi4           fl4 = { .daddr = addr };
 211        struct fib_result       res;
 212        unsigned int ret = RTN_BROADCAST;
 213        struct fib_table *table;
 214
 215        if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
 216                return RTN_BROADCAST;
 217        if (ipv4_is_multicast(addr))
 218                return RTN_MULTICAST;
 219
 220        rcu_read_lock();
 221
 222        table = fib_get_table(net, tb_id);
 223        if (table) {
 224                ret = RTN_UNICAST;
 225                if (!fib_table_lookup(table, &fl4, &res, FIB_LOOKUP_NOREF)) {
 226                        struct fib_nh_common *nhc = fib_info_nhc(res.fi, 0);
 227
 228                        if (!dev || dev == nhc->nhc_dev)
 229                                ret = res.type;
 230                }
 231        }
 232
 233        rcu_read_unlock();
 234        return ret;
 235}
 236
 237unsigned int inet_addr_type_table(struct net *net, __be32 addr, u32 tb_id)
 238{
 239        return __inet_dev_addr_type(net, NULL, addr, tb_id);
 240}
 241EXPORT_SYMBOL(inet_addr_type_table);
 242
 243unsigned int inet_addr_type(struct net *net, __be32 addr)
 244{
 245        return __inet_dev_addr_type(net, NULL, addr, RT_TABLE_LOCAL);
 246}
 247EXPORT_SYMBOL(inet_addr_type);
 248
 249unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
 250                                __be32 addr)
 251{
 252        u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL;
 253
 254        return __inet_dev_addr_type(net, dev, addr, rt_table);
 255}
 256EXPORT_SYMBOL(inet_dev_addr_type);
 257
 258/* inet_addr_type with dev == NULL but using the table from a dev
 259 * if one is associated
 260 */
 261unsigned int inet_addr_type_dev_table(struct net *net,
 262                                      const struct net_device *dev,
 263                                      __be32 addr)
 264{
 265        u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL;
 266
 267        return __inet_dev_addr_type(net, NULL, addr, rt_table);
 268}
 269EXPORT_SYMBOL(inet_addr_type_dev_table);
 270
 271__be32 fib_compute_spec_dst(struct sk_buff *skb)
 272{
 273        struct net_device *dev = skb->dev;
 274        struct in_device *in_dev;
 275        struct fib_result res;
 276        struct rtable *rt;
 277        struct net *net;
 278        int scope;
 279
 280        rt = skb_rtable(skb);
 281        if ((rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL)) ==
 282            RTCF_LOCAL)
 283                return ip_hdr(skb)->daddr;
 284
 285        in_dev = __in_dev_get_rcu(dev);
 286
 287        net = dev_net(dev);
 288
 289        scope = RT_SCOPE_UNIVERSE;
 290        if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
 291                bool vmark = in_dev && IN_DEV_SRC_VMARK(in_dev);
 292                struct flowi4 fl4 = {
 293                        .flowi4_iif = LOOPBACK_IFINDEX,
 294                        .flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev),
 295                        .daddr = ip_hdr(skb)->saddr,
 296                        .flowi4_tos = ip_hdr(skb)->tos & IPTOS_RT_MASK,
 297                        .flowi4_scope = scope,
 298                        .flowi4_mark = vmark ? skb->mark : 0,
 299                };
 300                if (!fib_lookup(net, &fl4, &res, 0))
 301                        return fib_result_prefsrc(net, &res);
 302        } else {
 303                scope = RT_SCOPE_LINK;
 304        }
 305
 306        return inet_select_addr(dev, ip_hdr(skb)->saddr, scope);
 307}
 308
 309bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev)
 310{
 311        bool dev_match = false;
 312#ifdef CONFIG_IP_ROUTE_MULTIPATH
 313        if (unlikely(fi->nh)) {
 314                dev_match = nexthop_uses_dev(fi->nh, dev);
 315        } else {
 316                int ret;
 317
 318                for (ret = 0; ret < fib_info_num_path(fi); ret++) {
 319                        const struct fib_nh_common *nhc = fib_info_nhc(fi, ret);
 320
 321                        if (nhc_l3mdev_matches_dev(nhc, dev)) {
 322                                dev_match = true;
 323                                break;
 324                        }
 325                }
 326        }
 327#else
 328        if (fib_info_nhc(fi, 0)->nhc_dev == dev)
 329                dev_match = true;
 330#endif
 331
 332        return dev_match;
 333}
 334EXPORT_SYMBOL_GPL(fib_info_nh_uses_dev);
 335
 336/* Given (packet source, input interface) and optional (dst, oif, tos):
 337 * - (main) check, that source is valid i.e. not broadcast or our local
 338 *   address.
 339 * - figure out what "logical" interface this packet arrived
 340 *   and calculate "specific destination" address.
 341 * - check, that packet arrived from expected physical interface.
 342 * called with rcu_read_lock()
 343 */
 344static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 345                                 u8 tos, int oif, struct net_device *dev,
 346                                 int rpf, struct in_device *idev, u32 *itag)
 347{
 348        struct net *net = dev_net(dev);
 349        struct flow_keys flkeys;
 350        int ret, no_addr;
 351        struct fib_result res;
 352        struct flowi4 fl4;
 353        bool dev_match;
 354
 355        fl4.flowi4_oif = 0;
 356        fl4.flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev);
 357        fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX;
 358        fl4.daddr = src;
 359        fl4.saddr = dst;
 360        fl4.flowi4_tos = tos;
 361        fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
 362        fl4.flowi4_tun_key.tun_id = 0;
 363        fl4.flowi4_flags = 0;
 364        fl4.flowi4_uid = sock_net_uid(net, NULL);
 365        fl4.flowi4_multipath_hash = 0;
 366
 367        no_addr = idev->ifa_list == NULL;
 368
 369        fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0;
 370        if (!fib4_rules_early_flow_dissect(net, skb, &fl4, &flkeys)) {
 371                fl4.flowi4_proto = 0;
 372                fl4.fl4_sport = 0;
 373                fl4.fl4_dport = 0;
 374        } else {
 375                swap(fl4.fl4_sport, fl4.fl4_dport);
 376        }
 377
 378        if (fib_lookup(net, &fl4, &res, 0))
 379                goto last_resort;
 380        if (res.type != RTN_UNICAST &&
 381            (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev)))
 382                goto e_inval;
 383        fib_combine_itag(itag, &res);
 384
 385        dev_match = fib_info_nh_uses_dev(res.fi, dev);
 386        /* This is not common, loopback packets retain skb_dst so normally they
 387         * would not even hit this slow path.
 388         */
 389        dev_match = dev_match || (res.type == RTN_LOCAL &&
 390                                  dev == net->loopback_dev);
 391        if (dev_match) {
 392                ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST;
 393                return ret;
 394        }
 395        if (no_addr)
 396                goto last_resort;
 397        if (rpf == 1)
 398                goto e_rpf;
 399        fl4.flowi4_oif = dev->ifindex;
 400
 401        ret = 0;
 402        if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) {
 403                if (res.type == RTN_UNICAST)
 404                        ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST;
 405        }
 406        return ret;
 407
 408last_resort:
 409        if (rpf)
 410                goto e_rpf;
 411        *itag = 0;
 412        return 0;
 413
 414e_inval:
 415        return -EINVAL;
 416e_rpf:
 417        return -EXDEV;
 418}
 419
 420/* Ignore rp_filter for packets protected by IPsec. */
 421int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 422                        u8 tos, int oif, struct net_device *dev,
 423                        struct in_device *idev, u32 *itag)
 424{
 425        int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev);
 426        struct net *net = dev_net(dev);
 427
 428        if (!r && !fib_num_tclassid_users(net) &&
 429            (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) {
 430                if (IN_DEV_ACCEPT_LOCAL(idev))
 431                        goto ok;
 432                /* with custom local routes in place, checking local addresses
 433                 * only will be too optimistic, with custom rules, checking
 434                 * local addresses only can be too strict, e.g. due to vrf
 435                 */
 436                if (net->ipv4.fib_has_custom_local_routes ||
 437                    fib4_has_custom_rules(net))
 438                        goto full_check;
 439                /* Within the same container, it is regarded as a martian source,
 440                 * and the same host but different containers are not.
 441                 */
 442                if (inet_lookup_ifaddr_rcu(net, src))
 443                        return -EINVAL;
 444
 445ok:
 446                *itag = 0;
 447                return 0;
 448        }
 449
 450full_check:
 451        return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag);
 452}
 453
 /* Read the IPv4 address out of @addr, viewing it as a struct sockaddr_in. */
 static inline __be32 sk_extract_addr(struct sockaddr *addr)
 {
         const struct sockaddr_in *sin = (struct sockaddr_in *)addr;

         return sin->sin_addr.s_addr;
 }
 458
 459static int put_rtax(struct nlattr *mx, int len, int type, u32 value)
 460{
 461        struct nlattr *nla;
 462
 463        nla = (struct nlattr *) ((char *) mx + len);
 464        nla->nla_type = type;
 465        nla->nla_len = nla_attr_size(4);
 466        *(u32 *) nla_data(nla) = value;
 467
 468        return len + nla_total_size(4);
 469}
 470
 471static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
 472                                 struct fib_config *cfg)
 473{
 474        __be32 addr;
 475        int plen;
 476
 477        memset(cfg, 0, sizeof(*cfg));
 478        cfg->fc_nlinfo.nl_net = net;
 479
 480        if (rt->rt_dst.sa_family != AF_INET)
 481                return -EAFNOSUPPORT;
 482
 483        /*
 484         * Check mask for validity:
 485         * a) it must be contiguous.
 486         * b) destination must have all host bits clear.
 487         * c) if application forgot to set correct family (AF_INET),
 488         *    reject request unless it is absolutely clear i.e.
 489         *    both family and mask are zero.
 490         */
 491        plen = 32;
 492        addr = sk_extract_addr(&rt->rt_dst);
 493        if (!(rt->rt_flags & RTF_HOST)) {
 494                __be32 mask = sk_extract_addr(&rt->rt_genmask);
 495
 496                if (rt->rt_genmask.sa_family != AF_INET) {
 497                        if (mask || rt->rt_genmask.sa_family)
 498                                return -EAFNOSUPPORT;
 499                }
 500
 501                if (bad_mask(mask, addr))
 502                        return -EINVAL;
 503
 504                plen = inet_mask_len(mask);
 505        }
 506
 507        cfg->fc_dst_len = plen;
 508        cfg->fc_dst = addr;
 509
 510        if (cmd != SIOCDELRT) {
 511                cfg->fc_nlflags = NLM_F_CREATE;
 512                cfg->fc_protocol = RTPROT_BOOT;
 513        }
 514
 515        if (rt->rt_metric)
 516                cfg->fc_priority = rt->rt_metric - 1;
 517
 518        if (rt->rt_flags & RTF_REJECT) {
 519                cfg->fc_scope = RT_SCOPE_HOST;
 520                cfg->fc_type = RTN_UNREACHABLE;
 521                return 0;
 522        }
 523
 524        cfg->fc_scope = RT_SCOPE_NOWHERE;
 525        cfg->fc_type = RTN_UNICAST;
 526
 527        if (rt->rt_dev) {
 528                char *colon;
 529                struct net_device *dev;
 530                char devname[IFNAMSIZ];
 531
 532                if (copy_from_user(devname, rt->rt_dev, IFNAMSIZ-1))
 533                        return -EFAULT;
 534
 535                devname[IFNAMSIZ-1] = 0;
 536                colon = strchr(devname, ':');
 537                if (colon)
 538                        *colon = 0;
 539                dev = __dev_get_by_name(net, devname);
 540                if (!dev)
 541                        return -ENODEV;
 542                cfg->fc_oif = dev->ifindex;
 543                cfg->fc_table = l3mdev_fib_table(dev);
 544                if (colon) {
 545                        const struct in_ifaddr *ifa;
 546                        struct in_device *in_dev;
 547
 548                        in_dev = __in_dev_get_rtnl(dev);
 549                        if (!in_dev)
 550                                return -ENODEV;
 551
 552                        *colon = ':';
 553
 554                        rcu_read_lock();
 555                        in_dev_for_each_ifa_rcu(ifa, in_dev) {
 556                                if (strcmp(ifa->ifa_label, devname) == 0)
 557                                        break;
 558                        }
 559                        rcu_read_unlock();
 560
 561                        if (!ifa)
 562                                return -ENODEV;
 563                        cfg->fc_prefsrc = ifa->ifa_local;
 564                }
 565        }
 566
 567        addr = sk_extract_addr(&rt->rt_gateway);
 568        if (rt->rt_gateway.sa_family == AF_INET && addr) {
 569                unsigned int addr_type;
 570
 571                cfg->fc_gw4 = addr;
 572                cfg->fc_gw_family = AF_INET;
 573                addr_type = inet_addr_type_table(net, addr, cfg->fc_table);
 574                if (rt->rt_flags & RTF_GATEWAY &&
 575                    addr_type == RTN_UNICAST)
 576                        cfg->fc_scope = RT_SCOPE_UNIVERSE;
 577        }
 578
 579        if (cmd == SIOCDELRT)
 580                return 0;
 581
 582        if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw_family)
 583                return -EINVAL;
 584
 585        if (cfg->fc_scope == RT_SCOPE_NOWHERE)
 586                cfg->fc_scope = RT_SCOPE_LINK;
 587
 588        if (rt->rt_flags & (RTF_MTU | RTF_WINDOW | RTF_IRTT)) {
 589                struct nlattr *mx;
 590                int len = 0;
 591
 592                mx = kcalloc(3, nla_total_size(4), GFP_KERNEL);
 593                if (!mx)
 594                        return -ENOMEM;
 595
 596                if (rt->rt_flags & RTF_MTU)
 597                        len = put_rtax(mx, len, RTAX_ADVMSS, rt->rt_mtu - 40);
 598
 599                if (rt->rt_flags & RTF_WINDOW)
 600                        len = put_rtax(mx, len, RTAX_WINDOW, rt->rt_window);
 601
 602                if (rt->rt_flags & RTF_IRTT)
 603                        len = put_rtax(mx, len, RTAX_RTT, rt->rt_irtt << 3);
 604
 605                cfg->fc_mx = mx;
 606                cfg->fc_mx_len = len;
 607        }
 608
 609        return 0;
 610}
 611
 612/*
 613 * Handle IP routing ioctl calls.
 614 * These are used to manipulate the routing tables
 615 */
 616int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt)
 617{
 618        struct fib_config cfg;
 619        int err;
 620
 621        switch (cmd) {
 622        case SIOCADDRT:         /* Add a route */
 623        case SIOCDELRT:         /* Delete a route */
 624                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 625                        return -EPERM;
 626
 627                rtnl_lock();
 628                err = rtentry_to_fib_config(net, cmd, rt, &cfg);
 629                if (err == 0) {
 630                        struct fib_table *tb;
 631
 632                        if (cmd == SIOCDELRT) {
 633                                tb = fib_get_table(net, cfg.fc_table);
 634                                if (tb)
 635                                        err = fib_table_delete(net, tb, &cfg,
 636                                                               NULL);
 637                                else
 638                                        err = -ESRCH;
 639                        } else {
 640                                tb = fib_new_table(net, cfg.fc_table);
 641                                if (tb)
 642                                        err = fib_table_insert(net, tb,
 643                                                               &cfg, NULL);
 644                                else
 645                                        err = -ENOBUFS;
 646                        }
 647
 648                        /* allocated by rtentry_to_fib_config() */
 649                        kfree(cfg.fc_mx);
 650                }
 651                rtnl_unlock();
 652                return err;
 653        }
 654        return -EINVAL;
 655}
 656
 657const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
 658        [RTA_UNSPEC]            = { .strict_start_type = RTA_DPORT + 1 },
 659        [RTA_DST]               = { .type = NLA_U32 },
 660        [RTA_SRC]               = { .type = NLA_U32 },
 661        [RTA_IIF]               = { .type = NLA_U32 },
 662        [RTA_OIF]               = { .type = NLA_U32 },
 663        [RTA_GATEWAY]           = { .type = NLA_U32 },
 664        [RTA_PRIORITY]          = { .type = NLA_U32 },
 665        [RTA_PREFSRC]           = { .type = NLA_U32 },
 666        [RTA_METRICS]           = { .type = NLA_NESTED },
 667        [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
 668        [RTA_FLOW]              = { .type = NLA_U32 },
 669        [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
 670        [RTA_ENCAP]             = { .type = NLA_NESTED },
 671        [RTA_UID]               = { .type = NLA_U32 },
 672        [RTA_MARK]              = { .type = NLA_U32 },
 673        [RTA_TABLE]             = { .type = NLA_U32 },
 674        [RTA_IP_PROTO]          = { .type = NLA_U8 },
 675        [RTA_SPORT]             = { .type = NLA_U16 },
 676        [RTA_DPORT]             = { .type = NLA_U16 },
 677        [RTA_NH_ID]             = { .type = NLA_U32 },
 678};
 679
 680int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla,
 681                    struct netlink_ext_ack *extack)
 682{
 683        struct rtvia *via;
 684        int alen;
 685
 686        if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr)) {
 687                NL_SET_ERR_MSG(extack, "Invalid attribute length for RTA_VIA");
 688                return -EINVAL;
 689        }
 690
 691        via = nla_data(nla);
 692        alen = nla_len(nla) - offsetof(struct rtvia, rtvia_addr);
 693
 694        switch (via->rtvia_family) {
 695        case AF_INET:
 696                if (alen != sizeof(__be32)) {
 697                        NL_SET_ERR_MSG(extack, "Invalid IPv4 address in RTA_VIA");
 698                        return -EINVAL;
 699                }
 700                cfg->fc_gw_family = AF_INET;
 701                cfg->fc_gw4 = *((__be32 *)via->rtvia_addr);
 702                break;
 703        case AF_INET6:
 704#if IS_ENABLED(CONFIG_IPV6)
 705                if (alen != sizeof(struct in6_addr)) {
 706                        NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_VIA");
 707                        return -EINVAL;
 708                }
 709                cfg->fc_gw_family = AF_INET6;
 710                cfg->fc_gw6 = *((struct in6_addr *)via->rtvia_addr);
 711#else
 712                NL_SET_ERR_MSG(extack, "IPv6 support not enabled in kernel");
 713                return -EINVAL;
 714#endif
 715                break;
 716        default:
 717                NL_SET_ERR_MSG(extack, "Unsupported address family in RTA_VIA");
 718                return -EINVAL;
 719        }
 720
 721        return 0;
 722}
 723
 724static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
 725                             struct nlmsghdr *nlh, struct fib_config *cfg,
 726                             struct netlink_ext_ack *extack)
 727{
 728        bool has_gw = false, has_via = false;
 729        struct nlattr *attr;
 730        int err, remaining;
 731        struct rtmsg *rtm;
 732
 733        err = nlmsg_validate_deprecated(nlh, sizeof(*rtm), RTA_MAX,
 734                                        rtm_ipv4_policy, extack);
 735        if (err < 0)
 736                goto errout;
 737
 738        memset(cfg, 0, sizeof(*cfg));
 739
 740        rtm = nlmsg_data(nlh);
 741
 742        if (!inet_validate_dscp(rtm->rtm_tos)) {
 743                NL_SET_ERR_MSG(extack,
 744                               "Invalid dsfield (tos): ECN bits must be 0");
 745                err = -EINVAL;
 746                goto errout;
 747        }
 748        cfg->fc_dscp = inet_dsfield_to_dscp(rtm->rtm_tos);
 749
 750        cfg->fc_dst_len = rtm->rtm_dst_len;
 751        cfg->fc_table = rtm->rtm_table;
 752        cfg->fc_protocol = rtm->rtm_protocol;
 753        cfg->fc_scope = rtm->rtm_scope;
 754        cfg->fc_type = rtm->rtm_type;
 755        cfg->fc_flags = rtm->rtm_flags;
 756        cfg->fc_nlflags = nlh->nlmsg_flags;
 757
 758        cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
 759        cfg->fc_nlinfo.nlh = nlh;
 760        cfg->fc_nlinfo.nl_net = net;
 761
 762        if (cfg->fc_type > RTN_MAX) {
 763                NL_SET_ERR_MSG(extack, "Invalid route type");
 764                err = -EINVAL;
 765                goto errout;
 766        }
 767
 768        nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), remaining) {
 769                switch (nla_type(attr)) {
 770                case RTA_DST:
 771                        cfg->fc_dst = nla_get_be32(attr);
 772                        break;
 773                case RTA_OIF:
 774                        cfg->fc_oif = nla_get_u32(attr);
 775                        break;
 776                case RTA_GATEWAY:
 777                        has_gw = true;
 778                        cfg->fc_gw4 = nla_get_be32(attr);
 779                        if (cfg->fc_gw4)
 780                                cfg->fc_gw_family = AF_INET;
 781                        break;
 782                case RTA_VIA:
 783                        has_via = true;
 784                        err = fib_gw_from_via(cfg, attr, extack);
 785                        if (err)
 786                                goto errout;
 787                        break;
 788                case RTA_PRIORITY:
 789                        cfg->fc_priority = nla_get_u32(attr);
 790                        break;
 791                case RTA_PREFSRC:
 792                        cfg->fc_prefsrc = nla_get_be32(attr);
 793                        break;
 794                case RTA_METRICS:
 795                        cfg->fc_mx = nla_data(attr);
 796                        cfg->fc_mx_len = nla_len(attr);
 797                        break;
 798                case RTA_MULTIPATH:
 799                        err = lwtunnel_valid_encap_type_attr(nla_data(attr),
 800                                                             nla_len(attr),
 801                                                             extack);
 802                        if (err < 0)
 803                                goto errout;
 804                        cfg->fc_mp = nla_data(attr);
 805                        cfg->fc_mp_len = nla_len(attr);
 806                        break;
 807                case RTA_FLOW:
 808                        cfg->fc_flow = nla_get_u32(attr);
 809                        break;
 810                case RTA_TABLE:
 811                        cfg->fc_table = nla_get_u32(attr);
 812                        break;
 813                case RTA_ENCAP:
 814                        cfg->fc_encap = attr;
 815                        break;
 816                case RTA_ENCAP_TYPE:
 817                        cfg->fc_encap_type = nla_get_u16(attr);
 818                        err = lwtunnel_valid_encap_type(cfg->fc_encap_type,
 819                                                        extack);
 820                        if (err < 0)
 821                                goto errout;
 822                        break;
 823                case RTA_NH_ID:
 824                        cfg->fc_nh_id = nla_get_u32(attr);
 825                        break;
 826                }
 827        }
 828
 829        if (cfg->fc_nh_id) {
 830                if (cfg->fc_oif || cfg->fc_gw_family ||
 831                    cfg->fc_encap || cfg->fc_mp) {
 832                        NL_SET_ERR_MSG(extack,
 833                                       "Nexthop specification and nexthop id are mutually exclusive");
 834                        return -EINVAL;
 835                }
 836        }
 837
 838        if (has_gw && has_via) {
 839                NL_SET_ERR_MSG(extack,
 840                               "Nexthop configuration can not contain both GATEWAY and VIA");
 841                return -EINVAL;
 842        }
 843
 844        return 0;
 845errout:
 846        return err;
 847}
 848
 849static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
 850                             struct netlink_ext_ack *extack)
 851{
 852        struct net *net = sock_net(skb->sk);
 853        struct fib_config cfg;
 854        struct fib_table *tb;
 855        int err;
 856
 857        err = rtm_to_fib_config(net, skb, nlh, &cfg, extack);
 858        if (err < 0)
 859                goto errout;
 860
 861        if (cfg.fc_nh_id && !nexthop_find_by_id(net, cfg.fc_nh_id)) {
 862                NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
 863                err = -EINVAL;
 864                goto errout;
 865        }
 866
 867        tb = fib_get_table(net, cfg.fc_table);
 868        if (!tb) {
 869                NL_SET_ERR_MSG(extack, "FIB table does not exist");
 870                err = -ESRCH;
 871                goto errout;
 872        }
 873
 874        err = fib_table_delete(net, tb, &cfg, extack);
 875errout:
 876        return err;
 877}
 878
 879static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
 880                             struct netlink_ext_ack *extack)
 881{
 882        struct net *net = sock_net(skb->sk);
 883        struct fib_config cfg;
 884        struct fib_table *tb;
 885        int err;
 886
 887        err = rtm_to_fib_config(net, skb, nlh, &cfg, extack);
 888        if (err < 0)
 889                goto errout;
 890
 891        tb = fib_new_table(net, cfg.fc_table);
 892        if (!tb) {
 893                err = -ENOBUFS;
 894                goto errout;
 895        }
 896
 897        err = fib_table_insert(net, tb, &cfg, extack);
 898        if (!err && cfg.fc_type == RTN_LOCAL)
 899                net->ipv4.fib_has_custom_local_routes = true;
 900errout:
 901        return err;
 902}
 903
 904int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 905                          struct fib_dump_filter *filter,
 906                          struct netlink_callback *cb)
 907{
 908        struct netlink_ext_ack *extack = cb->extack;
 909        struct nlattr *tb[RTA_MAX + 1];
 910        struct rtmsg *rtm;
 911        int err, i;
 912
 913        ASSERT_RTNL();
 914
 915        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
 916                NL_SET_ERR_MSG(extack, "Invalid header for FIB dump request");
 917                return -EINVAL;
 918        }
 919
 920        rtm = nlmsg_data(nlh);
 921        if (rtm->rtm_dst_len || rtm->rtm_src_len  || rtm->rtm_tos   ||
 922            rtm->rtm_scope) {
 923                NL_SET_ERR_MSG(extack, "Invalid values in header for FIB dump request");
 924                return -EINVAL;
 925        }
 926
 927        if (rtm->rtm_flags & ~(RTM_F_CLONED | RTM_F_PREFIX)) {
 928                NL_SET_ERR_MSG(extack, "Invalid flags for FIB dump request");
 929                return -EINVAL;
 930        }
 931        if (rtm->rtm_flags & RTM_F_CLONED)
 932                filter->dump_routes = false;
 933        else
 934                filter->dump_exceptions = false;
 935
 936        filter->flags    = rtm->rtm_flags;
 937        filter->protocol = rtm->rtm_protocol;
 938        filter->rt_type  = rtm->rtm_type;
 939        filter->table_id = rtm->rtm_table;
 940
 941        err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
 942                                            rtm_ipv4_policy, extack);
 943        if (err < 0)
 944                return err;
 945
 946        for (i = 0; i <= RTA_MAX; ++i) {
 947                int ifindex;
 948
 949                if (!tb[i])
 950                        continue;
 951
 952                switch (i) {
 953                case RTA_TABLE:
 954                        filter->table_id = nla_get_u32(tb[i]);
 955                        break;
 956                case RTA_OIF:
 957                        ifindex = nla_get_u32(tb[i]);
 958                        filter->dev = __dev_get_by_index(net, ifindex);
 959                        if (!filter->dev)
 960                                return -ENODEV;
 961                        break;
 962                default:
 963                        NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request");
 964                        return -EINVAL;
 965                }
 966        }
 967
 968        if (filter->flags || filter->protocol || filter->rt_type ||
 969            filter->table_id || filter->dev) {
 970                filter->filter_set = 1;
 971                cb->answer_flags = NLM_F_DUMP_FILTERED;
 972        }
 973
 974        return 0;
 975}
 976EXPORT_SYMBOL_GPL(ip_valid_fib_dump_req);
 977
/* Dump callback for IPv4 routes (registered for RTM_GETROUTE in
 * ip_fib_init()).
 *
 * Either dumps the single table selected by the request filter, or walks
 * every table in net->ipv4.fib_table_hash.  For the full walk the resume
 * position is kept in cb->args[0] (hash bucket) and cb->args[1] (index of
 * the table within that bucket).
 *
 * Returns skb->len when data was written (or nothing needs dumping), or a
 * negative error.
 */
static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
{
        /* By default both routes and exceptions are dumped. */
        struct fib_dump_filter filter = { .dump_routes = true,
                                          .dump_exceptions = true };
        const struct nlmsghdr *nlh = cb->nlh;
        struct net *net = sock_net(skb->sk);
        unsigned int h, s_h;
        unsigned int e = 0, s_e;
        struct fib_table *tb;
        struct hlist_head *head;
        int dumped = 0, err;

        if (cb->strict_check) {
                err = ip_valid_fib_dump_req(net, nlh, &filter, cb);
                if (err < 0)
                        return err;
        } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
                /* Without strict checking only the PREFIX and CLONED flags
                 * are taken from the header, and only if it is large enough.
                 */
                struct rtmsg *rtm = nlmsg_data(nlh);

                filter.flags = rtm->rtm_flags & (RTM_F_PREFIX | RTM_F_CLONED);
        }

        /* ipv4 does not use prefix flag */
        if (filter.flags & RTM_F_PREFIX)
                return skb->len;

        /* A specific table was requested: dump just that one. */
        if (filter.table_id) {
                tb = fib_get_table(net, filter.table_id);
                if (!tb) {
                        /* A missing table is only reported as an error when
                         * the request's family is PF_INET.
                         */
                        if (rtnl_msg_family(cb->nlh) != PF_INET)
                                return skb->len;

                        NL_SET_ERR_MSG(cb->extack, "ipv4: FIB table does not exist");
                        return -ENOENT;
                }

                rcu_read_lock();
                err = fib_table_dump(tb, skb, cb, &filter);
                rcu_read_unlock();
                /* Prefer the amount of data written; fall back to err when
                 * the skb is empty.
                 */
                return skb->len ? : err;
        }

        /* Resume position saved by a previous invocation. */
        s_h = cb->args[0];
        s_e = cb->args[1];

        rcu_read_lock();

        /* s_e only applies to the first bucket visited; it is reset to 0
         * for every following bucket.
         */
        for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
                e = 0;
                head = &net->ipv4.fib_table_hash[h];
                hlist_for_each_entry_rcu(tb, head, tb_hlist) {
                        /* Skip tables that come before the resume index. */
                        if (e < s_e)
                                goto next;
                        /* Once a table has been dumped in this call, clear
                         * cb->args[2..] before starting the next table
                         * (presumably per-table state of fib_table_dump();
                         * verify against that function).
                         */
                        if (dumped)
                                memset(&cb->args[2], 0, sizeof(cb->args) -
                                                 2 * sizeof(cb->args[0]));
                        err = fib_table_dump(tb, skb, cb, &filter);
                        if (err < 0) {
                                /* With data already in the skb, return its
                                 * length; otherwise propagate the error.
                                 */
                                if (likely(skb->len))
                                        goto out;

                                goto out_err;
                        }
                        dumped = 1;
next:
                        e++;
                }
        }
out:
        err = skb->len;
out_err:
        rcu_read_unlock();

        /* Save the position at which the walk stopped. */
        cb->args[1] = e;
        cb->args[0] = h;

        return err;
}
1056
1057/* Prepare and feed intra-kernel routing request.
1058 * Really, it should be netlink message, but :-( netlink
1059 * can be not configured, so that we feed it directly
1060 * to fib engine. It is legal, because all events occur
1061 * only when netlink is already locked.
1062 */
1063static void fib_magic(int cmd, int type, __be32 dst, int dst_len,
1064                      struct in_ifaddr *ifa, u32 rt_priority)
1065{
1066        struct net *net = dev_net(ifa->ifa_dev->dev);
1067        u32 tb_id = l3mdev_fib_table(ifa->ifa_dev->dev);
1068        struct fib_table *tb;
1069        struct fib_config cfg = {
1070                .fc_protocol = RTPROT_KERNEL,
1071                .fc_type = type,
1072                .fc_dst = dst,
1073                .fc_dst_len = dst_len,
1074                .fc_priority = rt_priority,
1075                .fc_prefsrc = ifa->ifa_local,
1076                .fc_oif = ifa->ifa_dev->dev->ifindex,
1077                .fc_nlflags = NLM_F_CREATE | NLM_F_APPEND,
1078                .fc_nlinfo = {
1079                        .nl_net = net,
1080                },
1081        };
1082
1083        if (!tb_id)
1084                tb_id = (type == RTN_UNICAST) ? RT_TABLE_MAIN : RT_TABLE_LOCAL;
1085
1086        tb = fib_new_table(net, tb_id);
1087        if (!tb)
1088                return;
1089
1090        cfg.fc_table = tb->tb_id;
1091
1092        if (type != RTN_LOCAL)
1093                cfg.fc_scope = RT_SCOPE_LINK;
1094        else
1095                cfg.fc_scope = RT_SCOPE_HOST;
1096
1097        if (cmd == RTM_NEWROUTE)
1098                fib_table_insert(net, tb, &cfg, NULL);
1099        else
1100                fib_table_delete(net, tb, &cfg, NULL);
1101}
1102
1103void fib_add_ifaddr(struct in_ifaddr *ifa)
1104{
1105        struct in_device *in_dev = ifa->ifa_dev;
1106        struct net_device *dev = in_dev->dev;
1107        struct in_ifaddr *prim = ifa;
1108        __be32 mask = ifa->ifa_mask;
1109        __be32 addr = ifa->ifa_local;
1110        __be32 prefix = ifa->ifa_address & mask;
1111
1112        if (ifa->ifa_flags & IFA_F_SECONDARY) {
1113                prim = inet_ifa_byprefix(in_dev, prefix, mask);
1114                if (!prim) {
1115                        pr_warn("%s: bug: prim == NULL\n", __func__);
1116                        return;
1117                }
1118        }
1119
1120        fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim, 0);
1121
1122        if (!(dev->flags & IFF_UP))
1123                return;
1124
1125        /* Add broadcast address, if it is explicitly assigned. */
1126        if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF)) {
1127                fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32,
1128                          prim, 0);
1129                arp_invalidate(dev, ifa->ifa_broadcast, false);
1130        }
1131
1132        if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
1133            (prefix != addr || ifa->ifa_prefixlen < 32)) {
1134                if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE))
1135                        fib_magic(RTM_NEWROUTE,
1136                                  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
1137                                  prefix, ifa->ifa_prefixlen, prim,
1138                                  ifa->ifa_rt_priority);
1139
1140                /* Add the network broadcast address, when it makes sense */
1141                if (ifa->ifa_prefixlen < 31) {
1142                        fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
1143                                  32, prim, 0);
1144                        arp_invalidate(dev, prefix | ~mask, false);
1145                }
1146        }
1147}
1148
1149void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric)
1150{
1151        __be32 prefix = ifa->ifa_address & ifa->ifa_mask;
1152        struct in_device *in_dev = ifa->ifa_dev;
1153        struct net_device *dev = in_dev->dev;
1154
1155        if (!(dev->flags & IFF_UP) ||
1156            ifa->ifa_flags & (IFA_F_SECONDARY | IFA_F_NOPREFIXROUTE) ||
1157            ipv4_is_zeronet(prefix) ||
1158            (prefix == ifa->ifa_local && ifa->ifa_prefixlen == 32))
1159                return;
1160
1161        /* add the new */
1162        fib_magic(RTM_NEWROUTE,
1163                  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
1164                  prefix, ifa->ifa_prefixlen, ifa, new_metric);
1165
1166        /* delete the old */
1167        fib_magic(RTM_DELROUTE,
1168                  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
1169                  prefix, ifa->ifa_prefixlen, ifa, ifa->ifa_rt_priority);
1170}
1171
/* Delete primary or secondary address.
 * Optionally, on secondary address promotion consider the addresses
 * from subnet iprim as deleted, even if they are in device list.
 * In this case the secondary ifa can be in device list.
 *
 * The function removes the kernel routes that belonged to @ifa (prefix,
 * broadcast and local routes), but first scans the remaining addresses of
 * the device so that a route still justified by another address is kept.
 * The outcome of that scan is collected in the "ok" bitmask:
 *   LOCAL_OK - another address has the same ifa_local
 *   BRD_OK   - ifa->ifa_broadcast is still in use
 *   BRD1_OK  - the subnet's all-ones broadcast (brd) is still in use
 *   BRD0_OK  - the subnet's all-zeroes broadcast (any) is still in use
 * A route is deleted only when its bit is not set.
 */
void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
{
        struct in_device *in_dev = ifa->ifa_dev;
        struct net_device *dev = in_dev->dev;
        struct in_ifaddr *ifa1;
        struct in_ifaddr *prim = ifa, *prim1 = NULL;
        __be32 brd = ifa->ifa_address | ~ifa->ifa_mask;
        __be32 any = ifa->ifa_address & ifa->ifa_mask;
#define LOCAL_OK        1
#define BRD_OK          2
#define BRD0_OK         4
#define BRD1_OK         8
        unsigned int ok = 0;
        int subnet = 0;         /* Primary network */
        int gone = 1;           /* Address is missing */
        int same_prefsrc = 0;   /* Another primary with same IP */

        if (ifa->ifa_flags & IFA_F_SECONDARY) {
                /* Routes of a secondary use the subnet's primary address. */
                prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
                if (!prim) {
                        /* if the device has been deleted, we don't perform
                         * address promotion
                         */
                        if (!in_dev->dead)
                                pr_warn("%s: bug: prim == NULL\n", __func__);
                        return;
                }
                if (iprim && iprim != prim) {
                        pr_warn("%s: bug: iprim != prim\n", __func__);
                        return;
                }
        } else if (!ipv4_is_zeronet(any) &&
                   (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) {
                /* Primary address with a real subnet: drop the prefix route
                 * (unless it was never installed) and remember that the
                 * subnet broadcast routes must be considered below.
                 */
                if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE))
                        fib_magic(RTM_DELROUTE,
                                  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
                                  any, ifa->ifa_prefixlen, prim, 0);
                subnet = 1;
        }

        /* On a dead in_device the address scan is skipped, so "ok" stays 0
         * and every route considered below is deleted.
         */
        if (in_dev->dead)
                goto no_promotions;

        /* Deletion is more complicated than add.
         * We should take care of not to delete too much :-)
         *
         * Scan address list to be sure that addresses are really gone.
         */
        rcu_read_lock();
        in_dev_for_each_ifa_rcu(ifa1, in_dev) {
                if (ifa1 == ifa) {
                        /* promotion, keep the IP */
                        gone = 0;
                        continue;
                }
                /* Ignore IFAs from our subnet */
                if (iprim && ifa1->ifa_mask == iprim->ifa_mask &&
                    inet_ifa_match(ifa1->ifa_address, iprim))
                        continue;

                /* Ignore ifa1 if it uses different primary IP (prefsrc) */
                if (ifa1->ifa_flags & IFA_F_SECONDARY) {
                        /* Another address from our subnet? */
                        if (ifa1->ifa_mask == prim->ifa_mask &&
                            inet_ifa_match(ifa1->ifa_address, prim))
                                prim1 = prim;
                        else {
                                /* We reached the secondaries, so
                                 * same_prefsrc should be determined.
                                 */
                                if (!same_prefsrc)
                                        continue;
                                /* Search new prim1 if ifa1 is not
                                 * using the current prim1
                                 */
                                if (!prim1 ||
                                    ifa1->ifa_mask != prim1->ifa_mask ||
                                    !inet_ifa_match(ifa1->ifa_address, prim1))
                                        prim1 = inet_ifa_byprefix(in_dev,
                                                        ifa1->ifa_address,
                                                        ifa1->ifa_mask);
                                if (!prim1)
                                        continue;
                                if (prim1->ifa_local != prim->ifa_local)
                                        continue;
                        }
                } else {
                        /* Primary address: only relevant when it has the
                         * same local IP as our primary.
                         */
                        if (prim->ifa_local != ifa1->ifa_local)
                                continue;
                        prim1 = ifa1;
                        if (prim != prim1)
                                same_prefsrc = 1;
                }
                /* ifa1 shares our prefsrc: record which of the addresses
                 * being removed it still uses.
                 */
                if (ifa->ifa_local == ifa1->ifa_local)
                        ok |= LOCAL_OK;
                if (ifa->ifa_broadcast == ifa1->ifa_broadcast)
                        ok |= BRD_OK;
                if (brd == ifa1->ifa_broadcast)
                        ok |= BRD1_OK;
                if (any == ifa1->ifa_broadcast)
                        ok |= BRD0_OK;
                /* primary has network specific broadcasts */
                if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) {
                        __be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask;
                        __be32 any1 = ifa1->ifa_address & ifa1->ifa_mask;

                        if (!ipv4_is_zeronet(any1)) {
                                if (ifa->ifa_broadcast == brd1 ||
                                    ifa->ifa_broadcast == any1)
                                        ok |= BRD_OK;
                                if (brd == brd1 || brd == any1)
                                        ok |= BRD1_OK;
                                if (any == brd1 || any == any1)
                                        ok |= BRD0_OK;
                        }
                }
        }
        rcu_read_unlock();

no_promotions:
        /* Delete every route whose address was not found in use above. */
        if (!(ok & BRD_OK))
                fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32,
                          prim, 0);
        if (subnet && ifa->ifa_prefixlen < 31) {
                if (!(ok & BRD1_OK))
                        fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32,
                                  prim, 0);
                if (!(ok & BRD0_OK))
                        fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32,
                                  prim, 0);
        }
        if (!(ok & LOCAL_OK)) {
                unsigned int addr_type;

                fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim, 0);

                /* Check, that this local address finally disappeared. */
                addr_type = inet_addr_type_dev_table(dev_net(dev), dev,
                                                     ifa->ifa_local);
                if (gone && addr_type != RTN_LOCAL) {
                        /* And the last, but not the least thing.
                         * We must flush stray FIB entries.
                         *
                         * First of all, we scan fib_info list searching
                         * for stray nexthop entries, then ignite fib_flush.
                         */
                        if (fib_sync_down_addr(dev, ifa->ifa_local))
                                fib_flush(dev_net(dev));
                }
        }
#undef LOCAL_OK
#undef BRD_OK
#undef BRD0_OK
#undef BRD1_OK
}
1332
1333static void nl_fib_lookup(struct net *net, struct fib_result_nl *frn)
1334{
1335
1336        struct fib_result       res;
1337        struct flowi4           fl4 = {
1338                .flowi4_mark = frn->fl_mark,
1339                .daddr = frn->fl_addr,
1340                .flowi4_tos = frn->fl_tos,
1341                .flowi4_scope = frn->fl_scope,
1342        };
1343        struct fib_table *tb;
1344
1345        rcu_read_lock();
1346
1347        tb = fib_get_table(net, frn->tb_id_in);
1348
1349        frn->err = -ENOENT;
1350        if (tb) {
1351                local_bh_disable();
1352
1353                frn->tb_id = tb->tb_id;
1354                frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
1355
1356                if (!frn->err) {
1357                        frn->prefixlen = res.prefixlen;
1358                        frn->nh_sel = res.nh_sel;
1359                        frn->type = res.type;
1360                        frn->scope = res.scope;
1361                }
1362                local_bh_enable();
1363        }
1364
1365        rcu_read_unlock();
1366}
1367
1368static void nl_fib_input(struct sk_buff *skb)
1369{
1370        struct net *net;
1371        struct fib_result_nl *frn;
1372        struct nlmsghdr *nlh;
1373        u32 portid;
1374
1375        net = sock_net(skb->sk);
1376        nlh = nlmsg_hdr(skb);
1377        if (skb->len < nlmsg_total_size(sizeof(*frn)) ||
1378            skb->len < nlh->nlmsg_len ||
1379            nlmsg_len(nlh) < sizeof(*frn))
1380                return;
1381
1382        skb = netlink_skb_clone(skb, GFP_KERNEL);
1383        if (!skb)
1384                return;
1385        nlh = nlmsg_hdr(skb);
1386
1387        frn = nlmsg_data(nlh);
1388        nl_fib_lookup(net, frn);
1389
1390        portid = NETLINK_CB(skb).portid;      /* netlink portid */
1391        NETLINK_CB(skb).portid = 0;        /* from kernel */
1392        NETLINK_CB(skb).dst_group = 0;  /* unicast */
1393        nlmsg_unicast(net->ipv4.fibnl, skb, portid);
1394}
1395
1396static int __net_init nl_fib_lookup_init(struct net *net)
1397{
1398        struct sock *sk;
1399        struct netlink_kernel_cfg cfg = {
1400                .input  = nl_fib_input,
1401        };
1402
1403        sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, &cfg);
1404        if (!sk)
1405                return -EAFNOSUPPORT;
1406        net->ipv4.fibnl = sk;
1407        return 0;
1408}
1409
1410static void nl_fib_lookup_exit(struct net *net)
1411{
1412        netlink_kernel_release(net->ipv4.fibnl);
1413        net->ipv4.fibnl = NULL;
1414}
1415
1416static void fib_disable_ip(struct net_device *dev, unsigned long event,
1417                           bool force)
1418{
1419        if (fib_sync_down_dev(dev, event, force))
1420                fib_flush(dev_net(dev));
1421        else
1422                rt_cache_flush(dev_net(dev));
1423        arp_ifdown(dev);
1424}
1425
1426static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
1427{
1428        struct in_ifaddr *ifa = ptr;
1429        struct net_device *dev = ifa->ifa_dev->dev;
1430        struct net *net = dev_net(dev);
1431
1432        switch (event) {
1433        case NETDEV_UP:
1434                fib_add_ifaddr(ifa);
1435#ifdef CONFIG_IP_ROUTE_MULTIPATH
1436                fib_sync_up(dev, RTNH_F_DEAD);
1437#endif
1438                atomic_inc(&net->ipv4.dev_addr_genid);
1439                rt_cache_flush(dev_net(dev));
1440                break;
1441        case NETDEV_DOWN:
1442                fib_del_ifaddr(ifa, NULL);
1443                atomic_inc(&net->ipv4.dev_addr_genid);
1444                if (!ifa->ifa_dev->ifa_list) {
1445                        /* Last address was deleted from this interface.
1446                         * Disable IP.
1447                         */
1448                        fib_disable_ip(dev, event, true);
1449                } else {
1450                        rt_cache_flush(dev_net(dev));
1451                }
1452                break;
1453        }
1454        return NOTIFY_DONE;
1455}
1456
1457static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
1458{
1459        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1460        struct netdev_notifier_changeupper_info *upper_info = ptr;
1461        struct netdev_notifier_info_ext *info_ext = ptr;
1462        struct in_device *in_dev;
1463        struct net *net = dev_net(dev);
1464        struct in_ifaddr *ifa;
1465        unsigned int flags;
1466
1467        if (event == NETDEV_UNREGISTER) {
1468                fib_disable_ip(dev, event, true);
1469                rt_flush_dev(dev);
1470                return NOTIFY_DONE;
1471        }
1472
1473        in_dev = __in_dev_get_rtnl(dev);
1474        if (!in_dev)
1475                return NOTIFY_DONE;
1476
1477        switch (event) {
1478        case NETDEV_UP:
1479                in_dev_for_each_ifa_rtnl(ifa, in_dev) {
1480                        fib_add_ifaddr(ifa);
1481                }
1482#ifdef CONFIG_IP_ROUTE_MULTIPATH
1483                fib_sync_up(dev, RTNH_F_DEAD);
1484#endif
1485                atomic_inc(&net->ipv4.dev_addr_genid);
1486                rt_cache_flush(net);
1487                break;
1488        case NETDEV_DOWN:
1489                fib_disable_ip(dev, event, false);
1490                break;
1491        case NETDEV_CHANGE:
1492                flags = dev_get_flags(dev);
1493                if (flags & (IFF_RUNNING | IFF_LOWER_UP))
1494                        fib_sync_up(dev, RTNH_F_LINKDOWN);
1495                else
1496                        fib_sync_down_dev(dev, event, false);
1497                rt_cache_flush(net);
1498                break;
1499        case NETDEV_CHANGEMTU:
1500                fib_sync_mtu(dev, info_ext->ext.mtu);
1501                rt_cache_flush(net);
1502                break;
1503        case NETDEV_CHANGEUPPER:
1504                upper_info = ptr;
1505                /* flush all routes if dev is linked to or unlinked from
1506                 * an L3 master device (e.g., VRF)
1507                 */
1508                if (upper_info->upper_dev &&
1509                    netif_is_l3_master(upper_info->upper_dev))
1510                        fib_disable_ip(dev, NETDEV_DOWN, true);
1511                break;
1512        }
1513        return NOTIFY_DONE;
1514}
1515
/* Notifier block delivering IPv4 address events to fib_inetaddr_event();
 * registered in ip_fib_init().
 */
static struct notifier_block fib_inetaddr_notifier = {
        .notifier_call = fib_inetaddr_event,
};
1519
/* Notifier block delivering network device events to fib_netdev_event();
 * registered in ip_fib_init().
 */
static struct notifier_block fib_netdev_notifier = {
        .notifier_call = fib_netdev_event,
};
1523
1524static int __net_init ip_fib_net_init(struct net *net)
1525{
1526        int err;
1527        size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;
1528
1529        err = fib4_notifier_init(net);
1530        if (err)
1531                return err;
1532
1533#ifdef CONFIG_IP_ROUTE_MULTIPATH
1534        /* Default to 3-tuple */
1535        net->ipv4.sysctl_fib_multipath_hash_fields =
1536                FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK;
1537#endif
1538
1539        /* Avoid false sharing : Use at least a full cache line */
1540        size = max_t(size_t, size, L1_CACHE_BYTES);
1541
1542        net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL);
1543        if (!net->ipv4.fib_table_hash) {
1544                err = -ENOMEM;
1545                goto err_table_hash_alloc;
1546        }
1547
1548        err = fib4_rules_init(net);
1549        if (err < 0)
1550                goto err_rules_init;
1551        return 0;
1552
1553err_rules_init:
1554        kfree(net->ipv4.fib_table_hash);
1555err_table_hash_alloc:
1556        fib4_notifier_exit(net);
1557        return err;
1558}
1559
1560static void ip_fib_net_exit(struct net *net)
1561{
1562        int i;
1563
1564        ASSERT_RTNL();
1565#ifdef CONFIG_IP_MULTIPLE_TABLES
1566        RCU_INIT_POINTER(net->ipv4.fib_main, NULL);
1567        RCU_INIT_POINTER(net->ipv4.fib_default, NULL);
1568#endif
1569        /* Destroy the tables in reverse order to guarantee that the
1570         * local table, ID 255, is destroyed before the main table, ID
1571         * 254. This is necessary as the local table may contain
1572         * references to data contained in the main table.
1573         */
1574        for (i = FIB_TABLE_HASHSZ - 1; i >= 0; i--) {
1575                struct hlist_head *head = &net->ipv4.fib_table_hash[i];
1576                struct hlist_node *tmp;
1577                struct fib_table *tb;
1578
1579                hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) {
1580                        hlist_del(&tb->tb_hlist);
1581                        fib_table_flush(net, tb, true);
1582                        fib_free_table(tb);
1583                }
1584        }
1585
1586#ifdef CONFIG_IP_MULTIPLE_TABLES
1587        fib4_rules_exit(net);
1588#endif
1589
1590        kfree(net->ipv4.fib_table_hash);
1591        fib4_notifier_exit(net);
1592}
1593
1594static int __net_init fib_net_init(struct net *net)
1595{
1596        int error;
1597
1598#ifdef CONFIG_IP_ROUTE_CLASSID
1599        atomic_set(&net->ipv4.fib_num_tclassid_users, 0);
1600#endif
1601        error = ip_fib_net_init(net);
1602        if (error < 0)
1603                goto out;
1604        error = nl_fib_lookup_init(net);
1605        if (error < 0)
1606                goto out_nlfl;
1607        error = fib_proc_init(net);
1608        if (error < 0)
1609                goto out_proc;
1610out:
1611        return error;
1612
1613out_proc:
1614        nl_fib_lookup_exit(net);
1615out_nlfl:
1616        rtnl_lock();
1617        ip_fib_net_exit(net);
1618        rtnl_unlock();
1619        goto out;
1620}
1621
/* pernet exit hook: remove the proc entries and the FIB lookup netlink
 * socket of @net.  The FIB tables themselves are released by
 * fib_net_exit_batch().
 */
static void __net_exit fib_net_exit(struct net *net)
{
        fib_proc_exit(net);
        nl_fib_lookup_exit(net);
}
1627
/* pernet batched exit hook: release the FIB state of every namespace on
 * @net_list, taking RTNL once for the whole batch.
 */
static void __net_exit fib_net_exit_batch(struct list_head *net_list)
{
        struct net *net;

        /* ip_fib_net_exit() asserts that RTNL is held. */
        rtnl_lock();
        list_for_each_entry(net, net_list, exit_list)
                ip_fib_net_exit(net);

        rtnl_unlock();
}
1638
/* Per-namespace lifecycle hooks of the IPv4 FIB; registered in
 * ip_fib_init().
 */
static struct pernet_operations fib_net_ops = {
        .init = fib_net_init,
        .exit = fib_net_exit,
        .exit_batch = fib_net_exit_batch,
};
1644
/* Boot-time initialisation of the IPv4 FIB frontend: initialise the trie
 * code, register the per-namespace hooks, subscribe to device and address
 * notifications, and install the rtnetlink route handlers.
 *
 * The return values of the registration calls are not checked.
 */
void __init ip_fib_init(void)
{
        fib_trie_init();

        register_pernet_subsys(&fib_net_ops);

        register_netdevice_notifier(&fib_netdev_notifier);
        register_inetaddr_notifier(&fib_inetaddr_notifier);

        /* NEWROUTE/DELROUTE get a request handler only; GETROUTE gets a
         * dump handler only.
         */
        rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, 0);
        rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, 0);
        rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, 0);
}
1658