linux/net/ipv4/nexthop.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/* Generic nexthop implementation
   3 *
   4 * Copyright (c) 2017-19 Cumulus Networks
   5 * Copyright (c) 2017-19 David Ahern <dsa@cumulusnetworks.com>
   6 */
   7
   8#include <linux/nexthop.h>
   9#include <linux/rtnetlink.h>
  10#include <linux/slab.h>
  11#include <net/arp.h>
  12#include <net/ipv6_stubs.h>
  13#include <net/lwtunnel.h>
  14#include <net/ndisc.h>
  15#include <net/nexthop.h>
  16#include <net/route.h>
  17#include <net/sock.h>
  18
    /* Forward declaration: remove_nexthop() is called by
     * remove_nexthop_from_groups() before its definition further down.
     */
  19static void remove_nexthop(struct net *net, struct nexthop *nh,
  20                           struct nl_info *nlinfo);
  21
    /* Sizing of the per-netns nexthop device hash (net->nexthop.devhash):
     * NH_DEV_HASHSIZE buckets, indexed by nh_dev_hashfn().
     */
  22#define NH_DEV_HASHBITS  8
  23#define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS)
  24
    /* Netlink attribute policy for the NHA_* nexthop attributes. The
     * NHA_UNSPEC entry carries strict_start_type so that strict
     * validation applies from the first real attribute onwards.
     * NHA_GROUP and NHA_GATEWAY are opaque binary blobs at this level;
     * their lengths are checked by the code that consumes them.
     */
  25static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = {
  26        [NHA_UNSPEC]            = { .strict_start_type = NHA_UNSPEC + 1 },
  27        [NHA_ID]                = { .type = NLA_U32 },
  28        [NHA_GROUP]             = { .type = NLA_BINARY },
  29        [NHA_GROUP_TYPE]        = { .type = NLA_U16 },
  30        [NHA_BLACKHOLE]         = { .type = NLA_FLAG },
  31        [NHA_OIF]               = { .type = NLA_U32 },
  32        [NHA_GATEWAY]           = { .type = NLA_BINARY },
  33        [NHA_ENCAP_TYPE]        = { .type = NLA_U16 },
  34        [NHA_ENCAP]             = { .type = NLA_NESTED },
  35        [NHA_GROUPS]            = { .type = NLA_FLAG },
  36        [NHA_MASTER]            = { .type = NLA_U32 },
  37};
  38
  39static unsigned int nh_dev_hashfn(unsigned int val)
  40{
  41        unsigned int mask = NH_DEV_HASHSIZE - 1;
  42
  43        return (val ^
  44                (val >> NH_DEV_HASHBITS) ^
  45                (val >> (NH_DEV_HASHBITS * 2))) & mask;
  46}
  47
  48static void nexthop_devhash_add(struct net *net, struct nh_info *nhi)
  49{
  50        struct net_device *dev = nhi->fib_nhc.nhc_dev;
  51        struct hlist_head *head;
  52        unsigned int hash;
  53
  54        WARN_ON(!dev);
  55
  56        hash = nh_dev_hashfn(dev->ifindex);
  57        head = &net->nexthop.devhash[hash];
  58        hlist_add_head(&nhi->dev_hash, head);
  59}
  60
  61static void nexthop_free_mpath(struct nexthop *nh)
  62{
  63        struct nh_group *nhg;
  64        int i;
  65
  66        nhg = rcu_dereference_raw(nh->nh_grp);
  67        for (i = 0; i < nhg->num_nh; ++i)
  68                WARN_ON(nhg->nh_entries[i].nh);
  69
  70        kfree(nhg);
  71}
  72
  73static void nexthop_free_single(struct nexthop *nh)
  74{
  75        struct nh_info *nhi;
  76
  77        nhi = rcu_dereference_raw(nh->nh_info);
  78        switch (nhi->family) {
  79        case AF_INET:
  80                fib_nh_release(nh->net, &nhi->fib_nh);
  81                break;
  82        case AF_INET6:
  83                ipv6_stub->fib6_nh_release(&nhi->fib6_nh);
  84                break;
  85        }
  86        kfree(nhi);
  87}
  88
  89void nexthop_free_rcu(struct rcu_head *head)
  90{
  91        struct nexthop *nh = container_of(head, struct nexthop, rcu);
  92
  93        if (nh->is_group)
  94                nexthop_free_mpath(nh);
  95        else
  96                nexthop_free_single(nh);
  97
  98        kfree(nh);
  99}
 100EXPORT_SYMBOL_GPL(nexthop_free_rcu);
 101
 102static struct nexthop *nexthop_alloc(void)
 103{
 104        struct nexthop *nh;
 105
 106        nh = kzalloc(sizeof(struct nexthop), GFP_KERNEL);
 107        if (nh) {
 108                INIT_LIST_HEAD(&nh->fi_list);
 109                INIT_LIST_HEAD(&nh->f6i_list);
 110                INIT_LIST_HEAD(&nh->grp_list);
 111        }
 112        return nh;
 113}
 114
 115static struct nh_group *nexthop_grp_alloc(u16 num_nh)
 116{
 117        size_t sz = offsetof(struct nexthop, nh_grp)
 118                    + sizeof(struct nh_group)
 119                    + sizeof(struct nh_grp_entry) * num_nh;
 120        struct nh_group *nhg;
 121
 122        nhg = kzalloc(sz, GFP_KERNEL);
 123        if (nhg)
 124                nhg->num_nh = num_nh;
 125
 126        return nhg;
 127}
 128
 129static void nh_base_seq_inc(struct net *net)
 130{
 131        while (++net->nexthop.seq == 0)
 132                ;
 133}
 134
 135/* no reference taken; rcu lock or rtnl must be held */
 136struct nexthop *nexthop_find_by_id(struct net *net, u32 id)
 137{
 138        struct rb_node **pp, *parent = NULL, *next;
 139
 140        pp = &net->nexthop.rb_root.rb_node;
 141        while (1) {
 142                struct nexthop *nh;
 143
 144                next = rcu_dereference_raw(*pp);
 145                if (!next)
 146                        break;
 147                parent = next;
 148
 149                nh = rb_entry(parent, struct nexthop, rb_node);
 150                if (id < nh->id)
 151                        pp = &next->rb_left;
 152                else if (id > nh->id)
 153                        pp = &next->rb_right;
 154                else
 155                        return nh;
 156        }
 157        return NULL;
 158}
 159EXPORT_SYMBOL_GPL(nexthop_find_by_id);
 160
 161/* used for auto id allocation; called with rtnl held */
 162static u32 nh_find_unused_id(struct net *net)
 163{
 164        u32 id_start = net->nexthop.last_id_allocated;
 165
 166        while (1) {
 167                net->nexthop.last_id_allocated++;
 168                if (net->nexthop.last_id_allocated == id_start)
 169                        break;
 170
 171                if (!nexthop_find_by_id(net, net->nexthop.last_id_allocated))
 172                        return net->nexthop.last_id_allocated;
 173        }
 174        return 0;
 175}
 176
 177static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nhg)
 178{
 179        struct nexthop_grp *p;
 180        size_t len = nhg->num_nh * sizeof(*p);
 181        struct nlattr *nla;
 182        u16 group_type = 0;
 183        int i;
 184
 185        if (nhg->mpath)
 186                group_type = NEXTHOP_GRP_TYPE_MPATH;
 187
 188        if (nla_put_u16(skb, NHA_GROUP_TYPE, group_type))
 189                goto nla_put_failure;
 190
 191        nla = nla_reserve(skb, NHA_GROUP, len);
 192        if (!nla)
 193                goto nla_put_failure;
 194
 195        p = nla_data(nla);
 196        for (i = 0; i < nhg->num_nh; ++i) {
 197                p->id = nhg->nh_entries[i].nh->id;
 198                p->weight = nhg->nh_entries[i].weight - 1;
 199                p += 1;
 200        }
 201
 202        return 0;
 203
 204nla_put_failure:
 205        return -EMSGSIZE;
 206}
 207
 208static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
 209                        int event, u32 portid, u32 seq, unsigned int nlflags)
 210{
 211        struct fib6_nh *fib6_nh;
 212        struct fib_nh *fib_nh;
 213        struct nlmsghdr *nlh;
 214        struct nh_info *nhi;
 215        struct nhmsg *nhm;
 216
 217        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags);
 218        if (!nlh)
 219                return -EMSGSIZE;
 220
 221        nhm = nlmsg_data(nlh);
 222        nhm->nh_family = AF_UNSPEC;
 223        nhm->nh_flags = nh->nh_flags;
 224        nhm->nh_protocol = nh->protocol;
 225        nhm->nh_scope = 0;
 226        nhm->resvd = 0;
 227
 228        if (nla_put_u32(skb, NHA_ID, nh->id))
 229                goto nla_put_failure;
 230
 231        if (nh->is_group) {
 232                struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
 233
 234                if (nla_put_nh_group(skb, nhg))
 235                        goto nla_put_failure;
 236                goto out;
 237        }
 238
 239        nhi = rtnl_dereference(nh->nh_info);
 240        nhm->nh_family = nhi->family;
 241        if (nhi->reject_nh) {
 242                if (nla_put_flag(skb, NHA_BLACKHOLE))
 243                        goto nla_put_failure;
 244                goto out;
 245        } else {
 246                const struct net_device *dev;
 247
 248                dev = nhi->fib_nhc.nhc_dev;
 249                if (dev && nla_put_u32(skb, NHA_OIF, dev->ifindex))
 250                        goto nla_put_failure;
 251        }
 252
 253        nhm->nh_scope = nhi->fib_nhc.nhc_scope;
 254        switch (nhi->family) {
 255        case AF_INET:
 256                fib_nh = &nhi->fib_nh;
 257                if (fib_nh->fib_nh_gw_family &&
 258                    nla_put_u32(skb, NHA_GATEWAY, fib_nh->fib_nh_gw4))
 259                        goto nla_put_failure;
 260                break;
 261
 262        case AF_INET6:
 263                fib6_nh = &nhi->fib6_nh;
 264                if (fib6_nh->fib_nh_gw_family &&
 265                    nla_put_in6_addr(skb, NHA_GATEWAY, &fib6_nh->fib_nh_gw6))
 266                        goto nla_put_failure;
 267                break;
 268        }
 269
 270        if (nhi->fib_nhc.nhc_lwtstate &&
 271            lwtunnel_fill_encap(skb, nhi->fib_nhc.nhc_lwtstate,
 272                                NHA_ENCAP, NHA_ENCAP_TYPE) < 0)
 273                goto nla_put_failure;
 274
 275out:
 276        nlmsg_end(skb, nlh);
 277        return 0;
 278
 279nla_put_failure:
 280        return -EMSGSIZE;
 281}
 282
 283static size_t nh_nlmsg_size_grp(struct nexthop *nh)
 284{
 285        struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
 286        size_t sz = sizeof(struct nexthop_grp) * nhg->num_nh;
 287
 288        return nla_total_size(sz) +
 289               nla_total_size(2);  /* NHA_GROUP_TYPE */
 290}
 291
 292static size_t nh_nlmsg_size_single(struct nexthop *nh)
 293{
 294        struct nh_info *nhi = rtnl_dereference(nh->nh_info);
 295        size_t sz;
 296
 297        /* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE
 298         * are mutually exclusive
 299         */
 300        sz = nla_total_size(4);  /* NHA_OIF */
 301
 302        switch (nhi->family) {
 303        case AF_INET:
 304                if (nhi->fib_nh.fib_nh_gw_family)
 305                        sz += nla_total_size(4);  /* NHA_GATEWAY */
 306                break;
 307
 308        case AF_INET6:
 309                /* NHA_GATEWAY */
 310                if (nhi->fib6_nh.fib_nh_gw_family)
 311                        sz += nla_total_size(sizeof(const struct in6_addr));
 312                break;
 313        }
 314
 315        if (nhi->fib_nhc.nhc_lwtstate) {
 316                sz += lwtunnel_get_encap_size(nhi->fib_nhc.nhc_lwtstate);
 317                sz += nla_total_size(2);  /* NHA_ENCAP_TYPE */
 318        }
 319
 320        return sz;
 321}
 322
 323static size_t nh_nlmsg_size(struct nexthop *nh)
 324{
 325        size_t sz = nla_total_size(4);    /* NHA_ID */
 326
 327        if (nh->is_group)
 328                sz += nh_nlmsg_size_grp(nh);
 329        else
 330                sz += nh_nlmsg_size_single(nh);
 331
 332        return sz;
 333}
 334
 335static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info)
 336{
 337        unsigned int nlflags = info->nlh ? info->nlh->nlmsg_flags : 0;
 338        u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
 339        struct sk_buff *skb;
 340        int err = -ENOBUFS;
 341
 342        skb = nlmsg_new(nh_nlmsg_size(nh), gfp_any());
 343        if (!skb)
 344                goto errout;
 345
 346        err = nh_fill_node(skb, nh, event, info->portid, seq, nlflags);
 347        if (err < 0) {
 348                /* -EMSGSIZE implies BUG in nh_nlmsg_size() */
 349                WARN_ON(err == -EMSGSIZE);
 350                kfree_skb(skb);
 351                goto errout;
 352        }
 353
 354        rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_NEXTHOP,
 355                    info->nlh, gfp_any());
 356        return;
 357errout:
 358        if (err < 0)
 359                rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err);
 360}
 361
 362static bool valid_group_nh(struct nexthop *nh, unsigned int npaths,
 363                           struct netlink_ext_ack *extack)
 364{
 365        if (nh->is_group) {
 366                struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
 367
 368                /* nested multipath (group within a group) is not
 369                 * supported
 370                 */
 371                if (nhg->mpath) {
 372                        NL_SET_ERR_MSG(extack,
 373                                       "Multipath group can not be a nexthop within a group");
 374                        return false;
 375                }
 376        } else {
 377                struct nh_info *nhi = rtnl_dereference(nh->nh_info);
 378
 379                if (nhi->reject_nh && npaths > 1) {
 380                        NL_SET_ERR_MSG(extack,
 381                                       "Blackhole nexthop can not be used in a group with more than 1 path");
 382                        return false;
 383                }
 384        }
 385
 386        return true;
 387}
 388
 389static int nh_check_attr_group(struct net *net, struct nlattr *tb[],
 390                               struct netlink_ext_ack *extack)
 391{
 392        unsigned int len = nla_len(tb[NHA_GROUP]);
 393        struct nexthop_grp *nhg;
 394        unsigned int i, j;
 395
 396        if (len & (sizeof(struct nexthop_grp) - 1)) {
 397                NL_SET_ERR_MSG(extack,
 398                               "Invalid length for nexthop group attribute");
 399                return -EINVAL;
 400        }
 401
 402        /* convert len to number of nexthop ids */
 403        len /= sizeof(*nhg);
 404
 405        nhg = nla_data(tb[NHA_GROUP]);
 406        for (i = 0; i < len; ++i) {
 407                if (nhg[i].resvd1 || nhg[i].resvd2) {
 408                        NL_SET_ERR_MSG(extack, "Reserved fields in nexthop_grp must be 0");
 409                        return -EINVAL;
 410                }
 411                if (nhg[i].weight > 254) {
 412                        NL_SET_ERR_MSG(extack, "Invalid value for weight");
 413                        return -EINVAL;
 414                }
 415                for (j = i + 1; j < len; ++j) {
 416                        if (nhg[i].id == nhg[j].id) {
 417                                NL_SET_ERR_MSG(extack, "Nexthop id can not be used twice in a group");
 418                                return -EINVAL;
 419                        }
 420                }
 421        }
 422
 423        nhg = nla_data(tb[NHA_GROUP]);
 424        for (i = 0; i < len; ++i) {
 425                struct nexthop *nh;
 426
 427                nh = nexthop_find_by_id(net, nhg[i].id);
 428                if (!nh) {
 429                        NL_SET_ERR_MSG(extack, "Invalid nexthop id");
 430                        return -EINVAL;
 431                }
 432                if (!valid_group_nh(nh, len, extack))
 433                        return -EINVAL;
 434        }
 435        for (i = NHA_GROUP + 1; i < __NHA_MAX; ++i) {
 436                if (!tb[i])
 437                        continue;
 438
 439                NL_SET_ERR_MSG(extack,
 440                               "No other attributes can be set in nexthop groups");
 441                return -EINVAL;
 442        }
 443
 444        return 0;
 445}
 446
 447static bool ipv6_good_nh(const struct fib6_nh *nh)
 448{
 449        int state = NUD_REACHABLE;
 450        struct neighbour *n;
 451
 452        rcu_read_lock_bh();
 453
 454        n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev, &nh->fib_nh_gw6);
 455        if (n)
 456                state = n->nud_state;
 457
 458        rcu_read_unlock_bh();
 459
 460        return !!(state & NUD_VALID);
 461}
 462
 463static bool ipv4_good_nh(const struct fib_nh *nh)
 464{
 465        int state = NUD_REACHABLE;
 466        struct neighbour *n;
 467
 468        rcu_read_lock_bh();
 469
 470        n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev,
 471                                      (__force u32)nh->fib_nh_gw4);
 472        if (n)
 473                state = n->nud_state;
 474
 475        rcu_read_unlock_bh();
 476
 477        return !!(state & NUD_VALID);
 478}
 479
 480struct nexthop *nexthop_select_path(struct nexthop *nh, int hash)
 481{
 482        struct nexthop *rc = NULL;
 483        struct nh_group *nhg;
 484        int i;
 485
 486        if (!nh->is_group)
 487                return nh;
 488
 489        nhg = rcu_dereference(nh->nh_grp);
 490        for (i = 0; i < nhg->num_nh; ++i) {
 491                struct nh_grp_entry *nhge = &nhg->nh_entries[i];
 492                struct nh_info *nhi;
 493
 494                if (hash > atomic_read(&nhge->upper_bound))
 495                        continue;
 496
 497                /* nexthops always check if it is good and does
 498                 * not rely on a sysctl for this behavior
 499                 */
 500                nhi = rcu_dereference(nhge->nh->nh_info);
 501                switch (nhi->family) {
 502                case AF_INET:
 503                        if (ipv4_good_nh(&nhi->fib_nh))
 504                                return nhge->nh;
 505                        break;
 506                case AF_INET6:
 507                        if (ipv6_good_nh(&nhi->fib6_nh))
 508                                return nhge->nh;
 509                        break;
 510                }
 511
 512                if (!rc)
 513                        rc = nhge->nh;
 514        }
 515
 516        return rc;
 517}
 518EXPORT_SYMBOL_GPL(nexthop_select_path);
 519
 520int nexthop_for_each_fib6_nh(struct nexthop *nh,
 521                             int (*cb)(struct fib6_nh *nh, void *arg),
 522                             void *arg)
 523{
 524        struct nh_info *nhi;
 525        int err;
 526
 527        if (nh->is_group) {
 528                struct nh_group *nhg;
 529                int i;
 530
 531                nhg = rcu_dereference_rtnl(nh->nh_grp);
 532                for (i = 0; i < nhg->num_nh; i++) {
 533                        struct nh_grp_entry *nhge = &nhg->nh_entries[i];
 534
 535                        nhi = rcu_dereference_rtnl(nhge->nh->nh_info);
 536                        err = cb(&nhi->fib6_nh, arg);
 537                        if (err)
 538                                return err;
 539                }
 540        } else {
 541                nhi = rcu_dereference_rtnl(nh->nh_info);
 542                err = cb(&nhi->fib6_nh, arg);
 543                if (err)
 544                        return err;
 545        }
 546
 547        return 0;
 548}
 549EXPORT_SYMBOL_GPL(nexthop_for_each_fib6_nh);
 550
 551static int check_src_addr(const struct in6_addr *saddr,
 552                          struct netlink_ext_ack *extack)
 553{
 554        if (!ipv6_addr_any(saddr)) {
 555                NL_SET_ERR_MSG(extack, "IPv6 routes using source address can not use nexthop objects");
 556                return -EINVAL;
 557        }
 558        return 0;
 559}
 560
 561int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg,
 562                       struct netlink_ext_ack *extack)
 563{
 564        struct nh_info *nhi;
 565
 566        /* fib6_src is unique to a fib6_info and limits the ability to cache
 567         * routes in fib6_nh within a nexthop that is potentially shared
 568         * across multiple fib entries. If the config wants to use source
 569         * routing it can not use nexthop objects. mlxsw also does not allow
 570         * fib6_src on routes.
 571         */
 572        if (cfg && check_src_addr(&cfg->fc_src, extack) < 0)
 573                return -EINVAL;
 574
 575        if (nh->is_group) {
 576                struct nh_group *nhg;
 577
 578                nhg = rtnl_dereference(nh->nh_grp);
 579                if (nhg->has_v4)
 580                        goto no_v4_nh;
 581        } else {
 582                nhi = rtnl_dereference(nh->nh_info);
 583                if (nhi->family == AF_INET)
 584                        goto no_v4_nh;
 585        }
 586
 587        return 0;
 588no_v4_nh:
 589        NL_SET_ERR_MSG(extack, "IPv6 routes can not use an IPv4 nexthop");
 590        return -EINVAL;
 591}
 592EXPORT_SYMBOL_GPL(fib6_check_nexthop);
 593
 594/* if existing nexthop has ipv6 routes linked to it, need
 595 * to verify this new spec works with ipv6
 596 */
 597static int fib6_check_nh_list(struct nexthop *old, struct nexthop *new,
 598                              struct netlink_ext_ack *extack)
 599{
 600        struct fib6_info *f6i;
 601
 602        if (list_empty(&old->f6i_list))
 603                return 0;
 604
 605        list_for_each_entry(f6i, &old->f6i_list, nh_list) {
 606                if (check_src_addr(&f6i->fib6_src.addr, extack) < 0)
 607                        return -EINVAL;
 608        }
 609
 610        return fib6_check_nexthop(new, NULL, extack);
 611}
 612
 613static int nexthop_check_scope(struct nexthop *nh, u8 scope,
 614                               struct netlink_ext_ack *extack)
 615{
 616        struct nh_info *nhi;
 617
 618        nhi = rtnl_dereference(nh->nh_info);
 619        if (scope == RT_SCOPE_HOST && nhi->fib_nhc.nhc_gw_family) {
 620                NL_SET_ERR_MSG(extack,
 621                               "Route with host scope can not have a gateway");
 622                return -EINVAL;
 623        }
 624
 625        if (nhi->fib_nhc.nhc_flags & RTNH_F_ONLINK && scope >= RT_SCOPE_LINK) {
 626                NL_SET_ERR_MSG(extack, "Scope mismatch with nexthop");
 627                return -EINVAL;
 628        }
 629
 630        return 0;
 631}
 632
 633/* Invoked by fib add code to verify nexthop by id is ok with
 634 * config for prefix; parts of fib_check_nh not done when nexthop
 635 * object is used.
 636 */
 637int fib_check_nexthop(struct nexthop *nh, u8 scope,
 638                      struct netlink_ext_ack *extack)
 639{
 640        int err = 0;
 641
 642        if (nh->is_group) {
 643                struct nh_group *nhg;
 644
 645                if (scope == RT_SCOPE_HOST) {
 646                        NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops");
 647                        err = -EINVAL;
 648                        goto out;
 649                }
 650
 651                nhg = rtnl_dereference(nh->nh_grp);
 652                /* all nexthops in a group have the same scope */
 653                err = nexthop_check_scope(nhg->nh_entries[0].nh, scope, extack);
 654        } else {
 655                err = nexthop_check_scope(nh, scope, extack);
 656        }
 657out:
 658        return err;
 659}
 660
 661static int fib_check_nh_list(struct nexthop *old, struct nexthop *new,
 662                             struct netlink_ext_ack *extack)
 663{
 664        struct fib_info *fi;
 665
 666        list_for_each_entry(fi, &old->fi_list, nh_list) {
 667                int err;
 668
 669                err = fib_check_nexthop(new, fi->fib_scope, extack);
 670                if (err)
 671                        return err;
 672        }
 673        return 0;
 674}
 675
 676static void nh_group_rebalance(struct nh_group *nhg)
 677{
 678        int total = 0;
 679        int w = 0;
 680        int i;
 681
 682        for (i = 0; i < nhg->num_nh; ++i)
 683                total += nhg->nh_entries[i].weight;
 684
 685        for (i = 0; i < nhg->num_nh; ++i) {
 686                struct nh_grp_entry *nhge = &nhg->nh_entries[i];
 687                int upper_bound;
 688
 689                w += nhge->weight;
 690                upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1;
 691                atomic_set(&nhge->upper_bound, upper_bound);
 692        }
 693}
 694
    /* Remove the member referenced by @nhge from the group @nhg.
     *
     * Entries after the removed one are shifted down one slot, the entry
     * count is decremented, the hash bounds are rebalanced and the
     * group's reference on the removed nexthop is dropped. When @nlinfo
     * is set, an RTM_NEWNEXTHOP notification is sent for the parent.
     *
     * The caller (remove_nexthop_from_groups()) unlinks @nhge->nh_list
     * before calling this function.
     *
     * NOTE(review): the entries are rewritten in place in the same array
     * that nexthop_select_path() reads under rcu_dereference(); confirm
     * that concurrent readers tolerate this.
     */
 695static void remove_nh_grp_entry(struct nh_grp_entry *nhge,
 696                                struct nh_group *nhg,
 697                                struct nl_info *nlinfo)
 698{
 699        struct nexthop *nh = nhge->nh;
 700        struct nh_grp_entry *nhges;
 701        bool found = false;
 702        int i;
 703
 704        WARN_ON(!nh);
 705
 706        nhges = nhg->nh_entries;
 707        for (i = 0; i < nhg->num_nh; ++i) {
 708                if (found) {
                            /* move entry i into slot i-1 and relink the
                             * moved slot on its nexthop's grp_list
                             */
 709                        nhges[i-1].nh = nhges[i].nh;
 710                        nhges[i-1].weight = nhges[i].weight;
 711                        list_del(&nhges[i].nh_list);
 712                        list_add(&nhges[i-1].nh_list, &nhges[i-1].nh->grp_list);
 713                } else if (nhg->nh_entries[i].nh == nh) {
 714                        found = true;
 715                }
 716        }
 717
 718        if (WARN_ON(!found))
 719                return;
 720
            /* the last slot is now stale; clear it */
 721        nhg->num_nh--;
 722        nhg->nh_entries[nhg->num_nh].nh = NULL;
 723
 724        nh_group_rebalance(nhg);
 725
 726        nexthop_put(nh);
 727
            /* nh_parent is not touched by the shifting above, so @nhge
             * still names the owning group nexthop
             */
 728        if (nlinfo)
 729                nexthop_notify(RTM_NEWNEXTHOP, nhge->nh_parent, nlinfo);
 730}
 731
 732static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh,
 733                                       struct nl_info *nlinfo)
 734{
 735        struct nh_grp_entry *nhge, *tmp;
 736
 737        list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list) {
 738                struct nh_group *nhg;
 739
 740                list_del(&nhge->nh_list);
 741                nhg = rtnl_dereference(nhge->nh_parent->nh_grp);
 742                remove_nh_grp_entry(nhge, nhg, nlinfo);
 743
 744                /* if this group has no more entries then remove it */
 745                if (!nhg->num_nh)
 746                        remove_nexthop(net, nhge->nh_parent, nlinfo);
 747        }
 748}
 749
 750static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo)
 751{
 752        struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp);
 753        int i, num_nh = nhg->num_nh;
 754
 755        for (i = 0; i < num_nh; ++i) {
 756                struct nh_grp_entry *nhge = &nhg->nh_entries[i];
 757
 758                if (WARN_ON(!nhge->nh))
 759                        continue;
 760
 761                list_del(&nhge->nh_list);
 762                nexthop_put(nhge->nh);
 763                nhge->nh = NULL;
 764                nhg->num_nh--;
 765        }
 766}
 767
 768/* not called for nexthop replace */
 769static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
 770{
 771        struct fib6_info *f6i, *tmp;
 772        bool do_flush = false;
 773        struct fib_info *fi;
 774
 775        list_for_each_entry(fi, &nh->fi_list, nh_list) {
 776                fi->fib_flags |= RTNH_F_DEAD;
 777                do_flush = true;
 778        }
 779        if (do_flush)
 780                fib_flush(net);
 781
 782        /* ip6_del_rt removes the entry from this list hence the _safe */
 783        list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) {
 784                /* __ip6_del_rt does a release, so do a hold here */
 785                fib6_info_hold(f6i);
 786                ipv6_stub->ip6_del_rt(net, f6i);
 787        }
 788}
 789
 790static void __remove_nexthop(struct net *net, struct nexthop *nh,
 791                             struct nl_info *nlinfo)
 792{
 793        __remove_nexthop_fib(net, nh);
 794
 795        if (nh->is_group) {
 796                remove_nexthop_group(nh, nlinfo);
 797        } else {
 798                struct nh_info *nhi;
 799
 800                nhi = rtnl_dereference(nh->nh_info);
 801                if (nhi->fib_nhc.nhc_dev)
 802                        hlist_del(&nhi->dev_hash);
 803
 804                remove_nexthop_from_groups(net, nh, nlinfo);
 805        }
 806}
 807
 808static void remove_nexthop(struct net *net, struct nexthop *nh,
 809                           struct nl_info *nlinfo)
 810{
 811        /* remove from the tree */
 812        rb_erase(&nh->rb_node, &net->nexthop.rb_root);
 813
 814        if (nlinfo)
 815                nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo);
 816
 817        __remove_nexthop(net, nh, nlinfo);
 818        nh_base_seq_inc(net);
 819
 820        nexthop_put(nh);
 821}
 822
 823/* if any FIB entries reference this nexthop, any dst entries
 824 * need to be regenerated
 825 */
 826static void nh_rt_cache_flush(struct net *net, struct nexthop *nh)
 827{
 828        struct fib6_info *f6i;
 829
 830        if (!list_empty(&nh->fi_list))
 831                rt_cache_flush(net);
 832
 833        list_for_each_entry(f6i, &nh->f6i_list, nh_list)
 834                ipv6_stub->fib6_update_sernum(net, f6i);
 835}
 836
 837static int replace_nexthop_grp(struct net *net, struct nexthop *old,
 838                               struct nexthop *new,
 839                               struct netlink_ext_ack *extack)
 840{
 841        struct nh_group *oldg, *newg;
 842        int i;
 843
 844        if (!new->is_group) {
 845                NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with a nexthop.");
 846                return -EINVAL;
 847        }
 848
 849        oldg = rtnl_dereference(old->nh_grp);
 850        newg = rtnl_dereference(new->nh_grp);
 851
 852        /* update parents - used by nexthop code for cleanup */
 853        for (i = 0; i < newg->num_nh; i++)
 854                newg->nh_entries[i].nh_parent = old;
 855
 856        rcu_assign_pointer(old->nh_grp, newg);
 857
 858        for (i = 0; i < oldg->num_nh; i++)
 859                oldg->nh_entries[i].nh_parent = new;
 860
 861        rcu_assign_pointer(new->nh_grp, oldg);
 862
 863        return 0;
 864}
 865
 866static int replace_nexthop_single(struct net *net, struct nexthop *old,
 867                                  struct nexthop *new,
 868                                  struct netlink_ext_ack *extack)
 869{
 870        struct nh_info *oldi, *newi;
 871
 872        if (new->is_group) {
 873                NL_SET_ERR_MSG(extack, "Can not replace a nexthop with a nexthop group.");
 874                return -EINVAL;
 875        }
 876
 877        oldi = rtnl_dereference(old->nh_info);
 878        newi = rtnl_dereference(new->nh_info);
 879
 880        newi->nh_parent = old;
 881        oldi->nh_parent = new;
 882
 883        old->protocol = new->protocol;
 884        old->nh_flags = new->nh_flags;
 885
 886        rcu_assign_pointer(old->nh_info, newi);
 887        rcu_assign_pointer(new->nh_info, oldi);
 888
 889        return 0;
 890}
 891
 892static void __nexthop_replace_notify(struct net *net, struct nexthop *nh,
 893                                     struct nl_info *info)
 894{
 895        struct fib6_info *f6i;
 896
 897        if (!list_empty(&nh->fi_list)) {
 898                struct fib_info *fi;
 899
 900                /* expectation is a few fib_info per nexthop and then
 901                 * a lot of routes per fib_info. So mark the fib_info
 902                 * and then walk the fib tables once
 903                 */
 904                list_for_each_entry(fi, &nh->fi_list, nh_list)
 905                        fi->nh_updated = true;
 906
 907                fib_info_notify_update(net, info);
 908
 909                list_for_each_entry(fi, &nh->fi_list, nh_list)
 910                        fi->nh_updated = false;
 911        }
 912
 913        list_for_each_entry(f6i, &nh->f6i_list, nh_list)
 914                ipv6_stub->fib6_rt_update(net, f6i, info);
 915}
 916
 917/* send RTM_NEWROUTE with REPLACE flag set for all FIB entries
 918 * linked to this nexthop and for all groups that the nexthop
 919 * is a member of
 920 */
 921static void nexthop_replace_notify(struct net *net, struct nexthop *nh,
 922                                   struct nl_info *info)
 923{
 924        struct nh_grp_entry *nhge;
 925
 926        __nexthop_replace_notify(net, nh, info);
 927
 928        list_for_each_entry(nhge, &nh->grp_list, nh_list)
 929                __nexthop_replace_notify(net, nhge->nh_parent, info);
 930}
 931
 932static int replace_nexthop(struct net *net, struct nexthop *old,
 933                           struct nexthop *new, struct netlink_ext_ack *extack)
 934{
 935        bool new_is_reject = false;
 936        struct nh_grp_entry *nhge;
 937        int err;
 938
 939        /* check that existing FIB entries are ok with the
 940         * new nexthop definition
 941         */
 942        err = fib_check_nh_list(old, new, extack);
 943        if (err)
 944                return err;
 945
 946        err = fib6_check_nh_list(old, new, extack);
 947        if (err)
 948                return err;
 949
 950        if (!new->is_group) {
 951                struct nh_info *nhi = rtnl_dereference(new->nh_info);
 952
 953                new_is_reject = nhi->reject_nh;
 954        }
 955
 956        list_for_each_entry(nhge, &old->grp_list, nh_list) {
 957                /* if new nexthop is a blackhole, any groups using this
 958                 * nexthop cannot have more than 1 path
 959                 */
 960                if (new_is_reject &&
 961                    nexthop_num_path(nhge->nh_parent) > 1) {
 962                        NL_SET_ERR_MSG(extack, "Blackhole nexthop can not be a member of a group with more than one path");
 963                        return -EINVAL;
 964                }
 965
 966                err = fib_check_nh_list(nhge->nh_parent, new, extack);
 967                if (err)
 968                        return err;
 969
 970                err = fib6_check_nh_list(nhge->nh_parent, new, extack);
 971                if (err)
 972                        return err;
 973        }
 974
 975        if (old->is_group)
 976                err = replace_nexthop_grp(net, old, new, extack);
 977        else
 978                err = replace_nexthop_single(net, old, new, extack);
 979
 980        if (!err) {
 981                nh_rt_cache_flush(net, old);
 982
 983                __remove_nexthop(net, new, NULL);
 984                nexthop_put(new);
 985        }
 986
 987        return err;
 988}
 989
 990/* called with rtnl_lock held */
 991static int insert_nexthop(struct net *net, struct nexthop *new_nh,
 992                          struct nh_config *cfg, struct netlink_ext_ack *extack)
 993{
 994        struct rb_node **pp, *parent = NULL, *next;
 995        struct rb_root *root = &net->nexthop.rb_root;
 996        bool replace = !!(cfg->nlflags & NLM_F_REPLACE);
 997        bool create = !!(cfg->nlflags & NLM_F_CREATE);
 998        u32 new_id = new_nh->id;
 999        int replace_notify = 0;
1000        int rc = -EEXIST;
1001
1002        pp = &root->rb_node;
1003        while (1) {
1004                struct nexthop *nh;
1005
1006                next = rtnl_dereference(*pp);
1007                if (!next)
1008                        break;
1009
1010                parent = next;
1011
1012                nh = rb_entry(parent, struct nexthop, rb_node);
1013                if (new_id < nh->id) {
1014                        pp = &next->rb_left;
1015                } else if (new_id > nh->id) {
1016                        pp = &next->rb_right;
1017                } else if (replace) {
1018                        rc = replace_nexthop(net, nh, new_nh, extack);
1019                        if (!rc) {
1020                                new_nh = nh; /* send notification with old nh */
1021                                replace_notify = 1;
1022                        }
1023                        goto out;
1024                } else {
1025                        /* id already exists and not a replace */
1026                        goto out;
1027                }
1028        }
1029
1030        if (replace && !create) {
1031                NL_SET_ERR_MSG(extack, "Replace specified without create and no entry exists");
1032                rc = -ENOENT;
1033                goto out;
1034        }
1035
1036        rb_link_node_rcu(&new_nh->rb_node, parent, pp);
1037        rb_insert_color(&new_nh->rb_node, root);
1038        rc = 0;
1039out:
1040        if (!rc) {
1041                nh_base_seq_inc(net);
1042                nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo);
1043                if (replace_notify)
1044                        nexthop_replace_notify(net, new_nh, &cfg->nlinfo);
1045        }
1046
1047        return rc;
1048}
1049
1050/* rtnl */
1051/* remove all nexthops tied to a device being deleted */
1052static void nexthop_flush_dev(struct net_device *dev)
1053{
1054        unsigned int hash = nh_dev_hashfn(dev->ifindex);
1055        struct net *net = dev_net(dev);
1056        struct hlist_head *head = &net->nexthop.devhash[hash];
1057        struct hlist_node *n;
1058        struct nh_info *nhi;
1059
1060        hlist_for_each_entry_safe(nhi, n, head, dev_hash) {
1061                if (nhi->fib_nhc.nhc_dev != dev)
1062                        continue;
1063
1064                remove_nexthop(net, nhi->nh_parent, NULL);
1065        }
1066}
1067
1068/* rtnl; called when net namespace is deleted */
1069static void flush_all_nexthops(struct net *net)
1070{
1071        struct rb_root *root = &net->nexthop.rb_root;
1072        struct rb_node *node;
1073        struct nexthop *nh;
1074
1075        while ((node = rb_first(root))) {
1076                nh = rb_entry(node, struct nexthop, rb_node);
1077                remove_nexthop(net, nh, NULL);
1078                cond_resched();
1079        }
1080}
1081
1082static struct nexthop *nexthop_create_group(struct net *net,
1083                                            struct nh_config *cfg)
1084{
1085        struct nlattr *grps_attr = cfg->nh_grp;
1086        struct nexthop_grp *entry = nla_data(grps_attr);
1087        struct nh_group *nhg;
1088        struct nexthop *nh;
1089        int i;
1090
1091        nh = nexthop_alloc();
1092        if (!nh)
1093                return ERR_PTR(-ENOMEM);
1094
1095        nh->is_group = 1;
1096
1097        nhg = nexthop_grp_alloc(nla_len(grps_attr) / sizeof(*entry));
1098        if (!nhg) {
1099                kfree(nh);
1100                return ERR_PTR(-ENOMEM);
1101        }
1102
1103        for (i = 0; i < nhg->num_nh; ++i) {
1104                struct nexthop *nhe;
1105                struct nh_info *nhi;
1106
1107                nhe = nexthop_find_by_id(net, entry[i].id);
1108                if (!nexthop_get(nhe))
1109                        goto out_no_nh;
1110
1111                nhi = rtnl_dereference(nhe->nh_info);
1112                if (nhi->family == AF_INET)
1113                        nhg->has_v4 = true;
1114
1115                nhg->nh_entries[i].nh = nhe;
1116                nhg->nh_entries[i].weight = entry[i].weight + 1;
1117                list_add(&nhg->nh_entries[i].nh_list, &nhe->grp_list);
1118                nhg->nh_entries[i].nh_parent = nh;
1119        }
1120
1121        if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) {
1122                nhg->mpath = 1;
1123                nh_group_rebalance(nhg);
1124        }
1125
1126        rcu_assign_pointer(nh->nh_grp, nhg);
1127
1128        return nh;
1129
1130out_no_nh:
1131        for (; i >= 0; --i)
1132                nexthop_put(nhg->nh_entries[i].nh);
1133
1134        kfree(nhg);
1135        kfree(nh);
1136
1137        return ERR_PTR(-ENOENT);
1138}
1139
1140static int nh_create_ipv4(struct net *net, struct nexthop *nh,
1141                          struct nh_info *nhi, struct nh_config *cfg,
1142                          struct netlink_ext_ack *extack)
1143{
1144        struct fib_nh *fib_nh = &nhi->fib_nh;
1145        struct fib_config fib_cfg = {
1146                .fc_oif   = cfg->nh_ifindex,
1147                .fc_gw4   = cfg->gw.ipv4,
1148                .fc_gw_family = cfg->gw.ipv4 ? AF_INET : 0,
1149                .fc_flags = cfg->nh_flags,
1150                .fc_encap = cfg->nh_encap,
1151                .fc_encap_type = cfg->nh_encap_type,
1152        };
1153        u32 tb_id = l3mdev_fib_table(cfg->dev);
1154        int err = -EINVAL;
1155
1156        err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack);
1157        if (err) {
1158                fib_nh_release(net, fib_nh);
1159                goto out;
1160        }
1161
1162        /* sets nh_dev if successful */
1163        err = fib_check_nh(net, fib_nh, tb_id, 0, extack);
1164        if (!err) {
1165                nh->nh_flags = fib_nh->fib_nh_flags;
1166                fib_info_update_nhc_saddr(net, &fib_nh->nh_common,
1167                                          fib_nh->fib_nh_scope);
1168        } else {
1169                fib_nh_release(net, fib_nh);
1170        }
1171out:
1172        return err;
1173}
1174
1175static int nh_create_ipv6(struct net *net,  struct nexthop *nh,
1176                          struct nh_info *nhi, struct nh_config *cfg,
1177                          struct netlink_ext_ack *extack)
1178{
1179        struct fib6_nh *fib6_nh = &nhi->fib6_nh;
1180        struct fib6_config fib6_cfg = {
1181                .fc_table = l3mdev_fib_table(cfg->dev),
1182                .fc_ifindex = cfg->nh_ifindex,
1183                .fc_gateway = cfg->gw.ipv6,
1184                .fc_flags = cfg->nh_flags,
1185                .fc_encap = cfg->nh_encap,
1186                .fc_encap_type = cfg->nh_encap_type,
1187        };
1188        int err;
1189
1190        if (!ipv6_addr_any(&cfg->gw.ipv6))
1191                fib6_cfg.fc_flags |= RTF_GATEWAY;
1192
1193        /* sets nh_dev if successful */
1194        err = ipv6_stub->fib6_nh_init(net, fib6_nh, &fib6_cfg, GFP_KERNEL,
1195                                      extack);
1196        if (err)
1197                ipv6_stub->fib6_nh_release(fib6_nh);
1198        else
1199                nh->nh_flags = fib6_nh->fib_nh_flags;
1200
1201        return err;
1202}
1203
1204static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg,
1205                                      struct netlink_ext_ack *extack)
1206{
1207        struct nh_info *nhi;
1208        struct nexthop *nh;
1209        int err = 0;
1210
1211        nh = nexthop_alloc();
1212        if (!nh)
1213                return ERR_PTR(-ENOMEM);
1214
1215        nhi = kzalloc(sizeof(*nhi), GFP_KERNEL);
1216        if (!nhi) {
1217                kfree(nh);
1218                return ERR_PTR(-ENOMEM);
1219        }
1220
1221        nh->nh_flags = cfg->nh_flags;
1222        nh->net = net;
1223
1224        nhi->nh_parent = nh;
1225        nhi->family = cfg->nh_family;
1226        nhi->fib_nhc.nhc_scope = RT_SCOPE_LINK;
1227
1228        if (cfg->nh_blackhole) {
1229                nhi->reject_nh = 1;
1230                cfg->nh_ifindex = net->loopback_dev->ifindex;
1231        }
1232
1233        switch (cfg->nh_family) {
1234        case AF_INET:
1235                err = nh_create_ipv4(net, nh, nhi, cfg, extack);
1236                break;
1237        case AF_INET6:
1238                err = nh_create_ipv6(net, nh, nhi, cfg, extack);
1239                break;
1240        }
1241
1242        if (err) {
1243                kfree(nhi);
1244                kfree(nh);
1245                return ERR_PTR(err);
1246        }
1247
1248        /* add the entry to the device based hash */
1249        nexthop_devhash_add(net, nhi);
1250
1251        rcu_assign_pointer(nh->nh_info, nhi);
1252
1253        return nh;
1254}
1255
1256/* called with rtnl lock held */
1257static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg,
1258                                   struct netlink_ext_ack *extack)
1259{
1260        struct nexthop *nh;
1261        int err;
1262
1263        if (cfg->nlflags & NLM_F_REPLACE && !cfg->nh_id) {
1264                NL_SET_ERR_MSG(extack, "Replace requires nexthop id");
1265                return ERR_PTR(-EINVAL);
1266        }
1267
1268        if (!cfg->nh_id) {
1269                cfg->nh_id = nh_find_unused_id(net);
1270                if (!cfg->nh_id) {
1271                        NL_SET_ERR_MSG(extack, "No unused id");
1272                        return ERR_PTR(-EINVAL);
1273                }
1274        }
1275
1276        if (cfg->nh_grp)
1277                nh = nexthop_create_group(net, cfg);
1278        else
1279                nh = nexthop_create(net, cfg, extack);
1280
1281        if (IS_ERR(nh))
1282                return nh;
1283
1284        refcount_set(&nh->refcnt, 1);
1285        nh->id = cfg->nh_id;
1286        nh->protocol = cfg->nh_protocol;
1287        nh->net = net;
1288
1289        err = insert_nexthop(net, nh, cfg, extack);
1290        if (err) {
1291                __remove_nexthop(net, nh, NULL);
1292                nexthop_put(nh);
1293                nh = ERR_PTR(err);
1294        }
1295
1296        return nh;
1297}
1298
1299static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
1300                            struct nlmsghdr *nlh, struct nh_config *cfg,
1301                            struct netlink_ext_ack *extack)
1302{
1303        struct nhmsg *nhm = nlmsg_data(nlh);
1304        struct nlattr *tb[NHA_MAX + 1];
1305        int err;
1306
1307        err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy,
1308                          extack);
1309        if (err < 0)
1310                return err;
1311
1312        err = -EINVAL;
1313        if (nhm->resvd || nhm->nh_scope) {
1314                NL_SET_ERR_MSG(extack, "Invalid values in ancillary header");
1315                goto out;
1316        }
1317        if (nhm->nh_flags & ~NEXTHOP_VALID_USER_FLAGS) {
1318                NL_SET_ERR_MSG(extack, "Invalid nexthop flags in ancillary header");
1319                goto out;
1320        }
1321
1322        switch (nhm->nh_family) {
1323        case AF_INET:
1324        case AF_INET6:
1325                break;
1326        case AF_UNSPEC:
1327                if (tb[NHA_GROUP])
1328                        break;
1329                /* fallthrough */
1330        default:
1331                NL_SET_ERR_MSG(extack, "Invalid address family");
1332                goto out;
1333        }
1334
1335        if (tb[NHA_GROUPS] || tb[NHA_MASTER]) {
1336                NL_SET_ERR_MSG(extack, "Invalid attributes in request");
1337                goto out;
1338        }
1339
1340        memset(cfg, 0, sizeof(*cfg));
1341        cfg->nlflags = nlh->nlmsg_flags;
1342        cfg->nlinfo.portid = NETLINK_CB(skb).portid;
1343        cfg->nlinfo.nlh = nlh;
1344        cfg->nlinfo.nl_net = net;
1345
1346        cfg->nh_family = nhm->nh_family;
1347        cfg->nh_protocol = nhm->nh_protocol;
1348        cfg->nh_flags = nhm->nh_flags;
1349
1350        if (tb[NHA_ID])
1351                cfg->nh_id = nla_get_u32(tb[NHA_ID]);
1352
1353        if (tb[NHA_GROUP]) {
1354                if (nhm->nh_family != AF_UNSPEC) {
1355                        NL_SET_ERR_MSG(extack, "Invalid family for group");
1356                        goto out;
1357                }
1358                cfg->nh_grp = tb[NHA_GROUP];
1359
1360                cfg->nh_grp_type = NEXTHOP_GRP_TYPE_MPATH;
1361                if (tb[NHA_GROUP_TYPE])
1362                        cfg->nh_grp_type = nla_get_u16(tb[NHA_GROUP_TYPE]);
1363
1364                if (cfg->nh_grp_type > NEXTHOP_GRP_TYPE_MAX) {
1365                        NL_SET_ERR_MSG(extack, "Invalid group type");
1366                        goto out;
1367                }
1368                err = nh_check_attr_group(net, tb, extack);
1369
1370                /* no other attributes should be set */
1371                goto out;
1372        }
1373
1374        if (tb[NHA_BLACKHOLE]) {
1375                if (tb[NHA_GATEWAY] || tb[NHA_OIF] ||
1376                    tb[NHA_ENCAP]   || tb[NHA_ENCAP_TYPE]) {
1377                        NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway or oif");
1378                        goto out;
1379                }
1380
1381                cfg->nh_blackhole = 1;
1382                err = 0;
1383                goto out;
1384        }
1385
1386        if (!tb[NHA_OIF]) {
1387                NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole nexthops");
1388                goto out;
1389        }
1390
1391        cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]);
1392        if (cfg->nh_ifindex)
1393                cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex);
1394
1395        if (!cfg->dev) {
1396                NL_SET_ERR_MSG(extack, "Invalid device index");
1397                goto out;
1398        } else if (!(cfg->dev->flags & IFF_UP)) {
1399                NL_SET_ERR_MSG(extack, "Nexthop device is not up");
1400                err = -ENETDOWN;
1401                goto out;
1402        } else if (!netif_carrier_ok(cfg->dev)) {
1403                NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down");
1404                err = -ENETDOWN;
1405                goto out;
1406        }
1407
1408        err = -EINVAL;
1409        if (tb[NHA_GATEWAY]) {
1410                struct nlattr *gwa = tb[NHA_GATEWAY];
1411
1412                switch (cfg->nh_family) {
1413                case AF_INET:
1414                        if (nla_len(gwa) != sizeof(u32)) {
1415                                NL_SET_ERR_MSG(extack, "Invalid gateway");
1416                                goto out;
1417                        }
1418                        cfg->gw.ipv4 = nla_get_be32(gwa);
1419                        break;
1420                case AF_INET6:
1421                        if (nla_len(gwa) != sizeof(struct in6_addr)) {
1422                                NL_SET_ERR_MSG(extack, "Invalid gateway");
1423                                goto out;
1424                        }
1425                        cfg->gw.ipv6 = nla_get_in6_addr(gwa);
1426                        break;
1427                default:
1428                        NL_SET_ERR_MSG(extack,
1429                                       "Unknown address family for gateway");
1430                        goto out;
1431                }
1432        } else {
1433                /* device only nexthop (no gateway) */
1434                if (cfg->nh_flags & RTNH_F_ONLINK) {
1435                        NL_SET_ERR_MSG(extack,
1436                                       "ONLINK flag can not be set for nexthop without a gateway");
1437                        goto out;
1438                }
1439        }
1440
1441        if (tb[NHA_ENCAP]) {
1442                cfg->nh_encap = tb[NHA_ENCAP];
1443
1444                if (!tb[NHA_ENCAP_TYPE]) {
1445                        NL_SET_ERR_MSG(extack, "LWT encapsulation type is missing");
1446                        goto out;
1447                }
1448
1449                cfg->nh_encap_type = nla_get_u16(tb[NHA_ENCAP_TYPE]);
1450                err = lwtunnel_valid_encap_type(cfg->nh_encap_type, extack);
1451                if (err < 0)
1452                        goto out;
1453
1454        } else if (tb[NHA_ENCAP_TYPE]) {
1455                NL_SET_ERR_MSG(extack, "LWT encapsulation attribute is missing");
1456                goto out;
1457        }
1458
1459
1460        err = 0;
1461out:
1462        return err;
1463}
1464
1465/* rtnl */
1466static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
1467                           struct netlink_ext_ack *extack)
1468{
1469        struct net *net = sock_net(skb->sk);
1470        struct nh_config cfg;
1471        struct nexthop *nh;
1472        int err;
1473
1474        err = rtm_to_nh_config(net, skb, nlh, &cfg, extack);
1475        if (!err) {
1476                nh = nexthop_add(net, &cfg, extack);
1477                if (IS_ERR(nh))
1478                        err = PTR_ERR(nh);
1479        }
1480
1481        return err;
1482}
1483
1484static int nh_valid_get_del_req(struct nlmsghdr *nlh, u32 *id,
1485                                struct netlink_ext_ack *extack)
1486{
1487        struct nhmsg *nhm = nlmsg_data(nlh);
1488        struct nlattr *tb[NHA_MAX + 1];
1489        int err, i;
1490
1491        err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy,
1492                          extack);
1493        if (err < 0)
1494                return err;
1495
1496        err = -EINVAL;
1497        for (i = 0; i < __NHA_MAX; ++i) {
1498                if (!tb[i])
1499                        continue;
1500
1501                switch (i) {
1502                case NHA_ID:
1503                        break;
1504                default:
1505                        NL_SET_ERR_MSG_ATTR(extack, tb[i],
1506                                            "Unexpected attribute in request");
1507                        goto out;
1508                }
1509        }
1510        if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) {
1511                NL_SET_ERR_MSG(extack, "Invalid values in header");
1512                goto out;
1513        }
1514
1515        if (!tb[NHA_ID]) {
1516                NL_SET_ERR_MSG(extack, "Nexthop id is missing");
1517                goto out;
1518        }
1519
1520        *id = nla_get_u32(tb[NHA_ID]);
1521        if (!(*id))
1522                NL_SET_ERR_MSG(extack, "Invalid nexthop id");
1523        else
1524                err = 0;
1525out:
1526        return err;
1527}
1528
1529/* rtnl */
1530static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
1531                           struct netlink_ext_ack *extack)
1532{
1533        struct net *net = sock_net(skb->sk);
1534        struct nl_info nlinfo = {
1535                .nlh = nlh,
1536                .nl_net = net,
1537                .portid = NETLINK_CB(skb).portid,
1538        };
1539        struct nexthop *nh;
1540        int err;
1541        u32 id;
1542
1543        err = nh_valid_get_del_req(nlh, &id, extack);
1544        if (err)
1545                return err;
1546
1547        nh = nexthop_find_by_id(net, id);
1548        if (!nh)
1549                return -ENOENT;
1550
1551        remove_nexthop(net, nh, &nlinfo);
1552
1553        return 0;
1554}
1555
1556/* rtnl */
1557static int rtm_get_nexthop(struct sk_buff *in_skb, struct nlmsghdr *nlh,
1558                           struct netlink_ext_ack *extack)
1559{
1560        struct net *net = sock_net(in_skb->sk);
1561        struct sk_buff *skb = NULL;
1562        struct nexthop *nh;
1563        int err;
1564        u32 id;
1565
1566        err = nh_valid_get_del_req(nlh, &id, extack);
1567        if (err)
1568                return err;
1569
1570        err = -ENOBUFS;
1571        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1572        if (!skb)
1573                goto out;
1574
1575        err = -ENOENT;
1576        nh = nexthop_find_by_id(net, id);
1577        if (!nh)
1578                goto errout_free;
1579
1580        err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP, NETLINK_CB(in_skb).portid,
1581                           nlh->nlmsg_seq, 0);
1582        if (err < 0) {
1583                WARN_ON(err == -EMSGSIZE);
1584                goto errout_free;
1585        }
1586
1587        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
1588out:
1589        return err;
1590errout_free:
1591        kfree_skb(skb);
1592        goto out;
1593}
1594
1595static bool nh_dump_filtered(struct nexthop *nh, int dev_idx, int master_idx,
1596                             bool group_filter, u8 family)
1597{
1598        const struct net_device *dev;
1599        const struct nh_info *nhi;
1600
1601        if (group_filter && !nh->is_group)
1602                return true;
1603
1604        if (!dev_idx && !master_idx && !family)
1605                return false;
1606
1607        if (nh->is_group)
1608                return true;
1609
1610        nhi = rtnl_dereference(nh->nh_info);
1611        if (family && nhi->family != family)
1612                return true;
1613
1614        dev = nhi->fib_nhc.nhc_dev;
1615        if (dev_idx && (!dev || dev->ifindex != dev_idx))
1616                return true;
1617
1618        if (master_idx) {
1619                struct net_device *master;
1620
1621                if (!dev)
1622                        return true;
1623
1624                master = netdev_master_upper_dev_get((struct net_device *)dev);
1625                if (!master || master->ifindex != master_idx)
1626                        return true;
1627        }
1628
1629        return false;
1630}
1631
1632static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx,
1633                             int *master_idx, bool *group_filter,
1634                             struct netlink_callback *cb)
1635{
1636        struct netlink_ext_ack *extack = cb->extack;
1637        struct nlattr *tb[NHA_MAX + 1];
1638        struct nhmsg *nhm;
1639        int err, i;
1640        u32 idx;
1641
1642        err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy,
1643                          NULL);
1644        if (err < 0)
1645                return err;
1646
1647        for (i = 0; i <= NHA_MAX; ++i) {
1648                if (!tb[i])
1649                        continue;
1650
1651                switch (i) {
1652                case NHA_OIF:
1653                        idx = nla_get_u32(tb[i]);
1654                        if (idx > INT_MAX) {
1655                                NL_SET_ERR_MSG(extack, "Invalid device index");
1656                                return -EINVAL;
1657                        }
1658                        *dev_idx = idx;
1659                        break;
1660                case NHA_MASTER:
1661                        idx = nla_get_u32(tb[i]);
1662                        if (idx > INT_MAX) {
1663                                NL_SET_ERR_MSG(extack, "Invalid master device index");
1664                                return -EINVAL;
1665                        }
1666                        *master_idx = idx;
1667                        break;
1668                case NHA_GROUPS:
1669                        *group_filter = true;
1670                        break;
1671                default:
1672                        NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request");
1673                        return -EINVAL;
1674                }
1675        }
1676
1677        nhm = nlmsg_data(nlh);
1678        if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) {
1679                NL_SET_ERR_MSG(extack, "Invalid values in header for nexthop dump request");
1680                return -EINVAL;
1681        }
1682
1683        return 0;
1684}
1685
1686/* rtnl */
1687static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
1688{
1689        struct nhmsg *nhm = nlmsg_data(cb->nlh);
1690        int dev_filter_idx = 0, master_idx = 0;
1691        struct net *net = sock_net(skb->sk);
1692        struct rb_root *root = &net->nexthop.rb_root;
1693        bool group_filter = false;
1694        struct rb_node *node;
1695        int idx = 0, s_idx;
1696        int err;
1697
1698        err = nh_valid_dump_req(cb->nlh, &dev_filter_idx, &master_idx,
1699                                &group_filter, cb);
1700        if (err < 0)
1701                return err;
1702
1703        s_idx = cb->args[0];
1704        for (node = rb_first(root); node; node = rb_next(node)) {
1705                struct nexthop *nh;
1706
1707                if (idx < s_idx)
1708                        goto cont;
1709
1710                nh = rb_entry(node, struct nexthop, rb_node);
1711                if (nh_dump_filtered(nh, dev_filter_idx, master_idx,
1712                                     group_filter, nhm->nh_family))
1713                        goto cont;
1714
1715                err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP,
1716                                   NETLINK_CB(cb->skb).portid,
1717                                   cb->nlh->nlmsg_seq, NLM_F_MULTI);
1718                if (err < 0) {
1719                        if (likely(skb->len))
1720                                goto out;
1721
1722                        goto out_err;
1723                }
1724cont:
1725                idx++;
1726        }
1727
1728out:
1729        err = skb->len;
1730out_err:
1731        cb->args[0] = idx;
1732        cb->seq = net->nexthop.seq;
1733        nl_dump_check_consistent(cb, nlmsg_hdr(skb));
1734
1735        return err;
1736}
1737
1738static void nexthop_sync_mtu(struct net_device *dev, u32 orig_mtu)
1739{
1740        unsigned int hash = nh_dev_hashfn(dev->ifindex);
1741        struct net *net = dev_net(dev);
1742        struct hlist_head *head = &net->nexthop.devhash[hash];
1743        struct hlist_node *n;
1744        struct nh_info *nhi;
1745
1746        hlist_for_each_entry_safe(nhi, n, head, dev_hash) {
1747                if (nhi->fib_nhc.nhc_dev == dev) {
1748                        if (nhi->family == AF_INET)
1749                                fib_nhc_update_mtu(&nhi->fib_nhc, dev->mtu,
1750                                                   orig_mtu);
1751                }
1752        }
1753}
1754
1755/* rtnl */
1756static int nh_netdev_event(struct notifier_block *this,
1757                           unsigned long event, void *ptr)
1758{
1759        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1760        struct netdev_notifier_info_ext *info_ext;
1761
1762        switch (event) {
1763        case NETDEV_DOWN:
1764        case NETDEV_UNREGISTER:
1765                nexthop_flush_dev(dev);
1766                break;
1767        case NETDEV_CHANGE:
1768                if (!(dev_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP)))
1769                        nexthop_flush_dev(dev);
1770                break;
1771        case NETDEV_CHANGEMTU:
1772                info_ext = ptr;
1773                nexthop_sync_mtu(dev, info_ext->ext.mtu);
1774                rt_cache_flush(dev_net(dev));
1775                break;
1776        }
1777        return NOTIFY_DONE;
1778}
1779
1780static struct notifier_block nh_netdev_notifier = {
1781        .notifier_call = nh_netdev_event,
1782};
1783
1784static void __net_exit nexthop_net_exit(struct net *net)
1785{
1786        rtnl_lock();
1787        flush_all_nexthops(net);
1788        rtnl_unlock();
1789        kfree(net->nexthop.devhash);
1790}
1791
1792static int __net_init nexthop_net_init(struct net *net)
1793{
1794        size_t sz = sizeof(struct hlist_head) * NH_DEV_HASHSIZE;
1795
1796        net->nexthop.rb_root = RB_ROOT;
1797        net->nexthop.devhash = kzalloc(sz, GFP_KERNEL);
1798        if (!net->nexthop.devhash)
1799                return -ENOMEM;
1800
1801        return 0;
1802}
1803
1804static struct pernet_operations nexthop_net_ops = {
1805        .init = nexthop_net_init,
1806        .exit = nexthop_net_exit,
1807};
1808
1809static int __init nexthop_init(void)
1810{
1811        register_pernet_subsys(&nexthop_net_ops);
1812
1813        register_netdevice_notifier(&nh_netdev_notifier);
1814
1815        rtnl_register(PF_UNSPEC, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0);
1816        rtnl_register(PF_UNSPEC, RTM_DELNEXTHOP, rtm_del_nexthop, NULL, 0);
1817        rtnl_register(PF_UNSPEC, RTM_GETNEXTHOP, rtm_get_nexthop,
1818                      rtm_dump_nexthop, 0);
1819
1820        rtnl_register(PF_INET, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0);
1821        rtnl_register(PF_INET, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0);
1822
1823        rtnl_register(PF_INET6, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0);
1824        rtnl_register(PF_INET6, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0);
1825
1826        return 0;
1827}
1828subsys_initcall(nexthop_init);
1829