linux/net/ipv4/nexthop.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/* Generic nexthop implementation
   3 *
   4 * Copyright (c) 2017-19 Cumulus Networks
   5 * Copyright (c) 2017-19 David Ahern <dsa@cumulusnetworks.com>
   6 */
   7
   8#include <linux/nexthop.h>
   9#include <linux/rtnetlink.h>
  10#include <linux/slab.h>
  11#include <net/arp.h>
  12#include <net/ipv6_stubs.h>
  13#include <net/lwtunnel.h>
  14#include <net/ndisc.h>
  15#include <net/nexthop.h>
  16#include <net/route.h>
  17#include <net/sock.h>
  18
/* Forward declaration: remove_nh_grp_entry() calls remove_nexthop()
 * before the latter is defined further down in this file.
 */
static void remove_nexthop(struct net *net, struct nexthop *nh,
                           struct nl_info *nlinfo);

/* The per-netns device hash (net->nexthop.devhash) is indexed by
 * nh_dev_hashfn(), which yields values in [0, NH_DEV_HASHSIZE).
 */
#define NH_DEV_HASHBITS  8
#define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS)
  24
/* Netlink attribute policy for nexthop messages, indexed by NHA_* type.
 *
 * NHA_GROUP is declared as NLA_BINARY here; its length and contents
 * are checked separately in nh_check_attr_group().
 */
static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = {
        [NHA_ID]                = { .type = NLA_U32 },
        [NHA_GROUP]             = { .type = NLA_BINARY },
        [NHA_GROUP_TYPE]        = { .type = NLA_U16 },
        [NHA_BLACKHOLE]         = { .type = NLA_FLAG },
        [NHA_OIF]               = { .type = NLA_U32 },
        [NHA_GATEWAY]           = { .type = NLA_BINARY },
        [NHA_ENCAP_TYPE]        = { .type = NLA_U16 },
        [NHA_ENCAP]             = { .type = NLA_NESTED },
        [NHA_GROUPS]            = { .type = NLA_FLAG },
        [NHA_MASTER]            = { .type = NLA_U32 },
        [NHA_FDB]               = { .type = NLA_FLAG },
};
  38
  39static int call_nexthop_notifiers(struct net *net,
  40                                  enum nexthop_event_type event_type,
  41                                  struct nexthop *nh)
  42{
  43        int err;
  44
  45        err = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
  46                                           event_type, nh);
  47        return notifier_to_errno(err);
  48}
  49
  50static unsigned int nh_dev_hashfn(unsigned int val)
  51{
  52        unsigned int mask = NH_DEV_HASHSIZE - 1;
  53
  54        return (val ^
  55                (val >> NH_DEV_HASHBITS) ^
  56                (val >> (NH_DEV_HASHBITS * 2))) & mask;
  57}
  58
  59static void nexthop_devhash_add(struct net *net, struct nh_info *nhi)
  60{
  61        struct net_device *dev = nhi->fib_nhc.nhc_dev;
  62        struct hlist_head *head;
  63        unsigned int hash;
  64
  65        WARN_ON(!dev);
  66
  67        hash = nh_dev_hashfn(dev->ifindex);
  68        head = &net->nexthop.devhash[hash];
  69        hlist_add_head(&nhi->dev_hash, head);
  70}
  71
  72static void nexthop_free_mpath(struct nexthop *nh)
  73{
  74        struct nh_group *nhg;
  75        int i;
  76
  77        nhg = rcu_dereference_raw(nh->nh_grp);
  78        for (i = 0; i < nhg->num_nh; ++i) {
  79                struct nh_grp_entry *nhge = &nhg->nh_entries[i];
  80
  81                WARN_ON(!list_empty(&nhge->nh_list));
  82                nexthop_put(nhge->nh);
  83        }
  84
  85        WARN_ON(nhg->spare == nhg);
  86
  87        kfree(nhg->spare);
  88        kfree(nhg);
  89}
  90
  91static void nexthop_free_single(struct nexthop *nh)
  92{
  93        struct nh_info *nhi;
  94
  95        nhi = rcu_dereference_raw(nh->nh_info);
  96        switch (nhi->family) {
  97        case AF_INET:
  98                fib_nh_release(nh->net, &nhi->fib_nh);
  99                break;
 100        case AF_INET6:
 101                ipv6_stub->fib6_nh_release(&nhi->fib6_nh);
 102                break;
 103        }
 104        kfree(nhi);
 105}
 106
 107void nexthop_free_rcu(struct rcu_head *head)
 108{
 109        struct nexthop *nh = container_of(head, struct nexthop, rcu);
 110
 111        if (nh->is_group)
 112                nexthop_free_mpath(nh);
 113        else
 114                nexthop_free_single(nh);
 115
 116        kfree(nh);
 117}
 118EXPORT_SYMBOL_GPL(nexthop_free_rcu);
 119
 120static struct nexthop *nexthop_alloc(void)
 121{
 122        struct nexthop *nh;
 123
 124        nh = kzalloc(sizeof(struct nexthop), GFP_KERNEL);
 125        if (nh) {
 126                INIT_LIST_HEAD(&nh->fi_list);
 127                INIT_LIST_HEAD(&nh->f6i_list);
 128                INIT_LIST_HEAD(&nh->grp_list);
 129                INIT_LIST_HEAD(&nh->fdb_list);
 130        }
 131        return nh;
 132}
 133
 134static struct nh_group *nexthop_grp_alloc(u16 num_nh)
 135{
 136        struct nh_group *nhg;
 137
 138        nhg = kzalloc(struct_size(nhg, nh_entries, num_nh), GFP_KERNEL);
 139        if (nhg)
 140                nhg->num_nh = num_nh;
 141
 142        return nhg;
 143}
 144
 145static void nh_base_seq_inc(struct net *net)
 146{
 147        while (++net->nexthop.seq == 0)
 148                ;
 149}
 150
 151/* no reference taken; rcu lock or rtnl must be held */
 152struct nexthop *nexthop_find_by_id(struct net *net, u32 id)
 153{
 154        struct rb_node **pp, *parent = NULL, *next;
 155
 156        pp = &net->nexthop.rb_root.rb_node;
 157        while (1) {
 158                struct nexthop *nh;
 159
 160                next = rcu_dereference_raw(*pp);
 161                if (!next)
 162                        break;
 163                parent = next;
 164
 165                nh = rb_entry(parent, struct nexthop, rb_node);
 166                if (id < nh->id)
 167                        pp = &next->rb_left;
 168                else if (id > nh->id)
 169                        pp = &next->rb_right;
 170                else
 171                        return nh;
 172        }
 173        return NULL;
 174}
 175EXPORT_SYMBOL_GPL(nexthop_find_by_id);
 176
 177/* used for auto id allocation; called with rtnl held */
 178static u32 nh_find_unused_id(struct net *net)
 179{
 180        u32 id_start = net->nexthop.last_id_allocated;
 181
 182        while (1) {
 183                net->nexthop.last_id_allocated++;
 184                if (net->nexthop.last_id_allocated == id_start)
 185                        break;
 186
 187                if (!nexthop_find_by_id(net, net->nexthop.last_id_allocated))
 188                        return net->nexthop.last_id_allocated;
 189        }
 190        return 0;
 191}
 192
 193static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nhg)
 194{
 195        struct nexthop_grp *p;
 196        size_t len = nhg->num_nh * sizeof(*p);
 197        struct nlattr *nla;
 198        u16 group_type = 0;
 199        int i;
 200
 201        if (nhg->mpath)
 202                group_type = NEXTHOP_GRP_TYPE_MPATH;
 203
 204        if (nla_put_u16(skb, NHA_GROUP_TYPE, group_type))
 205                goto nla_put_failure;
 206
 207        nla = nla_reserve(skb, NHA_GROUP, len);
 208        if (!nla)
 209                goto nla_put_failure;
 210
 211        p = nla_data(nla);
 212        for (i = 0; i < nhg->num_nh; ++i) {
 213                p->id = nhg->nh_entries[i].nh->id;
 214                p->weight = nhg->nh_entries[i].weight - 1;
 215                p += 1;
 216        }
 217
 218        return 0;
 219
 220nla_put_failure:
 221        return -EMSGSIZE;
 222}
 223
 224static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
 225                        int event, u32 portid, u32 seq, unsigned int nlflags)
 226{
 227        struct fib6_nh *fib6_nh;
 228        struct fib_nh *fib_nh;
 229        struct nlmsghdr *nlh;
 230        struct nh_info *nhi;
 231        struct nhmsg *nhm;
 232
 233        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags);
 234        if (!nlh)
 235                return -EMSGSIZE;
 236
 237        nhm = nlmsg_data(nlh);
 238        nhm->nh_family = AF_UNSPEC;
 239        nhm->nh_flags = nh->nh_flags;
 240        nhm->nh_protocol = nh->protocol;
 241        nhm->nh_scope = 0;
 242        nhm->resvd = 0;
 243
 244        if (nla_put_u32(skb, NHA_ID, nh->id))
 245                goto nla_put_failure;
 246
 247        if (nh->is_group) {
 248                struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
 249
 250                if (nhg->fdb_nh && nla_put_flag(skb, NHA_FDB))
 251                        goto nla_put_failure;
 252                if (nla_put_nh_group(skb, nhg))
 253                        goto nla_put_failure;
 254                goto out;
 255        }
 256
 257        nhi = rtnl_dereference(nh->nh_info);
 258        nhm->nh_family = nhi->family;
 259        if (nhi->reject_nh) {
 260                if (nla_put_flag(skb, NHA_BLACKHOLE))
 261                        goto nla_put_failure;
 262                goto out;
 263        } else if (nhi->fdb_nh) {
 264                if (nla_put_flag(skb, NHA_FDB))
 265                        goto nla_put_failure;
 266        } else {
 267                const struct net_device *dev;
 268
 269                dev = nhi->fib_nhc.nhc_dev;
 270                if (dev && nla_put_u32(skb, NHA_OIF, dev->ifindex))
 271                        goto nla_put_failure;
 272        }
 273
 274        nhm->nh_scope = nhi->fib_nhc.nhc_scope;
 275        switch (nhi->family) {
 276        case AF_INET:
 277                fib_nh = &nhi->fib_nh;
 278                if (fib_nh->fib_nh_gw_family &&
 279                    nla_put_be32(skb, NHA_GATEWAY, fib_nh->fib_nh_gw4))
 280                        goto nla_put_failure;
 281                break;
 282
 283        case AF_INET6:
 284                fib6_nh = &nhi->fib6_nh;
 285                if (fib6_nh->fib_nh_gw_family &&
 286                    nla_put_in6_addr(skb, NHA_GATEWAY, &fib6_nh->fib_nh_gw6))
 287                        goto nla_put_failure;
 288                break;
 289        }
 290
 291        if (nhi->fib_nhc.nhc_lwtstate &&
 292            lwtunnel_fill_encap(skb, nhi->fib_nhc.nhc_lwtstate,
 293                                NHA_ENCAP, NHA_ENCAP_TYPE) < 0)
 294                goto nla_put_failure;
 295
 296out:
 297        nlmsg_end(skb, nlh);
 298        return 0;
 299
 300nla_put_failure:
 301        nlmsg_cancel(skb, nlh);
 302        return -EMSGSIZE;
 303}
 304
 305static size_t nh_nlmsg_size_grp(struct nexthop *nh)
 306{
 307        struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
 308        size_t sz = sizeof(struct nexthop_grp) * nhg->num_nh;
 309
 310        return nla_total_size(sz) +
 311               nla_total_size(2);  /* NHA_GROUP_TYPE */
 312}
 313
 314static size_t nh_nlmsg_size_single(struct nexthop *nh)
 315{
 316        struct nh_info *nhi = rtnl_dereference(nh->nh_info);
 317        size_t sz;
 318
 319        /* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE
 320         * are mutually exclusive
 321         */
 322        sz = nla_total_size(4);  /* NHA_OIF */
 323
 324        switch (nhi->family) {
 325        case AF_INET:
 326                if (nhi->fib_nh.fib_nh_gw_family)
 327                        sz += nla_total_size(4);  /* NHA_GATEWAY */
 328                break;
 329
 330        case AF_INET6:
 331                /* NHA_GATEWAY */
 332                if (nhi->fib6_nh.fib_nh_gw_family)
 333                        sz += nla_total_size(sizeof(const struct in6_addr));
 334                break;
 335        }
 336
 337        if (nhi->fib_nhc.nhc_lwtstate) {
 338                sz += lwtunnel_get_encap_size(nhi->fib_nhc.nhc_lwtstate);
 339                sz += nla_total_size(2);  /* NHA_ENCAP_TYPE */
 340        }
 341
 342        return sz;
 343}
 344
 345static size_t nh_nlmsg_size(struct nexthop *nh)
 346{
 347        size_t sz = NLMSG_ALIGN(sizeof(struct nhmsg));
 348
 349        sz += nla_total_size(4); /* NHA_ID */
 350
 351        if (nh->is_group)
 352                sz += nh_nlmsg_size_grp(nh);
 353        else
 354                sz += nh_nlmsg_size_single(nh);
 355
 356        return sz;
 357}
 358
 359static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info)
 360{
 361        unsigned int nlflags = info->nlh ? info->nlh->nlmsg_flags : 0;
 362        u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
 363        struct sk_buff *skb;
 364        int err = -ENOBUFS;
 365
 366        skb = nlmsg_new(nh_nlmsg_size(nh), gfp_any());
 367        if (!skb)
 368                goto errout;
 369
 370        err = nh_fill_node(skb, nh, event, info->portid, seq, nlflags);
 371        if (err < 0) {
 372                /* -EMSGSIZE implies BUG in nh_nlmsg_size() */
 373                WARN_ON(err == -EMSGSIZE);
 374                kfree_skb(skb);
 375                goto errout;
 376        }
 377
 378        rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_NEXTHOP,
 379                    info->nlh, gfp_any());
 380        return;
 381errout:
 382        if (err < 0)
 383                rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err);
 384}
 385
 386static bool valid_group_nh(struct nexthop *nh, unsigned int npaths,
 387                           bool *is_fdb, struct netlink_ext_ack *extack)
 388{
 389        if (nh->is_group) {
 390                struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
 391
 392                /* nested multipath (group within a group) is not
 393                 * supported
 394                 */
 395                if (nhg->mpath) {
 396                        NL_SET_ERR_MSG(extack,
 397                                       "Multipath group can not be a nexthop within a group");
 398                        return false;
 399                }
 400                *is_fdb = nhg->fdb_nh;
 401        } else {
 402                struct nh_info *nhi = rtnl_dereference(nh->nh_info);
 403
 404                if (nhi->reject_nh && npaths > 1) {
 405                        NL_SET_ERR_MSG(extack,
 406                                       "Blackhole nexthop can not be used in a group with more than 1 path");
 407                        return false;
 408                }
 409                *is_fdb = nhi->fdb_nh;
 410        }
 411
 412        return true;
 413}
 414
 415static int nh_check_attr_fdb_group(struct nexthop *nh, u8 *nh_family,
 416                                   struct netlink_ext_ack *extack)
 417{
 418        struct nh_info *nhi;
 419
 420        nhi = rtnl_dereference(nh->nh_info);
 421
 422        if (!nhi->fdb_nh) {
 423                NL_SET_ERR_MSG(extack, "FDB nexthop group can only have fdb nexthops");
 424                return -EINVAL;
 425        }
 426
 427        if (*nh_family == AF_UNSPEC) {
 428                *nh_family = nhi->family;
 429        } else if (*nh_family != nhi->family) {
 430                NL_SET_ERR_MSG(extack, "FDB nexthop group cannot have mixed family nexthops");
 431                return -EINVAL;
 432        }
 433
 434        return 0;
 435}
 436
 437static int nh_check_attr_group(struct net *net, struct nlattr *tb[],
 438                               struct netlink_ext_ack *extack)
 439{
 440        unsigned int len = nla_len(tb[NHA_GROUP]);
 441        u8 nh_family = AF_UNSPEC;
 442        struct nexthop_grp *nhg;
 443        unsigned int i, j;
 444        u8 nhg_fdb = 0;
 445
 446        if (!len || len & (sizeof(struct nexthop_grp) - 1)) {
 447                NL_SET_ERR_MSG(extack,
 448                               "Invalid length for nexthop group attribute");
 449                return -EINVAL;
 450        }
 451
 452        /* convert len to number of nexthop ids */
 453        len /= sizeof(*nhg);
 454
 455        nhg = nla_data(tb[NHA_GROUP]);
 456        for (i = 0; i < len; ++i) {
 457                if (nhg[i].resvd1 || nhg[i].resvd2) {
 458                        NL_SET_ERR_MSG(extack, "Reserved fields in nexthop_grp must be 0");
 459                        return -EINVAL;
 460                }
 461                if (nhg[i].weight > 254) {
 462                        NL_SET_ERR_MSG(extack, "Invalid value for weight");
 463                        return -EINVAL;
 464                }
 465                for (j = i + 1; j < len; ++j) {
 466                        if (nhg[i].id == nhg[j].id) {
 467                                NL_SET_ERR_MSG(extack, "Nexthop id can not be used twice in a group");
 468                                return -EINVAL;
 469                        }
 470                }
 471        }
 472
 473        if (tb[NHA_FDB])
 474                nhg_fdb = 1;
 475        nhg = nla_data(tb[NHA_GROUP]);
 476        for (i = 0; i < len; ++i) {
 477                struct nexthop *nh;
 478                bool is_fdb_nh;
 479
 480                nh = nexthop_find_by_id(net, nhg[i].id);
 481                if (!nh) {
 482                        NL_SET_ERR_MSG(extack, "Invalid nexthop id");
 483                        return -EINVAL;
 484                }
 485                if (!valid_group_nh(nh, len, &is_fdb_nh, extack))
 486                        return -EINVAL;
 487
 488                if (nhg_fdb && nh_check_attr_fdb_group(nh, &nh_family, extack))
 489                        return -EINVAL;
 490
 491                if (!nhg_fdb && is_fdb_nh) {
 492                        NL_SET_ERR_MSG(extack, "Non FDB nexthop group cannot have fdb nexthops");
 493                        return -EINVAL;
 494                }
 495        }
 496        for (i = NHA_GROUP_TYPE + 1; i < __NHA_MAX; ++i) {
 497                if (!tb[i])
 498                        continue;
 499                if (tb[NHA_FDB])
 500                        continue;
 501                NL_SET_ERR_MSG(extack,
 502                               "No other attributes can be set in nexthop groups");
 503                return -EINVAL;
 504        }
 505
 506        return 0;
 507}
 508
 509static bool ipv6_good_nh(const struct fib6_nh *nh)
 510{
 511        int state = NUD_REACHABLE;
 512        struct neighbour *n;
 513
 514        rcu_read_lock_bh();
 515
 516        n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev, &nh->fib_nh_gw6);
 517        if (n)
 518                state = n->nud_state;
 519
 520        rcu_read_unlock_bh();
 521
 522        return !!(state & NUD_VALID);
 523}
 524
 525static bool ipv4_good_nh(const struct fib_nh *nh)
 526{
 527        int state = NUD_REACHABLE;
 528        struct neighbour *n;
 529
 530        rcu_read_lock_bh();
 531
 532        n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev,
 533                                      (__force u32)nh->fib_nh_gw4);
 534        if (n)
 535                state = n->nud_state;
 536
 537        rcu_read_unlock_bh();
 538
 539        return !!(state & NUD_VALID);
 540}
 541
 542struct nexthop *nexthop_select_path(struct nexthop *nh, int hash)
 543{
 544        struct nexthop *rc = NULL;
 545        struct nh_group *nhg;
 546        int i;
 547
 548        if (!nh->is_group)
 549                return nh;
 550
 551        nhg = rcu_dereference(nh->nh_grp);
 552        for (i = 0; i < nhg->num_nh; ++i) {
 553                struct nh_grp_entry *nhge = &nhg->nh_entries[i];
 554                struct nh_info *nhi;
 555
 556                if (hash > atomic_read(&nhge->upper_bound))
 557                        continue;
 558
 559                nhi = rcu_dereference(nhge->nh->nh_info);
 560                if (nhi->fdb_nh)
 561                        return nhge->nh;
 562
 563                /* nexthops always check if it is good and does
 564                 * not rely on a sysctl for this behavior
 565                 */
 566                switch (nhi->family) {
 567                case AF_INET:
 568                        if (ipv4_good_nh(&nhi->fib_nh))
 569                                return nhge->nh;
 570                        break;
 571                case AF_INET6:
 572                        if (ipv6_good_nh(&nhi->fib6_nh))
 573                                return nhge->nh;
 574                        break;
 575                }
 576
 577                if (!rc)
 578                        rc = nhge->nh;
 579        }
 580
 581        return rc;
 582}
 583EXPORT_SYMBOL_GPL(nexthop_select_path);
 584
 585int nexthop_for_each_fib6_nh(struct nexthop *nh,
 586                             int (*cb)(struct fib6_nh *nh, void *arg),
 587                             void *arg)
 588{
 589        struct nh_info *nhi;
 590        int err;
 591
 592        if (nh->is_group) {
 593                struct nh_group *nhg;
 594                int i;
 595
 596                nhg = rcu_dereference_rtnl(nh->nh_grp);
 597                for (i = 0; i < nhg->num_nh; i++) {
 598                        struct nh_grp_entry *nhge = &nhg->nh_entries[i];
 599
 600                        nhi = rcu_dereference_rtnl(nhge->nh->nh_info);
 601                        err = cb(&nhi->fib6_nh, arg);
 602                        if (err)
 603                                return err;
 604                }
 605        } else {
 606                nhi = rcu_dereference_rtnl(nh->nh_info);
 607                err = cb(&nhi->fib6_nh, arg);
 608                if (err)
 609                        return err;
 610        }
 611
 612        return 0;
 613}
 614EXPORT_SYMBOL_GPL(nexthop_for_each_fib6_nh);
 615
 616static int check_src_addr(const struct in6_addr *saddr,
 617                          struct netlink_ext_ack *extack)
 618{
 619        if (!ipv6_addr_any(saddr)) {
 620                NL_SET_ERR_MSG(extack, "IPv6 routes using source address can not use nexthop objects");
 621                return -EINVAL;
 622        }
 623        return 0;
 624}
 625
 626int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg,
 627                       struct netlink_ext_ack *extack)
 628{
 629        struct nh_info *nhi;
 630        bool is_fdb_nh;
 631
 632        /* fib6_src is unique to a fib6_info and limits the ability to cache
 633         * routes in fib6_nh within a nexthop that is potentially shared
 634         * across multiple fib entries. If the config wants to use source
 635         * routing it can not use nexthop objects. mlxsw also does not allow
 636         * fib6_src on routes.
 637         */
 638        if (cfg && check_src_addr(&cfg->fc_src, extack) < 0)
 639                return -EINVAL;
 640
 641        if (nh->is_group) {
 642                struct nh_group *nhg;
 643
 644                nhg = rtnl_dereference(nh->nh_grp);
 645                if (nhg->has_v4)
 646                        goto no_v4_nh;
 647                is_fdb_nh = nhg->fdb_nh;
 648        } else {
 649                nhi = rtnl_dereference(nh->nh_info);
 650                if (nhi->family == AF_INET)
 651                        goto no_v4_nh;
 652                is_fdb_nh = nhi->fdb_nh;
 653        }
 654
 655        if (is_fdb_nh) {
 656                NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
 657                return -EINVAL;
 658        }
 659
 660        return 0;
 661no_v4_nh:
 662        NL_SET_ERR_MSG(extack, "IPv6 routes can not use an IPv4 nexthop");
 663        return -EINVAL;
 664}
 665EXPORT_SYMBOL_GPL(fib6_check_nexthop);
 666
 667/* if existing nexthop has ipv6 routes linked to it, need
 668 * to verify this new spec works with ipv6
 669 */
 670static int fib6_check_nh_list(struct nexthop *old, struct nexthop *new,
 671                              struct netlink_ext_ack *extack)
 672{
 673        struct fib6_info *f6i;
 674
 675        if (list_empty(&old->f6i_list))
 676                return 0;
 677
 678        list_for_each_entry(f6i, &old->f6i_list, nh_list) {
 679                if (check_src_addr(&f6i->fib6_src.addr, extack) < 0)
 680                        return -EINVAL;
 681        }
 682
 683        return fib6_check_nexthop(new, NULL, extack);
 684}
 685
 686static int nexthop_check_scope(struct nh_info *nhi, u8 scope,
 687                               struct netlink_ext_ack *extack)
 688{
 689        if (scope == RT_SCOPE_HOST && nhi->fib_nhc.nhc_gw_family) {
 690                NL_SET_ERR_MSG(extack,
 691                               "Route with host scope can not have a gateway");
 692                return -EINVAL;
 693        }
 694
 695        if (nhi->fib_nhc.nhc_flags & RTNH_F_ONLINK && scope >= RT_SCOPE_LINK) {
 696                NL_SET_ERR_MSG(extack, "Scope mismatch with nexthop");
 697                return -EINVAL;
 698        }
 699
 700        return 0;
 701}
 702
 703/* Invoked by fib add code to verify nexthop by id is ok with
 704 * config for prefix; parts of fib_check_nh not done when nexthop
 705 * object is used.
 706 */
 707int fib_check_nexthop(struct nexthop *nh, u8 scope,
 708                      struct netlink_ext_ack *extack)
 709{
 710        struct nh_info *nhi;
 711        int err = 0;
 712
 713        if (nh->is_group) {
 714                struct nh_group *nhg;
 715
 716                nhg = rtnl_dereference(nh->nh_grp);
 717                if (nhg->fdb_nh) {
 718                        NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
 719                        err = -EINVAL;
 720                        goto out;
 721                }
 722
 723                if (scope == RT_SCOPE_HOST) {
 724                        NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops");
 725                        err = -EINVAL;
 726                        goto out;
 727                }
 728
 729                /* all nexthops in a group have the same scope */
 730                nhi = rtnl_dereference(nhg->nh_entries[0].nh->nh_info);
 731                err = nexthop_check_scope(nhi, scope, extack);
 732        } else {
 733                nhi = rtnl_dereference(nh->nh_info);
 734                if (nhi->fdb_nh) {
 735                        NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
 736                        err = -EINVAL;
 737                        goto out;
 738                }
 739                err = nexthop_check_scope(nhi, scope, extack);
 740        }
 741
 742out:
 743        return err;
 744}
 745
 746static int fib_check_nh_list(struct nexthop *old, struct nexthop *new,
 747                             struct netlink_ext_ack *extack)
 748{
 749        struct fib_info *fi;
 750
 751        list_for_each_entry(fi, &old->fi_list, nh_list) {
 752                int err;
 753
 754                err = fib_check_nexthop(new, fi->fib_scope, extack);
 755                if (err)
 756                        return err;
 757        }
 758        return 0;
 759}
 760
 761static void nh_group_rebalance(struct nh_group *nhg)
 762{
 763        int total = 0;
 764        int w = 0;
 765        int i;
 766
 767        for (i = 0; i < nhg->num_nh; ++i)
 768                total += nhg->nh_entries[i].weight;
 769
 770        for (i = 0; i < nhg->num_nh; ++i) {
 771                struct nh_grp_entry *nhge = &nhg->nh_entries[i];
 772                int upper_bound;
 773
 774                w += nhge->weight;
 775                upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1;
 776                atomic_set(&nhge->upper_bound, upper_bound);
 777        }
 778}
 779
/* Drop group entry @nhge from its parent group.
 *
 * If @nhge is the parent's only entry, the parent nexthop itself is
 * removed via remove_nexthop().  Otherwise the surviving entries are
 * copied into the parent's spare nh_group, the spare is rebalanced and
 * published with rcu_assign_pointer(), and the removed member is
 * released with nexthop_put().  When @nlinfo is non-NULL an
 * RTM_NEWNEXTHOP notification is sent for the updated parent.
 */
static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge,
                                struct nl_info *nlinfo)
{
        struct nh_grp_entry *nhges, *new_nhges;
        struct nexthop *nhp = nhge->nh_parent;
        struct nexthop *nh = nhge->nh;
        struct nh_group *nhg, *newg;
        int i, j;

        WARN_ON(!nh);

        nhg = rtnl_dereference(nhp->nh_grp);
        newg = nhg->spare;

        /* last entry, keep it visible and remove the parent */
        if (nhg->num_nh == 1) {
                remove_nexthop(net, nhp, nlinfo);
                return;
        }

        /* has_v4 is recomputed from the surviving entries below;
         * num_nh starts at the old count and is decremented when the
         * removed entry is skipped
         */
        newg->has_v4 = false;
        newg->mpath = nhg->mpath;
        newg->fdb_nh = nhg->fdb_nh;
        newg->num_nh = nhg->num_nh;

        /* copy old entries to new except the one getting removed */
        nhges = nhg->nh_entries;
        new_nhges = newg->nh_entries;
        for (i = 0, j = 0; i < nhg->num_nh; ++i) {
                struct nh_info *nhi;

                /* current nexthop getting removed */
                if (nhg->nh_entries[i].nh == nh) {
                        newg->num_nh--;
                        continue;
                }

                nhi = rtnl_dereference(nhges[i].nh->nh_info);
                if (nhi->family == AF_INET)
                        newg->has_v4 = true;

                /* move the member's grp_list linkage from the old
                 * array slot to the new one
                 */
                list_del(&nhges[i].nh_list);
                new_nhges[j].nh_parent = nhges[i].nh_parent;
                new_nhges[j].nh = nhges[i].nh;
                new_nhges[j].weight = nhges[i].weight;
                list_add(&new_nhges[j].nh_list, &new_nhges[j].nh->grp_list);
                j++;
        }

        /* upper bounds must be set before the new group is published */
        nh_group_rebalance(newg);
        rcu_assign_pointer(nhp->nh_grp, newg);

        /* unlink the removed entry and release its member nexthop */
        list_del(&nhge->nh_list);
        nexthop_put(nhge->nh);

        if (nlinfo)
                nexthop_notify(RTM_NEWNEXTHOP, nhp, nlinfo);
}
 838
 839static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh,
 840                                       struct nl_info *nlinfo)
 841{
 842        struct nh_grp_entry *nhge, *tmp;
 843
 844        list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list)
 845                remove_nh_grp_entry(net, nhge, nlinfo);
 846
 847        /* make sure all see the newly published array before releasing rtnl */
 848        synchronize_net();
 849}
 850
 851static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo)
 852{
 853        struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp);
 854        int i, num_nh = nhg->num_nh;
 855
 856        for (i = 0; i < num_nh; ++i) {
 857                struct nh_grp_entry *nhge = &nhg->nh_entries[i];
 858
 859                if (WARN_ON(!nhge->nh))
 860                        continue;
 861
 862                list_del_init(&nhge->nh_list);
 863        }
 864}
 865
 866/* not called for nexthop replace */
 867static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
 868{
 869        struct fib6_info *f6i, *tmp;
 870        bool do_flush = false;
 871        struct fib_info *fi;
 872
 873        list_for_each_entry(fi, &nh->fi_list, nh_list) {
 874                fi->fib_flags |= RTNH_F_DEAD;
 875                do_flush = true;
 876        }
 877        if (do_flush)
 878                fib_flush(net);
 879
 880        /* ip6_del_rt removes the entry from this list hence the _safe */
 881        list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) {
 882                /* __ip6_del_rt does a release, so do a hold here */
 883                fib6_info_hold(f6i);
 884                ipv6_stub->ip6_del_rt(net, f6i,
 885                                      !net->ipv4.sysctl_nexthop_compat_mode);
 886        }
 887}
 888
 889static void __remove_nexthop(struct net *net, struct nexthop *nh,
 890                             struct nl_info *nlinfo)
 891{
 892        __remove_nexthop_fib(net, nh);
 893
 894        if (nh->is_group) {
 895                remove_nexthop_group(nh, nlinfo);
 896        } else {
 897                struct nh_info *nhi;
 898
 899                nhi = rtnl_dereference(nh->nh_info);
 900                if (nhi->fib_nhc.nhc_dev)
 901                        hlist_del(&nhi->dev_hash);
 902
 903                remove_nexthop_from_groups(net, nh, nlinfo);
 904        }
 905}
 906
/* Remove @nh from @net.
 *
 * In order: run the NEXTHOP_EVENT_DEL notifiers, erase @nh from the id
 * rbtree, send RTM_DELNEXTHOP when @nlinfo is non-NULL, detach it from
 * FIB entries and groups, bump the nexthop sequence number and finally
 * call nexthop_put() on @nh.
 */
static void remove_nexthop(struct net *net, struct nexthop *nh,
                           struct nl_info *nlinfo)
{
        /* the notifier result is not checked */
        call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh);

        /* remove from the tree */
        rb_erase(&nh->rb_node, &net->nexthop.rb_root);

        if (nlinfo)
                nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo);

        __remove_nexthop(net, nh, nlinfo);
        nh_base_seq_inc(net);

        nexthop_put(nh);
}
 923
 924/* if any FIB entries reference this nexthop, any dst entries
 925 * need to be regenerated
 926 */
 927static void nh_rt_cache_flush(struct net *net, struct nexthop *nh)
 928{
 929        struct fib6_info *f6i;
 930
 931        if (!list_empty(&nh->fi_list))
 932                rt_cache_flush(net);
 933
 934        list_for_each_entry(f6i, &nh->f6i_list, nh_list)
 935                ipv6_stub->fib6_update_sernum(net, f6i);
 936}
 937
 938static int replace_nexthop_grp(struct net *net, struct nexthop *old,
 939                               struct nexthop *new,
 940                               struct netlink_ext_ack *extack)
 941{
 942        struct nh_group *oldg, *newg;
 943        int i;
 944
 945        if (!new->is_group) {
 946                NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with a nexthop.");
 947                return -EINVAL;
 948        }
 949
 950        oldg = rtnl_dereference(old->nh_grp);
 951        newg = rtnl_dereference(new->nh_grp);
 952
 953        /* update parents - used by nexthop code for cleanup */
 954        for (i = 0; i < newg->num_nh; i++)
 955                newg->nh_entries[i].nh_parent = old;
 956
 957        rcu_assign_pointer(old->nh_grp, newg);
 958
 959        for (i = 0; i < oldg->num_nh; i++)
 960                oldg->nh_entries[i].nh_parent = new;
 961
 962        rcu_assign_pointer(new->nh_grp, oldg);
 963
 964        return 0;
 965}
 966
 967static void nh_group_v4_update(struct nh_group *nhg)
 968{
 969        struct nh_grp_entry *nhges;
 970        bool has_v4 = false;
 971        int i;
 972
 973        nhges = nhg->nh_entries;
 974        for (i = 0; i < nhg->num_nh; i++) {
 975                struct nh_info *nhi;
 976
 977                nhi = rtnl_dereference(nhges[i].nh->nh_info);
 978                if (nhi->family == AF_INET)
 979                        has_v4 = true;
 980        }
 981        nhg->has_v4 = has_v4;
 982}
 983
 984static int replace_nexthop_single(struct net *net, struct nexthop *old,
 985                                  struct nexthop *new,
 986                                  struct netlink_ext_ack *extack)
 987{
 988        struct nh_info *oldi, *newi;
 989
 990        if (new->is_group) {
 991                NL_SET_ERR_MSG(extack, "Can not replace a nexthop with a nexthop group.");
 992                return -EINVAL;
 993        }
 994
 995        oldi = rtnl_dereference(old->nh_info);
 996        newi = rtnl_dereference(new->nh_info);
 997
 998        newi->nh_parent = old;
 999        oldi->nh_parent = new;
1000
1001        old->protocol = new->protocol;
1002        old->nh_flags = new->nh_flags;
1003
1004        rcu_assign_pointer(old->nh_info, newi);
1005        rcu_assign_pointer(new->nh_info, oldi);
1006
1007        /* When replacing an IPv4 nexthop with an IPv6 nexthop, potentially
1008         * update IPv4 indication in all the groups using the nexthop.
1009         */
1010        if (oldi->family == AF_INET && newi->family == AF_INET6) {
1011                struct nh_grp_entry *nhge;
1012
1013                list_for_each_entry(nhge, &old->grp_list, nh_list) {
1014                        struct nexthop *nhp = nhge->nh_parent;
1015                        struct nh_group *nhg;
1016
1017                        nhg = rtnl_dereference(nhp->nh_grp);
1018                        nh_group_v4_update(nhg);
1019                }
1020        }
1021
1022        return 0;
1023}
1024
1025static void __nexthop_replace_notify(struct net *net, struct nexthop *nh,
1026                                     struct nl_info *info)
1027{
1028        struct fib6_info *f6i;
1029
1030        if (!list_empty(&nh->fi_list)) {
1031                struct fib_info *fi;
1032
1033                /* expectation is a few fib_info per nexthop and then
1034                 * a lot of routes per fib_info. So mark the fib_info
1035                 * and then walk the fib tables once
1036                 */
1037                list_for_each_entry(fi, &nh->fi_list, nh_list)
1038                        fi->nh_updated = true;
1039
1040                fib_info_notify_update(net, info);
1041
1042                list_for_each_entry(fi, &nh->fi_list, nh_list)
1043                        fi->nh_updated = false;
1044        }
1045
1046        list_for_each_entry(f6i, &nh->f6i_list, nh_list)
1047                ipv6_stub->fib6_rt_update(net, f6i, info);
1048}
1049
1050/* send RTM_NEWROUTE with REPLACE flag set for all FIB entries
1051 * linked to this nexthop and for all groups that the nexthop
1052 * is a member of
1053 */
1054static void nexthop_replace_notify(struct net *net, struct nexthop *nh,
1055                                   struct nl_info *info)
1056{
1057        struct nh_grp_entry *nhge;
1058
1059        __nexthop_replace_notify(net, nh, info);
1060
1061        list_for_each_entry(nhge, &nh->grp_list, nh_list)
1062                __nexthop_replace_notify(net, nhge->nh_parent, info);
1063}
1064
1065static int replace_nexthop(struct net *net, struct nexthop *old,
1066                           struct nexthop *new, struct netlink_ext_ack *extack)
1067{
1068        bool new_is_reject = false;
1069        struct nh_grp_entry *nhge;
1070        int err;
1071
1072        /* check that existing FIB entries are ok with the
1073         * new nexthop definition
1074         */
1075        err = fib_check_nh_list(old, new, extack);
1076        if (err)
1077                return err;
1078
1079        err = fib6_check_nh_list(old, new, extack);
1080        if (err)
1081                return err;
1082
1083        if (!new->is_group) {
1084                struct nh_info *nhi = rtnl_dereference(new->nh_info);
1085
1086                new_is_reject = nhi->reject_nh;
1087        }
1088
1089        list_for_each_entry(nhge, &old->grp_list, nh_list) {
1090                /* if new nexthop is a blackhole, any groups using this
1091                 * nexthop cannot have more than 1 path
1092                 */
1093                if (new_is_reject &&
1094                    nexthop_num_path(nhge->nh_parent) > 1) {
1095                        NL_SET_ERR_MSG(extack, "Blackhole nexthop can not be a member of a group with more than one path");
1096                        return -EINVAL;
1097                }
1098
1099                err = fib_check_nh_list(nhge->nh_parent, new, extack);
1100                if (err)
1101                        return err;
1102
1103                err = fib6_check_nh_list(nhge->nh_parent, new, extack);
1104                if (err)
1105                        return err;
1106        }
1107
1108        if (old->is_group)
1109                err = replace_nexthop_grp(net, old, new, extack);
1110        else
1111                err = replace_nexthop_single(net, old, new, extack);
1112
1113        if (!err) {
1114                nh_rt_cache_flush(net, old);
1115
1116                __remove_nexthop(net, new, NULL);
1117                nexthop_put(new);
1118        }
1119
1120        return err;
1121}
1122
1123/* called with rtnl_lock held */
1124static int insert_nexthop(struct net *net, struct nexthop *new_nh,
1125                          struct nh_config *cfg, struct netlink_ext_ack *extack)
1126{
1127        struct rb_node **pp, *parent = NULL, *next;
1128        struct rb_root *root = &net->nexthop.rb_root;
1129        bool replace = !!(cfg->nlflags & NLM_F_REPLACE);
1130        bool create = !!(cfg->nlflags & NLM_F_CREATE);
1131        u32 new_id = new_nh->id;
1132        int replace_notify = 0;
1133        int rc = -EEXIST;
1134
1135        pp = &root->rb_node;
1136        while (1) {
1137                struct nexthop *nh;
1138
1139                next = *pp;
1140                if (!next)
1141                        break;
1142
1143                parent = next;
1144
1145                nh = rb_entry(parent, struct nexthop, rb_node);
1146                if (new_id < nh->id) {
1147                        pp = &next->rb_left;
1148                } else if (new_id > nh->id) {
1149                        pp = &next->rb_right;
1150                } else if (replace) {
1151                        rc = replace_nexthop(net, nh, new_nh, extack);
1152                        if (!rc) {
1153                                new_nh = nh; /* send notification with old nh */
1154                                replace_notify = 1;
1155                        }
1156                        goto out;
1157                } else {
1158                        /* id already exists and not a replace */
1159                        goto out;
1160                }
1161        }
1162
1163        if (replace && !create) {
1164                NL_SET_ERR_MSG(extack, "Replace specified without create and no entry exists");
1165                rc = -ENOENT;
1166                goto out;
1167        }
1168
1169        rb_link_node_rcu(&new_nh->rb_node, parent, pp);
1170        rb_insert_color(&new_nh->rb_node, root);
1171        rc = 0;
1172out:
1173        if (!rc) {
1174                nh_base_seq_inc(net);
1175                nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo);
1176                if (replace_notify && net->ipv4.sysctl_nexthop_compat_mode)
1177                        nexthop_replace_notify(net, new_nh, &cfg->nlinfo);
1178        }
1179
1180        return rc;
1181}
1182
1183/* rtnl */
1184/* remove all nexthops tied to a device being deleted */
1185static void nexthop_flush_dev(struct net_device *dev)
1186{
1187        unsigned int hash = nh_dev_hashfn(dev->ifindex);
1188        struct net *net = dev_net(dev);
1189        struct hlist_head *head = &net->nexthop.devhash[hash];
1190        struct hlist_node *n;
1191        struct nh_info *nhi;
1192
1193        hlist_for_each_entry_safe(nhi, n, head, dev_hash) {
1194                if (nhi->fib_nhc.nhc_dev != dev)
1195                        continue;
1196
1197                remove_nexthop(net, nhi->nh_parent, NULL);
1198        }
1199}
1200
1201/* rtnl; called when net namespace is deleted */
1202static void flush_all_nexthops(struct net *net)
1203{
1204        struct rb_root *root = &net->nexthop.rb_root;
1205        struct rb_node *node;
1206        struct nexthop *nh;
1207
1208        while ((node = rb_first(root))) {
1209                nh = rb_entry(node, struct nexthop, rb_node);
1210                remove_nexthop(net, nh, NULL);
1211                cond_resched();
1212        }
1213}
1214
1215static struct nexthop *nexthop_create_group(struct net *net,
1216                                            struct nh_config *cfg)
1217{
1218        struct nlattr *grps_attr = cfg->nh_grp;
1219        struct nexthop_grp *entry = nla_data(grps_attr);
1220        u16 num_nh = nla_len(grps_attr) / sizeof(*entry);
1221        struct nh_group *nhg;
1222        struct nexthop *nh;
1223        int i;
1224
1225        if (WARN_ON(!num_nh))
1226                return ERR_PTR(-EINVAL);
1227
1228        nh = nexthop_alloc();
1229        if (!nh)
1230                return ERR_PTR(-ENOMEM);
1231
1232        nh->is_group = 1;
1233
1234        nhg = nexthop_grp_alloc(num_nh);
1235        if (!nhg) {
1236                kfree(nh);
1237                return ERR_PTR(-ENOMEM);
1238        }
1239
1240        /* spare group used for removals */
1241        nhg->spare = nexthop_grp_alloc(num_nh);
1242        if (!nhg->spare) {
1243                kfree(nhg);
1244                kfree(nh);
1245                return ERR_PTR(-ENOMEM);
1246        }
1247        nhg->spare->spare = nhg;
1248
1249        for (i = 0; i < nhg->num_nh; ++i) {
1250                struct nexthop *nhe;
1251                struct nh_info *nhi;
1252
1253                nhe = nexthop_find_by_id(net, entry[i].id);
1254                if (!nexthop_get(nhe))
1255                        goto out_no_nh;
1256
1257                nhi = rtnl_dereference(nhe->nh_info);
1258                if (nhi->family == AF_INET)
1259                        nhg->has_v4 = true;
1260
1261                nhg->nh_entries[i].nh = nhe;
1262                nhg->nh_entries[i].weight = entry[i].weight + 1;
1263                list_add(&nhg->nh_entries[i].nh_list, &nhe->grp_list);
1264                nhg->nh_entries[i].nh_parent = nh;
1265        }
1266
1267        if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) {
1268                nhg->mpath = 1;
1269                nh_group_rebalance(nhg);
1270        }
1271
1272        if (cfg->nh_fdb)
1273                nhg->fdb_nh = 1;
1274
1275        rcu_assign_pointer(nh->nh_grp, nhg);
1276
1277        return nh;
1278
1279out_no_nh:
1280        for (; i >= 0; --i)
1281                nexthop_put(nhg->nh_entries[i].nh);
1282
1283        kfree(nhg->spare);
1284        kfree(nhg);
1285        kfree(nh);
1286
1287        return ERR_PTR(-ENOENT);
1288}
1289
1290static int nh_create_ipv4(struct net *net, struct nexthop *nh,
1291                          struct nh_info *nhi, struct nh_config *cfg,
1292                          struct netlink_ext_ack *extack)
1293{
1294        struct fib_nh *fib_nh = &nhi->fib_nh;
1295        struct fib_config fib_cfg = {
1296                .fc_oif   = cfg->nh_ifindex,
1297                .fc_gw4   = cfg->gw.ipv4,
1298                .fc_gw_family = cfg->gw.ipv4 ? AF_INET : 0,
1299                .fc_flags = cfg->nh_flags,
1300                .fc_encap = cfg->nh_encap,
1301                .fc_encap_type = cfg->nh_encap_type,
1302        };
1303        u32 tb_id = (cfg->dev ? l3mdev_fib_table(cfg->dev) : RT_TABLE_MAIN);
1304        int err;
1305
1306        err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack);
1307        if (err) {
1308                fib_nh_release(net, fib_nh);
1309                goto out;
1310        }
1311
1312        if (nhi->fdb_nh)
1313                goto out;
1314
1315        /* sets nh_dev if successful */
1316        err = fib_check_nh(net, fib_nh, tb_id, 0, extack);
1317        if (!err) {
1318                nh->nh_flags = fib_nh->fib_nh_flags;
1319                fib_info_update_nhc_saddr(net, &fib_nh->nh_common,
1320                                          fib_nh->fib_nh_scope);
1321        } else {
1322                fib_nh_release(net, fib_nh);
1323        }
1324out:
1325        return err;
1326}
1327
1328static int nh_create_ipv6(struct net *net,  struct nexthop *nh,
1329                          struct nh_info *nhi, struct nh_config *cfg,
1330                          struct netlink_ext_ack *extack)
1331{
1332        struct fib6_nh *fib6_nh = &nhi->fib6_nh;
1333        struct fib6_config fib6_cfg = {
1334                .fc_table = l3mdev_fib_table(cfg->dev),
1335                .fc_ifindex = cfg->nh_ifindex,
1336                .fc_gateway = cfg->gw.ipv6,
1337                .fc_flags = cfg->nh_flags,
1338                .fc_encap = cfg->nh_encap,
1339                .fc_encap_type = cfg->nh_encap_type,
1340                .fc_is_fdb = cfg->nh_fdb,
1341        };
1342        int err;
1343
1344        if (!ipv6_addr_any(&cfg->gw.ipv6))
1345                fib6_cfg.fc_flags |= RTF_GATEWAY;
1346
1347        /* sets nh_dev if successful */
1348        err = ipv6_stub->fib6_nh_init(net, fib6_nh, &fib6_cfg, GFP_KERNEL,
1349                                      extack);
1350        if (err)
1351                ipv6_stub->fib6_nh_release(fib6_nh);
1352        else
1353                nh->nh_flags = fib6_nh->fib_nh_flags;
1354
1355        return err;
1356}
1357
1358static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg,
1359                                      struct netlink_ext_ack *extack)
1360{
1361        struct nh_info *nhi;
1362        struct nexthop *nh;
1363        int err = 0;
1364
1365        nh = nexthop_alloc();
1366        if (!nh)
1367                return ERR_PTR(-ENOMEM);
1368
1369        nhi = kzalloc(sizeof(*nhi), GFP_KERNEL);
1370        if (!nhi) {
1371                kfree(nh);
1372                return ERR_PTR(-ENOMEM);
1373        }
1374
1375        nh->nh_flags = cfg->nh_flags;
1376        nh->net = net;
1377
1378        nhi->nh_parent = nh;
1379        nhi->family = cfg->nh_family;
1380        nhi->fib_nhc.nhc_scope = RT_SCOPE_LINK;
1381
1382        if (cfg->nh_fdb)
1383                nhi->fdb_nh = 1;
1384
1385        if (cfg->nh_blackhole) {
1386                nhi->reject_nh = 1;
1387                cfg->nh_ifindex = net->loopback_dev->ifindex;
1388        }
1389
1390        switch (cfg->nh_family) {
1391        case AF_INET:
1392                err = nh_create_ipv4(net, nh, nhi, cfg, extack);
1393                break;
1394        case AF_INET6:
1395                err = nh_create_ipv6(net, nh, nhi, cfg, extack);
1396                break;
1397        }
1398
1399        if (err) {
1400                kfree(nhi);
1401                kfree(nh);
1402                return ERR_PTR(err);
1403        }
1404
1405        /* add the entry to the device based hash */
1406        if (!nhi->fdb_nh)
1407                nexthop_devhash_add(net, nhi);
1408
1409        rcu_assign_pointer(nh->nh_info, nhi);
1410
1411        return nh;
1412}
1413
1414/* called with rtnl lock held */
1415static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg,
1416                                   struct netlink_ext_ack *extack)
1417{
1418        struct nexthop *nh;
1419        int err;
1420
1421        if (cfg->nlflags & NLM_F_REPLACE && !cfg->nh_id) {
1422                NL_SET_ERR_MSG(extack, "Replace requires nexthop id");
1423                return ERR_PTR(-EINVAL);
1424        }
1425
1426        if (!cfg->nh_id) {
1427                cfg->nh_id = nh_find_unused_id(net);
1428                if (!cfg->nh_id) {
1429                        NL_SET_ERR_MSG(extack, "No unused id");
1430                        return ERR_PTR(-EINVAL);
1431                }
1432        }
1433
1434        if (cfg->nh_grp)
1435                nh = nexthop_create_group(net, cfg);
1436        else
1437                nh = nexthop_create(net, cfg, extack);
1438
1439        if (IS_ERR(nh))
1440                return nh;
1441
1442        refcount_set(&nh->refcnt, 1);
1443        nh->id = cfg->nh_id;
1444        nh->protocol = cfg->nh_protocol;
1445        nh->net = net;
1446
1447        err = insert_nexthop(net, nh, cfg, extack);
1448        if (err) {
1449                __remove_nexthop(net, nh, NULL);
1450                nexthop_put(nh);
1451                nh = ERR_PTR(err);
1452        }
1453
1454        return nh;
1455}
1456
1457static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
1458                            struct nlmsghdr *nlh, struct nh_config *cfg,
1459                            struct netlink_ext_ack *extack)
1460{
1461        struct nhmsg *nhm = nlmsg_data(nlh);
1462        struct nlattr *tb[NHA_MAX + 1];
1463        int err;
1464
1465        err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy,
1466                          extack);
1467        if (err < 0)
1468                return err;
1469
1470        err = -EINVAL;
1471        if (nhm->resvd || nhm->nh_scope) {
1472                NL_SET_ERR_MSG(extack, "Invalid values in ancillary header");
1473                goto out;
1474        }
1475        if (nhm->nh_flags & ~NEXTHOP_VALID_USER_FLAGS) {
1476                NL_SET_ERR_MSG(extack, "Invalid nexthop flags in ancillary header");
1477                goto out;
1478        }
1479
1480        switch (nhm->nh_family) {
1481        case AF_INET:
1482        case AF_INET6:
1483                break;
1484        case AF_UNSPEC:
1485                if (tb[NHA_GROUP])
1486                        break;
1487                fallthrough;
1488        default:
1489                NL_SET_ERR_MSG(extack, "Invalid address family");
1490                goto out;
1491        }
1492
1493        if (tb[NHA_GROUPS] || tb[NHA_MASTER]) {
1494                NL_SET_ERR_MSG(extack, "Invalid attributes in request");
1495                goto out;
1496        }
1497
1498        memset(cfg, 0, sizeof(*cfg));
1499        cfg->nlflags = nlh->nlmsg_flags;
1500        cfg->nlinfo.portid = NETLINK_CB(skb).portid;
1501        cfg->nlinfo.nlh = nlh;
1502        cfg->nlinfo.nl_net = net;
1503
1504        cfg->nh_family = nhm->nh_family;
1505        cfg->nh_protocol = nhm->nh_protocol;
1506        cfg->nh_flags = nhm->nh_flags;
1507
1508        if (tb[NHA_ID])
1509                cfg->nh_id = nla_get_u32(tb[NHA_ID]);
1510
1511        if (tb[NHA_FDB]) {
1512                if (tb[NHA_OIF] || tb[NHA_BLACKHOLE] ||
1513                    tb[NHA_ENCAP]   || tb[NHA_ENCAP_TYPE]) {
1514                        NL_SET_ERR_MSG(extack, "Fdb attribute can not be used with encap, oif or blackhole");
1515                        goto out;
1516                }
1517                if (nhm->nh_flags) {
1518                        NL_SET_ERR_MSG(extack, "Unsupported nexthop flags in ancillary header");
1519                        goto out;
1520                }
1521                cfg->nh_fdb = nla_get_flag(tb[NHA_FDB]);
1522        }
1523
1524        if (tb[NHA_GROUP]) {
1525                if (nhm->nh_family != AF_UNSPEC) {
1526                        NL_SET_ERR_MSG(extack, "Invalid family for group");
1527                        goto out;
1528                }
1529                cfg->nh_grp = tb[NHA_GROUP];
1530
1531                cfg->nh_grp_type = NEXTHOP_GRP_TYPE_MPATH;
1532                if (tb[NHA_GROUP_TYPE])
1533                        cfg->nh_grp_type = nla_get_u16(tb[NHA_GROUP_TYPE]);
1534
1535                if (cfg->nh_grp_type > NEXTHOP_GRP_TYPE_MAX) {
1536                        NL_SET_ERR_MSG(extack, "Invalid group type");
1537                        goto out;
1538                }
1539                err = nh_check_attr_group(net, tb, extack);
1540
1541                /* no other attributes should be set */
1542                goto out;
1543        }
1544
1545        if (tb[NHA_BLACKHOLE]) {
1546                if (tb[NHA_GATEWAY] || tb[NHA_OIF] ||
1547                    tb[NHA_ENCAP]   || tb[NHA_ENCAP_TYPE] || tb[NHA_FDB]) {
1548                        NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway, oif, encap or fdb");
1549                        goto out;
1550                }
1551
1552                cfg->nh_blackhole = 1;
1553                err = 0;
1554                goto out;
1555        }
1556
1557        if (!cfg->nh_fdb && !tb[NHA_OIF]) {
1558                NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole and non-fdb nexthops");
1559                goto out;
1560        }
1561
1562        if (!cfg->nh_fdb && tb[NHA_OIF]) {
1563                cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]);
1564                if (cfg->nh_ifindex)
1565                        cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex);
1566
1567                if (!cfg->dev) {
1568                        NL_SET_ERR_MSG(extack, "Invalid device index");
1569                        goto out;
1570                } else if (!(cfg->dev->flags & IFF_UP)) {
1571                        NL_SET_ERR_MSG(extack, "Nexthop device is not up");
1572                        err = -ENETDOWN;
1573                        goto out;
1574                } else if (!netif_carrier_ok(cfg->dev)) {
1575                        NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down");
1576                        err = -ENETDOWN;
1577                        goto out;
1578                }
1579        }
1580
1581        err = -EINVAL;
1582        if (tb[NHA_GATEWAY]) {
1583                struct nlattr *gwa = tb[NHA_GATEWAY];
1584
1585                switch (cfg->nh_family) {
1586                case AF_INET:
1587                        if (nla_len(gwa) != sizeof(u32)) {
1588                                NL_SET_ERR_MSG(extack, "Invalid gateway");
1589                                goto out;
1590                        }
1591                        cfg->gw.ipv4 = nla_get_be32(gwa);
1592                        break;
1593                case AF_INET6:
1594                        if (nla_len(gwa) != sizeof(struct in6_addr)) {
1595                                NL_SET_ERR_MSG(extack, "Invalid gateway");
1596                                goto out;
1597                        }
1598                        cfg->gw.ipv6 = nla_get_in6_addr(gwa);
1599                        break;
1600                default:
1601                        NL_SET_ERR_MSG(extack,
1602                                       "Unknown address family for gateway");
1603                        goto out;
1604                }
1605        } else {
1606                /* device only nexthop (no gateway) */
1607                if (cfg->nh_flags & RTNH_F_ONLINK) {
1608                        NL_SET_ERR_MSG(extack,
1609                                       "ONLINK flag can not be set for nexthop without a gateway");
1610                        goto out;
1611                }
1612        }
1613
1614        if (tb[NHA_ENCAP]) {
1615                cfg->nh_encap = tb[NHA_ENCAP];
1616
1617                if (!tb[NHA_ENCAP_TYPE]) {
1618                        NL_SET_ERR_MSG(extack, "LWT encapsulation type is missing");
1619                        goto out;
1620                }
1621
1622                cfg->nh_encap_type = nla_get_u16(tb[NHA_ENCAP_TYPE]);
1623                err = lwtunnel_valid_encap_type(cfg->nh_encap_type, extack);
1624                if (err < 0)
1625                        goto out;
1626
1627        } else if (tb[NHA_ENCAP_TYPE]) {
1628                NL_SET_ERR_MSG(extack, "LWT encapsulation attribute is missing");
1629                goto out;
1630        }
1631
1632
1633        err = 0;
1634out:
1635        return err;
1636}
1637
1638/* rtnl */
1639static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
1640                           struct netlink_ext_ack *extack)
1641{
1642        struct net *net = sock_net(skb->sk);
1643        struct nh_config cfg;
1644        struct nexthop *nh;
1645        int err;
1646
1647        err = rtm_to_nh_config(net, skb, nlh, &cfg, extack);
1648        if (!err) {
1649                nh = nexthop_add(net, &cfg, extack);
1650                if (IS_ERR(nh))
1651                        err = PTR_ERR(nh);
1652        }
1653
1654        return err;
1655}
1656
1657static int nh_valid_get_del_req(struct nlmsghdr *nlh, u32 *id,
1658                                struct netlink_ext_ack *extack)
1659{
1660        struct nhmsg *nhm = nlmsg_data(nlh);
1661        struct nlattr *tb[NHA_MAX + 1];
1662        int err, i;
1663
1664        err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy,
1665                          extack);
1666        if (err < 0)
1667                return err;
1668
1669        err = -EINVAL;
1670        for (i = 0; i < __NHA_MAX; ++i) {
1671                if (!tb[i])
1672                        continue;
1673
1674                switch (i) {
1675                case NHA_ID:
1676                        break;
1677                default:
1678                        NL_SET_ERR_MSG_ATTR(extack, tb[i],
1679                                            "Unexpected attribute in request");
1680                        goto out;
1681                }
1682        }
1683        if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) {
1684                NL_SET_ERR_MSG(extack, "Invalid values in header");
1685                goto out;
1686        }
1687
1688        if (!tb[NHA_ID]) {
1689                NL_SET_ERR_MSG(extack, "Nexthop id is missing");
1690                goto out;
1691        }
1692
1693        *id = nla_get_u32(tb[NHA_ID]);
1694        if (!(*id))
1695                NL_SET_ERR_MSG(extack, "Invalid nexthop id");
1696        else
1697                err = 0;
1698out:
1699        return err;
1700}
1701
1702/* rtnl */
1703static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
1704                           struct netlink_ext_ack *extack)
1705{
1706        struct net *net = sock_net(skb->sk);
1707        struct nl_info nlinfo = {
1708                .nlh = nlh,
1709                .nl_net = net,
1710                .portid = NETLINK_CB(skb).portid,
1711        };
1712        struct nexthop *nh;
1713        int err;
1714        u32 id;
1715
1716        err = nh_valid_get_del_req(nlh, &id, extack);
1717        if (err)
1718                return err;
1719
1720        nh = nexthop_find_by_id(net, id);
1721        if (!nh)
1722                return -ENOENT;
1723
1724        remove_nexthop(net, nh, &nlinfo);
1725
1726        return 0;
1727}
1728
1729/* rtnl */
1730static int rtm_get_nexthop(struct sk_buff *in_skb, struct nlmsghdr *nlh,
1731                           struct netlink_ext_ack *extack)
1732{
1733        struct net *net = sock_net(in_skb->sk);
1734        struct sk_buff *skb = NULL;
1735        struct nexthop *nh;
1736        int err;
1737        u32 id;
1738
1739        err = nh_valid_get_del_req(nlh, &id, extack);
1740        if (err)
1741                return err;
1742
1743        err = -ENOBUFS;
1744        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1745        if (!skb)
1746                goto out;
1747
1748        err = -ENOENT;
1749        nh = nexthop_find_by_id(net, id);
1750        if (!nh)
1751                goto errout_free;
1752
1753        err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP, NETLINK_CB(in_skb).portid,
1754                           nlh->nlmsg_seq, 0);
1755        if (err < 0) {
1756                WARN_ON(err == -EMSGSIZE);
1757                goto errout_free;
1758        }
1759
1760        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
1761out:
1762        return err;
1763errout_free:
1764        kfree_skb(skb);
1765        goto out;
1766}
1767
1768static bool nh_dump_filtered(struct nexthop *nh, int dev_idx, int master_idx,
1769                             bool group_filter, u8 family)
1770{
1771        const struct net_device *dev;
1772        const struct nh_info *nhi;
1773
1774        if (group_filter && !nh->is_group)
1775                return true;
1776
1777        if (!dev_idx && !master_idx && !family)
1778                return false;
1779
1780        if (nh->is_group)
1781                return true;
1782
1783        nhi = rtnl_dereference(nh->nh_info);
1784        if (family && nhi->family != family)
1785                return true;
1786
1787        dev = nhi->fib_nhc.nhc_dev;
1788        if (dev_idx && (!dev || dev->ifindex != dev_idx))
1789                return true;
1790
1791        if (master_idx) {
1792                struct net_device *master;
1793
1794                if (!dev)
1795                        return true;
1796
1797                master = netdev_master_upper_dev_get((struct net_device *)dev);
1798                if (!master || master->ifindex != master_idx)
1799                        return true;
1800        }
1801
1802        return false;
1803}
1804
1805static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx,
1806                             int *master_idx, bool *group_filter,
1807                             bool *fdb_filter, struct netlink_callback *cb)
1808{
1809        struct netlink_ext_ack *extack = cb->extack;
1810        struct nlattr *tb[NHA_MAX + 1];
1811        struct nhmsg *nhm;
1812        int err, i;
1813        u32 idx;
1814
1815        err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy,
1816                          NULL);
1817        if (err < 0)
1818                return err;
1819
1820        for (i = 0; i <= NHA_MAX; ++i) {
1821                if (!tb[i])
1822                        continue;
1823
1824                switch (i) {
1825                case NHA_OIF:
1826                        idx = nla_get_u32(tb[i]);
1827                        if (idx > INT_MAX) {
1828                                NL_SET_ERR_MSG(extack, "Invalid device index");
1829                                return -EINVAL;
1830                        }
1831                        *dev_idx = idx;
1832                        break;
1833                case NHA_MASTER:
1834                        idx = nla_get_u32(tb[i]);
1835                        if (idx > INT_MAX) {
1836                                NL_SET_ERR_MSG(extack, "Invalid master device index");
1837                                return -EINVAL;
1838                        }
1839                        *master_idx = idx;
1840                        break;
1841                case NHA_GROUPS:
1842                        *group_filter = true;
1843                        break;
1844                case NHA_FDB:
1845                        *fdb_filter = true;
1846                        break;
1847                default:
1848                        NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request");
1849                        return -EINVAL;
1850                }
1851        }
1852
1853        nhm = nlmsg_data(nlh);
1854        if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) {
1855                NL_SET_ERR_MSG(extack, "Invalid values in header for nexthop dump request");
1856                return -EINVAL;
1857        }
1858
1859        return 0;
1860}
1861
1862/* rtnl */
1863static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
1864{
1865        bool group_filter = false, fdb_filter = false;
1866        struct nhmsg *nhm = nlmsg_data(cb->nlh);
1867        int dev_filter_idx = 0, master_idx = 0;
1868        struct net *net = sock_net(skb->sk);
1869        struct rb_root *root = &net->nexthop.rb_root;
1870        struct rb_node *node;
1871        int idx = 0, s_idx;
1872        int err;
1873
1874        err = nh_valid_dump_req(cb->nlh, &dev_filter_idx, &master_idx,
1875                                &group_filter, &fdb_filter, cb);
1876        if (err < 0)
1877                return err;
1878
1879        s_idx = cb->args[0];
1880        for (node = rb_first(root); node; node = rb_next(node)) {
1881                struct nexthop *nh;
1882
1883                if (idx < s_idx)
1884                        goto cont;
1885
1886                nh = rb_entry(node, struct nexthop, rb_node);
1887                if (nh_dump_filtered(nh, dev_filter_idx, master_idx,
1888                                     group_filter, nhm->nh_family))
1889                        goto cont;
1890
1891                err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP,
1892                                   NETLINK_CB(cb->skb).portid,
1893                                   cb->nlh->nlmsg_seq, NLM_F_MULTI);
1894                if (err < 0) {
1895                        if (likely(skb->len))
1896                                goto out;
1897
1898                        goto out_err;
1899                }
1900cont:
1901                idx++;
1902        }
1903
1904out:
1905        err = skb->len;
1906out_err:
1907        cb->args[0] = idx;
1908        cb->seq = net->nexthop.seq;
1909        nl_dump_check_consistent(cb, nlmsg_hdr(skb));
1910
1911        return err;
1912}
1913
1914static void nexthop_sync_mtu(struct net_device *dev, u32 orig_mtu)
1915{
1916        unsigned int hash = nh_dev_hashfn(dev->ifindex);
1917        struct net *net = dev_net(dev);
1918        struct hlist_head *head = &net->nexthop.devhash[hash];
1919        struct hlist_node *n;
1920        struct nh_info *nhi;
1921
1922        hlist_for_each_entry_safe(nhi, n, head, dev_hash) {
1923                if (nhi->fib_nhc.nhc_dev == dev) {
1924                        if (nhi->family == AF_INET)
1925                                fib_nhc_update_mtu(&nhi->fib_nhc, dev->mtu,
1926                                                   orig_mtu);
1927                }
1928        }
1929}
1930
1931/* rtnl */
1932static int nh_netdev_event(struct notifier_block *this,
1933                           unsigned long event, void *ptr)
1934{
1935        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1936        struct netdev_notifier_info_ext *info_ext;
1937
1938        switch (event) {
1939        case NETDEV_DOWN:
1940        case NETDEV_UNREGISTER:
1941                nexthop_flush_dev(dev);
1942                break;
1943        case NETDEV_CHANGE:
1944                if (!(dev_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP)))
1945                        nexthop_flush_dev(dev);
1946                break;
1947        case NETDEV_CHANGEMTU:
1948                info_ext = ptr;
1949                nexthop_sync_mtu(dev, info_ext->ext.mtu);
1950                rt_cache_flush(dev_net(dev));
1951                break;
1952        }
1953        return NOTIFY_DONE;
1954}
1955
1956static struct notifier_block nh_netdev_notifier = {
1957        .notifier_call = nh_netdev_event,
1958};
1959
1960int register_nexthop_notifier(struct net *net, struct notifier_block *nb)
1961{
1962        return blocking_notifier_chain_register(&net->nexthop.notifier_chain,
1963                                                nb);
1964}
1965EXPORT_SYMBOL(register_nexthop_notifier);
1966
1967int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb)
1968{
1969        return blocking_notifier_chain_unregister(&net->nexthop.notifier_chain,
1970                                                  nb);
1971}
1972EXPORT_SYMBOL(unregister_nexthop_notifier);
1973
1974static void __net_exit nexthop_net_exit(struct net *net)
1975{
1976        rtnl_lock();
1977        flush_all_nexthops(net);
1978        rtnl_unlock();
1979        kfree(net->nexthop.devhash);
1980}
1981
1982static int __net_init nexthop_net_init(struct net *net)
1983{
1984        size_t sz = sizeof(struct hlist_head) * NH_DEV_HASHSIZE;
1985
1986        net->nexthop.rb_root = RB_ROOT;
1987        net->nexthop.devhash = kzalloc(sz, GFP_KERNEL);
1988        if (!net->nexthop.devhash)
1989                return -ENOMEM;
1990        BLOCKING_INIT_NOTIFIER_HEAD(&net->nexthop.notifier_chain);
1991
1992        return 0;
1993}
1994
1995static struct pernet_operations nexthop_net_ops = {
1996        .init = nexthop_net_init,
1997        .exit = nexthop_net_exit,
1998};
1999
2000static int __init nexthop_init(void)
2001{
2002        register_pernet_subsys(&nexthop_net_ops);
2003
2004        register_netdevice_notifier(&nh_netdev_notifier);
2005
2006        rtnl_register(PF_UNSPEC, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0);
2007        rtnl_register(PF_UNSPEC, RTM_DELNEXTHOP, rtm_del_nexthop, NULL, 0);
2008        rtnl_register(PF_UNSPEC, RTM_GETNEXTHOP, rtm_get_nexthop,
2009                      rtm_dump_nexthop, 0);
2010
2011        rtnl_register(PF_INET, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0);
2012        rtnl_register(PF_INET, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0);
2013
2014        rtnl_register(PF_INET6, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0);
2015        rtnl_register(PF_INET6, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0);
2016
2017        return 0;
2018}
2019subsys_initcall(nexthop_init);
2020