linux/net/ipv4/nexthop.c
// SPDX-License-Identifier: GPL-2.0
/* Generic nexthop implementation
 *
 * Copyright (c) 2017-19 Cumulus Networks
 * Copyright (c) 2017-19 David Ahern <dsa@cumulusnetworks.com>
 */

#include <linux/nexthop.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
#include <net/arp.h>
#include <net/ipv6_stubs.h>
#include <net/lwtunnel.h>
#include <net/ndisc.h>
#include <net/nexthop.h>
#include <net/route.h>
#include <net/sock.h>

#define NH_RES_DEFAULT_IDLE_TIMER       (120 * HZ)
#define NH_RES_DEFAULT_UNBALANCED_TIMER 0       /* No forced rebalancing. */

static void remove_nexthop(struct net *net, struct nexthop *nh,
                           struct nl_info *nlinfo);

#define NH_DEV_HASHBITS  8
#define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS)

static const struct nla_policy rtm_nh_policy_new[] = {
        [NHA_ID]                = { .type = NLA_U32 },
        [NHA_GROUP]             = { .type = NLA_BINARY },
        [NHA_GROUP_TYPE]        = { .type = NLA_U16 },
        [NHA_BLACKHOLE]         = { .type = NLA_FLAG },
        [NHA_OIF]               = { .type = NLA_U32 },
        [NHA_GATEWAY]           = { .type = NLA_BINARY },
        [NHA_ENCAP_TYPE]        = { .type = NLA_U16 },
        [NHA_ENCAP]             = { .type = NLA_NESTED },
        [NHA_FDB]               = { .type = NLA_FLAG },
        [NHA_RES_GROUP]         = { .type = NLA_NESTED },
};

static const struct nla_policy rtm_nh_policy_get[] = {
        [NHA_ID]                = { .type = NLA_U32 },
};

static const struct nla_policy rtm_nh_policy_dump[] = {
        [NHA_OIF]               = { .type = NLA_U32 },
        [NHA_GROUPS]            = { .type = NLA_FLAG },
        [NHA_MASTER]            = { .type = NLA_U32 },
        [NHA_FDB]               = { .type = NLA_FLAG },
};

static const struct nla_policy rtm_nh_res_policy_new[] = {
        [NHA_RES_GROUP_BUCKETS]                 = { .type = NLA_U16 },
        [NHA_RES_GROUP_IDLE_TIMER]              = { .type = NLA_U32 },
        [NHA_RES_GROUP_UNBALANCED_TIMER]        = { .type = NLA_U32 },
};
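
/* For context, the resilient-group attributes above are typically driven
 * from user space along these lines (an illustrative iproute2 sketch; the
 * ids, addresses and timer values are arbitrary):
 *
 *      ip nexthop add id 1 via 192.0.2.1 dev eth0
 *      ip nexthop add id 2 via 192.0.2.2 dev eth0
 *      ip nexthop add id 10 group 1/2 type resilient buckets 32 \
 *              idle_timer 120 unbalanced_timer 300
 */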

static const struct nla_policy rtm_nh_policy_dump_bucket[] = {
        [NHA_ID]                = { .type = NLA_U32 },
        [NHA_OIF]               = { .type = NLA_U32 },
        [NHA_MASTER]            = { .type = NLA_U32 },
        [NHA_RES_BUCKET]        = { .type = NLA_NESTED },
};

static const struct nla_policy rtm_nh_res_bucket_policy_dump[] = {
        [NHA_RES_BUCKET_NH_ID]  = { .type = NLA_U32 },
};

static const struct nla_policy rtm_nh_policy_get_bucket[] = {
        [NHA_ID]                = { .type = NLA_U32 },
        [NHA_RES_BUCKET]        = { .type = NLA_NESTED },
};

static const struct nla_policy rtm_nh_res_bucket_policy_get[] = {
        [NHA_RES_BUCKET_INDEX]  = { .type = NLA_U16 },
};

static bool nexthop_notifiers_is_empty(struct net *net)
{
        return !net->nexthop.notifier_chain.head;
}

static void
__nh_notifier_single_info_init(struct nh_notifier_single_info *nh_info,
                               const struct nh_info *nhi)
{
        nh_info->dev = nhi->fib_nhc.nhc_dev;
        nh_info->gw_family = nhi->fib_nhc.nhc_gw_family;
        if (nh_info->gw_family == AF_INET)
                nh_info->ipv4 = nhi->fib_nhc.nhc_gw.ipv4;
        else if (nh_info->gw_family == AF_INET6)
                nh_info->ipv6 = nhi->fib_nhc.nhc_gw.ipv6;

        nh_info->is_reject = nhi->reject_nh;
        nh_info->is_fdb = nhi->fdb_nh;
        nh_info->has_encap = !!nhi->fib_nhc.nhc_lwtstate;
}

static int nh_notifier_single_info_init(struct nh_notifier_info *info,
                                        const struct nexthop *nh)
{
        struct nh_info *nhi = rtnl_dereference(nh->nh_info);

        info->type = NH_NOTIFIER_INFO_TYPE_SINGLE;
        info->nh = kzalloc(sizeof(*info->nh), GFP_KERNEL);
        if (!info->nh)
                return -ENOMEM;

        __nh_notifier_single_info_init(info->nh, nhi);

        return 0;
}

static void nh_notifier_single_info_fini(struct nh_notifier_info *info)
{
        kfree(info->nh);
}

static int nh_notifier_mpath_info_init(struct nh_notifier_info *info,
                                       struct nh_group *nhg)
{
        u16 num_nh = nhg->num_nh;
        int i;

        info->type = NH_NOTIFIER_INFO_TYPE_GRP;
        info->nh_grp = kzalloc(struct_size(info->nh_grp, nh_entries, num_nh),
                               GFP_KERNEL);
        if (!info->nh_grp)
                return -ENOMEM;

        info->nh_grp->num_nh = num_nh;
        info->nh_grp->is_fdb = nhg->fdb_nh;

        for (i = 0; i < num_nh; i++) {
                struct nh_grp_entry *nhge = &nhg->nh_entries[i];
                struct nh_info *nhi;

                nhi = rtnl_dereference(nhge->nh->nh_info);
                info->nh_grp->nh_entries[i].id = nhge->nh->id;
                info->nh_grp->nh_entries[i].weight = nhge->weight;
                __nh_notifier_single_info_init(&info->nh_grp->nh_entries[i].nh,
                                               nhi);
        }

        return 0;
}

static int nh_notifier_res_table_info_init(struct nh_notifier_info *info,
                                           struct nh_group *nhg)
{
        struct nh_res_table *res_table = rtnl_dereference(nhg->res_table);
        u16 num_nh_buckets = res_table->num_nh_buckets;
        unsigned long size;
        u16 i;

        info->type = NH_NOTIFIER_INFO_TYPE_RES_TABLE;
        size = struct_size(info->nh_res_table, nhs, num_nh_buckets);
        info->nh_res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO |
                                       __GFP_NOWARN);
        if (!info->nh_res_table)
                return -ENOMEM;

        info->nh_res_table->num_nh_buckets = num_nh_buckets;

        for (i = 0; i < num_nh_buckets; i++) {
                struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
                struct nh_grp_entry *nhge;
                struct nh_info *nhi;

                nhge = rtnl_dereference(bucket->nh_entry);
                nhi = rtnl_dereference(nhge->nh->nh_info);
                __nh_notifier_single_info_init(&info->nh_res_table->nhs[i],
                                               nhi);
        }

        return 0;
}

static int nh_notifier_grp_info_init(struct nh_notifier_info *info,
                                     const struct nexthop *nh)
{
        struct nh_group *nhg = rtnl_dereference(nh->nh_grp);

        if (nhg->hash_threshold)
                return nh_notifier_mpath_info_init(info, nhg);
        else if (nhg->resilient)
                return nh_notifier_res_table_info_init(info, nhg);
        return -EINVAL;
}

static void nh_notifier_grp_info_fini(struct nh_notifier_info *info,
                                      const struct nexthop *nh)
{
        struct nh_group *nhg = rtnl_dereference(nh->nh_grp);

        if (nhg->hash_threshold)
                kfree(info->nh_grp);
        else if (nhg->resilient)
                vfree(info->nh_res_table);
}

static int nh_notifier_info_init(struct nh_notifier_info *info,
                                 const struct nexthop *nh)
{
        info->id = nh->id;

        if (nh->is_group)
                return nh_notifier_grp_info_init(info, nh);
        else
                return nh_notifier_single_info_init(info, nh);
}

static void nh_notifier_info_fini(struct nh_notifier_info *info,
                                  const struct nexthop *nh)
{
        if (nh->is_group)
                nh_notifier_grp_info_fini(info, nh);
        else
                nh_notifier_single_info_fini(info);
}

static int call_nexthop_notifiers(struct net *net,
                                  enum nexthop_event_type event_type,
                                  struct nexthop *nh,
                                  struct netlink_ext_ack *extack)
{
        struct nh_notifier_info info = {
                .net = net,
                .extack = extack,
        };
        int err;

        ASSERT_RTNL();

        if (nexthop_notifiers_is_empty(net))
                return 0;

        err = nh_notifier_info_init(&info, nh);
        if (err) {
                NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info");
                return err;
        }

        err = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
                                           event_type, &info);
        nh_notifier_info_fini(&info, nh);

        return notifier_to_errno(err);
}

static int
nh_notifier_res_bucket_idle_timer_get(const struct nh_notifier_info *info,
                                      bool force, unsigned int *p_idle_timer_ms)
{
        struct nh_res_table *res_table;
        struct nh_group *nhg;
        struct nexthop *nh;
        int err = 0;

        /* When 'force' is false, nexthop bucket replacement is performed
         * because the bucket was deemed to be idle. In this case, capable
         * listeners can choose to perform an atomic replacement: The bucket is
         * only replaced if it is inactive. However, if the idle timer interval
         * is smaller than the interval in which a listener is querying
         * buckets' activity from the device, then atomic replacement should
         * not be tried. Pass the idle timer value to listeners, so that they
         * could determine which type of replacement to perform.
         */
        if (force) {
                *p_idle_timer_ms = 0;
                return 0;
        }

        rcu_read_lock();

        nh = nexthop_find_by_id(info->net, info->id);
        if (!nh) {
                err = -EINVAL;
                goto out;
        }

        nhg = rcu_dereference(nh->nh_grp);
        res_table = rcu_dereference(nhg->res_table);
        *p_idle_timer_ms = jiffies_to_msecs(res_table->idle_timer);

out:
        rcu_read_unlock();

        return err;
}
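
/* Illustratively, a capable listener might use these fields to pick a
 * replacement type (a sketch, not taken from any in-tree driver;
 * MY_POLL_INTERVAL_MS is a hypothetical driver constant):
 *
 *      bool atomic = !info->nh_res_bucket->force &&
 *                    info->nh_res_bucket->idle_timer_ms >= MY_POLL_INTERVAL_MS;
 *
 * i.e. replace atomically only when the bucket idle timer is coarse enough
 * for the driver's own activity polling to be trusted.
 */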

static int nh_notifier_res_bucket_info_init(struct nh_notifier_info *info,
                                            u16 bucket_index, bool force,
                                            struct nh_info *oldi,
                                            struct nh_info *newi)
{
        unsigned int idle_timer_ms;
        int err;

        err = nh_notifier_res_bucket_idle_timer_get(info, force,
                                                    &idle_timer_ms);
        if (err)
                return err;

        info->type = NH_NOTIFIER_INFO_TYPE_RES_BUCKET;
        info->nh_res_bucket = kzalloc(sizeof(*info->nh_res_bucket),
                                      GFP_KERNEL);
        if (!info->nh_res_bucket)
                return -ENOMEM;

        info->nh_res_bucket->bucket_index = bucket_index;
        info->nh_res_bucket->idle_timer_ms = idle_timer_ms;
        info->nh_res_bucket->force = force;
        __nh_notifier_single_info_init(&info->nh_res_bucket->old_nh, oldi);
        __nh_notifier_single_info_init(&info->nh_res_bucket->new_nh, newi);
        return 0;
}

static void nh_notifier_res_bucket_info_fini(struct nh_notifier_info *info)
{
        kfree(info->nh_res_bucket);
}

static int __call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id,
                                               u16 bucket_index, bool force,
                                               struct nh_info *oldi,
                                               struct nh_info *newi,
                                               struct netlink_ext_ack *extack)
{
        struct nh_notifier_info info = {
                .net = net,
                .extack = extack,
                .id = nhg_id,
        };
        int err;

        if (nexthop_notifiers_is_empty(net))
                return 0;

        err = nh_notifier_res_bucket_info_init(&info, bucket_index, force,
                                               oldi, newi);
        if (err)
                return err;

        err = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
                                           NEXTHOP_EVENT_BUCKET_REPLACE, &info);
        nh_notifier_res_bucket_info_fini(&info);

        return notifier_to_errno(err);
}

/* There are three users of RES_TABLE, and NHs etc. referenced from there:
 *
 * 1) a collection of callbacks for NH maintenance. This operates under
 *    RTNL,
 * 2) the delayed work that gradually balances the resilient table,
 * 3) and nexthop_select_path(), operating under RCU.
 *
 * Both the delayed work and the RTNL block are writers, and need to
 * maintain mutual exclusion. Since there are only two and well-known
 * writers for each table, the RTNL code can make sure it has exclusive
 * access thus:
 *
 * - Have the DW operate without locking;
 * - synchronously cancel the DW;
 * - do the writing;
 * - if the write was not actually a delete, call upkeep, which schedules
 *   DW again if necessary.
 *
 * The functions that are always called from the RTNL context use
 * rtnl_dereference(). The functions that can also be called from the DW do
 * a raw dereference and rely on the above mutual exclusion scheme.
 */
#define nh_res_dereference(p) (rcu_dereference_raw(p))
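
/* Condensed, the RTNL-side write pattern described above looks like this
 * (a sketch; "was_delete" and the mutation in the middle stand in for the
 * real callbacks):
 *
 *      cancel_delayed_work_sync(&res_table->upkeep_dw);
 *      ... mutate the group / bucket table ...
 *      if (!was_delete)
 *              nh_res_table_upkeep(res_table, true, false);
 */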

static int call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id,
                                             u16 bucket_index, bool force,
                                             struct nexthop *old_nh,
                                             struct nexthop *new_nh,
                                             struct netlink_ext_ack *extack)
{
        struct nh_info *oldi = nh_res_dereference(old_nh->nh_info);
        struct nh_info *newi = nh_res_dereference(new_nh->nh_info);

        return __call_nexthop_res_bucket_notifiers(net, nhg_id, bucket_index,
                                                   force, oldi, newi, extack);
}

static int call_nexthop_res_table_notifiers(struct net *net, struct nexthop *nh,
                                            struct netlink_ext_ack *extack)
{
        struct nh_notifier_info info = {
                .net = net,
                .extack = extack,
        };
        struct nh_group *nhg;
        int err;

        ASSERT_RTNL();

        if (nexthop_notifiers_is_empty(net))
                return 0;

        /* At this point, the nexthop buckets are still not populated. Only
         * emit a notification with the logical nexthops, so that a listener
         * could potentially veto it in case of unsupported configuration.
         */
        nhg = rtnl_dereference(nh->nh_grp);
        err = nh_notifier_mpath_info_init(&info, nhg);
        if (err) {
                NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info");
                return err;
        }

        err = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
                                           NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE,
                                           &info);
        kfree(info.nh_grp);

        return notifier_to_errno(err);
}

static int call_nexthop_notifier(struct notifier_block *nb, struct net *net,
                                 enum nexthop_event_type event_type,
                                 struct nexthop *nh,
                                 struct netlink_ext_ack *extack)
{
        struct nh_notifier_info info = {
                .net = net,
                .extack = extack,
        };
        int err;

        err = nh_notifier_info_init(&info, nh);
        if (err)
                return err;

        err = nb->notifier_call(nb, event_type, &info);
        nh_notifier_info_fini(&info, nh);

        return notifier_to_errno(err);
}

static unsigned int nh_dev_hashfn(unsigned int val)
{
        unsigned int mask = NH_DEV_HASHSIZE - 1;

        return (val ^
                (val >> NH_DEV_HASHBITS) ^
                (val >> (NH_DEV_HASHBITS * 2))) & mask;
}
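
/* e.g. with NH_DEV_HASHBITS == 8, ifindex 0x12345 folds to
 * (0x45 ^ 0x23 ^ 0x01) & 0xff == 0x67.
 */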

static void nexthop_devhash_add(struct net *net, struct nh_info *nhi)
{
        struct net_device *dev = nhi->fib_nhc.nhc_dev;
        struct hlist_head *head;
        unsigned int hash;

        WARN_ON(!dev);

        hash = nh_dev_hashfn(dev->ifindex);
        head = &net->nexthop.devhash[hash];
        hlist_add_head(&nhi->dev_hash, head);
}

static void nexthop_free_group(struct nexthop *nh)
{
        struct nh_group *nhg;
        int i;

        nhg = rcu_dereference_raw(nh->nh_grp);
        for (i = 0; i < nhg->num_nh; ++i) {
                struct nh_grp_entry *nhge = &nhg->nh_entries[i];

                WARN_ON(!list_empty(&nhge->nh_list));
                nexthop_put(nhge->nh);
        }

        WARN_ON(nhg->spare == nhg);

        if (nhg->resilient)
                vfree(rcu_dereference_raw(nhg->res_table));

        kfree(nhg->spare);
        kfree(nhg);
}

static void nexthop_free_single(struct nexthop *nh)
{
        struct nh_info *nhi;

        nhi = rcu_dereference_raw(nh->nh_info);
        switch (nhi->family) {
        case AF_INET:
                fib_nh_release(nh->net, &nhi->fib_nh);
                break;
        case AF_INET6:
                ipv6_stub->fib6_nh_release(&nhi->fib6_nh);
                break;
        }
        kfree(nhi);
}

void nexthop_free_rcu(struct rcu_head *head)
{
        struct nexthop *nh = container_of(head, struct nexthop, rcu);

        if (nh->is_group)
                nexthop_free_group(nh);
        else
                nexthop_free_single(nh);

        kfree(nh);
}
EXPORT_SYMBOL_GPL(nexthop_free_rcu);

static struct nexthop *nexthop_alloc(void)
{
        struct nexthop *nh;

        nh = kzalloc(sizeof(struct nexthop), GFP_KERNEL);
        if (nh) {
                INIT_LIST_HEAD(&nh->fi_list);
                INIT_LIST_HEAD(&nh->f6i_list);
                INIT_LIST_HEAD(&nh->grp_list);
                INIT_LIST_HEAD(&nh->fdb_list);
        }
        return nh;
}

static struct nh_group *nexthop_grp_alloc(u16 num_nh)
{
        struct nh_group *nhg;

        nhg = kzalloc(struct_size(nhg, nh_entries, num_nh), GFP_KERNEL);
        if (nhg)
                nhg->num_nh = num_nh;

        return nhg;
}

static void nh_res_table_upkeep_dw(struct work_struct *work);

static struct nh_res_table *
nexthop_res_table_alloc(struct net *net, u32 nhg_id, struct nh_config *cfg)
{
        const u16 num_nh_buckets = cfg->nh_grp_res_num_buckets;
        struct nh_res_table *res_table;
        unsigned long size;

        size = struct_size(res_table, nh_buckets, num_nh_buckets);
        res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN);
        if (!res_table)
                return NULL;

        res_table->net = net;
        res_table->nhg_id = nhg_id;
        INIT_DELAYED_WORK(&res_table->upkeep_dw, &nh_res_table_upkeep_dw);
        INIT_LIST_HEAD(&res_table->uw_nh_entries);
        res_table->idle_timer = cfg->nh_grp_res_idle_timer;
        res_table->unbalanced_timer = cfg->nh_grp_res_unbalanced_timer;
        res_table->num_nh_buckets = num_nh_buckets;
        return res_table;
}

static void nh_base_seq_inc(struct net *net)
{
        while (++net->nexthop.seq == 0)
                ;
}

/* no reference taken; rcu lock or rtnl must be held */
struct nexthop *nexthop_find_by_id(struct net *net, u32 id)
{
        struct rb_node **pp, *parent = NULL, *next;

        pp = &net->nexthop.rb_root.rb_node;
        while (1) {
                struct nexthop *nh;

                next = rcu_dereference_raw(*pp);
                if (!next)
                        break;
                parent = next;

                nh = rb_entry(parent, struct nexthop, rb_node);
                if (id < nh->id)
                        pp = &next->rb_left;
                else if (id > nh->id)
                        pp = &next->rb_right;
                else
                        return nh;
        }
        return NULL;
}
EXPORT_SYMBOL_GPL(nexthop_find_by_id);
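
/* A minimal lookup sketch under RCU ("net" and "id" come from the caller;
 * per the comment above, no reference is taken):
 *
 *      rcu_read_lock();
 *      nh = nexthop_find_by_id(net, id);
 *      if (nh)
 *              ... use nh only within this read-side section ...
 *      rcu_read_unlock();
 */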

/* used for auto id allocation; called with rtnl held */
static u32 nh_find_unused_id(struct net *net)
{
        u32 id_start = net->nexthop.last_id_allocated;

        while (1) {
                net->nexthop.last_id_allocated++;
                if (net->nexthop.last_id_allocated == id_start)
                        break;

                if (!nexthop_find_by_id(net, net->nexthop.last_id_allocated))
                        return net->nexthop.last_id_allocated;
        }
        return 0;
}

static void nh_res_time_set_deadline(unsigned long next_time,
                                     unsigned long *deadline)
{
        if (time_before(next_time, *deadline))
                *deadline = next_time;
}

static clock_t nh_res_table_unbalanced_time(struct nh_res_table *res_table)
{
        if (list_empty(&res_table->uw_nh_entries))
                return 0;
        return jiffies_delta_to_clock_t(jiffies - res_table->unbalanced_since);
}

static int nla_put_nh_group_res(struct sk_buff *skb, struct nh_group *nhg)
{
        struct nh_res_table *res_table = rtnl_dereference(nhg->res_table);
        struct nlattr *nest;

        nest = nla_nest_start(skb, NHA_RES_GROUP);
        if (!nest)
                return -EMSGSIZE;

        if (nla_put_u16(skb, NHA_RES_GROUP_BUCKETS,
                        res_table->num_nh_buckets) ||
            nla_put_u32(skb, NHA_RES_GROUP_IDLE_TIMER,
                        jiffies_to_clock_t(res_table->idle_timer)) ||
            nla_put_u32(skb, NHA_RES_GROUP_UNBALANCED_TIMER,
                        jiffies_to_clock_t(res_table->unbalanced_timer)) ||
            nla_put_u64_64bit(skb, NHA_RES_GROUP_UNBALANCED_TIME,
                              nh_res_table_unbalanced_time(res_table),
                              NHA_RES_GROUP_PAD))
                goto nla_put_failure;

        nla_nest_end(skb, nest);
        return 0;

nla_put_failure:
        nla_nest_cancel(skb, nest);
        return -EMSGSIZE;
}

static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nhg)
{
        struct nexthop_grp *p;
        size_t len = nhg->num_nh * sizeof(*p);
        struct nlattr *nla;
        u16 group_type = 0;
        int i;

        if (nhg->hash_threshold)
                group_type = NEXTHOP_GRP_TYPE_MPATH;
        else if (nhg->resilient)
                group_type = NEXTHOP_GRP_TYPE_RES;

        if (nla_put_u16(skb, NHA_GROUP_TYPE, group_type))
                goto nla_put_failure;

        nla = nla_reserve(skb, NHA_GROUP, len);
        if (!nla)
                goto nla_put_failure;

        p = nla_data(nla);
        for (i = 0; i < nhg->num_nh; ++i) {
                p->id = nhg->nh_entries[i].nh->id;
                p->weight = nhg->nh_entries[i].weight - 1;
                p += 1;
        }

        if (nhg->resilient && nla_put_nh_group_res(skb, nhg))
                goto nla_put_failure;

        return 0;

nla_put_failure:
        return -EMSGSIZE;
}

static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
                        int event, u32 portid, u32 seq, unsigned int nlflags)
{
        struct fib6_nh *fib6_nh;
        struct fib_nh *fib_nh;
        struct nlmsghdr *nlh;
        struct nh_info *nhi;
        struct nhmsg *nhm;

        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags);
        if (!nlh)
                return -EMSGSIZE;

        nhm = nlmsg_data(nlh);
        nhm->nh_family = AF_UNSPEC;
        nhm->nh_flags = nh->nh_flags;
        nhm->nh_protocol = nh->protocol;
        nhm->nh_scope = 0;
        nhm->resvd = 0;

        if (nla_put_u32(skb, NHA_ID, nh->id))
                goto nla_put_failure;

        if (nh->is_group) {
                struct nh_group *nhg = rtnl_dereference(nh->nh_grp);

                if (nhg->fdb_nh && nla_put_flag(skb, NHA_FDB))
                        goto nla_put_failure;
                if (nla_put_nh_group(skb, nhg))
                        goto nla_put_failure;
                goto out;
        }

        nhi = rtnl_dereference(nh->nh_info);
        nhm->nh_family = nhi->family;
        if (nhi->reject_nh) {
                if (nla_put_flag(skb, NHA_BLACKHOLE))
                        goto nla_put_failure;
                goto out;
        } else if (nhi->fdb_nh) {
                if (nla_put_flag(skb, NHA_FDB))
                        goto nla_put_failure;
        } else {
                const struct net_device *dev;

                dev = nhi->fib_nhc.nhc_dev;
                if (dev && nla_put_u32(skb, NHA_OIF, dev->ifindex))
                        goto nla_put_failure;
        }

        nhm->nh_scope = nhi->fib_nhc.nhc_scope;
        switch (nhi->family) {
        case AF_INET:
                fib_nh = &nhi->fib_nh;
                if (fib_nh->fib_nh_gw_family &&
                    nla_put_be32(skb, NHA_GATEWAY, fib_nh->fib_nh_gw4))
                        goto nla_put_failure;
                break;

        case AF_INET6:
                fib6_nh = &nhi->fib6_nh;
                if (fib6_nh->fib_nh_gw_family &&
                    nla_put_in6_addr(skb, NHA_GATEWAY, &fib6_nh->fib_nh_gw6))
                        goto nla_put_failure;
                break;
        }

        if (nhi->fib_nhc.nhc_lwtstate &&
            lwtunnel_fill_encap(skb, nhi->fib_nhc.nhc_lwtstate,
                                NHA_ENCAP, NHA_ENCAP_TYPE) < 0)
                goto nla_put_failure;

out:
        nlmsg_end(skb, nlh);
        return 0;

nla_put_failure:
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}

static size_t nh_nlmsg_size_grp_res(struct nh_group *nhg)
{
        return nla_total_size(0) +      /* NHA_RES_GROUP */
                nla_total_size(2) +     /* NHA_RES_GROUP_BUCKETS */
                nla_total_size(4) +     /* NHA_RES_GROUP_IDLE_TIMER */
                nla_total_size(4) +     /* NHA_RES_GROUP_UNBALANCED_TIMER */
                nla_total_size_64bit(8);/* NHA_RES_GROUP_UNBALANCED_TIME */
}

static size_t nh_nlmsg_size_grp(struct nexthop *nh)
{
        struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
        size_t sz = sizeof(struct nexthop_grp) * nhg->num_nh;
        size_t tot = nla_total_size(sz) +
                nla_total_size(2); /* NHA_GROUP_TYPE */

        if (nhg->resilient)
                tot += nh_nlmsg_size_grp_res(nhg);

        return tot;
}

static size_t nh_nlmsg_size_single(struct nexthop *nh)
{
        struct nh_info *nhi = rtnl_dereference(nh->nh_info);
        size_t sz;

        /* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE
         * are mutually exclusive
         */
        sz = nla_total_size(4);  /* NHA_OIF */

        switch (nhi->family) {
        case AF_INET:
                if (nhi->fib_nh.fib_nh_gw_family)
                        sz += nla_total_size(4);  /* NHA_GATEWAY */
                break;

        case AF_INET6:
                /* NHA_GATEWAY */
                if (nhi->fib6_nh.fib_nh_gw_family)
                        sz += nla_total_size(sizeof(const struct in6_addr));
                break;
        }

        if (nhi->fib_nhc.nhc_lwtstate) {
                sz += lwtunnel_get_encap_size(nhi->fib_nhc.nhc_lwtstate);
                sz += nla_total_size(2);  /* NHA_ENCAP_TYPE */
        }

        return sz;
}

static size_t nh_nlmsg_size(struct nexthop *nh)
{
        size_t sz = NLMSG_ALIGN(sizeof(struct nhmsg));

        sz += nla_total_size(4); /* NHA_ID */

        if (nh->is_group)
                sz += nh_nlmsg_size_grp(nh);
        else
                sz += nh_nlmsg_size_single(nh);

        return sz;
}

static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info)
{
        unsigned int nlflags = info->nlh ? info->nlh->nlmsg_flags : 0;
        u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
        struct sk_buff *skb;
        int err = -ENOBUFS;

        skb = nlmsg_new(nh_nlmsg_size(nh), gfp_any());
        if (!skb)
                goto errout;

        err = nh_fill_node(skb, nh, event, info->portid, seq, nlflags);
        if (err < 0) {
                /* -EMSGSIZE implies BUG in nh_nlmsg_size() */
                WARN_ON(err == -EMSGSIZE);
                kfree_skb(skb);
                goto errout;
        }

        rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_NEXTHOP,
                    info->nlh, gfp_any());
        return;
errout:
        if (err < 0)
                rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err);
}

static unsigned long nh_res_bucket_used_time(const struct nh_res_bucket *bucket)
{
        return (unsigned long)atomic_long_read(&bucket->used_time);
}

static unsigned long
nh_res_bucket_idle_point(const struct nh_res_table *res_table,
                         const struct nh_res_bucket *bucket,
                         unsigned long now)
{
        unsigned long time = nh_res_bucket_used_time(bucket);

        /* Bucket was not used since it was migrated. The idle time is now. */
        if (time == bucket->migrated_time)
                return now;

        return time + res_table->idle_timer;
}

static unsigned long
nh_res_table_unb_point(const struct nh_res_table *res_table)
{
        return res_table->unbalanced_since + res_table->unbalanced_timer;
}

static void nh_res_bucket_set_idle(const struct nh_res_table *res_table,
                                   struct nh_res_bucket *bucket)
{
        unsigned long now = jiffies;

        atomic_long_set(&bucket->used_time, (long)now);
        bucket->migrated_time = now;
}

static void nh_res_bucket_set_busy(struct nh_res_bucket *bucket)
{
        atomic_long_set(&bucket->used_time, (long)jiffies);
}

static clock_t nh_res_bucket_idle_time(const struct nh_res_bucket *bucket)
{
        unsigned long used_time = nh_res_bucket_used_time(bucket);

        return jiffies_delta_to_clock_t(jiffies - used_time);
}

static int nh_fill_res_bucket(struct sk_buff *skb, struct nexthop *nh,
                              struct nh_res_bucket *bucket, u16 bucket_index,
                              int event, u32 portid, u32 seq,
                              unsigned int nlflags,
                              struct netlink_ext_ack *extack)
{
        struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry);
        struct nlmsghdr *nlh;
        struct nlattr *nest;
        struct nhmsg *nhm;

        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags);
        if (!nlh)
                return -EMSGSIZE;

        nhm = nlmsg_data(nlh);
        nhm->nh_family = AF_UNSPEC;
        nhm->nh_flags = bucket->nh_flags;
        nhm->nh_protocol = nh->protocol;
        nhm->nh_scope = 0;
        nhm->resvd = 0;

        if (nla_put_u32(skb, NHA_ID, nh->id))
                goto nla_put_failure;

        nest = nla_nest_start(skb, NHA_RES_BUCKET);
        if (!nest)
                goto nla_put_failure;

        if (nla_put_u16(skb, NHA_RES_BUCKET_INDEX, bucket_index) ||
            nla_put_u32(skb, NHA_RES_BUCKET_NH_ID, nhge->nh->id) ||
            nla_put_u64_64bit(skb, NHA_RES_BUCKET_IDLE_TIME,
                              nh_res_bucket_idle_time(bucket),
                              NHA_RES_BUCKET_PAD))
                goto nla_put_failure_nest;

        nla_nest_end(skb, nest);
        nlmsg_end(skb, nlh);
        return 0;

nla_put_failure_nest:
        nla_nest_cancel(skb, nest);
nla_put_failure:
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}

static void nexthop_bucket_notify(struct nh_res_table *res_table,
                                  u16 bucket_index)
{
        struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index];
        struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry);
        struct nexthop *nh = nhge->nh_parent;
        struct sk_buff *skb;
        int err = -ENOBUFS;

        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
        if (!skb)
                goto errout;

        err = nh_fill_res_bucket(skb, nh, bucket, bucket_index,
                                 RTM_NEWNEXTHOPBUCKET, 0, 0, NLM_F_REPLACE,
                                 NULL);
        if (err < 0) {
                kfree_skb(skb);
                goto errout;
        }

        rtnl_notify(skb, nh->net, 0, RTNLGRP_NEXTHOP, NULL, GFP_KERNEL);
        return;
errout:
        if (err < 0)
                rtnl_set_sk_err(nh->net, RTNLGRP_NEXTHOP, err);
}

static bool valid_group_nh(struct nexthop *nh, unsigned int npaths,
                           bool *is_fdb, struct netlink_ext_ack *extack)
{
        if (nh->is_group) {
                struct nh_group *nhg = rtnl_dereference(nh->nh_grp);

                /* Nesting groups within groups is not supported. */
                if (nhg->hash_threshold) {
                        NL_SET_ERR_MSG(extack,
                                       "Hash-threshold group can not be a nexthop within a group");
                        return false;
                }
                if (nhg->resilient) {
                        NL_SET_ERR_MSG(extack,
                                       "Resilient group can not be a nexthop within a group");
                        return false;
                }
                *is_fdb = nhg->fdb_nh;
        } else {
                struct nh_info *nhi = rtnl_dereference(nh->nh_info);

                if (nhi->reject_nh && npaths > 1) {
                        NL_SET_ERR_MSG(extack,
                                       "Blackhole nexthop can not be used in a group with more than 1 path");
                        return false;
                }
                *is_fdb = nhi->fdb_nh;
        }

        return true;
}

static int nh_check_attr_fdb_group(struct nexthop *nh, u8 *nh_family,
                                   struct netlink_ext_ack *extack)
{
        struct nh_info *nhi;

        nhi = rtnl_dereference(nh->nh_info);

        if (!nhi->fdb_nh) {
                NL_SET_ERR_MSG(extack, "FDB nexthop group can only have fdb nexthops");
                return -EINVAL;
        }

        if (*nh_family == AF_UNSPEC) {
                *nh_family = nhi->family;
        } else if (*nh_family != nhi->family) {
                NL_SET_ERR_MSG(extack, "FDB nexthop group cannot have mixed family nexthops");
                return -EINVAL;
        }

        return 0;
}

static int nh_check_attr_group(struct net *net,
                               struct nlattr *tb[], size_t tb_size,
                               u16 nh_grp_type, struct netlink_ext_ack *extack)
{
        unsigned int len = nla_len(tb[NHA_GROUP]);
        u8 nh_family = AF_UNSPEC;
        struct nexthop_grp *nhg;
        unsigned int i, j;
        u8 nhg_fdb = 0;

        if (!len || len & (sizeof(struct nexthop_grp) - 1)) {
                NL_SET_ERR_MSG(extack,
                               "Invalid length for nexthop group attribute");
                return -EINVAL;
        }

        /* convert len to number of nexthop ids */
        len /= sizeof(*nhg);

        nhg = nla_data(tb[NHA_GROUP]);
        for (i = 0; i < len; ++i) {
                if (nhg[i].resvd1 || nhg[i].resvd2) {
                        NL_SET_ERR_MSG(extack, "Reserved fields in nexthop_grp must be 0");
                        return -EINVAL;
                }
                if (nhg[i].weight > 254) {
                        NL_SET_ERR_MSG(extack, "Invalid value for weight");
                        return -EINVAL;
                }
                for (j = i + 1; j < len; ++j) {
                        if (nhg[i].id == nhg[j].id) {
                                NL_SET_ERR_MSG(extack, "Nexthop id can not be used twice in a group");
                                return -EINVAL;
                        }
                }
        }

        if (tb[NHA_FDB])
                nhg_fdb = 1;
        nhg = nla_data(tb[NHA_GROUP]);
        for (i = 0; i < len; ++i) {
                struct nexthop *nh;
                bool is_fdb_nh;

                nh = nexthop_find_by_id(net, nhg[i].id);
                if (!nh) {
                        NL_SET_ERR_MSG(extack, "Invalid nexthop id");
                        return -EINVAL;
                }
                if (!valid_group_nh(nh, len, &is_fdb_nh, extack))
                        return -EINVAL;

                if (nhg_fdb && nh_check_attr_fdb_group(nh, &nh_family, extack))
                        return -EINVAL;

                if (!nhg_fdb && is_fdb_nh) {
                        NL_SET_ERR_MSG(extack, "Non FDB nexthop group cannot have fdb nexthops");
                        return -EINVAL;
                }
        }
        for (i = NHA_GROUP_TYPE + 1; i < tb_size; ++i) {
                if (!tb[i])
                        continue;
                switch (i) {
                case NHA_FDB:
                        continue;
                case NHA_RES_GROUP:
                        if (nh_grp_type == NEXTHOP_GRP_TYPE_RES)
                                continue;
                        break;
                }
                NL_SET_ERR_MSG(extack,
                               "No other attributes can be set in nexthop groups");
                return -EINVAL;
        }

        return 0;
}

static bool ipv6_good_nh(const struct fib6_nh *nh)
{
        int state = NUD_REACHABLE;
        struct neighbour *n;

        rcu_read_lock_bh();

        n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev, &nh->fib_nh_gw6);
        if (n)
                state = n->nud_state;

        rcu_read_unlock_bh();

        return !!(state & NUD_VALID);
}

static bool ipv4_good_nh(const struct fib_nh *nh)
{
        int state = NUD_REACHABLE;
        struct neighbour *n;

        rcu_read_lock_bh();

        n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev,
                                      (__force u32)nh->fib_nh_gw4);
        if (n)
                state = n->nud_state;

        rcu_read_unlock_bh();

        return !!(state & NUD_VALID);
}

static struct nexthop *nexthop_select_path_hthr(struct nh_group *nhg, int hash)
{
        struct nexthop *rc = NULL;
        int i;

        for (i = 0; i < nhg->num_nh; ++i) {
                struct nh_grp_entry *nhge = &nhg->nh_entries[i];
                struct nh_info *nhi;

                if (hash > atomic_read(&nhge->hthr.upper_bound))
                        continue;

                nhi = rcu_dereference(nhge->nh->nh_info);
                if (nhi->fdb_nh)
                        return nhge->nh;

                /* nexthop groups always check if the nexthop is good and
                 * do not rely on a sysctl for this behavior
                 */
                switch (nhi->family) {
                case AF_INET:
                        if (ipv4_good_nh(&nhi->fib_nh))
                                return nhge->nh;
                        break;
                case AF_INET6:
                        if (ipv6_good_nh(&nhi->fib6_nh))
                                return nhge->nh;
                        break;
                }

                if (!rc)
                        rc = nhge->nh;
        }

        return rc;
}
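
/* With two equal-weight entries, the hthr upper bounds split the hash
 * space roughly in half (RFC 2992 hash-threshold), e.g.:
 *
 *      entry 0: upper_bound ~ INT_MAX / 2      hash 0 .. INT_MAX / 2
 *      entry 1: upper_bound = INT_MAX          the remaining hash values
 *
 * The neighbour checks above can skip a selected but "bad" entry; "rc"
 * then falls back to the first in-range entry.
 */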

static struct nexthop *nexthop_select_path_res(struct nh_group *nhg, int hash)
{
        struct nh_res_table *res_table = rcu_dereference(nhg->res_table);
        u16 bucket_index = hash % res_table->num_nh_buckets;
        struct nh_res_bucket *bucket;
        struct nh_grp_entry *nhge;

        /* nexthop_select_path() is expected to return a non-NULL value, so
         * skip protocol validation and just hand out whatever there is.
         */
        bucket = &res_table->nh_buckets[bucket_index];
        nh_res_bucket_set_busy(bucket);
        nhge = rcu_dereference(bucket->nh_entry);
        return nhge->nh;
}

struct nexthop *nexthop_select_path(struct nexthop *nh, int hash)
{
        struct nh_group *nhg;

        if (!nh->is_group)
                return nh;

        nhg = rcu_dereference(nh->nh_grp);
        if (nhg->hash_threshold)
                return nexthop_select_path_hthr(nhg, hash);
        else if (nhg->resilient)
                return nexthop_select_path_res(nhg, hash);

        /* Unreachable. */
        return NULL;
}
EXPORT_SYMBOL_GPL(nexthop_select_path);
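
/* Datapath usage, sketched ("hash" would typically come from
 * fib_multipath_hash() or its IPv6 counterpart):
 *
 *      rcu_read_lock();
 *      nh = nexthop_select_path(nh, hash);
 *      ... map nh to its nh_info / fib_nhc for forwarding ...
 *      rcu_read_unlock();
 */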

int nexthop_for_each_fib6_nh(struct nexthop *nh,
                             int (*cb)(struct fib6_nh *nh, void *arg),
                             void *arg)
{
        struct nh_info *nhi;
        int err;

        if (nh->is_group) {
                struct nh_group *nhg;
                int i;

                nhg = rcu_dereference_rtnl(nh->nh_grp);
                for (i = 0; i < nhg->num_nh; i++) {
                        struct nh_grp_entry *nhge = &nhg->nh_entries[i];

                        nhi = rcu_dereference_rtnl(nhge->nh->nh_info);
                        err = cb(&nhi->fib6_nh, arg);
                        if (err)
                                return err;
                }
        } else {
                nhi = rcu_dereference_rtnl(nh->nh_info);
                err = cb(&nhi->fib6_nh, arg);
                if (err)
                        return err;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(nexthop_for_each_fib6_nh);

static int check_src_addr(const struct in6_addr *saddr,
                          struct netlink_ext_ack *extack)
{
        if (!ipv6_addr_any(saddr)) {
                NL_SET_ERR_MSG(extack, "IPv6 routes using source address can not use nexthop objects");
                return -EINVAL;
        }
        return 0;
}

int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg,
                       struct netlink_ext_ack *extack)
{
        struct nh_info *nhi;
        bool is_fdb_nh;

        /* fib6_src is unique to a fib6_info and limits the ability to cache
         * routes in fib6_nh within a nexthop that is potentially shared
         * across multiple fib entries. If the config wants to use source
         * routing it can not use nexthop objects. mlxsw also does not allow
         * fib6_src on routes.
         */
        if (cfg && check_src_addr(&cfg->fc_src, extack) < 0)
                return -EINVAL;

        if (nh->is_group) {
                struct nh_group *nhg;

                nhg = rtnl_dereference(nh->nh_grp);
                if (nhg->has_v4)
                        goto no_v4_nh;
                is_fdb_nh = nhg->fdb_nh;
        } else {
                nhi = rtnl_dereference(nh->nh_info);
                if (nhi->family == AF_INET)
                        goto no_v4_nh;
                is_fdb_nh = nhi->fdb_nh;
        }

        if (is_fdb_nh) {
                NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
                return -EINVAL;
        }

        return 0;
no_v4_nh:
        NL_SET_ERR_MSG(extack, "IPv6 routes can not use an IPv4 nexthop");
        return -EINVAL;
}
EXPORT_SYMBOL_GPL(fib6_check_nexthop);

/* if existing nexthop has ipv6 routes linked to it, need
 * to verify this new spec works with ipv6
 */
static int fib6_check_nh_list(struct nexthop *old, struct nexthop *new,
                              struct netlink_ext_ack *extack)
{
        struct fib6_info *f6i;

        if (list_empty(&old->f6i_list))
                return 0;

        list_for_each_entry(f6i, &old->f6i_list, nh_list) {
                if (check_src_addr(&f6i->fib6_src.addr, extack) < 0)
                        return -EINVAL;
        }

        return fib6_check_nexthop(new, NULL, extack);
}

static int nexthop_check_scope(struct nh_info *nhi, u8 scope,
                               struct netlink_ext_ack *extack)
{
        if (scope == RT_SCOPE_HOST && nhi->fib_nhc.nhc_gw_family) {
                NL_SET_ERR_MSG(extack,
                               "Route with host scope can not have a gateway");
                return -EINVAL;
        }

        if (nhi->fib_nhc.nhc_flags & RTNH_F_ONLINK && scope >= RT_SCOPE_LINK) {
                NL_SET_ERR_MSG(extack, "Scope mismatch with nexthop");
                return -EINVAL;
        }

        return 0;
}

/* Invoked by fib add code to verify nexthop by id is ok with
 * config for prefix; parts of fib_check_nh not done when nexthop
 * object is used.
 */
int fib_check_nexthop(struct nexthop *nh, u8 scope,
                      struct netlink_ext_ack *extack)
{
        struct nh_info *nhi;
        int err = 0;

        if (nh->is_group) {
                struct nh_group *nhg;

                nhg = rtnl_dereference(nh->nh_grp);
                if (nhg->fdb_nh) {
                        NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
                        err = -EINVAL;
                        goto out;
                }

                if (scope == RT_SCOPE_HOST) {
                        NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops");
                        err = -EINVAL;
                        goto out;
                }

                /* all nexthops in a group have the same scope */
                nhi = rtnl_dereference(nhg->nh_entries[0].nh->nh_info);
                err = nexthop_check_scope(nhi, scope, extack);
        } else {
                nhi = rtnl_dereference(nh->nh_info);
                if (nhi->fdb_nh) {
                        NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
                        err = -EINVAL;
                        goto out;
                }
                err = nexthop_check_scope(nhi, scope, extack);
        }

out:
        return err;
}

static int fib_check_nh_list(struct nexthop *old, struct nexthop *new,
                             struct netlink_ext_ack *extack)
{
        struct fib_info *fi;

        list_for_each_entry(fi, &old->fi_list, nh_list) {
                int err;

                err = fib_check_nexthop(new, fi->fib_scope, extack);
                if (err)
                        return err;
        }
        return 0;
}

static bool nh_res_nhge_is_balanced(const struct nh_grp_entry *nhge)
{
        return nhge->res.count_buckets == nhge->res.wants_buckets;
}

static bool nh_res_nhge_is_ow(const struct nh_grp_entry *nhge)
{
        return nhge->res.count_buckets > nhge->res.wants_buckets;
}

static bool nh_res_nhge_is_uw(const struct nh_grp_entry *nhge)
{
        return nhge->res.count_buckets < nhge->res.wants_buckets;
}

static bool nh_res_table_is_balanced(const struct nh_res_table *res_table)
{
        return list_empty(&res_table->uw_nh_entries);
}

static void nh_res_bucket_unset_nh(struct nh_res_bucket *bucket)
{
        struct nh_grp_entry *nhge;

        if (bucket->occupied) {
                nhge = nh_res_dereference(bucket->nh_entry);
                nhge->res.count_buckets--;
                bucket->occupied = false;
        }
}

static void nh_res_bucket_set_nh(struct nh_res_bucket *bucket,
                                 struct nh_grp_entry *nhge)
{
        nh_res_bucket_unset_nh(bucket);

        bucket->occupied = true;
        rcu_assign_pointer(bucket->nh_entry, nhge);
        nhge->res.count_buckets++;
}

static bool nh_res_bucket_should_migrate(struct nh_res_table *res_table,
                                         struct nh_res_bucket *bucket,
                                         unsigned long *deadline, bool *force)
{
        unsigned long now = jiffies;
        struct nh_grp_entry *nhge;
        unsigned long idle_point;

        if (!bucket->occupied) {
                /* The bucket is not occupied, its NHGE pointer is either
                 * NULL or obsolete. We _have to_ migrate: set force.
                 */
                *force = true;
                return true;
        }

        nhge = nh_res_dereference(bucket->nh_entry);

        /* If the bucket is populated by an underweight or balanced
         * nexthop, do not migrate.
         */
        if (!nh_res_nhge_is_ow(nhge))
                return false;

        /* At this point we know that the bucket is populated with an
         * overweight nexthop. It needs to be migrated to a new nexthop if
         * the idle timer or the unbalanced timer expired.
         */
1470
1471        idle_point = nh_res_bucket_idle_point(res_table, bucket, now);
1472        if (time_after_eq(now, idle_point)) {
1473                /* The bucket is idle. We _can_ migrate: unset force. */
1474                *force = false;
1475                return true;
1476        }
1477
1478        /* Unbalanced timer of 0 means "never force". */
1479        if (res_table->unbalanced_timer) {
1480                unsigned long unb_point;
1481
1482                unb_point = nh_res_table_unb_point(res_table);
1483                if (time_after(now, unb_point)) {
1484                        /* The bucket is not idle, but the unbalanced timer
1485                         * expired. We _can_ migrate, but set force anyway,
1486                         * so that drivers know to ignore activity reports
1487                         * from the HW.
1488                         */
1489                        *force = true;
1490                        return true;
1491                }
1492
1493                nh_res_time_set_deadline(unb_point, deadline);
1494        }
1495
1496        nh_res_time_set_deadline(idle_point, deadline);
1497        return false;
1498}
1499
1500static bool nh_res_bucket_migrate(struct nh_res_table *res_table,
1501                                  u16 bucket_index, bool notify,
1502                                  bool notify_nl, bool force)
1503{
1504        struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index];
1505        struct nh_grp_entry *new_nhge;
1506        struct netlink_ext_ack extack;
1507        int err;
1508
1509        new_nhge = list_first_entry_or_null(&res_table->uw_nh_entries,
1510                                            struct nh_grp_entry,
1511                                            res.uw_nh_entry);
1512        if (WARN_ON_ONCE(!new_nhge))
1513                /* If this function is called, "bucket" is either not
1514                 * occupied, or it belongs to a next hop that is
1515                 * overweight. In either case, there ought to be a
1516                 * corresponding underweight next hop.
1517                 */
1518                return false;
1519
1520        if (notify) {
1521                struct nh_grp_entry *old_nhge;
1522
1523                old_nhge = nh_res_dereference(bucket->nh_entry);
1524                err = call_nexthop_res_bucket_notifiers(res_table->net,
1525                                                        res_table->nhg_id,
1526                                                        bucket_index, force,
1527                                                        old_nhge->nh,
1528                                                        new_nhge->nh, &extack);
1529                if (err) {
1530                        pr_err_ratelimited("%s\n", extack._msg);
1531                        if (!force)
1532                                return false;
1533                        /* It is not possible to veto a forced replacement, so
1534                         * just clear the hardware flags from the nexthop
1535                         * bucket to indicate to user space that this bucket is
1536                         * not correctly populated in hardware.
1537                         */
1538                        bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP);
1539                }
1540        }
1541
1542        nh_res_bucket_set_nh(bucket, new_nhge);
1543        nh_res_bucket_set_idle(res_table, bucket);
1544
1545        if (notify_nl)
1546                nexthop_bucket_notify(res_table, bucket_index);
1547
1548        if (nh_res_nhge_is_balanced(new_nhge))
1549                list_del(&new_nhge->res.uw_nh_entry);
1550        return true;
1551}
1552
1553#define NH_RES_UPKEEP_DW_MINIMUM_INTERVAL (HZ / 2)
1554
1555static void nh_res_table_upkeep(struct nh_res_table *res_table,
1556                                bool notify, bool notify_nl)
1557{
1558        unsigned long now = jiffies;
1559        unsigned long deadline;
1560        u16 i;
1561
1562        /* Deadline is the next time that upkeep should be run. It is the
1563         * earliest time at which one of the buckets might be migrated.
1564         * Start at the most pessimistic estimate: either unbalanced_timer
1565         * from now, or if there is none, idle_timer from now. For each
1566         * encountered time point, call nh_res_time_set_deadline() to
1567         * refine the estimate.
1568         */
1569        if (res_table->unbalanced_timer)
1570                deadline = now + res_table->unbalanced_timer;
1571        else
1572                deadline = now + res_table->idle_timer;
1573
1574        for (i = 0; i < res_table->num_nh_buckets; i++) {
1575                struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
1576                bool force;
1577
1578                if (nh_res_bucket_should_migrate(res_table, bucket,
1579                                                 &deadline, &force)) {
1580                        if (!nh_res_bucket_migrate(res_table, i, notify,
1581                                                   notify_nl, force)) {
1582                                unsigned long idle_point;
1583
1584                                /* A driver can override the migration
1585                                 * decision if the HW reports that the
1586                                 * bucket is actually not idle. Therefore
1587                                 * re-mark the bucket as busy again and
1588                                 * update the deadline.
1589                                 */
1590                                nh_res_bucket_set_busy(bucket);
1591                                idle_point = nh_res_bucket_idle_point(res_table,
1592                                                                      bucket,
1593                                                                      now);
1594                                nh_res_time_set_deadline(idle_point, &deadline);
1595                        }
1596                }
1597        }
1598
1599        /* If the group is still unbalanced, schedule the next upkeep to
1600         * either the deadline computed above, or the minimum deadline,
1601         * whichever comes later.
1602         */
1603        if (!nh_res_table_is_balanced(res_table)) {
1604                unsigned long now = jiffies;
1605                unsigned long min_deadline;
1606
1607                min_deadline = now + NH_RES_UPKEEP_DW_MINIMUM_INTERVAL;
1608                if (time_before(deadline, min_deadline))
1609                        deadline = min_deadline;
1610
1611                queue_delayed_work(system_power_efficient_wq,
1612                                   &res_table->upkeep_dw, deadline - now);
1613        }
1614}
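/* Scheduling example (illustrative): if the loop above refined the
 * deadline down to now + 100 jiffies but the group is still unbalanced,
 * then with HZ = 1000 the clamp against min_deadline = now + 500 wins
 * and upkeep is re-queued 500 jiffies out. The minimum interval keeps a
 * persistently unbalanced table from re-running upkeep more than about
 * twice per second.
 */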
1615
1616static void nh_res_table_upkeep_dw(struct work_struct *work)
1617{
1618        struct delayed_work *dw = to_delayed_work(work);
1619        struct nh_res_table *res_table;
1620
1621        res_table = container_of(dw, struct nh_res_table, upkeep_dw);
1622        nh_res_table_upkeep(res_table, true, true);
1623}
1624
1625static void nh_res_table_cancel_upkeep(struct nh_res_table *res_table)
1626{
1627        cancel_delayed_work_sync(&res_table->upkeep_dw);
1628}
1629
1630static void nh_res_group_rebalance(struct nh_group *nhg,
1631                                   struct nh_res_table *res_table)
1632{
1633        int prev_upper_bound = 0;
1634        int total = 0;
1635        int w = 0;
1636        int i;
1637
1638        INIT_LIST_HEAD(&res_table->uw_nh_entries);
1639
1640        for (i = 0; i < nhg->num_nh; ++i)
1641                total += nhg->nh_entries[i].weight;
1642
1643        for (i = 0; i < nhg->num_nh; ++i) {
1644                struct nh_grp_entry *nhge = &nhg->nh_entries[i];
1645                int upper_bound;
1646
1647                w += nhge->weight;
1648                upper_bound = DIV_ROUND_CLOSEST(res_table->num_nh_buckets * w,
1649                                                total);
1650                nhge->res.wants_buckets = upper_bound - prev_upper_bound;
1651                prev_upper_bound = upper_bound;
1652
1653                if (nh_res_nhge_is_uw(nhge)) {
1654                        if (list_empty(&res_table->uw_nh_entries))
1655                                res_table->unbalanced_since = jiffies;
1656                        list_add(&nhge->res.uw_nh_entry,
1657                                 &res_table->uw_nh_entries);
1658                }
1659        }
1660}
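/* Worked example (illustrative): entries with weights 1 and 3 over 8
 * buckets give total = 4. After entry 0, w = 1 and upper_bound =
 * DIV_ROUND_CLOSEST(8 * 1, 4) = 2, so it wants 2 buckets; after entry 1,
 * w = 4 and upper_bound = 8, so it wants 8 - 2 = 6. An entry that
 * currently holds fewer buckets than it wants is underweight and is
 * queued on uw_nh_entries for upkeep to satisfy.
 */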
1661
1662/* Migrate buckets in res_table so that they reference NHGE's from NHG with
1663 * the right NH ID. Set those buckets that do not have a corresponding NHGE
1664 * entry in NHG as not occupied.
1665 */
1666static void nh_res_table_migrate_buckets(struct nh_res_table *res_table,
1667                                         struct nh_group *nhg)
1668{
1669        u16 i;
1670
1671        for (i = 0; i < res_table->num_nh_buckets; i++) {
1672                struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
1673                u32 id = rtnl_dereference(bucket->nh_entry)->nh->id;
1674                bool found = false;
1675                int j;
1676
1677                for (j = 0; j < nhg->num_nh; j++) {
1678                        struct nh_grp_entry *nhge = &nhg->nh_entries[j];
1679
1680                        if (nhge->nh->id == id) {
1681                                nh_res_bucket_set_nh(bucket, nhge);
1682                                found = true;
1683                                break;
1684                        }
1685                }
1686
1687                if (!found)
1688                        nh_res_bucket_unset_nh(bucket);
1689        }
1690}
1691
1692static void replace_nexthop_grp_res(struct nh_group *oldg,
1693                                    struct nh_group *newg)
1694{
1695        /* For NH group replacement, the new NHG might only have a stub
1696         * hash table with 0 buckets, because the number of buckets was not
1697         * specified. For NH removal, oldg and newg both reference the same
1698         * res_table. So in any case, in the following, we want to work
1699         * with oldg->res_table.
1700         */
1701        struct nh_res_table *old_res_table = rtnl_dereference(oldg->res_table);
1702        unsigned long prev_unbalanced_since = old_res_table->unbalanced_since;
1703        bool prev_has_uw = !list_empty(&old_res_table->uw_nh_entries);
1704
1705        nh_res_table_cancel_upkeep(old_res_table);
1706        nh_res_table_migrate_buckets(old_res_table, newg);
1707        nh_res_group_rebalance(newg, old_res_table);
1708        if (prev_has_uw && !list_empty(&old_res_table->uw_nh_entries))
1709                old_res_table->unbalanced_since = prev_unbalanced_since;
1710        nh_res_table_upkeep(old_res_table, true, false);
1711}
1712
1713static void nh_hthr_group_rebalance(struct nh_group *nhg)
1714{
1715        int total = 0;
1716        int w = 0;
1717        int i;
1718
1719        for (i = 0; i < nhg->num_nh; ++i)
1720                total += nhg->nh_entries[i].weight;
1721
1722        for (i = 0; i < nhg->num_nh; ++i) {
1723                struct nh_grp_entry *nhge = &nhg->nh_entries[i];
1724                int upper_bound;
1725
1726                w += nhge->weight;
1727                upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1;
1728                atomic_set(&nhge->hthr.upper_bound, upper_bound);
1729        }
1730}
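/* Worked example (illustrative): weights 2 and 1 give total = 3.
 * Entry 0 gets upper_bound = DIV_ROUND_CLOSEST_ULL((u64)2 << 31, 3) - 1
 * = 1431655764, about 2/3 of the 31-bit hash space; entry 1 gets
 * ((u64)3 << 31) / 3 - 1 = 0x7fffffff. A flow whose multipath hash is
 * at or below an entry's bound (and above the previous entry's bound)
 * selects that entry, so traffic splits roughly 2:1.
 */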
1731
1732static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge,
1733                                struct nl_info *nlinfo)
1734{
1735        struct nh_grp_entry *nhges, *new_nhges;
1736        struct nexthop *nhp = nhge->nh_parent;
1737        struct netlink_ext_ack extack;
1738        struct nexthop *nh = nhge->nh;
1739        struct nh_group *nhg, *newg;
1740        int i, j, err;
1741
1742        WARN_ON(!nh);
1743
1744        nhg = rtnl_dereference(nhp->nh_grp);
1745        newg = nhg->spare;
1746
1747        /* last entry, keep it visible and remove the parent */
1748        if (nhg->num_nh == 1) {
1749                remove_nexthop(net, nhp, nlinfo);
1750                return;
1751        }
1752
1753        newg->has_v4 = false;
1754        newg->is_multipath = nhg->is_multipath;
1755        newg->hash_threshold = nhg->hash_threshold;
1756        newg->resilient = nhg->resilient;
1757        newg->fdb_nh = nhg->fdb_nh;
1758        newg->num_nh = nhg->num_nh;
1759
1760        /* copy old entries to new except the one getting removed */
1761        nhges = nhg->nh_entries;
1762        new_nhges = newg->nh_entries;
1763        for (i = 0, j = 0; i < nhg->num_nh; ++i) {
1764                struct nh_info *nhi;
1765
1766                /* current nexthop getting removed */
1767                if (nhg->nh_entries[i].nh == nh) {
1768                        newg->num_nh--;
1769                        continue;
1770                }
1771
1772                nhi = rtnl_dereference(nhges[i].nh->nh_info);
1773                if (nhi->family == AF_INET)
1774                        newg->has_v4 = true;
1775
1776                list_del(&nhges[i].nh_list);
1777                new_nhges[j].nh_parent = nhges[i].nh_parent;
1778                new_nhges[j].nh = nhges[i].nh;
1779                new_nhges[j].weight = nhges[i].weight;
1780                list_add(&new_nhges[j].nh_list, &new_nhges[j].nh->grp_list);
1781                j++;
1782        }
1783
1784        if (newg->hash_threshold)
1785                nh_hthr_group_rebalance(newg);
1786        else if (newg->resilient)
1787                replace_nexthop_grp_res(nhg, newg);
1788
1789        rcu_assign_pointer(nhp->nh_grp, newg);
1790
1791        list_del(&nhge->nh_list);
1792        nexthop_put(nhge->nh);
1793
1794        /* Removal of a NH from a resilient group is notified through
1795         * bucket notifications.
1796         */
1797        if (newg->hash_threshold) {
1798                err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp,
1799                                             &extack);
1800                if (err)
1801                        pr_err("%s\n", extack._msg);
1802        }
1803
1804        if (nlinfo)
1805                nexthop_notify(RTM_NEWNEXTHOP, nhp, nlinfo);
1806}
1807
1808static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh,
1809                                       struct nl_info *nlinfo)
1810{
1811        struct nh_grp_entry *nhge, *tmp;
1812
1813        list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list)
1814                remove_nh_grp_entry(net, nhge, nlinfo);
1815
1816        /* make sure all see the newly published array before releasing rtnl */
1817        synchronize_net();
1818}
1819
1820static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo)
1821{
1822        struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp);
1823        struct nh_res_table *res_table;
1824        int i, num_nh = nhg->num_nh;
1825
1826        for (i = 0; i < num_nh; ++i) {
1827                struct nh_grp_entry *nhge = &nhg->nh_entries[i];
1828
1829                if (WARN_ON(!nhge->nh))
1830                        continue;
1831
1832                list_del_init(&nhge->nh_list);
1833        }
1834
1835        if (nhg->resilient) {
1836                res_table = rtnl_dereference(nhg->res_table);
1837                nh_res_table_cancel_upkeep(res_table);
1838        }
1839}
1840
1841/* not called for nexthop replace */
1842static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
1843{
1844        struct fib6_info *f6i, *tmp;
1845        bool do_flush = false;
1846        struct fib_info *fi;
1847
1848        list_for_each_entry(fi, &nh->fi_list, nh_list) {
1849                fi->fib_flags |= RTNH_F_DEAD;
1850                do_flush = true;
1851        }
1852        if (do_flush)
1853                fib_flush(net);
1854
1855        /* ip6_del_rt removes the entry from this list hence the _safe */
1856        list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) {
1857                /* __ip6_del_rt does a release, so do a hold here */
1858                fib6_info_hold(f6i);
1859                ipv6_stub->ip6_del_rt(net, f6i,
1860                                      !net->ipv4.sysctl_nexthop_compat_mode);
1861        }
1862}
1863
1864static void __remove_nexthop(struct net *net, struct nexthop *nh,
1865                             struct nl_info *nlinfo)
1866{
1867        __remove_nexthop_fib(net, nh);
1868
1869        if (nh->is_group) {
1870                remove_nexthop_group(nh, nlinfo);
1871        } else {
1872                struct nh_info *nhi;
1873
1874                nhi = rtnl_dereference(nh->nh_info);
1875                if (nhi->fib_nhc.nhc_dev)
1876                        hlist_del(&nhi->dev_hash);
1877
1878                remove_nexthop_from_groups(net, nh, nlinfo);
1879        }
1880}
1881
1882static void remove_nexthop(struct net *net, struct nexthop *nh,
1883                           struct nl_info *nlinfo)
1884{
1885        call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh, NULL);
1886
1887        /* remove from the tree */
1888        rb_erase(&nh->rb_node, &net->nexthop.rb_root);
1889
1890        if (nlinfo)
1891                nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo);
1892
1893        __remove_nexthop(net, nh, nlinfo);
1894        nh_base_seq_inc(net);
1895
1896        nexthop_put(nh);
1897}
1898
1899/* if any FIB entries reference this nexthop, any dst entries
1900 * need to be regenerated
1901 */
1902static void nh_rt_cache_flush(struct net *net, struct nexthop *nh)
1903{
1904        struct fib6_info *f6i;
1905
1906        if (!list_empty(&nh->fi_list))
1907                rt_cache_flush(net);
1908
1909        list_for_each_entry(f6i, &nh->f6i_list, nh_list)
1910                ipv6_stub->fib6_update_sernum(net, f6i);
1911}
1912
1913static int replace_nexthop_grp(struct net *net, struct nexthop *old,
1914                               struct nexthop *new, const struct nh_config *cfg,
1915                               struct netlink_ext_ack *extack)
1916{
1917        struct nh_res_table *tmp_table = NULL;
1918        struct nh_res_table *new_res_table;
1919        struct nh_res_table *old_res_table;
1920        struct nh_group *oldg, *newg;
1921        int i, err;
1922
1923        if (!new->is_group) {
1924                NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with a nexthop.");
1925                return -EINVAL;
1926        }
1927
1928        oldg = rtnl_dereference(old->nh_grp);
1929        newg = rtnl_dereference(new->nh_grp);
1930
1931        if (newg->hash_threshold != oldg->hash_threshold) {
1932                NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with one of a different type.");
1933                return -EINVAL;
1934        }
1935
1936        if (newg->hash_threshold) {
1937                err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new,
1938                                             extack);
1939                if (err)
1940                        return err;
1941        } else if (newg->resilient) {
1942                new_res_table = rtnl_dereference(newg->res_table);
1943                old_res_table = rtnl_dereference(oldg->res_table);
1944
1945                /* Accept if num_nh_buckets was not given, but if it was
1946                 * given, demand that the value be correct.
1947                 */
1948                if (cfg->nh_grp_res_has_num_buckets &&
1949                    cfg->nh_grp_res_num_buckets !=
1950                    old_res_table->num_nh_buckets) {
1951                        NL_SET_ERR_MSG(extack, "Can not change number of buckets of a resilient nexthop group.");
1952                        return -EINVAL;
1953                }
1954
1955                /* Emit a pre-replace notification so that listeners could veto
1956                 * a potentially unsupported configuration. Otherwise,
1957                 * individual bucket replacement notifications would need to be
1958                 * vetoed, which is something that should only happen if the
1959                 * bucket is currently active.
1960                 */
1961                err = call_nexthop_res_table_notifiers(net, new, extack);
1962                if (err)
1963                        return err;
1964
1965                if (cfg->nh_grp_res_has_idle_timer)
1966                        old_res_table->idle_timer = cfg->nh_grp_res_idle_timer;
1967                if (cfg->nh_grp_res_has_unbalanced_timer)
1968                        old_res_table->unbalanced_timer =
1969                                cfg->nh_grp_res_unbalanced_timer;
1970
1971                replace_nexthop_grp_res(oldg, newg);
1972
1973                tmp_table = new_res_table;
1974                rcu_assign_pointer(newg->res_table, old_res_table);
1975                rcu_assign_pointer(newg->spare->res_table, old_res_table);
1976        }
1977
1978        /* update parents - used by nexthop code for cleanup */
1979        for (i = 0; i < newg->num_nh; i++)
1980                newg->nh_entries[i].nh_parent = old;
1981
1982        rcu_assign_pointer(old->nh_grp, newg);
1983
1984        if (newg->resilient) {
1985                /* Make sure concurrent readers are not using 'oldg' anymore. */
1986                synchronize_net();
1987                rcu_assign_pointer(oldg->res_table, tmp_table);
1988                rcu_assign_pointer(oldg->spare->res_table, tmp_table);
1989        }
1990
1991        for (i = 0; i < oldg->num_nh; i++)
1992                oldg->nh_entries[i].nh_parent = new;
1993
1994        rcu_assign_pointer(new->nh_grp, oldg);
1995
1996        return 0;
1997}
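/* Ownership sketch (illustrative): when the replacement request did not
 * name a bucket count, newg arrives with a stub res_table. The code
 * above hands old_res_table, its buckets already migrated, to newg and
 * its spare, and only after synchronize_net() parks the stub on the now
 * invisible oldg, so readers never observe a group without its
 * populated table.
 */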
1998
1999static void nh_group_v4_update(struct nh_group *nhg)
2000{
2001        struct nh_grp_entry *nhges;
2002        bool has_v4 = false;
2003        int i;
2004
2005        nhges = nhg->nh_entries;
2006        for (i = 0; i < nhg->num_nh; i++) {
2007                struct nh_info *nhi;
2008
2009                nhi = rtnl_dereference(nhges[i].nh->nh_info);
2010                if (nhi->family == AF_INET)
2011                        has_v4 = true;
2012        }
2013        nhg->has_v4 = has_v4;
2014}
2015
2016static int replace_nexthop_single_notify_res(struct net *net,
2017                                             struct nh_res_table *res_table,
2018                                             struct nexthop *old,
2019                                             struct nh_info *oldi,
2020                                             struct nh_info *newi,
2021                                             struct netlink_ext_ack *extack)
2022{
2023        u32 nhg_id = res_table->nhg_id;
2024        int err;
2025        u16 i;
2026
2027        for (i = 0; i < res_table->num_nh_buckets; i++) {
2028                struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
2029                struct nh_grp_entry *nhge;
2030
2031                nhge = rtnl_dereference(bucket->nh_entry);
2032                if (nhge->nh == old) {
2033                        err = __call_nexthop_res_bucket_notifiers(net, nhg_id,
2034                                                                  i, true,
2035                                                                  oldi, newi,
2036                                                                  extack);
2037                        if (err)
2038                                goto err_notify;
2039                }
2040        }
2041
2042        return 0;
2043
2044err_notify:
2045        while (i-- > 0) {
2046                struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
2047                struct nh_grp_entry *nhge;
2048
2049                nhge = rtnl_dereference(bucket->nh_entry);
2050                if (nhge->nh == old)
2051                        __call_nexthop_res_bucket_notifiers(net, nhg_id, i,
2052                                                            true, newi, oldi,
2053                                                            extack);
2054        }
2055        return err;
2056}
2057
2058static int replace_nexthop_single_notify(struct net *net,
2059                                         struct nexthop *group_nh,
2060                                         struct nexthop *old,
2061                                         struct nh_info *oldi,
2062                                         struct nh_info *newi,
2063                                         struct netlink_ext_ack *extack)
2064{
2065        struct nh_group *nhg = rtnl_dereference(group_nh->nh_grp);
2066        struct nh_res_table *res_table;
2067
2068        if (nhg->hash_threshold) {
2069                return call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE,
2070                                              group_nh, extack);
2071        } else if (nhg->resilient) {
2072                res_table = rtnl_dereference(nhg->res_table);
2073                return replace_nexthop_single_notify_res(net, res_table,
2074                                                         old, oldi, newi,
2075                                                         extack);
2076        }
2077
2078        return -EINVAL;
2079}
2080
2081static int replace_nexthop_single(struct net *net, struct nexthop *old,
2082                                  struct nexthop *new,
2083                                  struct netlink_ext_ack *extack)
2084{
2085        u8 old_protocol, old_nh_flags;
2086        struct nh_info *oldi, *newi;
2087        struct nh_grp_entry *nhge;
2088        int err;
2089
2090        if (new->is_group) {
2091                NL_SET_ERR_MSG(extack, "Can not replace a nexthop with a nexthop group.");
2092                return -EINVAL;
2093        }
2094
2095        err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack);
2096        if (err)
2097                return err;
2098
2099        /* Hardware flags were set on 'old' as 'new' is not in the red-black
2100         * tree. Therefore, inherit the flags from 'old' to 'new'.
2101         */
2102        new->nh_flags |= old->nh_flags & (RTNH_F_OFFLOAD | RTNH_F_TRAP);
2103
2104        oldi = rtnl_dereference(old->nh_info);
2105        newi = rtnl_dereference(new->nh_info);
2106
2107        newi->nh_parent = old;
2108        oldi->nh_parent = new;
2109
2110        old_protocol = old->protocol;
2111        old_nh_flags = old->nh_flags;
2112
2113        old->protocol = new->protocol;
2114        old->nh_flags = new->nh_flags;
2115
2116        rcu_assign_pointer(old->nh_info, newi);
2117        rcu_assign_pointer(new->nh_info, oldi);
2118
2119        /* Send a replace notification for all the groups using the nexthop. */
2120        list_for_each_entry(nhge, &old->grp_list, nh_list) {
2121                struct nexthop *nhp = nhge->nh_parent;
2122
2123                err = replace_nexthop_single_notify(net, nhp, old, oldi, newi,
2124                                                    extack);
2125                if (err)
2126                        goto err_notify;
2127        }
2128
2129        /* When replacing an IPv4 nexthop with an IPv6 nexthop, potentially
2130         * update IPv4 indication in all the groups using the nexthop.
2131         */
2132        if (oldi->family == AF_INET && newi->family == AF_INET6) {
2133                list_for_each_entry(nhge, &old->grp_list, nh_list) {
2134                        struct nexthop *nhp = nhge->nh_parent;
2135                        struct nh_group *nhg;
2136
2137                        nhg = rtnl_dereference(nhp->nh_grp);
2138                        nh_group_v4_update(nhg);
2139                }
2140        }
2141
2142        return 0;
2143
2144err_notify:
2145        rcu_assign_pointer(new->nh_info, newi);
2146        rcu_assign_pointer(old->nh_info, oldi);
2147        old->nh_flags = old_nh_flags;
2148        old->protocol = old_protocol;
2149        oldi->nh_parent = old;
2150        newi->nh_parent = new;
2151        list_for_each_entry_continue_reverse(nhge, &old->grp_list, nh_list) {
2152                struct nexthop *nhp = nhge->nh_parent;
2153
2154                replace_nexthop_single_notify(net, nhp, old, newi, oldi, NULL);
2155        }
2156        call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, old, extack);
2157        return err;
2158}
2159
2160static void __nexthop_replace_notify(struct net *net, struct nexthop *nh,
2161                                     struct nl_info *info)
2162{
2163        struct fib6_info *f6i;
2164
2165        if (!list_empty(&nh->fi_list)) {
2166                struct fib_info *fi;
2167
2168                /* expectation is a few fib_info per nexthop and then
2169                 * a lot of routes per fib_info. So mark the fib_info
2170                 * and then walk the fib tables once
2171                 */
2172                list_for_each_entry(fi, &nh->fi_list, nh_list)
2173                        fi->nh_updated = true;
2174
2175                fib_info_notify_update(net, info);
2176
2177                list_for_each_entry(fi, &nh->fi_list, nh_list)
2178                        fi->nh_updated = false;
2179        }
2180
2181        list_for_each_entry(f6i, &nh->f6i_list, nh_list)
2182                ipv6_stub->fib6_rt_update(net, f6i, info);
2183}
2184
2185/* send RTM_NEWROUTE with REPLACE flag set for all FIB entries
2186 * linked to this nexthop and for all groups that the nexthop
2187 * is a member of
2188 */
2189static void nexthop_replace_notify(struct net *net, struct nexthop *nh,
2190                                   struct nl_info *info)
2191{
2192        struct nh_grp_entry *nhge;
2193
2194        __nexthop_replace_notify(net, nh, info);
2195
2196        list_for_each_entry(nhge, &nh->grp_list, nh_list)
2197                __nexthop_replace_notify(net, nhge->nh_parent, info);
2198}
2199
2200static int replace_nexthop(struct net *net, struct nexthop *old,
2201                           struct nexthop *new, const struct nh_config *cfg,
2202                           struct netlink_ext_ack *extack)
2203{
2204        bool new_is_reject = false;
2205        struct nh_grp_entry *nhge;
2206        int err;
2207
2208        /* check that existing FIB entries are ok with the
2209         * new nexthop definition
2210         */
2211        err = fib_check_nh_list(old, new, extack);
2212        if (err)
2213                return err;
2214
2215        err = fib6_check_nh_list(old, new, extack);
2216        if (err)
2217                return err;
2218
2219        if (!new->is_group) {
2220                struct nh_info *nhi = rtnl_dereference(new->nh_info);
2221
2222                new_is_reject = nhi->reject_nh;
2223        }
2224
2225        list_for_each_entry(nhge, &old->grp_list, nh_list) {
2226                /* if new nexthop is a blackhole, any groups using this
2227                 * nexthop cannot have more than 1 path
2228                 */
2229                if (new_is_reject &&
2230                    nexthop_num_path(nhge->nh_parent) > 1) {
2231                        NL_SET_ERR_MSG(extack, "Blackhole nexthop can not be a member of a group with more than one path");
2232                        return -EINVAL;
2233                }
2234
2235                err = fib_check_nh_list(nhge->nh_parent, new, extack);
2236                if (err)
2237                        return err;
2238
2239                err = fib6_check_nh_list(nhge->nh_parent, new, extack);
2240                if (err)
2241                        return err;
2242        }
2243
2244        if (old->is_group)
2245                err = replace_nexthop_grp(net, old, new, cfg, extack);
2246        else
2247                err = replace_nexthop_single(net, old, new, extack);
2248
2249        if (!err) {
2250                nh_rt_cache_flush(net, old);
2251
2252                __remove_nexthop(net, new, NULL);
2253                nexthop_put(new);
2254        }
2255
2256        return err;
2257}
2258
2259/* called with rtnl_lock held */
2260static int insert_nexthop(struct net *net, struct nexthop *new_nh,
2261                          struct nh_config *cfg, struct netlink_ext_ack *extack)
2262{
2263        struct rb_node **pp, *parent = NULL, *next;
2264        struct rb_root *root = &net->nexthop.rb_root;
2265        bool replace = !!(cfg->nlflags & NLM_F_REPLACE);
2266        bool create = !!(cfg->nlflags & NLM_F_CREATE);
2267        u32 new_id = new_nh->id;
2268        int replace_notify = 0;
2269        int rc = -EEXIST;
2270
2271        pp = &root->rb_node;
2272        while (1) {
2273                struct nexthop *nh;
2274
2275                next = *pp;
2276                if (!next)
2277                        break;
2278
2279                parent = next;
2280
2281                nh = rb_entry(parent, struct nexthop, rb_node);
2282                if (new_id < nh->id) {
2283                        pp = &next->rb_left;
2284                } else if (new_id > nh->id) {
2285                        pp = &next->rb_right;
2286                } else if (replace) {
2287                        rc = replace_nexthop(net, nh, new_nh, cfg, extack);
2288                        if (!rc) {
2289                                new_nh = nh; /* send notification with old nh */
2290                                replace_notify = 1;
2291                        }
2292                        goto out;
2293                } else {
2294                        /* id already exists and not a replace */
2295                        goto out;
2296                }
2297        }
2298
2299        if (replace && !create) {
2300                NL_SET_ERR_MSG(extack, "Replace specified without create and no entry exists");
2301                rc = -ENOENT;
2302                goto out;
2303        }
2304
2305        if (new_nh->is_group) {
2306                struct nh_group *nhg = rtnl_dereference(new_nh->nh_grp);
2307                struct nh_res_table *res_table;
2308
2309                if (nhg->resilient) {
2310                        res_table = rtnl_dereference(nhg->res_table);
2311
2312                        /* Not passing the number of buckets is OK when
2313                         * replacing, but not when creating a new group.
2314                         */
2315                        if (!cfg->nh_grp_res_has_num_buckets) {
2316                                NL_SET_ERR_MSG(extack, "Number of buckets not specified for nexthop group insertion");
2317                                rc = -EINVAL;
2318                                goto out;
2319                        }
2320
2321                        nh_res_group_rebalance(nhg, res_table);
2322
2323                        /* Do not send bucket notifications, we do full
2324                         * notification below.
2325                         */
2326                        nh_res_table_upkeep(res_table, false, false);
2327                }
2328        }
2329
2330        rb_link_node_rcu(&new_nh->rb_node, parent, pp);
2331        rb_insert_color(&new_nh->rb_node, root);
2332
2333        /* The initial insertion is a full notification for hash-threshold as
2334         * well as resilient groups.
2335         */
2336        rc = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new_nh, extack);
2337        if (rc)
2338                rb_erase(&new_nh->rb_node, &net->nexthop.rb_root);
2339
2340out:
2341        if (!rc) {
2342                nh_base_seq_inc(net);
2343                nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo);
2344                if (replace_notify && net->ipv4.sysctl_nexthop_compat_mode)
2345                        nexthop_replace_notify(net, new_nh, &cfg->nlinfo);
2346        }
2347
2348        return rc;
2349}
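/* Userspace view (illustrative; syntax per ip-nexthop(8)):
 *
 *   $ ip nexthop add id 1 via 192.0.2.1 dev eth0      # NLM_F_CREATE
 *   $ ip nexthop replace id 1 via 192.0.2.2 dev eth0  # NLM_F_REPLACE
 *
 * An "add" whose id already exists fails with EEXIST in the rb-tree walk
 * above; a request carrying NLM_F_REPLACE without NLM_F_CREATE returns
 * ENOENT when no entry with that id exists.
 */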
2350
2351/* rtnl */
2352/* remove all nexthops tied to a device being deleted */
2353static void nexthop_flush_dev(struct net_device *dev, unsigned long event)
2354{
2355        unsigned int hash = nh_dev_hashfn(dev->ifindex);
2356        struct net *net = dev_net(dev);
2357        struct hlist_head *head = &net->nexthop.devhash[hash];
2358        struct hlist_node *n;
2359        struct nh_info *nhi;
2360
2361        hlist_for_each_entry_safe(nhi, n, head, dev_hash) {
2362                if (nhi->fib_nhc.nhc_dev != dev)
2363                        continue;
2364
2365                if (nhi->reject_nh &&
2366                    (event == NETDEV_DOWN || event == NETDEV_CHANGE))
2367                        continue;
2368
2369                remove_nexthop(net, nhi->nh_parent, NULL);
2370        }
2371}
2372
2373/* rtnl; called when net namespace is deleted */
2374static void flush_all_nexthops(struct net *net)
2375{
2376        struct rb_root *root = &net->nexthop.rb_root;
2377        struct rb_node *node;
2378        struct nexthop *nh;
2379
2380        while ((node = rb_first(root))) {
2381                nh = rb_entry(node, struct nexthop, rb_node);
2382                remove_nexthop(net, nh, NULL);
2383                cond_resched();
2384        }
2385}
2386
2387static struct nexthop *nexthop_create_group(struct net *net,
2388                                            struct nh_config *cfg)
2389{
2390        struct nlattr *grps_attr = cfg->nh_grp;
2391        struct nexthop_grp *entry = nla_data(grps_attr);
2392        u16 num_nh = nla_len(grps_attr) / sizeof(*entry);
2393        struct nh_group *nhg;
2394        struct nexthop *nh;
2395        int err;
2396        int i;
2397
2398        if (WARN_ON(!num_nh))
2399                return ERR_PTR(-EINVAL);
2400
2401        nh = nexthop_alloc();
2402        if (!nh)
2403                return ERR_PTR(-ENOMEM);
2404
2405        nh->is_group = 1;
2406
2407        nhg = nexthop_grp_alloc(num_nh);
2408        if (!nhg) {
2409                kfree(nh);
2410                return ERR_PTR(-ENOMEM);
2411        }
2412
2413        /* spare group used for removals */
2414        nhg->spare = nexthop_grp_alloc(num_nh);
2415        if (!nhg->spare) {
2416                kfree(nhg);
2417                kfree(nh);
2418                return ERR_PTR(-ENOMEM);
2419        }
2420        nhg->spare->spare = nhg;
2421
2422        for (i = 0; i < nhg->num_nh; ++i) {
2423                struct nexthop *nhe;
2424                struct nh_info *nhi;
2425
2426                nhe = nexthop_find_by_id(net, entry[i].id);
2427                if (!nexthop_get(nhe)) {
2428                        err = -ENOENT;
2429                        goto out_no_nh;
2430                }
2431
2432                nhi = rtnl_dereference(nhe->nh_info);
2433                if (nhi->family == AF_INET)
2434                        nhg->has_v4 = true;
2435
2436                nhg->nh_entries[i].nh = nhe;
2437                nhg->nh_entries[i].weight = entry[i].weight + 1;
2438                list_add(&nhg->nh_entries[i].nh_list, &nhe->grp_list);
2439                nhg->nh_entries[i].nh_parent = nh;
2440        }
2441
2442        if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) {
2443                nhg->hash_threshold = 1;
2444                nhg->is_multipath = true;
2445        } else if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES) {
2446                struct nh_res_table *res_table;
2447
2448                res_table = nexthop_res_table_alloc(net, cfg->nh_id, cfg);
2449                if (!res_table) {
2450                        err = -ENOMEM;
2451                        goto out_no_nh;
2452                }
2453
2454                rcu_assign_pointer(nhg->spare->res_table, res_table);
2455                rcu_assign_pointer(nhg->res_table, res_table);
2456                nhg->resilient = true;
2457                nhg->is_multipath = true;
2458        }
2459
2460        WARN_ON_ONCE(nhg->hash_threshold + nhg->resilient != 1);
2461
2462        if (nhg->hash_threshold)
2463                nh_hthr_group_rebalance(nhg);
2464
2465        if (cfg->nh_fdb)
2466                nhg->fdb_nh = 1;
2467
2468        rcu_assign_pointer(nh->nh_grp, nhg);
2469
2470        return nh;
2471
2472out_no_nh:
2473        for (i--; i >= 0; --i) {
2474                list_del(&nhg->nh_entries[i].nh_list);
2475                nexthop_put(nhg->nh_entries[i].nh);
2476        }
2477
2478        kfree(nhg->spare);
2479        kfree(nhg);
2480        kfree(nh);
2481
2482        return ERR_PTR(err);
2483}
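/* Userspace view (illustrative; syntax per ip-nexthop(8)):
 *
 *   $ ip nexthop add id 10 group 1/2                   # hash-threshold
 *   $ ip nexthop add id 11 group 1,3/2,1 type resilient buckets 8
 *
 * Note the "+ 1" when copying entry[i].weight above: struct nexthop_grp
 * carries a u8 weight of (actual weight - 1), so an on-wire 0 means
 * weight 1 and the representable range is 1..256.
 */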
2484
2485static int nh_create_ipv4(struct net *net, struct nexthop *nh,
2486                          struct nh_info *nhi, struct nh_config *cfg,
2487                          struct netlink_ext_ack *extack)
2488{
2489        struct fib_nh *fib_nh = &nhi->fib_nh;
2490        struct fib_config fib_cfg = {
2491                .fc_oif   = cfg->nh_ifindex,
2492                .fc_gw4   = cfg->gw.ipv4,
2493                .fc_gw_family = cfg->gw.ipv4 ? AF_INET : 0,
2494                .fc_flags = cfg->nh_flags,
2495                .fc_nlinfo = cfg->nlinfo,
2496                .fc_encap = cfg->nh_encap,
2497                .fc_encap_type = cfg->nh_encap_type,
2498        };
2499        u32 tb_id = (cfg->dev ? l3mdev_fib_table(cfg->dev) : RT_TABLE_MAIN);
2500        int err;
2501
2502        err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack);
2503        if (err) {
2504                fib_nh_release(net, fib_nh);
2505                goto out;
2506        }
2507
2508        if (nhi->fdb_nh)
2509                goto out;
2510
2511        /* sets nh_dev if successful */
2512        err = fib_check_nh(net, fib_nh, tb_id, 0, extack);
2513        if (!err) {
2514                nh->nh_flags = fib_nh->fib_nh_flags;
2515                fib_info_update_nhc_saddr(net, &fib_nh->nh_common,
2516                                          fib_nh->fib_nh_scope);
2517        } else {
2518                fib_nh_release(net, fib_nh);
2519        }
2520out:
2521        return err;
2522}
2523
2524static int nh_create_ipv6(struct net *net, struct nexthop *nh,
2525                          struct nh_info *nhi, struct nh_config *cfg,
2526                          struct netlink_ext_ack *extack)
2527{
2528        struct fib6_nh *fib6_nh = &nhi->fib6_nh;
2529        struct fib6_config fib6_cfg = {
2530                .fc_table = l3mdev_fib_table(cfg->dev),
2531                .fc_ifindex = cfg->nh_ifindex,
2532                .fc_gateway = cfg->gw.ipv6,
2533                .fc_flags = cfg->nh_flags,
2534                .fc_nlinfo = cfg->nlinfo,
2535                .fc_encap = cfg->nh_encap,
2536                .fc_encap_type = cfg->nh_encap_type,
2537                .fc_is_fdb = cfg->nh_fdb,
2538        };
2539        int err;
2540
2541        if (!ipv6_addr_any(&cfg->gw.ipv6))
2542                fib6_cfg.fc_flags |= RTF_GATEWAY;
2543
2544        /* sets nh_dev if successful */
2545        err = ipv6_stub->fib6_nh_init(net, fib6_nh, &fib6_cfg, GFP_KERNEL,
2546                                      extack);
2547        if (err)
2548                ipv6_stub->fib6_nh_release(fib6_nh);
2549        else
2550                nh->nh_flags = fib6_nh->fib_nh_flags;
2551
2552        return err;
2553}
2554
2555static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg,
2556                                      struct netlink_ext_ack *extack)
2557{
2558        struct nh_info *nhi;
2559        struct nexthop *nh;
2560        int err = 0;
2561
2562        nh = nexthop_alloc();
2563        if (!nh)
2564                return ERR_PTR(-ENOMEM);
2565
2566        nhi = kzalloc(sizeof(*nhi), GFP_KERNEL);
2567        if (!nhi) {
2568                kfree(nh);
2569                return ERR_PTR(-ENOMEM);
2570        }
2571
2572        nh->nh_flags = cfg->nh_flags;
2573        nh->net = net;
2574
2575        nhi->nh_parent = nh;
2576        nhi->family = cfg->nh_family;
2577        nhi->fib_nhc.nhc_scope = RT_SCOPE_LINK;
2578
2579        if (cfg->nh_fdb)
2580                nhi->fdb_nh = 1;
2581
2582        if (cfg->nh_blackhole) {
2583                nhi->reject_nh = 1;
2584                cfg->nh_ifindex = net->loopback_dev->ifindex;
2585        }
2586
2587        switch (cfg->nh_family) {
2588        case AF_INET:
2589                err = nh_create_ipv4(net, nh, nhi, cfg, extack);
2590                break;
2591        case AF_INET6:
2592                err = nh_create_ipv6(net, nh, nhi, cfg, extack);
2593                break;
2594        }
2595
2596        if (err) {
2597                kfree(nhi);
2598                kfree(nh);
2599                return ERR_PTR(err);
2600        }
2601
2602        /* add the entry to the device based hash */
2603        if (!nhi->fdb_nh)
2604                nexthop_devhash_add(net, nhi);
2605
2606        rcu_assign_pointer(nh->nh_info, nhi);
2607
2608        return nh;
2609}
2610
2611/* called with rtnl lock held */
2612static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg,
2613                                   struct netlink_ext_ack *extack)
2614{
2615        struct nexthop *nh;
2616        int err;
2617
2618        if (cfg->nlflags & NLM_F_REPLACE && !cfg->nh_id) {
2619                NL_SET_ERR_MSG(extack, "Replace requires nexthop id");
2620                return ERR_PTR(-EINVAL);
2621        }
2622
2623        if (!cfg->nh_id) {
2624                cfg->nh_id = nh_find_unused_id(net);
2625                if (!cfg->nh_id) {
2626                        NL_SET_ERR_MSG(extack, "No unused id");
2627                        return ERR_PTR(-EINVAL);
2628                }
2629        }
2630
2631        if (cfg->nh_grp)
2632                nh = nexthop_create_group(net, cfg);
2633        else
2634                nh = nexthop_create(net, cfg, extack);
2635
2636        if (IS_ERR(nh))
2637                return nh;
2638
2639        refcount_set(&nh->refcnt, 1);
2640        nh->id = cfg->nh_id;
2641        nh->protocol = cfg->nh_protocol;
2642        nh->net = net;
2643
2644        err = insert_nexthop(net, nh, cfg, extack);
2645        if (err) {
2646                __remove_nexthop(net, nh, NULL);
2647                nexthop_put(nh);
2648                nh = ERR_PTR(err);
2649        }
2650
2651        return nh;
2652}
2653
2654static int rtm_nh_get_timer(struct nlattr *attr, unsigned long fallback,
2655                            unsigned long *timer_p, bool *has_p,
2656                            struct netlink_ext_ack *extack)
2657{
2658        unsigned long timer;
2659        u32 value;
2660
2661        if (!attr) {
2662                *timer_p = fallback;
2663                *has_p = false;
2664                return 0;
2665        }
2666
2667        value = nla_get_u32(attr);
2668        timer = clock_t_to_jiffies(value);
2669        if (timer == ~0UL) {
2670                NL_SET_ERR_MSG(extack, "Timer value too large");
2671                return -EINVAL;
2672        }
2673
2674        *timer_p = timer;
2675        *has_p = true;
2676        return 0;
2677}
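/* Conversion example (illustrative): the timers arrive in clock_t units
 * (USER_HZ, hundredths of a second). With HZ = 250, an attribute value
 * of 12000 (120 s) becomes clock_t_to_jiffies(12000) = 30000 jiffies;
 * a value too large to represent comes back as ~0UL and is rejected
 * above with "Timer value too large".
 */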
2678
2679static int rtm_to_nh_config_grp_res(struct nlattr *res, struct nh_config *cfg,
2680                                    struct netlink_ext_ack *extack)
2681{
2682        struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_policy_new)] = {};
2683        int err;
2684
2685        if (res) {
2686                err = nla_parse_nested(tb,
2687                                       ARRAY_SIZE(rtm_nh_res_policy_new) - 1,
2688                                       res, rtm_nh_res_policy_new, extack);
2689                if (err < 0)
2690                        return err;
2691        }
2692
2693        if (tb[NHA_RES_GROUP_BUCKETS]) {
2694                cfg->nh_grp_res_num_buckets =
2695                        nla_get_u16(tb[NHA_RES_GROUP_BUCKETS]);
2696                cfg->nh_grp_res_has_num_buckets = true;
2697                if (!cfg->nh_grp_res_num_buckets) {
2698                        NL_SET_ERR_MSG(extack, "Number of buckets needs to be non-0");
2699                        return -EINVAL;
2700                }
2701        }
2702
2703        err = rtm_nh_get_timer(tb[NHA_RES_GROUP_IDLE_TIMER],
2704                               NH_RES_DEFAULT_IDLE_TIMER,
2705                               &cfg->nh_grp_res_idle_timer,
2706                               &cfg->nh_grp_res_has_idle_timer,
2707                               extack);
2708        if (err)
2709                return err;
2710
2711        return rtm_nh_get_timer(tb[NHA_RES_GROUP_UNBALANCED_TIMER],
2712                                NH_RES_DEFAULT_UNBALANCED_TIMER,
2713                                &cfg->nh_grp_res_unbalanced_timer,
2714                                &cfg->nh_grp_res_has_unbalanced_timer,
2715                                extack);
2716}
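/* Wire layout sketch (illustrative) for a resilient-group request:
 *
 *   NHA_GROUP        array of struct nexthop_grp
 *   NHA_GROUP_TYPE   NEXTHOP_GRP_TYPE_RES
 *   NHA_RES_GROUP
 *     NHA_RES_GROUP_BUCKETS           u16, must be non-zero
 *     NHA_RES_GROUP_IDLE_TIMER        u32, clock_t
 *     NHA_RES_GROUP_UNBALANCED_TIMER  u32, clock_t
 *
 * Each timer falls back to its NH_RES_DEFAULT_* value when omitted.
 */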
2717
2718static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
2719                            struct nlmsghdr *nlh, struct nh_config *cfg,
2720                            struct netlink_ext_ack *extack)
2721{
2722        struct nhmsg *nhm = nlmsg_data(nlh);
2723        struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_new)];
2724        int err;
2725
2726        err = nlmsg_parse(nlh, sizeof(*nhm), tb,
2727                          ARRAY_SIZE(rtm_nh_policy_new) - 1,
2728                          rtm_nh_policy_new, extack);
2729        if (err < 0)
2730                return err;
2731
2732        err = -EINVAL;
2733        if (nhm->resvd || nhm->nh_scope) {
2734                NL_SET_ERR_MSG(extack, "Invalid values in ancillary header");
2735                goto out;
2736        }
2737        if (nhm->nh_flags & ~NEXTHOP_VALID_USER_FLAGS) {
2738                NL_SET_ERR_MSG(extack, "Invalid nexthop flags in ancillary header");
2739                goto out;
2740        }
2741
2742        switch (nhm->nh_family) {
2743        case AF_INET:
2744        case AF_INET6:
2745                break;
2746        case AF_UNSPEC:
2747                if (tb[NHA_GROUP])
2748                        break;
2749                fallthrough;
2750        default:
2751                NL_SET_ERR_MSG(extack, "Invalid address family");
2752                goto out;
2753        }
2754
2755        memset(cfg, 0, sizeof(*cfg));
2756        cfg->nlflags = nlh->nlmsg_flags;
2757        cfg->nlinfo.portid = NETLINK_CB(skb).portid;
2758        cfg->nlinfo.nlh = nlh;
2759        cfg->nlinfo.nl_net = net;
2760
2761        cfg->nh_family = nhm->nh_family;
2762        cfg->nh_protocol = nhm->nh_protocol;
2763        cfg->nh_flags = nhm->nh_flags;
2764
2765        if (tb[NHA_ID])
2766                cfg->nh_id = nla_get_u32(tb[NHA_ID]);
2767
2768        if (tb[NHA_FDB]) {
2769                if (tb[NHA_OIF] || tb[NHA_BLACKHOLE] ||
2770                    tb[NHA_ENCAP]   || tb[NHA_ENCAP_TYPE]) {
2771                        NL_SET_ERR_MSG(extack, "Fdb attribute can not be used with encap, oif or blackhole");
2772                        goto out;
2773                }
2774                if (nhm->nh_flags) {
2775                        NL_SET_ERR_MSG(extack, "Unsupported nexthop flags in ancillary header");
2776                        goto out;
2777                }
2778                cfg->nh_fdb = nla_get_flag(tb[NHA_FDB]);
2779        }
2780
2781        if (tb[NHA_GROUP]) {
2782                if (nhm->nh_family != AF_UNSPEC) {
2783                        NL_SET_ERR_MSG(extack, "Invalid family for group");
2784                        goto out;
2785                }
2786                cfg->nh_grp = tb[NHA_GROUP];
2787
2788                cfg->nh_grp_type = NEXTHOP_GRP_TYPE_MPATH;
2789                if (tb[NHA_GROUP_TYPE])
2790                        cfg->nh_grp_type = nla_get_u16(tb[NHA_GROUP_TYPE]);
2791
2792                if (cfg->nh_grp_type > NEXTHOP_GRP_TYPE_MAX) {
2793                        NL_SET_ERR_MSG(extack, "Invalid group type");
2794                        goto out;
2795                }
2796                err = nh_check_attr_group(net, tb, ARRAY_SIZE(tb),
2797                                          cfg->nh_grp_type, extack);
2798                if (err)
2799                        goto out;
2800
2801                if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES)
2802                        err = rtm_to_nh_config_grp_res(tb[NHA_RES_GROUP],
2803                                                       cfg, extack);
2804
2805                /* no other attributes should be set */
2806                goto out;
2807        }
2808
2809        if (tb[NHA_BLACKHOLE]) {
2810                if (tb[NHA_GATEWAY] || tb[NHA_OIF] ||
2811                    tb[NHA_ENCAP]   || tb[NHA_ENCAP_TYPE] || tb[NHA_FDB]) {
2812                        NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway, oif, encap or fdb");
2813                        goto out;
2814                }
2815
2816                cfg->nh_blackhole = 1;
2817                err = 0;
2818                goto out;
2819        }
2820
2821        if (!cfg->nh_fdb && !tb[NHA_OIF]) {
2822                NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole and non-fdb nexthops");
2823                goto out;
2824        }
2825
2826        if (!cfg->nh_fdb && tb[NHA_OIF]) {
2827                cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]);
2828                if (cfg->nh_ifindex)
2829                        cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex);
2830
2831                if (!cfg->dev) {
2832                        NL_SET_ERR_MSG(extack, "Invalid device index");
2833                        goto out;
2834                } else if (!(cfg->dev->flags & IFF_UP)) {
2835                        NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2836                        err = -ENETDOWN;
2837                        goto out;
2838                } else if (!netif_carrier_ok(cfg->dev)) {
2839                        NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down");
2840                        err = -ENETDOWN;
2841                        goto out;
2842                }
2843        }
2844
2845        err = -EINVAL;
2846        if (tb[NHA_GATEWAY]) {
2847                struct nlattr *gwa = tb[NHA_GATEWAY];
2848
2849                switch (cfg->nh_family) {
2850                case AF_INET:
2851                        if (nla_len(gwa) != sizeof(u32)) {
2852                                NL_SET_ERR_MSG(extack, "Invalid gateway");
2853                                goto out;
2854                        }
2855                        cfg->gw.ipv4 = nla_get_be32(gwa);
2856                        break;
2857                case AF_INET6:
2858                        if (nla_len(gwa) != sizeof(struct in6_addr)) {
2859                                NL_SET_ERR_MSG(extack, "Invalid gateway");
2860                                goto out;
2861                        }
2862                        cfg->gw.ipv6 = nla_get_in6_addr(gwa);
2863                        break;
2864                default:
2865                        NL_SET_ERR_MSG(extack,
2866                                       "Unknown address family for gateway");
2867                        goto out;
2868                }
2869        } else {
2870                /* device only nexthop (no gateway) */
2871                if (cfg->nh_flags & RTNH_F_ONLINK) {
2872                        NL_SET_ERR_MSG(extack,
2873                                       "ONLINK flag can not be set for nexthop without a gateway");
2874                        goto out;
2875                }
2876        }
2877
2878        if (tb[NHA_ENCAP]) {
2879                cfg->nh_encap = tb[NHA_ENCAP];
2880
2881                if (!tb[NHA_ENCAP_TYPE]) {
2882                        NL_SET_ERR_MSG(extack, "LWT encapsulation type is missing");
2883                        goto out;
2884                }
2885
2886                cfg->nh_encap_type = nla_get_u16(tb[NHA_ENCAP_TYPE]);
2887                err = lwtunnel_valid_encap_type(cfg->nh_encap_type, extack);
2888                if (err < 0)
2889                        goto out;
2890
2891        } else if (tb[NHA_ENCAP_TYPE]) {
2892                NL_SET_ERR_MSG(extack, "LWT encapsulation attribute is missing");
2893                goto out;
2894        }
2895
2897        err = 0;
2898out:
2899        return err;
2900}
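/* Validation examples (illustrative): mutually exclusive attributes are
 * rejected above; a blackhole nexthop may not carry a gateway, device,
 * encap or fdb attribute, and an fdb nexthop may not name a device.
 * Minimal valid forms:
 *
 *   $ ip nexthop add id 3 blackhole
 *   $ ip nexthop add id 4 via 2001:db8::1 dev eth0
 */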
2901
2902/* rtnl */
2903static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
2904                           struct netlink_ext_ack *extack)
2905{
2906        struct net *net = sock_net(skb->sk);
2907        struct nh_config cfg;
2908        struct nexthop *nh;
2909        int err;
2910
2911        err = rtm_to_nh_config(net, skb, nlh, &cfg, extack);
2912        if (!err) {
2913                nh = nexthop_add(net, &cfg, extack);
2914                if (IS_ERR(nh))
2915                        err = PTR_ERR(nh);
2916        }
2917
2918        return err;
2919}
2920
2921static int __nh_valid_get_del_req(const struct nlmsghdr *nlh,
2922                                  struct nlattr **tb, u32 *id,
2923                                  struct netlink_ext_ack *extack)
2924{
2925        struct nhmsg *nhm = nlmsg_data(nlh);
2926
2927        if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) {
2928                NL_SET_ERR_MSG(extack, "Invalid values in header");
2929                return -EINVAL;
2930        }
2931
2932        if (!tb[NHA_ID]) {
2933                NL_SET_ERR_MSG(extack, "Nexthop id is missing");
2934                return -EINVAL;
2935        }
2936
2937        *id = nla_get_u32(tb[NHA_ID]);
2938        if (!(*id)) {
2939                NL_SET_ERR_MSG(extack, "Invalid nexthop id");
2940                return -EINVAL;
2941        }
2942
2943        return 0;
2944}
2945
2946static int nh_valid_get_del_req(const struct nlmsghdr *nlh, u32 *id,
2947                                struct netlink_ext_ack *extack)
2948{
2949        struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get)];
2950        int err;
2951
2952        err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
2953                          ARRAY_SIZE(rtm_nh_policy_get) - 1,
2954                          rtm_nh_policy_get, extack);
2955        if (err < 0)
2956                return err;
2957
2958        return __nh_valid_get_del_req(nlh, tb, id, extack);
2959}
2960
2961/* rtnl */
2962static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
2963                           struct netlink_ext_ack *extack)
2964{
2965        struct net *net = sock_net(skb->sk);
2966        struct nl_info nlinfo = {
2967                .nlh = nlh,
2968                .nl_net = net,
2969                .portid = NETLINK_CB(skb).portid,
2970        };
2971        struct nexthop *nh;
2972        int err;
2973        u32 id;
2974
2975        err = nh_valid_get_del_req(nlh, &id, extack);
2976        if (err)
2977                return err;
2978
2979        nh = nexthop_find_by_id(net, id);
2980        if (!nh)
2981                return -ENOENT;
2982
2983        remove_nexthop(net, nh, &nlinfo);
2984
2985        return 0;
2986}
2987
2988/* rtnl */
2989static int rtm_get_nexthop(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2990                           struct netlink_ext_ack *extack)
2991{
2992        struct net *net = sock_net(in_skb->sk);
2993        struct sk_buff *skb = NULL;
2994        struct nexthop *nh;
2995        int err;
2996        u32 id;
2997
2998        err = nh_valid_get_del_req(nlh, &id, extack);
2999        if (err)
3000                return err;
3001
3002        err = -ENOBUFS;
3003        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3004        if (!skb)
3005                goto out;
3006
3007        err = -ENOENT;
3008        nh = nexthop_find_by_id(net, id);
3009        if (!nh)
3010                goto errout_free;
3011
3012        err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP, NETLINK_CB(in_skb).portid,
3013                           nlh->nlmsg_seq, 0);
3014        if (err < 0) {
3015                WARN_ON(err == -EMSGSIZE);
3016                goto errout_free;
3017        }
3018
3019        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3020out:
3021        return err;
3022errout_free:
3023        kfree_skb(skb);
3024        goto out;
3025}
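/* Userspace view (illustrative):
 *
 *   $ ip nexthop get id 10   # RTM_GETNEXTHOP, answered via rtnl_unicast
 *   $ ip nexthop del id 10   # RTM_DELNEXTHOP
 *
 * Both handlers share nh_valid_get_del_req(), so a header with protocol,
 * scope or flags set, or a zero/missing id, fails with EINVAL before any
 * tree lookup.
 */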
3026
3027struct nh_dump_filter {
3028        u32 nh_id;
3029        int dev_idx;
3030        int master_idx;
3031        bool group_filter;
3032        bool fdb_filter;
3033        u32 res_bucket_nh_id;
3034};
3035
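    /* Return true if @nh should be skipped because it does not match the
     * group/device/master/family constraints of the dump request.
     */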
3036static bool nh_dump_filtered(struct nexthop *nh,
3037                             struct nh_dump_filter *filter, u8 family)
3038{
3039        const struct net_device *dev;
3040        const struct nh_info *nhi;
3041
3042        if (filter->group_filter && !nh->is_group)
3043                return true;
3044
3045        if (!filter->dev_idx && !filter->master_idx && !family)
3046                return false;
3047
3048        if (nh->is_group)
3049                return true;
3050
3051        nhi = rtnl_dereference(nh->nh_info);
3052        if (family && nhi->family != family)
3053                return true;
3054
3055        dev = nhi->fib_nhc.nhc_dev;
3056        if (filter->dev_idx && (!dev || dev->ifindex != filter->dev_idx))
3057                return true;
3058
3059        if (filter->master_idx) {
3060                struct net_device *master;
3061
3062                if (!dev)
3063                        return true;
3064
3065                master = netdev_master_upper_dev_get((struct net_device *)dev);
3066                if (!master || master->ifindex != filter->master_idx)
3067                        return true;
3068        }
3069
3070        return false;
3071}
3072
3073static int __nh_valid_dump_req(const struct nlmsghdr *nlh, struct nlattr **tb,
3074                               struct nh_dump_filter *filter,
3075                               struct netlink_ext_ack *extack)
3076{
3077        struct nhmsg *nhm;
3078        u32 idx;
3079
3080        if (tb[NHA_OIF]) {
3081                idx = nla_get_u32(tb[NHA_OIF]);
3082                if (idx > INT_MAX) {
3083                        NL_SET_ERR_MSG(extack, "Invalid device index");
3084                        return -EINVAL;
3085                }
3086                filter->dev_idx = idx;
3087        }
3088        if (tb[NHA_MASTER]) {
3089                idx = nla_get_u32(tb[NHA_MASTER]);
3090                if (idx > INT_MAX) {
3091                        NL_SET_ERR_MSG(extack, "Invalid master device index");
3092                        return -EINVAL;
3093                }
3094                filter->master_idx = idx;
3095        }
3096        filter->group_filter = nla_get_flag(tb[NHA_GROUPS]);
3097        filter->fdb_filter = nla_get_flag(tb[NHA_FDB]);
3098
3099        nhm = nlmsg_data(nlh);
3100        if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) {
3101                NL_SET_ERR_MSG(extack, "Invalid values in header for nexthop dump request");
3102                return -EINVAL;
3103        }
3104
3105        return 0;
3106}
3107
3108static int nh_valid_dump_req(const struct nlmsghdr *nlh,
3109                             struct nh_dump_filter *filter,
3110                             struct netlink_callback *cb)
3111{
3112        struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump)];
3113        int err;
3114
3115        err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
3116                          ARRAY_SIZE(rtm_nh_policy_dump) - 1,
3117                          rtm_nh_policy_dump, cb->extack);
3118        if (err < 0)
3119                return err;
3120
3121        return __nh_valid_dump_req(nlh, tb, filter, cb->extack);
3122}
3123
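    /* Dump resume state, stored directly in the netlink callback scratch
     * area (cb->ctx); the BUILD_BUG_ON below guards the size.
     */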
3124struct rtm_dump_nh_ctx {
3125        u32 idx;
3126};
3127
3128static struct rtm_dump_nh_ctx *
3129rtm_dump_nh_ctx(struct netlink_callback *cb)
3130{
3131        struct rtm_dump_nh_ctx *ctx = (void *)cb->ctx;
3132
3133        BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));
3134        return ctx;
3135}
3136
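    /* Walk the per-netns nexthop rbtree in ascending-id order and invoke
     * @nh_cb on every entry at or above the resume point saved in @ctx.
     * If the callback fails, ctx->idx still names the failing nexthop so
     * the next dump pass retries it; after a complete walk the index is
     * advanced past the last id so the following pass finds nothing and
     * terminates. (Caveat: with the maximum id of U32_MAX this final
     * increment would wrap back to zero.)
     */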
3137static int rtm_dump_walk_nexthops(struct sk_buff *skb,
3138                                  struct netlink_callback *cb,
3139                                  struct rb_root *root,
3140                                  struct rtm_dump_nh_ctx *ctx,
3141                                  int (*nh_cb)(struct sk_buff *skb,
3142                                               struct netlink_callback *cb,
3143                                               struct nexthop *nh, void *data),
3144                                  void *data)
3145{
3146        struct rb_node *node;
3147        int s_idx;
3148        int err;
3149
3150        s_idx = ctx->idx;
3151        for (node = rb_first(root); node; node = rb_next(node)) {
3152                struct nexthop *nh;
3153
3154                nh = rb_entry(node, struct nexthop, rb_node);
3155                if (nh->id < s_idx)
3156                        continue;
3157
3158                ctx->idx = nh->id;
3159                err = nh_cb(skb, cb, nh, data);
3160                if (err)
3161                        return err;
3162        }
3163
3164        ctx->idx++;
3165        return 0;
3166}
3167
3168static int rtm_dump_nexthop_cb(struct sk_buff *skb, struct netlink_callback *cb,
3169                               struct nexthop *nh, void *data)
3170{
3171        struct nhmsg *nhm = nlmsg_data(cb->nlh);
3172        struct nh_dump_filter *filter = data;
3173
3174        if (nh_dump_filtered(nh, filter, nhm->nh_family))
3175                return 0;
3176
3177        return nh_fill_node(skb, nh, RTM_NEWNEXTHOP,
3178                            NETLINK_CB(cb->skb).portid,
3179                            cb->nlh->nlmsg_seq, NLM_F_MULTI);
3180}
3181
3182/* rtnl */
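    /* Dump side of RTM_GETNEXTHOP (NLM_F_DUMP). Illustrative iproute2
     * equivalents: ip nexthop show, ip nexthop show groups
     */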
3183static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
3184{
3185        struct rtm_dump_nh_ctx *ctx = rtm_dump_nh_ctx(cb);
3186        struct net *net = sock_net(skb->sk);
3187        struct rb_root *root = &net->nexthop.rb_root;
3188        struct nh_dump_filter filter = {};
3189        int err;
3190
3191        err = nh_valid_dump_req(cb->nlh, &filter, cb);
3192        if (err < 0)
3193                return err;
3194
3195        err = rtm_dump_walk_nexthops(skb, cb, root, ctx,
3196                                     &rtm_dump_nexthop_cb, &filter);
3197        if (err < 0) {
3198                if (likely(skb->len))
3199                        goto out;
3200                goto out_err;
3201        }
3202
3203out:
3204        err = skb->len;
3205out_err:
3206        cb->seq = net->nexthop.seq;
3207        nl_dump_check_consistent(cb, nlmsg_hdr(skb));
3208        return err;
3209}
3210
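    /* Resolve @id and require that it names a resilient nexthop group;
     * otherwise return an ERR_PTR and set @extack accordingly.
     */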
3211static struct nexthop *
3212nexthop_find_group_resilient(struct net *net, u32 id,
3213                             struct netlink_ext_ack *extack)
3214{
3215        struct nh_group *nhg;
3216        struct nexthop *nh;
3217
3218        nh = nexthop_find_by_id(net, id);
3219        if (!nh)
3220                return ERR_PTR(-ENOENT);
3221
3222        if (!nh->is_group) {
3223                NL_SET_ERR_MSG(extack, "Not a nexthop group");
3224                return ERR_PTR(-EINVAL);
3225        }
3226
3227        nhg = rtnl_dereference(nh->nh_grp);
3228        if (!nhg->resilient) {
3229                NL_SET_ERR_MSG(extack, "Nexthop group not of type resilient");
3230                return ERR_PTR(-EINVAL);
3231        }
3232
3233        return nh;
3234}
3235
3236static int nh_valid_dump_nhid(struct nlattr *attr, u32 *nh_id_p,
3237                              struct netlink_ext_ack *extack)
3238{
3239        u32 idx;
3240
3241        if (attr) {
3242                idx = nla_get_u32(attr);
3243                if (!idx) {
3244                        NL_SET_ERR_MSG(extack, "Invalid nexthop id");
3245                        return -EINVAL;
3246                }
3247                *nh_id_p = idx;
3248        } else {
3249                *nh_id_p = 0;
3250        }
3251
3252        return 0;
3253}
3254
3255static int nh_valid_dump_bucket_req(const struct nlmsghdr *nlh,
3256                                    struct nh_dump_filter *filter,
3257                                    struct netlink_callback *cb)
3258{
3259        struct nlattr *res_tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_dump)];
3260        struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump_bucket)];
3261        int err;
3262
3263        err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
3264                          ARRAY_SIZE(rtm_nh_policy_dump_bucket) - 1,
3265                          rtm_nh_policy_dump_bucket, cb->extack);
3266        if (err < 0)
3267                return err;
3268
3269        err = nh_valid_dump_nhid(tb[NHA_ID], &filter->nh_id, cb->extack);
3270        if (err)
3271                return err;
3272
3273        if (tb[NHA_RES_BUCKET]) {
3274                size_t max = ARRAY_SIZE(rtm_nh_res_bucket_policy_dump) - 1;
3275
3276                err = nla_parse_nested(res_tb, max,
3277                                       tb[NHA_RES_BUCKET],
3278                                       rtm_nh_res_bucket_policy_dump,
3279                                       cb->extack);
3280                if (err < 0)
3281                        return err;
3282
3283                err = nh_valid_dump_nhid(res_tb[NHA_RES_BUCKET_NH_ID],
3284                                         &filter->res_bucket_nh_id,
3285                                         cb->extack);
3286                if (err)
3287                        return err;
3288        }
3289
3290        return __nh_valid_dump_req(nlh, tb, filter, cb->extack);
3291}
3292
3293struct rtm_dump_res_bucket_ctx {
3294        struct rtm_dump_nh_ctx nh;
3295        u16 bucket_index;
3296        u32 done_nh_idx; /* 1 + the index of the last fully processed NH. */
3297};
3298
3299static struct rtm_dump_res_bucket_ctx *
3300rtm_dump_res_bucket_ctx(struct netlink_callback *cb)
3301{
3302        struct rtm_dump_res_bucket_ctx *ctx = (void *)cb->ctx;
3303
3304        BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));
3305        return ctx;
3306}
3307
3308struct rtm_dump_nexthop_bucket_data {
3309        struct rtm_dump_res_bucket_ctx *ctx;
3310        struct nh_dump_filter filter;
3311};
3312
3313static int rtm_dump_nexthop_bucket_nh(struct sk_buff *skb,
3314                                      struct netlink_callback *cb,
3315                                      struct nexthop *nh,
3316                                      struct rtm_dump_nexthop_bucket_data *dd)
3317{
3318        u32 portid = NETLINK_CB(cb->skb).portid;
3319        struct nhmsg *nhm = nlmsg_data(cb->nlh);
3320        struct nh_res_table *res_table;
3321        struct nh_group *nhg;
3322        u16 bucket_index;
3323        int err;
3324
3325        if (dd->ctx->nh.idx < dd->ctx->done_nh_idx)
3326                return 0;
3327
3328        nhg = rtnl_dereference(nh->nh_grp);
3329        res_table = rtnl_dereference(nhg->res_table);
3330        for (bucket_index = dd->ctx->bucket_index;
3331             bucket_index < res_table->num_nh_buckets;
3332             bucket_index++) {
3333                struct nh_res_bucket *bucket;
3334                struct nh_grp_entry *nhge;
3335
3336                bucket = &res_table->nh_buckets[bucket_index];
3337                nhge = rtnl_dereference(bucket->nh_entry);
3338                if (nh_dump_filtered(nhge->nh, &dd->filter, nhm->nh_family))
3339                        continue;
3340
3341                if (dd->filter.res_bucket_nh_id &&
3342                    dd->filter.res_bucket_nh_id != nhge->nh->id)
3343                        continue;
3344
3345                err = nh_fill_res_bucket(skb, nh, bucket, bucket_index,
3346                                         RTM_NEWNEXTHOPBUCKET, portid,
3347                                         cb->nlh->nlmsg_seq, NLM_F_MULTI,
3348                                         cb->extack);
3349                if (err < 0) {
3350                        if (likely(skb->len))
3351                                goto out;
3352                        goto out_err;
3353                }
3354        }
3355
3356        dd->ctx->done_nh_idx = dd->ctx->nh.idx + 1;
3357        bucket_index = 0;
3358
3359out:
3360        err = skb->len;
3361out_err:
3362        dd->ctx->bucket_index = bucket_index;
3363        return err;
3364}
3365
3366static int rtm_dump_nexthop_bucket_cb(struct sk_buff *skb,
3367                                      struct netlink_callback *cb,
3368                                      struct nexthop *nh, void *data)
3369{
3370        struct rtm_dump_nexthop_bucket_data *dd = data;
3371        struct nh_group *nhg;
3372
3373        if (!nh->is_group)
3374                return 0;
3375
3376        nhg = rtnl_dereference(nh->nh_grp);
3377        if (!nhg->resilient)
3378                return 0;
3379
3380        return rtm_dump_nexthop_bucket_nh(skb, cb, nh, dd);
3381}
3382
3383/* rtnl */
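    /* Dump side of RTM_GETNEXTHOPBUCKET. With NHA_ID set only the
     * buckets of that resilient group are dumped; otherwise every
     * resilient group is walked. Illustrative iproute2 equivalent:
     *   ip nexthop bucket show
     */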
3384static int rtm_dump_nexthop_bucket(struct sk_buff *skb,
3385                                   struct netlink_callback *cb)
3386{
3387        struct rtm_dump_res_bucket_ctx *ctx = rtm_dump_res_bucket_ctx(cb);
3388        struct rtm_dump_nexthop_bucket_data dd = { .ctx = ctx };
3389        struct net *net = sock_net(skb->sk);
3390        struct nexthop *nh;
3391        int err;
3392
3393        err = nh_valid_dump_bucket_req(cb->nlh, &dd.filter, cb);
3394        if (err)
3395                return err;
3396
3397        if (dd.filter.nh_id) {
3398                nh = nexthop_find_group_resilient(net, dd.filter.nh_id,
3399                                                  cb->extack);
3400                if (IS_ERR(nh))
3401                        return PTR_ERR(nh);
3402                err = rtm_dump_nexthop_bucket_nh(skb, cb, nh, &dd);
3403        } else {
3404                struct rb_root *root = &net->nexthop.rb_root;
3405
3406                err = rtm_dump_walk_nexthops(skb, cb, root, &ctx->nh,
3407                                             &rtm_dump_nexthop_bucket_cb, &dd);
3408        }
3409
3410        if (err < 0) {
3411                if (likely(skb->len))
3412                        goto out;
3413                goto out_err;
3414        }
3415
3416out:
3417        err = skb->len;
3418out_err:
3419        cb->seq = net->nexthop.seq;
3420        nl_dump_check_consistent(cb, nlmsg_hdr(skb));
3421        return err;
3422}
3423
3424static int nh_valid_get_bucket_req_res_bucket(struct nlattr *res,
3425                                              u16 *bucket_index,
3426                                              struct netlink_ext_ack *extack)
3427{
3428        struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_get)];
3429        int err;
3430
3431        err = nla_parse_nested(tb, ARRAY_SIZE(rtm_nh_res_bucket_policy_get) - 1,
3432                               res, rtm_nh_res_bucket_policy_get, extack);
3433        if (err < 0)
3434                return err;
3435
3436        if (!tb[NHA_RES_BUCKET_INDEX]) {
3437                NL_SET_ERR_MSG(extack, "Bucket index is missing");
3438                return -EINVAL;
3439        }
3440
3441        *bucket_index = nla_get_u16(tb[NHA_RES_BUCKET_INDEX]);
3442        return 0;
3443}
3444
3445static int nh_valid_get_bucket_req(const struct nlmsghdr *nlh,
3446                                   u32 *id, u16 *bucket_index,
3447                                   struct netlink_ext_ack *extack)
3448{
3449        struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get_bucket)];
3450        int err;
3451
3452        err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
3453                          ARRAY_SIZE(rtm_nh_policy_get_bucket) - 1,
3454                          rtm_nh_policy_get_bucket, extack);
3455        if (err < 0)
3456                return err;
3457
3458        err = __nh_valid_get_del_req(nlh, tb, id, extack);
3459        if (err)
3460                return err;
3461
3462        if (!tb[NHA_RES_BUCKET]) {
3463                NL_SET_ERR_MSG(extack, "Bucket information is missing");
3464                return -EINVAL;
3465        }
3466
3467        err = nh_valid_get_bucket_req_res_bucket(tb[NHA_RES_BUCKET],
3468                                                 bucket_index, extack);
3469        if (err)
3470                return err;
3471
3472        return 0;
3473}
3474
3475/* rtnl */
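    /* Doit side of RTM_GETNEXTHOPBUCKET: unicast a single bucket of a
     * resilient group back to the requester. Illustrative iproute2
     * equivalent: ip nexthop bucket get id 10 index 0
     */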
3476static int rtm_get_nexthop_bucket(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3477                                  struct netlink_ext_ack *extack)
3478{
3479        struct net *net = sock_net(in_skb->sk);
3480        struct nh_res_table *res_table;
3481        struct sk_buff *skb = NULL;
3482        struct nh_group *nhg;
3483        struct nexthop *nh;
3484        u16 bucket_index;
3485        int err;
3486        u32 id;
3487
3488        err = nh_valid_get_bucket_req(nlh, &id, &bucket_index, extack);
3489        if (err)
3490                return err;
3491
3492        nh = nexthop_find_group_resilient(net, id, extack);
3493        if (IS_ERR(nh))
3494                return PTR_ERR(nh);
3495
3496        nhg = rtnl_dereference(nh->nh_grp);
3497        res_table = rtnl_dereference(nhg->res_table);
3498        if (bucket_index >= res_table->num_nh_buckets) {
3499                NL_SET_ERR_MSG(extack, "Bucket index out of bounds");
3500                return -ENOENT;
3501        }
3502
3503        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3504        if (!skb)
3505                return -ENOBUFS;
3506
3507        err = nh_fill_res_bucket(skb, nh, &res_table->nh_buckets[bucket_index],
3508                                 bucket_index, RTM_NEWNEXTHOPBUCKET,
3509                                 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
3510                                 0, extack);
3511        if (err < 0) {
3512                WARN_ON(err == -EMSGSIZE);
3513                goto errout_free;
3514        }
3515
3516        return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3517
3518errout_free:
3519        kfree_skb(skb);
3520        return err;
3521}
3522
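    /* Propagate a device MTU change to the nexthops bound to @dev; only
     * AF_INET entries are updated here, via fib_nhc_update_mtu().
     */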
3523static void nexthop_sync_mtu(struct net_device *dev, u32 orig_mtu)
3524{
3525        unsigned int hash = nh_dev_hashfn(dev->ifindex);
3526        struct net *net = dev_net(dev);
3527        struct hlist_head *head = &net->nexthop.devhash[hash];
3528        struct hlist_node *n;
3529        struct nh_info *nhi;
3530
3531        hlist_for_each_entry_safe(nhi, n, head, dev_hash) {
3532                if (nhi->fib_nhc.nhc_dev == dev) {
3533                        if (nhi->family == AF_INET)
3534                                fib_nhc_update_mtu(&nhi->fib_nhc, dev->mtu,
3535                                                   orig_mtu);
3536                }
3537        }
3538}
3539
3540/* rtnl */
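    /* Netdevice notifier: flush the nexthops using a device when it goes
     * down or unregisters, and resync nexthop MTUs on NETDEV_CHANGEMTU.
     */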
3541static int nh_netdev_event(struct notifier_block *this,
3542                           unsigned long event, void *ptr)
3543{
3544        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3545        struct netdev_notifier_info_ext *info_ext;
3546
3547        switch (event) {
3548        case NETDEV_DOWN:
3549        case NETDEV_UNREGISTER:
3550                nexthop_flush_dev(dev, event);
3551                break;
3552        case NETDEV_CHANGE:
3553                if (!(dev_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP)))
3554                        nexthop_flush_dev(dev, event);
3555                break;
3556        case NETDEV_CHANGEMTU:
3557                info_ext = ptr;
3558                nexthop_sync_mtu(dev, info_ext->ext.mtu);
3559                rt_cache_flush(dev_net(dev));
3560                break;
3561        }
3562        return NOTIFY_DONE;
3563}
3564
3565static struct notifier_block nh_netdev_notifier = {
3566        .notifier_call = nh_netdev_event,
3567};
3568
3569static int nexthops_dump(struct net *net, struct notifier_block *nb,
3570                         enum nexthop_event_type event_type,
3571                         struct netlink_ext_ack *extack)
3572{
3573        struct rb_root *root = &net->nexthop.rb_root;
3574        struct rb_node *node;
3575        int err = 0;
3576
3577        for (node = rb_first(root); node; node = rb_next(node)) {
3578                struct nexthop *nh;
3579
3580                nh = rb_entry(node, struct nexthop, rb_node);
3581                err = call_nexthop_notifier(nb, net, event_type, nh, extack);
3582                if (err)
3583                        break;
3584        }
3585
3586        return err;
3587}
3588
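    /* Register a listener (typically a switchdev driver) on the
     * per-netns notifier chain. Existing nexthops are replayed as
     * NEXTHOP_EVENT_REPLACE so the listener starts with a full view.
     */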
3589int register_nexthop_notifier(struct net *net, struct notifier_block *nb,
3590                              struct netlink_ext_ack *extack)
3591{
3592        int err;
3593
3594        rtnl_lock();
3595        err = nexthops_dump(net, nb, NEXTHOP_EVENT_REPLACE, extack);
3596        if (err)
3597                goto unlock;
3598        err = blocking_notifier_chain_register(&net->nexthop.notifier_chain,
3599                                               nb);
3600unlock:
3601        rtnl_unlock();
3602        return err;
3603}
3604EXPORT_SYMBOL(register_nexthop_notifier);
3605
3606int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb)
3607{
3608        int err;
3609
3610        rtnl_lock();
3611        err = blocking_notifier_chain_unregister(&net->nexthop.notifier_chain,
3612                                                 nb);
3613        if (err)
3614                goto unlock;
3615        nexthops_dump(net, nb, NEXTHOP_EVENT_DEL, NULL);
3616unlock:
3617        rtnl_unlock();
3618        return err;
3619}
3620EXPORT_SYMBOL(unregister_nexthop_notifier);
3621
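    /* Driver API: reflect the hardware state of nexthop @id in the
     * RTNH_F_OFFLOAD / RTNH_F_TRAP flags reported to userspace.
     */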
3622void nexthop_set_hw_flags(struct net *net, u32 id, bool offload, bool trap)
3623{
3624        struct nexthop *nexthop;
3625
3626        rcu_read_lock();
3627
3628        nexthop = nexthop_find_by_id(net, id);
3629        if (!nexthop)
3630                goto out;
3631
3632        nexthop->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP);
3633        if (offload)
3634                nexthop->nh_flags |= RTNH_F_OFFLOAD;
3635        if (trap)
3636                nexthop->nh_flags |= RTNH_F_TRAP;
3637
3638out:
3639        rcu_read_unlock();
3640}
3641EXPORT_SYMBOL(nexthop_set_hw_flags);
3642
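    /* As nexthop_set_hw_flags(), but for one bucket of a resilient
     * group.
     */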
3643void nexthop_bucket_set_hw_flags(struct net *net, u32 id, u16 bucket_index,
3644                                 bool offload, bool trap)
3645{
3646        struct nh_res_table *res_table;
3647        struct nh_res_bucket *bucket;
3648        struct nexthop *nexthop;
3649        struct nh_group *nhg;
3650
3651        rcu_read_lock();
3652
3653        nexthop = nexthop_find_by_id(net, id);
3654        if (!nexthop || !nexthop->is_group)
3655                goto out;
3656
3657        nhg = rcu_dereference(nexthop->nh_grp);
3658        if (!nhg->resilient)
3659                goto out;
3660
3661        res_table = rcu_dereference(nhg->res_table);
3662        if (bucket_index >= res_table->num_nh_buckets)
3663                goto out;
3664
3665        bucket = &res_table->nh_buckets[bucket_index];
3666        bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP);
3667        if (offload)
3668                bucket->nh_flags |= RTNH_F_OFFLOAD;
3669        if (trap)
3670                bucket->nh_flags |= RTNH_F_TRAP;
3671
3672out:
3673        rcu_read_unlock();
3674}
3675EXPORT_SYMBOL(nexthop_bucket_set_hw_flags);
3676
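    /* Driver API: mark the buckets whose bits are set in @activity as
     * busy, so that hardware-forwarded traffic also counts against the
     * resilient group's idle timer.
     */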
3677void nexthop_res_grp_activity_update(struct net *net, u32 id, u16 num_buckets,
3678                                     unsigned long *activity)
3679{
3680        struct nh_res_table *res_table;
3681        struct nexthop *nexthop;
3682        struct nh_group *nhg;
3683        u16 i;
3684
3685        rcu_read_lock();
3686
3687        nexthop = nexthop_find_by_id(net, id);
3688        if (!nexthop || !nexthop->is_group)
3689                goto out;
3690
3691        nhg = rcu_dereference(nexthop->nh_grp);
3692        if (!nhg->resilient)
3693                goto out;
3694
3695        /* Instead of silently ignoring some buckets, demand that the sizes
3696         * be the same.
3697         */
3698        res_table = rcu_dereference(nhg->res_table);
3699        if (num_buckets != res_table->num_nh_buckets)
3700                goto out;
3701
3702        for (i = 0; i < num_buckets; i++) {
3703                if (test_bit(i, activity))
3704                        nh_res_bucket_set_busy(&res_table->nh_buckets[i]);
3705        }
3706
3707out:
3708        rcu_read_unlock();
3709}
3710EXPORT_SYMBOL(nexthop_res_grp_activity_update);
3711
3712static void __net_exit nexthop_net_exit(struct net *net)
3713{
3714        rtnl_lock();
3715        flush_all_nexthops(net);
3716        rtnl_unlock();
3717        kfree(net->nexthop.devhash);
3718}
3719
3720static int __net_init nexthop_net_init(struct net *net)
3721{
3722        size_t sz = sizeof(struct hlist_head) * NH_DEV_HASHSIZE;
3723
3724        net->nexthop.rb_root = RB_ROOT;
3725        net->nexthop.devhash = kzalloc(sz, GFP_KERNEL);
3726        if (!net->nexthop.devhash)
3727                return -ENOMEM;
3728        BLOCKING_INIT_NOTIFIER_HEAD(&net->nexthop.notifier_chain);
3729
3730        return 0;
3731}
3732
3733static struct pernet_operations nexthop_net_ops = {
3734        .init = nexthop_net_init,
3735        .exit = nexthop_net_exit,
3736};
3737
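    /* Register the pernet ops, the netdevice notifier and the
     * RTM_*NEXTHOP handlers (doit and dumpit) for PF_UNSPEC as well as
     * the per-family dump entry points.
     */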
3738static int __init nexthop_init(void)
3739{
3740        register_pernet_subsys(&nexthop_net_ops);
3741
3742        register_netdevice_notifier(&nh_netdev_notifier);
3743
3744        rtnl_register(PF_UNSPEC, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0);
3745        rtnl_register(PF_UNSPEC, RTM_DELNEXTHOP, rtm_del_nexthop, NULL, 0);
3746        rtnl_register(PF_UNSPEC, RTM_GETNEXTHOP, rtm_get_nexthop,
3747                      rtm_dump_nexthop, 0);
3748
3749        rtnl_register(PF_INET, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0);
3750        rtnl_register(PF_INET, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0);
3751
3752        rtnl_register(PF_INET6, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0);
3753        rtnl_register(PF_INET6, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0);
3754
3755        rtnl_register(PF_UNSPEC, RTM_GETNEXTHOPBUCKET, rtm_get_nexthop_bucket,
3756                      rtm_dump_nexthop_bucket, 0);
3757
3758        return 0;
3759}
3760subsys_initcall(nexthop_init);
3761