linux/net/ipv4/nexthop.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/* Generic nexthop implementation
   3 *
   4 * Copyright (c) 2017-19 Cumulus Networks
   5 * Copyright (c) 2017-19 David Ahern <dsa@cumulusnetworks.com>
   6 */
   7
   8#include <linux/nexthop.h>
   9#include <linux/rtnetlink.h>
  10#include <linux/slab.h>
  11#include <net/arp.h>
  12#include <net/ipv6_stubs.h>
  13#include <net/lwtunnel.h>
  14#include <net/ndisc.h>
  15#include <net/nexthop.h>
  16#include <net/route.h>
  17#include <net/sock.h>
  18
  19#define NH_RES_DEFAULT_IDLE_TIMER       (120 * HZ)
  20#define NH_RES_DEFAULT_UNBALANCED_TIMER 0       /* No forced rebalancing. */
  21
  22static void remove_nexthop(struct net *net, struct nexthop *nh,
  23                           struct nl_info *nlinfo);
  24
  25#define NH_DEV_HASHBITS  8
  26#define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS)
  27
  28static const struct nla_policy rtm_nh_policy_new[] = {
  29        [NHA_ID]                = { .type = NLA_U32 },
  30        [NHA_GROUP]             = { .type = NLA_BINARY },
  31        [NHA_GROUP_TYPE]        = { .type = NLA_U16 },
  32        [NHA_BLACKHOLE]         = { .type = NLA_FLAG },
  33        [NHA_OIF]               = { .type = NLA_U32 },
  34        [NHA_GATEWAY]           = { .type = NLA_BINARY },
  35        [NHA_ENCAP_TYPE]        = { .type = NLA_U16 },
  36        [NHA_ENCAP]             = { .type = NLA_NESTED },
  37        [NHA_FDB]               = { .type = NLA_FLAG },
  38        [NHA_RES_GROUP]         = { .type = NLA_NESTED },
  39};
  40
  41static const struct nla_policy rtm_nh_policy_get[] = {
  42        [NHA_ID]                = { .type = NLA_U32 },
  43};
  44
  45static const struct nla_policy rtm_nh_policy_dump[] = {
  46        [NHA_OIF]               = { .type = NLA_U32 },
  47        [NHA_GROUPS]            = { .type = NLA_FLAG },
  48        [NHA_MASTER]            = { .type = NLA_U32 },
  49        [NHA_FDB]               = { .type = NLA_FLAG },
  50};
  51
  52static const struct nla_policy rtm_nh_res_policy_new[] = {
  53        [NHA_RES_GROUP_BUCKETS]                 = { .type = NLA_U16 },
  54        [NHA_RES_GROUP_IDLE_TIMER]              = { .type = NLA_U32 },
  55        [NHA_RES_GROUP_UNBALANCED_TIMER]        = { .type = NLA_U32 },
  56};
  57
  58static const struct nla_policy rtm_nh_policy_dump_bucket[] = {
  59        [NHA_ID]                = { .type = NLA_U32 },
  60        [NHA_OIF]               = { .type = NLA_U32 },
  61        [NHA_MASTER]            = { .type = NLA_U32 },
  62        [NHA_RES_BUCKET]        = { .type = NLA_NESTED },
  63};
  64
  65static const struct nla_policy rtm_nh_res_bucket_policy_dump[] = {
  66        [NHA_RES_BUCKET_NH_ID]  = { .type = NLA_U32 },
  67};
  68
  69static const struct nla_policy rtm_nh_policy_get_bucket[] = {
  70        [NHA_ID]                = { .type = NLA_U32 },
  71        [NHA_RES_BUCKET]        = { .type = NLA_NESTED },
  72};
  73
  74static const struct nla_policy rtm_nh_res_bucket_policy_get[] = {
  75        [NHA_RES_BUCKET_INDEX]  = { .type = NLA_U16 },
  76};
  77
  78static bool nexthop_notifiers_is_empty(struct net *net)
  79{
  80        return !net->nexthop.notifier_chain.head;
  81}
  82
  83static void
  84__nh_notifier_single_info_init(struct nh_notifier_single_info *nh_info,
  85                               const struct nh_info *nhi)
  86{
  87        nh_info->dev = nhi->fib_nhc.nhc_dev;
  88        nh_info->gw_family = nhi->fib_nhc.nhc_gw_family;
  89        if (nh_info->gw_family == AF_INET)
  90                nh_info->ipv4 = nhi->fib_nhc.nhc_gw.ipv4;
  91        else if (nh_info->gw_family == AF_INET6)
  92                nh_info->ipv6 = nhi->fib_nhc.nhc_gw.ipv6;
  93
  94        nh_info->is_reject = nhi->reject_nh;
  95        nh_info->is_fdb = nhi->fdb_nh;
  96        nh_info->has_encap = !!nhi->fib_nhc.nhc_lwtstate;
  97}
  98
  99static int nh_notifier_single_info_init(struct nh_notifier_info *info,
 100                                        const struct nexthop *nh)
 101{
 102        struct nh_info *nhi = rtnl_dereference(nh->nh_info);
 103
 104        info->type = NH_NOTIFIER_INFO_TYPE_SINGLE;
 105        info->nh = kzalloc(sizeof(*info->nh), GFP_KERNEL);
 106        if (!info->nh)
 107                return -ENOMEM;
 108
 109        __nh_notifier_single_info_init(info->nh, nhi);
 110
 111        return 0;
 112}
 113
 114static void nh_notifier_single_info_fini(struct nh_notifier_info *info)
 115{
 116        kfree(info->nh);
 117}
 118
 119static int nh_notifier_mpath_info_init(struct nh_notifier_info *info,
 120                                       struct nh_group *nhg)
 121{
 122        u16 num_nh = nhg->num_nh;
 123        int i;
 124
 125        info->type = NH_NOTIFIER_INFO_TYPE_GRP;
 126        info->nh_grp = kzalloc(struct_size(info->nh_grp, nh_entries, num_nh),
 127                               GFP_KERNEL);
 128        if (!info->nh_grp)
 129                return -ENOMEM;
 130
 131        info->nh_grp->num_nh = num_nh;
 132        info->nh_grp->is_fdb = nhg->fdb_nh;
 133
 134        for (i = 0; i < num_nh; i++) {
 135                struct nh_grp_entry *nhge = &nhg->nh_entries[i];
 136                struct nh_info *nhi;
 137
 138                nhi = rtnl_dereference(nhge->nh->nh_info);
 139                info->nh_grp->nh_entries[i].id = nhge->nh->id;
 140                info->nh_grp->nh_entries[i].weight = nhge->weight;
 141                __nh_notifier_single_info_init(&info->nh_grp->nh_entries[i].nh,
 142                                               nhi);
 143        }
 144
 145        return 0;
 146}
 147
 148static int nh_notifier_res_table_info_init(struct nh_notifier_info *info,
 149                                           struct nh_group *nhg)
 150{
 151        struct nh_res_table *res_table = rtnl_dereference(nhg->res_table);
 152        u16 num_nh_buckets = res_table->num_nh_buckets;
 153        unsigned long size;
 154        u16 i;
 155
 156        info->type = NH_NOTIFIER_INFO_TYPE_RES_TABLE;
 157        size = struct_size(info->nh_res_table, nhs, num_nh_buckets);
 158        info->nh_res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO |
 159                                       __GFP_NOWARN);
 160        if (!info->nh_res_table)
 161                return -ENOMEM;
 162
 163        info->nh_res_table->num_nh_buckets = num_nh_buckets;
 164
 165        for (i = 0; i < num_nh_buckets; i++) {
 166                struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
 167                struct nh_grp_entry *nhge;
 168                struct nh_info *nhi;
 169
 170                nhge = rtnl_dereference(bucket->nh_entry);
 171                nhi = rtnl_dereference(nhge->nh->nh_info);
 172                __nh_notifier_single_info_init(&info->nh_res_table->nhs[i],
 173                                               nhi);
 174        }
 175
 176        return 0;
 177}
 178
 179static int nh_notifier_grp_info_init(struct nh_notifier_info *info,
 180                                     const struct nexthop *nh)
 181{
 182        struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
 183
 184        if (nhg->hash_threshold)
 185                return nh_notifier_mpath_info_init(info, nhg);
 186        else if (nhg->resilient)
 187                return nh_notifier_res_table_info_init(info, nhg);
 188        return -EINVAL;
 189}
 190
 191static void nh_notifier_grp_info_fini(struct nh_notifier_info *info,
 192                                      const struct nexthop *nh)
 193{
 194        struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
 195
 196        if (nhg->hash_threshold)
 197                kfree(info->nh_grp);
 198        else if (nhg->resilient)
 199                vfree(info->nh_res_table);
 200}
 201
 202static int nh_notifier_info_init(struct nh_notifier_info *info,
 203                                 const struct nexthop *nh)
 204{
 205        info->id = nh->id;
 206
 207        if (nh->is_group)
 208                return nh_notifier_grp_info_init(info, nh);
 209        else
 210                return nh_notifier_single_info_init(info, nh);
 211}
 212
 213static void nh_notifier_info_fini(struct nh_notifier_info *info,
 214                                  const struct nexthop *nh)
 215{
 216        if (nh->is_group)
 217                nh_notifier_grp_info_fini(info, nh);
 218        else
 219                nh_notifier_single_info_fini(info);
 220}
 221
 222static int call_nexthop_notifiers(struct net *net,
 223                                  enum nexthop_event_type event_type,
 224                                  struct nexthop *nh,
 225                                  struct netlink_ext_ack *extack)
 226{
 227        struct nh_notifier_info info = {
 228                .net = net,
 229                .extack = extack,
 230        };
 231        int err;
 232
 233        ASSERT_RTNL();
 234
 235        if (nexthop_notifiers_is_empty(net))
 236                return 0;
 237
 238        err = nh_notifier_info_init(&info, nh);
 239        if (err) {
 240                NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info");
 241                return err;
 242        }
 243
 244        err = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
 245                                           event_type, &info);
 246        nh_notifier_info_fini(&info, nh);
 247
 248        return notifier_to_errno(err);
 249}
 250
 251static int
 252nh_notifier_res_bucket_idle_timer_get(const struct nh_notifier_info *info,
 253                                      bool force, unsigned int *p_idle_timer_ms)
 254{
 255        struct nh_res_table *res_table;
 256        struct nh_group *nhg;
 257        struct nexthop *nh;
 258        int err = 0;
 259
 260        /* When 'force' is false, nexthop bucket replacement is performed
 261         * because the bucket was deemed to be idle. In this case, capable
 262         * listeners can choose to perform an atomic replacement: The bucket is
 263         * only replaced if it is inactive. However, if the idle timer interval
 264         * is smaller than the interval in which a listener is querying
 265         * buckets' activity from the device, then atomic replacement should
 266         * not be tried. Pass the idle timer value to listeners, so that they
 267         * could determine which type of replacement to perform.
 268         */
 269        if (force) {
 270                *p_idle_timer_ms = 0;
 271                return 0;
 272        }
 273
 274        rcu_read_lock();
 275
 276        nh = nexthop_find_by_id(info->net, info->id);
 277        if (!nh) {
 278                err = -EINVAL;
 279                goto out;
 280        }
 281
 282        nhg = rcu_dereference(nh->nh_grp);
 283        res_table = rcu_dereference(nhg->res_table);
 284        *p_idle_timer_ms = jiffies_to_msecs(res_table->idle_timer);
 285
 286out:
 287        rcu_read_unlock();
 288
 289        return err;
 290}
 291
 292static int nh_notifier_res_bucket_info_init(struct nh_notifier_info *info,
 293                                            u16 bucket_index, bool force,
 294                                            struct nh_info *oldi,
 295                                            struct nh_info *newi)
 296{
 297        unsigned int idle_timer_ms;
 298        int err;
 299
 300        err = nh_notifier_res_bucket_idle_timer_get(info, force,
 301                                                    &idle_timer_ms);
 302        if (err)
 303                return err;
 304
 305        info->type = NH_NOTIFIER_INFO_TYPE_RES_BUCKET;
 306        info->nh_res_bucket = kzalloc(sizeof(*info->nh_res_bucket),
 307                                      GFP_KERNEL);
 308        if (!info->nh_res_bucket)
 309                return -ENOMEM;
 310
 311        info->nh_res_bucket->bucket_index = bucket_index;
 312        info->nh_res_bucket->idle_timer_ms = idle_timer_ms;
 313        info->nh_res_bucket->force = force;
 314        __nh_notifier_single_info_init(&info->nh_res_bucket->old_nh, oldi);
 315        __nh_notifier_single_info_init(&info->nh_res_bucket->new_nh, newi);
 316        return 0;
 317}
 318
 319static void nh_notifier_res_bucket_info_fini(struct nh_notifier_info *info)
 320{
 321        kfree(info->nh_res_bucket);
 322}
 323
 324static int __call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id,
 325                                               u16 bucket_index, bool force,
 326                                               struct nh_info *oldi,
 327                                               struct nh_info *newi,
 328                                               struct netlink_ext_ack *extack)
 329{
 330        struct nh_notifier_info info = {
 331                .net = net,
 332                .extack = extack,
 333                .id = nhg_id,
 334        };
 335        int err;
 336
 337        if (nexthop_notifiers_is_empty(net))
 338                return 0;
 339
 340        err = nh_notifier_res_bucket_info_init(&info, bucket_index, force,
 341                                               oldi, newi);
 342        if (err)
 343                return err;
 344
 345        err = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
 346                                           NEXTHOP_EVENT_BUCKET_REPLACE, &info);
 347        nh_notifier_res_bucket_info_fini(&info);
 348
 349        return notifier_to_errno(err);
 350}
 351
 352/* There are three users of RES_TABLE, and NHs etc. referenced from there:
 353 *
 354 * 1) a collection of callbacks for NH maintenance. This operates under
 355 *    RTNL,
 356 * 2) the delayed work that gradually balances the resilient table,
 357 * 3) and nexthop_select_path(), operating under RCU.
 358 *
 359 * Both the delayed work and the RTNL block are writers, and need to
 360 * maintain mutual exclusion. Since there are only two and well-known
 361 * writers for each table, the RTNL code can make sure it has exclusive
 362 * access thus:
 363 *
 364 * - Have the DW operate without locking;
 365 * - synchronously cancel the DW;
 366 * - do the writing;
 367 * - if the write was not actually a delete, call upkeep, which schedules
 368 *   DW again if necessary.
 369 *
 370 * The functions that are always called from the RTNL context use
 371 * rtnl_dereference(). The functions that can also be called from the DW do
 372 * a raw dereference and rely on the above mutual exclusion scheme.
 373 */
 374#define nh_res_dereference(p) (rcu_dereference_raw(p))
 375
 376static int call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id,
 377                                             u16 bucket_index, bool force,
 378                                             struct nexthop *old_nh,
 379                                             struct nexthop *new_nh,
 380                                             struct netlink_ext_ack *extack)
 381{
 382        struct nh_info *oldi = nh_res_dereference(old_nh->nh_info);
 383        struct nh_info *newi = nh_res_dereference(new_nh->nh_info);
 384
 385        return __call_nexthop_res_bucket_notifiers(net, nhg_id, bucket_index,
 386                                                   force, oldi, newi, extack);
 387}
 388
 389static int call_nexthop_res_table_notifiers(struct net *net, struct nexthop *nh,
 390                                            struct netlink_ext_ack *extack)
 391{
 392        struct nh_notifier_info info = {
 393                .net = net,
 394                .extack = extack,
 395        };
 396        struct nh_group *nhg;
 397        int err;
 398
 399        ASSERT_RTNL();
 400
 401        if (nexthop_notifiers_is_empty(net))
 402                return 0;
 403
 404        /* At this point, the nexthop buckets are still not populated. Only
 405         * emit a notification with the logical nexthops, so that a listener
 406         * could potentially veto it in case of unsupported configuration.
 407         */
 408        nhg = rtnl_dereference(nh->nh_grp);
 409        err = nh_notifier_mpath_info_init(&info, nhg);
 410        if (err) {
 411                NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info");
 412                return err;
 413        }
 414
 415        err = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
 416                                           NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE,
 417                                           &info);
 418        kfree(info.nh_grp);
 419
 420        return notifier_to_errno(err);
 421}
 422
 423static int call_nexthop_notifier(struct notifier_block *nb, struct net *net,
 424                                 enum nexthop_event_type event_type,
 425                                 struct nexthop *nh,
 426                                 struct netlink_ext_ack *extack)
 427{
 428        struct nh_notifier_info info = {
 429                .net = net,
 430                .extack = extack,
 431        };
 432        int err;
 433
 434        err = nh_notifier_info_init(&info, nh);
 435        if (err)
 436                return err;
 437
 438        err = nb->notifier_call(nb, event_type, &info);
 439        nh_notifier_info_fini(&info, nh);
 440
 441        return notifier_to_errno(err);
 442}
 443
 444static unsigned int nh_dev_hashfn(unsigned int val)
 445{
 446        unsigned int mask = NH_DEV_HASHSIZE - 1;
 447
 448        return (val ^
 449                (val >> NH_DEV_HASHBITS) ^
 450                (val >> (NH_DEV_HASHBITS * 2))) & mask;
 451}
 452
 453static void nexthop_devhash_add(struct net *net, struct nh_info *nhi)
 454{
 455        struct net_device *dev = nhi->fib_nhc.nhc_dev;
 456        struct hlist_head *head;
 457        unsigned int hash;
 458
 459        WARN_ON(!dev);
 460
 461        hash = nh_dev_hashfn(dev->ifindex);
 462        head = &net->nexthop.devhash[hash];
 463        hlist_add_head(&nhi->dev_hash, head);
 464}
 465
 466static void nexthop_free_group(struct nexthop *nh)
 467{
 468        struct nh_group *nhg;
 469        int i;
 470
 471        nhg = rcu_dereference_raw(nh->nh_grp);
 472        for (i = 0; i < nhg->num_nh; ++i) {
 473                struct nh_grp_entry *nhge = &nhg->nh_entries[i];
 474
 475                WARN_ON(!list_empty(&nhge->nh_list));
 476                nexthop_put(nhge->nh);
 477        }
 478
 479        WARN_ON(nhg->spare == nhg);
 480
 481        if (nhg->resilient)
 482                vfree(rcu_dereference_raw(nhg->res_table));
 483
 484        kfree(nhg->spare);
 485        kfree(nhg);
 486}
 487
 488static void nexthop_free_single(struct nexthop *nh)
 489{
 490        struct nh_info *nhi;
 491
 492        nhi = rcu_dereference_raw(nh->nh_info);
 493        switch (nhi->family) {
 494        case AF_INET:
 495                fib_nh_release(nh->net, &nhi->fib_nh);
 496                break;
 497        case AF_INET6:
 498                ipv6_stub->fib6_nh_release(&nhi->fib6_nh);
 499                break;
 500        }
 501        kfree(nhi);
 502}
 503
 504void nexthop_free_rcu(struct rcu_head *head)
 505{
 506        struct nexthop *nh = container_of(head, struct nexthop, rcu);
 507
 508        if (nh->is_group)
 509                nexthop_free_group(nh);
 510        else
 511                nexthop_free_single(nh);
 512
 513        kfree(nh);
 514}
 515EXPORT_SYMBOL_GPL(nexthop_free_rcu);
 516
 517static struct nexthop *nexthop_alloc(void)
 518{
 519        struct nexthop *nh;
 520
 521        nh = kzalloc(sizeof(struct nexthop), GFP_KERNEL);
 522        if (nh) {
 523                INIT_LIST_HEAD(&nh->fi_list);
 524                INIT_LIST_HEAD(&nh->f6i_list);
 525                INIT_LIST_HEAD(&nh->grp_list);
 526                INIT_LIST_HEAD(&nh->fdb_list);
 527        }
 528        return nh;
 529}
 530
 531static struct nh_group *nexthop_grp_alloc(u16 num_nh)
 532{
 533        struct nh_group *nhg;
 534
 535        nhg = kzalloc(struct_size(nhg, nh_entries, num_nh), GFP_KERNEL);
 536        if (nhg)
 537                nhg->num_nh = num_nh;
 538
 539        return nhg;
 540}
 541
 542static void nh_res_table_upkeep_dw(struct work_struct *work);
 543
 544static struct nh_res_table *
 545nexthop_res_table_alloc(struct net *net, u32 nhg_id, struct nh_config *cfg)
 546{
 547        const u16 num_nh_buckets = cfg->nh_grp_res_num_buckets;
 548        struct nh_res_table *res_table;
 549        unsigned long size;
 550
 551        size = struct_size(res_table, nh_buckets, num_nh_buckets);
 552        res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN);
 553        if (!res_table)
 554                return NULL;
 555
 556        res_table->net = net;
 557        res_table->nhg_id = nhg_id;
 558        INIT_DELAYED_WORK(&res_table->upkeep_dw, &nh_res_table_upkeep_dw);
 559        INIT_LIST_HEAD(&res_table->uw_nh_entries);
 560        res_table->idle_timer = cfg->nh_grp_res_idle_timer;
 561        res_table->unbalanced_timer = cfg->nh_grp_res_unbalanced_timer;
 562        res_table->num_nh_buckets = num_nh_buckets;
 563        return res_table;
 564}
 565
 566static void nh_base_seq_inc(struct net *net)
 567{
 568        while (++net->nexthop.seq == 0)
 569                ;
 570}
 571
 572/* no reference taken; rcu lock or rtnl must be held */
 573struct nexthop *nexthop_find_by_id(struct net *net, u32 id)
 574{
 575        struct rb_node **pp, *parent = NULL, *next;
 576
 577        pp = &net->nexthop.rb_root.rb_node;
 578        while (1) {
 579                struct nexthop *nh;
 580
 581                next = rcu_dereference_raw(*pp);
 582                if (!next)
 583                        break;
 584                parent = next;
 585
 586                nh = rb_entry(parent, struct nexthop, rb_node);
 587                if (id < nh->id)
 588                        pp = &next->rb_left;
 589                else if (id > nh->id)
 590                        pp = &next->rb_right;
 591                else
 592                        return nh;
 593        }
 594        return NULL;
 595}
 596EXPORT_SYMBOL_GPL(nexthop_find_by_id);
 597
 598/* used for auto id allocation; called with rtnl held */
 599static u32 nh_find_unused_id(struct net *net)
 600{
 601        u32 id_start = net->nexthop.last_id_allocated;
 602
 603        while (1) {
 604                net->nexthop.last_id_allocated++;
 605                if (net->nexthop.last_id_allocated == id_start)
 606                        break;
 607
 608                if (!nexthop_find_by_id(net, net->nexthop.last_id_allocated))
 609                        return net->nexthop.last_id_allocated;
 610        }
 611        return 0;
 612}
 613
 614static void nh_res_time_set_deadline(unsigned long next_time,
 615                                     unsigned long *deadline)
 616{
 617        if (time_before(next_time, *deadline))
 618                *deadline = next_time;
 619}
 620
 621static clock_t nh_res_table_unbalanced_time(struct nh_res_table *res_table)
 622{
 623        if (list_empty(&res_table->uw_nh_entries))
 624                return 0;
 625        return jiffies_delta_to_clock_t(jiffies - res_table->unbalanced_since);
 626}
 627
 628static int nla_put_nh_group_res(struct sk_buff *skb, struct nh_group *nhg)
 629{
 630        struct nh_res_table *res_table = rtnl_dereference(nhg->res_table);
 631        struct nlattr *nest;
 632
 633        nest = nla_nest_start(skb, NHA_RES_GROUP);
 634        if (!nest)
 635                return -EMSGSIZE;
 636
 637        if (nla_put_u16(skb, NHA_RES_GROUP_BUCKETS,
 638                        res_table->num_nh_buckets) ||
 639            nla_put_u32(skb, NHA_RES_GROUP_IDLE_TIMER,
 640                        jiffies_to_clock_t(res_table->idle_timer)) ||
 641            nla_put_u32(skb, NHA_RES_GROUP_UNBALANCED_TIMER,
 642                        jiffies_to_clock_t(res_table->unbalanced_timer)) ||
 643            nla_put_u64_64bit(skb, NHA_RES_GROUP_UNBALANCED_TIME,
 644                              nh_res_table_unbalanced_time(res_table),
 645                              NHA_RES_GROUP_PAD))
 646                goto nla_put_failure;
 647
 648        nla_nest_end(skb, nest);
 649        return 0;
 650
 651nla_put_failure:
 652        nla_nest_cancel(skb, nest);
 653        return -EMSGSIZE;
 654}
 655
 656static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nhg)
 657{
 658        struct nexthop_grp *p;
 659        size_t len = nhg->num_nh * sizeof(*p);
 660        struct nlattr *nla;
 661        u16 group_type = 0;
 662        int i;
 663
 664        if (nhg->hash_threshold)
 665                group_type = NEXTHOP_GRP_TYPE_MPATH;
 666        else if (nhg->resilient)
 667                group_type = NEXTHOP_GRP_TYPE_RES;
 668
 669        if (nla_put_u16(skb, NHA_GROUP_TYPE, group_type))
 670                goto nla_put_failure;
 671
 672        nla = nla_reserve(skb, NHA_GROUP, len);
 673        if (!nla)
 674                goto nla_put_failure;
 675
 676        p = nla_data(nla);
 677        for (i = 0; i < nhg->num_nh; ++i) {
 678                p->id = nhg->nh_entries[i].nh->id;
 679                p->weight = nhg->nh_entries[i].weight - 1;
 680                p += 1;
 681        }
 682
 683        if (nhg->resilient && nla_put_nh_group_res(skb, nhg))
 684                goto nla_put_failure;
 685
 686        return 0;
 687
 688nla_put_failure:
 689        return -EMSGSIZE;
 690}
 691
 692static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
 693                        int event, u32 portid, u32 seq, unsigned int nlflags)
 694{
 695        struct fib6_nh *fib6_nh;
 696        struct fib_nh *fib_nh;
 697        struct nlmsghdr *nlh;
 698        struct nh_info *nhi;
 699        struct nhmsg *nhm;
 700
 701        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags);
 702        if (!nlh)
 703                return -EMSGSIZE;
 704
 705        nhm = nlmsg_data(nlh);
 706        nhm->nh_family = AF_UNSPEC;
 707        nhm->nh_flags = nh->nh_flags;
 708        nhm->nh_protocol = nh->protocol;
 709        nhm->nh_scope = 0;
 710        nhm->resvd = 0;
 711
 712        if (nla_put_u32(skb, NHA_ID, nh->id))
 713                goto nla_put_failure;
 714
 715        if (nh->is_group) {
 716                struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
 717
 718                if (nhg->fdb_nh && nla_put_flag(skb, NHA_FDB))
 719                        goto nla_put_failure;
 720                if (nla_put_nh_group(skb, nhg))
 721                        goto nla_put_failure;
 722                goto out;
 723        }
 724
 725        nhi = rtnl_dereference(nh->nh_info);
 726        nhm->nh_family = nhi->family;
 727        if (nhi->reject_nh) {
 728                if (nla_put_flag(skb, NHA_BLACKHOLE))
 729                        goto nla_put_failure;
 730                goto out;
 731        } else if (nhi->fdb_nh) {
 732                if (nla_put_flag(skb, NHA_FDB))
 733                        goto nla_put_failure;
 734        } else {
 735                const struct net_device *dev;
 736
 737                dev = nhi->fib_nhc.nhc_dev;
 738                if (dev && nla_put_u32(skb, NHA_OIF, dev->ifindex))
 739                        goto nla_put_failure;
 740        }
 741
 742        nhm->nh_scope = nhi->fib_nhc.nhc_scope;
 743        switch (nhi->family) {
 744        case AF_INET:
 745                fib_nh = &nhi->fib_nh;
 746                if (fib_nh->fib_nh_gw_family &&
 747                    nla_put_be32(skb, NHA_GATEWAY, fib_nh->fib_nh_gw4))
 748                        goto nla_put_failure;
 749                break;
 750
 751        case AF_INET6:
 752                fib6_nh = &nhi->fib6_nh;
 753                if (fib6_nh->fib_nh_gw_family &&
 754                    nla_put_in6_addr(skb, NHA_GATEWAY, &fib6_nh->fib_nh_gw6))
 755                        goto nla_put_failure;
 756                break;
 757        }
 758
 759        if (nhi->fib_nhc.nhc_lwtstate &&
 760            lwtunnel_fill_encap(skb, nhi->fib_nhc.nhc_lwtstate,
 761                                NHA_ENCAP, NHA_ENCAP_TYPE) < 0)
 762                goto nla_put_failure;
 763
 764out:
 765        nlmsg_end(skb, nlh);
 766        return 0;
 767
 768nla_put_failure:
 769        nlmsg_cancel(skb, nlh);
 770        return -EMSGSIZE;
 771}
 772
 773static size_t nh_nlmsg_size_grp_res(struct nh_group *nhg)
 774{
 775        return nla_total_size(0) +      /* NHA_RES_GROUP */
 776                nla_total_size(2) +     /* NHA_RES_GROUP_BUCKETS */
 777                nla_total_size(4) +     /* NHA_RES_GROUP_IDLE_TIMER */
 778                nla_total_size(4) +     /* NHA_RES_GROUP_UNBALANCED_TIMER */
 779                nla_total_size_64bit(8);/* NHA_RES_GROUP_UNBALANCED_TIME */
 780}
 781
 782static size_t nh_nlmsg_size_grp(struct nexthop *nh)
 783{
 784        struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
 785        size_t sz = sizeof(struct nexthop_grp) * nhg->num_nh;
 786        size_t tot = nla_total_size(sz) +
 787                nla_total_size(2); /* NHA_GROUP_TYPE */
 788
 789        if (nhg->resilient)
 790                tot += nh_nlmsg_size_grp_res(nhg);
 791
 792        return tot;
 793}
 794
 795static size_t nh_nlmsg_size_single(struct nexthop *nh)
 796{
 797        struct nh_info *nhi = rtnl_dereference(nh->nh_info);
 798        size_t sz;
 799
 800        /* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE
 801         * are mutually exclusive
 802         */
 803        sz = nla_total_size(4);  /* NHA_OIF */
 804
 805        switch (nhi->family) {
 806        case AF_INET:
 807                if (nhi->fib_nh.fib_nh_gw_family)
 808                        sz += nla_total_size(4);  /* NHA_GATEWAY */
 809                break;
 810
 811        case AF_INET6:
 812                /* NHA_GATEWAY */
 813                if (nhi->fib6_nh.fib_nh_gw_family)
 814                        sz += nla_total_size(sizeof(const struct in6_addr));
 815                break;
 816        }
 817
 818        if (nhi->fib_nhc.nhc_lwtstate) {
 819                sz += lwtunnel_get_encap_size(nhi->fib_nhc.nhc_lwtstate);
 820                sz += nla_total_size(2);  /* NHA_ENCAP_TYPE */
 821        }
 822
 823        return sz;
 824}
 825
 826static size_t nh_nlmsg_size(struct nexthop *nh)
 827{
 828        size_t sz = NLMSG_ALIGN(sizeof(struct nhmsg));
 829
 830        sz += nla_total_size(4); /* NHA_ID */
 831
 832        if (nh->is_group)
 833                sz += nh_nlmsg_size_grp(nh);
 834        else
 835                sz += nh_nlmsg_size_single(nh);
 836
 837        return sz;
 838}
 839
 840static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info)
 841{
 842        unsigned int nlflags = info->nlh ? info->nlh->nlmsg_flags : 0;
 843        u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
 844        struct sk_buff *skb;
 845        int err = -ENOBUFS;
 846
 847        skb = nlmsg_new(nh_nlmsg_size(nh), gfp_any());
 848        if (!skb)
 849                goto errout;
 850
 851        err = nh_fill_node(skb, nh, event, info->portid, seq, nlflags);
 852        if (err < 0) {
 853                /* -EMSGSIZE implies BUG in nh_nlmsg_size() */
 854                WARN_ON(err == -EMSGSIZE);
 855                kfree_skb(skb);
 856                goto errout;
 857        }
 858
 859        rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_NEXTHOP,
 860                    info->nlh, gfp_any());
 861        return;
 862errout:
 863        if (err < 0)
 864                rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err);
 865}
 866
 867static unsigned long nh_res_bucket_used_time(const struct nh_res_bucket *bucket)
 868{
 869        return (unsigned long)atomic_long_read(&bucket->used_time);
 870}
 871
 872static unsigned long
 873nh_res_bucket_idle_point(const struct nh_res_table *res_table,
 874                         const struct nh_res_bucket *bucket,
 875                         unsigned long now)
 876{
 877        unsigned long time = nh_res_bucket_used_time(bucket);
 878
 879        /* Bucket was not used since it was migrated. The idle time is now. */
 880        if (time == bucket->migrated_time)
 881                return now;
 882
 883        return time + res_table->idle_timer;
 884}
 885
 886static unsigned long
 887nh_res_table_unb_point(const struct nh_res_table *res_table)
 888{
 889        return res_table->unbalanced_since + res_table->unbalanced_timer;
 890}
 891
 892static void nh_res_bucket_set_idle(const struct nh_res_table *res_table,
 893                                   struct nh_res_bucket *bucket)
 894{
 895        unsigned long now = jiffies;
 896
 897        atomic_long_set(&bucket->used_time, (long)now);
 898        bucket->migrated_time = now;
 899}
 900
 901static void nh_res_bucket_set_busy(struct nh_res_bucket *bucket)
 902{
 903        atomic_long_set(&bucket->used_time, (long)jiffies);
 904}
 905
 906static clock_t nh_res_bucket_idle_time(const struct nh_res_bucket *bucket)
 907{
 908        unsigned long used_time = nh_res_bucket_used_time(bucket);
 909
 910        return jiffies_delta_to_clock_t(jiffies - used_time);
 911}
 912
 913static int nh_fill_res_bucket(struct sk_buff *skb, struct nexthop *nh,
 914                              struct nh_res_bucket *bucket, u16 bucket_index,
 915                              int event, u32 portid, u32 seq,
 916                              unsigned int nlflags,
 917                              struct netlink_ext_ack *extack)
 918{
 919        struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry);
 920        struct nlmsghdr *nlh;
 921        struct nlattr *nest;
 922        struct nhmsg *nhm;
 923
 924        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags);
 925        if (!nlh)
 926                return -EMSGSIZE;
 927
 928        nhm = nlmsg_data(nlh);
 929        nhm->nh_family = AF_UNSPEC;
 930        nhm->nh_flags = bucket->nh_flags;
 931        nhm->nh_protocol = nh->protocol;
 932        nhm->nh_scope = 0;
 933        nhm->resvd = 0;
 934
 935        if (nla_put_u32(skb, NHA_ID, nh->id))
 936                goto nla_put_failure;
 937
 938        nest = nla_nest_start(skb, NHA_RES_BUCKET);
 939        if (!nest)
 940                goto nla_put_failure;
 941
 942        if (nla_put_u16(skb, NHA_RES_BUCKET_INDEX, bucket_index) ||
 943            nla_put_u32(skb, NHA_RES_BUCKET_NH_ID, nhge->nh->id) ||
 944            nla_put_u64_64bit(skb, NHA_RES_BUCKET_IDLE_TIME,
 945                              nh_res_bucket_idle_time(bucket),
 946                              NHA_RES_BUCKET_PAD))
 947                goto nla_put_failure_nest;
 948
 949        nla_nest_end(skb, nest);
 950        nlmsg_end(skb, nlh);
 951        return 0;
 952
 953nla_put_failure_nest:
 954        nla_nest_cancel(skb, nest);
 955nla_put_failure:
 956        nlmsg_cancel(skb, nlh);
 957        return -EMSGSIZE;
 958}
 959
 960static void nexthop_bucket_notify(struct nh_res_table *res_table,
 961                                  u16 bucket_index)
 962{
 963        struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index];
 964        struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry);
 965        struct nexthop *nh = nhge->nh_parent;
 966        struct sk_buff *skb;
 967        int err = -ENOBUFS;
 968
 969        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
 970        if (!skb)
 971                goto errout;
 972
 973        err = nh_fill_res_bucket(skb, nh, bucket, bucket_index,
 974                                 RTM_NEWNEXTHOPBUCKET, 0, 0, NLM_F_REPLACE,
 975                                 NULL);
 976        if (err < 0) {
 977                kfree_skb(skb);
 978                goto errout;
 979        }
 980
 981        rtnl_notify(skb, nh->net, 0, RTNLGRP_NEXTHOP, NULL, GFP_KERNEL);
 982        return;
 983errout:
 984        if (err < 0)
 985                rtnl_set_sk_err(nh->net, RTNLGRP_NEXTHOP, err);
 986}
 987
 988static bool valid_group_nh(struct nexthop *nh, unsigned int npaths,
 989                           bool *is_fdb, struct netlink_ext_ack *extack)
 990{
 991        if (nh->is_group) {
 992                struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
 993
 994                /* Nesting groups within groups is not supported. */
 995                if (nhg->hash_threshold) {
 996                        NL_SET_ERR_MSG(extack,
 997                                       "Hash-threshold group can not be a nexthop within a group");
 998                        return false;
 999                }
1000                if (nhg->resilient) {
1001                        NL_SET_ERR_MSG(extack,
1002                                       "Resilient group can not be a nexthop within a group");
1003                        return false;
1004                }
1005                *is_fdb = nhg->fdb_nh;
1006        } else {
1007                struct nh_info *nhi = rtnl_dereference(nh->nh_info);
1008
1009                if (nhi->reject_nh && npaths > 1) {
1010                        NL_SET_ERR_MSG(extack,
1011                                       "Blackhole nexthop can not be used in a group with more than 1 path");
1012                        return false;
1013                }
1014                *is_fdb = nhi->fdb_nh;
1015        }
1016
1017        return true;
1018}
1019
1020static int nh_check_attr_fdb_group(struct nexthop *nh, u8 *nh_family,
1021                                   struct netlink_ext_ack *extack)
1022{
1023        struct nh_info *nhi;
1024
1025        nhi = rtnl_dereference(nh->nh_info);
1026
1027        if (!nhi->fdb_nh) {
1028                NL_SET_ERR_MSG(extack, "FDB nexthop group can only have fdb nexthops");
1029                return -EINVAL;
1030        }
1031
1032        if (*nh_family == AF_UNSPEC) {
1033                *nh_family = nhi->family;
1034        } else if (*nh_family != nhi->family) {
1035                NL_SET_ERR_MSG(extack, "FDB nexthop group cannot have mixed family nexthops");
1036                return -EINVAL;
1037        }
1038
1039        return 0;
1040}
1041
1042static int nh_check_attr_group(struct net *net,
1043                               struct nlattr *tb[], size_t tb_size,
1044                               u16 nh_grp_type, struct netlink_ext_ack *extack)
1045{
1046        unsigned int len = nla_len(tb[NHA_GROUP]);
1047        u8 nh_family = AF_UNSPEC;
1048        struct nexthop_grp *nhg;
1049        unsigned int i, j;
1050        u8 nhg_fdb = 0;
1051
1052        if (!len || len & (sizeof(struct nexthop_grp) - 1)) {
1053                NL_SET_ERR_MSG(extack,
1054                               "Invalid length for nexthop group attribute");
1055                return -EINVAL;
1056        }
1057
1058        /* convert len to number of nexthop ids */
1059        len /= sizeof(*nhg);
1060
1061        nhg = nla_data(tb[NHA_GROUP]);
1062        for (i = 0; i < len; ++i) {
1063                if (nhg[i].resvd1 || nhg[i].resvd2) {
1064                        NL_SET_ERR_MSG(extack, "Reserved fields in nexthop_grp must be 0");
1065                        return -EINVAL;
1066                }
1067                if (nhg[i].weight > 254) {
1068                        NL_SET_ERR_MSG(extack, "Invalid value for weight");
1069                        return -EINVAL;
1070                }
1071                for (j = i + 1; j < len; ++j) {
1072                        if (nhg[i].id == nhg[j].id) {
1073                                NL_SET_ERR_MSG(extack, "Nexthop id can not be used twice in a group");
1074                                return -EINVAL;
1075                        }
1076                }
1077        }
1078
1079        if (tb[NHA_FDB])
1080                nhg_fdb = 1;
1081        nhg = nla_data(tb[NHA_GROUP]);
1082        for (i = 0; i < len; ++i) {
1083                struct nexthop *nh;
1084                bool is_fdb_nh;
1085
1086                nh = nexthop_find_by_id(net, nhg[i].id);
1087                if (!nh) {
1088                        NL_SET_ERR_MSG(extack, "Invalid nexthop id");
1089                        return -EINVAL;
1090                }
1091                if (!valid_group_nh(nh, len, &is_fdb_nh, extack))
1092                        return -EINVAL;
1093
1094                if (nhg_fdb && nh_check_attr_fdb_group(nh, &nh_family, extack))
1095                        return -EINVAL;
1096
1097                if (!nhg_fdb && is_fdb_nh) {
1098                        NL_SET_ERR_MSG(extack, "Non FDB nexthop group cannot have fdb nexthops");
1099                        return -EINVAL;
1100                }
1101        }
1102        for (i = NHA_GROUP_TYPE + 1; i < tb_size; ++i) {
1103                if (!tb[i])
1104                        continue;
1105                switch (i) {
1106                case NHA_FDB:
1107                        continue;
1108                case NHA_RES_GROUP:
1109                        if (nh_grp_type == NEXTHOP_GRP_TYPE_RES)
1110                                continue;
1111                        break;
1112                }
1113                NL_SET_ERR_MSG(extack,
1114                               "No other attributes can be set in nexthop groups");
1115                return -EINVAL;
1116        }
1117
1118        return 0;
1119}
1120
1121static bool ipv6_good_nh(const struct fib6_nh *nh)
1122{
1123        int state = NUD_REACHABLE;
1124        struct neighbour *n;
1125
1126        rcu_read_lock_bh();
1127
1128        n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev, &nh->fib_nh_gw6);
1129        if (n)
1130                state = n->nud_state;
1131
1132        rcu_read_unlock_bh();
1133
1134        return !!(state & NUD_VALID);
1135}
1136
1137static bool ipv4_good_nh(const struct fib_nh *nh)
1138{
1139        int state = NUD_REACHABLE;
1140        struct neighbour *n;
1141
1142        rcu_read_lock_bh();
1143
1144        n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev,
1145                                      (__force u32)nh->fib_nh_gw4);
1146        if (n)
1147                state = n->nud_state;
1148
1149        rcu_read_unlock_bh();
1150
1151        return !!(state & NUD_VALID);
1152}
1153
1154static struct nexthop *nexthop_select_path_hthr(struct nh_group *nhg, int hash)
1155{
1156        struct nexthop *rc = NULL;
1157        int i;
1158
1159        for (i = 0; i < nhg->num_nh; ++i) {
1160                struct nh_grp_entry *nhge = &nhg->nh_entries[i];
1161                struct nh_info *nhi;
1162
1163                if (hash > atomic_read(&nhge->hthr.upper_bound))
1164                        continue;
1165
1166                nhi = rcu_dereference(nhge->nh->nh_info);
1167                if (nhi->fdb_nh)
1168                        return nhge->nh;
1169
1170                /* nexthops always check if it is good and does
1171                 * not rely on a sysctl for this behavior
1172                 */
1173                switch (nhi->family) {
1174                case AF_INET:
1175                        if (ipv4_good_nh(&nhi->fib_nh))
1176                                return nhge->nh;
1177                        break;
1178                case AF_INET6:
1179                        if (ipv6_good_nh(&nhi->fib6_nh))
1180                                return nhge->nh;
1181                        break;
1182                }
1183
1184                if (!rc)
1185                        rc = nhge->nh;
1186        }
1187
1188        return rc;
1189}
1190
1191static struct nexthop *nexthop_select_path_res(struct nh_group *nhg, int hash)
1192{
1193        struct nh_res_table *res_table = rcu_dereference(nhg->res_table);
1194        u16 bucket_index = hash % res_table->num_nh_buckets;
1195        struct nh_res_bucket *bucket;
1196        struct nh_grp_entry *nhge;
1197
1198        /* nexthop_select_path() is expected to return a non-NULL value, so
1199         * skip protocol validation and just hand out whatever there is.
1200         */
1201        bucket = &res_table->nh_buckets[bucket_index];
1202        nh_res_bucket_set_busy(bucket);
1203        nhge = rcu_dereference(bucket->nh_entry);
1204        return nhge->nh;
1205}
1206
1207struct nexthop *nexthop_select_path(struct nexthop *nh, int hash)
1208{
1209        struct nh_group *nhg;
1210
1211        if (!nh->is_group)
1212                return nh;
1213
1214        nhg = rcu_dereference(nh->nh_grp);
1215        if (nhg->hash_threshold)
1216                return nexthop_select_path_hthr(nhg, hash);
1217        else if (nhg->resilient)
1218                return nexthop_select_path_res(nhg, hash);
1219
1220        /* Unreachable. */
1221        return NULL;
1222}
1223EXPORT_SYMBOL_GPL(nexthop_select_path);
1224
1225int nexthop_for_each_fib6_nh(struct nexthop *nh,
1226                             int (*cb)(struct fib6_nh *nh, void *arg),
1227                             void *arg)
1228{
1229        struct nh_info *nhi;
1230        int err;
1231
1232        if (nh->is_group) {
1233                struct nh_group *nhg;
1234                int i;
1235
1236                nhg = rcu_dereference_rtnl(nh->nh_grp);
1237                for (i = 0; i < nhg->num_nh; i++) {
1238                        struct nh_grp_entry *nhge = &nhg->nh_entries[i];
1239
1240                        nhi = rcu_dereference_rtnl(nhge->nh->nh_info);
1241                        err = cb(&nhi->fib6_nh, arg);
1242                        if (err)
1243                                return err;
1244                }
1245        } else {
1246                nhi = rcu_dereference_rtnl(nh->nh_info);
1247                err = cb(&nhi->fib6_nh, arg);
1248                if (err)
1249                        return err;
1250        }
1251
1252        return 0;
1253}
1254EXPORT_SYMBOL_GPL(nexthop_for_each_fib6_nh);
1255
1256static int check_src_addr(const struct in6_addr *saddr,
1257                          struct netlink_ext_ack *extack)
1258{
1259        if (!ipv6_addr_any(saddr)) {
1260                NL_SET_ERR_MSG(extack, "IPv6 routes using source address can not use nexthop objects");
1261                return -EINVAL;
1262        }
1263        return 0;
1264}
1265
1266int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg,
1267                       struct netlink_ext_ack *extack)
1268{
1269        struct nh_info *nhi;
1270        bool is_fdb_nh;
1271
1272        /* fib6_src is unique to a fib6_info and limits the ability to cache
1273         * routes in fib6_nh within a nexthop that is potentially shared
1274         * across multiple fib entries. If the config wants to use source
1275         * routing it can not use nexthop objects. mlxsw also does not allow
1276         * fib6_src on routes.
1277         */
1278        if (cfg && check_src_addr(&cfg->fc_src, extack) < 0)
1279                return -EINVAL;
1280
1281        if (nh->is_group) {
1282                struct nh_group *nhg;
1283
1284                nhg = rtnl_dereference(nh->nh_grp);
1285                if (nhg->has_v4)
1286                        goto no_v4_nh;
1287                is_fdb_nh = nhg->fdb_nh;
1288        } else {
1289                nhi = rtnl_dereference(nh->nh_info);
1290                if (nhi->family == AF_INET)
1291                        goto no_v4_nh;
1292                is_fdb_nh = nhi->fdb_nh;
1293        }
1294
1295        if (is_fdb_nh) {
1296                NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
1297                return -EINVAL;
1298        }
1299
1300        return 0;
1301no_v4_nh:
1302        NL_SET_ERR_MSG(extack, "IPv6 routes can not use an IPv4 nexthop");
1303        return -EINVAL;
1304}
1305EXPORT_SYMBOL_GPL(fib6_check_nexthop);
1306
1307/* if existing nexthop has ipv6 routes linked to it, need
1308 * to verify this new spec works with ipv6
1309 */
1310static int fib6_check_nh_list(struct nexthop *old, struct nexthop *new,
1311                              struct netlink_ext_ack *extack)
1312{
1313        struct fib6_info *f6i;
1314
1315        if (list_empty(&old->f6i_list))
1316                return 0;
1317
1318        list_for_each_entry(f6i, &old->f6i_list, nh_list) {
1319                if (check_src_addr(&f6i->fib6_src.addr, extack) < 0)
1320                        return -EINVAL;
1321        }
1322
1323        return fib6_check_nexthop(new, NULL, extack);
1324}
1325
1326static int nexthop_check_scope(struct nh_info *nhi, u8 scope,
1327                               struct netlink_ext_ack *extack)
1328{
1329        if (scope == RT_SCOPE_HOST && nhi->fib_nhc.nhc_gw_family) {
1330                NL_SET_ERR_MSG(extack,
1331                               "Route with host scope can not have a gateway");
1332                return -EINVAL;
1333        }
1334
1335        if (nhi->fib_nhc.nhc_flags & RTNH_F_ONLINK && scope >= RT_SCOPE_LINK) {
1336                NL_SET_ERR_MSG(extack, "Scope mismatch with nexthop");
1337                return -EINVAL;
1338        }
1339
1340        return 0;
1341}
1342
1343/* Invoked by fib add code to verify nexthop by id is ok with
1344 * config for prefix; parts of fib_check_nh not done when nexthop
1345 * object is used.
1346 */
1347int fib_check_nexthop(struct nexthop *nh, u8 scope,
1348                      struct netlink_ext_ack *extack)
1349{
1350        struct nh_info *nhi;
1351        int err = 0;
1352
1353        if (nh->is_group) {
1354                struct nh_group *nhg;
1355
1356                nhg = rtnl_dereference(nh->nh_grp);
1357                if (nhg->fdb_nh) {
1358                        NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
1359                        err = -EINVAL;
1360                        goto out;
1361                }
1362
1363                if (scope == RT_SCOPE_HOST) {
1364                        NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops");
1365                        err = -EINVAL;
1366                        goto out;
1367                }
1368
1369                /* all nexthops in a group have the same scope */
1370                nhi = rtnl_dereference(nhg->nh_entries[0].nh->nh_info);
1371                err = nexthop_check_scope(nhi, scope, extack);
1372        } else {
1373                nhi = rtnl_dereference(nh->nh_info);
1374                if (nhi->fdb_nh) {
1375                        NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
1376                        err = -EINVAL;
1377                        goto out;
1378                }
1379                err = nexthop_check_scope(nhi, scope, extack);
1380        }
1381
1382out:
1383        return err;
1384}
1385
1386static int fib_check_nh_list(struct nexthop *old, struct nexthop *new,
1387                             struct netlink_ext_ack *extack)
1388{
1389        struct fib_info *fi;
1390
1391        list_for_each_entry(fi, &old->fi_list, nh_list) {
1392                int err;
1393
1394                err = fib_check_nexthop(new, fi->fib_scope, extack);
1395                if (err)
1396                        return err;
1397        }
1398        return 0;
1399}
1400
1401static bool nh_res_nhge_is_balanced(const struct nh_grp_entry *nhge)
1402{
1403        return nhge->res.count_buckets == nhge->res.wants_buckets;
1404}
1405
1406static bool nh_res_nhge_is_ow(const struct nh_grp_entry *nhge)
1407{
1408        return nhge->res.count_buckets > nhge->res.wants_buckets;
1409}
1410
1411static bool nh_res_nhge_is_uw(const struct nh_grp_entry *nhge)
1412{
1413        return nhge->res.count_buckets < nhge->res.wants_buckets;
1414}
1415
1416static bool nh_res_table_is_balanced(const struct nh_res_table *res_table)
1417{
1418        return list_empty(&res_table->uw_nh_entries);
1419}
1420
1421static void nh_res_bucket_unset_nh(struct nh_res_bucket *bucket)
1422{
1423        struct nh_grp_entry *nhge;
1424
1425        if (bucket->occupied) {
1426                nhge = nh_res_dereference(bucket->nh_entry);
1427                nhge->res.count_buckets--;
1428                bucket->occupied = false;
1429        }
1430}
1431
1432static void nh_res_bucket_set_nh(struct nh_res_bucket *bucket,
1433                                 struct nh_grp_entry *nhge)
1434{
1435        nh_res_bucket_unset_nh(bucket);
1436
1437        bucket->occupied = true;
1438        rcu_assign_pointer(bucket->nh_entry, nhge);
1439        nhge->res.count_buckets++;
1440}
1441
1442static bool nh_res_bucket_should_migrate(struct nh_res_table *res_table,
1443                                         struct nh_res_bucket *bucket,
1444                                         unsigned long *deadline, bool *force)
1445{
1446        unsigned long now = jiffies;
1447        struct nh_grp_entry *nhge;
1448        unsigned long idle_point;
1449
1450        if (!bucket->occupied) {
1451                /* The bucket is not occupied, its NHGE pointer is either
1452                 * NULL or obsolete. We _have to_ migrate: set force.
1453                 */
1454                *force = true;
1455                return true;
1456        }
1457
1458        nhge = nh_res_dereference(bucket->nh_entry);
1459
1460        /* If the bucket is populated by an underweight or balanced
1461         * nexthop, do not migrate.
1462         */
1463        if (!nh_res_nhge_is_ow(nhge))
1464                return false;
1465
1466        /* At this point we know that the bucket is populated with an
1467         * overweight nexthop. It needs to be migrated to a new nexthop if
1468         * the idle timer of unbalanced timer expired.
1469         */
1470
1471        idle_point = nh_res_bucket_idle_point(res_table, bucket, now);
1472        if (time_after_eq(now, idle_point)) {
1473                /* The bucket is idle. We _can_ migrate: unset force. */
1474                *force = false;
1475                return true;
1476        }
1477
1478        /* Unbalanced timer of 0 means "never force". */
1479        if (res_table->unbalanced_timer) {
1480                unsigned long unb_point;
1481
1482                unb_point = nh_res_table_unb_point(res_table);
1483                if (time_after(now, unb_point)) {
1484                        /* The bucket is not idle, but the unbalanced timer
1485                         * expired. We _can_ migrate, but set force anyway,
1486                         * so that drivers know to ignore activity reports
1487                         * from the HW.
1488                         */
1489                        *force = true;
1490                        return true;
1491                }
1492
1493                nh_res_time_set_deadline(unb_point, deadline);
1494        }
1495
1496        nh_res_time_set_deadline(idle_point, deadline);
1497        return false;
1498}
1499
1500static bool nh_res_bucket_migrate(struct nh_res_table *res_table,
1501                                  u16 bucket_index, bool notify,
1502                                  bool notify_nl, bool force)
1503{
1504        struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index];
1505        struct nh_grp_entry *new_nhge;
1506        struct netlink_ext_ack extack;
1507        int err;
1508
1509        new_nhge = list_first_entry_or_null(&res_table->uw_nh_entries,
1510                                            struct nh_grp_entry,
1511                                            res.uw_nh_entry);
1512        if (WARN_ON_ONCE(!new_nhge))
1513                /* If this function is called, "bucket" is either not
1514                 * occupied, or it belongs to a next hop that is
1515                 * overweight. In either case, there ought to be a
1516                 * corresponding underweight next hop.
1517                 */
1518                return false;
1519
1520        if (notify) {
1521                struct nh_grp_entry *old_nhge;
1522
1523                old_nhge = nh_res_dereference(bucket->nh_entry);
1524                err = call_nexthop_res_bucket_notifiers(res_table->net,
1525                                                        res_table->nhg_id,
1526                                                        bucket_index, force,
1527                                                        old_nhge->nh,
1528                                                        new_nhge->nh, &extack);
1529                if (err) {
1530                        pr_err_ratelimited("%s\n", extack._msg);
1531                        if (!force)
1532                                return false;
1533                        /* It is not possible to veto a forced replacement, so
1534                         * just clear the hardware flags from the nexthop
1535                         * bucket to indicate to user space that this bucket is
1536                         * not correctly populated in hardware.
1537                         */
1538                        bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP);
1539                }
1540        }
1541
1542        nh_res_bucket_set_nh(bucket, new_nhge);
1543        nh_res_bucket_set_idle(res_table, bucket);
1544
1545        if (notify_nl)
1546                nexthop_bucket_notify(res_table, bucket_index);
1547
1548        if (nh_res_nhge_is_balanced(new_nhge))
1549                list_del(&new_nhge->res.uw_nh_entry);
1550        return true;
1551}
1552
1553#define NH_RES_UPKEEP_DW_MINIMUM_INTERVAL (HZ / 2)
1554
1555static void nh_res_table_upkeep(struct nh_res_table *res_table,
1556                                bool notify, bool notify_nl)
1557{
1558        unsigned long now = jiffies;
1559        unsigned long deadline;
1560        u16 i;
1561
1562        /* Deadline is the next time that upkeep should be run. It is the
1563         * earliest time at which one of the buckets might be migrated.
1564         * Start at the most pessimistic estimate: either unbalanced_timer
1565         * from now, or if there is none, idle_timer from now. For each
1566         * encountered time point, call nh_res_time_set_deadline() to
1567         * refine the estimate.
1568         */
1569        if (res_table->unbalanced_timer)
1570                deadline = now + res_table->unbalanced_timer;
1571        else
1572                deadline = now + res_table->idle_timer;
1573
1574        for (i = 0; i < res_table->num_nh_buckets; i++) {
1575                struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
1576                bool force;
1577
1578                if (nh_res_bucket_should_migrate(res_table, bucket,
1579                                                 &deadline, &force)) {
1580                        if (!nh_res_bucket_migrate(res_table, i, notify,
1581                                                   notify_nl, force)) {
1582                                unsigned long idle_point;
1583
1584                                /* A driver can override the migration
1585                                 * decision if the HW reports that the
1586                                 * bucket is actually not idle. Therefore
1587                                 * remark the bucket as busy again and
1588                                 * update the deadline.
1589                                 */
1590                                nh_res_bucket_set_busy(bucket);
1591                                idle_point = nh_res_bucket_idle_point(res_table,
1592                                                                      bucket,
1593                                                                      now);
1594                                nh_res_time_set_deadline(idle_point, &deadline);
1595                        }
1596                }
1597        }
1598
1599        /* If the group is still unbalanced, schedule the next upkeep to
1600         * either the deadline computed above, or the minimum deadline,
1601         * whichever comes later.
1602         */
1603        if (!nh_res_table_is_balanced(res_table)) {
1604                unsigned long now = jiffies;
1605                unsigned long min_deadline;
1606
1607                min_deadline = now + NH_RES_UPKEEP_DW_MINIMUM_INTERVAL;
1608                if (time_before(deadline, min_deadline))
1609                        deadline = min_deadline;
1610
1611                queue_delayed_work(system_power_efficient_wq,
1612                                   &res_table->upkeep_dw, deadline - now);
1613        }
1614}
1615
1616static void nh_res_table_upkeep_dw(struct work_struct *work)
1617{
1618        struct delayed_work *dw = to_delayed_work(work);
1619        struct nh_res_table *res_table;
1620
1621        res_table = container_of(dw, struct nh_res_table, upkeep_dw);
1622        nh_res_table_upkeep(res_table, true, true);
1623}
1624
1625static void nh_res_table_cancel_upkeep(struct nh_res_table *res_table)
1626{
1627        cancel_delayed_work_sync(&res_table->upkeep_dw);
1628}
1629
1630static void nh_res_group_rebalance(struct nh_group *nhg,
1631                                   struct nh_res_table *res_table)
1632{
1633        int prev_upper_bound = 0;
1634        int total = 0;
1635        int w = 0;
1636        int i;
1637
1638        INIT_LIST_HEAD(&res_table->uw_nh_entries);
1639
1640        for (i = 0; i < nhg->num_nh; ++i)
1641                total += nhg->nh_entries[i].weight;
1642
1643        for (i = 0; i < nhg->num_nh; ++i) {
1644                struct nh_grp_entry *nhge = &nhg->nh_entries[i];
1645                int upper_bound;
1646
1647                w += nhge->weight;
1648                upper_bound = DIV_ROUND_CLOSEST(res_table->num_nh_buckets * w,
1649                                                total);
1650                nhge->res.wants_buckets = upper_bound - prev_upper_bound;
1651                prev_upper_bound = upper_bound;
1652
1653                if (nh_res_nhge_is_uw(nhge)) {
1654                        if (list_empty(&res_table->uw_nh_entries))
1655                                res_table->unbalanced_since = jiffies;
1656                        list_add(&nhge->res.uw_nh_entry,
1657                                 &res_table->uw_nh_entries);
1658                }
1659        }
1660}
1661
1662/* Migrate buckets in res_table so that they reference NHGE's from NHG with
1663 * the right NH ID. Set those buckets that do not have a corresponding NHGE
1664 * entry in NHG as not occupied.
1665 */
1666static void nh_res_table_migrate_buckets(struct nh_res_table *res_table,
1667                                         struct nh_group *nhg)
1668{
1669        u16 i;
1670
1671        for (i = 0; i < res_table->num_nh_buckets; i++) {
1672                struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
1673                u32 id = rtnl_dereference(bucket->nh_entry)->nh->id;
1674                bool found = false;
1675                int j;
1676
1677                for (j = 0; j < nhg->num_nh; j++) {
1678                        struct nh_grp_entry *nhge = &nhg->nh_entries[j];
1679
1680                        if (nhge->nh->id == id) {
1681                                nh_res_bucket_set_nh(bucket, nhge);
1682                                found = true;
1683                                break;
1684                        }
1685                }
1686
1687                if (!found)
1688                        nh_res_bucket_unset_nh(bucket);
1689        }
1690}
1691
1692static void replace_nexthop_grp_res(struct nh_group *oldg,
1693                                    struct nh_group *newg)
1694{
1695        /* For NH group replacement, the new NHG might only have a stub
1696         * hash table with 0 buckets, because the number of buckets was not
1697         * specified. For NH removal, oldg and newg both reference the same
1698         * res_table. So in any case, in the following, we want to work
1699         * with oldg->res_table.
1700         */
1701        struct nh_res_table *old_res_table = rtnl_dereference(oldg->res_table);
1702        unsigned long prev_unbalanced_since = old_res_table->unbalanced_since;
1703        bool prev_has_uw = !list_empty(&old_res_table->uw_nh_entries);
1704
1705        nh_res_table_cancel_upkeep(old_res_table);
1706        nh_res_table_migrate_buckets(old_res_table, newg);
1707        nh_res_group_rebalance(newg, old_res_table);
1708        if (prev_has_uw && !list_empty(&old_res_table->uw_nh_entries))
1709                old_res_table->unbalanced_since = prev_unbalanced_since;
1710        nh_res_table_upkeep(old_res_table, true, false);
1711}
1712
1713static void nh_hthr_group_rebalance(struct nh_group *nhg)
1714{
1715        int total = 0;
1716        int w = 0;
1717        int i;
1718
1719        for (i = 0; i < nhg->num_nh; ++i)
1720                total += nhg->nh_entries[i].weight;
1721
1722        for (i = 0; i < nhg->num_nh; ++i) {
1723                struct nh_grp_entry *nhge = &nhg->nh_entries[i];
1724                int upper_bound;
1725
1726                w += nhge->weight;
1727                upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1;
1728                atomic_set(&nhge->hthr.upper_bound, upper_bound);
1729        }
1730}
1731
1732static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge,
1733                                struct nl_info *nlinfo)
1734{
1735        struct nh_grp_entry *nhges, *new_nhges;
1736        struct nexthop *nhp = nhge->nh_parent;
1737        struct netlink_ext_ack extack;
1738        struct nexthop *nh = nhge->nh;
1739        struct nh_group *nhg, *newg;
1740        int i, j, err;
1741
1742        WARN_ON(!nh);
1743
1744        nhg = rtnl_dereference(nhp->nh_grp);
1745        newg = nhg->spare;
1746
1747        /* last entry, keep it visible and remove the parent */
1748        if (nhg->num_nh == 1) {
1749                remove_nexthop(net, nhp, nlinfo);
1750                return;
1751        }
1752
1753        newg->has_v4 = false;
1754        newg->is_multipath = nhg->is_multipath;
1755        newg->hash_threshold = nhg->hash_threshold;
1756        newg->resilient = nhg->resilient;
1757        newg->fdb_nh = nhg->fdb_nh;
1758        newg->num_nh = nhg->num_nh;
1759
1760        /* copy old entries to new except the one getting removed */
1761        nhges = nhg->nh_entries;
1762        new_nhges = newg->nh_entries;
1763        for (i = 0, j = 0; i < nhg->num_nh; ++i) {
1764                struct nh_info *nhi;
1765
1766                /* current nexthop getting removed */
1767                if (nhg->nh_entries[i].nh == nh) {
1768                        newg->num_nh--;
1769                        continue;
1770                }
1771
1772                nhi = rtnl_dereference(nhges[i].nh->nh_info);
1773                if (nhi->family == AF_INET)
1774                        newg->has_v4 = true;
1775
1776                list_del(&nhges[i].nh_list);
1777                new_nhges[j].nh_parent = nhges[i].nh_parent;
1778                new_nhges[j].nh = nhges[i].nh;
1779                new_nhges[j].weight = nhges[i].weight;
1780                list_add(&new_nhges[j].nh_list, &new_nhges[j].nh->grp_list);
1781                j++;
1782        }
1783
1784        if (newg->hash_threshold)
1785                nh_hthr_group_rebalance(newg);
1786        else if (newg->resilient)
1787                replace_nexthop_grp_res(nhg, newg);
1788
1789        rcu_assign_pointer(nhp->nh_grp, newg);
1790
1791        list_del(&nhge->nh_list);
1792        nexthop_put(nhge->nh);
1793
1794        /* Removal of a NH from a resilient group is notified through
1795         * bucket notifications.
1796         */
1797        if (newg->hash_threshold) {
1798                err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp,
1799                                             &extack);
1800                if (err)
1801                        pr_err("%s\n", extack._msg);
1802        }
1803
1804        if (nlinfo)
1805                nexthop_notify(RTM_NEWNEXTHOP, nhp, nlinfo);
1806}
1807
1808static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh,
1809                                       struct nl_info *nlinfo)
1810{
1811        struct nh_grp_entry *nhge, *tmp;
1812
1813        list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list)
1814                remove_nh_grp_entry(net, nhge, nlinfo);
1815
1816        /* make sure all see the newly published array before releasing rtnl */
1817        synchronize_net();
1818}
1819
1820static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo)
1821{
1822        struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp);
1823        struct nh_res_table *res_table;
1824        int i, num_nh = nhg->num_nh;
1825
1826        for (i = 0; i < num_nh; ++i) {
1827                struct nh_grp_entry *nhge = &nhg->nh_entries[i];
1828
1829                if (WARN_ON(!nhge->nh))
1830                        continue;
1831
1832                list_del_init(&nhge->nh_list);
1833        }
1834
1835        if (nhg->resilient) {
1836                res_table = rtnl_dereference(nhg->res_table);
1837                nh_res_table_cancel_upkeep(res_table);
1838        }
1839}
1840
1841/* not called for nexthop replace */
1842static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
1843{
1844        struct fib6_info *f6i, *tmp;
1845        bool do_flush = false;
1846        struct fib_info *fi;
1847
1848        list_for_each_entry(fi, &nh->fi_list, nh_list) {
1849                fi->fib_flags |= RTNH_F_DEAD;
1850                do_flush = true;
1851        }
1852        if (do_flush)
1853                fib_flush(net);
1854
1855        /* ip6_del_rt removes the entry from this list hence the _safe */
1856        list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) {
1857                /* __ip6_del_rt does a release, so do a hold here */
1858                fib6_info_hold(f6i);
1859                ipv6_stub->ip6_del_rt(net, f6i,
1860                                      !net->ipv4.sysctl_nexthop_compat_mode);
1861        }
1862}
1863
1864static void __remove_nexthop(struct net *net, struct nexthop *nh,
1865                             struct nl_info *nlinfo)
1866{
1867        __remove_nexthop_fib(net, nh);
1868
1869        if (nh->is_group) {
1870                remove_nexthop_group(nh, nlinfo);
1871        } else {
1872                struct nh_info *nhi;
1873
1874                nhi = rtnl_dereference(nh->nh_info);
1875                if (nhi->fib_nhc.nhc_dev)
1876                        hlist_del(&nhi->dev_hash);
1877
1878                remove_nexthop_from_groups(net, nh, nlinfo);
1879        }
1880}
1881
1882static void remove_nexthop(struct net *net, struct nexthop *nh,
1883                           struct nl_info *nlinfo)
1884{
1885        call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh, NULL);
1886
1887        /* remove from the tree */
1888        rb_erase(&nh->rb_node, &net->nexthop.rb_root);
1889
1890        if (nlinfo)
1891                nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo);
1892
1893        __remove_nexthop(net, nh, nlinfo);
1894        nh_base_seq_inc(net);
1895
1896        nexthop_put(nh);
1897}
1898
1899/* if any FIB entries reference this nexthop, any dst entries
1900 * need to be regenerated
1901 */
1902static void nh_rt_cache_flush(struct net *net, struct nexthop *nh)
1903{
1904        struct fib6_info *f6i;
1905
1906        if (!list_empty(&nh->fi_list))
1907                rt_cache_flush(net);
1908
1909        list_for_each_entry(f6i, &nh->f6i_list, nh_list)
1910                ipv6_stub->fib6_update_sernum(net, f6i);
1911}
1912
1913static int replace_nexthop_grp(struct net *net, struct nexthop *old,
1914                               struct nexthop *new, const struct nh_config *cfg,
1915                               struct netlink_ext_ack *extack)
1916{
1917        struct nh_res_table *tmp_table = NULL;
1918        struct nh_res_table *new_res_table;
1919        struct nh_res_table *old_res_table;
1920        struct nh_group *oldg, *newg;
1921        int i, err;
1922
1923        if (!new->is_group) {
1924                NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with a nexthop.");
1925                return -EINVAL;
1926        }
1927
1928        oldg = rtnl_dereference(old->nh_grp);
1929        newg = rtnl_dereference(new->nh_grp);
1930
1931        if (newg->hash_threshold != oldg->hash_threshold) {
1932                NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with one of a different type.");
1933                return -EINVAL;
1934        }
1935
1936        if (newg->hash_threshold) {
1937                err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new,
1938                                             extack);
1939                if (err)
1940                        return err;
1941        } else if (newg->resilient) {
1942                new_res_table = rtnl_dereference(newg->res_table);
1943                old_res_table = rtnl_dereference(oldg->res_table);
1944
1945                /* Accept if num_nh_buckets was not given, but if it was
1946                 * given, demand that the value be correct.
1947                 */
1948                if (cfg->nh_grp_res_has_num_buckets &&
1949                    cfg->nh_grp_res_num_buckets !=
1950                    old_res_table->num_nh_buckets) {
1951                        NL_SET_ERR_MSG(extack, "Can not change number of buckets of a resilient nexthop group.");
1952                        return -EINVAL;
1953                }
1954
1955                /* Emit a pre-replace notification so that listeners could veto
1956                 * a potentially unsupported configuration. Otherwise,
1957                 * individual bucket replacement notifications would need to be
1958                 * vetoed, which is something that should only happen if the
1959                 * bucket is currently active.
1960                 */
1961                err = call_nexthop_res_table_notifiers(net, new, extack);
1962                if (err)
1963                        return err;
1964
1965                if (cfg->nh_grp_res_has_idle_timer)
1966                        old_res_table->idle_timer = cfg->nh_grp_res_idle_timer;
1967                if (cfg->nh_grp_res_has_unbalanced_timer)
1968                        old_res_table->unbalanced_timer =
1969                                cfg->nh_grp_res_unbalanced_timer;
1970
1971                replace_nexthop_grp_res(oldg, newg);
1972
1973                tmp_table = new_res_table;
1974                rcu_assign_pointer(newg->res_table, old_res_table);
1975                rcu_assign_pointer(newg->spare->res_table, old_res_table);
1976        }
1977
1978        /* update parents - used by nexthop code for cleanup */
1979        for (i = 0; i < newg->num_nh; i++)
1980                newg->nh_entries[i].nh_parent = old;
1981
1982        rcu_assign_pointer(old->nh_grp, newg);
1983
1984        if (newg->resilient) {
1985                rcu_assign_pointer(oldg->res_table, tmp_table);
1986                rcu_assign_pointer(oldg->spare->res_table, tmp_table);
1987        }
1988
1989        for (i = 0; i < oldg->num_nh; i++)
1990                oldg->nh_entries[i].nh_parent = new;
1991
1992        rcu_assign_pointer(new->nh_grp, oldg);
1993
1994        return 0;
1995}
1996
1997static void nh_group_v4_update(struct nh_group *nhg)
1998{
1999        struct nh_grp_entry *nhges;
2000        bool has_v4 = false;
2001        int i;
2002
2003        nhges = nhg->nh_entries;
2004        for (i = 0; i < nhg->num_nh; i++) {
2005                struct nh_info *nhi;
2006
2007                nhi = rtnl_dereference(nhges[i].nh->nh_info);
2008                if (nhi->family == AF_INET)
2009                        has_v4 = true;
2010        }
2011        nhg->has_v4 = has_v4;
2012}
2013
2014static int replace_nexthop_single_notify_res(struct net *net,
2015                                             struct nh_res_table *res_table,
2016                                             struct nexthop *old,
2017                                             struct nh_info *oldi,
2018                                             struct nh_info *newi,
2019                                             struct netlink_ext_ack *extack)
2020{
2021        u32 nhg_id = res_table->nhg_id;
2022        int err;
2023        u16 i;
2024
2025        for (i = 0; i < res_table->num_nh_buckets; i++) {
2026                struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
2027                struct nh_grp_entry *nhge;
2028
2029                nhge = rtnl_dereference(bucket->nh_entry);
2030                if (nhge->nh == old) {
2031                        err = __call_nexthop_res_bucket_notifiers(net, nhg_id,
2032                                                                  i, true,
2033                                                                  oldi, newi,
2034                                                                  extack);
2035                        if (err)
2036                                goto err_notify;
2037                }
2038        }
2039
2040        return 0;
2041
2042err_notify:
2043        while (i-- > 0) {
2044                struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
2045                struct nh_grp_entry *nhge;
2046
2047                nhge = rtnl_dereference(bucket->nh_entry);
2048                if (nhge->nh == old)
2049                        __call_nexthop_res_bucket_notifiers(net, nhg_id, i,
2050                                                            true, newi, oldi,
2051                                                            extack);
2052        }
2053        return err;
2054}
2055
2056static int replace_nexthop_single_notify(struct net *net,
2057                                         struct nexthop *group_nh,
2058                                         struct nexthop *old,
2059                                         struct nh_info *oldi,
2060                                         struct nh_info *newi,
2061                                         struct netlink_ext_ack *extack)
2062{
2063        struct nh_group *nhg = rtnl_dereference(group_nh->nh_grp);
2064        struct nh_res_table *res_table;
2065
2066        if (nhg->hash_threshold) {
2067                return call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE,
2068                                              group_nh, extack);
2069        } else if (nhg->resilient) {
2070                res_table = rtnl_dereference(nhg->res_table);
2071                return replace_nexthop_single_notify_res(net, res_table,
2072                                                         old, oldi, newi,
2073                                                         extack);
2074        }
2075
2076        return -EINVAL;
2077}
2078
2079static int replace_nexthop_single(struct net *net, struct nexthop *old,
2080                                  struct nexthop *new,
2081                                  struct netlink_ext_ack *extack)
2082{
2083        u8 old_protocol, old_nh_flags;
2084        struct nh_info *oldi, *newi;
2085        struct nh_grp_entry *nhge;
2086        int err;
2087
2088        if (new->is_group) {
2089                NL_SET_ERR_MSG(extack, "Can not replace a nexthop with a nexthop group.");
2090                return -EINVAL;
2091        }
2092
2093        err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack);
2094        if (err)
2095                return err;
2096
2097        /* Hardware flags were set on 'old' as 'new' is not in the red-black
2098         * tree. Therefore, inherit the flags from 'old' to 'new'.
2099         */
2100        new->nh_flags |= old->nh_flags & (RTNH_F_OFFLOAD | RTNH_F_TRAP);
2101
2102        oldi = rtnl_dereference(old->nh_info);
2103        newi = rtnl_dereference(new->nh_info);
2104
2105        newi->nh_parent = old;
2106        oldi->nh_parent = new;
2107
2108        old_protocol = old->protocol;
2109        old_nh_flags = old->nh_flags;
2110
2111        old->protocol = new->protocol;
2112        old->nh_flags = new->nh_flags;
2113
2114        rcu_assign_pointer(old->nh_info, newi);
2115        rcu_assign_pointer(new->nh_info, oldi);
2116
2117        /* Send a replace notification for all the groups using the nexthop. */
2118        list_for_each_entry(nhge, &old->grp_list, nh_list) {
2119                struct nexthop *nhp = nhge->nh_parent;
2120
2121                err = replace_nexthop_single_notify(net, nhp, old, oldi, newi,
2122                                                    extack);
2123                if (err)
2124                        goto err_notify;
2125        }
2126
2127        /* When replacing an IPv4 nexthop with an IPv6 nexthop, potentially
2128         * update IPv4 indication in all the groups using the nexthop.
2129         */
2130        if (oldi->family == AF_INET && newi->family == AF_INET6) {
2131                list_for_each_entry(nhge, &old->grp_list, nh_list) {
2132                        struct nexthop *nhp = nhge->nh_parent;
2133                        struct nh_group *nhg;
2134
2135                        nhg = rtnl_dereference(nhp->nh_grp);
2136                        nh_group_v4_update(nhg);
2137                }
2138        }
2139
2140        return 0;
2141
2142err_notify:
2143        rcu_assign_pointer(new->nh_info, newi);
2144        rcu_assign_pointer(old->nh_info, oldi);
2145        old->nh_flags = old_nh_flags;
2146        old->protocol = old_protocol;
2147        oldi->nh_parent = old;
2148        newi->nh_parent = new;
2149        list_for_each_entry_continue_reverse(nhge, &old->grp_list, nh_list) {
2150                struct nexthop *nhp = nhge->nh_parent;
2151
2152                replace_nexthop_single_notify(net, nhp, old, newi, oldi, NULL);
2153        }
2154        call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, old, extack);
2155        return err;
2156}
2157
2158static void __nexthop_replace_notify(struct net *net, struct nexthop *nh,
2159                                     struct nl_info *info)
2160{
2161        struct fib6_info *f6i;
2162
2163        if (!list_empty(&nh->fi_list)) {
2164                struct fib_info *fi;
2165
2166                /* expectation is a few fib_info per nexthop and then
2167                 * a lot of routes per fib_info. So mark the fib_info
2168                 * and then walk the fib tables once
2169                 */
2170                list_for_each_entry(fi, &nh->fi_list, nh_list)
2171                        fi->nh_updated = true;
2172
2173                fib_info_notify_update(net, info);
2174
2175                list_for_each_entry(fi, &nh->fi_list, nh_list)
2176                        fi->nh_updated = false;
2177        }
2178
2179        list_for_each_entry(f6i, &nh->f6i_list, nh_list)
2180                ipv6_stub->fib6_rt_update(net, f6i, info);
2181}
2182
2183/* send RTM_NEWROUTE with REPLACE flag set for all FIB entries
2184 * linked to this nexthop and for all groups that the nexthop
2185 * is a member of
2186 */
2187static void nexthop_replace_notify(struct net *net, struct nexthop *nh,
2188                                   struct nl_info *info)
2189{
2190        struct nh_grp_entry *nhge;
2191
2192        __nexthop_replace_notify(net, nh, info);
2193
2194        list_for_each_entry(nhge, &nh->grp_list, nh_list)
2195                __nexthop_replace_notify(net, nhge->nh_parent, info);
2196}
2197
2198static int replace_nexthop(struct net *net, struct nexthop *old,
2199                           struct nexthop *new, const struct nh_config *cfg,
2200                           struct netlink_ext_ack *extack)
2201{
2202        bool new_is_reject = false;
2203        struct nh_grp_entry *nhge;
2204        int err;
2205
2206        /* check that existing FIB entries are ok with the
2207         * new nexthop definition
2208         */
2209        err = fib_check_nh_list(old, new, extack);
2210        if (err)
2211                return err;
2212
2213        err = fib6_check_nh_list(old, new, extack);
2214        if (err)
2215                return err;
2216
2217        if (!new->is_group) {
2218                struct nh_info *nhi = rtnl_dereference(new->nh_info);
2219
2220                new_is_reject = nhi->reject_nh;
2221        }
2222
2223        list_for_each_entry(nhge, &old->grp_list, nh_list) {
2224                /* if new nexthop is a blackhole, any groups using this
2225                 * nexthop cannot have more than 1 path
2226                 */
2227                if (new_is_reject &&
2228                    nexthop_num_path(nhge->nh_parent) > 1) {
2229                        NL_SET_ERR_MSG(extack, "Blackhole nexthop can not be a member of a group with more than one path");
2230                        return -EINVAL;
2231                }
2232
2233                err = fib_check_nh_list(nhge->nh_parent, new, extack);
2234                if (err)
2235                        return err;
2236
2237                err = fib6_check_nh_list(nhge->nh_parent, new, extack);
2238                if (err)
2239                        return err;
2240        }
2241
2242        if (old->is_group)
2243                err = replace_nexthop_grp(net, old, new, cfg, extack);
2244        else
2245                err = replace_nexthop_single(net, old, new, extack);
2246
2247        if (!err) {
2248                nh_rt_cache_flush(net, old);
2249
2250                __remove_nexthop(net, new, NULL);
2251                nexthop_put(new);
2252        }
2253
2254        return err;
2255}
2256
2257/* called with rtnl_lock held */
2258static int insert_nexthop(struct net *net, struct nexthop *new_nh,
2259                          struct nh_config *cfg, struct netlink_ext_ack *extack)
2260{
2261        struct rb_node **pp, *parent = NULL, *next;
2262        struct rb_root *root = &net->nexthop.rb_root;
2263        bool replace = !!(cfg->nlflags & NLM_F_REPLACE);
2264        bool create = !!(cfg->nlflags & NLM_F_CREATE);
2265        u32 new_id = new_nh->id;
2266        int replace_notify = 0;
2267        int rc = -EEXIST;
2268
2269        pp = &root->rb_node;
2270        while (1) {
2271                struct nexthop *nh;
2272
2273                next = *pp;
2274                if (!next)
2275                        break;
2276
2277                parent = next;
2278
2279                nh = rb_entry(parent, struct nexthop, rb_node);
2280                if (new_id < nh->id) {
2281                        pp = &next->rb_left;
2282                } else if (new_id > nh->id) {
2283                        pp = &next->rb_right;
2284                } else if (replace) {
2285                        rc = replace_nexthop(net, nh, new_nh, cfg, extack);
2286                        if (!rc) {
2287                                new_nh = nh; /* send notification with old nh */
2288                                replace_notify = 1;
2289                        }
2290                        goto out;
2291                } else {
2292                        /* id already exists and not a replace */
2293                        goto out;
2294                }
2295        }
2296
2297        if (replace && !create) {
2298                NL_SET_ERR_MSG(extack, "Replace specified without create and no entry exists");
2299                rc = -ENOENT;
2300                goto out;
2301        }
2302
2303        if (new_nh->is_group) {
2304                struct nh_group *nhg = rtnl_dereference(new_nh->nh_grp);
2305                struct nh_res_table *res_table;
2306
2307                if (nhg->resilient) {
2308                        res_table = rtnl_dereference(nhg->res_table);
2309
2310                        /* Not passing the number of buckets is OK when
2311                         * replacing, but not when creating a new group.
2312                         */
2313                        if (!cfg->nh_grp_res_has_num_buckets) {
2314                                NL_SET_ERR_MSG(extack, "Number of buckets not specified for nexthop group insertion");
2315                                rc = -EINVAL;
2316                                goto out;
2317                        }
2318
2319                        nh_res_group_rebalance(nhg, res_table);
2320
2321                        /* Do not send bucket notifications, we do full
2322                         * notification below.
2323                         */
2324                        nh_res_table_upkeep(res_table, false, false);
2325                }
2326        }
2327
2328        rb_link_node_rcu(&new_nh->rb_node, parent, pp);
2329        rb_insert_color(&new_nh->rb_node, root);
2330
2331        /* The initial insertion is a full notification for hash-threshold as
2332         * well as resilient groups.
2333         */
2334        rc = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new_nh, extack);
2335        if (rc)
2336                rb_erase(&new_nh->rb_node, &net->nexthop.rb_root);
2337
2338out:
2339        if (!rc) {
2340                nh_base_seq_inc(net);
2341                nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo);
2342                if (replace_notify && net->ipv4.sysctl_nexthop_compat_mode)
2343                        nexthop_replace_notify(net, new_nh, &cfg->nlinfo);
2344        }
2345
2346        return rc;
2347}
2348
2349/* rtnl */
2350/* remove all nexthops tied to a device being deleted */
2351static void nexthop_flush_dev(struct net_device *dev, unsigned long event)
2352{
2353        unsigned int hash = nh_dev_hashfn(dev->ifindex);
2354        struct net *net = dev_net(dev);
2355        struct hlist_head *head = &net->nexthop.devhash[hash];
2356        struct hlist_node *n;
2357        struct nh_info *nhi;
2358
2359        hlist_for_each_entry_safe(nhi, n, head, dev_hash) {
2360                if (nhi->fib_nhc.nhc_dev != dev)
2361                        continue;
2362
2363                if (nhi->reject_nh &&
2364                    (event == NETDEV_DOWN || event == NETDEV_CHANGE))
2365                        continue;
2366
2367                remove_nexthop(net, nhi->nh_parent, NULL);
2368        }
2369}
2370
2371/* rtnl; called when net namespace is deleted */
2372static void flush_all_nexthops(struct net *net)
2373{
2374        struct rb_root *root = &net->nexthop.rb_root;
2375        struct rb_node *node;
2376        struct nexthop *nh;
2377
2378        while ((node = rb_first(root))) {
2379                nh = rb_entry(node, struct nexthop, rb_node);
2380                remove_nexthop(net, nh, NULL);
2381                cond_resched();
2382        }
2383}
2384
2385static struct nexthop *nexthop_create_group(struct net *net,
2386                                            struct nh_config *cfg)
2387{
2388        struct nlattr *grps_attr = cfg->nh_grp;
2389        struct nexthop_grp *entry = nla_data(grps_attr);
2390        u16 num_nh = nla_len(grps_attr) / sizeof(*entry);
2391        struct nh_group *nhg;
2392        struct nexthop *nh;
2393        int err;
2394        int i;
2395
2396        if (WARN_ON(!num_nh))
2397                return ERR_PTR(-EINVAL);
2398
2399        nh = nexthop_alloc();
2400        if (!nh)
2401                return ERR_PTR(-ENOMEM);
2402
2403        nh->is_group = 1;
2404
2405        nhg = nexthop_grp_alloc(num_nh);
2406        if (!nhg) {
2407                kfree(nh);
2408                return ERR_PTR(-ENOMEM);
2409        }
2410
2411        /* spare group used for removals */
2412        nhg->spare = nexthop_grp_alloc(num_nh);
2413        if (!nhg->spare) {
2414                kfree(nhg);
2415                kfree(nh);
2416                return ERR_PTR(-ENOMEM);
2417        }
2418        nhg->spare->spare = nhg;
2419
2420        for (i = 0; i < nhg->num_nh; ++i) {
2421                struct nexthop *nhe;
2422                struct nh_info *nhi;
2423
2424                nhe = nexthop_find_by_id(net, entry[i].id);
2425                if (!nexthop_get(nhe)) {
2426                        err = -ENOENT;
2427                        goto out_no_nh;
2428                }
2429
2430                nhi = rtnl_dereference(nhe->nh_info);
2431                if (nhi->family == AF_INET)
2432                        nhg->has_v4 = true;
2433
2434                nhg->nh_entries[i].nh = nhe;
2435                nhg->nh_entries[i].weight = entry[i].weight + 1;
2436                list_add(&nhg->nh_entries[i].nh_list, &nhe->grp_list);
2437                nhg->nh_entries[i].nh_parent = nh;
2438        }
2439
2440        if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) {
2441                nhg->hash_threshold = 1;
2442                nhg->is_multipath = true;
2443        } else if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES) {
2444                struct nh_res_table *res_table;
2445
2446                res_table = nexthop_res_table_alloc(net, cfg->nh_id, cfg);
2447                if (!res_table) {
2448                        err = -ENOMEM;
2449                        goto out_no_nh;
2450                }
2451
2452                rcu_assign_pointer(nhg->spare->res_table, res_table);
2453                rcu_assign_pointer(nhg->res_table, res_table);
2454                nhg->resilient = true;
2455                nhg->is_multipath = true;
2456        }
2457
2458        WARN_ON_ONCE(nhg->hash_threshold + nhg->resilient != 1);
2459
2460        if (nhg->hash_threshold)
2461                nh_hthr_group_rebalance(nhg);
2462
2463        if (cfg->nh_fdb)
2464                nhg->fdb_nh = 1;
2465
2466        rcu_assign_pointer(nh->nh_grp, nhg);
2467
2468        return nh;
2469
2470out_no_nh:
2471        for (i--; i >= 0; --i) {
2472                list_del(&nhg->nh_entries[i].nh_list);
2473                nexthop_put(nhg->nh_entries[i].nh);
2474        }
2475
2476        kfree(nhg->spare);
2477        kfree(nhg);
2478        kfree(nh);
2479
2480        return ERR_PTR(err);
2481}
2482
2483static int nh_create_ipv4(struct net *net, struct nexthop *nh,
2484                          struct nh_info *nhi, struct nh_config *cfg,
2485                          struct netlink_ext_ack *extack)
2486{
2487        struct fib_nh *fib_nh = &nhi->fib_nh;
2488        struct fib_config fib_cfg = {
2489                .fc_oif   = cfg->nh_ifindex,
2490                .fc_gw4   = cfg->gw.ipv4,
2491                .fc_gw_family = cfg->gw.ipv4 ? AF_INET : 0,
2492                .fc_flags = cfg->nh_flags,
2493                .fc_encap = cfg->nh_encap,
2494                .fc_encap_type = cfg->nh_encap_type,
2495        };
2496        u32 tb_id = (cfg->dev ? l3mdev_fib_table(cfg->dev) : RT_TABLE_MAIN);
2497        int err;
2498
2499        err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack);
2500        if (err) {
2501                fib_nh_release(net, fib_nh);
2502                goto out;
2503        }
2504
2505        if (nhi->fdb_nh)
2506                goto out;
2507
2508        /* sets nh_dev if successful */
2509        err = fib_check_nh(net, fib_nh, tb_id, 0, extack);
2510        if (!err) {
2511                nh->nh_flags = fib_nh->fib_nh_flags;
2512                fib_info_update_nhc_saddr(net, &fib_nh->nh_common,
2513                                          fib_nh->fib_nh_scope);
2514        } else {
2515                fib_nh_release(net, fib_nh);
2516        }
2517out:
2518        return err;
2519}
2520
2521static int nh_create_ipv6(struct net *net,  struct nexthop *nh,
2522                          struct nh_info *nhi, struct nh_config *cfg,
2523                          struct netlink_ext_ack *extack)
2524{
2525        struct fib6_nh *fib6_nh = &nhi->fib6_nh;
2526        struct fib6_config fib6_cfg = {
2527                .fc_table = l3mdev_fib_table(cfg->dev),
2528                .fc_ifindex = cfg->nh_ifindex,
2529                .fc_gateway = cfg->gw.ipv6,
2530                .fc_flags = cfg->nh_flags,
2531                .fc_encap = cfg->nh_encap,
2532                .fc_encap_type = cfg->nh_encap_type,
2533                .fc_is_fdb = cfg->nh_fdb,
2534        };
2535        int err;
2536
2537        if (!ipv6_addr_any(&cfg->gw.ipv6))
2538                fib6_cfg.fc_flags |= RTF_GATEWAY;
2539
2540        /* sets nh_dev if successful */
2541        err = ipv6_stub->fib6_nh_init(net, fib6_nh, &fib6_cfg, GFP_KERNEL,
2542                                      extack);
2543        if (err)
2544                ipv6_stub->fib6_nh_release(fib6_nh);
2545        else
2546                nh->nh_flags = fib6_nh->fib_nh_flags;
2547
2548        return err;
2549}
2550
2551static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg,
2552                                      struct netlink_ext_ack *extack)
2553{
2554        struct nh_info *nhi;
2555        struct nexthop *nh;
2556        int err = 0;
2557
2558        nh = nexthop_alloc();
2559        if (!nh)
2560                return ERR_PTR(-ENOMEM);
2561
2562        nhi = kzalloc(sizeof(*nhi), GFP_KERNEL);
2563        if (!nhi) {
2564                kfree(nh);
2565                return ERR_PTR(-ENOMEM);
2566        }
2567
2568        nh->nh_flags = cfg->nh_flags;
2569        nh->net = net;
2570
2571        nhi->nh_parent = nh;
2572        nhi->family = cfg->nh_family;
2573        nhi->fib_nhc.nhc_scope = RT_SCOPE_LINK;
2574
2575        if (cfg->nh_fdb)
2576                nhi->fdb_nh = 1;
2577
2578        if (cfg->nh_blackhole) {
2579                nhi->reject_nh = 1;
2580                cfg->nh_ifindex = net->loopback_dev->ifindex;
2581        }
2582
2583        switch (cfg->nh_family) {
2584        case AF_INET:
2585                err = nh_create_ipv4(net, nh, nhi, cfg, extack);
2586                break;
2587        case AF_INET6:
2588                err = nh_create_ipv6(net, nh, nhi, cfg, extack);
2589                break;
2590        }
2591
2592        if (err) {
2593                kfree(nhi);
2594                kfree(nh);
2595                return ERR_PTR(err);
2596        }
2597
2598        /* add the entry to the device based hash */
2599        if (!nhi->fdb_nh)
2600                nexthop_devhash_add(net, nhi);
2601
2602        rcu_assign_pointer(nh->nh_info, nhi);
2603
2604        return nh;
2605}
2606
2607/* called with rtnl lock held */
2608static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg,
2609                                   struct netlink_ext_ack *extack)
2610{
2611        struct nexthop *nh;
2612        int err;
2613
2614        if (cfg->nlflags & NLM_F_REPLACE && !cfg->nh_id) {
2615                NL_SET_ERR_MSG(extack, "Replace requires nexthop id");
2616                return ERR_PTR(-EINVAL);
2617        }
2618
2619        if (!cfg->nh_id) {
2620                cfg->nh_id = nh_find_unused_id(net);
2621                if (!cfg->nh_id) {
2622                        NL_SET_ERR_MSG(extack, "No unused id");
2623                        return ERR_PTR(-EINVAL);
2624                }
2625        }
2626
2627        if (cfg->nh_grp)
2628                nh = nexthop_create_group(net, cfg);
2629        else
2630                nh = nexthop_create(net, cfg, extack);
2631
2632        if (IS_ERR(nh))
2633                return nh;
2634
2635        refcount_set(&nh->refcnt, 1);
2636        nh->id = cfg->nh_id;
2637        nh->protocol = cfg->nh_protocol;
2638        nh->net = net;
2639
2640        err = insert_nexthop(net, nh, cfg, extack);
2641        if (err) {
2642                __remove_nexthop(net, nh, NULL);
2643                nexthop_put(nh);
2644                nh = ERR_PTR(err);
2645        }
2646
2647        return nh;
2648}
2649
2650static int rtm_nh_get_timer(struct nlattr *attr, unsigned long fallback,
2651                            unsigned long *timer_p, bool *has_p,
2652                            struct netlink_ext_ack *extack)
2653{
2654        unsigned long timer;
2655        u32 value;
2656
2657        if (!attr) {
2658                *timer_p = fallback;
2659                *has_p = false;
2660                return 0;
2661        }
2662
2663        value = nla_get_u32(attr);
2664        timer = clock_t_to_jiffies(value);
2665        if (timer == ~0UL) {
2666                NL_SET_ERR_MSG(extack, "Timer value too large");
2667                return -EINVAL;
2668        }
2669
2670        *timer_p = timer;
2671        *has_p = true;
2672        return 0;
2673}
2674
2675static int rtm_to_nh_config_grp_res(struct nlattr *res, struct nh_config *cfg,
2676                                    struct netlink_ext_ack *extack)
2677{
2678        struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_policy_new)] = {};
2679        int err;
2680
2681        if (res) {
2682                err = nla_parse_nested(tb,
2683                                       ARRAY_SIZE(rtm_nh_res_policy_new) - 1,
2684                                       res, rtm_nh_res_policy_new, extack);
2685                if (err < 0)
2686                        return err;
2687        }
2688
2689        if (tb[NHA_RES_GROUP_BUCKETS]) {
2690                cfg->nh_grp_res_num_buckets =
2691                        nla_get_u16(tb[NHA_RES_GROUP_BUCKETS]);
2692                cfg->nh_grp_res_has_num_buckets = true;
2693                if (!cfg->nh_grp_res_num_buckets) {
2694                        NL_SET_ERR_MSG(extack, "Number of buckets needs to be non-0");
2695                        return -EINVAL;
2696                }
2697        }
2698
2699        err = rtm_nh_get_timer(tb[NHA_RES_GROUP_IDLE_TIMER],
2700                               NH_RES_DEFAULT_IDLE_TIMER,
2701                               &cfg->nh_grp_res_idle_timer,
2702                               &cfg->nh_grp_res_has_idle_timer,
2703                               extack);
2704        if (err)
2705                return err;
2706
2707        return rtm_nh_get_timer(tb[NHA_RES_GROUP_UNBALANCED_TIMER],
2708                                NH_RES_DEFAULT_UNBALANCED_TIMER,
2709                                &cfg->nh_grp_res_unbalanced_timer,
2710                                &cfg->nh_grp_res_has_unbalanced_timer,
2711                                extack);
2712}
2713
2714static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
2715                            struct nlmsghdr *nlh, struct nh_config *cfg,
2716                            struct netlink_ext_ack *extack)
2717{
2718        struct nhmsg *nhm = nlmsg_data(nlh);
2719        struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_new)];
2720        int err;
2721
2722        err = nlmsg_parse(nlh, sizeof(*nhm), tb,
2723                          ARRAY_SIZE(rtm_nh_policy_new) - 1,
2724                          rtm_nh_policy_new, extack);
2725        if (err < 0)
2726                return err;
2727
2728        err = -EINVAL;
2729        if (nhm->resvd || nhm->nh_scope) {
2730                NL_SET_ERR_MSG(extack, "Invalid values in ancillary header");
2731                goto out;
2732        }
2733        if (nhm->nh_flags & ~NEXTHOP_VALID_USER_FLAGS) {
2734                NL_SET_ERR_MSG(extack, "Invalid nexthop flags in ancillary header");
2735                goto out;
2736        }
2737
2738        switch (nhm->nh_family) {
2739        case AF_INET:
2740        case AF_INET6:
2741                break;
2742        case AF_UNSPEC:
2743                if (tb[NHA_GROUP])
2744                        break;
2745                fallthrough;
2746        default:
2747                NL_SET_ERR_MSG(extack, "Invalid address family");
2748                goto out;
2749        }
2750
2751        memset(cfg, 0, sizeof(*cfg));
2752        cfg->nlflags = nlh->nlmsg_flags;
2753        cfg->nlinfo.portid = NETLINK_CB(skb).portid;
2754        cfg->nlinfo.nlh = nlh;
2755        cfg->nlinfo.nl_net = net;
2756
2757        cfg->nh_family = nhm->nh_family;
2758        cfg->nh_protocol = nhm->nh_protocol;
2759        cfg->nh_flags = nhm->nh_flags;
2760
2761        if (tb[NHA_ID])
2762                cfg->nh_id = nla_get_u32(tb[NHA_ID]);
2763
2764        if (tb[NHA_FDB]) {
2765                if (tb[NHA_OIF] || tb[NHA_BLACKHOLE] ||
2766                    tb[NHA_ENCAP]   || tb[NHA_ENCAP_TYPE]) {
2767                        NL_SET_ERR_MSG(extack, "Fdb attribute can not be used with encap, oif or blackhole");
2768                        goto out;
2769                }
2770                if (nhm->nh_flags) {
2771                        NL_SET_ERR_MSG(extack, "Unsupported nexthop flags in ancillary header");
2772                        goto out;
2773                }
2774                cfg->nh_fdb = nla_get_flag(tb[NHA_FDB]);
2775        }
2776
2777        if (tb[NHA_GROUP]) {
2778                if (nhm->nh_family != AF_UNSPEC) {
2779                        NL_SET_ERR_MSG(extack, "Invalid family for group");
2780                        goto out;
2781                }
2782                cfg->nh_grp = tb[NHA_GROUP];
2783
2784                cfg->nh_grp_type = NEXTHOP_GRP_TYPE_MPATH;
2785                if (tb[NHA_GROUP_TYPE])
2786                        cfg->nh_grp_type = nla_get_u16(tb[NHA_GROUP_TYPE]);
2787
2788                if (cfg->nh_grp_type > NEXTHOP_GRP_TYPE_MAX) {
2789                        NL_SET_ERR_MSG(extack, "Invalid group type");
2790                        goto out;
2791                }
2792                err = nh_check_attr_group(net, tb, ARRAY_SIZE(tb),
2793                                          cfg->nh_grp_type, extack);
2794                if (err)
2795                        goto out;
2796
2797                if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES)
2798                        err = rtm_to_nh_config_grp_res(tb[NHA_RES_GROUP],
2799                                                       cfg, extack);
2800
2801                /* no other attributes should be set */
2802                goto out;
2803        }
2804
2805        if (tb[NHA_BLACKHOLE]) {
2806                if (tb[NHA_GATEWAY] || tb[NHA_OIF] ||
2807                    tb[NHA_ENCAP]   || tb[NHA_ENCAP_TYPE] || tb[NHA_FDB]) {
2808                        NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway, oif, encap or fdb");
2809                        goto out;
2810                }
2811
2812                cfg->nh_blackhole = 1;
2813                err = 0;
2814                goto out;
2815        }
2816
2817        if (!cfg->nh_fdb && !tb[NHA_OIF]) {
2818                NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole and non-fdb nexthops");
2819                goto out;
2820        }
2821
2822        if (!cfg->nh_fdb && tb[NHA_OIF]) {
2823                cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]);
2824                if (cfg->nh_ifindex)
2825                        cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex);
2826
2827                if (!cfg->dev) {
2828                        NL_SET_ERR_MSG(extack, "Invalid device index");
2829                        goto out;
2830                } else if (!(cfg->dev->flags & IFF_UP)) {
2831                        NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2832                        err = -ENETDOWN;
2833                        goto out;
2834                } else if (!netif_carrier_ok(cfg->dev)) {
2835                        NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down");
2836                        err = -ENETDOWN;
2837                        goto out;
2838                }
2839        }
2840
2841        err = -EINVAL;
2842        if (tb[NHA_GATEWAY]) {
2843                struct nlattr *gwa = tb[NHA_GATEWAY];
2844
2845                switch (cfg->nh_family) {
2846                case AF_INET:
2847                        if (nla_len(gwa) != sizeof(u32)) {
2848                                NL_SET_ERR_MSG(extack, "Invalid gateway");
2849                                goto out;
2850                        }
2851                        cfg->gw.ipv4 = nla_get_be32(gwa);
2852                        break;
2853                case AF_INET6:
2854                        if (nla_len(gwa) != sizeof(struct in6_addr)) {
2855                                NL_SET_ERR_MSG(extack, "Invalid gateway");
2856                                goto out;
2857                        }
2858                        cfg->gw.ipv6 = nla_get_in6_addr(gwa);
2859                        break;
2860                default:
2861                        NL_SET_ERR_MSG(extack,
2862                                       "Unknown address family for gateway");
2863                        goto out;
2864                }
2865        } else {
2866                /* device only nexthop (no gateway) */
2867                if (cfg->nh_flags & RTNH_F_ONLINK) {
2868                        NL_SET_ERR_MSG(extack,
2869                                       "ONLINK flag can not be set for nexthop without a gateway");
2870                        goto out;
2871                }
2872        }
2873
2874        if (tb[NHA_ENCAP]) {
2875                cfg->nh_encap = tb[NHA_ENCAP];
2876
2877                if (!tb[NHA_ENCAP_TYPE]) {
2878                        NL_SET_ERR_MSG(extack, "LWT encapsulation type is missing");
2879                        goto out;
2880                }
2881
2882                cfg->nh_encap_type = nla_get_u16(tb[NHA_ENCAP_TYPE]);
2883                err = lwtunnel_valid_encap_type(cfg->nh_encap_type, extack);
2884                if (err < 0)
2885                        goto out;
2886
2887        } else if (tb[NHA_ENCAP_TYPE]) {
2888                NL_SET_ERR_MSG(extack, "LWT encapsulation attribute is missing");
2889                goto out;
2890        }
2891
2892
2893        err = 0;
2894out:
2895        return err;
2896}
2897
2898/* rtnl */
2899static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
2900                           struct netlink_ext_ack *extack)
2901{
2902        struct net *net = sock_net(skb->sk);
2903        struct nh_config cfg;
2904        struct nexthop *nh;
2905        int err;
2906
2907        err = rtm_to_nh_config(net, skb, nlh, &cfg, extack);
2908        if (!err) {
2909                nh = nexthop_add(net, &cfg, extack);
2910                if (IS_ERR(nh))
2911                        err = PTR_ERR(nh);
2912        }
2913
2914        return err;
2915}
2916
2917static int __nh_valid_get_del_req(const struct nlmsghdr *nlh,
2918                                  struct nlattr **tb, u32 *id,
2919                                  struct netlink_ext_ack *extack)
2920{
2921        struct nhmsg *nhm = nlmsg_data(nlh);
2922
2923        if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) {
2924                NL_SET_ERR_MSG(extack, "Invalid values in header");
2925                return -EINVAL;
2926        }
2927
2928        if (!tb[NHA_ID]) {
2929                NL_SET_ERR_MSG(extack, "Nexthop id is missing");
2930                return -EINVAL;
2931        }
2932
2933        *id = nla_get_u32(tb[NHA_ID]);
2934        if (!(*id)) {
2935                NL_SET_ERR_MSG(extack, "Invalid nexthop id");
2936                return -EINVAL;
2937        }
2938
2939        return 0;
2940}
2941
2942static int nh_valid_get_del_req(const struct nlmsghdr *nlh, u32 *id,
2943                                struct netlink_ext_ack *extack)
2944{
2945        struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get)];
2946        int err;
2947
2948        err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
2949                          ARRAY_SIZE(rtm_nh_policy_get) - 1,
2950                          rtm_nh_policy_get, extack);
2951        if (err < 0)
2952                return err;
2953
2954        return __nh_valid_get_del_req(nlh, tb, id, extack);
2955}
2956
2957/* rtnl */
2958static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
2959                           struct netlink_ext_ack *extack)
2960{
2961        struct net *net = sock_net(skb->sk);
2962        struct nl_info nlinfo = {
2963                .nlh = nlh,
2964                .nl_net = net,
2965                .portid = NETLINK_CB(skb).portid,
2966        };
2967        struct nexthop *nh;
2968        int err;
2969        u32 id;
2970
2971        err = nh_valid_get_del_req(nlh, &id, extack);
2972        if (err)
2973                return err;
2974
2975        nh = nexthop_find_by_id(net, id);
2976        if (!nh)
2977                return -ENOENT;
2978
2979        remove_nexthop(net, nh, &nlinfo);
2980
2981        return 0;
2982}
2983
2984/* rtnl */
2985static int rtm_get_nexthop(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2986                           struct netlink_ext_ack *extack)
2987{
2988        struct net *net = sock_net(in_skb->sk);
2989        struct sk_buff *skb = NULL;
2990        struct nexthop *nh;
2991        int err;
2992        u32 id;
2993
2994        err = nh_valid_get_del_req(nlh, &id, extack);
2995        if (err)
2996                return err;
2997
2998        err = -ENOBUFS;
2999        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3000        if (!skb)
3001                goto out;
3002
3003        err = -ENOENT;
3004        nh = nexthop_find_by_id(net, id);
3005        if (!nh)
3006                goto errout_free;
3007
3008        err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP, NETLINK_CB(in_skb).portid,
3009                           nlh->nlmsg_seq, 0);
3010        if (err < 0) {
3011                WARN_ON(err == -EMSGSIZE);
3012                goto errout_free;
3013        }
3014
3015        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3016out:
3017        return err;
3018errout_free:
3019        kfree_skb(skb);
3020        goto out;
3021}
3022
3023struct nh_dump_filter {
3024        u32 nh_id;
3025        int dev_idx;
3026        int master_idx;
3027        bool group_filter;
3028        bool fdb_filter;
3029        u32 res_bucket_nh_id;
3030};
3031
3032static bool nh_dump_filtered(struct nexthop *nh,
3033                             struct nh_dump_filter *filter, u8 family)
3034{
3035        const struct net_device *dev;
3036        const struct nh_info *nhi;
3037
3038        if (filter->group_filter && !nh->is_group)
3039                return true;
3040
3041        if (!filter->dev_idx && !filter->master_idx && !family)
3042                return false;
3043
3044        if (nh->is_group)
3045                return true;
3046
3047        nhi = rtnl_dereference(nh->nh_info);
3048        if (family && nhi->family != family)
3049                return true;
3050
3051        dev = nhi->fib_nhc.nhc_dev;
3052        if (filter->dev_idx && (!dev || dev->ifindex != filter->dev_idx))
3053                return true;
3054
3055        if (filter->master_idx) {
3056                struct net_device *master;
3057
3058                if (!dev)
3059                        return true;
3060
3061                master = netdev_master_upper_dev_get((struct net_device *)dev);
3062                if (!master || master->ifindex != filter->master_idx)
3063                        return true;
3064        }
3065
3066        return false;
3067}
3068
3069static int __nh_valid_dump_req(const struct nlmsghdr *nlh, struct nlattr **tb,
3070                               struct nh_dump_filter *filter,
3071                               struct netlink_ext_ack *extack)
3072{
3073        struct nhmsg *nhm;
3074        u32 idx;
3075
3076        if (tb[NHA_OIF]) {
3077                idx = nla_get_u32(tb[NHA_OIF]);
3078                if (idx > INT_MAX) {
3079                        NL_SET_ERR_MSG(extack, "Invalid device index");
3080                        return -EINVAL;
3081                }
3082                filter->dev_idx = idx;
3083        }
3084        if (tb[NHA_MASTER]) {
3085                idx = nla_get_u32(tb[NHA_MASTER]);
3086                if (idx > INT_MAX) {
3087                        NL_SET_ERR_MSG(extack, "Invalid master device index");
3088                        return -EINVAL;
3089                }
3090                filter->master_idx = idx;
3091        }
3092        filter->group_filter = nla_get_flag(tb[NHA_GROUPS]);
3093        filter->fdb_filter = nla_get_flag(tb[NHA_FDB]);
3094
3095        nhm = nlmsg_data(nlh);
3096        if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) {
3097                NL_SET_ERR_MSG(extack, "Invalid values in header for nexthop dump request");
3098                return -EINVAL;
3099        }
3100
3101        return 0;
3102}
3103
3104static int nh_valid_dump_req(const struct nlmsghdr *nlh,
3105                             struct nh_dump_filter *filter,
3106                             struct netlink_callback *cb)
3107{
3108        struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump)];
3109        int err;
3110
3111        err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
3112                          ARRAY_SIZE(rtm_nh_policy_dump) - 1,
3113                          rtm_nh_policy_dump, cb->extack);
3114        if (err < 0)
3115                return err;
3116
3117        return __nh_valid_dump_req(nlh, tb, filter, cb->extack);
3118}
3119
3120struct rtm_dump_nh_ctx {
3121        u32 idx;
3122};
3123
3124static struct rtm_dump_nh_ctx *
3125rtm_dump_nh_ctx(struct netlink_callback *cb)
3126{
3127        struct rtm_dump_nh_ctx *ctx = (void *)cb->ctx;
3128
3129        BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));
3130        return ctx;
3131}
3132
3133static int rtm_dump_walk_nexthops(struct sk_buff *skb,
3134                                  struct netlink_callback *cb,
3135                                  struct rb_root *root,
3136                                  struct rtm_dump_nh_ctx *ctx,
3137                                  int (*nh_cb)(struct sk_buff *skb,
3138                                               struct netlink_callback *cb,
3139                                               struct nexthop *nh, void *data),
3140                                  void *data)
3141{
3142        struct rb_node *node;
3143        int s_idx;
3144        int err;
3145
3146        s_idx = ctx->idx;
3147        for (node = rb_first(root); node; node = rb_next(node)) {
3148                struct nexthop *nh;
3149
3150                nh = rb_entry(node, struct nexthop, rb_node);
3151                if (nh->id < s_idx)
3152                        continue;
3153
3154                ctx->idx = nh->id;
3155                err = nh_cb(skb, cb, nh, data);
3156                if (err)
3157                        return err;
3158        }
3159
3160        ctx->idx++;
3161        return 0;
3162}
3163
3164static int rtm_dump_nexthop_cb(struct sk_buff *skb, struct netlink_callback *cb,
3165                               struct nexthop *nh, void *data)
3166{
3167        struct nhmsg *nhm = nlmsg_data(cb->nlh);
3168        struct nh_dump_filter *filter = data;
3169
3170        if (nh_dump_filtered(nh, filter, nhm->nh_family))
3171                return 0;
3172
3173        return nh_fill_node(skb, nh, RTM_NEWNEXTHOP,
3174                            NETLINK_CB(cb->skb).portid,
3175                            cb->nlh->nlmsg_seq, NLM_F_MULTI);
3176}
3177
3178/* rtnl */
3179static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
3180{
3181        struct rtm_dump_nh_ctx *ctx = rtm_dump_nh_ctx(cb);
3182        struct net *net = sock_net(skb->sk);
3183        struct rb_root *root = &net->nexthop.rb_root;
3184        struct nh_dump_filter filter = {};
3185        int err;
3186
3187        err = nh_valid_dump_req(cb->nlh, &filter, cb);
3188        if (err < 0)
3189                return err;
3190
3191        err = rtm_dump_walk_nexthops(skb, cb, root, ctx,
3192                                     &rtm_dump_nexthop_cb, &filter);
3193        if (err < 0) {
3194                if (likely(skb->len))
3195                        goto out;
3196                goto out_err;
3197        }
3198
3199out:
3200        err = skb->len;
3201out_err:
3202        cb->seq = net->nexthop.seq;
3203        nl_dump_check_consistent(cb, nlmsg_hdr(skb));
3204        return err;
3205}
3206
3207static struct nexthop *
3208nexthop_find_group_resilient(struct net *net, u32 id,
3209                             struct netlink_ext_ack *extack)
3210{
3211        struct nh_group *nhg;
3212        struct nexthop *nh;
3213
3214        nh = nexthop_find_by_id(net, id);
3215        if (!nh)
3216                return ERR_PTR(-ENOENT);
3217
3218        if (!nh->is_group) {
3219                NL_SET_ERR_MSG(extack, "Not a nexthop group");
3220                return ERR_PTR(-EINVAL);
3221        }
3222
3223        nhg = rtnl_dereference(nh->nh_grp);
3224        if (!nhg->resilient) {
3225                NL_SET_ERR_MSG(extack, "Nexthop group not of type resilient");
3226                return ERR_PTR(-EINVAL);
3227        }
3228
3229        return nh;
3230}
3231
3232static int nh_valid_dump_nhid(struct nlattr *attr, u32 *nh_id_p,
3233                              struct netlink_ext_ack *extack)
3234{
3235        u32 idx;
3236
3237        if (attr) {
3238                idx = nla_get_u32(attr);
3239                if (!idx) {
3240                        NL_SET_ERR_MSG(extack, "Invalid nexthop id");
3241                        return -EINVAL;
3242                }
3243                *nh_id_p = idx;
3244        } else {
3245                *nh_id_p = 0;
3246        }
3247
3248        return 0;
3249}
3250
3251static int nh_valid_dump_bucket_req(const struct nlmsghdr *nlh,
3252                                    struct nh_dump_filter *filter,
3253                                    struct netlink_callback *cb)
3254{
3255        struct nlattr *res_tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_dump)];
3256        struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump_bucket)];
3257        int err;
3258
3259        err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
3260                          ARRAY_SIZE(rtm_nh_policy_dump_bucket) - 1,
3261                          rtm_nh_policy_dump_bucket, NULL);
3262        if (err < 0)
3263                return err;
3264
3265        err = nh_valid_dump_nhid(tb[NHA_ID], &filter->nh_id, cb->extack);
3266        if (err)
3267                return err;
3268
3269        if (tb[NHA_RES_BUCKET]) {
3270                size_t max = ARRAY_SIZE(rtm_nh_res_bucket_policy_dump) - 1;
3271
3272                err = nla_parse_nested(res_tb, max,
3273                                       tb[NHA_RES_BUCKET],
3274                                       rtm_nh_res_bucket_policy_dump,
3275                                       cb->extack);
3276                if (err < 0)
3277                        return err;
3278
3279                err = nh_valid_dump_nhid(res_tb[NHA_RES_BUCKET_NH_ID],
3280                                         &filter->res_bucket_nh_id,
3281                                         cb->extack);
3282                if (err)
3283                        return err;
3284        }
3285
3286        return __nh_valid_dump_req(nlh, tb, filter, cb->extack);
3287}
3288
3289struct rtm_dump_res_bucket_ctx {
3290        struct rtm_dump_nh_ctx nh;
3291        u16 bucket_index;
3292        u32 done_nh_idx; /* 1 + the index of the last fully processed NH. */
3293};
3294
3295static struct rtm_dump_res_bucket_ctx *
3296rtm_dump_res_bucket_ctx(struct netlink_callback *cb)
3297{
3298        struct rtm_dump_res_bucket_ctx *ctx = (void *)cb->ctx;
3299
3300        BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));
3301        return ctx;
3302}
3303
3304struct rtm_dump_nexthop_bucket_data {
3305        struct rtm_dump_res_bucket_ctx *ctx;
3306        struct nh_dump_filter filter;
3307};
3308
3309static int rtm_dump_nexthop_bucket_nh(struct sk_buff *skb,
3310                                      struct netlink_callback *cb,
3311                                      struct nexthop *nh,
3312                                      struct rtm_dump_nexthop_bucket_data *dd)
3313{
3314        u32 portid = NETLINK_CB(cb->skb).portid;
3315        struct nhmsg *nhm = nlmsg_data(cb->nlh);
3316        struct nh_res_table *res_table;
3317        struct nh_group *nhg;
3318        u16 bucket_index;
3319        int err;
3320
3321        if (dd->ctx->nh.idx < dd->ctx->done_nh_idx)
3322                return 0;
3323
3324        nhg = rtnl_dereference(nh->nh_grp);
3325        res_table = rtnl_dereference(nhg->res_table);
3326        for (bucket_index = dd->ctx->bucket_index;
3327             bucket_index < res_table->num_nh_buckets;
3328             bucket_index++) {
3329                struct nh_res_bucket *bucket;
3330                struct nh_grp_entry *nhge;
3331
3332                bucket = &res_table->nh_buckets[bucket_index];
3333                nhge = rtnl_dereference(bucket->nh_entry);
3334                if (nh_dump_filtered(nhge->nh, &dd->filter, nhm->nh_family))
3335                        continue;
3336
3337                if (dd->filter.res_bucket_nh_id &&
3338                    dd->filter.res_bucket_nh_id != nhge->nh->id)
3339                        continue;
3340
3341                err = nh_fill_res_bucket(skb, nh, bucket, bucket_index,
3342                                         RTM_NEWNEXTHOPBUCKET, portid,
3343                                         cb->nlh->nlmsg_seq, NLM_F_MULTI,
3344                                         cb->extack);
3345                if (err < 0) {
3346                        if (likely(skb->len))
3347                                goto out;
3348                        goto out_err;
3349                }
3350        }
3351
3352        dd->ctx->done_nh_idx = dd->ctx->nh.idx + 1;
3353        bucket_index = 0;
3354
3355out:
3356        err = skb->len;
3357out_err:
3358        dd->ctx->bucket_index = bucket_index;
3359        return err;
3360}
3361
3362static int rtm_dump_nexthop_bucket_cb(struct sk_buff *skb,
3363                                      struct netlink_callback *cb,
3364                                      struct nexthop *nh, void *data)
3365{
3366        struct rtm_dump_nexthop_bucket_data *dd = data;
3367        struct nh_group *nhg;
3368
3369        if (!nh->is_group)
3370                return 0;
3371
3372        nhg = rtnl_dereference(nh->nh_grp);
3373        if (!nhg->resilient)
3374                return 0;
3375
3376        return rtm_dump_nexthop_bucket_nh(skb, cb, nh, dd);
3377}
3378
3379/* rtnl */
3380static int rtm_dump_nexthop_bucket(struct sk_buff *skb,
3381                                   struct netlink_callback *cb)
3382{
3383        struct rtm_dump_res_bucket_ctx *ctx = rtm_dump_res_bucket_ctx(cb);
3384        struct rtm_dump_nexthop_bucket_data dd = { .ctx = ctx };
3385        struct net *net = sock_net(skb->sk);
3386        struct nexthop *nh;
3387        int err;
3388
3389        err = nh_valid_dump_bucket_req(cb->nlh, &dd.filter, cb);
3390        if (err)
3391                return err;
3392
3393        if (dd.filter.nh_id) {
3394                nh = nexthop_find_group_resilient(net, dd.filter.nh_id,
3395                                                  cb->extack);
3396                if (IS_ERR(nh))
3397                        return PTR_ERR(nh);
3398                err = rtm_dump_nexthop_bucket_nh(skb, cb, nh, &dd);
3399        } else {
3400                struct rb_root *root = &net->nexthop.rb_root;
3401
3402                err = rtm_dump_walk_nexthops(skb, cb, root, &ctx->nh,
3403                                             &rtm_dump_nexthop_bucket_cb, &dd);
3404        }
3405
3406        if (err < 0) {
3407                if (likely(skb->len))
3408                        goto out;
3409                goto out_err;
3410        }
3411
3412out:
3413        err = skb->len;
3414out_err:
3415        cb->seq = net->nexthop.seq;
3416        nl_dump_check_consistent(cb, nlmsg_hdr(skb));
3417        return err;
3418}
3419
3420static int nh_valid_get_bucket_req_res_bucket(struct nlattr *res,
3421                                              u16 *bucket_index,
3422                                              struct netlink_ext_ack *extack)
3423{
3424        struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_get)];
3425        int err;
3426
3427        err = nla_parse_nested(tb, ARRAY_SIZE(rtm_nh_res_bucket_policy_get) - 1,
3428                               res, rtm_nh_res_bucket_policy_get, extack);
3429        if (err < 0)
3430                return err;
3431
3432        if (!tb[NHA_RES_BUCKET_INDEX]) {
3433                NL_SET_ERR_MSG(extack, "Bucket index is missing");
3434                return -EINVAL;
3435        }
3436
3437        *bucket_index = nla_get_u16(tb[NHA_RES_BUCKET_INDEX]);
3438        return 0;
3439}
3440
3441static int nh_valid_get_bucket_req(const struct nlmsghdr *nlh,
3442                                   u32 *id, u16 *bucket_index,
3443                                   struct netlink_ext_ack *extack)
3444{
3445        struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get_bucket)];
3446        int err;
3447
3448        err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
3449                          ARRAY_SIZE(rtm_nh_policy_get_bucket) - 1,
3450                          rtm_nh_policy_get_bucket, extack);
3451        if (err < 0)
3452                return err;
3453
3454        err = __nh_valid_get_del_req(nlh, tb, id, extack);
3455        if (err)
3456                return err;
3457
3458        if (!tb[NHA_RES_BUCKET]) {
3459                NL_SET_ERR_MSG(extack, "Bucket information is missing");
3460                return -EINVAL;
3461        }
3462
3463        err = nh_valid_get_bucket_req_res_bucket(tb[NHA_RES_BUCKET],
3464                                                 bucket_index, extack);
3465        if (err)
3466                return err;
3467
3468        return 0;
3469}
3470
3471/* rtnl */
3472static int rtm_get_nexthop_bucket(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3473                                  struct netlink_ext_ack *extack)
3474{
3475        struct net *net = sock_net(in_skb->sk);
3476        struct nh_res_table *res_table;
3477        struct sk_buff *skb = NULL;
3478        struct nh_group *nhg;
3479        struct nexthop *nh;
3480        u16 bucket_index;
3481        int err;
3482        u32 id;
3483
3484        err = nh_valid_get_bucket_req(nlh, &id, &bucket_index, extack);
3485        if (err)
3486                return err;
3487
3488        nh = nexthop_find_group_resilient(net, id, extack);
3489        if (IS_ERR(nh))
3490                return PTR_ERR(nh);
3491
3492        nhg = rtnl_dereference(nh->nh_grp);
3493        res_table = rtnl_dereference(nhg->res_table);
3494        if (bucket_index >= res_table->num_nh_buckets) {
3495                NL_SET_ERR_MSG(extack, "Bucket index out of bounds");
3496                return -ENOENT;
3497        }
3498
3499        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3500        if (!skb)
3501                return -ENOBUFS;
3502
3503        err = nh_fill_res_bucket(skb, nh, &res_table->nh_buckets[bucket_index],
3504                                 bucket_index, RTM_NEWNEXTHOPBUCKET,
3505                                 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
3506                                 0, extack);
3507        if (err < 0) {
3508                WARN_ON(err == -EMSGSIZE);
3509                goto errout_free;
3510        }
3511
3512        return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3513
3514errout_free:
3515        kfree_skb(skb);
3516        return err;
3517}
3518
3519static void nexthop_sync_mtu(struct net_device *dev, u32 orig_mtu)
3520{
3521        unsigned int hash = nh_dev_hashfn(dev->ifindex);
3522        struct net *net = dev_net(dev);
3523        struct hlist_head *head = &net->nexthop.devhash[hash];
3524        struct hlist_node *n;
3525        struct nh_info *nhi;
3526
3527        hlist_for_each_entry_safe(nhi, n, head, dev_hash) {
3528                if (nhi->fib_nhc.nhc_dev == dev) {
3529                        if (nhi->family == AF_INET)
3530                                fib_nhc_update_mtu(&nhi->fib_nhc, dev->mtu,
3531                                                   orig_mtu);
3532                }
3533        }
3534}
3535
3536/* rtnl */
3537static int nh_netdev_event(struct notifier_block *this,
3538                           unsigned long event, void *ptr)
3539{
3540        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3541        struct netdev_notifier_info_ext *info_ext;
3542
3543        switch (event) {
3544        case NETDEV_DOWN:
3545        case NETDEV_UNREGISTER:
3546                nexthop_flush_dev(dev, event);
3547                break;
3548        case NETDEV_CHANGE:
3549                if (!(dev_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP)))
3550                        nexthop_flush_dev(dev, event);
3551                break;
3552        case NETDEV_CHANGEMTU:
3553                info_ext = ptr;
3554                nexthop_sync_mtu(dev, info_ext->ext.mtu);
3555                rt_cache_flush(dev_net(dev));
3556                break;
3557        }
3558        return NOTIFY_DONE;
3559}
3560
3561static struct notifier_block nh_netdev_notifier = {
3562        .notifier_call = nh_netdev_event,
3563};
3564
3565static int nexthops_dump(struct net *net, struct notifier_block *nb,
3566                         struct netlink_ext_ack *extack)
3567{
3568        struct rb_root *root = &net->nexthop.rb_root;
3569        struct rb_node *node;
3570        int err = 0;
3571
3572        for (node = rb_first(root); node; node = rb_next(node)) {
3573                struct nexthop *nh;
3574
3575                nh = rb_entry(node, struct nexthop, rb_node);
3576                err = call_nexthop_notifier(nb, net, NEXTHOP_EVENT_REPLACE, nh,
3577                                            extack);
3578                if (err)
3579                        break;
3580        }
3581
3582        return err;
3583}
3584
3585int register_nexthop_notifier(struct net *net, struct notifier_block *nb,
3586                              struct netlink_ext_ack *extack)
3587{
3588        int err;
3589
3590        rtnl_lock();
3591        err = nexthops_dump(net, nb, extack);
3592        if (err)
3593                goto unlock;
3594        err = blocking_notifier_chain_register(&net->nexthop.notifier_chain,
3595                                               nb);
3596unlock:
3597        rtnl_unlock();
3598        return err;
3599}
3600EXPORT_SYMBOL(register_nexthop_notifier);
3601
3602int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb)
3603{
3604        return blocking_notifier_chain_unregister(&net->nexthop.notifier_chain,
3605                                                  nb);
3606}
3607EXPORT_SYMBOL(unregister_nexthop_notifier);
3608
3609void nexthop_set_hw_flags(struct net *net, u32 id, bool offload, bool trap)
3610{
3611        struct nexthop *nexthop;
3612
3613        rcu_read_lock();
3614
3615        nexthop = nexthop_find_by_id(net, id);
3616        if (!nexthop)
3617                goto out;
3618
3619        nexthop->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP);
3620        if (offload)
3621                nexthop->nh_flags |= RTNH_F_OFFLOAD;
3622        if (trap)
3623                nexthop->nh_flags |= RTNH_F_TRAP;
3624
3625out:
3626        rcu_read_unlock();
3627}
3628EXPORT_SYMBOL(nexthop_set_hw_flags);
3629
3630void nexthop_bucket_set_hw_flags(struct net *net, u32 id, u16 bucket_index,
3631                                 bool offload, bool trap)
3632{
3633        struct nh_res_table *res_table;
3634        struct nh_res_bucket *bucket;
3635        struct nexthop *nexthop;
3636        struct nh_group *nhg;
3637
3638        rcu_read_lock();
3639
3640        nexthop = nexthop_find_by_id(net, id);
3641        if (!nexthop || !nexthop->is_group)
3642                goto out;
3643
3644        nhg = rcu_dereference(nexthop->nh_grp);
3645        if (!nhg->resilient)
3646                goto out;
3647
3648        if (bucket_index >= nhg->res_table->num_nh_buckets)
3649                goto out;
3650
3651        res_table = rcu_dereference(nhg->res_table);
3652        bucket = &res_table->nh_buckets[bucket_index];
3653        bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP);
3654        if (offload)
3655                bucket->nh_flags |= RTNH_F_OFFLOAD;
3656        if (trap)
3657                bucket->nh_flags |= RTNH_F_TRAP;
3658
3659out:
3660        rcu_read_unlock();
3661}
3662EXPORT_SYMBOL(nexthop_bucket_set_hw_flags);
3663
3664void nexthop_res_grp_activity_update(struct net *net, u32 id, u16 num_buckets,
3665                                     unsigned long *activity)
3666{
3667        struct nh_res_table *res_table;
3668        struct nexthop *nexthop;
3669        struct nh_group *nhg;
3670        u16 i;
3671
3672        rcu_read_lock();
3673
3674        nexthop = nexthop_find_by_id(net, id);
3675        if (!nexthop || !nexthop->is_group)
3676                goto out;
3677
3678        nhg = rcu_dereference(nexthop->nh_grp);
3679        if (!nhg->resilient)
3680                goto out;
3681
3682        /* Instead of silently ignoring some buckets, demand that the sizes
3683         * be the same.
3684         */
3685        res_table = rcu_dereference(nhg->res_table);
3686        if (num_buckets != res_table->num_nh_buckets)
3687                goto out;
3688
3689        for (i = 0; i < num_buckets; i++) {
3690                if (test_bit(i, activity))
3691                        nh_res_bucket_set_busy(&res_table->nh_buckets[i]);
3692        }
3693
3694out:
3695        rcu_read_unlock();
3696}
3697EXPORT_SYMBOL(nexthop_res_grp_activity_update);
3698
3699static void __net_exit nexthop_net_exit(struct net *net)
3700{
3701        rtnl_lock();
3702        flush_all_nexthops(net);
3703        rtnl_unlock();
3704        kfree(net->nexthop.devhash);
3705}
3706
3707static int __net_init nexthop_net_init(struct net *net)
3708{
3709        size_t sz = sizeof(struct hlist_head) * NH_DEV_HASHSIZE;
3710
3711        net->nexthop.rb_root = RB_ROOT;
3712        net->nexthop.devhash = kzalloc(sz, GFP_KERNEL);
3713        if (!net->nexthop.devhash)
3714                return -ENOMEM;
3715        BLOCKING_INIT_NOTIFIER_HEAD(&net->nexthop.notifier_chain);
3716
3717        return 0;
3718}
3719
3720static struct pernet_operations nexthop_net_ops = {
3721        .init = nexthop_net_init,
3722        .exit = nexthop_net_exit,
3723};
3724
3725static int __init nexthop_init(void)
3726{
3727        register_pernet_subsys(&nexthop_net_ops);
3728
3729        register_netdevice_notifier(&nh_netdev_notifier);
3730
3731        rtnl_register(PF_UNSPEC, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0);
3732        rtnl_register(PF_UNSPEC, RTM_DELNEXTHOP, rtm_del_nexthop, NULL, 0);
3733        rtnl_register(PF_UNSPEC, RTM_GETNEXTHOP, rtm_get_nexthop,
3734                      rtm_dump_nexthop, 0);
3735
3736        rtnl_register(PF_INET, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0);
3737        rtnl_register(PF_INET, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0);
3738
3739        rtnl_register(PF_INET6, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0);
3740        rtnl_register(PF_INET6, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0);
3741
3742        rtnl_register(PF_UNSPEC, RTM_GETNEXTHOPBUCKET, rtm_get_nexthop_bucket,
3743                      rtm_dump_nexthop_bucket, 0);
3744
3745        return 0;
3746}
3747subsys_initcall(nexthop_init);
3748