linux/net/sched/sch_api.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * net/sched/sch_api.c  Packet scheduler API.
   4 *
   5 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
   6 *
   7 * Fixes:
   8 *
   9 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
  10 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
  11 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
  12 */
  13
  14#include <linux/module.h>
  15#include <linux/types.h>
  16#include <linux/kernel.h>
  17#include <linux/string.h>
  18#include <linux/errno.h>
  19#include <linux/skbuff.h>
  20#include <linux/init.h>
  21#include <linux/proc_fs.h>
  22#include <linux/seq_file.h>
  23#include <linux/kmod.h>
  24#include <linux/list.h>
  25#include <linux/hrtimer.h>
  26#include <linux/slab.h>
  27#include <linux/hashtable.h>
  28
  29#include <net/net_namespace.h>
  30#include <net/sock.h>
  31#include <net/netlink.h>
  32#include <net/pkt_sched.h>
  33#include <net/pkt_cls.h>
  34
  35#include <trace/events/qdisc.h>
  36
  37/*
  38
  39   Short review.
  40   -------------
  41
  42   This file consists of two interrelated parts:
  43
  44   1. queueing disciplines manager frontend.
  45   2. traffic classes manager frontend.
  46
  47   Generally, queueing discipline ("qdisc") is a black box,
  48   which is able to enqueue packets and to dequeue them (when
  49   device is ready to send something) in order and at times
  50   determined by algorithm hidden in it.
  51
   52   qdiscs are divided into two categories:
   53   - "queues", which have no internal structure visible from outside.
   54   - "schedulers", which split all the packets into "traffic classes",
   55     using "packet classifiers" (look at cls_api.c)
  56
   57   In turn, classes may have child qdiscs (as a rule, queues)
   58   attached to them etc. etc. etc.
   59
   60   The goal of the routines in this file is to translate
   61   information supplied by the user in the form of handles
   62   into a form more intelligible to the kernel, to perform some
   63   sanity checks and the part of the work which is common to all
   64   qdiscs, and to provide rtnetlink notifications.
  65
  66   All real intelligent work is done inside qdisc modules.
  67
  68
  69
  70   Every discipline has two major routines: enqueue and dequeue.
  71
  72   ---dequeue
  73
   74   dequeue usually returns a skb to send. It is allowed to return NULL,
   75   but that does not mean that the queue is empty; it just means that
   76   the discipline does not want to send anything this time.
   77   The queue is really empty if q->q.qlen == 0.
   78   For complicated disciplines with multiple queues q->q is not
   79   a real packet queue; however, q->q.qlen must still be valid.
  80
  81   ---enqueue
  82
   83   enqueue returns 0 if the packet was enqueued successfully.
   84   If a packet (this one or another one) was dropped, it returns
   85   a non-zero error code.
   86   NET_XMIT_DROP        - this packet dropped
   87     Expected action: do not back off, but wait until the queue clears.
  88   NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
  89     Expected action: backoff or ignore
  90
  91   Auxiliary routines:
  92
  93   ---peek
  94
  95   like dequeue but without removing a packet from the queue
  96
  97   ---reset
  98
  99   returns qdisc to initial state: purge all buffers, clear all
 100   timers, counters (except for statistics) etc.
 101
 102   ---init
 103
 104   initializes newly created qdisc.
 105
 106   ---destroy
 107
 108   destroys resources allocated by init and during lifetime of qdisc.
 109
 110   ---change
 111
 112   changes qdisc parameters.
 113 */
 114
  115/* Protects the list of registered qdisc ops (qdisc_base) and accesses to
 * default_qdisc_ops. It is a pure SMP lock.
 */
  116static DEFINE_RWLOCK(qdisc_mod_lock);
 117
 118
 119/************************************************
 120 *      Queueing disciplines manipulation.      *
 121 ************************************************/
 122
 123
  124/* Head of the list of all installed queueing disciplines, linked
 * through Qdisc_ops::next. Modified under qdisc_mod_lock.
 */
  125
  126static struct Qdisc_ops *qdisc_base;
 127
 128/* Register/unregister queueing discipline */
 129
 130int register_qdisc(struct Qdisc_ops *qops)
 131{
 132        struct Qdisc_ops *q, **qp;
 133        int rc = -EEXIST;
 134
 135        write_lock(&qdisc_mod_lock);
 136        for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
 137                if (!strcmp(qops->id, q->id))
 138                        goto out;
 139
 140        if (qops->enqueue == NULL)
 141                qops->enqueue = noop_qdisc_ops.enqueue;
 142        if (qops->peek == NULL) {
 143                if (qops->dequeue == NULL)
 144                        qops->peek = noop_qdisc_ops.peek;
 145                else
 146                        goto out_einval;
 147        }
 148        if (qops->dequeue == NULL)
 149                qops->dequeue = noop_qdisc_ops.dequeue;
 150
 151        if (qops->cl_ops) {
 152                const struct Qdisc_class_ops *cops = qops->cl_ops;
 153
 154                if (!(cops->find && cops->walk && cops->leaf))
 155                        goto out_einval;
 156
 157                if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
 158                        goto out_einval;
 159        }
 160
 161        qops->next = NULL;
 162        *qp = qops;
 163        rc = 0;
 164out:
 165        write_unlock(&qdisc_mod_lock);
 166        return rc;
 167
 168out_einval:
 169        rc = -EINVAL;
 170        goto out;
 171}
 172EXPORT_SYMBOL(register_qdisc);
 173
 174int unregister_qdisc(struct Qdisc_ops *qops)
 175{
 176        struct Qdisc_ops *q, **qp;
 177        int err = -ENOENT;
 178
 179        write_lock(&qdisc_mod_lock);
 180        for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
 181                if (q == qops)
 182                        break;
 183        if (q) {
 184                *qp = q->next;
 185                q->next = NULL;
 186                err = 0;
 187        }
 188        write_unlock(&qdisc_mod_lock);
 189        return err;
 190}
 191EXPORT_SYMBOL(unregister_qdisc);
 192
 193/* Get default qdisc if not otherwise specified */
 194void qdisc_get_default(char *name, size_t len)
 195{
 196        read_lock(&qdisc_mod_lock);
 197        strlcpy(name, default_qdisc_ops->id, len);
 198        read_unlock(&qdisc_mod_lock);
 199}
 200
 201static struct Qdisc_ops *qdisc_lookup_default(const char *name)
 202{
 203        struct Qdisc_ops *q = NULL;
 204
 205        for (q = qdisc_base; q; q = q->next) {
 206                if (!strcmp(name, q->id)) {
 207                        if (!try_module_get(q->owner))
 208                                q = NULL;
 209                        break;
 210                }
 211        }
 212
 213        return q;
 214}
 215
 216/* Set new default qdisc to use */
 217int qdisc_set_default(const char *name)
 218{
 219        const struct Qdisc_ops *ops;
 220
 221        if (!capable(CAP_NET_ADMIN))
 222                return -EPERM;
 223
 224        write_lock(&qdisc_mod_lock);
 225        ops = qdisc_lookup_default(name);
 226        if (!ops) {
 227                /* Not found, drop lock and try to load module */
 228                write_unlock(&qdisc_mod_lock);
 229                request_module("sch_%s", name);
 230                write_lock(&qdisc_mod_lock);
 231
 232                ops = qdisc_lookup_default(name);
 233        }
 234
 235        if (ops) {
 236                /* Set new default */
 237                module_put(default_qdisc_ops->owner);
 238                default_qdisc_ops = ops;
 239        }
 240        write_unlock(&qdisc_mod_lock);
 241
 242        return ops ? 0 : -ENOENT;
 243}
 244
 245#ifdef CONFIG_NET_SCH_DEFAULT
 246/* Set default value from kernel config */
 247static int __init sch_default_qdisc(void)
 248{
 249        return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
 250}
 251late_initcall(sch_default_qdisc);
 252#endif
 253
 254/* We know handle. Find qdisc among all qdisc's attached to device
 255 * (root qdisc, all its children, children of children etc.)
 256 * Note: caller either uses rtnl or rcu_read_lock()
 257 */
 258
 259static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
 260{
 261        struct Qdisc *q;
 262
 263        if (!qdisc_dev(root))
 264                return (root->handle == handle ? root : NULL);
 265
 266        if (!(root->flags & TCQ_F_BUILTIN) &&
 267            root->handle == handle)
 268                return root;
 269
 270        hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle,
 271                                   lockdep_rtnl_is_held()) {
 272                if (q->handle == handle)
 273                        return q;
 274        }
 275        return NULL;
 276}
 277
 278void qdisc_hash_add(struct Qdisc *q, bool invisible)
 279{
 280        if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
 281                ASSERT_RTNL();
 282                hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
 283                if (invisible)
 284                        q->flags |= TCQ_F_INVISIBLE;
 285        }
 286}
 287EXPORT_SYMBOL(qdisc_hash_add);
 288
 289void qdisc_hash_del(struct Qdisc *q)
 290{
 291        if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
 292                ASSERT_RTNL();
 293                hash_del_rcu(&q->hash);
 294        }
 295}
 296EXPORT_SYMBOL(qdisc_hash_del);
 297
 298struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
 299{
 300        struct Qdisc *q;
 301
 302        if (!handle)
 303                return NULL;
 304        q = qdisc_match_from_root(dev->qdisc, handle);
 305        if (q)
 306                goto out;
 307
 308        if (dev_ingress_queue(dev))
 309                q = qdisc_match_from_root(
 310                        dev_ingress_queue(dev)->qdisc_sleeping,
 311                        handle);
 312out:
 313        return q;
 314}
 315
 316struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
 317{
 318        struct netdev_queue *nq;
 319        struct Qdisc *q;
 320
 321        if (!handle)
 322                return NULL;
 323        q = qdisc_match_from_root(dev->qdisc, handle);
 324        if (q)
 325                goto out;
 326
 327        nq = dev_ingress_queue_rcu(dev);
 328        if (nq)
 329                q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
 330out:
 331        return q;
 332}
 333
 334static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
 335{
 336        unsigned long cl;
 337        const struct Qdisc_class_ops *cops = p->ops->cl_ops;
 338
 339        if (cops == NULL)
 340                return NULL;
 341        cl = cops->find(p, classid);
 342
 343        if (cl == 0)
 344                return NULL;
 345        return cops->leaf(p, cl);
 346}
 347
 348/* Find queueing discipline by name */
 349
 350static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
 351{
 352        struct Qdisc_ops *q = NULL;
 353
 354        if (kind) {
 355                read_lock(&qdisc_mod_lock);
 356                for (q = qdisc_base; q; q = q->next) {
 357                        if (nla_strcmp(kind, q->id) == 0) {
 358                                if (!try_module_get(q->owner))
 359                                        q = NULL;
 360                                break;
 361                        }
 362                }
 363                read_unlock(&qdisc_mod_lock);
 364        }
 365        return q;
 366}
 367
 368/* The linklayer setting were not transferred from iproute2, in older
 369 * versions, and the rate tables lookup systems have been dropped in
 370 * the kernel. To keep backward compatible with older iproute2 tc
 371 * utils, we detect the linklayer setting by detecting if the rate
 372 * table were modified.
 373 *
 374 * For linklayer ATM table entries, the rate table will be aligned to
 375 * 48 bytes, thus some table entries will contain the same value.  The
 376 * mpu (min packet unit) is also encoded into the old rate table, thus
 377 * starting from the mpu, we find low and high table entries for
 378 * mapping this cell.  If these entries contain the same value, when
 379 * the rate tables have been modified for linklayer ATM.
 380 *
 381 * This is done by rounding mpu to the nearest 48 bytes cell/entry,
 382 * and then roundup to the next cell, calc the table entry one below,
 383 * and compare.
 384 */
 385static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
 386{
 387        int low       = roundup(r->mpu, 48);
 388        int high      = roundup(low+1, 48);
 389        int cell_low  = low >> r->cell_log;
 390        int cell_high = (high >> r->cell_log) - 1;
 391
 392        /* rtab is too inaccurate at rates > 100Mbit/s */
 393        if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
 394                pr_debug("TC linklayer: Giving up ATM detection\n");
 395                return TC_LINKLAYER_ETHERNET;
 396        }
 397
 398        if ((cell_high > cell_low) && (cell_high < 256)
 399            && (rtab[cell_low] == rtab[cell_high])) {
 400                pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
 401                         cell_low, cell_high, rtab[cell_high]);
 402                return TC_LINKLAYER_ATM;
 403        }
 404        return TC_LINKLAYER_ETHERNET;
 405}
 406
 407static struct qdisc_rate_table *qdisc_rtab_list;
 408
 409struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
 410                                        struct nlattr *tab,
 411                                        struct netlink_ext_ack *extack)
 412{
 413        struct qdisc_rate_table *rtab;
 414
 415        if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
 416            nla_len(tab) != TC_RTAB_SIZE) {
 417                NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
 418                return NULL;
 419        }
 420
 421        for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
 422                if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
 423                    !memcmp(&rtab->data, nla_data(tab), 1024)) {
 424                        rtab->refcnt++;
 425                        return rtab;
 426                }
 427        }
 428
 429        rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
 430        if (rtab) {
 431                rtab->rate = *r;
 432                rtab->refcnt = 1;
 433                memcpy(rtab->data, nla_data(tab), 1024);
 434                if (r->linklayer == TC_LINKLAYER_UNAWARE)
 435                        r->linklayer = __detect_linklayer(r, rtab->data);
 436                rtab->next = qdisc_rtab_list;
 437                qdisc_rtab_list = rtab;
 438        } else {
 439                NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
 440        }
 441        return rtab;
 442}
 443EXPORT_SYMBOL(qdisc_get_rtab);
 444
 445void qdisc_put_rtab(struct qdisc_rate_table *tab)
 446{
 447        struct qdisc_rate_table *rtab, **rtabp;
 448
 449        if (!tab || --tab->refcnt)
 450                return;
 451
 452        for (rtabp = &qdisc_rtab_list;
 453             (rtab = *rtabp) != NULL;
 454             rtabp = &rtab->next) {
 455                if (rtab == tab) {
 456                        *rtabp = rtab->next;
 457                        kfree(rtab);
 458                        return;
 459                }
 460        }
 461}
 462EXPORT_SYMBOL(qdisc_put_rtab);
 463
 464static LIST_HEAD(qdisc_stab_list);
 465
 466static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
 467        [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
 468        [TCA_STAB_DATA] = { .type = NLA_BINARY },
 469};
 470
 471static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
 472                                               struct netlink_ext_ack *extack)
 473{
 474        struct nlattr *tb[TCA_STAB_MAX + 1];
 475        struct qdisc_size_table *stab;
 476        struct tc_sizespec *s;
 477        unsigned int tsize = 0;
 478        u16 *tab = NULL;
 479        int err;
 480
 481        err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
 482                                          extack);
 483        if (err < 0)
 484                return ERR_PTR(err);
 485        if (!tb[TCA_STAB_BASE]) {
 486                NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
 487                return ERR_PTR(-EINVAL);
 488        }
 489
 490        s = nla_data(tb[TCA_STAB_BASE]);
 491
 492        if (s->tsize > 0) {
 493                if (!tb[TCA_STAB_DATA]) {
 494                        NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
 495                        return ERR_PTR(-EINVAL);
 496                }
 497                tab = nla_data(tb[TCA_STAB_DATA]);
 498                tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
 499        }
 500
 501        if (tsize != s->tsize || (!tab && tsize > 0)) {
 502                NL_SET_ERR_MSG(extack, "Invalid size of size table");
 503                return ERR_PTR(-EINVAL);
 504        }
 505
 506        list_for_each_entry(stab, &qdisc_stab_list, list) {
 507                if (memcmp(&stab->szopts, s, sizeof(*s)))
 508                        continue;
 509                if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
 510                        continue;
 511                stab->refcnt++;
 512                return stab;
 513        }
 514
 515        stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
 516        if (!stab)
 517                return ERR_PTR(-ENOMEM);
 518
 519        stab->refcnt = 1;
 520        stab->szopts = *s;
 521        if (tsize > 0)
 522                memcpy(stab->data, tab, tsize * sizeof(u16));
 523
 524        list_add_tail(&stab->list, &qdisc_stab_list);
 525
 526        return stab;
 527}
 528
 529void qdisc_put_stab(struct qdisc_size_table *tab)
 530{
 531        if (!tab)
 532                return;
 533
 534        if (--tab->refcnt == 0) {
 535                list_del(&tab->list);
 536                kfree_rcu(tab, rcu);
 537        }
 538}
 539EXPORT_SYMBOL(qdisc_put_stab);
 540
 541static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
 542{
 543        struct nlattr *nest;
 544
 545        nest = nla_nest_start_noflag(skb, TCA_STAB);
 546        if (nest == NULL)
 547                goto nla_put_failure;
 548        if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
 549                goto nla_put_failure;
 550        nla_nest_end(skb, nest);
 551
 552        return skb->len;
 553
 554nla_put_failure:
 555        return -1;
 556}
 557
 558void __qdisc_calculate_pkt_len(struct sk_buff *skb,
 559                               const struct qdisc_size_table *stab)
 560{
 561        int pkt_len, slot;
 562
 563        pkt_len = skb->len + stab->szopts.overhead;
 564        if (unlikely(!stab->szopts.tsize))
 565                goto out;
 566
 567        slot = pkt_len + stab->szopts.cell_align;
 568        if (unlikely(slot < 0))
 569                slot = 0;
 570
 571        slot >>= stab->szopts.cell_log;
 572        if (likely(slot < stab->szopts.tsize))
 573                pkt_len = stab->data[slot];
 574        else
 575                pkt_len = stab->data[stab->szopts.tsize - 1] *
 576                                (slot / stab->szopts.tsize) +
 577                                stab->data[slot % stab->szopts.tsize];
 578
 579        pkt_len <<= stab->szopts.size_log;
 580out:
 581        if (unlikely(pkt_len < 1))
 582                pkt_len = 1;
 583        qdisc_skb_cb(skb)->pkt_len = pkt_len;
 584}
 585EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
 586
 587void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
 588{
 589        if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
 590                pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
 591                        txt, qdisc->ops->id, qdisc->handle >> 16);
 592                qdisc->flags |= TCQ_F_WARN_NONWC;
 593        }
 594}
 595EXPORT_SYMBOL(qdisc_warn_nonwc);
 596
 597static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
 598{
 599        struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
 600                                                 timer);
 601
 602        rcu_read_lock();
 603        __netif_schedule(qdisc_root(wd->qdisc));
 604        rcu_read_unlock();
 605
 606        return HRTIMER_NORESTART;
 607}
 608
 609void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
 610                                 clockid_t clockid)
 611{
 612        hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
 613        wd->timer.function = qdisc_watchdog;
 614        wd->qdisc = qdisc;
 615}
 616EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
 617
 618void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
 619{
 620        qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
 621}
 622EXPORT_SYMBOL(qdisc_watchdog_init);
 623
 624void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
 625                                      u64 delta_ns)
 626{
 627        if (test_bit(__QDISC_STATE_DEACTIVATED,
 628                     &qdisc_root_sleeping(wd->qdisc)->state))
 629                return;
 630
 631        if (hrtimer_is_queued(&wd->timer)) {
 632                /* If timer is already set in [expires, expires + delta_ns],
 633                 * do not reprogram it.
 634                 */
 635                if (wd->last_expires - expires <= delta_ns)
 636                        return;
 637        }
 638
 639        wd->last_expires = expires;
 640        hrtimer_start_range_ns(&wd->timer,
 641                               ns_to_ktime(expires),
 642                               delta_ns,
 643                               HRTIMER_MODE_ABS_PINNED);
 644}
 645EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns);
 646
 647void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
 648{
 649        hrtimer_cancel(&wd->timer);
 650}
 651EXPORT_SYMBOL(qdisc_watchdog_cancel);
 652
 653static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
 654{
 655        struct hlist_head *h;
 656        unsigned int i;
 657
 658        h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
 659
 660        if (h != NULL) {
 661                for (i = 0; i < n; i++)
 662                        INIT_HLIST_HEAD(&h[i]);
 663        }
 664        return h;
 665}
 666
 667void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
 668{
 669        struct Qdisc_class_common *cl;
 670        struct hlist_node *next;
 671        struct hlist_head *nhash, *ohash;
 672        unsigned int nsize, nmask, osize;
 673        unsigned int i, h;
 674
 675        /* Rehash when load factor exceeds 0.75 */
 676        if (clhash->hashelems * 4 <= clhash->hashsize * 3)
 677                return;
 678        nsize = clhash->hashsize * 2;
 679        nmask = nsize - 1;
 680        nhash = qdisc_class_hash_alloc(nsize);
 681        if (nhash == NULL)
 682                return;
 683
 684        ohash = clhash->hash;
 685        osize = clhash->hashsize;
 686
 687        sch_tree_lock(sch);
 688        for (i = 0; i < osize; i++) {
 689                hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
 690                        h = qdisc_class_hash(cl->classid, nmask);
 691                        hlist_add_head(&cl->hnode, &nhash[h]);
 692                }
 693        }
 694        clhash->hash     = nhash;
 695        clhash->hashsize = nsize;
 696        clhash->hashmask = nmask;
 697        sch_tree_unlock(sch);
 698
 699        kvfree(ohash);
 700}
 701EXPORT_SYMBOL(qdisc_class_hash_grow);
 702
 703int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
 704{
 705        unsigned int size = 4;
 706
 707        clhash->hash = qdisc_class_hash_alloc(size);
 708        if (!clhash->hash)
 709                return -ENOMEM;
 710        clhash->hashsize  = size;
 711        clhash->hashmask  = size - 1;
 712        clhash->hashelems = 0;
 713        return 0;
 714}
 715EXPORT_SYMBOL(qdisc_class_hash_init);
 716
 717void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
 718{
 719        kvfree(clhash->hash);
 720}
 721EXPORT_SYMBOL(qdisc_class_hash_destroy);
 722
 723void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
 724                             struct Qdisc_class_common *cl)
 725{
 726        unsigned int h;
 727
 728        INIT_HLIST_NODE(&cl->hnode);
 729        h = qdisc_class_hash(cl->classid, clhash->hashmask);
 730        hlist_add_head(&cl->hnode, &clhash->hash[h]);
 731        clhash->hashelems++;
 732}
 733EXPORT_SYMBOL(qdisc_class_hash_insert);
 734
 735void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
 736                             struct Qdisc_class_common *cl)
 737{
 738        hlist_del(&cl->hnode);
 739        clhash->hashelems--;
 740}
 741EXPORT_SYMBOL(qdisc_class_hash_remove);
 742
 743/* Allocate an unique handle from space managed by kernel
 744 * Possible range is [8000-FFFF]:0000 (0x8000 values)
 745 */
 746static u32 qdisc_alloc_handle(struct net_device *dev)
 747{
 748        int i = 0x8000;
 749        static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
 750
 751        do {
 752                autohandle += TC_H_MAKE(0x10000U, 0);
 753                if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
 754                        autohandle = TC_H_MAKE(0x80000000U, 0);
 755                if (!qdisc_lookup(dev, autohandle))
 756                        return autohandle;
 757                cond_resched();
 758        } while (--i > 0);
 759
 760        return 0;
 761}
 762
 763void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
 764{
 765        bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
 766        const struct Qdisc_class_ops *cops;
 767        unsigned long cl;
 768        u32 parentid;
 769        bool notify;
 770        int drops;
 771
 772        if (n == 0 && len == 0)
 773                return;
 774        drops = max_t(int, n, 0);
 775        rcu_read_lock();
 776        while ((parentid = sch->parent)) {
 777                if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
 778                        break;
 779
 780                if (sch->flags & TCQ_F_NOPARENT)
 781                        break;
 782                /* Notify parent qdisc only if child qdisc becomes empty.
 783                 *
 784                 * If child was empty even before update then backlog
 785                 * counter is screwed and we skip notification because
 786                 * parent class is already passive.
 787                 *
 788                 * If the original child was offloaded then it is allowed
 789                 * to be seem as empty, so the parent is notified anyway.
 790                 */
 791                notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
 792                                                       !qdisc_is_offloaded);
 793                /* TODO: perform the search on a per txq basis */
 794                sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
 795                if (sch == NULL) {
 796                        WARN_ON_ONCE(parentid != TC_H_ROOT);
 797                        break;
 798                }
 799                cops = sch->ops->cl_ops;
 800                if (notify && cops->qlen_notify) {
 801                        cl = cops->find(sch, parentid);
 802                        cops->qlen_notify(sch, cl);
 803                }
 804                sch->q.qlen -= n;
 805                sch->qstats.backlog -= len;
 806                __qdisc_qstats_drop(sch, drops);
 807        }
 808        rcu_read_unlock();
 809}
 810EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
 811
 812int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
 813                              void *type_data)
 814{
 815        struct net_device *dev = qdisc_dev(sch);
 816        int err;
 817
 818        sch->flags &= ~TCQ_F_OFFLOADED;
 819        if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
 820                return 0;
 821
 822        err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
 823        if (err == -EOPNOTSUPP)
 824                return 0;
 825
 826        if (!err)
 827                sch->flags |= TCQ_F_OFFLOADED;
 828
 829        return err;
 830}
 831EXPORT_SYMBOL(qdisc_offload_dump_helper);
 832
 833void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
 834                                struct Qdisc *new, struct Qdisc *old,
 835                                enum tc_setup_type type, void *type_data,
 836                                struct netlink_ext_ack *extack)
 837{
 838        bool any_qdisc_is_offloaded;
 839        int err;
 840
 841        if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
 842                return;
 843
 844        err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
 845
 846        /* Don't report error if the graft is part of destroy operation. */
 847        if (!err || !new || new == &noop_qdisc)
 848                return;
 849
 850        /* Don't report error if the parent, the old child and the new
 851         * one are not offloaded.
 852         */
 853        any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
 854        any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
 855        any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;
 856
 857        if (any_qdisc_is_offloaded)
 858                NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
 859}
 860EXPORT_SYMBOL(qdisc_offload_graft_helper);
 861
 862static void qdisc_offload_graft_root(struct net_device *dev,
 863                                     struct Qdisc *new, struct Qdisc *old,
 864                                     struct netlink_ext_ack *extack)
 865{
 866        struct tc_root_qopt_offload graft_offload = {
 867                .command        = TC_ROOT_GRAFT,
 868                .handle         = new ? new->handle : 0,
 869                .ingress        = (new && new->flags & TCQ_F_INGRESS) ||
 870                                  (old && old->flags & TCQ_F_INGRESS),
 871        };
 872
 873        qdisc_offload_graft_helper(dev, NULL, new, old,
 874                                   TC_SETUP_ROOT_QDISC, &graft_offload, extack);
 875}
 876
 877static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
 878                         u32 portid, u32 seq, u16 flags, int event)
 879{
 880        struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
 881        struct gnet_stats_queue __percpu *cpu_qstats = NULL;
 882        struct tcmsg *tcm;
 883        struct nlmsghdr  *nlh;
 884        unsigned char *b = skb_tail_pointer(skb);
 885        struct gnet_dump d;
 886        struct qdisc_size_table *stab;
 887        u32 block_index;
 888        __u32 qlen;
 889
 890        cond_resched();
 891        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
 892        if (!nlh)
 893                goto out_nlmsg_trim;
 894        tcm = nlmsg_data(nlh);
 895        tcm->tcm_family = AF_UNSPEC;
 896        tcm->tcm__pad1 = 0;
 897        tcm->tcm__pad2 = 0;
 898        tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
 899        tcm->tcm_parent = clid;
 900        tcm->tcm_handle = q->handle;
 901        tcm->tcm_info = refcount_read(&q->refcnt);
 902        if (nla_put_string(skb, TCA_KIND, q->ops->id))
 903                goto nla_put_failure;
 904        if (q->ops->ingress_block_get) {
 905                block_index = q->ops->ingress_block_get(q);
 906                if (block_index &&
 907                    nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
 908                        goto nla_put_failure;
 909        }
 910        if (q->ops->egress_block_get) {
 911                block_index = q->ops->egress_block_get(q);
 912                if (block_index &&
 913                    nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
 914                        goto nla_put_failure;
 915        }
 916        if (q->ops->dump && q->ops->dump(q, skb) < 0)
 917                goto nla_put_failure;
 918        if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
 919                goto nla_put_failure;
 920        qlen = qdisc_qlen_sum(q);
 921
 922        stab = rtnl_dereference(q->stab);
 923        if (stab && qdisc_dump_stab(skb, stab) < 0)
 924                goto nla_put_failure;
 925
 926        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
 927                                         NULL, &d, TCA_PAD) < 0)
 928                goto nla_put_failure;
 929
 930        if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
 931                goto nla_put_failure;
 932
 933        if (qdisc_is_percpu_stats(q)) {
 934                cpu_bstats = q->cpu_bstats;
 935                cpu_qstats = q->cpu_qstats;
 936        }
 937
 938        if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
 939                                  &d, cpu_bstats, &q->bstats) < 0 ||
 940            gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
 941            gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
 942                goto nla_put_failure;
 943
 944        if (gnet_stats_finish_copy(&d) < 0)
 945                goto nla_put_failure;
 946
 947        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
 948        return skb->len;
 949
 950out_nlmsg_trim:
 951nla_put_failure:
 952        nlmsg_trim(skb, b);
 953        return -1;
 954}
 955
 956static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
 957{
 958        if (q->flags & TCQ_F_BUILTIN)
 959                return true;
 960        if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
 961                return true;
 962
 963        return false;
 964}
 965
 966static int qdisc_notify(struct net *net, struct sk_buff *oskb,
 967                        struct nlmsghdr *n, u32 clid,
 968                        struct Qdisc *old, struct Qdisc *new)
 969{
 970        struct sk_buff *skb;
 971        u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
 972
 973        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
 974        if (!skb)
 975                return -ENOBUFS;
 976
 977        if (old && !tc_qdisc_dump_ignore(old, false)) {
 978                if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
 979                                  0, RTM_DELQDISC) < 0)
 980                        goto err_out;
 981        }
 982        if (new && !tc_qdisc_dump_ignore(new, false)) {
 983                if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
 984                                  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
 985                        goto err_out;
 986        }
 987
 988        if (skb->len)
 989                return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
 990                                      n->nlmsg_flags & NLM_F_ECHO);
 991
 992err_out:
 993        kfree_skb(skb);
 994        return -EINVAL;
 995}
 996
 997static void notify_and_destroy(struct net *net, struct sk_buff *skb,
 998                               struct nlmsghdr *n, u32 clid,
 999                               struct Qdisc *old, struct Qdisc *new)
1000{
1001        if (new || old)
1002                qdisc_notify(net, skb, n, clid, old, new);
1003
1004        if (old)
1005                qdisc_put(old);
1006}
1007
1008static void qdisc_clear_nolock(struct Qdisc *sch)
1009{
1010        sch->flags &= ~TCQ_F_NOLOCK;
1011        if (!(sch->flags & TCQ_F_CPUSTATS))
1012                return;
1013
1014        free_percpu(sch->cpu_bstats);
1015        free_percpu(sch->cpu_qstats);
1016        sch->cpu_bstats = NULL;
1017        sch->cpu_qstats = NULL;
1018        sch->flags &= ~TCQ_F_CPUSTATS;
1019}
1020
1021/* Graft qdisc "new" to class "classid" of qdisc "parent" or
1022 * to device "dev".
1023 *
1024 * When appropriate send a netlink notification using 'skb'
1025 * and "n".
1026 *
1027 * On success, destroy old qdisc.
1028 */
1029
1030static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
1031                       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
1032                       struct Qdisc *new, struct Qdisc *old,
1033                       struct netlink_ext_ack *extack)
1034{
1035        struct Qdisc *q = old;
1036        struct net *net = dev_net(dev);
1037
1038        if (parent == NULL) {
1039                unsigned int i, num_q, ingress;
1040
1041                ingress = 0;
1042                num_q = dev->num_tx_queues;
1043                if ((q && q->flags & TCQ_F_INGRESS) ||
1044                    (new && new->flags & TCQ_F_INGRESS)) {
1045                        num_q = 1;
1046                        ingress = 1;
1047                        if (!dev_ingress_queue(dev)) {
1048                                NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
1049                                return -ENOENT;
1050                        }
1051                }
1052
1053                if (dev->flags & IFF_UP)
1054                        dev_deactivate(dev);
1055
1056                qdisc_offload_graft_root(dev, new, old, extack);
1057
1058                if (new && new->ops->attach)
1059                        goto skip;
1060
1061                for (i = 0; i < num_q; i++) {
1062                        struct netdev_queue *dev_queue = dev_ingress_queue(dev);
1063
1064                        if (!ingress)
1065                                dev_queue = netdev_get_tx_queue(dev, i);
1066
1067                        old = dev_graft_qdisc(dev_queue, new);
1068                        if (new && i > 0)
1069                                qdisc_refcount_inc(new);
1070
1071                        if (!ingress)
1072                                qdisc_put(old);
1073                }
1074
1075skip:
1076                if (!ingress) {
1077                        notify_and_destroy(net, skb, n, classid,
1078                                           dev->qdisc, new);
1079                        if (new && !new->ops->attach)
1080                                qdisc_refcount_inc(new);
1081                        dev->qdisc = new ? : &noop_qdisc;
1082
1083                        if (new && new->ops->attach)
1084                                new->ops->attach(new);
1085                } else {
1086                        notify_and_destroy(net, skb, n, classid, old, new);
1087                }
1088
1089                if (dev->flags & IFF_UP)
1090                        dev_activate(dev);
1091        } else {
1092                const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
1093                unsigned long cl;
1094                int err;
1095
1096                /* Only support running class lockless if parent is lockless */
1097                if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK))
1098                        qdisc_clear_nolock(new);
1099
1100                if (!cops || !cops->graft)
1101                        return -EOPNOTSUPP;
1102
1103                cl = cops->find(parent, classid);
1104                if (!cl) {
1105                        NL_SET_ERR_MSG(extack, "Specified class not found");
1106                        return -ENOENT;
1107                }
1108
1109                err = cops->graft(parent, cl, new, &old, extack);
1110                if (err)
1111                        return err;
1112                notify_and_destroy(net, skb, n, classid, old, new);
1113        }
1114        return 0;
1115}
1116
1117static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1118                                   struct netlink_ext_ack *extack)
1119{
1120        u32 block_index;
1121
1122        if (tca[TCA_INGRESS_BLOCK]) {
1123                block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1124
1125                if (!block_index) {
1126                        NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1127                        return -EINVAL;
1128                }
1129                if (!sch->ops->ingress_block_set) {
1130                        NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1131                        return -EOPNOTSUPP;
1132                }
1133                sch->ops->ingress_block_set(sch, block_index);
1134        }
1135        if (tca[TCA_EGRESS_BLOCK]) {
1136                block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1137
1138                if (!block_index) {
1139                        NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1140                        return -EINVAL;
1141                }
1142                if (!sch->ops->egress_block_set) {
1143                        NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1144                        return -EOPNOTSUPP;
1145                }
1146                sch->ops->egress_block_set(sch, block_index);
1147        }
1148        return 0;
1149}
1150
1151/*
1152   Allocate and initialize new qdisc.
1153
1154   Parameters are passed via opt.
1155 */
1156
1157static struct Qdisc *qdisc_create(struct net_device *dev,
1158                                  struct netdev_queue *dev_queue,
1159                                  struct Qdisc *p, u32 parent, u32 handle,
1160                                  struct nlattr **tca, int *errp,
1161                                  struct netlink_ext_ack *extack)
1162{
1163        int err;
1164        struct nlattr *kind = tca[TCA_KIND];
1165        struct Qdisc *sch;
1166        struct Qdisc_ops *ops;
1167        struct qdisc_size_table *stab;
1168
1169        ops = qdisc_lookup_ops(kind);
1170#ifdef CONFIG_MODULES
1171        if (ops == NULL && kind != NULL) {
1172                char name[IFNAMSIZ];
1173                if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
1174                        /* We dropped the RTNL semaphore in order to
1175                         * perform the module load.  So, even if we
1176                         * succeeded in loading the module we have to
1177                         * tell the caller to replay the request.  We
1178                         * indicate this using -EAGAIN.
1179                         * We replay the request because the device may
1180                         * go away in the mean time.
1181                         */
1182                        rtnl_unlock();
1183                        request_module("sch_%s", name);
1184                        rtnl_lock();
1185                        ops = qdisc_lookup_ops(kind);
1186                        if (ops != NULL) {
1187                                /* We will try again qdisc_lookup_ops,
1188                                 * so don't keep a reference.
1189                                 */
1190                                module_put(ops->owner);
1191                                err = -EAGAIN;
1192                                goto err_out;
1193                        }
1194                }
1195        }
1196#endif
1197
1198        err = -ENOENT;
1199        if (!ops) {
1200                NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1201                goto err_out;
1202        }
1203
1204        sch = qdisc_alloc(dev_queue, ops, extack);
1205        if (IS_ERR(sch)) {
1206                err = PTR_ERR(sch);
1207                goto err_out2;
1208        }
1209
1210        sch->parent = parent;
1211
1212        if (handle == TC_H_INGRESS) {
1213                sch->flags |= TCQ_F_INGRESS;
1214                handle = TC_H_MAKE(TC_H_INGRESS, 0);
1215        } else {
1216                if (handle == 0) {
1217                        handle = qdisc_alloc_handle(dev);
1218                        if (handle == 0) {
1219                                NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
1220                                err = -ENOSPC;
1221                                goto err_out3;
1222                        }
1223                }
1224                if (!netif_is_multiqueue(dev))
1225                        sch->flags |= TCQ_F_ONETXQUEUE;
1226        }
1227
1228        sch->handle = handle;
1229
1230        /* This exist to keep backward compatible with a userspace
1231         * loophole, what allowed userspace to get IFF_NO_QUEUE
1232         * facility on older kernels by setting tx_queue_len=0 (prior
1233         * to qdisc init), and then forgot to reinit tx_queue_len
1234         * before again attaching a qdisc.
1235         */
1236        if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
1237                dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
1238                netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
1239        }
1240
1241        err = qdisc_block_indexes_set(sch, tca, extack);
1242        if (err)
1243                goto err_out3;
1244
1245        if (ops->init) {
1246                err = ops->init(sch, tca[TCA_OPTIONS], extack);
1247                if (err != 0)
1248                        goto err_out5;
1249        }
1250
1251        if (tca[TCA_STAB]) {
1252                stab = qdisc_get_stab(tca[TCA_STAB], extack);
1253                if (IS_ERR(stab)) {
1254                        err = PTR_ERR(stab);
1255                        goto err_out4;
1256                }
1257                rcu_assign_pointer(sch->stab, stab);
1258        }
1259        if (tca[TCA_RATE]) {
1260                seqcount_t *running;
1261
1262                err = -EOPNOTSUPP;
1263                if (sch->flags & TCQ_F_MQROOT) {
1264                        NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
1265                        goto err_out4;
1266                }
1267
1268                if (sch->parent != TC_H_ROOT &&
1269                    !(sch->flags & TCQ_F_INGRESS) &&
1270                    (!p || !(p->flags & TCQ_F_MQROOT)))
1271                        running = qdisc_root_sleeping_running(sch);
1272                else
1273                        running = &sch->running;
1274
1275                err = gen_new_estimator(&sch->bstats,
1276                                        sch->cpu_bstats,
1277                                        &sch->rate_est,
1278                                        NULL,
1279                                        running,
1280                                        tca[TCA_RATE]);
1281                if (err) {
1282                        NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
1283                        goto err_out4;
1284                }
1285        }
1286
1287        qdisc_hash_add(sch, false);
1288        trace_qdisc_create(ops, dev, parent);
1289
1290        return sch;
1291
1292err_out5:
1293        /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
1294        if (ops->destroy)
1295                ops->destroy(sch);
1296err_out3:
1297        dev_put(dev);
1298        qdisc_free(sch);
1299err_out2:
1300        module_put(ops->owner);
1301err_out:
1302        *errp = err;
1303        return NULL;
1304
1305err_out4:
1306        /*
1307         * Any broken qdiscs that would require a ops->reset() here?
1308         * The qdisc was never in action so it shouldn't be necessary.
1309         */
1310        qdisc_put_stab(rtnl_dereference(sch->stab));
1311        if (ops->destroy)
1312                ops->destroy(sch);
1313        goto err_out3;
1314}
1315
1316static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1317                        struct netlink_ext_ack *extack)
1318{
1319        struct qdisc_size_table *ostab, *stab = NULL;
1320        int err = 0;
1321
1322        if (tca[TCA_OPTIONS]) {
1323                if (!sch->ops->change) {
1324                        NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
1325                        return -EINVAL;
1326                }
1327                if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1328                        NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
1329                        return -EOPNOTSUPP;
1330                }
1331                err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1332                if (err)
1333                        return err;
1334        }
1335
1336        if (tca[TCA_STAB]) {
1337                stab = qdisc_get_stab(tca[TCA_STAB], extack);
1338                if (IS_ERR(stab))
1339                        return PTR_ERR(stab);
1340        }
1341
1342        ostab = rtnl_dereference(sch->stab);
1343        rcu_assign_pointer(sch->stab, stab);
1344        qdisc_put_stab(ostab);
1345
1346        if (tca[TCA_RATE]) {
1347                /* NB: ignores errors from replace_estimator
1348                   because change can't be undone. */
1349                if (sch->flags & TCQ_F_MQROOT)
1350                        goto out;
1351                gen_replace_estimator(&sch->bstats,
1352                                      sch->cpu_bstats,
1353                                      &sch->rate_est,
1354                                      NULL,
1355                                      qdisc_root_sleeping_running(sch),
1356                                      tca[TCA_RATE]);
1357        }
1358out:
1359        return 0;
1360}
1361
1362struct check_loop_arg {
1363        struct qdisc_walker     w;
1364        struct Qdisc            *p;
1365        int                     depth;
1366};
1367
1368static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1369                         struct qdisc_walker *w);
1370
1371static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1372{
1373        struct check_loop_arg   arg;
1374
1375        if (q->ops->cl_ops == NULL)
1376                return 0;
1377
1378        arg.w.stop = arg.w.skip = arg.w.count = 0;
1379        arg.w.fn = check_loop_fn;
1380        arg.depth = depth;
1381        arg.p = p;
1382        q->ops->cl_ops->walk(q, &arg.w);
1383        return arg.w.stop ? -ELOOP : 0;
1384}
1385
1386static int
1387check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1388{
1389        struct Qdisc *leaf;
1390        const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1391        struct check_loop_arg *arg = (struct check_loop_arg *)w;
1392
1393        leaf = cops->leaf(q, cl);
1394        if (leaf) {
1395                if (leaf == arg->p || arg->depth > 7)
1396                        return -ELOOP;
1397                return check_loop(leaf, arg->p, arg->depth + 1);
1398        }
1399        return 0;
1400}
1401
1402const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
1403        [TCA_KIND]              = { .type = NLA_STRING },
1404        [TCA_RATE]              = { .type = NLA_BINARY,
1405                                    .len = sizeof(struct tc_estimator) },
1406        [TCA_STAB]              = { .type = NLA_NESTED },
1407        [TCA_DUMP_INVISIBLE]    = { .type = NLA_FLAG },
1408        [TCA_CHAIN]             = { .type = NLA_U32 },
1409        [TCA_INGRESS_BLOCK]     = { .type = NLA_U32 },
1410        [TCA_EGRESS_BLOCK]      = { .type = NLA_U32 },
1411};
1412
1413/*
1414 * Delete/get qdisc.
1415 */
1416
1417static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1418                        struct netlink_ext_ack *extack)
1419{
1420        struct net *net = sock_net(skb->sk);
1421        struct tcmsg *tcm = nlmsg_data(n);
1422        struct nlattr *tca[TCA_MAX + 1];
1423        struct net_device *dev;
1424        u32 clid;
1425        struct Qdisc *q = NULL;
1426        struct Qdisc *p = NULL;
1427        int err;
1428
1429        if ((n->nlmsg_type != RTM_GETQDISC) &&
1430            !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1431                return -EPERM;
1432
1433        err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1434                                     rtm_tca_policy, extack);
1435        if (err < 0)
1436                return err;
1437
1438        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1439        if (!dev)
1440                return -ENODEV;
1441
1442        clid = tcm->tcm_parent;
1443        if (clid) {
1444                if (clid != TC_H_ROOT) {
1445                        if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1446                                p = qdisc_lookup(dev, TC_H_MAJ(clid));
1447                                if (!p) {
1448                                        NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
1449                                        return -ENOENT;
1450                                }
1451                                q = qdisc_leaf(p, clid);
1452                        } else if (dev_ingress_queue(dev)) {
1453                                q = dev_ingress_queue(dev)->qdisc_sleeping;
1454                        }
1455                } else {
1456                        q = dev->qdisc;
1457                }
1458                if (!q) {
1459                        NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
1460                        return -ENOENT;
1461                }
1462
1463                if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
1464                        NL_SET_ERR_MSG(extack, "Invalid handle");
1465                        return -EINVAL;
1466                }
1467        } else {
1468                q = qdisc_lookup(dev, tcm->tcm_handle);
1469                if (!q) {
1470                        NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
1471                        return -ENOENT;
1472                }
1473        }
1474
1475        if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1476                NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1477                return -EINVAL;
1478        }
1479
1480        if (n->nlmsg_type == RTM_DELQDISC) {
1481                if (!clid) {
1482                        NL_SET_ERR_MSG(extack, "Classid cannot be zero");
1483                        return -EINVAL;
1484                }
1485                if (q->handle == 0) {
1486                        NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
1487                        return -ENOENT;
1488                }
1489                err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
1490                if (err != 0)
1491                        return err;
1492        } else {
1493                qdisc_notify(net, skb, n, clid, NULL, q);
1494        }
1495        return 0;
1496}
1497
1498/*
1499 * Create/change qdisc.
1500 */
1501
1502static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1503                           struct netlink_ext_ack *extack)
1504{
1505        struct net *net = sock_net(skb->sk);
1506        struct tcmsg *tcm;
1507        struct nlattr *tca[TCA_MAX + 1];
1508        struct net_device *dev;
1509        u32 clid;
1510        struct Qdisc *q, *p;
1511        int err;
1512
1513        if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1514                return -EPERM;
1515
1516replay:
1517        /* Reinit, just in case something touches this. */
1518        err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1519                                     rtm_tca_policy, extack);
1520        if (err < 0)
1521                return err;
1522
1523        tcm = nlmsg_data(n);
1524        clid = tcm->tcm_parent;
1525        q = p = NULL;
1526
1527        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1528        if (!dev)
1529                return -ENODEV;
1530
1531
1532        if (clid) {
1533                if (clid != TC_H_ROOT) {
1534                        if (clid != TC_H_INGRESS) {
1535                                p = qdisc_lookup(dev, TC_H_MAJ(clid));
1536                                if (!p) {
1537                                        NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1538                                        return -ENOENT;
1539                                }
1540                                q = qdisc_leaf(p, clid);
1541                        } else if (dev_ingress_queue_create(dev)) {
1542                                q = dev_ingress_queue(dev)->qdisc_sleeping;
1543                        }
1544                } else {
1545                        q = dev->qdisc;
1546                }
1547
1548                /* It may be default qdisc, ignore it */
1549                if (q && q->handle == 0)
1550                        q = NULL;
1551
1552                if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1553                        if (tcm->tcm_handle) {
1554                                if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1555                                        NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
1556                                        return -EEXIST;
1557                                }
1558                                if (TC_H_MIN(tcm->tcm_handle)) {
1559                                        NL_SET_ERR_MSG(extack, "Invalid minor handle");
1560                                        return -EINVAL;
1561                                }
1562                                q = qdisc_lookup(dev, tcm->tcm_handle);
1563                                if (!q)
1564                                        goto create_n_graft;
1565                                if (n->nlmsg_flags & NLM_F_EXCL) {
1566                                        NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1567                                        return -EEXIST;
1568                                }
1569                                if (tca[TCA_KIND] &&
1570                                    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1571                                        NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1572                                        return -EINVAL;
1573                                }
1574                                if (q == p ||
1575                                    (p && check_loop(q, p, 0))) {
1576                                        NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1577                                        return -ELOOP;
1578                                }
1579                                qdisc_refcount_inc(q);
1580                                goto graft;
1581                        } else {
1582                                if (!q)
1583                                        goto create_n_graft;
1584
1585                                /* This magic test requires explanation.
1586                                 *
1587                                 *   We know, that some child q is already
1588                                 *   attached to this parent and have choice:
1589                                 *   either to change it or to create/graft new one.
1590                                 *
1591                                 *   1. We are allowed to create/graft only
1592                                 *   if CREATE and REPLACE flags are set.
1593                                 *
1594                                 *   2. If EXCL is set, requestor wanted to say,
1595                                 *   that qdisc tcm_handle is not expected
1596                                 *   to exist, so that we choose create/graft too.
1597                                 *
1598                                 *   3. The last case is when no flags are set.
1599                                 *   Alas, it is sort of hole in API, we
1600                                 *   cannot decide what to do unambiguously.
1601                                 *   For now we select create/graft, if
1602                                 *   user gave KIND, which does not match existing.
1603                                 */
1604                                if ((n->nlmsg_flags & NLM_F_CREATE) &&
1605                                    (n->nlmsg_flags & NLM_F_REPLACE) &&
1606                                    ((n->nlmsg_flags & NLM_F_EXCL) ||
1607                                     (tca[TCA_KIND] &&
1608                                      nla_strcmp(tca[TCA_KIND], q->ops->id))))
1609                                        goto create_n_graft;
1610                        }
1611                }
1612        } else {
1613                if (!tcm->tcm_handle) {
1614                        NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1615                        return -EINVAL;
1616                }
1617                q = qdisc_lookup(dev, tcm->tcm_handle);
1618        }
1619
1620        /* Change qdisc parameters */
1621        if (!q) {
1622                NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1623                return -ENOENT;
1624        }
1625        if (n->nlmsg_flags & NLM_F_EXCL) {
1626                NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1627                return -EEXIST;
1628        }
1629        if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1630                NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1631                return -EINVAL;
1632        }
1633        err = qdisc_change(q, tca, extack);
1634        if (err == 0)
1635                qdisc_notify(net, skb, n, clid, NULL, q);
1636        return err;
1637
1638create_n_graft:
1639        if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1640                NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1641                return -ENOENT;
1642        }
1643        if (clid == TC_H_INGRESS) {
1644                if (dev_ingress_queue(dev)) {
1645                        q = qdisc_create(dev, dev_ingress_queue(dev), p,
1646                                         tcm->tcm_parent, tcm->tcm_parent,
1647                                         tca, &err, extack);
1648                } else {
1649                        NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1650                        err = -ENOENT;
1651                }
1652        } else {
1653                struct netdev_queue *dev_queue;
1654
1655                if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1656                        dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1657                else if (p)
1658                        dev_queue = p->dev_queue;
1659                else
1660                        dev_queue = netdev_get_tx_queue(dev, 0);
1661
1662                q = qdisc_create(dev, dev_queue, p,
1663                                 tcm->tcm_parent, tcm->tcm_handle,
1664                                 tca, &err, extack);
1665        }
1666        if (q == NULL) {
1667                if (err == -EAGAIN)
1668                        goto replay;
1669                return err;
1670        }
1671
1672graft:
1673        err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1674        if (err) {
1675                if (q)
1676                        qdisc_put(q);
1677                return err;
1678        }
1679
1680        return 0;
1681}
1682
1683static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1684                              struct netlink_callback *cb,
1685                              int *q_idx_p, int s_q_idx, bool recur,
1686                              bool dump_invisible)
1687{
1688        int ret = 0, q_idx = *q_idx_p;
1689        struct Qdisc *q;
1690        int b;
1691
1692        if (!root)
1693                return 0;
1694
1695        q = root;
1696        if (q_idx < s_q_idx) {
1697                q_idx++;
1698        } else {
1699                if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1700                    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1701                                  cb->nlh->nlmsg_seq, NLM_F_MULTI,
1702                                  RTM_NEWQDISC) <= 0)
1703                        goto done;
1704                q_idx++;
1705        }
1706
1707        /* If dumping singletons, there is no qdisc_dev(root) and the singleton
1708         * itself has already been dumped.
1709         *
1710         * If we've already dumped the top-level (ingress) qdisc above and the global
1711         * qdisc hashtable, we don't want to hit it again
1712         */
1713        if (!qdisc_dev(root) || !recur)
1714                goto out;
1715
1716        hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1717                if (q_idx < s_q_idx) {
1718                        q_idx++;
1719                        continue;
1720                }
1721                if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1722                    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1723                                  cb->nlh->nlmsg_seq, NLM_F_MULTI,
1724                                  RTM_NEWQDISC) <= 0)
1725                        goto done;
1726                q_idx++;
1727        }
1728
1729out:
1730        *q_idx_p = q_idx;
1731        return ret;
1732done:
1733        ret = -1;
1734        goto out;
1735}
1736
1737static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1738{
1739        struct net *net = sock_net(skb->sk);
1740        int idx, q_idx;
1741        int s_idx, s_q_idx;
1742        struct net_device *dev;
1743        const struct nlmsghdr *nlh = cb->nlh;
1744        struct nlattr *tca[TCA_MAX + 1];
1745        int err;
1746
1747        s_idx = cb->args[0];
1748        s_q_idx = q_idx = cb->args[1];
1749
1750        idx = 0;
1751        ASSERT_RTNL();
1752
1753        err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
1754                                     rtm_tca_policy, cb->extack);
1755        if (err < 0)
1756                return err;
1757
1758        for_each_netdev(net, dev) {
1759                struct netdev_queue *dev_queue;
1760
1761                if (idx < s_idx)
1762                        goto cont;
1763                if (idx > s_idx)
1764                        s_q_idx = 0;
1765                q_idx = 0;
1766
1767                if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
1768                                       true, tca[TCA_DUMP_INVISIBLE]) < 0)
1769                        goto done;
1770
1771                dev_queue = dev_ingress_queue(dev);
1772                if (dev_queue &&
1773                    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1774                                       &q_idx, s_q_idx, false,
1775                                       tca[TCA_DUMP_INVISIBLE]) < 0)
1776                        goto done;
1777
1778cont:
1779                idx++;
1780        }
1781
1782done:
1783        cb->args[0] = idx;
1784        cb->args[1] = q_idx;
1785
1786        return skb->len;
1787}
1788
1789
1790
1791/************************************************
1792 *      Traffic classes manipulation.           *
1793 ************************************************/
1794
1795static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1796                          unsigned long cl,
1797                          u32 portid, u32 seq, u16 flags, int event)
1798{
1799        struct tcmsg *tcm;
1800        struct nlmsghdr  *nlh;
1801        unsigned char *b = skb_tail_pointer(skb);
1802        struct gnet_dump d;
1803        const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1804
1805        cond_resched();
1806        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1807        if (!nlh)
1808                goto out_nlmsg_trim;
1809        tcm = nlmsg_data(nlh);
1810        tcm->tcm_family = AF_UNSPEC;
1811        tcm->tcm__pad1 = 0;
1812        tcm->tcm__pad2 = 0;
1813        tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1814        tcm->tcm_parent = q->handle;
1815        tcm->tcm_handle = q->handle;
1816        tcm->tcm_info = 0;
1817        if (nla_put_string(skb, TCA_KIND, q->ops->id))
1818                goto nla_put_failure;
1819        if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1820                goto nla_put_failure;
1821
1822        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1823                                         NULL, &d, TCA_PAD) < 0)
1824                goto nla_put_failure;
1825
1826        if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1827                goto nla_put_failure;
1828
1829        if (gnet_stats_finish_copy(&d) < 0)
1830                goto nla_put_failure;
1831
1832        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1833        return skb->len;
1834
1835out_nlmsg_trim:
1836nla_put_failure:
1837        nlmsg_trim(skb, b);
1838        return -1;
1839}
1840
1841static int tclass_notify(struct net *net, struct sk_buff *oskb,
1842                         struct nlmsghdr *n, struct Qdisc *q,
1843                         unsigned long cl, int event)
1844{
1845        struct sk_buff *skb;
1846        u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1847        int err = 0;
1848
1849        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1850        if (!skb)
1851                return -ENOBUFS;
1852
1853        if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1854                kfree_skb(skb);
1855                return -EINVAL;
1856        }
1857
1858        err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1859                             n->nlmsg_flags & NLM_F_ECHO);
1860        if (err > 0)
1861                err = 0;
1862        return err;
1863}
1864
1865static int tclass_del_notify(struct net *net,
1866                             const struct Qdisc_class_ops *cops,
1867                             struct sk_buff *oskb, struct nlmsghdr *n,
1868                             struct Qdisc *q, unsigned long cl)
1869{
1870        u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1871        struct sk_buff *skb;
1872        int err = 0;
1873
1874        if (!cops->delete)
1875                return -EOPNOTSUPP;
1876
1877        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1878        if (!skb)
1879                return -ENOBUFS;
1880
1881        if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1882                           RTM_DELTCLASS) < 0) {
1883                kfree_skb(skb);
1884                return -EINVAL;
1885        }
1886
1887        err = cops->delete(q, cl);
1888        if (err) {
1889                kfree_skb(skb);
1890                return err;
1891        }
1892
1893        err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1894                             n->nlmsg_flags & NLM_F_ECHO);
1895        if (err > 0)
1896                err = 0;
1897        return err;
1898}
1899
1900#ifdef CONFIG_NET_CLS
1901
1902struct tcf_bind_args {
1903        struct tcf_walker w;
1904        unsigned long base;
1905        unsigned long cl;
1906        u32 classid;
1907};
1908
1909static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1910{
1911        struct tcf_bind_args *a = (void *)arg;
1912
1913        if (tp->ops->bind_class) {
1914                struct Qdisc *q = tcf_block_q(tp->chain->block);
1915
1916                sch_tree_lock(q);
1917                tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
1918                sch_tree_unlock(q);
1919        }
1920        return 0;
1921}
1922
1923struct tc_bind_class_args {
1924        struct qdisc_walker w;
1925        unsigned long new_cl;
1926        u32 portid;
1927        u32 clid;
1928};
1929
1930static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
1931                                struct qdisc_walker *w)
1932{
1933        struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
1934        const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1935        struct tcf_block *block;
1936        struct tcf_chain *chain;
1937
1938        block = cops->tcf_block(q, cl, NULL);
1939        if (!block)
1940                return 0;
1941        for (chain = tcf_get_next_chain(block, NULL);
1942             chain;
1943             chain = tcf_get_next_chain(block, chain)) {
1944                struct tcf_proto *tp;
1945
1946                for (tp = tcf_get_next_proto(chain, NULL, true);
1947                     tp; tp = tcf_get_next_proto(chain, tp, true)) {
1948                        struct tcf_bind_args arg = {};
1949
1950                        arg.w.fn = tcf_node_bind;
1951                        arg.classid = a->clid;
1952                        arg.base = cl;
1953                        arg.cl = a->new_cl;
1954                        tp->ops->walk(tp, &arg.w, true);
1955                }
1956        }
1957
1958        return 0;
1959}
1960
1961static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1962                           unsigned long new_cl)
1963{
1964        const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1965        struct tc_bind_class_args args = {};
1966
1967        if (!cops->tcf_block)
1968                return;
1969        args.portid = portid;
1970        args.clid = clid;
1971        args.new_cl = new_cl;
1972        args.w.fn = tc_bind_class_walker;
1973        q->ops->cl_ops->walk(q, &args.w);
1974}
1975
1976#else
1977
1978static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1979                           unsigned long new_cl)
1980{
1981}
1982
1983#endif
1984
1985static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
1986                         struct netlink_ext_ack *extack)
1987{
1988        struct net *net = sock_net(skb->sk);
1989        struct tcmsg *tcm = nlmsg_data(n);
1990        struct nlattr *tca[TCA_MAX + 1];
1991        struct net_device *dev;
1992        struct Qdisc *q = NULL;
1993        const struct Qdisc_class_ops *cops;
1994        unsigned long cl = 0;
1995        unsigned long new_cl;
1996        u32 portid;
1997        u32 clid;
1998        u32 qid;
1999        int err;
2000
2001        if ((n->nlmsg_type != RTM_GETTCLASS) &&
2002            !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
2003                return -EPERM;
2004
2005        err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
2006                                     rtm_tca_policy, extack);
2007        if (err < 0)
2008                return err;
2009
2010        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
2011        if (!dev)
2012                return -ENODEV;
2013
2014        /*
2015           parent == TC_H_UNSPEC - unspecified parent.
2016           parent == TC_H_ROOT   - class is root, which has no parent.
2017           parent == X:0         - parent is root class.
2018           parent == X:Y         - parent is a node in hierarchy.
2019           parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
2020
2021           handle == 0:0         - generate handle from kernel pool.
2022           handle == 0:Y         - class is X:Y, where X:0 is qdisc.
2023           handle == X:Y         - clear.
2024           handle == X:0         - root class.
2025         */
2026
2027        /* Step 1. Determine qdisc handle X:0 */
2028
2029        portid = tcm->tcm_parent;
2030        clid = tcm->tcm_handle;
2031        qid = TC_H_MAJ(clid);
2032
2033        if (portid != TC_H_ROOT) {
2034                u32 qid1 = TC_H_MAJ(portid);
2035
2036                if (qid && qid1) {
2037                        /* If both majors are known, they must be identical. */
2038                        if (qid != qid1)
2039                                return -EINVAL;
2040                } else if (qid1) {
2041                        qid = qid1;
2042                } else if (qid == 0)
2043                        qid = dev->qdisc->handle;
2044
2045                /* Now qid is genuine qdisc handle consistent
2046                 * both with parent and child.
2047                 *
2048                 * TC_H_MAJ(portid) still may be unspecified, complete it now.
2049                 */
2050                if (portid)
2051                        portid = TC_H_MAKE(qid, portid);
2052        } else {
2053                if (qid == 0)
2054                        qid = dev->qdisc->handle;
2055        }
2056
2057        /* OK. Locate qdisc */
2058        q = qdisc_lookup(dev, qid);
2059        if (!q)
2060                return -ENOENT;
2061
2062        /* An check that it supports classes */
2063        cops = q->ops->cl_ops;
2064        if (cops == NULL)
2065                return -EINVAL;
2066
2067        /* Now try to get class */
2068        if (clid == 0) {
2069                if (portid == TC_H_ROOT)
2070                        clid = qid;
2071        } else
2072                clid = TC_H_MAKE(qid, clid);
2073
2074        if (clid)
2075                cl = cops->find(q, clid);
2076
2077        if (cl == 0) {
2078                err = -ENOENT;
2079                if (n->nlmsg_type != RTM_NEWTCLASS ||
2080                    !(n->nlmsg_flags & NLM_F_CREATE))
2081                        goto out;
2082        } else {
2083                switch (n->nlmsg_type) {
2084                case RTM_NEWTCLASS:
2085                        err = -EEXIST;
2086                        if (n->nlmsg_flags & NLM_F_EXCL)
2087                                goto out;
2088                        break;
2089                case RTM_DELTCLASS:
2090                        err = tclass_del_notify(net, cops, skb, n, q, cl);
2091                        /* Unbind the class with flilters with 0 */
2092                        tc_bind_tclass(q, portid, clid, 0);
2093                        goto out;
2094                case RTM_GETTCLASS:
2095                        err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
2096                        goto out;
2097                default:
2098                        err = -EINVAL;
2099                        goto out;
2100                }
2101        }
2102
2103        if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
2104                NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
2105                return -EOPNOTSUPP;
2106        }
2107
2108        new_cl = cl;
2109        err = -EOPNOTSUPP;
2110        if (cops->change)
2111                err = cops->change(q, clid, portid, tca, &new_cl, extack);
2112        if (err == 0) {
2113                tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
2114                /* We just create a new class, need to do reverse binding. */
2115                if (cl != new_cl)
2116                        tc_bind_tclass(q, portid, clid, new_cl);
2117        }
2118out:
2119        return err;
2120}
2121
2122struct qdisc_dump_args {
2123        struct qdisc_walker     w;
2124        struct sk_buff          *skb;
2125        struct netlink_callback *cb;
2126};
2127
2128static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2129                            struct qdisc_walker *arg)
2130{
2131        struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2132
2133        return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2134                              a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2135                              RTM_NEWTCLASS);
2136}
2137
2138static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
2139                                struct tcmsg *tcm, struct netlink_callback *cb,
2140                                int *t_p, int s_t)
2141{
2142        struct qdisc_dump_args arg;
2143
2144        if (tc_qdisc_dump_ignore(q, false) ||
2145            *t_p < s_t || !q->ops->cl_ops ||
2146            (tcm->tcm_parent &&
2147             TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
2148                (*t_p)++;
2149                return 0;
2150        }
2151        if (*t_p > s_t)
2152                memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
2153        arg.w.fn = qdisc_class_dump;
2154        arg.skb = skb;
2155        arg.cb = cb;
2156        arg.w.stop  = 0;
2157        arg.w.skip = cb->args[1];
2158        arg.w.count = 0;
2159        q->ops->cl_ops->walk(q, &arg.w);
2160        cb->args[1] = arg.w.count;
2161        if (arg.w.stop)
2162                return -1;
2163        (*t_p)++;
2164        return 0;
2165}
2166
2167static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
2168                               struct tcmsg *tcm, struct netlink_callback *cb,
2169                               int *t_p, int s_t)
2170{
2171        struct Qdisc *q;
2172        int b;
2173
2174        if (!root)
2175                return 0;
2176
2177        if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
2178                return -1;
2179
2180        if (!qdisc_dev(root))
2181                return 0;
2182
2183        if (tcm->tcm_parent) {
2184                q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2185                if (q && q != root &&
2186                    tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2187                        return -1;
2188                return 0;
2189        }
2190        hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2191                if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2192                        return -1;
2193        }
2194
2195        return 0;
2196}
2197
2198static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2199{
2200        struct tcmsg *tcm = nlmsg_data(cb->nlh);
2201        struct net *net = sock_net(skb->sk);
2202        struct netdev_queue *dev_queue;
2203        struct net_device *dev;
2204        int t, s_t;
2205
2206        if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2207                return 0;
2208        dev = dev_get_by_index(net, tcm->tcm_ifindex);
2209        if (!dev)
2210                return 0;
2211
2212        s_t = cb->args[0];
2213        t = 0;
2214
2215        if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
2216                goto done;
2217
2218        dev_queue = dev_ingress_queue(dev);
2219        if (dev_queue &&
2220            tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
2221                                &t, s_t) < 0)
2222                goto done;
2223
2224done:
2225        cb->args[0] = t;
2226
2227        dev_put(dev);
2228        return skb->len;
2229}
2230
2231#ifdef CONFIG_PROC_FS
2232static int psched_show(struct seq_file *seq, void *v)
2233{
2234        seq_printf(seq, "%08x %08x %08x %08x\n",
2235                   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2236                   1000000,
2237                   (u32)NSEC_PER_SEC / hrtimer_resolution);
2238
2239        return 0;
2240}
2241
2242static int __net_init psched_net_init(struct net *net)
2243{
2244        struct proc_dir_entry *e;
2245
2246        e = proc_create_single("psched", 0, net->proc_net, psched_show);
2247        if (e == NULL)
2248                return -ENOMEM;
2249
2250        return 0;
2251}
2252
2253static void __net_exit psched_net_exit(struct net *net)
2254{
2255        remove_proc_entry("psched", net->proc_net);
2256}
2257#else
2258static int __net_init psched_net_init(struct net *net)
2259{
2260        return 0;
2261}
2262
2263static void __net_exit psched_net_exit(struct net *net)
2264{
2265}
2266#endif
2267
2268static struct pernet_operations psched_net_ops = {
2269        .init = psched_net_init,
2270        .exit = psched_net_exit,
2271};
2272
2273static int __init pktsched_init(void)
2274{
2275        int err;
2276
2277        err = register_pernet_subsys(&psched_net_ops);
2278        if (err) {
2279                pr_err("pktsched_init: "
2280                       "cannot initialize per netns operations\n");
2281                return err;
2282        }
2283
2284        register_qdisc(&pfifo_fast_ops);
2285        register_qdisc(&pfifo_qdisc_ops);
2286        register_qdisc(&bfifo_qdisc_ops);
2287        register_qdisc(&pfifo_head_drop_qdisc_ops);
2288        register_qdisc(&mq_qdisc_ops);
2289        register_qdisc(&noqueue_qdisc_ops);
2290
2291        rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
2292        rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
2293        rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
2294                      0);
2295        rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
2296        rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
2297        rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
2298                      0);
2299
2300        return 0;
2301}
2302
2303subsys_initcall(pktsched_init);
2304