linux/net/sched/sch_api.c
<<
>>
Prefs
   1/*
   2 * net/sched/sch_api.c  Packet scheduler API.
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10 *
  11 * Fixes:
  12 *
  13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
  14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
  15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
  16 */
  17
  18#include <linux/module.h>
  19#include <linux/types.h>
  20#include <linux/kernel.h>
  21#include <linux/string.h>
  22#include <linux/errno.h>
  23#include <linux/skbuff.h>
  24#include <linux/init.h>
  25#include <linux/proc_fs.h>
  26#include <linux/seq_file.h>
  27#include <linux/kmod.h>
  28#include <linux/list.h>
  29#include <linux/hrtimer.h>
  30#include <linux/slab.h>
  31#include <linux/hashtable.h>
  32
  33#include <net/net_namespace.h>
  34#include <net/sock.h>
  35#include <net/netlink.h>
  36#include <net/pkt_sched.h>
  37#include <net/pkt_cls.h>
  38
  39/*
  40
  41   Short review.
  42   -------------
  43
  44   This file consists of two interrelated parts:
  45
  46   1. queueing disciplines manager frontend.
  47   2. traffic classes manager frontend.
  48
  49   Generally, queueing discipline ("qdisc") is a black box,
  50   which is able to enqueue packets and to dequeue them (when
  51   device is ready to send something) in order and at times
  52   determined by algorithm hidden in it.
  53
  54   qdisc's are divided to two categories:
  55   - "queues", which have no internal structure visible from outside.
  56   - "schedulers", which split all the packets to "traffic classes",
  57     using "packet classifiers" (look at cls_api.c)
  58
  59   In turn, classes may have child qdiscs (as rule, queues)
  60   attached to them etc. etc. etc.
  61
  62   The goal of the routines in this file is to translate
  63   information supplied by user in the form of handles
  64   to more intelligible for kernel form, to make some sanity
  65   checks and part of work, which is common to all qdiscs
  66   and to provide rtnetlink notifications.
  67
  68   All real intelligent work is done inside qdisc modules.
  69
  70
  71
  72   Every discipline has two major routines: enqueue and dequeue.
  73
  74   ---dequeue
  75
  76   dequeue usually returns a skb to send. It is allowed to return NULL,
  77   but it does not mean that queue is empty, it just means that
  78   discipline does not want to send anything this time.
  79   Queue is really empty if q->q.qlen == 0.
  80   For complicated disciplines with multiple queues q->q is not
  81   real packet queue, but however q->q.qlen must be valid.
  82
  83   ---enqueue
  84
  85   enqueue returns 0, if packet was enqueued successfully.
  86   If packet (this one or another one) was dropped, it returns
  87   not zero error code.
  88   NET_XMIT_DROP        - this packet dropped
  89     Expected action: do not backoff, but wait until queue will clear.
  90   NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
  91     Expected action: backoff or ignore
  92
  93   Auxiliary routines:
  94
  95   ---peek
  96
  97   like dequeue but without removing a packet from the queue
  98
  99   ---reset
 100
 101   returns qdisc to initial state: purge all buffers, clear all
 102   timers, counters (except for statistics) etc.
 103
 104   ---init
 105
 106   initializes newly created qdisc.
 107
 108   ---destroy
 109
 110   destroys resources allocated by init and during lifetime of qdisc.
 111
 112   ---change
 113
 114   changes qdisc parameters.
 115 */
 116
/* Protects list of registered TC modules. It is pure SMP lock.
 * Also serializes updates of default_qdisc_ops (see qdisc_set_default()).
 */
static DEFINE_RWLOCK(qdisc_mod_lock);
 119
 120
 121/************************************************
 122 *      Queueing disciplines manipulation.      *
 123 ************************************************/
 124
 125
/* The list of all installed queueing disciplines.
 * Singly linked via Qdisc_ops::next, protected by qdisc_mod_lock.
 */
static struct Qdisc_ops *qdisc_base;
 129
 130/* Register/unregister queueing discipline */
 131
/* Register a queueing discipline with the packet scheduler core.
 *
 * Fills in noop defaults for missing enqueue/dequeue/peek ops and
 * validates that classful qdiscs supply the mandatory class ops.
 *
 * Returns 0 on success, -EEXIST if a qdisc with the same id is already
 * registered, or -EINVAL if the supplied ops are inconsistent.
 */
int register_qdisc(struct Qdisc_ops *qops)
{
        struct Qdisc_ops *q, **qp;
        int rc = -EEXIST;

        write_lock(&qdisc_mod_lock);
        /* Reject duplicate ids; qp ends up pointing at the tail link. */
        for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
                if (!strcmp(qops->id, q->id))
                        goto out;

        if (qops->enqueue == NULL)
                qops->enqueue = noop_qdisc_ops.enqueue;
        if (qops->peek == NULL) {
                /* A dequeue without a matching peek is not allowed. */
                if (qops->dequeue == NULL)
                        qops->peek = noop_qdisc_ops.peek;
                else
                        goto out_einval;
        }
        if (qops->dequeue == NULL)
                qops->dequeue = noop_qdisc_ops.dequeue;

        if (qops->cl_ops) {
                const struct Qdisc_class_ops *cops = qops->cl_ops;

                /* Classful qdiscs must implement the basic class ops. */
                if (!(cops->find && cops->walk && cops->leaf))
                        goto out_einval;

                /* Filter blocks require bind/unbind support. */
                if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
                        goto out_einval;
        }

        /* Append at the list tail. */
        qops->next = NULL;
        *qp = qops;
        rc = 0;
out:
        write_unlock(&qdisc_mod_lock);
        return rc;

out_einval:
        rc = -EINVAL;
        goto out;
}
EXPORT_SYMBOL(register_qdisc);
 175
 176int unregister_qdisc(struct Qdisc_ops *qops)
 177{
 178        struct Qdisc_ops *q, **qp;
 179        int err = -ENOENT;
 180
 181        write_lock(&qdisc_mod_lock);
 182        for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
 183                if (q == qops)
 184                        break;
 185        if (q) {
 186                *qp = q->next;
 187                q->next = NULL;
 188                err = 0;
 189        }
 190        write_unlock(&qdisc_mod_lock);
 191        return err;
 192}
 193EXPORT_SYMBOL(unregister_qdisc);
 194
/* Get default qdisc if not otherwise specified.
 * Copies the id of default_qdisc_ops into @name (at most @len bytes,
 * NUL-terminated), under qdisc_mod_lock.
 */
void qdisc_get_default(char *name, size_t len)
{
        read_lock(&qdisc_mod_lock);
        strlcpy(name, default_qdisc_ops->id, len);
        read_unlock(&qdisc_mod_lock);
}
 202
 203static struct Qdisc_ops *qdisc_lookup_default(const char *name)
 204{
 205        struct Qdisc_ops *q = NULL;
 206
 207        for (q = qdisc_base; q; q = q->next) {
 208                if (!strcmp(name, q->id)) {
 209                        if (!try_module_get(q->owner))
 210                                q = NULL;
 211                        break;
 212                }
 213        }
 214
 215        return q;
 216}
 217
/* Set new default qdisc to use.
 * Requires CAP_NET_ADMIN.  If the qdisc is not registered yet, the lock
 * is dropped to try loading the "sch_<name>" module, then re-taken and
 * the lookup retried.  Returns 0 on success, -EPERM or -ENOENT on error.
 */
int qdisc_set_default(const char *name)
{
        const struct Qdisc_ops *ops;

        if (!capable(CAP_NET_ADMIN))
                return -EPERM;

        write_lock(&qdisc_mod_lock);
        ops = qdisc_lookup_default(name);
        if (!ops) {
                /* Not found, drop lock and try to load module */
                write_unlock(&qdisc_mod_lock);
                request_module("sch_%s", name);
                write_lock(&qdisc_mod_lock);

                /* Re-scan: the module load may have registered it. */
                ops = qdisc_lookup_default(name);
        }

        if (ops) {
                /* Set new default; qdisc_lookup_default() already took a
                 * module reference on @ops, so only drop the old one.
                 */
                module_put(default_qdisc_ops->owner);
                default_qdisc_ops = ops;
        }
        write_unlock(&qdisc_mod_lock);

        return ops ? 0 : -ENOENT;
}
 246
#ifdef CONFIG_NET_SCH_DEFAULT
/* Set default value from kernel config */
static int __init sch_default_qdisc(void)
{
        return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
}
/* late_initcall: run after built-in qdiscs have had a chance to register. */
late_initcall(sch_default_qdisc);
#endif
 255
/* We know handle. Find qdisc among all qdisc's attached to device
 * (root qdisc, all its children, children of children etc.)
 * Note: caller either uses rtnl or rcu_read_lock()
 */

static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
{
        struct Qdisc *q;

        /* A root not attached to a device cannot be in any per-device
         * hash; match it directly.
         */
        if (!qdisc_dev(root))
                return (root->handle == handle ? root : NULL);

        if (!(root->flags & TCQ_F_BUILTIN) &&
            root->handle == handle)
                return root;

        /* Non-root qdiscs of the device are kept in qdisc_hash,
         * keyed by handle (see qdisc_hash_add()).
         */
        hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
                if (q->handle == handle)
                        return q;
        }
        return NULL;
}
 278
/* Insert @q into its device's qdisc hash so qdisc_lookup() can find it.
 * Root and ingress qdiscs are reachable directly and are not hashed.
 */
void qdisc_hash_add(struct Qdisc *q, bool invisible)
{
        if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
                ASSERT_RTNL();
                hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
                if (invisible)
                        /* Hidden from dumps unless explicitly requested. */
                        q->flags |= TCQ_F_INVISIBLE;
        }
}
EXPORT_SYMBOL(qdisc_hash_add);
 289
/* Remove @q from its device's qdisc hash; counterpart of qdisc_hash_add(). */
void qdisc_hash_del(struct Qdisc *q)
{
        if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
                ASSERT_RTNL();
                hash_del_rcu(&q->hash);
        }
}
EXPORT_SYMBOL(qdisc_hash_del);
 298
 299struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
 300{
 301        struct Qdisc *q;
 302
 303        if (!handle)
 304                return NULL;
 305        q = qdisc_match_from_root(dev->qdisc, handle);
 306        if (q)
 307                goto out;
 308
 309        if (dev_ingress_queue(dev))
 310                q = qdisc_match_from_root(
 311                        dev_ingress_queue(dev)->qdisc_sleeping,
 312                        handle);
 313out:
 314        return q;
 315}
 316
 317struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
 318{
 319        struct netdev_queue *nq;
 320        struct Qdisc *q;
 321
 322        if (!handle)
 323                return NULL;
 324        q = qdisc_match_from_root(dev->qdisc, handle);
 325        if (q)
 326                goto out;
 327
 328        nq = dev_ingress_queue_rcu(dev);
 329        if (nq)
 330                q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
 331out:
 332        return q;
 333}
 334
 335static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
 336{
 337        unsigned long cl;
 338        const struct Qdisc_class_ops *cops = p->ops->cl_ops;
 339
 340        if (cops == NULL)
 341                return NULL;
 342        cl = cops->find(p, classid);
 343
 344        if (cl == 0)
 345                return NULL;
 346        return cops->leaf(p, cl);
 347}
 348
 349/* Find queueing discipline by name */
 350
 351static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
 352{
 353        struct Qdisc_ops *q = NULL;
 354
 355        if (kind) {
 356                read_lock(&qdisc_mod_lock);
 357                for (q = qdisc_base; q; q = q->next) {
 358                        if (nla_strcmp(kind, q->id) == 0) {
 359                                if (!try_module_get(q->owner))
 360                                        q = NULL;
 361                                break;
 362                        }
 363                }
 364                read_unlock(&qdisc_mod_lock);
 365        }
 366        return q;
 367}
 368
 369/* The linklayer setting were not transferred from iproute2, in older
 370 * versions, and the rate tables lookup systems have been dropped in
 371 * the kernel. To keep backward compatible with older iproute2 tc
 372 * utils, we detect the linklayer setting by detecting if the rate
 373 * table were modified.
 374 *
 375 * For linklayer ATM table entries, the rate table will be aligned to
 376 * 48 bytes, thus some table entries will contain the same value.  The
 377 * mpu (min packet unit) is also encoded into the old rate table, thus
 378 * starting from the mpu, we find low and high table entries for
  379 * mapping this cell.  If these entries contain the same value, then
  380 * the rate table has been modified for linklayer ATM.
 381 *
 382 * This is done by rounding mpu to the nearest 48 bytes cell/entry,
 383 * and then roundup to the next cell, calc the table entry one below,
 384 * and compare.
 385 */
static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
{
        /* Round mpu up to the nearest 48-byte ATM cell boundary, and to
         * the cell after that, then map both to rate-table slots.
         */
        int low       = roundup(r->mpu, 48);
        int high      = roundup(low+1, 48);
        int cell_low  = low >> r->cell_log;
        int cell_high = (high >> r->cell_log) - 1;

        /* rtab is too inaccurate at rates > 100Mbit/s */
        if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
                pr_debug("TC linklayer: Giving up ATM detection\n");
                return TC_LINKLAYER_ETHERNET;
        }

        /* Equal entries one cell apart indicate 48-byte alignment, i.e.
         * a table generated for linklayer ATM.
         */
        if ((cell_high > cell_low) && (cell_high < 256)
            && (rtab[cell_low] == rtab[cell_high])) {
                pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
                         cell_low, cell_high, rtab[cell_high]);
                return TC_LINKLAYER_ATM;
        }
        return TC_LINKLAYER_ETHERNET;
}
 407
/* Global singly-linked list of reference-counted, shared rate tables. */
static struct qdisc_rate_table *qdisc_rtab_list;
 409
 410struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
 411                                        struct nlattr *tab,
 412                                        struct netlink_ext_ack *extack)
 413{
 414        struct qdisc_rate_table *rtab;
 415
 416        if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
 417            nla_len(tab) != TC_RTAB_SIZE) {
 418                NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
 419                return NULL;
 420        }
 421
 422        for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
 423                if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
 424                    !memcmp(&rtab->data, nla_data(tab), 1024)) {
 425                        rtab->refcnt++;
 426                        return rtab;
 427                }
 428        }
 429
 430        rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
 431        if (rtab) {
 432                rtab->rate = *r;
 433                rtab->refcnt = 1;
 434                memcpy(rtab->data, nla_data(tab), 1024);
 435                if (r->linklayer == TC_LINKLAYER_UNAWARE)
 436                        r->linklayer = __detect_linklayer(r, rtab->data);
 437                rtab->next = qdisc_rtab_list;
 438                qdisc_rtab_list = rtab;
 439        } else {
 440                NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
 441        }
 442        return rtab;
 443}
 444EXPORT_SYMBOL(qdisc_get_rtab);
 445
 446void qdisc_put_rtab(struct qdisc_rate_table *tab)
 447{
 448        struct qdisc_rate_table *rtab, **rtabp;
 449
 450        if (!tab || --tab->refcnt)
 451                return;
 452
 453        for (rtabp = &qdisc_rtab_list;
 454             (rtab = *rtabp) != NULL;
 455             rtabp = &rtab->next) {
 456                if (rtab == tab) {
 457                        *rtabp = rtab->next;
 458                        kfree(rtab);
 459                        return;
 460                }
 461        }
 462}
 463EXPORT_SYMBOL(qdisc_put_rtab);
 464
/* All reference-counted size tables currently in use, shared across qdiscs. */
static LIST_HEAD(qdisc_stab_list);
 466
/* Netlink validation policy for the TCA_STAB nested attributes. */
static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
        [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
        [TCA_STAB_DATA] = { .type = NLA_BINARY },
};
 471
 472static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
 473                                               struct netlink_ext_ack *extack)
 474{
 475        struct nlattr *tb[TCA_STAB_MAX + 1];
 476        struct qdisc_size_table *stab;
 477        struct tc_sizespec *s;
 478        unsigned int tsize = 0;
 479        u16 *tab = NULL;
 480        int err;
 481
 482        err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy, extack);
 483        if (err < 0)
 484                return ERR_PTR(err);
 485        if (!tb[TCA_STAB_BASE]) {
 486                NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
 487                return ERR_PTR(-EINVAL);
 488        }
 489
 490        s = nla_data(tb[TCA_STAB_BASE]);
 491
 492        if (s->tsize > 0) {
 493                if (!tb[TCA_STAB_DATA]) {
 494                        NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
 495                        return ERR_PTR(-EINVAL);
 496                }
 497                tab = nla_data(tb[TCA_STAB_DATA]);
 498                tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
 499        }
 500
 501        if (tsize != s->tsize || (!tab && tsize > 0)) {
 502                NL_SET_ERR_MSG(extack, "Invalid size of size table");
 503                return ERR_PTR(-EINVAL);
 504        }
 505
 506        list_for_each_entry(stab, &qdisc_stab_list, list) {
 507                if (memcmp(&stab->szopts, s, sizeof(*s)))
 508                        continue;
 509                if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
 510                        continue;
 511                stab->refcnt++;
 512                return stab;
 513        }
 514
 515        stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
 516        if (!stab)
 517                return ERR_PTR(-ENOMEM);
 518
 519        stab->refcnt = 1;
 520        stab->szopts = *s;
 521        if (tsize > 0)
 522                memcpy(stab->data, tab, tsize * sizeof(u16));
 523
 524        list_add_tail(&stab->list, &qdisc_stab_list);
 525
 526        return stab;
 527}
 528
/* RCU callback: free a size table after the grace period has elapsed. */
static void stab_kfree_rcu(struct rcu_head *head)
{
        kfree(container_of(head, struct qdisc_size_table, rcu));
}
 533
/* Drop one reference on @tab; when the count hits zero, unlink it and
 * free it after an RCU grace period (readers may still hold a pointer).
 */
void qdisc_put_stab(struct qdisc_size_table *tab)
{
        if (!tab)
                return;

        if (--tab->refcnt == 0) {
                list_del(&tab->list);
                call_rcu(&tab->rcu, stab_kfree_rcu);
        }
}
EXPORT_SYMBOL(qdisc_put_stab);
 545
 546static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
 547{
 548        struct nlattr *nest;
 549
 550        nest = nla_nest_start(skb, TCA_STAB);
 551        if (nest == NULL)
 552                goto nla_put_failure;
 553        if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
 554                goto nla_put_failure;
 555        nla_nest_end(skb, nest);
 556
 557        return skb->len;
 558
 559nla_put_failure:
 560        return -1;
 561}
 562
/* Compute the size-table-adjusted packet length of @skb and store it in
 * the qdisc cb: roughly
 *   stab->data[(len + overhead + cell_align) >> cell_log] << size_log,
 * extrapolating linearly for slots beyond the end of the table.
 */
void __qdisc_calculate_pkt_len(struct sk_buff *skb,
                               const struct qdisc_size_table *stab)
{
        int pkt_len, slot;

        pkt_len = skb->len + stab->szopts.overhead;
        /* Empty table: only the overhead adjustment applies. */
        if (unlikely(!stab->szopts.tsize))
                goto out;

        slot = pkt_len + stab->szopts.cell_align;
        if (unlikely(slot < 0))
                slot = 0;

        slot >>= stab->szopts.cell_log;
        if (likely(slot < stab->szopts.tsize))
                pkt_len = stab->data[slot];
        else
                /* Beyond the table: extrapolate from the last entry. */
                pkt_len = stab->data[stab->szopts.tsize - 1] *
                                (slot / stab->szopts.tsize) +
                                stab->data[slot % stab->szopts.tsize];

        pkt_len <<= stab->szopts.size_log;
out:
        /* Never report a length below one byte. */
        if (unlikely(pkt_len < 1))
                pkt_len = 1;
        qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
 591
 592void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
 593{
 594        if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
 595                pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
 596                        txt, qdisc->ops->id, qdisc->handle >> 16);
 597                qdisc->flags |= TCQ_F_WARN_NONWC;
 598        }
 599}
 600EXPORT_SYMBOL(qdisc_warn_nonwc);
 601
/* hrtimer callback: kick the watchdog owner's root qdisc so the stack
 * attempts transmission again.  One-shot; never rearms itself.
 */
static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
        struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
                                                 timer);

        rcu_read_lock();
        __netif_schedule(qdisc_root(wd->qdisc));
        rcu_read_unlock();

        return HRTIMER_NORESTART;
}
 613
/* Initialize @wd to fire qdisc_watchdog() on @clockid, bound to @qdisc. */
void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
                                 clockid_t clockid)
{
        hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
        wd->timer.function = qdisc_watchdog;
        wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
 622
/* Initialize @wd with the default CLOCK_MONOTONIC clock. */
void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
        qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
}
EXPORT_SYMBOL(qdisc_watchdog_init);
 628
/* (Re)arm the watchdog to fire at absolute time @expires (nanoseconds).
 * No-op while the qdisc is being deactivated, or when the timer is
 * already armed for the same expiry.
 */
void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
{
        if (test_bit(__QDISC_STATE_DEACTIVATED,
                     &qdisc_root_sleeping(wd->qdisc)->state))
                return;

        if (wd->last_expires == expires)
                return;

        wd->last_expires = expires;
        hrtimer_start(&wd->timer,
                      ns_to_ktime(expires),
                      HRTIMER_MODE_ABS_PINNED);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
 644
/* Cancel a pending watchdog timer (waits for a running callback). */
void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
        hrtimer_cancel(&wd->timer);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
 650
 651static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
 652{
 653        struct hlist_head *h;
 654        unsigned int i;
 655
 656        h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
 657
 658        if (h != NULL) {
 659                for (i = 0; i < n; i++)
 660                        INIT_HLIST_HEAD(&h[i]);
 661        }
 662        return h;
 663}
 664
/* Double the class hash table once the load factor exceeds 0.75.
 * The new table is allocated outside the lock; the rehash and table
 * swap happen under the qdisc tree lock.  Allocation failure is
 * silently ignored — the old table keeps working.
 */
void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
        struct Qdisc_class_common *cl;
        struct hlist_node *next;
        struct hlist_head *nhash, *ohash;
        unsigned int nsize, nmask, osize;
        unsigned int i, h;

        /* Rehash when load factor exceeds 0.75 */
        if (clhash->hashelems * 4 <= clhash->hashsize * 3)
                return;
        nsize = clhash->hashsize * 2;
        nmask = nsize - 1;
        nhash = qdisc_class_hash_alloc(nsize);
        if (nhash == NULL)
                return;

        ohash = clhash->hash;
        osize = clhash->hashsize;

        sch_tree_lock(sch);
        /* Move every class into its bucket under the new mask. */
        for (i = 0; i < osize; i++) {
                hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
                        h = qdisc_class_hash(cl->classid, nmask);
                        hlist_add_head(&cl->hnode, &nhash[h]);
                }
        }
        clhash->hash     = nhash;
        clhash->hashsize = nsize;
        clhash->hashmask = nmask;
        sch_tree_unlock(sch);

        kvfree(ohash);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);
 700
 701int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
 702{
 703        unsigned int size = 4;
 704
 705        clhash->hash = qdisc_class_hash_alloc(size);
 706        if (!clhash->hash)
 707                return -ENOMEM;
 708        clhash->hashsize  = size;
 709        clhash->hashmask  = size - 1;
 710        clhash->hashelems = 0;
 711        return 0;
 712}
 713EXPORT_SYMBOL(qdisc_class_hash_init);
 714
/* Free the bucket array allocated by qdisc_class_hash_init()/grow(). */
void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
        kvfree(clhash->hash);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);
 720
/* Link class @cl into @clhash, bucketed by cl->classid. */
void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
                             struct Qdisc_class_common *cl)
{
        unsigned int h;

        INIT_HLIST_NODE(&cl->hnode);
        h = qdisc_class_hash(cl->classid, clhash->hashmask);
        hlist_add_head(&cl->hnode, &clhash->hash[h]);
        clhash->hashelems++;
}
EXPORT_SYMBOL(qdisc_class_hash_insert);
 732
/* Unlink class @cl from @clhash; counterpart of qdisc_class_hash_insert(). */
void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
                             struct Qdisc_class_common *cl)
{
        hlist_del(&cl->hnode);
        clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
 740
/* Allocate an unique handle from space managed by kernel
 * Possible range is [8000-FFFF]:0000 (0x8000 values)
 */
static u32 qdisc_alloc_handle(struct net_device *dev)
{
        int i = 0x8000;         /* try each candidate major at most once */
        /* Shared rotor across all devices; advances on every call. */
        static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

        do {
                autohandle += TC_H_MAKE(0x10000U, 0);
                /* Wrap around, skipping the reserved TC_H_ROOT value. */
                if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
                        autohandle = TC_H_MAKE(0x80000000U, 0);
                if (!qdisc_lookup(dev, autohandle))
                        return autohandle;
                cond_resched();
        } while (--i > 0);

        /* 0 signals that the whole range is already in use. */
        return 0;
}
 760
/* Propagate a decrease of @n packets / @len bytes from @sch up the
 * qdisc hierarchy: fix qlen/backlog/drop counters of every ancestor and
 * let classful parents deactivate classes whose child became empty.
 */
void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n,
                               unsigned int len)
{
        bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
        const struct Qdisc_class_ops *cops;
        unsigned long cl;
        u32 parentid;
        bool notify;
        int drops;

        if (n == 0 && len == 0)
                return;
        drops = max_t(int, n, 0);
        rcu_read_lock();
        /* Walk upwards until the root, an ingress major, or a qdisc
         * flagged as having no parent is reached.
         */
        while ((parentid = sch->parent)) {
                if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
                        break;

                if (sch->flags & TCQ_F_NOPARENT)
                        break;
                /* Notify parent qdisc only if child qdisc becomes empty.
                 *
                 * If child was empty even before update then backlog
                 * counter is screwed and we skip notification because
                 * parent class is already passive.
                 *
                 * If the original child was offloaded then it is allowed
                 * to be seem as empty, so the parent is notified anyway.
                 */
                notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
                                                       !qdisc_is_offloaded);
                /* TODO: perform the search on a per txq basis */
                sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
                if (sch == NULL) {
                        WARN_ON_ONCE(parentid != TC_H_ROOT);
                        break;
                }
                cops = sch->ops->cl_ops;
                if (notify && cops->qlen_notify) {
                        cl = cops->find(sch, parentid);
                        cops->qlen_notify(sch, cl);
                }
                sch->q.qlen -= n;
                sch->qstats.backlog -= len;
                __qdisc_qstats_drop(sch, drops);
        }
        rcu_read_unlock();
}
EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
 810
 811int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
 812                              void *type_data)
 813{
 814        struct net_device *dev = qdisc_dev(sch);
 815        int err;
 816
 817        sch->flags &= ~TCQ_F_OFFLOADED;
 818        if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
 819                return 0;
 820
 821        err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
 822        if (err == -EOPNOTSUPP)
 823                return 0;
 824
 825        if (!err)
 826                sch->flags |= TCQ_F_OFFLOADED;
 827
 828        return err;
 829}
 830EXPORT_SYMBOL(qdisc_offload_dump_helper);
 831
/* Tell the driver about a qdisc graft and, on failure, report it via
 * @extack — but only when the graft is not part of a destroy operation
 * and at least one of parent/old/new is actually offloaded.
 */
void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
                                struct Qdisc *new, struct Qdisc *old,
                                enum tc_setup_type type, void *type_data,
                                struct netlink_ext_ack *extack)
{
        bool any_qdisc_is_offloaded;
        int err;

        if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
                return;

        err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);

        /* Don't report error if the graft is part of destroy operation. */
        if (!err || !new || new == &noop_qdisc)
                return;

        /* Don't report error if the parent, the old child and the new
         * one are not offloaded.
         */
        any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
        any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
        any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;

        if (any_qdisc_is_offloaded)
                NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
}
EXPORT_SYMBOL(qdisc_offload_graft_helper);
 860
/* Notify the driver that the device's root qdisc is being replaced. */
static void qdisc_offload_graft_root(struct net_device *dev,
                                     struct Qdisc *new, struct Qdisc *old,
                                     struct netlink_ext_ack *extack)
{
        struct tc_root_qopt_offload graft_offload = {
                .command        = TC_ROOT_GRAFT,
                .handle         = new ? new->handle : 0,
                /* Ingress if either side of the graft is an ingress qdisc. */
                .ingress        = (new && new->flags & TCQ_F_INGRESS) ||
                                  (old && old->flags & TCQ_F_INGRESS),
        };

        qdisc_offload_graft_helper(dev, NULL, new, old,
                                   TC_SETUP_ROOT_QDISC, &graft_offload, extack);
}
 875
/* Fill a netlink message describing qdisc @q.
 *
 * @clid:   parent class/qdisc id reported in tcm_parent
 * @portid: destination socket portid
 * @seq:    netlink sequence number
 * @flags:  netlink message flags
 * @event:  message type (e.g. RTM_NEWQDISC)
 *
 * Returns the resulting skb length on success, or -1 on failure after
 * trimming any partially written message.
 */
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
                         u32 portid, u32 seq, u16 flags, int event)
{
        struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
        struct gnet_stats_queue __percpu *cpu_qstats = NULL;
        struct tcmsg *tcm;
        struct nlmsghdr  *nlh;
        unsigned char *b = skb_tail_pointer(skb);   /* trim point on failure */
        struct gnet_dump d;
        struct qdisc_size_table *stab;
        u32 block_index;
        __u32 qlen;

        cond_resched();
        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
        if (!nlh)
                goto out_nlmsg_trim;
        tcm = nlmsg_data(nlh);
        tcm->tcm_family = AF_UNSPEC;
        tcm->tcm__pad1 = 0;
        tcm->tcm__pad2 = 0;
        tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
        tcm->tcm_parent = clid;
        tcm->tcm_handle = q->handle;
        /* tcm_info carries the current refcount of the qdisc. */
        tcm->tcm_info = refcount_read(&q->refcnt);
        if (nla_put_string(skb, TCA_KIND, q->ops->id))
                goto nla_put_failure;
        /* Report ingress/egress filter block indices when the qdisc
         * exposes them and they are non-zero.
         */
        if (q->ops->ingress_block_get) {
                block_index = q->ops->ingress_block_get(q);
                if (block_index &&
                    nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
                        goto nla_put_failure;
        }
        if (q->ops->egress_block_get) {
                block_index = q->ops->egress_block_get(q);
                if (block_index &&
                    nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
                        goto nla_put_failure;
        }
        if (q->ops->dump && q->ops->dump(q, skb) < 0)
                goto nla_put_failure;
        if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
                goto nla_put_failure;
        qlen = qdisc_qlen_sum(q);

        stab = rtnl_dereference(q->stab);
        if (stab && qdisc_dump_stab(skb, stab) < 0)
                goto nla_put_failure;

        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
                                         NULL, &d, TCA_PAD) < 0)
                goto nla_put_failure;

        if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
                goto nla_put_failure;

        /* Per-CPU stats, if the qdisc maintains them. */
        if (qdisc_is_percpu_stats(q)) {
                cpu_bstats = q->cpu_bstats;
                cpu_qstats = q->cpu_qstats;
        }

        if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
                                  &d, cpu_bstats, &q->bstats) < 0 ||
            gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
            gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
                goto nla_put_failure;

        if (gnet_stats_finish_copy(&d) < 0)
                goto nla_put_failure;

        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
        return skb->len;

out_nlmsg_trim:
nla_put_failure:
        nlmsg_trim(skb, b);
        return -1;
}
 954
 955static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
 956{
 957        if (q->flags & TCQ_F_BUILTIN)
 958                return true;
 959        if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
 960                return true;
 961
 962        return false;
 963}
 964
/* Notify userspace about a qdisc change: RTM_DELQDISC for @old and/or
 * RTM_NEWQDISC for @new, multicast to RTNLGRP_TC (and echoed to the
 * requester when NLM_F_ECHO is set on the triggering request @n).
 */
static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	/* builtin/invisible qdiscs are never announced */
	if (old && !tc_qdisc_dump_ignore(old, false)) {
		if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
				  0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new && !tc_qdisc_dump_ignore(new, false)) {
		if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	/* NOTE(review): if both qdiscs were skipped above, we fall through
	 * to err_out and return -EINVAL although no fill error occurred —
	 * confirm callers depend on this before changing it.
	 */
	if (skb->len)
		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
				      n->nlmsg_flags & NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}
 995
 996static void notify_and_destroy(struct net *net, struct sk_buff *skb,
 997                               struct nlmsghdr *n, u32 clid,
 998                               struct Qdisc *old, struct Qdisc *new)
 999{
1000        if (new || old)
1001                qdisc_notify(net, skb, n, clid, old, new);
1002
1003        if (old)
1004                qdisc_put(old);
1005}
1006
/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate, send a netlink notification using "skb"
 * and "n".
 *
 * On success, destroy the old qdisc.
 */
1015
static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old,
		       struct netlink_ext_ack *extack)
{
	struct Qdisc *q = old;
	struct net *net = dev_net(dev);

	if (parent == NULL) {
		/* Root graft: either on every tx queue (egress) or on the
		 * single ingress queue.
		 */
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;
			ingress = 1;
			if (!dev_ingress_queue(dev)) {
				NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
				return -ENOENT;
			}
		}

		/* quiesce the device while qdiscs are swapped */
		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		qdisc_offload_graft_root(dev, new, old, extack);

		/* Qdiscs with an ->attach() callback install their per-queue
		 * children themselves after dev->qdisc is set below.
		 */
		if (new && new->ops->attach)
			goto skip;

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = dev_ingress_queue(dev);

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			/* every tx queue beyond the first holds its own ref */
			if (new && i > 0)
				qdisc_refcount_inc(new);

			if (!ingress)
				qdisc_put(old);
		}

skip:
		if (!ingress) {
			/* notify with the previous root before replacing it */
			notify_and_destroy(net, skb, n, classid,
					   dev->qdisc, new);
			if (new && !new->ops->attach)
				qdisc_refcount_inc(new);
			dev->qdisc = new ? : &noop_qdisc;

			if (new && new->ops->attach)
				new->ops->attach(new);
		} else {
			notify_and_destroy(net, skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		/* Graft into a class of a classful parent qdisc */
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
		unsigned long cl;
		int err;

		/* Only support running class lockless if parent is lockless */
		if (new && (new->flags & TCQ_F_NOLOCK) &&
		    parent && !(parent->flags & TCQ_F_NOLOCK))
			new->flags &= ~TCQ_F_NOLOCK;

		if (!cops || !cops->graft)
			return -EOPNOTSUPP;

		cl = cops->find(parent, classid);
		if (!cl) {
			NL_SET_ERR_MSG(extack, "Specified class not found");
			return -ENOENT;
		}

		/* ->graft() hands back the displaced qdisc in @old */
		err = cops->graft(parent, cl, new, &old, extack);
		if (err)
			return err;
		notify_and_destroy(net, skb, n, classid, old, new);
	}
	return 0;
}
1103
1104static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1105                                   struct netlink_ext_ack *extack)
1106{
1107        u32 block_index;
1108
1109        if (tca[TCA_INGRESS_BLOCK]) {
1110                block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1111
1112                if (!block_index) {
1113                        NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1114                        return -EINVAL;
1115                }
1116                if (!sch->ops->ingress_block_set) {
1117                        NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1118                        return -EOPNOTSUPP;
1119                }
1120                sch->ops->ingress_block_set(sch, block_index);
1121        }
1122        if (tca[TCA_EGRESS_BLOCK]) {
1123                block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1124
1125                if (!block_index) {
1126                        NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1127                        return -EINVAL;
1128                }
1129                if (!sch->ops->egress_block_set) {
1130                        NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1131                        return -EOPNOTSUPP;
1132                }
1133                sch->ops->egress_block_set(sch, block_index);
1134        }
1135        return 0;
1136}
1137
1138/*
1139   Allocate and initialize new qdisc.
1140
1141   Parameters are passed via opt.
1142 */
1143
/* Create and initialize a qdisc from a netlink request.  On failure,
 * returns NULL with a negative errno stored in *errp; -EAGAIN means
 * RTNL was dropped to load a module and the caller must replay the
 * whole request.
 */
static struct Qdisc *qdisc_create(struct net_device *dev,
				  struct netdev_queue *dev_queue,
				  struct Qdisc *p, u32 parent, u32 handle,
				  struct nlattr **tca, int *errp,
				  struct netlink_ext_ack *extack)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the mean time.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try again qdisc_lookup_ops,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (!ops) {
		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
		goto err_out;
	}

	sch = qdisc_alloc(dev_queue, ops, extack);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
	} else {
		if (handle == 0) {
			/* autogenerate a handle when userspace gave none */
			handle = qdisc_alloc_handle(dev);
			err = -ENOMEM;
			if (handle == 0)
				goto err_out3;
		}
		if (!netif_is_multiqueue(dev))
			sch->flags |= TCQ_F_ONETXQUEUE;
	}

	sch->handle = handle;

	/* This exists to keep backward compatibility with a userspace
	 * loophole, which allowed userspace to get the IFF_NO_QUEUE
	 * facility on older kernels by setting tx_queue_len=0 (prior
	 * to qdisc init), and then forgetting to reinit tx_queue_len
	 * before again attaching a qdisc.
	 */
	if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
		netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
	}

	err = qdisc_block_indexes_set(sch, tca, extack);
	if (err)
		goto err_out3;

	if (ops->init) {
		err = ops->init(sch, tca[TCA_OPTIONS], extack);
		if (err != 0)
			goto err_out5;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab)) {
			err = PTR_ERR(stab);
			goto err_out4;
		}
		rcu_assign_pointer(sch->stab, stab);
	}
	if (tca[TCA_RATE]) {
		seqcount_t *running;

		err = -EOPNOTSUPP;
		if (sch->flags & TCQ_F_MQROOT) {
			NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
			goto err_out4;
		}

		/* pick the seqcount guarding the stats this estimator reads */
		if (sch->parent != TC_H_ROOT &&
		    !(sch->flags & TCQ_F_INGRESS) &&
		    (!p || !(p->flags & TCQ_F_MQROOT)))
			running = qdisc_root_sleeping_running(sch);
		else
			running = &sch->running;

		err = gen_new_estimator(&sch->bstats,
					sch->cpu_bstats,
					&sch->rate_est,
					NULL,
					running,
					tca[TCA_RATE]);
		if (err) {
			NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
			goto err_out4;
		}
	}

	qdisc_hash_add(sch, false);

	return sch;

err_out5:
	/* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
	if (ops->destroy)
		ops->destroy(sch);
err_out3:
	/* NOTE(review): this dev_put() presumably pairs with a dev_hold()
	 * taken inside qdisc_alloc() — confirm against that helper.
	 */
	dev_put(dev);
	qdisc_free(sch);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;

err_out4:
	/*
	 * Any broken qdiscs that would require a ops->reset() here?
	 * The qdisc was never in action so it shouldn't be necessary.
	 */
	qdisc_put_stab(rtnl_dereference(sch->stab));
	if (ops->destroy)
		ops->destroy(sch);
	goto err_out3;
}
1299
/* Apply a modification request to an existing qdisc: new options
 * (TCA_OPTIONS), size table (TCA_STAB) and/or rate estimator
 * (TCA_RATE).  Caller holds RTNL (stab is accessed via
 * rtnl_dereference()).  Returns 0 or a negative errno.
 */
static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
			struct netlink_ext_ack *extack)
{
	struct qdisc_size_table *ostab, *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		if (!sch->ops->change) {
			NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
			return -EINVAL;
		}
		if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
			NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
			return -EOPNOTSUPP;
		}
		err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	/* Swap in the new size table.  NOTE(review): when TCA_STAB is
	 * absent, stab is NULL and any existing size table is dropped —
	 * confirm this is the intended change semantics.
	 */
	ostab = rtnl_dereference(sch->stab);
	rcu_assign_pointer(sch->stab, stab);
	qdisc_put_stab(ostab);

	if (tca[TCA_RATE]) {
		/* NB: ignores errors from replace_estimator
		   because change can't be undone. */
		if (sch->flags & TCQ_F_MQROOT)
			goto out;
		gen_replace_estimator(&sch->bstats,
				      sch->cpu_bstats,
				      &sch->rate_est,
				      NULL,
				      qdisc_root_sleeping_running(sch),
				      tca[TCA_RATE]);
	}
out:
	return 0;
}
1345
/* Walker state for loop detection: records the qdisc @p that must not
 * be reachable from the tree being walked.
 */
struct check_loop_arg {
	struct qdisc_walker	w;	/* must be first: cast back in check_loop_fn() */
	struct Qdisc		*p;	/* qdisc that must not appear as a descendant */
	int			depth;	/* current nesting depth, bounded by check_loop_fn() */
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl,
			 struct qdisc_walker *w);
1354
1355static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1356{
1357        struct check_loop_arg   arg;
1358
1359        if (q->ops->cl_ops == NULL)
1360                return 0;
1361
1362        arg.w.stop = arg.w.skip = arg.w.count = 0;
1363        arg.w.fn = check_loop_fn;
1364        arg.depth = depth;
1365        arg.p = p;
1366        q->ops->cl_ops->walk(q, &arg.w);
1367        return arg.w.stop ? -ELOOP : 0;
1368}
1369
1370static int
1371check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1372{
1373        struct Qdisc *leaf;
1374        const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1375        struct check_loop_arg *arg = (struct check_loop_arg *)w;
1376
1377        leaf = cops->leaf(q, cl);
1378        if (leaf) {
1379                if (leaf == arg->p || arg->depth > 7)
1380                        return -ELOOP;
1381                return check_loop(leaf, arg->p, arg->depth + 1);
1382        }
1383        return 0;
1384}
1385
/* Netlink validation policy for TCA_* attributes on tc requests. */
const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
	[TCA_KIND]		= { .type = NLA_STRING },
	[TCA_RATE]		= { .type = NLA_BINARY,
				    .len = sizeof(struct tc_estimator) },
	[TCA_STAB]		= { .type = NLA_NESTED },
	[TCA_DUMP_INVISIBLE]	= { .type = NLA_FLAG },
	[TCA_CHAIN]		= { .type = NLA_U32 },
	[TCA_INGRESS_BLOCK]	= { .type = NLA_U32 },
	[TCA_EGRESS_BLOCK]	= { .type = NLA_U32 },
};
1396
1397/*
1398 * Delete/get qdisc.
1399 */
1400
/* RTM_DELQDISC / RTM_GETQDISC handler: locate the qdisc named by the
 * request and either delete it (by grafting NULL in its place) or
 * report it back to the requester.
 */
static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	/* GET is unprivileged; everything else needs CAP_NET_ADMIN */
	if ((n->nlmsg_type != RTM_GETQDISC) &&
	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
			  extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	clid = tcm->tcm_parent;
	if (clid) {
		/* resolve the target as the child of the given parent */
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
					return -ENOENT;
				}
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}
		if (!q) {
			NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
			return -ENOENT;
		}

		/* a handle, when also given, must agree with the parent */
		if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Invalid handle");
			return -EINVAL;
		}
	} else {
		q = qdisc_lookup(dev, tcm->tcm_handle);
		if (!q) {
			NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
			return -ENOENT;
		}
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
		return -EINVAL;
	}

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid) {
			NL_SET_ERR_MSG(extack, "Classid cannot be zero");
			return -EINVAL;
		}
		if (q->handle == 0) {
			NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
			return -ENOENT;
		}
		/* delete == graft nothing in the qdisc's place */
		err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
		if (err != 0)
			return err;
	} else {
		/* NOTE(review): the return value of qdisc_notify() is
		 * discarded here, so e.g. an skb allocation failure leaves
		 * the requester without a reply while 0 is still returned —
		 * confirm whether the error should be propagated.
		 */
		qdisc_notify(net, skb, n, clid, NULL, q);
	}
	return 0;
}
1481
1482/*
1483 * Create/change qdisc.
1484 */
1485
/* RTM_NEWQDISC handler: create a new qdisc or change an existing one.
 * Requires CAP_NET_ADMIN.  When qdisc_create() reports -EAGAIN (module
 * loaded after dropping RTNL), the whole request is replayed.
 */
static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			   struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

replay:
	/* Reinit, just in case something touches this. */
	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
			  extack);
	if (err < 0)
		return err;

	tcm = nlmsg_data(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;


	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				/* attaching below a classful parent: find it
				 * and the leaf currently in that class
				 */
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
					return -ENOENT;
				}
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue_create(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}

		/* It may be default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
					NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
					return -EEXIST;
				}
				if (TC_H_MIN(tcm->tcm_handle)) {
					NL_SET_ERR_MSG(extack, "Invalid minor handle");
					return -EINVAL;
				}
				/* does a qdisc with the requested handle
				 * already exist elsewhere on the device?
				 */
				q = qdisc_lookup(dev, tcm->tcm_handle);
				if (!q)
					goto create_n_graft;
				if (n->nlmsg_flags & NLM_F_EXCL) {
					NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
					return -EEXIST;
				}
				if (tca[TCA_KIND] &&
				    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
					NL_SET_ERR_MSG(extack, "Invalid qdisc name");
					return -EINVAL;
				}
				/* refuse grafts that would make a qdisc its
				 * own ancestor
				 */
				if (q == p ||
				    (p && check_loop(q, p, 0))) {
					NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
					return -ELOOP;
				}
				/* hold a ref; ownership passes to qdisc_graft() */
				qdisc_refcount_inc(q);
				goto graft;
			} else {
				if (!q)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know, that some child q is already
				 *   attached to this parent and have choice:
				 *   either to change it or to create/graft new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if CREATE and REPLACE flags are set.
				 *
				 *   2. If EXCL is set, requestor wanted to say,
				 *   that qdisc tcm_handle is not expected
				 *   to exist, so that we choose create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   Alas, it is sort of hole in API, we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft, if
				 *   user gave KIND, which does not match existing.
				 */
				if ((n->nlmsg_flags & NLM_F_CREATE) &&
				    (n->nlmsg_flags & NLM_F_REPLACE) &&
				    ((n->nlmsg_flags & NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Handle cannot be zero");
			return -EINVAL;
		}
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (!q) {
		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
		return -ENOENT;
	}
	if (n->nlmsg_flags & NLM_F_EXCL) {
		NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
		return -EEXIST;
	}
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
		return -EINVAL;
	}
	err = qdisc_change(q, tca, extack);
	if (err == 0)
		qdisc_notify(net, skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags & NLM_F_CREATE)) {
		NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
		return -ENOENT;
	}
	if (clid == TC_H_INGRESS) {
		if (dev_ingress_queue(dev)) {
			q = qdisc_create(dev, dev_ingress_queue(dev), p,
					 tcm->tcm_parent, tcm->tcm_parent,
					 tca, &err, extack);
		} else {
			NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
			err = -ENOENT;
		}
	} else {
		struct netdev_queue *dev_queue;

		/* let a classful parent pick the queue, else default to it,
		 * else use tx queue 0 for a root qdisc
		 */
		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
		else if (p)
			dev_queue = p->dev_queue;
		else
			dev_queue = netdev_get_tx_queue(dev, 0);

		q = qdisc_create(dev, dev_queue, p,
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err, extack);
	}
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
	if (err) {
		/* drop the reference we took (or qdisc_create() gave us) */
		if (q)
			qdisc_put(q);
		return err;
	}

	return 0;
}
1666
/* Dump @root, and when @recur is set every qdisc hashed on its device,
 * into a netlink dump skb.  *q_idx_p and s_q_idx implement the usual
 * netlink dump resume protocol.  Returns 0 on success, -1 once the skb
 * is full (the index reached so far is stored back via q_idx_p).
 */
static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx, bool recur,
			      bool dump_invisible)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;
	int b;

	if (!root)
		return 0;

	/* first the root qdisc itself, unless already dumped last round */
	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
				  RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

	/* If dumping singletons, there is no qdisc_dev(root) and the singleton
	 * itself has already been dumped.
	 *
	 * If we've already dumped the top-level (ingress) qdisc above and the global
	 * qdisc hashtable, we don't want to hit it again
	 */
	if (!qdisc_dev(root) || !recur)
		goto out;

	/* then everything registered in the device's qdisc hash */
	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
				  RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;
done:
	ret = -1;
	goto out;
}
1720
/* RTM_GETQDISC dump handler: walk every net_device in the namespace
 * and dump its root and ingress qdisc hierarchies.  Resume state lives
 * in cb->args[0] (device index) and cb->args[1] (qdisc index within
 * the device being resumed).
 */
static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;
	const struct nlmsghdr *nlh = cb->nlh;
	struct nlattr *tca[TCA_MAX + 1];
	int err;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];

	idx = 0;
	ASSERT_RTNL();

	/* Attributes are parsed only to honour TCA_DUMP_INVISIBLE below. */
	err = nlmsg_parse(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
			  rtm_tca_policy, cb->extack);
	if (err < 0)
		return err;

	for_each_netdev(net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		/* Past the device we resumed at: start its qdiscs from 0. */
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;

		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
				       true, tca[TCA_DUMP_INVISIBLE]) < 0)
			goto done;

		/* Ingress tree is kept separately from the egress root. */
		dev_queue = dev_ingress_queue(dev);
		if (dev_queue &&
		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
				       &q_idx, s_q_idx, false,
				       tca[TCA_DUMP_INVISIBLE]) < 0)
			goto done;

cont:
		idx++;
	}

done:
	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}
1772
1773
1774
1775/************************************************
1776 *      Traffic classes manipulation.           *
1777 ************************************************/
1778
/* Fill one traffic-class netlink message (tcmsg header, TCA_KIND, the
 * class ->dump attributes and statistics) for class @cl of qdisc @q
 * into @skb.
 *
 * Returns the new skb length on success, or -1 on failure after
 * trimming off the partially written message.
 */
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 portid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	/* Dumping many classes can take a while; allow rescheduling. */
	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	/* Defaults; the class ->dump callback may refine parent/handle. */
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
1824
1825static int tclass_notify(struct net *net, struct sk_buff *oskb,
1826                         struct nlmsghdr *n, struct Qdisc *q,
1827                         unsigned long cl, int event)
1828{
1829        struct sk_buff *skb;
1830        u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1831
1832        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1833        if (!skb)
1834                return -ENOBUFS;
1835
1836        if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1837                kfree_skb(skb);
1838                return -EINVAL;
1839        }
1840
1841        return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1842                              n->nlmsg_flags & NLM_F_ECHO);
1843}
1844
1845static int tclass_del_notify(struct net *net,
1846                             const struct Qdisc_class_ops *cops,
1847                             struct sk_buff *oskb, struct nlmsghdr *n,
1848                             struct Qdisc *q, unsigned long cl)
1849{
1850        u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1851        struct sk_buff *skb;
1852        int err = 0;
1853
1854        if (!cops->delete)
1855                return -EOPNOTSUPP;
1856
1857        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1858        if (!skb)
1859                return -ENOBUFS;
1860
1861        if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1862                           RTM_DELTCLASS) < 0) {
1863                kfree_skb(skb);
1864                return -EINVAL;
1865        }
1866
1867        err = cops->delete(q, cl);
1868        if (err) {
1869                kfree_skb(skb);
1870                return err;
1871        }
1872
1873        return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1874                              n->nlmsg_flags & NLM_F_ECHO);
1875}
1876
1877#ifdef CONFIG_NET_CLS
1878
/* Walker state for tc_bind_tclass(): rebind filters whose result is
 * class handle @classid to internal class @cl (0 = unbind).
 */
struct tcf_bind_args {
	struct tcf_walker w;	/* embedded walker; cast back in callback */
	u32 classid;		/* major:minor handle the filters point at */
	unsigned long cl;	/* new internal class cookie (0 to unbind) */
};
1884
/* tcf_walker callback for tc_bind_tclass(): if the filter type supports
 * class rebinding, re-point filter node @n at the new internal class,
 * holding the qdisc tree lock so the update does not race with the
 * classification fast path.  Always returns 0 (walk continues).
 */
static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
{
	struct tcf_bind_args *a = (void *)arg;

	if (tp->ops->bind_class) {
		struct Qdisc *q = tcf_block_q(tp->chain->block);

		sch_tree_lock(q);
		tp->ops->bind_class(n, a->classid, a->cl);
		sch_tree_unlock(q);
	}
	return 0;
}
1898
1899static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1900                           unsigned long new_cl)
1901{
1902        const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1903        struct tcf_block *block;
1904        struct tcf_chain *chain;
1905        unsigned long cl;
1906
1907        cl = cops->find(q, portid);
1908        if (!cl)
1909                return;
1910        block = cops->tcf_block(q, cl, NULL);
1911        if (!block)
1912                return;
1913        list_for_each_entry(chain, &block->chain_list, list) {
1914                struct tcf_proto *tp;
1915
1916                for (tp = rtnl_dereference(chain->filter_chain);
1917                     tp; tp = rtnl_dereference(tp->next)) {
1918                        struct tcf_bind_args arg = {};
1919
1920                        arg.w.fn = tcf_node_bind;
1921                        arg.classid = clid;
1922                        arg.cl = new_cl;
1923                        tp->ops->walk(tp, &arg.w);
1924                }
1925        }
1926}
1927
1928#else
1929
/* Without CONFIG_NET_CLS there are no filters to rebind. */
static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
			   unsigned long new_cl)
{
}
1934
1935#endif
1936
/* Netlink handler for RTM_NEWTCLASS / RTM_DELTCLASS / RTM_GETTCLASS:
 * create, change, delete or query a single traffic class.  Runs under
 * RTNL (registered via rtnl_register() with no dump-only flag).
 */
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
			 struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 portid;
	u32 clid;
	u32 qid;
	int err;

	/* Only reads are allowed without CAP_NET_ADMIN. */
	if ((n->nlmsg_type != RTM_GETTCLASS) &&
	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
			  extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0         - parent is root class.
	   parent == X:Y         - parent is a node in hierarchy.
	   parent == 0:Y         - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0         - generate handle from kernel pool.
	   handle == 0:Y         - class is X:Y, where X:0 is qdisc.
	   handle == X:Y         - clear.
	   handle == X:0         - root class.
	 */

	/* Step 1. Determine qdisc handle X:0 */

	portid = tcm->tcm_parent;
	clid = tcm->tcm_handle;
	qid = TC_H_MAJ(clid);

	if (portid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(portid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc->handle;

		/* Now qid is genuine qdisc handle consistent
		 * both with parent and child.
		 *
		 * TC_H_MAJ(portid) still may be unspecified, complete it now.
		 */
		if (portid)
			portid = TC_H_MAKE(qid, portid);
	} else {
		if (qid == 0)
			qid = dev->qdisc->handle;
	}

	/* OK. Locate qdisc */
	q = qdisc_lookup(dev, qid);
	if (!q)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (portid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->find(q, clid);

	if (cl == 0) {
		/* Unknown class: only NEW with NLM_F_CREATE may proceed. */
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS ||
		    !(n->nlmsg_flags & NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags & NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = tclass_del_notify(net, cops, skb, n, q, cl);
			/* Unbind the class with filters with 0 */
			tc_bind_tclass(q, portid, clid, 0);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
		NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
		return -EOPNOTSUPP;
	}

	/* Create or change via the qdisc's class ops. */
	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, portid, tca, &new_cl, extack);
	if (err == 0) {
		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
		/* We just create a new class, need to do reverse binding. */
		if (cl != new_cl)
			tc_bind_tclass(q, portid, clid, new_cl);
	}
out:
	return err;
}
2073
/* Bundles the netlink dump context with the generic qdisc walker so
 * qdisc_class_dump() can reach skb/cb from its walker argument.
 */
struct qdisc_dump_args {
	struct qdisc_walker	w;	/* must stay first: callback casts back */
	struct sk_buff		*skb;
	struct netlink_callback *cb;
};
2079
2080static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2081                            struct qdisc_walker *arg)
2082{
2083        struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2084
2085        return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2086                              a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2087                              RTM_NEWTCLASS);
2088}
2089
/* Dump all classes of one qdisc @q, honouring the tcm_parent filter.
 * *t_p counts qdiscs visited; those below @s_t were fully dumped in a
 * previous callback round.  cb->args[1] is the class index to resume
 * from within the current qdisc.
 * Returns 0 to continue with the next qdisc, -1 when the skb is full.
 */
static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	/* Skip invisible, already-dumped, classless or filtered-out
	 * qdiscs - but still count them so resume indices stay stable.
	 */
	if (tc_qdisc_dump_ignore(q, false) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	/* Entering a qdisc beyond the resume point: reset class cursor. */
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop  = 0;
	arg.w.skip = cb->args[1];	/* classes already sent */
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}
2118
/* Dump the classes of @root and, depending on the tcm_parent filter,
 * either the single matching child qdisc or every qdisc hashed on the
 * same device.  Returns -1 when the skb filled up, 0 otherwise.
 */
static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
			       int *t_p, int s_t)
{
	struct Qdisc *q;
	int b;

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

	/* Singleton qdiscs have no device hash to walk. */
	if (!qdisc_dev(root))
		return 0;

	if (tcm->tcm_parent) {
		/* Filtered dump: only the qdisc named by tcm_parent. */
		q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
		if (q && q != root &&
		    tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
		return 0;
	}
	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}
2149
/* RTM_GETTCLASS dump handler: dump all classes of one device's egress
 * and ingress qdisc trees.  cb->args[0] holds the qdisc resume index.
 */
static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = nlmsg_data(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
		return 0;
	/* Takes a reference on the device; released via dev_put() below. */
	dev = dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return 0;

	s_t = cb->args[0];
	t = 0;

	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
		goto done;

	dev_queue = dev_ingress_queue(dev);
	if (dev_queue &&
	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
				&t, s_t) < 0)
		goto done;

done:
	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}
2182
2183#ifdef CONFIG_PROC_FS
2184static int psched_show(struct seq_file *seq, void *v)
2185{
2186        seq_printf(seq, "%08x %08x %08x %08x\n",
2187                   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2188                   1000000,
2189                   (u32)NSEC_PER_SEC / hrtimer_resolution);
2190
2191        return 0;
2192}
2193
2194static int __net_init psched_net_init(struct net *net)
2195{
2196        struct proc_dir_entry *e;
2197
2198        e = proc_create_single("psched", 0, net->proc_net, psched_show);
2199        if (e == NULL)
2200                return -ENOMEM;
2201
2202        return 0;
2203}
2204
/* Remove the per-namespace /proc/net/psched entry. */
static void __net_exit psched_net_exit(struct net *net)
{
	remove_proc_entry("psched", net->proc_net);
}
2209#else
/* CONFIG_PROC_FS disabled: nothing to create. */
static int __net_init psched_net_init(struct net *net)
{
	return 0;
}
2214
/* CONFIG_PROC_FS disabled: nothing to remove. */
static void __net_exit psched_net_exit(struct net *net)
{
}
2218#endif
2219
/* Per-network-namespace setup/teardown of /proc/net/psched. */
static struct pernet_operations psched_net_ops = {
	.init = psched_net_init,
	.exit = psched_net_exit,
};
2224
/* Subsystem init: register the pernet /proc hooks, the built-in qdiscs
 * and the rtnetlink message handlers for qdisc and class manipulation.
 */
static int __init pktsched_init(void)
{
	int err;

	err = register_pernet_subsys(&psched_net_ops);
	if (err) {
		pr_err("pktsched_init: "
		       "cannot initialize per netns operations\n");
		return err;
	}

	/* Built-in qdiscs; return values are deliberately ignored here. */
	register_qdisc(&pfifo_fast_ops);
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	register_qdisc(&pfifo_head_drop_qdisc_ops);
	register_qdisc(&mq_qdisc_ops);
	register_qdisc(&noqueue_qdisc_ops);

	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
		      0);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
		      0);

	return 0;
}

subsys_initcall(pktsched_init);
2256