linux/net/sched/sch_api.c
<<
>>
Prefs
   1/*
   2 * net/sched/sch_api.c  Packet scheduler API.
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10 *
  11 * Fixes:
  12 *
  13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
  14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
  15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
  16 */
  17
  18#include <linux/module.h>
  19#include <linux/types.h>
  20#include <linux/kernel.h>
  21#include <linux/string.h>
  22#include <linux/errno.h>
  23#include <linux/skbuff.h>
  24#include <linux/init.h>
  25#include <linux/proc_fs.h>
  26#include <linux/seq_file.h>
  27#include <linux/kmod.h>
  28#include <linux/list.h>
  29#include <linux/hrtimer.h>
  30#include <linux/slab.h>
  31#include <linux/hashtable.h>
  32
  33#include <net/net_namespace.h>
  34#include <net/sock.h>
  35#include <net/netlink.h>
  36#include <net/pkt_sched.h>
  37#include <net/pkt_cls.h>
  38
  39#include <trace/events/qdisc.h>
  40
  41/*
  42
  43   Short review.
  44   -------------
  45
  46   This file consists of two interrelated parts:
  47
  48   1. queueing disciplines manager frontend.
  49   2. traffic classes manager frontend.
  50
  51   Generally, queueing discipline ("qdisc") is a black box,
  52   which is able to enqueue packets and to dequeue them (when
  53   device is ready to send something) in order and at times
  54   determined by algorithm hidden in it.
  55
   56   qdiscs are divided into two categories:
  57   - "queues", which have no internal structure visible from outside.
  58   - "schedulers", which split all the packets to "traffic classes",
  59     using "packet classifiers" (look at cls_api.c)
  60
   61   In turn, classes may have child qdiscs (as a rule, queues)
  62   attached to them etc. etc. etc.
  63
  64   The goal of the routines in this file is to translate
  65   information supplied by user in the form of handles
  66   to more intelligible for kernel form, to make some sanity
  67   checks and part of work, which is common to all qdiscs
  68   and to provide rtnetlink notifications.
  69
  70   All real intelligent work is done inside qdisc modules.
  71
  72
  73
  74   Every discipline has two major routines: enqueue and dequeue.
  75
  76   ---dequeue
  77
  78   dequeue usually returns a skb to send. It is allowed to return NULL,
  79   but it does not mean that queue is empty, it just means that
  80   discipline does not want to send anything this time.
  81   Queue is really empty if q->q.qlen == 0.
  82   For complicated disciplines with multiple queues q->q is not
  83   real packet queue, but however q->q.qlen must be valid.
  84
  85   ---enqueue
  86
  87   enqueue returns 0, if packet was enqueued successfully.
   88   If a packet (this one or another one) was dropped, it returns
   89   a non-zero error code.
  90   NET_XMIT_DROP        - this packet dropped
  91     Expected action: do not backoff, but wait until queue will clear.
  92   NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
  93     Expected action: backoff or ignore
  94
  95   Auxiliary routines:
  96
  97   ---peek
  98
  99   like dequeue but without removing a packet from the queue
 100
 101   ---reset
 102
 103   returns qdisc to initial state: purge all buffers, clear all
 104   timers, counters (except for statistics) etc.
 105
 106   ---init
 107
 108   initializes newly created qdisc.
 109
 110   ---destroy
 111
 112   destroys resources allocated by init and during lifetime of qdisc.
 113
 114   ---change
 115
 116   changes qdisc parameters.
 117 */
 118
 119/* Protects list of registered TC modules. It is pure SMP lock. */
 120static DEFINE_RWLOCK(qdisc_mod_lock);
 121
 122
 123/************************************************
 124 *      Queueing disciplines manipulation.      *
 125 ************************************************/
 126
 127
 128/* The list of all installed queueing disciplines. */
 129
 130static struct Qdisc_ops *qdisc_base;
 131
 132/* Register/unregister queueing discipline */
 133
/* Register a qdisc type on the global qdisc_base list.
 *
 * Returns 0 on success, -EEXIST if an ops with the same id is already
 * registered, or -EINVAL if the ops/class-ops are inconsistent.
 */
int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	/* Reject duplicate ids; the walk also finds the list tail for append. */
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	/* Fill in missing datapath ops with no-op defaults. */
	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->peek == NULL) {
		/* A real dequeue without a peek cannot be emulated; refuse it. */
		if (qops->dequeue == NULL)
			qops->peek = noop_qdisc_ops.peek;
		else
			goto out_einval;
	}
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	if (qops->cl_ops) {
		const struct Qdisc_class_ops *cops = qops->cl_ops;

		/* Classful qdiscs must support class lookup and iteration. */
		if (!(cops->find && cops->walk && cops->leaf))
			goto out_einval;

		/* Exposing a tcf_block implies filter bind/unbind support. */
		if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
			goto out_einval;
	}

	qops->next = NULL;
	*qp = qops;	/* append at the tail */
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;

out_einval:
	rc = -EINVAL;
	goto out;
}
EXPORT_SYMBOL(register_qdisc);
 177
 178int unregister_qdisc(struct Qdisc_ops *qops)
 179{
 180        struct Qdisc_ops *q, **qp;
 181        int err = -ENOENT;
 182
 183        write_lock(&qdisc_mod_lock);
 184        for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
 185                if (q == qops)
 186                        break;
 187        if (q) {
 188                *qp = q->next;
 189                q->next = NULL;
 190                err = 0;
 191        }
 192        write_unlock(&qdisc_mod_lock);
 193        return err;
 194}
 195EXPORT_SYMBOL(unregister_qdisc);
 196
/* Copy the id of the current default qdisc into @name (at most @len bytes,
 * NUL-terminated). Serialized against qdisc_set_default() by qdisc_mod_lock.
 */
void qdisc_get_default(char *name, size_t len)
{
	read_lock(&qdisc_mod_lock);
	strlcpy(name, default_qdisc_ops->id, len);
	read_unlock(&qdisc_mod_lock);
}
 204
 205static struct Qdisc_ops *qdisc_lookup_default(const char *name)
 206{
 207        struct Qdisc_ops *q = NULL;
 208
 209        for (q = qdisc_base; q; q = q->next) {
 210                if (!strcmp(name, q->id)) {
 211                        if (!try_module_get(q->owner))
 212                                q = NULL;
 213                        break;
 214                }
 215        }
 216
 217        return q;
 218}
 219
/* Set the default qdisc type by name, loading its module if necessary.
 * Takes a module reference on the new default and drops the reference on
 * the old one. Returns 0 on success, -EPERM without CAP_NET_ADMIN, or
 * -ENOENT when the named qdisc cannot be found or loaded.
 */
int qdisc_set_default(const char *name)
{
	const struct Qdisc_ops *ops;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	write_lock(&qdisc_mod_lock);
	ops = qdisc_lookup_default(name);
	if (!ops) {
		/* Not found, drop lock and try to load module.
		 * request_module() may sleep, so it cannot run under the lock.
		 */
		write_unlock(&qdisc_mod_lock);
		request_module("sch_%s", name);
		write_lock(&qdisc_mod_lock);

		/* Re-lookup: the module load may have registered the ops. */
		ops = qdisc_lookup_default(name);
	}

	if (ops) {
		/* Set new default; lookup already took a ref on ops->owner. */
		module_put(default_qdisc_ops->owner);
		default_qdisc_ops = ops;
	}
	write_unlock(&qdisc_mod_lock);

	return ops ? 0 : -ENOENT;
}
 248
#ifdef CONFIG_NET_SCH_DEFAULT
/* Set default value from kernel config */
static int __init sch_default_qdisc(void)
{
	return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
}
/* late_initcall: runs after built-in qdisc modules have registered. */
late_initcall(sch_default_qdisc);
#endif
 257
 258/* We know handle. Find qdisc among all qdisc's attached to device
 259 * (root qdisc, all its children, children of children etc.)
 260 * Note: caller either uses rtnl or rcu_read_lock()
 261 */
 262
static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
{
	struct Qdisc *q;

	/* A root without a device (e.g. a builtin like noop_qdisc) is not
	 * in any per-device hash; it can only match itself.
	 */
	if (!qdisc_dev(root))
		return (root->handle == handle ? root : NULL);

	if (!(root->flags & TCQ_F_BUILTIN) &&
	    root->handle == handle)
		return root;

	/* Children live in the per-device RCU hash keyed by handle. */
	hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle,
				   lockdep_rtnl_is_held()) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}
 281
 282void qdisc_hash_add(struct Qdisc *q, bool invisible)
 283{
 284        if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
 285                ASSERT_RTNL();
 286                hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
 287                if (invisible)
 288                        q->flags |= TCQ_F_INVISIBLE;
 289        }
 290}
 291EXPORT_SYMBOL(qdisc_hash_add);
 292
 293void qdisc_hash_del(struct Qdisc *q)
 294{
 295        if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
 296                ASSERT_RTNL();
 297                hash_del_rcu(&q->hash);
 298        }
 299}
 300EXPORT_SYMBOL(qdisc_hash_del);
 301
 302struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
 303{
 304        struct Qdisc *q;
 305
 306        if (!handle)
 307                return NULL;
 308        q = qdisc_match_from_root(dev->qdisc, handle);
 309        if (q)
 310                goto out;
 311
 312        if (dev_ingress_queue(dev))
 313                q = qdisc_match_from_root(
 314                        dev_ingress_queue(dev)->qdisc_sleeping,
 315                        handle);
 316out:
 317        return q;
 318}
 319
 320struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
 321{
 322        struct netdev_queue *nq;
 323        struct Qdisc *q;
 324
 325        if (!handle)
 326                return NULL;
 327        q = qdisc_match_from_root(dev->qdisc, handle);
 328        if (q)
 329                goto out;
 330
 331        nq = dev_ingress_queue_rcu(dev);
 332        if (nq)
 333                q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
 334out:
 335        return q;
 336}
 337
 338static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
 339{
 340        unsigned long cl;
 341        const struct Qdisc_class_ops *cops = p->ops->cl_ops;
 342
 343        if (cops == NULL)
 344                return NULL;
 345        cl = cops->find(p, classid);
 346
 347        if (cl == 0)
 348                return NULL;
 349        return cops->leaf(p, cl);
 350}
 351
 352/* Find queueing discipline by name */
 353
 354static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
 355{
 356        struct Qdisc_ops *q = NULL;
 357
 358        if (kind) {
 359                read_lock(&qdisc_mod_lock);
 360                for (q = qdisc_base; q; q = q->next) {
 361                        if (nla_strcmp(kind, q->id) == 0) {
 362                                if (!try_module_get(q->owner))
 363                                        q = NULL;
 364                                break;
 365                        }
 366                }
 367                read_unlock(&qdisc_mod_lock);
 368        }
 369        return q;
 370}
 371
 372/* The linklayer setting were not transferred from iproute2, in older
 373 * versions, and the rate tables lookup systems have been dropped in
 374 * the kernel. To keep backward compatible with older iproute2 tc
 375 * utils, we detect the linklayer setting by detecting if the rate
 376 * table were modified.
 377 *
 378 * For linklayer ATM table entries, the rate table will be aligned to
 379 * 48 bytes, thus some table entries will contain the same value.  The
 380 * mpu (min packet unit) is also encoded into the old rate table, thus
 381 * starting from the mpu, we find low and high table entries for
  382 * mapping this cell.  If these entries contain the same value, then
  383 * the rate tables have been modified for linklayer ATM.
 384 *
 385 * This is done by rounding mpu to the nearest 48 bytes cell/entry,
 386 * and then roundup to the next cell, calc the table entry one below,
 387 * and compare.
 388 */
 389static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
 390{
 391        int low       = roundup(r->mpu, 48);
 392        int high      = roundup(low+1, 48);
 393        int cell_low  = low >> r->cell_log;
 394        int cell_high = (high >> r->cell_log) - 1;
 395
 396        /* rtab is too inaccurate at rates > 100Mbit/s */
 397        if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
 398                pr_debug("TC linklayer: Giving up ATM detection\n");
 399                return TC_LINKLAYER_ETHERNET;
 400        }
 401
 402        if ((cell_high > cell_low) && (cell_high < 256)
 403            && (rtab[cell_low] == rtab[cell_high])) {
 404                pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
 405                         cell_low, cell_high, rtab[cell_high]);
 406                return TC_LINKLAYER_ATM;
 407        }
 408        return TC_LINKLAYER_ETHERNET;
 409}
 410
 411static struct qdisc_rate_table *qdisc_rtab_list;
 412
/* Look up or create a shared rate table for ratespec @r with table data
 * in attribute @tab. Identical tables are deduplicated via a refcount.
 * Returns NULL on invalid parameters or allocation failure (with an
 * extack message set).
 */
struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
					struct nlattr *tab,
					struct netlink_ext_ack *extack)
{
	struct qdisc_rate_table *rtab;

	/* Sanity-check the ratespec and the attribute payload size. */
	if (tab == NULL || r->rate == 0 ||
	    r->cell_log == 0 || r->cell_log >= 32 ||
	    nla_len(tab) != TC_RTAB_SIZE) {
		NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
		return NULL;
	}

	/* Reuse an existing table when both the spec and the raw data
	 * match. 1024 is the table data size in bytes (TC_RTAB_SIZE).
	 */
	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
		    !memcmp(&rtab->data, nla_data(tab), 1024)) {
			rtab->refcnt++;
			return rtab;
		}
	}

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, nla_data(tab), 1024);
		/* Old iproute2 never set linklayer; infer it from the data. */
		if (r->linklayer == TC_LINKLAYER_UNAWARE)
			r->linklayer = __detect_linklayer(r, rtab->data);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	} else {
		NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
	}
	return rtab;
}
EXPORT_SYMBOL(qdisc_get_rtab);
 449
 450void qdisc_put_rtab(struct qdisc_rate_table *tab)
 451{
 452        struct qdisc_rate_table *rtab, **rtabp;
 453
 454        if (!tab || --tab->refcnt)
 455                return;
 456
 457        for (rtabp = &qdisc_rtab_list;
 458             (rtab = *rtabp) != NULL;
 459             rtabp = &rtab->next) {
 460                if (rtab == tab) {
 461                        *rtabp = rtab->next;
 462                        kfree(rtab);
 463                        return;
 464                }
 465        }
 466}
 467EXPORT_SYMBOL(qdisc_put_rtab);
 468
/* Global list of shared, refcounted size tables (see qdisc_get_stab()). */
static LIST_HEAD(qdisc_stab_list);

/* Netlink policy for TCA_STAB nested attributes: a fixed-size sizespec
 * base plus an optional binary table of u16 slot sizes.
 */
static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};
 475
/* Parse a TCA_STAB attribute into a size table, deduplicating against
 * already-known tables via a refcount. Returns an ERR_PTR on parse or
 * validation failure, -ENOMEM on allocation failure.
 */
static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
					       struct netlink_ext_ack *extack)
{
	struct nlattr *tb[TCA_STAB_MAX + 1];
	struct qdisc_size_table *stab;
	struct tc_sizespec *s;
	unsigned int tsize = 0;
	u16 *tab = NULL;
	int err;

	err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
					  extack);
	if (err < 0)
		return ERR_PTR(err);
	if (!tb[TCA_STAB_BASE]) {
		NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
		return ERR_PTR(-EINVAL);
	}

	s = nla_data(tb[TCA_STAB_BASE]);

	/* A non-zero tsize promises a data table; fetch and size it. */
	if (s->tsize > 0) {
		if (!tb[TCA_STAB_DATA]) {
			NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
			return ERR_PTR(-EINVAL);
		}
		tab = nla_data(tb[TCA_STAB_DATA]);
		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
	}

	/* The attribute length must agree with the advertised tsize. */
	if (tsize != s->tsize || (!tab && tsize > 0)) {
		NL_SET_ERR_MSG(extack, "Invalid size of size table");
		return ERR_PTR(-EINVAL);
	}

	/* Share an existing table when spec and data both match. */
	list_for_each_entry(stab, &qdisc_stab_list, list) {
		if (memcmp(&stab->szopts, s, sizeof(*s)))
			continue;
		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
			continue;
		stab->refcnt++;
		return stab;
	}

	/* No match: allocate spec plus trailing data in one block. */
	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	stab->refcnt = 1;
	stab->szopts = *s;
	if (tsize > 0)
		memcpy(stab->data, tab, tsize * sizeof(u16));

	list_add_tail(&stab->list, &qdisc_stab_list);

	return stab;
}
 533
 534void qdisc_put_stab(struct qdisc_size_table *tab)
 535{
 536        if (!tab)
 537                return;
 538
 539        if (--tab->refcnt == 0) {
 540                list_del(&tab->list);
 541                kfree_rcu(tab, rcu);
 542        }
 543}
 544EXPORT_SYMBOL(qdisc_put_stab);
 545
 546static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
 547{
 548        struct nlattr *nest;
 549
 550        nest = nla_nest_start_noflag(skb, TCA_STAB);
 551        if (nest == NULL)
 552                goto nla_put_failure;
 553        if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
 554                goto nla_put_failure;
 555        nla_nest_end(skb, nest);
 556
 557        return skb->len;
 558
 559nla_put_failure:
 560        return -1;
 561}
 562
/* Recompute the scheduled packet length of @skb through size table @stab
 * and store it in qdisc_skb_cb(skb)->pkt_len. Slots beyond the table are
 * extrapolated linearly from the last entry.
 */
void __qdisc_calculate_pkt_len(struct sk_buff *skb,
			       const struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	/* An empty table means only the overhead adjustment applies. */
	if (unlikely(!stab->szopts.tsize))
		goto out;

	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		/* Past the end: scale the last cell by the number of whole
		 * table spans, plus the entry for the remainder.
		 */
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	/* Never report a non-positive length to the scheduler. */
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
 591
 592void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
 593{
 594        if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
 595                pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
 596                        txt, qdisc->ops->id, qdisc->handle >> 16);
 597                qdisc->flags |= TCQ_F_WARN_NONWC;
 598        }
 599}
 600EXPORT_SYMBOL(qdisc_warn_nonwc);
 601
 602static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
 603{
 604        struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
 605                                                 timer);
 606
 607        rcu_read_lock();
 608        __netif_schedule(qdisc_root(wd->qdisc));
 609        rcu_read_unlock();
 610
 611        return HRTIMER_NORESTART;
 612}
 613
/* Initialize a qdisc watchdog on clock @clockid.
 * hrtimer_init() must run first: it resets the timer before the
 * callback and owner fields are filled in.
 */
void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
				 clockid_t clockid)
{
	hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
	wd->timer.function = qdisc_watchdog;
	wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
 622
/* Convenience wrapper: initialize a watchdog on CLOCK_MONOTONIC. */
void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
}
EXPORT_SYMBOL(qdisc_watchdog_init);
 628
/* Arm the watchdog to fire in [expires, expires + delta_ns] (absolute ns).
 * Does nothing when the root qdisc is being deactivated, and avoids
 * reprogramming a queued timer whose expiry already falls in the window.
 */
void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
				      u64 delta_ns)
{
	if (test_bit(__QDISC_STATE_DEACTIVATED,
		     &qdisc_root_sleeping(wd->qdisc)->state))
		return;

	if (hrtimer_is_queued(&wd->timer)) {
		/* If timer is already set in [expires, expires + delta_ns],
		 * do not reprogram it.
		 * (Unsigned subtraction makes the check wrap-safe.)
		 */
		if (wd->last_expires - expires <= delta_ns)
			return;
	}

	wd->last_expires = expires;
	hrtimer_start_range_ns(&wd->timer,
			       ns_to_ktime(expires),
			       delta_ns,
			       HRTIMER_MODE_ABS_PINNED);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns);
 651
/* Cancel a pending watchdog, waiting for a running callback to finish. */
void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
 657
 658static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
 659{
 660        struct hlist_head *h;
 661        unsigned int i;
 662
 663        h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
 664
 665        if (h != NULL) {
 666                for (i = 0; i < n; i++)
 667                        INIT_HLIST_HEAD(&h[i]);
 668        }
 669        return h;
 670}
 671
/* Double the class hash table of @sch when its load factor exceeds 0.75.
 * The rehash itself runs under sch_tree_lock(); allocation happens before
 * taking the lock and failure is silently tolerated (the old table stays).
 */
void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	/* Move every class into its bucket under the new mask. */
	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash     = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	kvfree(ohash);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);
 707
 708int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
 709{
 710        unsigned int size = 4;
 711
 712        clhash->hash = qdisc_class_hash_alloc(size);
 713        if (!clhash->hash)
 714                return -ENOMEM;
 715        clhash->hashsize  = size;
 716        clhash->hashmask  = size - 1;
 717        clhash->hashelems = 0;
 718        return 0;
 719}
 720EXPORT_SYMBOL(qdisc_class_hash_init);
 721
/* Free the bucket array; callers must have emptied the table first. */
void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	kvfree(clhash->hash);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);
 727
 728void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
 729                             struct Qdisc_class_common *cl)
 730{
 731        unsigned int h;
 732
 733        INIT_HLIST_NODE(&cl->hnode);
 734        h = qdisc_class_hash(cl->classid, clhash->hashmask);
 735        hlist_add_head(&cl->hnode, &clhash->hash[h]);
 736        clhash->hashelems++;
 737}
 738EXPORT_SYMBOL(qdisc_class_hash_insert);
 739
/* Unlink class @cl from its bucket and decrement the element count. */
void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
 747
/* Allocate an unique handle from space managed by kernel
 * Possible range is [8000-FFFF]:0000 (0x8000 values)
 * Returns 0 when every candidate in the range is already taken.
 */
static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x8000;
	/* Rotating cursor shared across calls (RTNL serializes callers). */
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		/* Wrap back to the start of the range past TC_H_ROOT. */
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
		if (!qdisc_lookup(dev, autohandle))
			return autohandle;
		/* The lookup can be slow on busy devices; stay preemptible. */
		cond_resched();
	} while (--i > 0);

	return 0;
}
 767
/* Propagate a decrease of @n packets / @len bytes from @sch up through
 * every ancestor qdisc, fixing their qlen/backlog counters and notifying
 * parent classes that became empty so they can go passive.
 */
void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
{
	bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;
	bool notify;
	int drops;

	if (n == 0 && len == 0)
		return;
	/* n may be negative (backlog correction); never count that as drops. */
	drops = max_t(int, n, 0);
	rcu_read_lock();
	while ((parentid = sch->parent)) {
		/* Ingress qdiscs have no meaningful backlog above them. */
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			break;

		if (sch->flags & TCQ_F_NOPARENT)
			break;
		/* Notify parent qdisc only if child qdisc becomes empty.
		 *
		 * If child was empty even before update then backlog
		 * counter is screwed and we skip notification because
		 * parent class is already passive.
		 *
		 * If the original child was offloaded then it is allowed
		 * to be seem as empty, so the parent is notified anyway.
		 */
		notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
						       !qdisc_is_offloaded);
		/* TODO: perform the search on a per txq basis */
		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			WARN_ON_ONCE(parentid != TC_H_ROOT);
			break;
		}
		cops = sch->ops->cl_ops;
		if (notify && cops->qlen_notify) {
			cl = cops->find(sch, parentid);
			cops->qlen_notify(sch, cl);
		}
		sch->q.qlen -= n;
		sch->qstats.backlog -= len;
		__qdisc_qstats_drop(sch, drops);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
 816
 817int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
 818                              void *type_data)
 819{
 820        struct net_device *dev = qdisc_dev(sch);
 821        int err;
 822
 823        sch->flags &= ~TCQ_F_OFFLOADED;
 824        if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
 825                return 0;
 826
 827        err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
 828        if (err == -EOPNOTSUPP)
 829                return 0;
 830
 831        if (!err)
 832                sch->flags |= TCQ_F_OFFLOADED;
 833
 834        return err;
 835}
 836EXPORT_SYMBOL(qdisc_offload_dump_helper);
 837
/* Tell the driver that @new replaces @old under parent @sch, and report
 * a graft failure via extack — but only when the failure is visible to
 * the user (i.e. some involved qdisc was actually offloaded and this is
 * not a teardown graft to noop).
 */
void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
				struct Qdisc *new, struct Qdisc *old,
				enum tc_setup_type type, void *type_data,
				struct netlink_ext_ack *extack)
{
	bool any_qdisc_is_offloaded;
	int err;

	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
		return;

	err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);

	/* Don't report error if the graft is part of destroy operation. */
	if (!err || !new || new == &noop_qdisc)
		return;

	/* Don't report error if the parent, the old child and the new
	 * one are not offloaded.
	 */
	any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
	any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
	any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;

	if (any_qdisc_is_offloaded)
		NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
}
EXPORT_SYMBOL(qdisc_offload_graft_helper);
 866
/* Notify the driver that the device's root qdisc changed from @old to
 * @new. @sch is NULL because a root graft has no parent qdisc.
 */
static void qdisc_offload_graft_root(struct net_device *dev,
				     struct Qdisc *new, struct Qdisc *old,
				     struct netlink_ext_ack *extack)
{
	struct tc_root_qopt_offload graft_offload = {
		.command	= TC_ROOT_GRAFT,
		.handle		= new ? new->handle : 0,
		/* Either endpoint being ingress makes this an ingress graft. */
		.ingress	= (new && new->flags & TCQ_F_INGRESS) ||
				  (old && old->flags & TCQ_F_INGRESS),
	};

	qdisc_offload_graft_helper(dev, NULL, new, old,
				   TC_SETUP_ROOT_QDISC, &graft_offload, extack);
}
 881
/* Fill one RTM_NEWQDISC/RTM_DELQDISC netlink message describing @q into
 * @skb: the tcmsg header, kind string, shared block indexes, qdisc-specific
 * options, size table and statistics.
 *
 * Returns skb->len on success.  On failure the partially written message is
 * trimmed back off the skb and -1 is returned.
 */
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
                         u32 portid, u32 seq, u16 flags, int event)
{
        struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
        struct gnet_stats_queue __percpu *cpu_qstats = NULL;
        struct tcmsg *tcm;
        struct nlmsghdr  *nlh;
        unsigned char *b = skb_tail_pointer(skb);    /* rollback point on failure */
        struct gnet_dump d;
        struct qdisc_size_table *stab;
        u32 block_index;
        __u32 qlen;

        /* Called repeatedly from dump loops; give the scheduler a chance. */
        cond_resched();
        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
        if (!nlh)
                goto out_nlmsg_trim;
        tcm = nlmsg_data(nlh);
        tcm->tcm_family = AF_UNSPEC;
        tcm->tcm__pad1 = 0;
        tcm->tcm__pad2 = 0;
        tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
        tcm->tcm_parent = clid;
        tcm->tcm_handle = q->handle;
        /* tcm_info carries the current refcount out to userspace. */
        tcm->tcm_info = refcount_read(&q->refcnt);
        if (nla_put_string(skb, TCA_KIND, q->ops->id))
                goto nla_put_failure;
        /* Shared filter blocks are reported only when non-zero. */
        if (q->ops->ingress_block_get) {
                block_index = q->ops->ingress_block_get(q);
                if (block_index &&
                    nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
                        goto nla_put_failure;
        }
        if (q->ops->egress_block_get) {
                block_index = q->ops->egress_block_get(q);
                if (block_index &&
                    nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
                        goto nla_put_failure;
        }
        /* Qdisc-specific option dump (TCA_OPTIONS), if implemented. */
        if (q->ops->dump && q->ops->dump(q, skb) < 0)
                goto nla_put_failure;
        if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
                goto nla_put_failure;
        qlen = qdisc_qlen_sum(q);

        stab = rtnl_dereference(q->stab);
        if (stab && qdisc_dump_stab(skb, stab) < 0)
                goto nla_put_failure;

        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
                                         NULL, &d, TCA_PAD) < 0)
                goto nla_put_failure;

        if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
                goto nla_put_failure;

        /* Per-CPU counters, if present, are aggregated by the gnet_stats
         * copy helpers below.
         */
        if (qdisc_is_percpu_stats(q)) {
                cpu_bstats = q->cpu_bstats;
                cpu_qstats = q->cpu_qstats;
        }

        if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
                                  &d, cpu_bstats, &q->bstats) < 0 ||
            gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
            gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
                goto nla_put_failure;

        if (gnet_stats_finish_copy(&d) < 0)
                goto nla_put_failure;

        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
        return skb->len;

out_nlmsg_trim:
nla_put_failure:
        nlmsg_trim(skb, b);
        return -1;
}
 960
 961static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
 962{
 963        if (q->flags & TCQ_F_BUILTIN)
 964                return true;
 965        if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
 966                return true;
 967
 968        return false;
 969}
 970
 971static int qdisc_notify(struct net *net, struct sk_buff *oskb,
 972                        struct nlmsghdr *n, u32 clid,
 973                        struct Qdisc *old, struct Qdisc *new)
 974{
 975        struct sk_buff *skb;
 976        u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
 977
 978        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
 979        if (!skb)
 980                return -ENOBUFS;
 981
 982        if (old && !tc_qdisc_dump_ignore(old, false)) {
 983                if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
 984                                  0, RTM_DELQDISC) < 0)
 985                        goto err_out;
 986        }
 987        if (new && !tc_qdisc_dump_ignore(new, false)) {
 988                if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
 989                                  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
 990                        goto err_out;
 991        }
 992
 993        if (skb->len)
 994                return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
 995                                      n->nlmsg_flags & NLM_F_ECHO);
 996
 997err_out:
 998        kfree_skb(skb);
 999        return -EINVAL;
1000}
1001
1002static void notify_and_destroy(struct net *net, struct sk_buff *skb,
1003                               struct nlmsghdr *n, u32 clid,
1004                               struct Qdisc *old, struct Qdisc *new)
1005{
1006        if (new || old)
1007                qdisc_notify(net, skb, n, clid, old, new);
1008
1009        if (old)
1010                qdisc_put(old);
1011}
1012
1013static void qdisc_clear_nolock(struct Qdisc *sch)
1014{
1015        sch->flags &= ~TCQ_F_NOLOCK;
1016        if (!(sch->flags & TCQ_F_CPUSTATS))
1017                return;
1018
1019        free_percpu(sch->cpu_bstats);
1020        free_percpu(sch->cpu_qstats);
1021        sch->cpu_bstats = NULL;
1022        sch->cpu_qstats = NULL;
1023        sch->flags &= ~TCQ_F_CPUSTATS;
1024}
1025
/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev" (when "parent" is NULL, i.e. root/ingress graft).
 *
 * When appropriate send a netlink notification using 'skb'
 * and "n".
 *
 * On success, destroy old qdisc (the reference is dropped via
 * notify_and_destroy()).  Returns 0 or a negative errno.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
                       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
                       struct Qdisc *new, struct Qdisc *old,
                       struct netlink_ext_ack *extack)
{
        struct Qdisc *q = old;
        struct net *net = dev_net(dev);

        if (parent == NULL) {
                unsigned int i, num_q, ingress;

                /* Root graft: attach to every TX queue, or to the single
                 * ingress queue when either side is an ingress qdisc.
                 */
                ingress = 0;
                num_q = dev->num_tx_queues;
                if ((q && q->flags & TCQ_F_INGRESS) ||
                    (new && new->flags & TCQ_F_INGRESS)) {
                        num_q = 1;
                        ingress = 1;
                        if (!dev_ingress_queue(dev)) {
                                NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
                                return -ENOENT;
                        }
                }

                /* Quiesce the device while its qdiscs are swapped. */
                if (dev->flags & IFF_UP)
                        dev_deactivate(dev);

                qdisc_offload_graft_root(dev, new, old, extack);

                /* Qdiscs with ->attach (e.g. multi-queue roots) attach
                 * themselves to the queues below; skip manual grafting.
                 */
                if (new && new->ops->attach)
                        goto skip;

                for (i = 0; i < num_q; i++) {
                        struct netdev_queue *dev_queue = dev_ingress_queue(dev);

                        if (!ingress)
                                dev_queue = netdev_get_tx_queue(dev, i);

                        old = dev_graft_qdisc(dev_queue, new);
                        /* One reference per queue beyond the first. */
                        if (new && i > 0)
                                qdisc_refcount_inc(new);

                        if (!ingress)
                                qdisc_put(old);
                }

skip:
                if (!ingress) {
                        /* Notify about the old root before installing the
                         * replacement as dev->qdisc.
                         */
                        notify_and_destroy(net, skb, n, classid,
                                           dev->qdisc, new);
                        if (new && !new->ops->attach)
                                qdisc_refcount_inc(new);
                        dev->qdisc = new ? : &noop_qdisc;

                        if (new && new->ops->attach)
                                new->ops->attach(new);
                } else {
                        notify_and_destroy(net, skb, n, classid, old, new);
                }

                if (dev->flags & IFF_UP)
                        dev_activate(dev);
        } else {
                const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
                unsigned long cl;
                int err;

                /* Only support running class lockless if parent is lockless */
                if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK))
                        qdisc_clear_nolock(new);

                if (!cops || !cops->graft)
                        return -EOPNOTSUPP;

                cl = cops->find(parent, classid);
                if (!cl) {
                        NL_SET_ERR_MSG(extack, "Specified class not found");
                        return -ENOENT;
                }

                /* The class ops hand back the displaced child in 'old'. */
                err = cops->graft(parent, cl, new, &old, extack);
                if (err)
                        return err;
                notify_and_destroy(net, skb, n, classid, old, new);
        }
        return 0;
}
1121
1122static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1123                                   struct netlink_ext_ack *extack)
1124{
1125        u32 block_index;
1126
1127        if (tca[TCA_INGRESS_BLOCK]) {
1128                block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1129
1130                if (!block_index) {
1131                        NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1132                        return -EINVAL;
1133                }
1134                if (!sch->ops->ingress_block_set) {
1135                        NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1136                        return -EOPNOTSUPP;
1137                }
1138                sch->ops->ingress_block_set(sch, block_index);
1139        }
1140        if (tca[TCA_EGRESS_BLOCK]) {
1141                block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1142
1143                if (!block_index) {
1144                        NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1145                        return -EINVAL;
1146                }
1147                if (!sch->ops->egress_block_set) {
1148                        NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1149                        return -EOPNOTSUPP;
1150                }
1151                sch->ops->egress_block_set(sch, block_index);
1152        }
1153        return 0;
1154}
1155
1156/*
1157   Allocate and initialize new qdisc.
1158
1159   Parameters are passed via opt.
1160 */
1161
1162static struct Qdisc *qdisc_create(struct net_device *dev,
1163                                  struct netdev_queue *dev_queue,
1164                                  struct Qdisc *p, u32 parent, u32 handle,
1165                                  struct nlattr **tca, int *errp,
1166                                  struct netlink_ext_ack *extack)
1167{
1168        int err;
1169        struct nlattr *kind = tca[TCA_KIND];
1170        struct Qdisc *sch;
1171        struct Qdisc_ops *ops;
1172        struct qdisc_size_table *stab;
1173
1174        ops = qdisc_lookup_ops(kind);
1175#ifdef CONFIG_MODULES
1176        if (ops == NULL && kind != NULL) {
1177                char name[IFNAMSIZ];
1178                if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) {
1179                        /* We dropped the RTNL semaphore in order to
1180                         * perform the module load.  So, even if we
1181                         * succeeded in loading the module we have to
1182                         * tell the caller to replay the request.  We
1183                         * indicate this using -EAGAIN.
1184                         * We replay the request because the device may
1185                         * go away in the mean time.
1186                         */
1187                        rtnl_unlock();
1188                        request_module("sch_%s", name);
1189                        rtnl_lock();
1190                        ops = qdisc_lookup_ops(kind);
1191                        if (ops != NULL) {
1192                                /* We will try again qdisc_lookup_ops,
1193                                 * so don't keep a reference.
1194                                 */
1195                                module_put(ops->owner);
1196                                err = -EAGAIN;
1197                                goto err_out;
1198                        }
1199                }
1200        }
1201#endif
1202
1203        err = -ENOENT;
1204        if (!ops) {
1205                NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1206                goto err_out;
1207        }
1208
1209        sch = qdisc_alloc(dev_queue, ops, extack);
1210        if (IS_ERR(sch)) {
1211                err = PTR_ERR(sch);
1212                goto err_out2;
1213        }
1214
1215        sch->parent = parent;
1216
1217        if (handle == TC_H_INGRESS) {
1218                sch->flags |= TCQ_F_INGRESS;
1219                handle = TC_H_MAKE(TC_H_INGRESS, 0);
1220        } else {
1221                if (handle == 0) {
1222                        handle = qdisc_alloc_handle(dev);
1223                        if (handle == 0) {
1224                                NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
1225                                err = -ENOSPC;
1226                                goto err_out3;
1227                        }
1228                }
1229                if (!netif_is_multiqueue(dev))
1230                        sch->flags |= TCQ_F_ONETXQUEUE;
1231        }
1232
1233        sch->handle = handle;
1234
1235        /* This exist to keep backward compatible with a userspace
1236         * loophole, what allowed userspace to get IFF_NO_QUEUE
1237         * facility on older kernels by setting tx_queue_len=0 (prior
1238         * to qdisc init), and then forgot to reinit tx_queue_len
1239         * before again attaching a qdisc.
1240         */
1241        if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
1242                dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
1243                netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
1244        }
1245
1246        err = qdisc_block_indexes_set(sch, tca, extack);
1247        if (err)
1248                goto err_out3;
1249
1250        if (ops->init) {
1251                err = ops->init(sch, tca[TCA_OPTIONS], extack);
1252                if (err != 0)
1253                        goto err_out5;
1254        }
1255
1256        if (tca[TCA_STAB]) {
1257                stab = qdisc_get_stab(tca[TCA_STAB], extack);
1258                if (IS_ERR(stab)) {
1259                        err = PTR_ERR(stab);
1260                        goto err_out4;
1261                }
1262                rcu_assign_pointer(sch->stab, stab);
1263        }
1264        if (tca[TCA_RATE]) {
1265                seqcount_t *running;
1266
1267                err = -EOPNOTSUPP;
1268                if (sch->flags & TCQ_F_MQROOT) {
1269                        NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
1270                        goto err_out4;
1271                }
1272
1273                if (sch->parent != TC_H_ROOT &&
1274                    !(sch->flags & TCQ_F_INGRESS) &&
1275                    (!p || !(p->flags & TCQ_F_MQROOT)))
1276                        running = qdisc_root_sleeping_running(sch);
1277                else
1278                        running = &sch->running;
1279
1280                err = gen_new_estimator(&sch->bstats,
1281                                        sch->cpu_bstats,
1282                                        &sch->rate_est,
1283                                        NULL,
1284                                        running,
1285                                        tca[TCA_RATE]);
1286                if (err) {
1287                        NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
1288                        goto err_out4;
1289                }
1290        }
1291
1292        qdisc_hash_add(sch, false);
1293        trace_qdisc_create(ops, dev, parent);
1294
1295        return sch;
1296
1297err_out5:
1298        /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
1299        if (ops->destroy)
1300                ops->destroy(sch);
1301err_out3:
1302        dev_put(dev);
1303        qdisc_free(sch);
1304err_out2:
1305        module_put(ops->owner);
1306err_out:
1307        *errp = err;
1308        return NULL;
1309
1310err_out4:
1311        /*
1312         * Any broken qdiscs that would require a ops->reset() here?
1313         * The qdisc was never in action so it shouldn't be necessary.
1314         */
1315        qdisc_put_stab(rtnl_dereference(sch->stab));
1316        if (ops->destroy)
1317                ops->destroy(sch);
1318        goto err_out3;
1319}
1320
/* Change parameters of an existing qdisc: forward TCA_OPTIONS to the
 * qdisc's ->change op, replace the size table and the rate estimator.
 * Note that when TCA_STAB is absent any existing size table is removed.
 * Returns 0 or a negative errno.
 */
static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
                        struct netlink_ext_ack *extack)
{
        struct qdisc_size_table *ostab, *stab = NULL;
        int err = 0;

        if (tca[TCA_OPTIONS]) {
                if (!sch->ops->change) {
                        NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
                        return -EINVAL;
                }
                /* Block indexes can only be set at creation time. */
                if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
                        NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
                        return -EOPNOTSUPP;
                }
                err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
                if (err)
                        return err;
        }

        if (tca[TCA_STAB]) {
                stab = qdisc_get_stab(tca[TCA_STAB], extack);
                if (IS_ERR(stab))
                        return PTR_ERR(stab);
        }

        /* Publish the new stab before releasing the old one (RCU). */
        ostab = rtnl_dereference(sch->stab);
        rcu_assign_pointer(sch->stab, stab);
        qdisc_put_stab(ostab);

        if (tca[TCA_RATE]) {
                /* NB: ignores errors from replace_estimator
                   because change can't be undone. */
                if (sch->flags & TCQ_F_MQROOT)
                        goto out;
                gen_replace_estimator(&sch->bstats,
                                      sch->cpu_bstats,
                                      &sch->rate_est,
                                      NULL,
                                      qdisc_root_sleeping_running(sch),
                                      tca[TCA_RATE]);
        }
out:
        return 0;
}
1366
/* Walker state for the pre-graft loop detection in check_loop():
 * 'p' is the qdisc being grafted, 'depth' the current recursion depth.
 */
struct check_loop_arg {
        struct qdisc_walker     w;      /* must stay first: check_loop_fn() casts w back */
        struct Qdisc            *p;
        int                     depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl,
                         struct qdisc_walker *w);
1375
1376static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1377{
1378        struct check_loop_arg   arg;
1379
1380        if (q->ops->cl_ops == NULL)
1381                return 0;
1382
1383        arg.w.stop = arg.w.skip = arg.w.count = 0;
1384        arg.w.fn = check_loop_fn;
1385        arg.depth = depth;
1386        arg.p = p;
1387        q->ops->cl_ops->walk(q, &arg.w);
1388        return arg.w.stop ? -ELOOP : 0;
1389}
1390
1391static int
1392check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1393{
1394        struct Qdisc *leaf;
1395        const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1396        struct check_loop_arg *arg = (struct check_loop_arg *)w;
1397
1398        leaf = cops->leaf(q, cl);
1399        if (leaf) {
1400                if (leaf == arg->p || arg->depth > 7)
1401                        return -ELOOP;
1402                return check_loop(leaf, arg->p, arg->depth + 1);
1403        }
1404        return 0;
1405}
1406
/* Netlink attribute policy used when parsing RTM_*QDISC requests in this
 * file (see the nlmsg_parse_deprecated() calls below).
 */
const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
        [TCA_KIND]              = { .type = NLA_STRING },
        [TCA_RATE]              = { .type = NLA_BINARY,
                                    .len = sizeof(struct tc_estimator) },
        [TCA_STAB]              = { .type = NLA_NESTED },
        [TCA_DUMP_INVISIBLE]    = { .type = NLA_FLAG },
        [TCA_CHAIN]             = { .type = NLA_U32 },
        [TCA_INGRESS_BLOCK]     = { .type = NLA_U32 },
        [TCA_EGRESS_BLOCK]      = { .type = NLA_U32 },
};
1417
1418/*
1419 * Delete/get qdisc.
1420 */
1421
1422static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1423                        struct netlink_ext_ack *extack)
1424{
1425        struct net *net = sock_net(skb->sk);
1426        struct tcmsg *tcm = nlmsg_data(n);
1427        struct nlattr *tca[TCA_MAX + 1];
1428        struct net_device *dev;
1429        u32 clid;
1430        struct Qdisc *q = NULL;
1431        struct Qdisc *p = NULL;
1432        int err;
1433
1434        if ((n->nlmsg_type != RTM_GETQDISC) &&
1435            !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1436                return -EPERM;
1437
1438        err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1439                                     rtm_tca_policy, extack);
1440        if (err < 0)
1441                return err;
1442
1443        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1444        if (!dev)
1445                return -ENODEV;
1446
1447        clid = tcm->tcm_parent;
1448        if (clid) {
1449                if (clid != TC_H_ROOT) {
1450                        if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1451                                p = qdisc_lookup(dev, TC_H_MAJ(clid));
1452                                if (!p) {
1453                                        NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
1454                                        return -ENOENT;
1455                                }
1456                                q = qdisc_leaf(p, clid);
1457                        } else if (dev_ingress_queue(dev)) {
1458                                q = dev_ingress_queue(dev)->qdisc_sleeping;
1459                        }
1460                } else {
1461                        q = dev->qdisc;
1462                }
1463                if (!q) {
1464                        NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
1465                        return -ENOENT;
1466                }
1467
1468                if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
1469                        NL_SET_ERR_MSG(extack, "Invalid handle");
1470                        return -EINVAL;
1471                }
1472        } else {
1473                q = qdisc_lookup(dev, tcm->tcm_handle);
1474                if (!q) {
1475                        NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
1476                        return -ENOENT;
1477                }
1478        }
1479
1480        if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1481                NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1482                return -EINVAL;
1483        }
1484
1485        if (n->nlmsg_type == RTM_DELQDISC) {
1486                if (!clid) {
1487                        NL_SET_ERR_MSG(extack, "Classid cannot be zero");
1488                        return -EINVAL;
1489                }
1490                if (q->handle == 0) {
1491                        NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
1492                        return -ENOENT;
1493                }
1494                err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
1495                if (err != 0)
1496                        return err;
1497        } else {
1498                qdisc_notify(net, skb, n, clid, NULL, q);
1499        }
1500        return 0;
1501}
1502
1503/*
1504 * Create/change qdisc.
1505 */
1506
1507static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1508                           struct netlink_ext_ack *extack)
1509{
1510        struct net *net = sock_net(skb->sk);
1511        struct tcmsg *tcm;
1512        struct nlattr *tca[TCA_MAX + 1];
1513        struct net_device *dev;
1514        u32 clid;
1515        struct Qdisc *q, *p;
1516        int err;
1517
1518        if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1519                return -EPERM;
1520
1521replay:
1522        /* Reinit, just in case something touches this. */
1523        err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1524                                     rtm_tca_policy, extack);
1525        if (err < 0)
1526                return err;
1527
1528        tcm = nlmsg_data(n);
1529        clid = tcm->tcm_parent;
1530        q = p = NULL;
1531
1532        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1533        if (!dev)
1534                return -ENODEV;
1535
1536
1537        if (clid) {
1538                if (clid != TC_H_ROOT) {
1539                        if (clid != TC_H_INGRESS) {
1540                                p = qdisc_lookup(dev, TC_H_MAJ(clid));
1541                                if (!p) {
1542                                        NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1543                                        return -ENOENT;
1544                                }
1545                                q = qdisc_leaf(p, clid);
1546                        } else if (dev_ingress_queue_create(dev)) {
1547                                q = dev_ingress_queue(dev)->qdisc_sleeping;
1548                        }
1549                } else {
1550                        q = dev->qdisc;
1551                }
1552
1553                /* It may be default qdisc, ignore it */
1554                if (q && q->handle == 0)
1555                        q = NULL;
1556
1557                if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1558                        if (tcm->tcm_handle) {
1559                                if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1560                                        NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
1561                                        return -EEXIST;
1562                                }
1563                                if (TC_H_MIN(tcm->tcm_handle)) {
1564                                        NL_SET_ERR_MSG(extack, "Invalid minor handle");
1565                                        return -EINVAL;
1566                                }
1567                                q = qdisc_lookup(dev, tcm->tcm_handle);
1568                                if (!q)
1569                                        goto create_n_graft;
1570                                if (n->nlmsg_flags & NLM_F_EXCL) {
1571                                        NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1572                                        return -EEXIST;
1573                                }
1574                                if (tca[TCA_KIND] &&
1575                                    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1576                                        NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1577                                        return -EINVAL;
1578                                }
1579                                if (q == p ||
1580                                    (p && check_loop(q, p, 0))) {
1581                                        NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1582                                        return -ELOOP;
1583                                }
1584                                qdisc_refcount_inc(q);
1585                                goto graft;
1586                        } else {
1587                                if (!q)
1588                                        goto create_n_graft;
1589
1590                                /* This magic test requires explanation.
1591                                 *
1592                                 *   We know, that some child q is already
1593                                 *   attached to this parent and have choice:
1594                                 *   either to change it or to create/graft new one.
1595                                 *
1596                                 *   1. We are allowed to create/graft only
1597                                 *   if CREATE and REPLACE flags are set.
1598                                 *
1599                                 *   2. If EXCL is set, requestor wanted to say,
1600                                 *   that qdisc tcm_handle is not expected
1601                                 *   to exist, so that we choose create/graft too.
1602                                 *
1603                                 *   3. The last case is when no flags are set.
1604                                 *   Alas, it is sort of hole in API, we
1605                                 *   cannot decide what to do unambiguously.
1606                                 *   For now we select create/graft, if
1607                                 *   user gave KIND, which does not match existing.
1608                                 */
1609                                if ((n->nlmsg_flags & NLM_F_CREATE) &&
1610                                    (n->nlmsg_flags & NLM_F_REPLACE) &&
1611                                    ((n->nlmsg_flags & NLM_F_EXCL) ||
1612                                     (tca[TCA_KIND] &&
1613                                      nla_strcmp(tca[TCA_KIND], q->ops->id))))
1614                                        goto create_n_graft;
1615                        }
1616                }
1617        } else {
1618                if (!tcm->tcm_handle) {
1619                        NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1620                        return -EINVAL;
1621                }
1622                q = qdisc_lookup(dev, tcm->tcm_handle);
1623        }
1624
1625        /* Change qdisc parameters */
1626        if (!q) {
1627                NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1628                return -ENOENT;
1629        }
1630        if (n->nlmsg_flags & NLM_F_EXCL) {
1631                NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1632                return -EEXIST;
1633        }
1634        if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1635                NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1636                return -EINVAL;
1637        }
1638        err = qdisc_change(q, tca, extack);
1639        if (err == 0)
1640                qdisc_notify(net, skb, n, clid, NULL, q);
1641        return err;
1642
1643create_n_graft:
1644        if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1645                NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1646                return -ENOENT;
1647        }
1648        if (clid == TC_H_INGRESS) {
1649                if (dev_ingress_queue(dev)) {
1650                        q = qdisc_create(dev, dev_ingress_queue(dev), p,
1651                                         tcm->tcm_parent, tcm->tcm_parent,
1652                                         tca, &err, extack);
1653                } else {
1654                        NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1655                        err = -ENOENT;
1656                }
1657        } else {
1658                struct netdev_queue *dev_queue;
1659
1660                if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1661                        dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1662                else if (p)
1663                        dev_queue = p->dev_queue;
1664                else
1665                        dev_queue = netdev_get_tx_queue(dev, 0);
1666
1667                q = qdisc_create(dev, dev_queue, p,
1668                                 tcm->tcm_parent, tcm->tcm_handle,
1669                                 tca, &err, extack);
1670        }
1671        if (q == NULL) {
1672                if (err == -EAGAIN)
1673                        goto replay;
1674                return err;
1675        }
1676
1677graft:
1678        err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1679        if (err) {
1680                if (q)
1681                        qdisc_put(q);
1682                return err;
1683        }
1684
1685        return 0;
1686}
1687
/* Dump @root and, when @recur is set, every qdisc in its device's hash
 * table into @skb as part of a netlink dump.
 *
 * *q_idx_p/s_q_idx implement dump resumption: entries with an index below
 * s_q_idx were already emitted in a previous pass and are skipped; the
 * updated index is written back through q_idx_p.
 *
 * Returns 0 on success, -1 when the skb filled up (dump will resume).
 */
static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
                              struct netlink_callback *cb,
                              int *q_idx_p, int s_q_idx, bool recur,
                              bool dump_invisible)
{
        int ret = 0, q_idx = *q_idx_p;
        struct Qdisc *q;
        int b;

        if (!root)
                return 0;

        /* First the root itself ... */
        q = root;
        if (q_idx < s_q_idx) {
                q_idx++;
        } else {
                if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
                    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
                                  cb->nlh->nlmsg_seq, NLM_F_MULTI,
                                  RTM_NEWQDISC) <= 0)
                        goto done;
                q_idx++;
        }

        /* If dumping singletons, there is no qdisc_dev(root) and the singleton
         * itself has already been dumped.
         *
         * If we've already dumped the top-level (ingress) qdisc above and the global
         * qdisc hashtable, we don't want to hit it again
         */
        if (!qdisc_dev(root) || !recur)
                goto out;

        /* ... then every hashed qdisc of the device. */
        hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
                if (q_idx < s_q_idx) {
                        q_idx++;
                        continue;
                }
                if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
                    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
                                  cb->nlh->nlmsg_seq, NLM_F_MULTI,
                                  RTM_NEWQDISC) <= 0)
                        goto done;
                q_idx++;
        }

out:
        *q_idx_p = q_idx;
        return ret;
done:
        ret = -1;
        goto out;
}
1741
/* Netlink dump handler for RTM_GETQDISC: walk every device in the
 * namespace and dump its egress hierarchy and (non-recursively) its
 * ingress qdisc.  Resume state lives in cb->args[0] (device index)
 * and cb->args[1] (qdisc index within the current device).
 */
static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;
	const struct nlmsghdr *nlh = cb->nlh;
	struct nlattr *tca[TCA_MAX + 1];
	int err;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];

	idx = 0;
	ASSERT_RTNL();

	/* Parse request attributes (e.g. TCA_DUMP_INVISIBLE). */
	err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
				     rtm_tca_policy, cb->extack);
	if (err < 0)
		return err;

	for_each_netdev(net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		/* Only the device we stopped at keeps its saved qdisc
		 * cursor; later devices start from their first qdisc.
		 */
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;

		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
				       true, tca[TCA_DUMP_INVISIBLE]) < 0)
			goto done;

		dev_queue = dev_ingress_queue(dev);
		if (dev_queue &&
		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
				       &q_idx, s_q_idx, false,
				       tca[TCA_DUMP_INVISIBLE]) < 0)
			goto done;

cont:
		idx++;
	}

done:
	/* Save cursors for the next invocation of this dump. */
	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}
1793
1794
1795
1796/************************************************
1797 *      Traffic classes manipulation.           *
1798 ************************************************/
1799
/* Fill one traffic-class netlink message (tcmsg header, TCA_KIND, the
 * class's own dump attributes and its statistics) for class @cl of
 * qdisc @q into @skb.
 *
 * Returns skb->len on success; on any failure the partial message is
 * trimmed back off @skb and -1 is returned.
 */
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 portid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	/* Class dumps can iterate many classes; be preemption friendly. */
	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	/* NOTE(review): tcm_parent is seeded with the qdisc handle here;
	 * cl_ops->dump() below may overwrite it with the real parent —
	 * confirm against the individual class dump implementations.
	 */
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
1845
1846static int tclass_notify(struct net *net, struct sk_buff *oskb,
1847                         struct nlmsghdr *n, struct Qdisc *q,
1848                         unsigned long cl, int event)
1849{
1850        struct sk_buff *skb;
1851        u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1852        int err = 0;
1853
1854        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1855        if (!skb)
1856                return -ENOBUFS;
1857
1858        if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1859                kfree_skb(skb);
1860                return -EINVAL;
1861        }
1862
1863        err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1864                             n->nlmsg_flags & NLM_F_ECHO);
1865        if (err > 0)
1866                err = 0;
1867        return err;
1868}
1869
1870static int tclass_del_notify(struct net *net,
1871                             const struct Qdisc_class_ops *cops,
1872                             struct sk_buff *oskb, struct nlmsghdr *n,
1873                             struct Qdisc *q, unsigned long cl,
1874                             struct netlink_ext_ack *extack)
1875{
1876        u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1877        struct sk_buff *skb;
1878        int err = 0;
1879
1880        if (!cops->delete)
1881                return -EOPNOTSUPP;
1882
1883        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1884        if (!skb)
1885                return -ENOBUFS;
1886
1887        if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1888                           RTM_DELTCLASS) < 0) {
1889                kfree_skb(skb);
1890                return -EINVAL;
1891        }
1892
1893        err = cops->delete(q, cl, extack);
1894        if (err) {
1895                kfree_skb(skb);
1896                return err;
1897        }
1898
1899        err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1900                             n->nlmsg_flags & NLM_F_ECHO);
1901        if (err > 0)
1902                err = 0;
1903        return err;
1904}
1905
1906#ifdef CONFIG_NET_CLS
1907
/* Per-filter walk state used when rebinding filter results to a class. */
struct tcf_bind_args {
	struct tcf_walker w;	/* must be first: tcf_node_bind() casts back */
	unsigned long base;	/* class-walk cookie passed to bind_class() */
	unsigned long cl;	/* internal class handle to (re)bind to */
	u32 classid;		/* user-visible class id being rebound */
};
1914
1915static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1916{
1917        struct tcf_bind_args *a = (void *)arg;
1918
1919        if (tp->ops->bind_class) {
1920                struct Qdisc *q = tcf_block_q(tp->chain->block);
1921
1922                sch_tree_lock(q);
1923                tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
1924                sch_tree_unlock(q);
1925        }
1926        return 0;
1927}
1928
/* Per-class walk state for tc_bind_tclass(). */
struct tc_bind_class_args {
	struct qdisc_walker w;	/* must be first: walker callback casts back */
	unsigned long new_cl;	/* class handle filters should point at */
	u32 portid;		/* requester's netlink portid */
	u32 clid;		/* user-visible class id being rebound */
};
1935
/* qdisc_walker callback: for one class @cl, walk every filter on every
 * chain of its tcf_block and rebind the filters to a->new_cl (0 on
 * class deletion).  Always returns 0 so the class walk continues.
 *
 * NOTE(review): a->portid is filled in by tc_bind_tclass() but not used
 * here — confirm whether it is still needed.
 */
static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
				struct qdisc_walker *w)
{
	struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct tcf_block *block;
	struct tcf_chain *chain;

	block = cops->tcf_block(q, cl, NULL);
	if (!block)
		return 0;
	for (chain = tcf_get_next_chain(block, NULL);
	     chain;
	     chain = tcf_get_next_chain(block, chain)) {
		struct tcf_proto *tp;

		for (tp = tcf_get_next_proto(chain, NULL);
		     tp; tp = tcf_get_next_proto(chain, tp)) {
			struct tcf_bind_args arg = {};

			arg.w.fn = tcf_node_bind;
			arg.classid = a->clid;
			arg.base = cl;	/* cookie for bind_class() */
			arg.cl = a->new_cl;
			tp->ops->walk(tp, &arg.w, true);
		}
	}

	return 0;
}
1966
1967static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1968                           unsigned long new_cl)
1969{
1970        const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1971        struct tc_bind_class_args args = {};
1972
1973        if (!cops->tcf_block)
1974                return;
1975        args.portid = portid;
1976        args.clid = clid;
1977        args.new_cl = new_cl;
1978        args.w.fn = tc_bind_class_walker;
1979        q->ops->cl_ops->walk(q, &args.w);
1980}
1981
1982#else
1983
/* Without CONFIG_NET_CLS there are no filters to rebind: no-op stub. */
static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
			   unsigned long new_cl)
{
}
1988
1989#endif
1990
/* RTM_{NEW,DEL,GET}TCLASS handler: create, change, delete or query a
 * traffic class of a classful qdisc.
 *
 * Resolves the owning qdisc from tcm_parent/tcm_handle (see the table
 * below), then dispatches on the message type.  Creation and change are
 * delegated to the qdisc's cl_ops->change(); notifications go out on
 * RTNLGRP_TC.  Returns 0 on success or a negative errno.
 */
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
			 struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 portid;
	u32 clid;
	u32 qid;
	int err;

	/* Only GET is allowed without CAP_NET_ADMIN in the owning netns. */
	if ((n->nlmsg_type != RTM_GETTCLASS) &&
	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
				     rtm_tca_policy, extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0         - parent is root class.
	   parent == X:Y         - parent is a node in hierarchy.
	   parent == 0:Y         - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0         - generate handle from kernel pool.
	   handle == 0:Y         - class is X:Y, where X:0 is qdisc.
	   handle == X:Y         - clear.
	   handle == X:0         - root class.
	 */

	/* Step 1. Determine qdisc handle X:0 */

	portid = tcm->tcm_parent;
	clid = tcm->tcm_handle;
	qid = TC_H_MAJ(clid);

	if (portid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(portid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc->handle;

		/* Now qid is genuine qdisc handle consistent
		 * both with parent and child.
		 *
		 * TC_H_MAJ(portid) still may be unspecified, complete it now.
		 */
		if (portid)
			portid = TC_H_MAKE(qid, portid);
	} else {
		if (qid == 0)
			qid = dev->qdisc->handle;
	}

	/* OK. Locate qdisc */
	q = qdisc_lookup(dev, qid);
	if (!q)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (portid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->find(q, clid);

	if (cl == 0) {
		/* Unknown class: only NEW with NLM_F_CREATE may proceed. */
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS ||
		    !(n->nlmsg_flags & NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags & NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = tclass_del_notify(net, cops, skb, n, q, cl, extack);
			/* Unbind the class's filters by rebinding them to 0 */
			tc_bind_tclass(q, portid, clid, 0);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
		NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
		return -EOPNOTSUPP;
	}

	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, portid, tca, &new_cl, extack);
	if (err == 0) {
		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
		/* We just create a new class, need to do reverse binding. */
		if (cl != new_cl)
			tc_bind_tclass(q, portid, clid, new_cl);
	}
out:
	return err;
}
2127
/* Walker state for dumping classes: pairs the qdisc_walker callback
 * with the netlink dump skb/callback it writes into.
 */
struct qdisc_dump_args {
	struct qdisc_walker	w;	/* must be first: cast back in qdisc_class_dump() */
	struct sk_buff		*skb;	/* dump skb being filled */
	struct netlink_callback *cb;	/* dump context (portid, seq, args) */
};
2133
2134static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2135                            struct qdisc_walker *arg)
2136{
2137        struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2138
2139        return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2140                              a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2141                              RTM_NEWTCLASS);
2142}
2143
/* Dump the classes of a single qdisc @q.
 *
 * @t_p: in/out index of qdiscs visited so far; qdiscs below @s_t were
 *	 fully handled in a previous dump pass and are skipped.
 * cb->args[1] carries the class-walk position inside the current qdisc
 * so an interrupted class walk resumes where it stopped.
 *
 * Returns 0 to continue the dump, -1 when @skb filled up.
 */
static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	/* Skip: invisible qdiscs, already-dumped qdiscs, classless
	 * qdiscs, and qdiscs not matching a requested parent major.
	 */
	if (tc_qdisc_dump_ignore(q, false) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	/* A fresh qdisc (past the resume point) starts its class walk
	 * from scratch: clear the saved per-qdisc cursor state.
	 */
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop  = 0;
	arg.w.skip = cb->args[1];
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}
2172
/* Dump the classes of @root and, when @recur is set, of every qdisc
 * hashed on its device (or only of the qdisc matching tcm_parent when
 * the request names one).  @recur is false for the ingress root, which
 * is dumped as a singleton.
 *
 * Returns 0 when done, -1 when @skb filled up (dump resumed later).
 */
static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
			       int *t_p, int s_t, bool recur)
{
	struct Qdisc *q;
	int b;

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

	if (!qdisc_dev(root) || !recur)
		return 0;

	if (tcm->tcm_parent) {
		/* Caller asked for one parent: dump only that qdisc. */
		q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
		if (q && q != root &&
		    tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
		return 0;
	}
	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}
2203
/* Netlink dump handler for RTM_GETTCLASS: dump classes of the egress
 * hierarchy and of the ingress qdisc of one device (tcm_ifindex).
 * cb->args[0] holds the qdisc index to resume from.
 */
static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = nlmsg_data(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
		return 0;
	/* Takes a reference on the device; dropped at the end. */
	dev = dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return 0;

	s_t = cb->args[0];
	t = 0;

	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t, true) < 0)
		goto done;

	dev_queue = dev_ingress_queue(dev);
	if (dev_queue &&
	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
				&t, s_t, false) < 0)
		goto done;

done:
	/* Save the resume cursor for the next dump invocation. */
	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}
2236
2237#ifdef CONFIG_PROC_FS
/* /proc/net/psched: four hex fields describing the scheduler clock.
 * The first two relate PSCHED ticks to nanoseconds; the third is a
 * fixed constant (1000000, presumably kept for ABI compatibility —
 * TODO confirm); the fourth is NSEC_PER_SEC / hrtimer_resolution.
 */
static int psched_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC / hrtimer_resolution);

	return 0;
}
2247
2248static int __net_init psched_net_init(struct net *net)
2249{
2250        struct proc_dir_entry *e;
2251
2252        e = proc_create_single("psched", 0, net->proc_net, psched_show);
2253        if (e == NULL)
2254                return -ENOMEM;
2255
2256        return 0;
2257}
2258
/* Remove the per-netns /proc/net/psched entry. */
static void __net_exit psched_net_exit(struct net *net)
{
	remove_proc_entry("psched", net->proc_net);
}
2263#else
/* CONFIG_PROC_FS disabled: nothing to create. */
static int __net_init psched_net_init(struct net *net)
{
	return 0;
}
2268
/* CONFIG_PROC_FS disabled: nothing to remove. */
static void __net_exit psched_net_exit(struct net *net)
{
}
2272#endif
2273
/* Per-network-namespace lifetime hooks for /proc/net/psched. */
static struct pernet_operations psched_net_ops = {
	.init = psched_net_init,
	.exit = psched_net_exit,
};
2278
/* Subsystem init: register the per-netns proc hooks, the built-in
 * qdiscs that must always be available, and the rtnetlink handlers
 * for qdisc and class manipulation.
 */
static int __init pktsched_init(void)
{
	int err;

	err = register_pernet_subsys(&psched_net_ops);
	if (err) {
		pr_err("pktsched_init: "
		       "cannot initialize per netns operations\n");
		return err;
	}

	/* NOTE(review): register_qdisc() return values are ignored here;
	 * registration of built-in qdiscs is assumed infallible — confirm.
	 */
	register_qdisc(&pfifo_fast_ops);
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	register_qdisc(&pfifo_head_drop_qdisc_ops);
	register_qdisc(&mq_qdisc_ops);
	register_qdisc(&noqueue_qdisc_ops);

	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
		      0);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
		      0);

	return 0;
}

subsys_initcall(pktsched_init);
2310