linux/net/sched/sch_api.c
<<
>>
Prefs
   1/*
   2 * net/sched/sch_api.c  Packet scheduler API.
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10 *
  11 * Fixes:
  12 *
  13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
  14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
  15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
  16 */
  17
  18#include <linux/module.h>
  19#include <linux/types.h>
  20#include <linux/kernel.h>
  21#include <linux/string.h>
  22#include <linux/errno.h>
  23#include <linux/skbuff.h>
  24#include <linux/init.h>
  25#include <linux/proc_fs.h>
  26#include <linux/seq_file.h>
  27#include <linux/kmod.h>
  28#include <linux/list.h>
  29#include <linux/hrtimer.h>
  30#include <linux/lockdep.h>
  31#include <linux/slab.h>
  32
  33#include <net/net_namespace.h>
  34#include <net/sock.h>
  35#include <net/netlink.h>
  36#include <net/pkt_sched.h>
  37
  38static int qdisc_notify(struct net *net, struct sk_buff *oskb,
  39                        struct nlmsghdr *n, u32 clid,
  40                        struct Qdisc *old, struct Qdisc *new);
  41static int tclass_notify(struct net *net, struct sk_buff *oskb,
  42                         struct nlmsghdr *n, struct Qdisc *q,
  43                         unsigned long cl, int event);
  44
  45/*
  46
  47   Short review.
  48   -------------
  49
  50   This file consists of two interrelated parts:
  51
  52   1. queueing disciplines manager frontend.
  53   2. traffic classes manager frontend.
  54
  55   Generally, queueing discipline ("qdisc") is a black box,
  56   which is able to enqueue packets and to dequeue them (when
  57   device is ready to send something) in order and at times
  58   determined by algorithm hidden in it.
  59
   60   qdiscs are divided into two categories:
   61   - "queues", which have no internal structure visible from outside.
   62   - "schedulers", which split all the packets into "traffic classes",
   63     using "packet classifiers" (see cls_api.c)
  64
  65   In turn, classes may have child qdiscs (as rule, queues)
  66   attached to them etc. etc. etc.
  67
   68   The goal of the routines in this file is to translate
   69   information supplied by the user in the form of handles
   70   into a form more intelligible to the kernel, to perform sanity
   71   checks and the part of the work that is common to all qdiscs,
   72   and to provide rtnetlink notifications.
  73
  74   All real intelligent work is done inside qdisc modules.
  75
  76
  77
  78   Every discipline has two major routines: enqueue and dequeue.
  79
  80   ---dequeue
  81
  82   dequeue usually returns a skb to send. It is allowed to return NULL,
  83   but it does not mean that queue is empty, it just means that
  84   discipline does not want to send anything this time.
  85   Queue is really empty if q->q.qlen == 0.
  86   For complicated disciplines with multiple queues q->q is not
  87   real packet queue, but however q->q.qlen must be valid.
  88
  89   ---enqueue
  90
   91   enqueue returns 0 if the packet was enqueued successfully.
   92   If a packet (this one or another one) was dropped, it returns
   93   a non-zero error code.
  94   NET_XMIT_DROP        - this packet dropped
  95     Expected action: do not backoff, but wait until queue will clear.
  96   NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
  97     Expected action: backoff or ignore
  98   NET_XMIT_POLICED     - dropped by police.
  99     Expected action: backoff or error to real-time apps.
 100
 101   Auxiliary routines:
 102
 103   ---peek
 104
 105   like dequeue but without removing a packet from the queue
 106
 107   ---reset
 108
 109   returns qdisc to initial state: purge all buffers, clear all
 110   timers, counters (except for statistics) etc.
 111
 112   ---init
 113
 114   initializes newly created qdisc.
 115
 116   ---destroy
 117
 118   destroys resources allocated by init and during lifetime of qdisc.
 119
 120   ---change
 121
 122   changes qdisc parameters.
 123 */
 124
 125/* Protects list of registered TC modules. It is pure SMP lock. */
 126static DEFINE_RWLOCK(qdisc_mod_lock);
 127
 128
 129/************************************************
 130 *      Queueing disciplines manipulation.      *
 131 ************************************************/
 132
 133
 134/* The list of all installed queueing disciplines. */
 135
 136static struct Qdisc_ops *qdisc_base;
 137
 138/* Register/unregister queueing discipline */
 139
 140int register_qdisc(struct Qdisc_ops *qops)
 141{
 142        struct Qdisc_ops *q, **qp;
 143        int rc = -EEXIST;
 144
 145        write_lock(&qdisc_mod_lock);
 146        for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
 147                if (!strcmp(qops->id, q->id))
 148                        goto out;
 149
 150        if (qops->enqueue == NULL)
 151                qops->enqueue = noop_qdisc_ops.enqueue;
 152        if (qops->peek == NULL) {
 153                if (qops->dequeue == NULL)
 154                        qops->peek = noop_qdisc_ops.peek;
 155                else
 156                        goto out_einval;
 157        }
 158        if (qops->dequeue == NULL)
 159                qops->dequeue = noop_qdisc_ops.dequeue;
 160
 161        if (qops->cl_ops) {
 162                const struct Qdisc_class_ops *cops = qops->cl_ops;
 163
 164                if (!(cops->get && cops->put && cops->walk && cops->leaf))
 165                        goto out_einval;
 166
 167                if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
 168                        goto out_einval;
 169        }
 170
 171        qops->next = NULL;
 172        *qp = qops;
 173        rc = 0;
 174out:
 175        write_unlock(&qdisc_mod_lock);
 176        return rc;
 177
 178out_einval:
 179        rc = -EINVAL;
 180        goto out;
 181}
 182EXPORT_SYMBOL(register_qdisc);
 183
 184int unregister_qdisc(struct Qdisc_ops *qops)
 185{
 186        struct Qdisc_ops *q, **qp;
 187        int err = -ENOENT;
 188
 189        write_lock(&qdisc_mod_lock);
 190        for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
 191                if (q == qops)
 192                        break;
 193        if (q) {
 194                *qp = q->next;
 195                q->next = NULL;
 196                err = 0;
 197        }
 198        write_unlock(&qdisc_mod_lock);
 199        return err;
 200}
 201EXPORT_SYMBOL(unregister_qdisc);
 202
 203/* Get default qdisc if not otherwise specified */
 204void qdisc_get_default(char *name, size_t len)
 205{
 206        read_lock(&qdisc_mod_lock);
 207        strlcpy(name, default_qdisc_ops->id, len);
 208        read_unlock(&qdisc_mod_lock);
 209}
 210
 211static struct Qdisc_ops *qdisc_lookup_default(const char *name)
 212{
 213        struct Qdisc_ops *q = NULL;
 214
 215        for (q = qdisc_base; q; q = q->next) {
 216                if (!strcmp(name, q->id)) {
 217                        if (!try_module_get(q->owner))
 218                                q = NULL;
 219                        break;
 220                }
 221        }
 222
 223        return q;
 224}
 225
 226/* Set new default qdisc to use */
 227int qdisc_set_default(const char *name)
 228{
 229        const struct Qdisc_ops *ops;
 230
 231        if (!capable(CAP_NET_ADMIN))
 232                return -EPERM;
 233
 234        write_lock(&qdisc_mod_lock);
 235        ops = qdisc_lookup_default(name);
 236        if (!ops) {
 237                /* Not found, drop lock and try to load module */
 238                write_unlock(&qdisc_mod_lock);
 239                request_module("sch_%s", name);
 240                write_lock(&qdisc_mod_lock);
 241
 242                ops = qdisc_lookup_default(name);
 243        }
 244
 245        if (ops) {
 246                /* Set new default */
 247                module_put(default_qdisc_ops->owner);
 248                default_qdisc_ops = ops;
 249        }
 250        write_unlock(&qdisc_mod_lock);
 251
 252        return ops ? 0 : -ENOENT;
 253}
 254
 255/* We know handle. Find qdisc among all qdisc's attached to device
 256   (root qdisc, all its children, children of children etc.)
 257 */
 258
 259static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
 260{
 261        struct Qdisc *q;
 262
 263        if (!(root->flags & TCQ_F_BUILTIN) &&
 264            root->handle == handle)
 265                return root;
 266
 267        list_for_each_entry(q, &root->list, list) {
 268                if (q->handle == handle)
 269                        return q;
 270        }
 271        return NULL;
 272}
 273
 274void qdisc_list_add(struct Qdisc *q)
 275{
 276        if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
 277                struct Qdisc *root = qdisc_dev(q)->qdisc;
 278
 279                WARN_ON_ONCE(root == &noop_qdisc);
 280                list_add_tail(&q->list, &root->list);
 281        }
 282}
 283EXPORT_SYMBOL(qdisc_list_add);
 284
 285void qdisc_list_del(struct Qdisc *q)
 286{
 287        if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
 288                list_del(&q->list);
 289}
 290EXPORT_SYMBOL(qdisc_list_del);
 291
 292struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
 293{
 294        struct Qdisc *q;
 295
 296        q = qdisc_match_from_root(dev->qdisc, handle);
 297        if (q)
 298                goto out;
 299
 300        if (dev_ingress_queue(dev))
 301                q = qdisc_match_from_root(
 302                        dev_ingress_queue(dev)->qdisc_sleeping,
 303                        handle);
 304out:
 305        return q;
 306}
 307
 308static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
 309{
 310        unsigned long cl;
 311        struct Qdisc *leaf;
 312        const struct Qdisc_class_ops *cops = p->ops->cl_ops;
 313
 314        if (cops == NULL)
 315                return NULL;
 316        cl = cops->get(p, classid);
 317
 318        if (cl == 0)
 319                return NULL;
 320        leaf = cops->leaf(p, cl);
 321        cops->put(p, cl);
 322        return leaf;
 323}
 324
 325/* Find queueing discipline by name */
 326
 327static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
 328{
 329        struct Qdisc_ops *q = NULL;
 330
 331        if (kind) {
 332                read_lock(&qdisc_mod_lock);
 333                for (q = qdisc_base; q; q = q->next) {
 334                        if (nla_strcmp(kind, q->id) == 0) {
 335                                if (!try_module_get(q->owner))
 336                                        q = NULL;
 337                                break;
 338                        }
 339                }
 340                read_unlock(&qdisc_mod_lock);
 341        }
 342        return q;
 343}
 344
 345/* The linklayer setting were not transferred from iproute2, in older
 346 * versions, and the rate tables lookup systems have been dropped in
 347 * the kernel. To keep backward compatible with older iproute2 tc
 348 * utils, we detect the linklayer setting by detecting if the rate
 349 * table were modified.
 350 *
 351 * For linklayer ATM table entries, the rate table will be aligned to
 352 * 48 bytes, thus some table entries will contain the same value.  The
 353 * mpu (min packet unit) is also encoded into the old rate table, thus
 354 * starting from the mpu, we find low and high table entries for
 355 * mapping this cell.  If these entries contain the same value, when
 356 * the rate tables have been modified for linklayer ATM.
 357 *
 358 * This is done by rounding mpu to the nearest 48 bytes cell/entry,
 359 * and then roundup to the next cell, calc the table entry one below,
 360 * and compare.
 361 */
 362static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
 363{
 364        int low       = roundup(r->mpu, 48);
 365        int high      = roundup(low+1, 48);
 366        int cell_low  = low >> r->cell_log;
 367        int cell_high = (high >> r->cell_log) - 1;
 368
 369        /* rtab is too inaccurate at rates > 100Mbit/s */
 370        if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
 371                pr_debug("TC linklayer: Giving up ATM detection\n");
 372                return TC_LINKLAYER_ETHERNET;
 373        }
 374
 375        if ((cell_high > cell_low) && (cell_high < 256)
 376            && (rtab[cell_low] == rtab[cell_high])) {
 377                pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
 378                         cell_low, cell_high, rtab[cell_high]);
 379                return TC_LINKLAYER_ATM;
 380        }
 381        return TC_LINKLAYER_ETHERNET;
 382}
 383
 384static struct qdisc_rate_table *qdisc_rtab_list;
 385
 386struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
 387{
 388        struct qdisc_rate_table *rtab;
 389
 390        if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
 391            nla_len(tab) != TC_RTAB_SIZE)
 392                return NULL;
 393
 394        for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
 395                if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
 396                    !memcmp(&rtab->data, nla_data(tab), 1024)) {
 397                        rtab->refcnt++;
 398                        return rtab;
 399                }
 400        }
 401
 402        rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
 403        if (rtab) {
 404                rtab->rate = *r;
 405                rtab->refcnt = 1;
 406                memcpy(rtab->data, nla_data(tab), 1024);
 407                if (r->linklayer == TC_LINKLAYER_UNAWARE)
 408                        r->linklayer = __detect_linklayer(r, rtab->data);
 409                rtab->next = qdisc_rtab_list;
 410                qdisc_rtab_list = rtab;
 411        }
 412        return rtab;
 413}
 414EXPORT_SYMBOL(qdisc_get_rtab);
 415
 416void qdisc_put_rtab(struct qdisc_rate_table *tab)
 417{
 418        struct qdisc_rate_table *rtab, **rtabp;
 419
 420        if (!tab || --tab->refcnt)
 421                return;
 422
 423        for (rtabp = &qdisc_rtab_list;
 424             (rtab = *rtabp) != NULL;
 425             rtabp = &rtab->next) {
 426                if (rtab == tab) {
 427                        *rtabp = rtab->next;
 428                        kfree(rtab);
 429                        return;
 430                }
 431        }
 432}
 433EXPORT_SYMBOL(qdisc_put_rtab);
 434
 435static LIST_HEAD(qdisc_stab_list);
 436static DEFINE_SPINLOCK(qdisc_stab_lock);
 437
 438static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
 439        [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
 440        [TCA_STAB_DATA] = { .type = NLA_BINARY },
 441};
 442
 443static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
 444{
 445        struct nlattr *tb[TCA_STAB_MAX + 1];
 446        struct qdisc_size_table *stab;
 447        struct tc_sizespec *s;
 448        unsigned int tsize = 0;
 449        u16 *tab = NULL;
 450        int err;
 451
 452        err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
 453        if (err < 0)
 454                return ERR_PTR(err);
 455        if (!tb[TCA_STAB_BASE])
 456                return ERR_PTR(-EINVAL);
 457
 458        s = nla_data(tb[TCA_STAB_BASE]);
 459
 460        if (s->tsize > 0) {
 461                if (!tb[TCA_STAB_DATA])
 462                        return ERR_PTR(-EINVAL);
 463                tab = nla_data(tb[TCA_STAB_DATA]);
 464                tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
 465        }
 466
 467        if (tsize != s->tsize || (!tab && tsize > 0))
 468                return ERR_PTR(-EINVAL);
 469
 470        spin_lock(&qdisc_stab_lock);
 471
 472        list_for_each_entry(stab, &qdisc_stab_list, list) {
 473                if (memcmp(&stab->szopts, s, sizeof(*s)))
 474                        continue;
 475                if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
 476                        continue;
 477                stab->refcnt++;
 478                spin_unlock(&qdisc_stab_lock);
 479                return stab;
 480        }
 481
 482        spin_unlock(&qdisc_stab_lock);
 483
 484        stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
 485        if (!stab)
 486                return ERR_PTR(-ENOMEM);
 487
 488        stab->refcnt = 1;
 489        stab->szopts = *s;
 490        if (tsize > 0)
 491                memcpy(stab->data, tab, tsize * sizeof(u16));
 492
 493        spin_lock(&qdisc_stab_lock);
 494        list_add_tail(&stab->list, &qdisc_stab_list);
 495        spin_unlock(&qdisc_stab_lock);
 496
 497        return stab;
 498}
 499
 500static void stab_kfree_rcu(struct rcu_head *head)
 501{
 502        kfree(container_of(head, struct qdisc_size_table, rcu));
 503}
 504
 505void qdisc_put_stab(struct qdisc_size_table *tab)
 506{
 507        if (!tab)
 508                return;
 509
 510        spin_lock(&qdisc_stab_lock);
 511
 512        if (--tab->refcnt == 0) {
 513                list_del(&tab->list);
 514                call_rcu_bh(&tab->rcu, stab_kfree_rcu);
 515        }
 516
 517        spin_unlock(&qdisc_stab_lock);
 518}
 519EXPORT_SYMBOL(qdisc_put_stab);
 520
 521static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
 522{
 523        struct nlattr *nest;
 524
 525        nest = nla_nest_start(skb, TCA_STAB);
 526        if (nest == NULL)
 527                goto nla_put_failure;
 528        if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
 529                goto nla_put_failure;
 530        nla_nest_end(skb, nest);
 531
 532        return skb->len;
 533
 534nla_put_failure:
 535        return -1;
 536}
 537
 538void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
 539{
 540        int pkt_len, slot;
 541
 542        pkt_len = skb->len + stab->szopts.overhead;
 543        if (unlikely(!stab->szopts.tsize))
 544                goto out;
 545
 546        slot = pkt_len + stab->szopts.cell_align;
 547        if (unlikely(slot < 0))
 548                slot = 0;
 549
 550        slot >>= stab->szopts.cell_log;
 551        if (likely(slot < stab->szopts.tsize))
 552                pkt_len = stab->data[slot];
 553        else
 554                pkt_len = stab->data[stab->szopts.tsize - 1] *
 555                                (slot / stab->szopts.tsize) +
 556                                stab->data[slot % stab->szopts.tsize];
 557
 558        pkt_len <<= stab->szopts.size_log;
 559out:
 560        if (unlikely(pkt_len < 1))
 561                pkt_len = 1;
 562        qdisc_skb_cb(skb)->pkt_len = pkt_len;
 563}
 564EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
 565
 566void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
 567{
 568        if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
 569                pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
 570                        txt, qdisc->ops->id, qdisc->handle >> 16);
 571                qdisc->flags |= TCQ_F_WARN_NONWC;
 572        }
 573}
 574EXPORT_SYMBOL(qdisc_warn_nonwc);
 575
 576static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
 577{
 578        struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
 579                                                 timer);
 580
 581        rcu_read_lock();
 582        qdisc_unthrottled(wd->qdisc);
 583        __netif_schedule(qdisc_root(wd->qdisc));
 584        rcu_read_unlock();
 585
 586        return HRTIMER_NORESTART;
 587}
 588
 589void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
 590{
 591        hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
 592        wd->timer.function = qdisc_watchdog;
 593        wd->qdisc = qdisc;
 594}
 595EXPORT_SYMBOL(qdisc_watchdog_init);
 596
 597void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires, bool throttle)
 598{
 599        if (test_bit(__QDISC_STATE_DEACTIVATED,
 600                     &qdisc_root_sleeping(wd->qdisc)->state))
 601                return;
 602
 603        if (throttle)
 604                qdisc_throttled(wd->qdisc);
 605
 606        hrtimer_start(&wd->timer,
 607                      ns_to_ktime(expires),
 608                      HRTIMER_MODE_ABS_PINNED);
 609}
 610EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
 611
 612void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
 613{
 614        hrtimer_cancel(&wd->timer);
 615        qdisc_unthrottled(wd->qdisc);
 616}
 617EXPORT_SYMBOL(qdisc_watchdog_cancel);
 618
 619static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
 620{
 621        unsigned int size = n * sizeof(struct hlist_head), i;
 622        struct hlist_head *h;
 623
 624        if (size <= PAGE_SIZE)
 625                h = kmalloc(size, GFP_KERNEL);
 626        else
 627                h = (struct hlist_head *)
 628                        __get_free_pages(GFP_KERNEL, get_order(size));
 629
 630        if (h != NULL) {
 631                for (i = 0; i < n; i++)
 632                        INIT_HLIST_HEAD(&h[i]);
 633        }
 634        return h;
 635}
 636
 637static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
 638{
 639        unsigned int size = n * sizeof(struct hlist_head);
 640
 641        if (size <= PAGE_SIZE)
 642                kfree(h);
 643        else
 644                free_pages((unsigned long)h, get_order(size));
 645}
 646
 647void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
 648{
 649        struct Qdisc_class_common *cl;
 650        struct hlist_node *next;
 651        struct hlist_head *nhash, *ohash;
 652        unsigned int nsize, nmask, osize;
 653        unsigned int i, h;
 654
 655        /* Rehash when load factor exceeds 0.75 */
 656        if (clhash->hashelems * 4 <= clhash->hashsize * 3)
 657                return;
 658        nsize = clhash->hashsize * 2;
 659        nmask = nsize - 1;
 660        nhash = qdisc_class_hash_alloc(nsize);
 661        if (nhash == NULL)
 662                return;
 663
 664        ohash = clhash->hash;
 665        osize = clhash->hashsize;
 666
 667        sch_tree_lock(sch);
 668        for (i = 0; i < osize; i++) {
 669                hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
 670                        h = qdisc_class_hash(cl->classid, nmask);
 671                        hlist_add_head(&cl->hnode, &nhash[h]);
 672                }
 673        }
 674        clhash->hash     = nhash;
 675        clhash->hashsize = nsize;
 676        clhash->hashmask = nmask;
 677        sch_tree_unlock(sch);
 678
 679        qdisc_class_hash_free(ohash, osize);
 680}
 681EXPORT_SYMBOL(qdisc_class_hash_grow);
 682
 683int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
 684{
 685        unsigned int size = 4;
 686
 687        clhash->hash = qdisc_class_hash_alloc(size);
 688        if (clhash->hash == NULL)
 689                return -ENOMEM;
 690        clhash->hashsize  = size;
 691        clhash->hashmask  = size - 1;
 692        clhash->hashelems = 0;
 693        return 0;
 694}
 695EXPORT_SYMBOL(qdisc_class_hash_init);
 696
 697void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
 698{
 699        qdisc_class_hash_free(clhash->hash, clhash->hashsize);
 700}
 701EXPORT_SYMBOL(qdisc_class_hash_destroy);
 702
 703void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
 704                             struct Qdisc_class_common *cl)
 705{
 706        unsigned int h;
 707
 708        INIT_HLIST_NODE(&cl->hnode);
 709        h = qdisc_class_hash(cl->classid, clhash->hashmask);
 710        hlist_add_head(&cl->hnode, &clhash->hash[h]);
 711        clhash->hashelems++;
 712}
 713EXPORT_SYMBOL(qdisc_class_hash_insert);
 714
 715void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
 716                             struct Qdisc_class_common *cl)
 717{
 718        hlist_del(&cl->hnode);
 719        clhash->hashelems--;
 720}
 721EXPORT_SYMBOL(qdisc_class_hash_remove);
 722
 723/* Allocate an unique handle from space managed by kernel
 724 * Possible range is [8000-FFFF]:0000 (0x8000 values)
 725 */
 726static u32 qdisc_alloc_handle(struct net_device *dev)
 727{
 728        int i = 0x8000;
 729        static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
 730
 731        do {
 732                autohandle += TC_H_MAKE(0x10000U, 0);
 733                if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
 734                        autohandle = TC_H_MAKE(0x80000000U, 0);
 735                if (!qdisc_lookup(dev, autohandle))
 736                        return autohandle;
 737                cond_resched();
 738        } while (--i > 0);
 739
 740        return 0;
 741}
 742
 743void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
 744{
 745        const struct Qdisc_class_ops *cops;
 746        unsigned long cl;
 747        u32 parentid;
 748        int drops;
 749
 750        if (n == 0)
 751                return;
 752        drops = max_t(int, n, 0);
 753        while ((parentid = sch->parent)) {
 754                if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
 755                        return;
 756
 757                sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
 758                if (sch == NULL) {
 759                        WARN_ON(parentid != TC_H_ROOT);
 760                        return;
 761                }
 762                cops = sch->ops->cl_ops;
 763                if (cops->qlen_notify) {
 764                        cl = cops->get(sch, parentid);
 765                        cops->qlen_notify(sch, cl);
 766                        cops->put(sch, cl);
 767                }
 768                sch->q.qlen -= n;
 769                __qdisc_qstats_drop(sch, drops);
 770        }
 771}
 772EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
 773
 774static void notify_and_destroy(struct net *net, struct sk_buff *skb,
 775                               struct nlmsghdr *n, u32 clid,
 776                               struct Qdisc *old, struct Qdisc *new)
 777{
 778        if (new || old)
 779                qdisc_notify(net, skb, n, clid, old, new);
 780
 781        if (old)
 782                qdisc_destroy(old);
 783}
 784
  785/* Graft qdisc "new" to class "classid" of qdisc "parent" or
  786 * to device "dev".
  787 *
  788 * When appropriate send a netlink notification using 'skb'
  789 * and "n".
  790 *
  791 * On success, destroy old qdisc.
 *
 * With parent == NULL the graft happens at device level (on every tx
 * queue, or on the single ingress queue when either qdisc is an ingress
 * one); otherwise it is delegated to the parent's ->graft() class op.
 *
 * Returns 0 on success, -ENOENT if an ingress graft is requested on a
 * device without ingress queue or if the class is not found,
 * -EOPNOTSUPP if the parent has no ->graft() op, or the error returned
 * by ->graft().
  792 */
  793
  794static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
  795                       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
  796                       struct Qdisc *new, struct Qdisc *old)
  797{
  798        struct Qdisc *q = old;
  799        struct net *net = dev_net(dev);
  800        int err = 0;
  801
  802        if (parent == NULL) {
                     /* Device-level graft. */
  803                unsigned int i, num_q, ingress;
  804
  805                ingress = 0;
  806                num_q = dev->num_tx_queues;
                     /* An ingress qdisc on either side means only the
                      * single ingress queue is touched.
                      */
  807                if ((q && q->flags & TCQ_F_INGRESS) ||
  808                    (new && new->flags & TCQ_F_INGRESS)) {
  809                        num_q = 1;
  810                        ingress = 1;
  811                        if (!dev_ingress_queue(dev))
  812                                return -ENOENT;
  813                }
  814
                     /* A running device is deactivated for the swap and
                      * re-activated at the end of this branch.
                      */
  815                if (dev->flags & IFF_UP)
  816                        dev_deactivate(dev);
  817
                     /* A qdisc with an ->attach() op is not installed by
                      * the per-queue loop below; ->attach() is called
                      * after dev->qdisc has been updated.
                      */
  818                if (new && new->ops->attach)
  819                        goto skip;
  820
  821                for (i = 0; i < num_q; i++) {
  822                        struct netdev_queue *dev_queue = dev_ingress_queue(dev);
  823
  824                        if (!ingress)
  825                                dev_queue = netdev_get_tx_queue(dev, i);
  826
  827                        old = dev_graft_qdisc(dev_queue, new);
                             /* "new" is shared by all queues: one extra
                              * reference per queue beyond the first.
                              */
  828                        if (new && i > 0)
  829                                atomic_inc(&new->refcnt);
  830
                             /* Egress: drop the qdisc that was on this
                              * queue.  Ingress: "old" is destroyed by
                              * notify_and_destroy() below instead.
                              */
  831                        if (!ingress)
  832                                qdisc_destroy(old);
  833                }
  834
  835skip:
  836                if (!ingress) {
                             /* Egress: the previous dev->qdisc is reported
                              * as the old qdisc and destroyed.
                              */
  837                        notify_and_destroy(net, skb, n, classid,
  838                                           dev->qdisc, new);
                             /* Extra reference for the dev->qdisc pointer
                              * (not taken for qdiscs with ->attach()).
                              */
  839                        if (new && !new->ops->attach)
  840                                atomic_inc(&new->refcnt);
                             /* No new qdisc: fall back to noop_qdisc. */
  841                        dev->qdisc = new ? : &noop_qdisc;
  842
  843                        if (new && new->ops->attach)
  844                                new->ops->attach(new);
  845                } else {
  846                        notify_and_destroy(net, skb, n, classid, old, new);
  847                }
  848
  849                if (dev->flags & IFF_UP)
  850                        dev_activate(dev);
  851        } else {
                     /* Class-level graft, delegated to the parent qdisc. */
  852                const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
  853
  854                err = -EOPNOTSUPP;
  855                if (cops && cops->graft) {
  856                        unsigned long cl = cops->get(parent, classid);
  857                        if (cl) {
                                     /* ->graft() receives &old and may
                                      * update it.
                                      */
  858                                err = cops->graft(parent, cl, new, &old);
  859                                cops->put(parent, cl);
  860                        } else
  861                                err = -ENOENT;
  862                }
                     /* Notify and destroy "old" only if the graft worked. */
  863                if (!err)
  864                        notify_and_destroy(net, skb, n, classid, old, new);
  865        }
  866        return err;
  867}
 868
 869/* lockdep annotation is needed for ingress; egress gets it only for name */
 870static struct lock_class_key qdisc_tx_lock;
 871static struct lock_class_key qdisc_rx_lock;
 872
 873/*
 874   Allocate and initialize new qdisc.
 875
 876   Parameters are passed via opt.
 877 */
 878
 879static struct Qdisc *
 880qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
 881             struct Qdisc *p, u32 parent, u32 handle,
 882             struct nlattr **tca, int *errp)
 883{
 884        int err;
 885        struct nlattr *kind = tca[TCA_KIND];
 886        struct Qdisc *sch;
 887        struct Qdisc_ops *ops;
 888        struct qdisc_size_table *stab;
 889
 890        ops = qdisc_lookup_ops(kind);
 891#ifdef CONFIG_MODULES
 892        if (ops == NULL && kind != NULL) {
 893                char name[IFNAMSIZ];
 894                if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
 895                        /* We dropped the RTNL semaphore in order to
 896                         * perform the module load.  So, even if we
 897                         * succeeded in loading the module we have to
 898                         * tell the caller to replay the request.  We
 899                         * indicate this using -EAGAIN.
 900                         * We replay the request because the device may
 901                         * go away in the mean time.
 902                         */
 903                        rtnl_unlock();
 904                        request_module("sch_%s", name);
 905                        rtnl_lock();
 906                        ops = qdisc_lookup_ops(kind);
 907                        if (ops != NULL) {
 908                                /* We will try again qdisc_lookup_ops,
 909                                 * so don't keep a reference.
 910                                 */
 911                                module_put(ops->owner);
 912                                err = -EAGAIN;
 913                                goto err_out;
 914                        }
 915                }
 916        }
 917#endif
 918
 919        err = -ENOENT;
 920        if (ops == NULL)
 921                goto err_out;
 922
 923        sch = qdisc_alloc(dev_queue, ops);
 924        if (IS_ERR(sch)) {
 925                err = PTR_ERR(sch);
 926                goto err_out2;
 927        }
 928
 929        sch->parent = parent;
 930
 931        if (handle == TC_H_INGRESS) {
 932                sch->flags |= TCQ_F_INGRESS;
 933                handle = TC_H_MAKE(TC_H_INGRESS, 0);
 934                lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
 935        } else {
 936                if (handle == 0) {
 937                        handle = qdisc_alloc_handle(dev);
 938                        err = -ENOMEM;
 939                        if (handle == 0)
 940                                goto err_out3;
 941                }
 942                lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
 943                if (!netif_is_multiqueue(dev))
 944                        sch->flags |= TCQ_F_ONETXQUEUE;
 945        }
 946
 947        sch->handle = handle;
 948
 949        if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
 950                if (qdisc_is_percpu_stats(sch)) {
 951                        sch->cpu_bstats =
 952                                netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
 953                        if (!sch->cpu_bstats)
 954                                goto err_out4;
 955
 956                        sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
 957                        if (!sch->cpu_qstats)
 958                                goto err_out4;
 959                }
 960
 961                if (tca[TCA_STAB]) {
 962                        stab = qdisc_get_stab(tca[TCA_STAB]);
 963                        if (IS_ERR(stab)) {
 964                                err = PTR_ERR(stab);
 965                                goto err_out4;
 966                        }
 967                        rcu_assign_pointer(sch->stab, stab);
 968                }
 969                if (tca[TCA_RATE]) {
 970                        spinlock_t *root_lock;
 971
 972                        err = -EOPNOTSUPP;
 973                        if (sch->flags & TCQ_F_MQROOT)
 974                                goto err_out4;
 975
 976                        if ((sch->parent != TC_H_ROOT) &&
 977                            !(sch->flags & TCQ_F_INGRESS) &&
 978                            (!p || !(p->flags & TCQ_F_MQROOT)))
 979                                root_lock = qdisc_root_sleeping_lock(sch);
 980                        else
 981                                root_lock = qdisc_lock(sch);
 982
 983                        err = gen_new_estimator(&sch->bstats,
 984                                                sch->cpu_bstats,
 985                                                &sch->rate_est,
 986                                                root_lock,
 987                                                tca[TCA_RATE]);
 988                        if (err)
 989                                goto err_out4;
 990                }
 991
 992                qdisc_list_add(sch);
 993
 994                return sch;
 995        }
 996err_out3:
 997        dev_put(dev);
 998        kfree((char *) sch - sch->padded);
 999err_out2:
1000        module_put(ops->owner);
1001err_out:
1002        *errp = err;
1003        return NULL;
1004
1005err_out4:
1006        free_percpu(sch->cpu_bstats);
1007        free_percpu(sch->cpu_qstats);
1008        /*
1009         * Any broken qdiscs that would require a ops->reset() here?
1010         * The qdisc was never in action so it shouldn't be necessary.
1011         */
1012        qdisc_put_stab(rtnl_dereference(sch->stab));
1013        if (ops->destroy)
1014                ops->destroy(sch);
1015        goto err_out3;
1016}
1017
1018static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
1019{
1020        struct qdisc_size_table *ostab, *stab = NULL;
1021        int err = 0;
1022
1023        if (tca[TCA_OPTIONS]) {
1024                if (sch->ops->change == NULL)
1025                        return -EINVAL;
1026                err = sch->ops->change(sch, tca[TCA_OPTIONS]);
1027                if (err)
1028                        return err;
1029        }
1030
1031        if (tca[TCA_STAB]) {
1032                stab = qdisc_get_stab(tca[TCA_STAB]);
1033                if (IS_ERR(stab))
1034                        return PTR_ERR(stab);
1035        }
1036
1037        ostab = rtnl_dereference(sch->stab);
1038        rcu_assign_pointer(sch->stab, stab);
1039        qdisc_put_stab(ostab);
1040
1041        if (tca[TCA_RATE]) {
1042                /* NB: ignores errors from replace_estimator
1043                   because change can't be undone. */
1044                if (sch->flags & TCQ_F_MQROOT)
1045                        goto out;
1046                gen_replace_estimator(&sch->bstats,
1047                                      sch->cpu_bstats,
1048                                      &sch->rate_est,
1049                                      qdisc_root_sleeping_lock(sch),
1050                                      tca[TCA_RATE]);
1051        }
1052out:
1053        return 0;
1054}
1055
/*
 * Walker state for check_loop().  The walker must stay the first member:
 * check_loop_fn() casts the qdisc_walker pointer it receives back to this
 * structure.
 */
struct check_loop_arg {
        struct qdisc_walker     w;      /* generic class walker, passed to ->walk() */
        struct Qdisc            *p;     /* qdisc that must not appear below the walked one */
        int                     depth;  /* current nesting level of the search */
};
1061
1062static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
1063
1064static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1065{
1066        struct check_loop_arg   arg;
1067
1068        if (q->ops->cl_ops == NULL)
1069                return 0;
1070
1071        arg.w.stop = arg.w.skip = arg.w.count = 0;
1072        arg.w.fn = check_loop_fn;
1073        arg.depth = depth;
1074        arg.p = p;
1075        q->ops->cl_ops->walk(q, &arg.w);
1076        return arg.w.stop ? -ELOOP : 0;
1077}
1078
1079static int
1080check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1081{
1082        struct Qdisc *leaf;
1083        const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1084        struct check_loop_arg *arg = (struct check_loop_arg *)w;
1085
1086        leaf = cops->leaf(q, cl);
1087        if (leaf) {
1088                if (leaf == arg->p || arg->depth > 7)
1089                        return -ELOOP;
1090                return check_loop(leaf, arg->p, arg->depth + 1);
1091        }
1092        return 0;
1093}
1094
1095/*
1096 * Delete/get qdisc.
1097 */
1098
1099static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
1100{
1101        struct net *net = sock_net(skb->sk);
1102        struct tcmsg *tcm = nlmsg_data(n);
1103        struct nlattr *tca[TCA_MAX + 1];
1104        struct net_device *dev;
1105        u32 clid;
1106        struct Qdisc *q = NULL;
1107        struct Qdisc *p = NULL;
1108        int err;
1109
1110        if ((n->nlmsg_type != RTM_GETQDISC) &&
1111            !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1112                return -EPERM;
1113
1114        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1115        if (err < 0)
1116                return err;
1117
1118        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1119        if (!dev)
1120                return -ENODEV;
1121
1122        clid = tcm->tcm_parent;
1123        if (clid) {
1124                if (clid != TC_H_ROOT) {
1125                        if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1126                                p = qdisc_lookup(dev, TC_H_MAJ(clid));
1127                                if (!p)
1128                                        return -ENOENT;
1129                                q = qdisc_leaf(p, clid);
1130                        } else if (dev_ingress_queue(dev)) {
1131                                q = dev_ingress_queue(dev)->qdisc_sleeping;
1132                        }
1133                } else {
1134                        q = dev->qdisc;
1135                }
1136                if (!q)
1137                        return -ENOENT;
1138
1139                if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
1140                        return -EINVAL;
1141        } else {
1142                q = qdisc_lookup(dev, tcm->tcm_handle);
1143                if (!q)
1144                        return -ENOENT;
1145        }
1146
1147        if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1148                return -EINVAL;
1149
1150        if (n->nlmsg_type == RTM_DELQDISC) {
1151                if (!clid)
1152                        return -EINVAL;
1153                if (q->handle == 0)
1154                        return -ENOENT;
1155                err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
1156                if (err != 0)
1157                        return err;
1158        } else {
1159                qdisc_notify(net, skb, n, clid, NULL, q);
1160        }
1161        return 0;
1162}
1163
1164/*
1165 * Create/change qdisc.
1166 */
1167
/*
 * Handle a request that creates a new qdisc or changes an existing one.
 *
 * Depending on tcm_parent, tcm_handle and the NLM_F_CREATE / NLM_F_REPLACE /
 * NLM_F_EXCL flags, the request ends in one of three ways:
 *   - the existing qdisc is changed in place via qdisc_change();
 *   - a new qdisc is created and grafted (label create_n_graft);
 *   - an existing qdisc found by handle is grafted at the given parent
 *     (label graft).
 *
 * Requires CAP_NET_ADMIN.  Returns 0 or a negative errno.
 */
static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
{
        struct net *net = sock_net(skb->sk);
        struct tcmsg *tcm;
        struct nlattr *tca[TCA_MAX + 1];
        struct net_device *dev;
        u32 clid;
        struct Qdisc *q, *p;
        int err;

        if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
                return -EPERM;

        /*
         * Re-entered when qdisc_create() returns -EAGAIN: everything derived
         * from the message and the device is looked up again from scratch.
         */
replay:
        /* Reinit, just in case something touches this. */
        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
        if (err < 0)
                return err;

        tcm = nlmsg_data(n);
        clid = tcm->tcm_parent;
        q = p = NULL;

        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
        if (!dev)
                return -ENODEV;


        if (clid) {
                /* Find the qdisc currently attached at the requested parent. */
                if (clid != TC_H_ROOT) {
                        if (clid != TC_H_INGRESS) {
                                p = qdisc_lookup(dev, TC_H_MAJ(clid));
                                if (!p)
                                        return -ENOENT;
                                q = qdisc_leaf(p, clid);
                        } else if (dev_ingress_queue_create(dev)) {
                                q = dev_ingress_queue(dev)->qdisc_sleeping;
                        }
                } else {
                        q = dev->qdisc;
                }

                /* It may be default qdisc, ignore it */
                if (q && q->handle == 0)
                        q = NULL;

                /*
                 * Nothing attached, no handle given, or the handle names a
                 * qdisc other than the attached one: this is not a plain
                 * change of the attached qdisc.
                 */
                if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
                        if (tcm->tcm_handle) {
                                /* Replacing an attached qdisc needs REPLACE. */
                                if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
                                        return -EEXIST;
                                /* A qdisc handle has no minor part. */
                                if (TC_H_MIN(tcm->tcm_handle))
                                        return -EINVAL;
                                q = qdisc_lookup(dev, tcm->tcm_handle);
                                if (!q)
                                        goto create_n_graft;
                                if (n->nlmsg_flags & NLM_F_EXCL)
                                        return -EEXIST;
                                if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
                                        return -EINVAL;
                                /* Refuse to graft a qdisc below itself. */
                                if (q == p ||
                                    (p && check_loop(q, p, 0)))
                                        return -ELOOP;
                                /*
                                 * Reference for the graft; dropped through
                                 * qdisc_destroy() below if the graft fails.
                                 */
                                atomic_inc(&q->refcnt);
                                goto graft;
                        } else {
                                if (!q)
                                        goto create_n_graft;

                                /* This magic test requires explanation.
                                 *
                                 *   We know, that some child q is already
                                 *   attached to this parent and have choice:
                                 *   either to change it or to create/graft new one.
                                 *
                                 *   1. We are allowed to create/graft only
                                 *   if CREATE and REPLACE flags are set.
                                 *
                                 *   2. If EXCL is set, requestor wanted to say,
                                 *   that qdisc tcm_handle is not expected
                                 *   to exist, so that we choose create/graft too.
                                 *
                                 *   3. The last case is when no flags are set.
                                 *   Alas, it is sort of hole in API, we
                                 *   cannot decide what to do unambiguously.
                                 *   For now we select create/graft, if
                                 *   user gave KIND, which does not match existing.
                                 */
                                if ((n->nlmsg_flags & NLM_F_CREATE) &&
                                    (n->nlmsg_flags & NLM_F_REPLACE) &&
                                    ((n->nlmsg_flags & NLM_F_EXCL) ||
                                     (tca[TCA_KIND] &&
                                      nla_strcmp(tca[TCA_KIND], q->ops->id))))
                                        goto create_n_graft;
                        }
                }
        } else {
                /* No parent: the qdisc can only be addressed by its handle. */
                if (!tcm->tcm_handle)
                        return -EINVAL;
                q = qdisc_lookup(dev, tcm->tcm_handle);
        }

        /* Change qdisc parameters */
        if (q == NULL)
                return -ENOENT;
        if (n->nlmsg_flags & NLM_F_EXCL)
                return -EEXIST;
        if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
                return -EINVAL;
        err = qdisc_change(q, tca);
        if (err == 0)
                qdisc_notify(net, skb, n, clid, NULL, q);
        return err;

create_n_graft:
        if (!(n->nlmsg_flags & NLM_F_CREATE))
                return -ENOENT;
        if (clid == TC_H_INGRESS) {
                /* Ingress qdiscs live on the device's ingress queue. */
                if (dev_ingress_queue(dev))
                        q = qdisc_create(dev, dev_ingress_queue(dev), p,
                                         tcm->tcm_parent, tcm->tcm_parent,
                                         tca, &err);
                else
                        err = -ENOENT;
        } else {
                struct netdev_queue *dev_queue;

                /*
                 * Queue selection: the parent's select_queue() hook if it
                 * has one, else the parent's own queue, else tx queue 0.
                 */
                if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
                        dev_queue = p->ops->cl_ops->select_queue(p, tcm);
                else if (p)
                        dev_queue = p->dev_queue;
                else
                        dev_queue = netdev_get_tx_queue(dev, 0);

                q = qdisc_create(dev, dev_queue, p,
                                 tcm->tcm_parent, tcm->tcm_handle,
                                 tca, &err);
        }
        if (q == NULL) {
                /* -EAGAIN: a module was loaded with RTNL dropped; start over. */
                if (err == -EAGAIN)
                        goto replay;
                return err;
        }

graft:
        err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
        if (err) {
                /* Drop the new qdisc, or the extra reference taken above. */
                if (q)
                        qdisc_destroy(q);
                return err;
        }

        return 0;
}
1321
1322static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
1323                         u32 portid, u32 seq, u16 flags, int event)
1324{
1325        struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
1326        struct gnet_stats_queue __percpu *cpu_qstats = NULL;
1327        struct tcmsg *tcm;
1328        struct nlmsghdr  *nlh;
1329        unsigned char *b = skb_tail_pointer(skb);
1330        struct gnet_dump d;
1331        struct qdisc_size_table *stab;
1332        __u32 qlen;
1333
1334        cond_resched();
1335        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1336        if (!nlh)
1337                goto out_nlmsg_trim;
1338        tcm = nlmsg_data(nlh);
1339        tcm->tcm_family = AF_UNSPEC;
1340        tcm->tcm__pad1 = 0;
1341        tcm->tcm__pad2 = 0;
1342        tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1343        tcm->tcm_parent = clid;
1344        tcm->tcm_handle = q->handle;
1345        tcm->tcm_info = atomic_read(&q->refcnt);
1346        if (nla_put_string(skb, TCA_KIND, q->ops->id))
1347                goto nla_put_failure;
1348        if (q->ops->dump && q->ops->dump(q, skb) < 0)
1349                goto nla_put_failure;
1350        qlen = q->q.qlen;
1351
1352        stab = rtnl_dereference(q->stab);
1353        if (stab && qdisc_dump_stab(skb, stab) < 0)
1354                goto nla_put_failure;
1355
1356        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1357                                         qdisc_root_sleeping_lock(q), &d) < 0)
1358                goto nla_put_failure;
1359
1360        if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
1361                goto nla_put_failure;
1362
1363        if (qdisc_is_percpu_stats(q)) {
1364                cpu_bstats = q->cpu_bstats;
1365                cpu_qstats = q->cpu_qstats;
1366        }
1367
1368        if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats) < 0 ||
1369            gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
1370            gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
1371                goto nla_put_failure;
1372
1373        if (gnet_stats_finish_copy(&d) < 0)
1374                goto nla_put_failure;
1375
1376        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1377        return skb->len;
1378
1379out_nlmsg_trim:
1380nla_put_failure:
1381        nlmsg_trim(skb, b);
1382        return -1;
1383}
1384
1385static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1386{
1387        return (q->flags & TCQ_F_BUILTIN) ? true : false;
1388}
1389
1390static int qdisc_notify(struct net *net, struct sk_buff *oskb,
1391                        struct nlmsghdr *n, u32 clid,
1392                        struct Qdisc *old, struct Qdisc *new)
1393{
1394        struct sk_buff *skb;
1395        u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1396
1397        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1398        if (!skb)
1399                return -ENOBUFS;
1400
1401        if (old && !tc_qdisc_dump_ignore(old)) {
1402                if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
1403                                  0, RTM_DELQDISC) < 0)
1404                        goto err_out;
1405        }
1406        if (new && !tc_qdisc_dump_ignore(new)) {
1407                if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
1408                                  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1409                        goto err_out;
1410        }
1411
1412        if (skb->len)
1413                return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1414                                      n->nlmsg_flags & NLM_F_ECHO);
1415
1416err_out:
1417        kfree_skb(skb);
1418        return -EINVAL;
1419}
1420
1421static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1422                              struct netlink_callback *cb,
1423                              int *q_idx_p, int s_q_idx)
1424{
1425        int ret = 0, q_idx = *q_idx_p;
1426        struct Qdisc *q;
1427
1428        if (!root)
1429                return 0;
1430
1431        q = root;
1432        if (q_idx < s_q_idx) {
1433                q_idx++;
1434        } else {
1435                if (!tc_qdisc_dump_ignore(q) &&
1436                    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1437                                  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1438                        goto done;
1439                q_idx++;
1440        }
1441        list_for_each_entry(q, &root->list, list) {
1442                if (q_idx < s_q_idx) {
1443                        q_idx++;
1444                        continue;
1445                }
1446                if (!tc_qdisc_dump_ignore(q) &&
1447                    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1448                                  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1449                        goto done;
1450                q_idx++;
1451        }
1452
1453out:
1454        *q_idx_p = q_idx;
1455        return ret;
1456done:
1457        ret = -1;
1458        goto out;
1459}
1460
1461static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1462{
1463        struct net *net = sock_net(skb->sk);
1464        int idx, q_idx;
1465        int s_idx, s_q_idx;
1466        struct net_device *dev;
1467
1468        s_idx = cb->args[0];
1469        s_q_idx = q_idx = cb->args[1];
1470
1471        idx = 0;
1472        ASSERT_RTNL();
1473        for_each_netdev(net, dev) {
1474                struct netdev_queue *dev_queue;
1475
1476                if (idx < s_idx)
1477                        goto cont;
1478                if (idx > s_idx)
1479                        s_q_idx = 0;
1480                q_idx = 0;
1481
1482                if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
1483                        goto done;
1484
1485                dev_queue = dev_ingress_queue(dev);
1486                if (dev_queue &&
1487                    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1488                                       &q_idx, s_q_idx) < 0)
1489                        goto done;
1490
1491cont:
1492                idx++;
1493        }
1494
1495done:
1496        cb->args[0] = idx;
1497        cb->args[1] = q_idx;
1498
1499        return skb->len;
1500}
1501
1502
1503
1504/************************************************
1505 *      Traffic classes manipulation.           *
1506 ************************************************/
1507
1508
1509
1510static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n)
1511{
1512        struct net *net = sock_net(skb->sk);
1513        struct tcmsg *tcm = nlmsg_data(n);
1514        struct nlattr *tca[TCA_MAX + 1];
1515        struct net_device *dev;
1516        struct Qdisc *q = NULL;
1517        const struct Qdisc_class_ops *cops;
1518        unsigned long cl = 0;
1519        unsigned long new_cl;
1520        u32 portid;
1521        u32 clid;
1522        u32 qid;
1523        int err;
1524
1525        if ((n->nlmsg_type != RTM_GETTCLASS) &&
1526            !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1527                return -EPERM;
1528
1529        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1530        if (err < 0)
1531                return err;
1532
1533        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1534        if (!dev)
1535                return -ENODEV;
1536
1537        /*
1538           parent == TC_H_UNSPEC - unspecified parent.
1539           parent == TC_H_ROOT   - class is root, which has no parent.
1540           parent == X:0         - parent is root class.
1541           parent == X:Y         - parent is a node in hierarchy.
1542           parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
1543
1544           handle == 0:0         - generate handle from kernel pool.
1545           handle == 0:Y         - class is X:Y, where X:0 is qdisc.
1546           handle == X:Y         - clear.
1547           handle == X:0         - root class.
1548         */
1549
1550        /* Step 1. Determine qdisc handle X:0 */
1551
1552        portid = tcm->tcm_parent;
1553        clid = tcm->tcm_handle;
1554        qid = TC_H_MAJ(clid);
1555
1556        if (portid != TC_H_ROOT) {
1557                u32 qid1 = TC_H_MAJ(portid);
1558
1559                if (qid && qid1) {
1560                        /* If both majors are known, they must be identical. */
1561                        if (qid != qid1)
1562                                return -EINVAL;
1563                } else if (qid1) {
1564                        qid = qid1;
1565                } else if (qid == 0)
1566                        qid = dev->qdisc->handle;
1567
1568                /* Now qid is genuine qdisc handle consistent
1569                 * both with parent and child.
1570                 *
1571                 * TC_H_MAJ(portid) still may be unspecified, complete it now.
1572                 */
1573                if (portid)
1574                        portid = TC_H_MAKE(qid, portid);
1575        } else {
1576                if (qid == 0)
1577                        qid = dev->qdisc->handle;
1578        }
1579
1580        /* OK. Locate qdisc */
1581        q = qdisc_lookup(dev, qid);
1582        if (!q)
1583                return -ENOENT;
1584
1585        /* An check that it supports classes */
1586        cops = q->ops->cl_ops;
1587        if (cops == NULL)
1588                return -EINVAL;
1589
1590        /* Now try to get class */
1591        if (clid == 0) {
1592                if (portid == TC_H_ROOT)
1593                        clid = qid;
1594        } else
1595                clid = TC_H_MAKE(qid, clid);
1596
1597        if (clid)
1598                cl = cops->get(q, clid);
1599
1600        if (cl == 0) {
1601                err = -ENOENT;
1602                if (n->nlmsg_type != RTM_NEWTCLASS ||
1603                    !(n->nlmsg_flags & NLM_F_CREATE))
1604                        goto out;
1605        } else {
1606                switch (n->nlmsg_type) {
1607                case RTM_NEWTCLASS:
1608                        err = -EEXIST;
1609                        if (n->nlmsg_flags & NLM_F_EXCL)
1610                                goto out;
1611                        break;
1612                case RTM_DELTCLASS:
1613                        err = -EOPNOTSUPP;
1614                        if (cops->delete)
1615                                err = cops->delete(q, cl);
1616                        if (err == 0)
1617                                tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
1618                        goto out;
1619                case RTM_GETTCLASS:
1620                        err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
1621                        goto out;
1622                default:
1623                        err = -EINVAL;
1624                        goto out;
1625                }
1626        }
1627
1628        new_cl = cl;
1629        err = -EOPNOTSUPP;
1630        if (cops->change)
1631                err = cops->change(q, clid, portid, tca, &new_cl);
1632        if (err == 0)
1633                tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
1634
1635out:
1636        if (cl)
1637                cops->put(q, cl);
1638
1639        return err;
1640}
1641
1642
1643static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1644                          unsigned long cl,
1645                          u32 portid, u32 seq, u16 flags, int event)
1646{
1647        struct tcmsg *tcm;
1648        struct nlmsghdr  *nlh;
1649        unsigned char *b = skb_tail_pointer(skb);
1650        struct gnet_dump d;
1651        const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1652
1653        cond_resched();
1654        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1655        if (!nlh)
1656                goto out_nlmsg_trim;
1657        tcm = nlmsg_data(nlh);
1658        tcm->tcm_family = AF_UNSPEC;
1659        tcm->tcm__pad1 = 0;
1660        tcm->tcm__pad2 = 0;
1661        tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1662        tcm->tcm_parent = q->handle;
1663        tcm->tcm_handle = q->handle;
1664        tcm->tcm_info = 0;
1665        if (nla_put_string(skb, TCA_KIND, q->ops->id))
1666                goto nla_put_failure;
1667        if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1668                goto nla_put_failure;
1669
1670        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1671                                         qdisc_root_sleeping_lock(q), &d) < 0)
1672                goto nla_put_failure;
1673
1674        if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1675                goto nla_put_failure;
1676
1677        if (gnet_stats_finish_copy(&d) < 0)
1678                goto nla_put_failure;
1679
1680        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1681        return skb->len;
1682
1683out_nlmsg_trim:
1684nla_put_failure:
1685        nlmsg_trim(skb, b);
1686        return -1;
1687}
1688
1689static int tclass_notify(struct net *net, struct sk_buff *oskb,
1690                         struct nlmsghdr *n, struct Qdisc *q,
1691                         unsigned long cl, int event)
1692{
1693        struct sk_buff *skb;
1694        u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1695
1696        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1697        if (!skb)
1698                return -ENOBUFS;
1699
1700        if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1701                kfree_skb(skb);
1702                return -EINVAL;
1703        }
1704
1705        return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1706                              n->nlmsg_flags & NLM_F_ECHO);
1707}
1708
/*
 * Walker state for class dumps.  The walker must stay the first member:
 * qdisc_class_dump() casts the qdisc_walker pointer it receives back to
 * this structure.
 */
struct qdisc_dump_args {
        struct qdisc_walker     w;      /* generic class walker, passed to ->walk() */
        struct sk_buff          *skb;   /* skb the class messages are written to */
        struct netlink_callback *cb;    /* dump callback supplying portid and sequence */
};
1714
1715static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1716{
1717        struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1718
1719        return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
1720                              a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1721}
1722
1723static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1724                                struct tcmsg *tcm, struct netlink_callback *cb,
1725                                int *t_p, int s_t)
1726{
1727        struct qdisc_dump_args arg;
1728
1729        if (tc_qdisc_dump_ignore(q) ||
1730            *t_p < s_t || !q->ops->cl_ops ||
1731            (tcm->tcm_parent &&
1732             TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1733                (*t_p)++;
1734                return 0;
1735        }
1736        if (*t_p > s_t)
1737                memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1738        arg.w.fn = qdisc_class_dump;
1739        arg.skb = skb;
1740        arg.cb = cb;
1741        arg.w.stop  = 0;
1742        arg.w.skip = cb->args[1];
1743        arg.w.count = 0;
1744        q->ops->cl_ops->walk(q, &arg.w);
1745        cb->args[1] = arg.w.count;
1746        if (arg.w.stop)
1747                return -1;
1748        (*t_p)++;
1749        return 0;
1750}
1751
1752static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1753                               struct tcmsg *tcm, struct netlink_callback *cb,
1754                               int *t_p, int s_t)
1755{
1756        struct Qdisc *q;
1757
1758        if (!root)
1759                return 0;
1760
1761        if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1762                return -1;
1763
1764        list_for_each_entry(q, &root->list, list) {
1765                if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1766                        return -1;
1767        }
1768
1769        return 0;
1770}
1771
1772static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1773{
1774        struct tcmsg *tcm = nlmsg_data(cb->nlh);
1775        struct net *net = sock_net(skb->sk);
1776        struct netdev_queue *dev_queue;
1777        struct net_device *dev;
1778        int t, s_t;
1779
1780        if (nlmsg_len(cb->nlh) < sizeof(*tcm))
1781                return 0;
1782        dev = dev_get_by_index(net, tcm->tcm_ifindex);
1783        if (!dev)
1784                return 0;
1785
1786        s_t = cb->args[0];
1787        t = 0;
1788
1789        if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
1790                goto done;
1791
1792        dev_queue = dev_ingress_queue(dev);
1793        if (dev_queue &&
1794            tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
1795                                &t, s_t) < 0)
1796                goto done;
1797
1798done:
1799        cb->args[0] = t;
1800
1801        dev_put(dev);
1802        return skb->len;
1803}
1804
1805/* Main classifier routine: scans classifier chain attached
1806 * to this qdisc, (optionally) tests for protocol and asks
1807 * specific classifiers.
1808 */
1809int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
1810                       struct tcf_result *res)
1811{
1812        __be16 protocol = tc_skb_protocol(skb);
1813        int err;
1814
1815        for (; tp; tp = rcu_dereference_bh(tp->next)) {
1816                if (tp->protocol != protocol &&
1817                    tp->protocol != htons(ETH_P_ALL))
1818                        continue;
1819                err = tp->classify(skb, tp, res);
1820
1821                if (err >= 0) {
1822#ifdef CONFIG_NET_CLS_ACT
1823                        if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1824                                skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1825#endif
1826                        return err;
1827                }
1828        }
1829        return -1;
1830}
1831EXPORT_SYMBOL(tc_classify_compat);
1832
/* Classify @skb against the chain starting at @tp.
 *
 * Without CONFIG_NET_CLS_ACT this is just tc_classify_compat().  With it,
 * a TC_ACT_RECLASSIFY result restarts classification from the head of the
 * chain.  The restart count is kept in the verdict field of skb->tc_verd;
 * once it has reached MAX_REC_LOOP a rate-limited notice is logged and
 * TC_ACT_SHOT is returned.
 */
int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
		struct tcf_result *res)
{
#ifdef CONFIG_NET_CLS_ACT
	const struct tcf_proto *head = tp;

	for (;;) {
		int err = tc_classify_compat(skb, head, res);
		u32 verd;

		if (err != TC_ACT_RECLASSIFY)
			return err;

		verd = G_TC_VERD(skb->tc_verd);
		if (verd >= MAX_REC_LOOP) {
			net_notice_ratelimited("%s: packet reclassify loop rule prio %u protocol %02x\n",
					       head->q->ops->id,
					       head->prio & 0xffff,
					       ntohs(head->protocol));
			return TC_ACT_SHOT;
		}

		/* Account for this restart before trying again */
		verd++;
		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
	}
#else
	return tc_classify_compat(skb, tp, res);
#endif
}
EXPORT_SYMBOL(tc_classify);
1862
1863bool tcf_destroy(struct tcf_proto *tp, bool force)
1864{
1865        if (tp->ops->destroy(tp, force)) {
1866                module_put(tp->ops->owner);
1867                kfree_rcu(tp, rcu);
1868                return true;
1869        }
1870
1871        return false;
1872}
1873
1874void tcf_destroy_chain(struct tcf_proto __rcu **fl)
1875{
1876        struct tcf_proto *tp;
1877
1878        while ((tp = rtnl_dereference(*fl)) != NULL) {
1879                RCU_INIT_POINTER(*fl, tp->next);
1880                tcf_destroy(tp, true);
1881        }
1882}
1883EXPORT_SYMBOL(tcf_destroy_chain);
1884
1885#ifdef CONFIG_PROC_FS
1886static int psched_show(struct seq_file *seq, void *v)
1887{
1888        struct timespec ts;
1889
1890        hrtimer_get_res(CLOCK_MONOTONIC, &ts);
1891        seq_printf(seq, "%08x %08x %08x %08x\n",
1892                   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
1893                   1000000,
1894                   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
1895
1896        return 0;
1897}
1898
1899static int psched_open(struct inode *inode, struct file *file)
1900{
1901        return single_open(file, psched_show, NULL);
1902}
1903
1904static const struct file_operations psched_fops = {
1905        .owner = THIS_MODULE,
1906        .open = psched_open,
1907        .read  = seq_read,
1908        .llseek = seq_lseek,
1909        .release = single_release,
1910};
1911
1912static int __net_init psched_net_init(struct net *net)
1913{
1914        struct proc_dir_entry *e;
1915
1916        e = proc_create("psched", 0, net->proc_net, &psched_fops);
1917        if (e == NULL)
1918                return -ENOMEM;
1919
1920        return 0;
1921}
1922
1923static void __net_exit psched_net_exit(struct net *net)
1924{
1925        remove_proc_entry("psched", net->proc_net);
1926}
1927#else
1928static int __net_init psched_net_init(struct net *net)
1929{
1930        return 0;
1931}
1932
1933static void __net_exit psched_net_exit(struct net *net)
1934{
1935}
1936#endif
1937
1938static struct pernet_operations psched_net_ops = {
1939        .init = psched_net_init,
1940        .exit = psched_net_exit,
1941};
1942
1943static int __init pktsched_init(void)
1944{
1945        int err;
1946
1947        err = register_pernet_subsys(&psched_net_ops);
1948        if (err) {
1949                pr_err("pktsched_init: "
1950                       "cannot initialize per netns operations\n");
1951                return err;
1952        }
1953
1954        register_qdisc(&pfifo_fast_ops);
1955        register_qdisc(&pfifo_qdisc_ops);
1956        register_qdisc(&bfifo_qdisc_ops);
1957        register_qdisc(&pfifo_head_drop_qdisc_ops);
1958        register_qdisc(&mq_qdisc_ops);
1959
1960        rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
1961        rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
1962        rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL);
1963        rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
1964        rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
1965        rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL);
1966
1967        return 0;
1968}
1969
1970subsys_initcall(pktsched_init);
1971