linux/net/sched/sch_api.c
<<
>>
Prefs
   1/*
   2 * net/sched/sch_api.c  Packet scheduler API.
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10 *
  11 * Fixes:
  12 *
  13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
  14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
  15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
  16 */
  17
  18#include <linux/module.h>
  19#include <linux/types.h>
  20#include <linux/kernel.h>
  21#include <linux/string.h>
  22#include <linux/errno.h>
  23#include <linux/skbuff.h>
  24#include <linux/init.h>
  25#include <linux/proc_fs.h>
  26#include <linux/seq_file.h>
  27#include <linux/kmod.h>
  28#include <linux/list.h>
  29#include <linux/hrtimer.h>
  30#include <linux/lockdep.h>
  31#include <linux/slab.h>
  32
  33#include <net/net_namespace.h>
  34#include <net/sock.h>
  35#include <net/netlink.h>
  36#include <net/pkt_sched.h>
  37
  38static int qdisc_notify(struct net *net, struct sk_buff *oskb,
  39                        struct nlmsghdr *n, u32 clid,
  40                        struct Qdisc *old, struct Qdisc *new);
  41static int tclass_notify(struct net *net, struct sk_buff *oskb,
  42                         struct nlmsghdr *n, struct Qdisc *q,
  43                         unsigned long cl, int event);
  44
  45/*
  46
  47   Short review.
  48   -------------
  49
  50   This file consists of two interrelated parts:
  51
  52   1. queueing disciplines manager frontend.
  53   2. traffic classes manager frontend.
  54
  55   Generally, queueing discipline ("qdisc") is a black box,
  56   which is able to enqueue packets and to dequeue them (when
  57   device is ready to send something) in order and at times
  58   determined by algorithm hidden in it.
  59
   60   qdiscs are divided into two categories:
  61   - "queues", which have no internal structure visible from outside.
  62   - "schedulers", which split all the packets to "traffic classes",
  63     using "packet classifiers" (look at cls_api.c)
  64
  65   In turn, classes may have child qdiscs (as rule, queues)
  66   attached to them etc. etc. etc.
  67
   68   The goal of the routines in this file is to translate
   69   information supplied by the user in the form of handles
   70   into a form more intelligible to the kernel, to perform sanity
   71   checks and the part of the work that is common to all qdiscs,
   72   and to provide rtnetlink notifications.
  73
  74   All real intelligent work is done inside qdisc modules.
  75
  76
  77
  78   Every discipline has two major routines: enqueue and dequeue.
  79
  80   ---dequeue
  81
  82   dequeue usually returns a skb to send. It is allowed to return NULL,
  83   but it does not mean that queue is empty, it just means that
  84   discipline does not want to send anything this time.
  85   Queue is really empty if q->q.qlen == 0.
  86   For complicated disciplines with multiple queues q->q is not
  87   real packet queue, but however q->q.qlen must be valid.
  88
  89   ---enqueue
  90
   91   enqueue returns 0 if the packet was enqueued successfully.
   92   If a packet (this one or another one) was dropped, it returns
   93   a non-zero error code.
  94   NET_XMIT_DROP        - this packet dropped
  95     Expected action: do not backoff, but wait until queue will clear.
  96   NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
  97     Expected action: backoff or ignore
  98   NET_XMIT_POLICED     - dropped by police.
  99     Expected action: backoff or error to real-time apps.
 100
 101   Auxiliary routines:
 102
 103   ---peek
 104
 105   like dequeue but without removing a packet from the queue
 106
 107   ---reset
 108
 109   returns qdisc to initial state: purge all buffers, clear all
 110   timers, counters (except for statistics) etc.
 111
 112   ---init
 113
 114   initializes newly created qdisc.
 115
 116   ---destroy
 117
 118   destroys resources allocated by init and during lifetime of qdisc.
 119
 120   ---change
 121
 122   changes qdisc parameters.
 123 */
 124
 125/* Protects list of registered TC modules. It is pure SMP lock. */
 126static DEFINE_RWLOCK(qdisc_mod_lock);
 127
 128
 129/************************************************
 130 *      Queueing disciplines manipulation.      *
 131 ************************************************/
 132
 133
 134/* The list of all installed queueing disciplines. */
 135
 136static struct Qdisc_ops *qdisc_base;
 137
 138/* Register/unregister queueing discipline */
 139
 140int register_qdisc(struct Qdisc_ops *qops)
 141{
 142        struct Qdisc_ops *q, **qp;
 143        int rc = -EEXIST;
 144
 145        write_lock(&qdisc_mod_lock);
 146        for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
 147                if (!strcmp(qops->id, q->id))
 148                        goto out;
 149
 150        if (qops->enqueue == NULL)
 151                qops->enqueue = noop_qdisc_ops.enqueue;
 152        if (qops->peek == NULL) {
 153                if (qops->dequeue == NULL)
 154                        qops->peek = noop_qdisc_ops.peek;
 155                else
 156                        goto out_einval;
 157        }
 158        if (qops->dequeue == NULL)
 159                qops->dequeue = noop_qdisc_ops.dequeue;
 160
 161        if (qops->cl_ops) {
 162                const struct Qdisc_class_ops *cops = qops->cl_ops;
 163
 164                if (!(cops->get && cops->put && cops->walk && cops->leaf))
 165                        goto out_einval;
 166
 167                if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
 168                        goto out_einval;
 169        }
 170
 171        qops->next = NULL;
 172        *qp = qops;
 173        rc = 0;
 174out:
 175        write_unlock(&qdisc_mod_lock);
 176        return rc;
 177
 178out_einval:
 179        rc = -EINVAL;
 180        goto out;
 181}
 182EXPORT_SYMBOL(register_qdisc);
 183
 184int unregister_qdisc(struct Qdisc_ops *qops)
 185{
 186        struct Qdisc_ops *q, **qp;
 187        int err = -ENOENT;
 188
 189        write_lock(&qdisc_mod_lock);
 190        for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
 191                if (q == qops)
 192                        break;
 193        if (q) {
 194                *qp = q->next;
 195                q->next = NULL;
 196                err = 0;
 197        }
 198        write_unlock(&qdisc_mod_lock);
 199        return err;
 200}
 201EXPORT_SYMBOL(unregister_qdisc);
 202
 203/* Get default qdisc if not otherwise specified */
 204void qdisc_get_default(char *name, size_t len)
 205{
 206        read_lock(&qdisc_mod_lock);
 207        strlcpy(name, default_qdisc_ops->id, len);
 208        read_unlock(&qdisc_mod_lock);
 209}
 210
 211static struct Qdisc_ops *qdisc_lookup_default(const char *name)
 212{
 213        struct Qdisc_ops *q = NULL;
 214
 215        for (q = qdisc_base; q; q = q->next) {
 216                if (!strcmp(name, q->id)) {
 217                        if (!try_module_get(q->owner))
 218                                q = NULL;
 219                        break;
 220                }
 221        }
 222
 223        return q;
 224}
 225
 226/* Set new default qdisc to use */
 227int qdisc_set_default(const char *name)
 228{
 229        const struct Qdisc_ops *ops;
 230
 231        if (!capable(CAP_NET_ADMIN))
 232                return -EPERM;
 233
 234        write_lock(&qdisc_mod_lock);
 235        ops = qdisc_lookup_default(name);
 236        if (!ops) {
 237                /* Not found, drop lock and try to load module */
 238                write_unlock(&qdisc_mod_lock);
 239                request_module("sch_%s", name);
 240                write_lock(&qdisc_mod_lock);
 241
 242                ops = qdisc_lookup_default(name);
 243        }
 244
 245        if (ops) {
 246                /* Set new default */
 247                module_put(default_qdisc_ops->owner);
 248                default_qdisc_ops = ops;
 249        }
 250        write_unlock(&qdisc_mod_lock);
 251
 252        return ops ? 0 : -ENOENT;
 253}
 254
 255/* We know handle. Find qdisc among all qdisc's attached to device
 256   (root qdisc, all its children, children of children etc.)
 257 */
 258
 259static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
 260{
 261        struct Qdisc *q;
 262
 263        if (!(root->flags & TCQ_F_BUILTIN) &&
 264            root->handle == handle)
 265                return root;
 266
 267        list_for_each_entry(q, &root->list, list) {
 268                if (q->handle == handle)
 269                        return q;
 270        }
 271        return NULL;
 272}
 273
 274void qdisc_list_add(struct Qdisc *q)
 275{
 276        if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
 277                struct Qdisc *root = qdisc_dev(q)->qdisc;
 278
 279                WARN_ON_ONCE(root == &noop_qdisc);
 280                list_add_tail(&q->list, &root->list);
 281        }
 282}
 283EXPORT_SYMBOL(qdisc_list_add);
 284
 285void qdisc_list_del(struct Qdisc *q)
 286{
 287        if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
 288                list_del(&q->list);
 289}
 290EXPORT_SYMBOL(qdisc_list_del);
 291
 292struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
 293{
 294        struct Qdisc *q;
 295
 296        q = qdisc_match_from_root(dev->qdisc, handle);
 297        if (q)
 298                goto out;
 299
 300        if (dev_ingress_queue(dev))
 301                q = qdisc_match_from_root(
 302                        dev_ingress_queue(dev)->qdisc_sleeping,
 303                        handle);
 304out:
 305        return q;
 306}
 307
 308static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
 309{
 310        unsigned long cl;
 311        struct Qdisc *leaf;
 312        const struct Qdisc_class_ops *cops = p->ops->cl_ops;
 313
 314        if (cops == NULL)
 315                return NULL;
 316        cl = cops->get(p, classid);
 317
 318        if (cl == 0)
 319                return NULL;
 320        leaf = cops->leaf(p, cl);
 321        cops->put(p, cl);
 322        return leaf;
 323}
 324
 325/* Find queueing discipline by name */
 326
 327static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
 328{
 329        struct Qdisc_ops *q = NULL;
 330
 331        if (kind) {
 332                read_lock(&qdisc_mod_lock);
 333                for (q = qdisc_base; q; q = q->next) {
 334                        if (nla_strcmp(kind, q->id) == 0) {
 335                                if (!try_module_get(q->owner))
 336                                        q = NULL;
 337                                break;
 338                        }
 339                }
 340                read_unlock(&qdisc_mod_lock);
 341        }
 342        return q;
 343}
 344
 345/* The linklayer setting were not transferred from iproute2, in older
 346 * versions, and the rate tables lookup systems have been dropped in
 347 * the kernel. To keep backward compatible with older iproute2 tc
 348 * utils, we detect the linklayer setting by detecting if the rate
 349 * table were modified.
 350 *
 351 * For linklayer ATM table entries, the rate table will be aligned to
 352 * 48 bytes, thus some table entries will contain the same value.  The
 353 * mpu (min packet unit) is also encoded into the old rate table, thus
 354 * starting from the mpu, we find low and high table entries for
 355 * mapping this cell.  If these entries contain the same value, when
 356 * the rate tables have been modified for linklayer ATM.
 357 *
 358 * This is done by rounding mpu to the nearest 48 bytes cell/entry,
 359 * and then roundup to the next cell, calc the table entry one below,
 360 * and compare.
 361 */
 362static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
 363{
 364        int low       = roundup(r->mpu, 48);
 365        int high      = roundup(low+1, 48);
 366        int cell_low  = low >> r->cell_log;
 367        int cell_high = (high >> r->cell_log) - 1;
 368
 369        /* rtab is too inaccurate at rates > 100Mbit/s */
 370        if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
 371                pr_debug("TC linklayer: Giving up ATM detection\n");
 372                return TC_LINKLAYER_ETHERNET;
 373        }
 374
 375        if ((cell_high > cell_low) && (cell_high < 256)
 376            && (rtab[cell_low] == rtab[cell_high])) {
 377                pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
 378                         cell_low, cell_high, rtab[cell_high]);
 379                return TC_LINKLAYER_ATM;
 380        }
 381        return TC_LINKLAYER_ETHERNET;
 382}
 383
 384static struct qdisc_rate_table *qdisc_rtab_list;
 385
 386struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
 387{
 388        struct qdisc_rate_table *rtab;
 389
 390        if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
 391            nla_len(tab) != TC_RTAB_SIZE)
 392                return NULL;
 393
 394        for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
 395                if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
 396                    !memcmp(&rtab->data, nla_data(tab), 1024)) {
 397                        rtab->refcnt++;
 398                        return rtab;
 399                }
 400        }
 401
 402        rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
 403        if (rtab) {
 404                rtab->rate = *r;
 405                rtab->refcnt = 1;
 406                memcpy(rtab->data, nla_data(tab), 1024);
 407                if (r->linklayer == TC_LINKLAYER_UNAWARE)
 408                        r->linklayer = __detect_linklayer(r, rtab->data);
 409                rtab->next = qdisc_rtab_list;
 410                qdisc_rtab_list = rtab;
 411        }
 412        return rtab;
 413}
 414EXPORT_SYMBOL(qdisc_get_rtab);
 415
 416void qdisc_put_rtab(struct qdisc_rate_table *tab)
 417{
 418        struct qdisc_rate_table *rtab, **rtabp;
 419
 420        if (!tab || --tab->refcnt)
 421                return;
 422
 423        for (rtabp = &qdisc_rtab_list;
 424             (rtab = *rtabp) != NULL;
 425             rtabp = &rtab->next) {
 426                if (rtab == tab) {
 427                        *rtabp = rtab->next;
 428                        kfree(rtab);
 429                        return;
 430                }
 431        }
 432}
 433EXPORT_SYMBOL(qdisc_put_rtab);
 434
 435static LIST_HEAD(qdisc_stab_list);
 436static DEFINE_SPINLOCK(qdisc_stab_lock);
 437
 438static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
 439        [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
 440        [TCA_STAB_DATA] = { .type = NLA_BINARY },
 441};
 442
 443static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
 444{
 445        struct nlattr *tb[TCA_STAB_MAX + 1];
 446        struct qdisc_size_table *stab;
 447        struct tc_sizespec *s;
 448        unsigned int tsize = 0;
 449        u16 *tab = NULL;
 450        int err;
 451
 452        err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
 453        if (err < 0)
 454                return ERR_PTR(err);
 455        if (!tb[TCA_STAB_BASE])
 456                return ERR_PTR(-EINVAL);
 457
 458        s = nla_data(tb[TCA_STAB_BASE]);
 459
 460        if (s->tsize > 0) {
 461                if (!tb[TCA_STAB_DATA])
 462                        return ERR_PTR(-EINVAL);
 463                tab = nla_data(tb[TCA_STAB_DATA]);
 464                tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
 465        }
 466
 467        if (tsize != s->tsize || (!tab && tsize > 0))
 468                return ERR_PTR(-EINVAL);
 469
 470        spin_lock(&qdisc_stab_lock);
 471
 472        list_for_each_entry(stab, &qdisc_stab_list, list) {
 473                if (memcmp(&stab->szopts, s, sizeof(*s)))
 474                        continue;
 475                if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
 476                        continue;
 477                stab->refcnt++;
 478                spin_unlock(&qdisc_stab_lock);
 479                return stab;
 480        }
 481
 482        spin_unlock(&qdisc_stab_lock);
 483
 484        stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
 485        if (!stab)
 486                return ERR_PTR(-ENOMEM);
 487
 488        stab->refcnt = 1;
 489        stab->szopts = *s;
 490        if (tsize > 0)
 491                memcpy(stab->data, tab, tsize * sizeof(u16));
 492
 493        spin_lock(&qdisc_stab_lock);
 494        list_add_tail(&stab->list, &qdisc_stab_list);
 495        spin_unlock(&qdisc_stab_lock);
 496
 497        return stab;
 498}
 499
 500static void stab_kfree_rcu(struct rcu_head *head)
 501{
 502        kfree(container_of(head, struct qdisc_size_table, rcu));
 503}
 504
 505void qdisc_put_stab(struct qdisc_size_table *tab)
 506{
 507        if (!tab)
 508                return;
 509
 510        spin_lock(&qdisc_stab_lock);
 511
 512        if (--tab->refcnt == 0) {
 513                list_del(&tab->list);
 514                call_rcu_bh(&tab->rcu, stab_kfree_rcu);
 515        }
 516
 517        spin_unlock(&qdisc_stab_lock);
 518}
 519EXPORT_SYMBOL(qdisc_put_stab);
 520
 521static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
 522{
 523        struct nlattr *nest;
 524
 525        nest = nla_nest_start(skb, TCA_STAB);
 526        if (nest == NULL)
 527                goto nla_put_failure;
 528        if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
 529                goto nla_put_failure;
 530        nla_nest_end(skb, nest);
 531
 532        return skb->len;
 533
 534nla_put_failure:
 535        return -1;
 536}
 537
 538void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
 539{
 540        int pkt_len, slot;
 541
 542        pkt_len = skb->len + stab->szopts.overhead;
 543        if (unlikely(!stab->szopts.tsize))
 544                goto out;
 545
 546        slot = pkt_len + stab->szopts.cell_align;
 547        if (unlikely(slot < 0))
 548                slot = 0;
 549
 550        slot >>= stab->szopts.cell_log;
 551        if (likely(slot < stab->szopts.tsize))
 552                pkt_len = stab->data[slot];
 553        else
 554                pkt_len = stab->data[stab->szopts.tsize - 1] *
 555                                (slot / stab->szopts.tsize) +
 556                                stab->data[slot % stab->szopts.tsize];
 557
 558        pkt_len <<= stab->szopts.size_log;
 559out:
 560        if (unlikely(pkt_len < 1))
 561                pkt_len = 1;
 562        qdisc_skb_cb(skb)->pkt_len = pkt_len;
 563}
 564EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
 565
 566void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
 567{
 568        if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
 569                pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
 570                        txt, qdisc->ops->id, qdisc->handle >> 16);
 571                qdisc->flags |= TCQ_F_WARN_NONWC;
 572        }
 573}
 574EXPORT_SYMBOL(qdisc_warn_nonwc);
 575
 576static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
 577{
 578        struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
 579                                                 timer);
 580
 581        rcu_read_lock();
 582        qdisc_unthrottled(wd->qdisc);
 583        __netif_schedule(qdisc_root(wd->qdisc));
 584        rcu_read_unlock();
 585
 586        return HRTIMER_NORESTART;
 587}
 588
 589void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
 590{
 591        hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
 592        wd->timer.function = qdisc_watchdog;
 593        wd->qdisc = qdisc;
 594}
 595EXPORT_SYMBOL(qdisc_watchdog_init);
 596
 597void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires, bool throttle)
 598{
 599        if (test_bit(__QDISC_STATE_DEACTIVATED,
 600                     &qdisc_root_sleeping(wd->qdisc)->state))
 601                return;
 602
 603        if (throttle)
 604                qdisc_throttled(wd->qdisc);
 605
 606        hrtimer_start(&wd->timer,
 607                      ns_to_ktime(expires),
 608                      HRTIMER_MODE_ABS_PINNED);
 609}
 610EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
 611
 612void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
 613{
 614        hrtimer_cancel(&wd->timer);
 615        qdisc_unthrottled(wd->qdisc);
 616}
 617EXPORT_SYMBOL(qdisc_watchdog_cancel);
 618
 619static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
 620{
 621        unsigned int size = n * sizeof(struct hlist_head), i;
 622        struct hlist_head *h;
 623
 624        if (size <= PAGE_SIZE)
 625                h = kmalloc(size, GFP_KERNEL);
 626        else
 627                h = (struct hlist_head *)
 628                        __get_free_pages(GFP_KERNEL, get_order(size));
 629
 630        if (h != NULL) {
 631                for (i = 0; i < n; i++)
 632                        INIT_HLIST_HEAD(&h[i]);
 633        }
 634        return h;
 635}
 636
 637static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
 638{
 639        unsigned int size = n * sizeof(struct hlist_head);
 640
 641        if (size <= PAGE_SIZE)
 642                kfree(h);
 643        else
 644                free_pages((unsigned long)h, get_order(size));
 645}
 646
 647void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
 648{
 649        struct Qdisc_class_common *cl;
 650        struct hlist_node *next;
 651        struct hlist_head *nhash, *ohash;
 652        unsigned int nsize, nmask, osize;
 653        unsigned int i, h;
 654
 655        /* Rehash when load factor exceeds 0.75 */
 656        if (clhash->hashelems * 4 <= clhash->hashsize * 3)
 657                return;
 658        nsize = clhash->hashsize * 2;
 659        nmask = nsize - 1;
 660        nhash = qdisc_class_hash_alloc(nsize);
 661        if (nhash == NULL)
 662                return;
 663
 664        ohash = clhash->hash;
 665        osize = clhash->hashsize;
 666
 667        sch_tree_lock(sch);
 668        for (i = 0; i < osize; i++) {
 669                hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
 670                        h = qdisc_class_hash(cl->classid, nmask);
 671                        hlist_add_head(&cl->hnode, &nhash[h]);
 672                }
 673        }
 674        clhash->hash     = nhash;
 675        clhash->hashsize = nsize;
 676        clhash->hashmask = nmask;
 677        sch_tree_unlock(sch);
 678
 679        qdisc_class_hash_free(ohash, osize);
 680}
 681EXPORT_SYMBOL(qdisc_class_hash_grow);
 682
 683int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
 684{
 685        unsigned int size = 4;
 686
 687        clhash->hash = qdisc_class_hash_alloc(size);
 688        if (clhash->hash == NULL)
 689                return -ENOMEM;
 690        clhash->hashsize  = size;
 691        clhash->hashmask  = size - 1;
 692        clhash->hashelems = 0;
 693        return 0;
 694}
 695EXPORT_SYMBOL(qdisc_class_hash_init);
 696
 697void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
 698{
 699        qdisc_class_hash_free(clhash->hash, clhash->hashsize);
 700}
 701EXPORT_SYMBOL(qdisc_class_hash_destroy);
 702
 703void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
 704                             struct Qdisc_class_common *cl)
 705{
 706        unsigned int h;
 707
 708        INIT_HLIST_NODE(&cl->hnode);
 709        h = qdisc_class_hash(cl->classid, clhash->hashmask);
 710        hlist_add_head(&cl->hnode, &clhash->hash[h]);
 711        clhash->hashelems++;
 712}
 713EXPORT_SYMBOL(qdisc_class_hash_insert);
 714
 715void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
 716                             struct Qdisc_class_common *cl)
 717{
 718        hlist_del(&cl->hnode);
 719        clhash->hashelems--;
 720}
 721EXPORT_SYMBOL(qdisc_class_hash_remove);
 722
 723/* Allocate an unique handle from space managed by kernel
 724 * Possible range is [8000-FFFF]:0000 (0x8000 values)
 725 */
 726static u32 qdisc_alloc_handle(struct net_device *dev)
 727{
 728        int i = 0x8000;
 729        static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
 730
 731        do {
 732                autohandle += TC_H_MAKE(0x10000U, 0);
 733                if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
 734                        autohandle = TC_H_MAKE(0x80000000U, 0);
 735                if (!qdisc_lookup(dev, autohandle))
 736                        return autohandle;
 737                cond_resched();
 738        } while (--i > 0);
 739
 740        return 0;
 741}
 742
 743void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
 744{
 745        const struct Qdisc_class_ops *cops;
 746        unsigned long cl;
 747        u32 parentid;
 748        int drops;
 749
 750        if (n == 0)
 751                return;
 752        drops = max_t(int, n, 0);
 753        while ((parentid = sch->parent)) {
 754                if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
 755                        return;
 756
 757                sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
 758                if (sch == NULL) {
 759                        WARN_ON(parentid != TC_H_ROOT);
 760                        return;
 761                }
 762                cops = sch->ops->cl_ops;
 763                if (cops->qlen_notify) {
 764                        cl = cops->get(sch, parentid);
 765                        cops->qlen_notify(sch, cl);
 766                        cops->put(sch, cl);
 767                }
 768                sch->q.qlen -= n;
 769                __qdisc_qstats_drop(sch, drops);
 770        }
 771}
 772EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
 773
 774static void notify_and_destroy(struct net *net, struct sk_buff *skb,
 775                               struct nlmsghdr *n, u32 clid,
 776                               struct Qdisc *old, struct Qdisc *new)
 777{
 778        if (new || old)
 779                qdisc_notify(net, skb, n, clid, old, new);
 780
 781        if (old)
 782                qdisc_destroy(old);
 783}
 784
 785/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 786 * to device "dev".
 787 *
 788 * When appropriate send a netlink notification using 'skb'
 789 * and "n".
 790 *
 791 * On success, destroy old qdisc.
 792 */
 793
 794static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
 795                       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
 796                       struct Qdisc *new, struct Qdisc *old)
 797{
 798        struct Qdisc *q = old;
 799        struct net *net = dev_net(dev);
 800        int err = 0;
 801
 802        if (parent == NULL) {
 803                unsigned int i, num_q, ingress;
 804
 805                ingress = 0;
 806                num_q = dev->num_tx_queues;
 807                if ((q && q->flags & TCQ_F_INGRESS) ||
 808                    (new && new->flags & TCQ_F_INGRESS)) {
 809                        num_q = 1;
 810                        ingress = 1;
 811                        if (!dev_ingress_queue(dev))
 812                                return -ENOENT;
 813                }
 814
 815                if (dev->flags & IFF_UP)
 816                        dev_deactivate(dev);
 817
 818                if (new && new->ops->attach) {
 819                        new->ops->attach(new);
 820                        num_q = 0;
 821                }
 822
 823                for (i = 0; i < num_q; i++) {
 824                        struct netdev_queue *dev_queue = dev_ingress_queue(dev);
 825
 826                        if (!ingress)
 827                                dev_queue = netdev_get_tx_queue(dev, i);
 828
 829                        old = dev_graft_qdisc(dev_queue, new);
 830                        if (new && i > 0)
 831                                atomic_inc(&new->refcnt);
 832
 833                        if (!ingress)
 834                                qdisc_destroy(old);
 835                }
 836
 837                if (!ingress) {
 838                        notify_and_destroy(net, skb, n, classid,
 839                                           dev->qdisc, new);
 840                        if (new && !new->ops->attach)
 841                                atomic_inc(&new->refcnt);
 842                        dev->qdisc = new ? : &noop_qdisc;
 843                } else {
 844                        notify_and_destroy(net, skb, n, classid, old, new);
 845                }
 846
 847                if (dev->flags & IFF_UP)
 848                        dev_activate(dev);
 849        } else {
 850                const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
 851
 852                err = -EOPNOTSUPP;
 853                if (cops && cops->graft) {
 854                        unsigned long cl = cops->get(parent, classid);
 855                        if (cl) {
 856                                err = cops->graft(parent, cl, new, &old);
 857                                cops->put(parent, cl);
 858                        } else
 859                                err = -ENOENT;
 860                }
 861                if (!err)
 862                        notify_and_destroy(net, skb, n, classid, old, new);
 863        }
 864        return err;
 865}
 866
 867/* lockdep annotation is needed for ingress; egress gets it only for name */
 868static struct lock_class_key qdisc_tx_lock;
 869static struct lock_class_key qdisc_rx_lock;
 870
 871/*
 872   Allocate and initialize new qdisc.
 873
 874   Parameters are passed via opt.
 875 */
 876
 877static struct Qdisc *
 878qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
 879             struct Qdisc *p, u32 parent, u32 handle,
 880             struct nlattr **tca, int *errp)
 881{
 882        int err;
 883        struct nlattr *kind = tca[TCA_KIND];
 884        struct Qdisc *sch;
 885        struct Qdisc_ops *ops;
 886        struct qdisc_size_table *stab;
 887
 888        ops = qdisc_lookup_ops(kind);
 889#ifdef CONFIG_MODULES
 890        if (ops == NULL && kind != NULL) {
 891                char name[IFNAMSIZ];
 892                if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
 893                        /* We dropped the RTNL semaphore in order to
 894                         * perform the module load.  So, even if we
 895                         * succeeded in loading the module we have to
 896                         * tell the caller to replay the request.  We
 897                         * indicate this using -EAGAIN.
 898                         * We replay the request because the device may
 899                         * go away in the mean time.
 900                         */
 901                        rtnl_unlock();
 902                        request_module("sch_%s", name);
 903                        rtnl_lock();
 904                        ops = qdisc_lookup_ops(kind);
 905                        if (ops != NULL) {
 906                                /* We will try again qdisc_lookup_ops,
 907                                 * so don't keep a reference.
 908                                 */
 909                                module_put(ops->owner);
 910                                err = -EAGAIN;
 911                                goto err_out;
 912                        }
 913                }
 914        }
 915#endif
 916
 917        err = -ENOENT;
 918        if (ops == NULL)
 919                goto err_out;
 920
 921        sch = qdisc_alloc(dev_queue, ops);
 922        if (IS_ERR(sch)) {
 923                err = PTR_ERR(sch);
 924                goto err_out2;
 925        }
 926
 927        sch->parent = parent;
 928
 929        if (handle == TC_H_INGRESS) {
 930                sch->flags |= TCQ_F_INGRESS;
 931                handle = TC_H_MAKE(TC_H_INGRESS, 0);
 932                lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
 933        } else {
 934                if (handle == 0) {
 935                        handle = qdisc_alloc_handle(dev);
 936                        err = -ENOMEM;
 937                        if (handle == 0)
 938                                goto err_out3;
 939                }
 940                lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
 941                if (!netif_is_multiqueue(dev))
 942                        sch->flags |= TCQ_F_ONETXQUEUE;
 943        }
 944
 945        sch->handle = handle;
 946
 947        if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
 948                if (qdisc_is_percpu_stats(sch)) {
 949                        sch->cpu_bstats =
 950                                netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
 951                        if (!sch->cpu_bstats)
 952                                goto err_out4;
 953
 954                        sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
 955                        if (!sch->cpu_qstats)
 956                                goto err_out4;
 957                }
 958
 959                if (tca[TCA_STAB]) {
 960                        stab = qdisc_get_stab(tca[TCA_STAB]);
 961                        if (IS_ERR(stab)) {
 962                                err = PTR_ERR(stab);
 963                                goto err_out4;
 964                        }
 965                        rcu_assign_pointer(sch->stab, stab);
 966                }
 967                if (tca[TCA_RATE]) {
 968                        spinlock_t *root_lock;
 969
 970                        err = -EOPNOTSUPP;
 971                        if (sch->flags & TCQ_F_MQROOT)
 972                                goto err_out4;
 973
 974                        if ((sch->parent != TC_H_ROOT) &&
 975                            !(sch->flags & TCQ_F_INGRESS) &&
 976                            (!p || !(p->flags & TCQ_F_MQROOT)))
 977                                root_lock = qdisc_root_sleeping_lock(sch);
 978                        else
 979                                root_lock = qdisc_lock(sch);
 980
 981                        err = gen_new_estimator(&sch->bstats,
 982                                                sch->cpu_bstats,
 983                                                &sch->rate_est,
 984                                                root_lock,
 985                                                tca[TCA_RATE]);
 986                        if (err)
 987                                goto err_out4;
 988                }
 989
 990                qdisc_list_add(sch);
 991
 992                return sch;
 993        }
 994err_out3:
 995        dev_put(dev);
 996        kfree((char *) sch - sch->padded);
 997err_out2:
 998        module_put(ops->owner);
 999err_out:
1000        *errp = err;
1001        return NULL;
1002
1003err_out4:
1004        free_percpu(sch->cpu_bstats);
1005        free_percpu(sch->cpu_qstats);
1006        /*
1007         * Any broken qdiscs that would require a ops->reset() here?
1008         * The qdisc was never in action so it shouldn't be necessary.
1009         */
1010        qdisc_put_stab(rtnl_dereference(sch->stab));
1011        if (ops->destroy)
1012                ops->destroy(sch);
1013        goto err_out3;
1014}
1015
1016static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
1017{
1018        struct qdisc_size_table *ostab, *stab = NULL;
1019        int err = 0;
1020
1021        if (tca[TCA_OPTIONS]) {
1022                if (sch->ops->change == NULL)
1023                        return -EINVAL;
1024                err = sch->ops->change(sch, tca[TCA_OPTIONS]);
1025                if (err)
1026                        return err;
1027        }
1028
1029        if (tca[TCA_STAB]) {
1030                stab = qdisc_get_stab(tca[TCA_STAB]);
1031                if (IS_ERR(stab))
1032                        return PTR_ERR(stab);
1033        }
1034
1035        ostab = rtnl_dereference(sch->stab);
1036        rcu_assign_pointer(sch->stab, stab);
1037        qdisc_put_stab(ostab);
1038
1039        if (tca[TCA_RATE]) {
1040                /* NB: ignores errors from replace_estimator
1041                   because change can't be undone. */
1042                if (sch->flags & TCQ_F_MQROOT)
1043                        goto out;
1044                gen_replace_estimator(&sch->bstats,
1045                                      sch->cpu_bstats,
1046                                      &sch->rate_est,
1047                                      qdisc_root_sleeping_lock(sch),
1048                                      tca[TCA_RATE]);
1049        }
1050out:
1051        return 0;
1052}
1053
1054struct check_loop_arg {
1055        struct qdisc_walker     w;
1056        struct Qdisc            *p;
1057        int                     depth;
1058};
1059
1060static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
1061
1062static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1063{
1064        struct check_loop_arg   arg;
1065
1066        if (q->ops->cl_ops == NULL)
1067                return 0;
1068
1069        arg.w.stop = arg.w.skip = arg.w.count = 0;
1070        arg.w.fn = check_loop_fn;
1071        arg.depth = depth;
1072        arg.p = p;
1073        q->ops->cl_ops->walk(q, &arg.w);
1074        return arg.w.stop ? -ELOOP : 0;
1075}
1076
1077static int
1078check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1079{
1080        struct Qdisc *leaf;
1081        const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1082        struct check_loop_arg *arg = (struct check_loop_arg *)w;
1083
1084        leaf = cops->leaf(q, cl);
1085        if (leaf) {
1086                if (leaf == arg->p || arg->depth > 7)
1087                        return -ELOOP;
1088                return check_loop(leaf, arg->p, arg->depth + 1);
1089        }
1090        return 0;
1091}
1092
1093/*
1094 * Delete/get qdisc.
1095 */
1096
1097static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
1098{
1099        struct net *net = sock_net(skb->sk);
1100        struct tcmsg *tcm = nlmsg_data(n);
1101        struct nlattr *tca[TCA_MAX + 1];
1102        struct net_device *dev;
1103        u32 clid;
1104        struct Qdisc *q = NULL;
1105        struct Qdisc *p = NULL;
1106        int err;
1107
1108        if ((n->nlmsg_type != RTM_GETQDISC) &&
1109            !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1110                return -EPERM;
1111
1112        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1113        if (err < 0)
1114                return err;
1115
1116        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1117        if (!dev)
1118                return -ENODEV;
1119
1120        clid = tcm->tcm_parent;
1121        if (clid) {
1122                if (clid != TC_H_ROOT) {
1123                        if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1124                                p = qdisc_lookup(dev, TC_H_MAJ(clid));
1125                                if (!p)
1126                                        return -ENOENT;
1127                                q = qdisc_leaf(p, clid);
1128                        } else if (dev_ingress_queue(dev)) {
1129                                q = dev_ingress_queue(dev)->qdisc_sleeping;
1130                        }
1131                } else {
1132                        q = dev->qdisc;
1133                }
1134                if (!q)
1135                        return -ENOENT;
1136
1137                if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
1138                        return -EINVAL;
1139        } else {
1140                q = qdisc_lookup(dev, tcm->tcm_handle);
1141                if (!q)
1142                        return -ENOENT;
1143        }
1144
1145        if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1146                return -EINVAL;
1147
1148        if (n->nlmsg_type == RTM_DELQDISC) {
1149                if (!clid)
1150                        return -EINVAL;
1151                if (q->handle == 0)
1152                        return -ENOENT;
1153                err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
1154                if (err != 0)
1155                        return err;
1156        } else {
1157                qdisc_notify(net, skb, n, clid, NULL, q);
1158        }
1159        return 0;
1160}
1161
1162/*
1163 * Create/change qdisc.
1164 */
1165
1166static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
1167{
1168        struct net *net = sock_net(skb->sk);
1169        struct tcmsg *tcm;
1170        struct nlattr *tca[TCA_MAX + 1];
1171        struct net_device *dev;
1172        u32 clid;
1173        struct Qdisc *q, *p;
1174        int err;
1175
1176        if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1177                return -EPERM;
1178
1179replay:
1180        /* Reinit, just in case something touches this. */
1181        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1182        if (err < 0)
1183                return err;
1184
1185        tcm = nlmsg_data(n);
1186        clid = tcm->tcm_parent;
1187        q = p = NULL;
1188
1189        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1190        if (!dev)
1191                return -ENODEV;
1192
1193
1194        if (clid) {
1195                if (clid != TC_H_ROOT) {
1196                        if (clid != TC_H_INGRESS) {
1197                                p = qdisc_lookup(dev, TC_H_MAJ(clid));
1198                                if (!p)
1199                                        return -ENOENT;
1200                                q = qdisc_leaf(p, clid);
1201                        } else if (dev_ingress_queue_create(dev)) {
1202                                q = dev_ingress_queue(dev)->qdisc_sleeping;
1203                        }
1204                } else {
1205                        q = dev->qdisc;
1206                }
1207
1208                /* It may be default qdisc, ignore it */
1209                if (q && q->handle == 0)
1210                        q = NULL;
1211
1212                if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1213                        if (tcm->tcm_handle) {
1214                                if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
1215                                        return -EEXIST;
1216                                if (TC_H_MIN(tcm->tcm_handle))
1217                                        return -EINVAL;
1218                                q = qdisc_lookup(dev, tcm->tcm_handle);
1219                                if (!q)
1220                                        goto create_n_graft;
1221                                if (n->nlmsg_flags & NLM_F_EXCL)
1222                                        return -EEXIST;
1223                                if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1224                                        return -EINVAL;
1225                                if (q == p ||
1226                                    (p && check_loop(q, p, 0)))
1227                                        return -ELOOP;
1228                                atomic_inc(&q->refcnt);
1229                                goto graft;
1230                        } else {
1231                                if (!q)
1232                                        goto create_n_graft;
1233
1234                                /* This magic test requires explanation.
1235                                 *
1236                                 *   We know, that some child q is already
1237                                 *   attached to this parent and have choice:
1238                                 *   either to change it or to create/graft new one.
1239                                 *
1240                                 *   1. We are allowed to create/graft only
1241                                 *   if CREATE and REPLACE flags are set.
1242                                 *
1243                                 *   2. If EXCL is set, requestor wanted to say,
1244                                 *   that qdisc tcm_handle is not expected
1245                                 *   to exist, so that we choose create/graft too.
1246                                 *
1247                                 *   3. The last case is when no flags are set.
1248                                 *   Alas, it is sort of hole in API, we
1249                                 *   cannot decide what to do unambiguously.
1250                                 *   For now we select create/graft, if
1251                                 *   user gave KIND, which does not match existing.
1252                                 */
1253                                if ((n->nlmsg_flags & NLM_F_CREATE) &&
1254                                    (n->nlmsg_flags & NLM_F_REPLACE) &&
1255                                    ((n->nlmsg_flags & NLM_F_EXCL) ||
1256                                     (tca[TCA_KIND] &&
1257                                      nla_strcmp(tca[TCA_KIND], q->ops->id))))
1258                                        goto create_n_graft;
1259                        }
1260                }
1261        } else {
1262                if (!tcm->tcm_handle)
1263                        return -EINVAL;
1264                q = qdisc_lookup(dev, tcm->tcm_handle);
1265        }
1266
1267        /* Change qdisc parameters */
1268        if (q == NULL)
1269                return -ENOENT;
1270        if (n->nlmsg_flags & NLM_F_EXCL)
1271                return -EEXIST;
1272        if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1273                return -EINVAL;
1274        err = qdisc_change(q, tca);
1275        if (err == 0)
1276                qdisc_notify(net, skb, n, clid, NULL, q);
1277        return err;
1278
1279create_n_graft:
1280        if (!(n->nlmsg_flags & NLM_F_CREATE))
1281                return -ENOENT;
1282        if (clid == TC_H_INGRESS) {
1283                if (dev_ingress_queue(dev))
1284                        q = qdisc_create(dev, dev_ingress_queue(dev), p,
1285                                         tcm->tcm_parent, tcm->tcm_parent,
1286                                         tca, &err);
1287                else
1288                        err = -ENOENT;
1289        } else {
1290                struct netdev_queue *dev_queue;
1291
1292                if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1293                        dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1294                else if (p)
1295                        dev_queue = p->dev_queue;
1296                else
1297                        dev_queue = netdev_get_tx_queue(dev, 0);
1298
1299                q = qdisc_create(dev, dev_queue, p,
1300                                 tcm->tcm_parent, tcm->tcm_handle,
1301                                 tca, &err);
1302        }
1303        if (q == NULL) {
1304                if (err == -EAGAIN)
1305                        goto replay;
1306                return err;
1307        }
1308
1309graft:
1310        err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
1311        if (err) {
1312                if (q)
1313                        qdisc_destroy(q);
1314                return err;
1315        }
1316
1317        return 0;
1318}
1319
1320static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
1321                         u32 portid, u32 seq, u16 flags, int event)
1322{
1323        struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
1324        struct gnet_stats_queue __percpu *cpu_qstats = NULL;
1325        struct tcmsg *tcm;
1326        struct nlmsghdr  *nlh;
1327        unsigned char *b = skb_tail_pointer(skb);
1328        struct gnet_dump d;
1329        struct qdisc_size_table *stab;
1330        __u32 qlen;
1331
1332        cond_resched();
1333        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1334        if (!nlh)
1335                goto out_nlmsg_trim;
1336        tcm = nlmsg_data(nlh);
1337        tcm->tcm_family = AF_UNSPEC;
1338        tcm->tcm__pad1 = 0;
1339        tcm->tcm__pad2 = 0;
1340        tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1341        tcm->tcm_parent = clid;
1342        tcm->tcm_handle = q->handle;
1343        tcm->tcm_info = atomic_read(&q->refcnt);
1344        if (nla_put_string(skb, TCA_KIND, q->ops->id))
1345                goto nla_put_failure;
1346        if (q->ops->dump && q->ops->dump(q, skb) < 0)
1347                goto nla_put_failure;
1348        qlen = q->q.qlen;
1349
1350        stab = rtnl_dereference(q->stab);
1351        if (stab && qdisc_dump_stab(skb, stab) < 0)
1352                goto nla_put_failure;
1353
1354        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1355                                         qdisc_root_sleeping_lock(q), &d) < 0)
1356                goto nla_put_failure;
1357
1358        if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
1359                goto nla_put_failure;
1360
1361        if (qdisc_is_percpu_stats(q)) {
1362                cpu_bstats = q->cpu_bstats;
1363                cpu_qstats = q->cpu_qstats;
1364        }
1365
1366        if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats) < 0 ||
1367            gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
1368            gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
1369                goto nla_put_failure;
1370
1371        if (gnet_stats_finish_copy(&d) < 0)
1372                goto nla_put_failure;
1373
1374        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1375        return skb->len;
1376
1377out_nlmsg_trim:
1378nla_put_failure:
1379        nlmsg_trim(skb, b);
1380        return -1;
1381}
1382
1383static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1384{
1385        return (q->flags & TCQ_F_BUILTIN) ? true : false;
1386}
1387
1388static int qdisc_notify(struct net *net, struct sk_buff *oskb,
1389                        struct nlmsghdr *n, u32 clid,
1390                        struct Qdisc *old, struct Qdisc *new)
1391{
1392        struct sk_buff *skb;
1393        u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1394
1395        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1396        if (!skb)
1397                return -ENOBUFS;
1398
1399        if (old && !tc_qdisc_dump_ignore(old)) {
1400                if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
1401                                  0, RTM_DELQDISC) < 0)
1402                        goto err_out;
1403        }
1404        if (new && !tc_qdisc_dump_ignore(new)) {
1405                if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
1406                                  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1407                        goto err_out;
1408        }
1409
1410        if (skb->len)
1411                return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1412                                      n->nlmsg_flags & NLM_F_ECHO);
1413
1414err_out:
1415        kfree_skb(skb);
1416        return -EINVAL;
1417}
1418
1419static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1420                              struct netlink_callback *cb,
1421                              int *q_idx_p, int s_q_idx)
1422{
1423        int ret = 0, q_idx = *q_idx_p;
1424        struct Qdisc *q;
1425
1426        if (!root)
1427                return 0;
1428
1429        q = root;
1430        if (q_idx < s_q_idx) {
1431                q_idx++;
1432        } else {
1433                if (!tc_qdisc_dump_ignore(q) &&
1434                    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1435                                  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1436                        goto done;
1437                q_idx++;
1438        }
1439        list_for_each_entry(q, &root->list, list) {
1440                if (q_idx < s_q_idx) {
1441                        q_idx++;
1442                        continue;
1443                }
1444                if (!tc_qdisc_dump_ignore(q) &&
1445                    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1446                                  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1447                        goto done;
1448                q_idx++;
1449        }
1450
1451out:
1452        *q_idx_p = q_idx;
1453        return ret;
1454done:
1455        ret = -1;
1456        goto out;
1457}
1458
1459static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1460{
1461        struct net *net = sock_net(skb->sk);
1462        int idx, q_idx;
1463        int s_idx, s_q_idx;
1464        struct net_device *dev;
1465
1466        s_idx = cb->args[0];
1467        s_q_idx = q_idx = cb->args[1];
1468
1469        idx = 0;
1470        ASSERT_RTNL();
1471        for_each_netdev(net, dev) {
1472                struct netdev_queue *dev_queue;
1473
1474                if (idx < s_idx)
1475                        goto cont;
1476                if (idx > s_idx)
1477                        s_q_idx = 0;
1478                q_idx = 0;
1479
1480                if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
1481                        goto done;
1482
1483                dev_queue = dev_ingress_queue(dev);
1484                if (dev_queue &&
1485                    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1486                                       &q_idx, s_q_idx) < 0)
1487                        goto done;
1488
1489cont:
1490                idx++;
1491        }
1492
1493done:
1494        cb->args[0] = idx;
1495        cb->args[1] = q_idx;
1496
1497        return skb->len;
1498}
1499
1500
1501
1502/************************************************
1503 *      Traffic classes manipulation.           *
1504 ************************************************/
1505
1506
1507
1508static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n)
1509{
1510        struct net *net = sock_net(skb->sk);
1511        struct tcmsg *tcm = nlmsg_data(n);
1512        struct nlattr *tca[TCA_MAX + 1];
1513        struct net_device *dev;
1514        struct Qdisc *q = NULL;
1515        const struct Qdisc_class_ops *cops;
1516        unsigned long cl = 0;
1517        unsigned long new_cl;
1518        u32 portid;
1519        u32 clid;
1520        u32 qid;
1521        int err;
1522
1523        if ((n->nlmsg_type != RTM_GETTCLASS) &&
1524            !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1525                return -EPERM;
1526
1527        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1528        if (err < 0)
1529                return err;
1530
1531        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1532        if (!dev)
1533                return -ENODEV;
1534
1535        /*
1536           parent == TC_H_UNSPEC - unspecified parent.
1537           parent == TC_H_ROOT   - class is root, which has no parent.
1538           parent == X:0         - parent is root class.
1539           parent == X:Y         - parent is a node in hierarchy.
1540           parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
1541
1542           handle == 0:0         - generate handle from kernel pool.
1543           handle == 0:Y         - class is X:Y, where X:0 is qdisc.
1544           handle == X:Y         - clear.
1545           handle == X:0         - root class.
1546         */
1547
1548        /* Step 1. Determine qdisc handle X:0 */
1549
1550        portid = tcm->tcm_parent;
1551        clid = tcm->tcm_handle;
1552        qid = TC_H_MAJ(clid);
1553
1554        if (portid != TC_H_ROOT) {
1555                u32 qid1 = TC_H_MAJ(portid);
1556
1557                if (qid && qid1) {
1558                        /* If both majors are known, they must be identical. */
1559                        if (qid != qid1)
1560                                return -EINVAL;
1561                } else if (qid1) {
1562                        qid = qid1;
1563                } else if (qid == 0)
1564                        qid = dev->qdisc->handle;
1565
1566                /* Now qid is genuine qdisc handle consistent
1567                 * both with parent and child.
1568                 *
1569                 * TC_H_MAJ(portid) still may be unspecified, complete it now.
1570                 */
1571                if (portid)
1572                        portid = TC_H_MAKE(qid, portid);
1573        } else {
1574                if (qid == 0)
1575                        qid = dev->qdisc->handle;
1576        }
1577
1578        /* OK. Locate qdisc */
1579        q = qdisc_lookup(dev, qid);
1580        if (!q)
1581                return -ENOENT;
1582
1583        /* An check that it supports classes */
1584        cops = q->ops->cl_ops;
1585        if (cops == NULL)
1586                return -EINVAL;
1587
1588        /* Now try to get class */
1589        if (clid == 0) {
1590                if (portid == TC_H_ROOT)
1591                        clid = qid;
1592        } else
1593                clid = TC_H_MAKE(qid, clid);
1594
1595        if (clid)
1596                cl = cops->get(q, clid);
1597
1598        if (cl == 0) {
1599                err = -ENOENT;
1600                if (n->nlmsg_type != RTM_NEWTCLASS ||
1601                    !(n->nlmsg_flags & NLM_F_CREATE))
1602                        goto out;
1603        } else {
1604                switch (n->nlmsg_type) {
1605                case RTM_NEWTCLASS:
1606                        err = -EEXIST;
1607                        if (n->nlmsg_flags & NLM_F_EXCL)
1608                                goto out;
1609                        break;
1610                case RTM_DELTCLASS:
1611                        err = -EOPNOTSUPP;
1612                        if (cops->delete)
1613                                err = cops->delete(q, cl);
1614                        if (err == 0)
1615                                tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
1616                        goto out;
1617                case RTM_GETTCLASS:
1618                        err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
1619                        goto out;
1620                default:
1621                        err = -EINVAL;
1622                        goto out;
1623                }
1624        }
1625
1626        new_cl = cl;
1627        err = -EOPNOTSUPP;
1628        if (cops->change)
1629                err = cops->change(q, clid, portid, tca, &new_cl);
1630        if (err == 0)
1631                tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
1632
1633out:
1634        if (cl)
1635                cops->put(q, cl);
1636
1637        return err;
1638}
1639
1640
1641static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1642                          unsigned long cl,
1643                          u32 portid, u32 seq, u16 flags, int event)
1644{
1645        struct tcmsg *tcm;
1646        struct nlmsghdr  *nlh;
1647        unsigned char *b = skb_tail_pointer(skb);
1648        struct gnet_dump d;
1649        const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1650
1651        cond_resched();
1652        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1653        if (!nlh)
1654                goto out_nlmsg_trim;
1655        tcm = nlmsg_data(nlh);
1656        tcm->tcm_family = AF_UNSPEC;
1657        tcm->tcm__pad1 = 0;
1658        tcm->tcm__pad2 = 0;
1659        tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1660        tcm->tcm_parent = q->handle;
1661        tcm->tcm_handle = q->handle;
1662        tcm->tcm_info = 0;
1663        if (nla_put_string(skb, TCA_KIND, q->ops->id))
1664                goto nla_put_failure;
1665        if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1666                goto nla_put_failure;
1667
1668        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1669                                         qdisc_root_sleeping_lock(q), &d) < 0)
1670                goto nla_put_failure;
1671
1672        if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1673                goto nla_put_failure;
1674
1675        if (gnet_stats_finish_copy(&d) < 0)
1676                goto nla_put_failure;
1677
1678        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1679        return skb->len;
1680
1681out_nlmsg_trim:
1682nla_put_failure:
1683        nlmsg_trim(skb, b);
1684        return -1;
1685}
1686
1687static int tclass_notify(struct net *net, struct sk_buff *oskb,
1688                         struct nlmsghdr *n, struct Qdisc *q,
1689                         unsigned long cl, int event)
1690{
1691        struct sk_buff *skb;
1692        u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1693
1694        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1695        if (!skb)
1696                return -ENOBUFS;
1697
1698        if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1699                kfree_skb(skb);
1700                return -EINVAL;
1701        }
1702
1703        return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1704                              n->nlmsg_flags & NLM_F_ECHO);
1705}
1706
1707struct qdisc_dump_args {
1708        struct qdisc_walker     w;
1709        struct sk_buff          *skb;
1710        struct netlink_callback *cb;
1711};
1712
1713static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1714{
1715        struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1716
1717        return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
1718                              a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1719}
1720
1721static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1722                                struct tcmsg *tcm, struct netlink_callback *cb,
1723                                int *t_p, int s_t)
1724{
1725        struct qdisc_dump_args arg;
1726
1727        if (tc_qdisc_dump_ignore(q) ||
1728            *t_p < s_t || !q->ops->cl_ops ||
1729            (tcm->tcm_parent &&
1730             TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1731                (*t_p)++;
1732                return 0;
1733        }
1734        if (*t_p > s_t)
1735                memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1736        arg.w.fn = qdisc_class_dump;
1737        arg.skb = skb;
1738        arg.cb = cb;
1739        arg.w.stop  = 0;
1740        arg.w.skip = cb->args[1];
1741        arg.w.count = 0;
1742        q->ops->cl_ops->walk(q, &arg.w);
1743        cb->args[1] = arg.w.count;
1744        if (arg.w.stop)
1745                return -1;
1746        (*t_p)++;
1747        return 0;
1748}
1749
1750static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1751                               struct tcmsg *tcm, struct netlink_callback *cb,
1752                               int *t_p, int s_t)
1753{
1754        struct Qdisc *q;
1755
1756        if (!root)
1757                return 0;
1758
1759        if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1760                return -1;
1761
1762        list_for_each_entry(q, &root->list, list) {
1763                if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1764                        return -1;
1765        }
1766
1767        return 0;
1768}
1769
1770static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1771{
1772        struct tcmsg *tcm = nlmsg_data(cb->nlh);
1773        struct net *net = sock_net(skb->sk);
1774        struct netdev_queue *dev_queue;
1775        struct net_device *dev;
1776        int t, s_t;
1777
1778        if (nlmsg_len(cb->nlh) < sizeof(*tcm))
1779                return 0;
1780        dev = dev_get_by_index(net, tcm->tcm_ifindex);
1781        if (!dev)
1782                return 0;
1783
1784        s_t = cb->args[0];
1785        t = 0;
1786
1787        if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
1788                goto done;
1789
1790        dev_queue = dev_ingress_queue(dev);
1791        if (dev_queue &&
1792            tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
1793                                &t, s_t) < 0)
1794                goto done;
1795
1796done:
1797        cb->args[0] = t;
1798
1799        dev_put(dev);
1800        return skb->len;
1801}
1802
1803/* Main classifier routine: scans classifier chain attached
1804 * to this qdisc, (optionally) tests for protocol and asks
1805 * specific classifiers.
1806 */
1807int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
1808                       struct tcf_result *res)
1809{
1810        __be16 protocol = skb->protocol;
1811        int err;
1812
1813        for (; tp; tp = rcu_dereference_bh(tp->next)) {
1814                if (tp->protocol != protocol &&
1815                    tp->protocol != htons(ETH_P_ALL))
1816                        continue;
1817                err = tp->classify(skb, tp, res);
1818
1819                if (err >= 0) {
1820#ifdef CONFIG_NET_CLS_ACT
1821                        if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1822                                skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1823#endif
1824                        return err;
1825                }
1826        }
1827        return -1;
1828}
1829EXPORT_SYMBOL(tc_classify_compat);
1830
/*
 * Classify @skb against the chain starting at @tp.
 *
 * Without CONFIG_NET_CLS_ACT this is simply tc_classify_compat().  With
 * it, a TC_ACT_RECLASSIFY verdict restarts classification from the head
 * of the chain.  The number of restarts is tracked in the verd field of
 * skb->tc_verd; once it has reached MAX_REC_LOOP a rate-limited notice
 * is logged and TC_ACT_SHOT is returned.
 *
 * Returns the verdict of tc_classify_compat(), or TC_ACT_SHOT on a
 * reclassify loop.
 */
int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
		struct tcf_result *res)
{
#ifdef CONFIG_NET_CLS_ACT
	const struct tcf_proto *head = tp;

	for (;;) {
		int verdict = tc_classify_compat(skb, tp, res);
		u32 depth;

		if (verdict != TC_ACT_RECLASSIFY)
			return verdict;

		depth = G_TC_VERD(skb->tc_verd);
		tp = head;

		if (depth >= MAX_REC_LOOP) {
			net_notice_ratelimited("%s: packet reclassify loop rule prio %u protocol %02x\n",
					       tp->q->ops->id,
					       tp->prio & 0xffff,
					       ntohs(tp->protocol));
			return TC_ACT_SHOT;
		}

		/* Record one more reclassification pass and try again. */
		depth++;
		skb->tc_verd = SET_TC_VERD(skb->tc_verd, depth);
	}
#else
	return tc_classify_compat(skb, tp, res);
#endif
}
EXPORT_SYMBOL(tc_classify);
1860
1861void tcf_destroy(struct tcf_proto *tp)
1862{
1863        tp->ops->destroy(tp);
1864        module_put(tp->ops->owner);
1865        kfree_rcu(tp, rcu);
1866}
1867
1868void tcf_destroy_chain(struct tcf_proto __rcu **fl)
1869{
1870        struct tcf_proto *tp;
1871
1872        while ((tp = rtnl_dereference(*fl)) != NULL) {
1873                RCU_INIT_POINTER(*fl, tp->next);
1874                tcf_destroy(tp);
1875        }
1876}
1877EXPORT_SYMBOL(tcf_destroy_chain);
1878
1879#ifdef CONFIG_PROC_FS
1880static int psched_show(struct seq_file *seq, void *v)
1881{
1882        struct timespec ts;
1883
1884        hrtimer_get_res(CLOCK_MONOTONIC, &ts);
1885        seq_printf(seq, "%08x %08x %08x %08x\n",
1886                   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
1887                   1000000,
1888                   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
1889
1890        return 0;
1891}
1892
1893static int psched_open(struct inode *inode, struct file *file)
1894{
1895        return single_open(file, psched_show, NULL);
1896}
1897
1898static const struct file_operations psched_fops = {
1899        .owner = THIS_MODULE,
1900        .open = psched_open,
1901        .read  = seq_read,
1902        .llseek = seq_lseek,
1903        .release = single_release,
1904};
1905
1906static int __net_init psched_net_init(struct net *net)
1907{
1908        struct proc_dir_entry *e;
1909
1910        e = proc_create("psched", 0, net->proc_net, &psched_fops);
1911        if (e == NULL)
1912                return -ENOMEM;
1913
1914        return 0;
1915}
1916
1917static void __net_exit psched_net_exit(struct net *net)
1918{
1919        remove_proc_entry("psched", net->proc_net);
1920}
1921#else
1922static int __net_init psched_net_init(struct net *net)
1923{
1924        return 0;
1925}
1926
1927static void __net_exit psched_net_exit(struct net *net)
1928{
1929}
1930#endif
1931
1932static struct pernet_operations psched_net_ops = {
1933        .init = psched_net_init,
1934        .exit = psched_net_exit,
1935};
1936
1937static int __init pktsched_init(void)
1938{
1939        int err;
1940
1941        err = register_pernet_subsys(&psched_net_ops);
1942        if (err) {
1943                pr_err("pktsched_init: "
1944                       "cannot initialize per netns operations\n");
1945                return err;
1946        }
1947
1948        register_qdisc(&pfifo_fast_ops);
1949        register_qdisc(&pfifo_qdisc_ops);
1950        register_qdisc(&bfifo_qdisc_ops);
1951        register_qdisc(&pfifo_head_drop_qdisc_ops);
1952        register_qdisc(&mq_qdisc_ops);
1953
1954        rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
1955        rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
1956        rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL);
1957        rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
1958        rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
1959        rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL);
1960
1961        return 0;
1962}
1963
1964subsys_initcall(pktsched_init);
1965