linux/net/sched/sch_api.c
<<
>>
Prefs
   1/*
   2 * net/sched/sch_api.c  Packet scheduler API.
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10 *
  11 * Fixes:
  12 *
  13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
  14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
  15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
  16 */
  17
  18#include <linux/module.h>
  19#include <linux/types.h>
  20#include <linux/kernel.h>
  21#include <linux/string.h>
  22#include <linux/errno.h>
  23#include <linux/skbuff.h>
  24#include <linux/init.h>
  25#include <linux/proc_fs.h>
  26#include <linux/seq_file.h>
  27#include <linux/kmod.h>
  28#include <linux/list.h>
  29#include <linux/hrtimer.h>
  30#include <linux/lockdep.h>
  31#include <linux/slab.h>
  32
  33#include <net/net_namespace.h>
  34#include <net/sock.h>
  35#include <net/netlink.h>
  36#include <net/pkt_sched.h>
  37
  38static int qdisc_notify(struct net *net, struct sk_buff *oskb,
  39                        struct nlmsghdr *n, u32 clid,
  40                        struct Qdisc *old, struct Qdisc *new);
  41static int tclass_notify(struct net *net, struct sk_buff *oskb,
  42                         struct nlmsghdr *n, struct Qdisc *q,
  43                         unsigned long cl, int event);
  44
  45/*
  46
  47   Short review.
  48   -------------
  49
  50   This file consists of two interrelated parts:
  51
  52   1. queueing disciplines manager frontend.
  53   2. traffic classes manager frontend.
  54
  55   Generally, queueing discipline ("qdisc") is a black box,
  56   which is able to enqueue packets and to dequeue them (when
  57   device is ready to send something) in order and at times
  58   determined by algorithm hidden in it.
  59
  60   Qdiscs are divided into two categories:
  61   - "queues", which have no internal structure visible from outside.
  62   - "schedulers", which split all the packets into "traffic classes",
  63     using "packet classifiers" (see cls_api.c).
  64
  65   In turn, classes may have child qdiscs (as rule, queues)
  66   attached to them etc. etc. etc.
  67
  68   The goal of the routines in this file is to translate
  69   information supplied by the user in the form of handles
  70   into a form more intelligible to the kernel, to perform sanity
  71   checks and the part of the work that is common to all qdiscs,
  72   and to provide rtnetlink notifications.
  73
  74   All real intelligent work is done inside qdisc modules.
  75
  76
  77
  78   Every discipline has two major routines: enqueue and dequeue.
  79
  80   ---dequeue
  81
  82   dequeue usually returns a skb to send. It is allowed to return NULL,
  83   but it does not mean that queue is empty, it just means that
  84   discipline does not want to send anything this time.
  85   Queue is really empty if q->q.qlen == 0.
  86   For complicated disciplines with multiple queues q->q is not
  87   a real packet queue; however, q->q.qlen must still be valid.
  88
  89   ---enqueue
  90
  91   enqueue returns 0 if the packet was enqueued successfully.
  92   If a packet (this one or another one) was dropped, it returns
  93   a non-zero error code.
  94   NET_XMIT_DROP        - this packet dropped
  95     Expected action: do not backoff, but wait until queue will clear.
  96   NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
  97     Expected action: backoff or ignore
  98   NET_XMIT_POLICED     - dropped by police.
  99     Expected action: backoff or error to real-time apps.
 100
 101   Auxiliary routines:
 102
 103   ---peek
 104
 105   like dequeue but without removing a packet from the queue
 106
 107   ---reset
 108
 109   returns qdisc to initial state: purge all buffers, clear all
 110   timers, counters (except for statistics) etc.
 111
 112   ---init
 113
 114   initializes newly created qdisc.
 115
 116   ---destroy
 117
 118   destroys resources allocated by init and during lifetime of qdisc.
 119
 120   ---change
 121
 122   changes qdisc parameters.
 123 */
 124
 125/* Protects list of registered TC modules. It is pure SMP lock. */
 /* Taken for writing by register_qdisc()/unregister_qdisc() and for reading
  * by qdisc_lookup_ops() while the qdisc_base list is walked.
  */
 126static DEFINE_RWLOCK(qdisc_mod_lock);
 127
 128
 129/************************************************
 130 *      Queueing disciplines manipulation.      *
 131 ************************************************/
 132
 133
 134/* The list of all installed queueing disciplines. */
 135
 /* Head of a singly linked list chained through Qdisc_ops->next;
  * register_qdisc() appends new entries at the tail.
  */
 136static struct Qdisc_ops *qdisc_base;
 137
 138/* Register/unregister queueing discipline */
 139
 140int register_qdisc(struct Qdisc_ops *qops)
 141{
 142        struct Qdisc_ops *q, **qp;
 143        int rc = -EEXIST;
 144
 145        write_lock(&qdisc_mod_lock);
 146        for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
 147                if (!strcmp(qops->id, q->id))
 148                        goto out;
 149
 150        if (qops->enqueue == NULL)
 151                qops->enqueue = noop_qdisc_ops.enqueue;
 152        if (qops->peek == NULL) {
 153                if (qops->dequeue == NULL)
 154                        qops->peek = noop_qdisc_ops.peek;
 155                else
 156                        goto out_einval;
 157        }
 158        if (qops->dequeue == NULL)
 159                qops->dequeue = noop_qdisc_ops.dequeue;
 160
 161        if (qops->cl_ops) {
 162                const struct Qdisc_class_ops *cops = qops->cl_ops;
 163
 164                if (!(cops->get && cops->put && cops->walk && cops->leaf))
 165                        goto out_einval;
 166
 167                if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
 168                        goto out_einval;
 169        }
 170
 171        qops->next = NULL;
 172        *qp = qops;
 173        rc = 0;
 174out:
 175        write_unlock(&qdisc_mod_lock);
 176        return rc;
 177
 178out_einval:
 179        rc = -EINVAL;
 180        goto out;
 181}
 182EXPORT_SYMBOL(register_qdisc);
 183
 184int unregister_qdisc(struct Qdisc_ops *qops)
 185{
 186        struct Qdisc_ops *q, **qp;
 187        int err = -ENOENT;
 188
 189        write_lock(&qdisc_mod_lock);
 190        for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
 191                if (q == qops)
 192                        break;
 193        if (q) {
 194                *qp = q->next;
 195                q->next = NULL;
 196                err = 0;
 197        }
 198        write_unlock(&qdisc_mod_lock);
 199        return err;
 200}
 201EXPORT_SYMBOL(unregister_qdisc);
 202
 203/* We know handle. Find qdisc among all qdisc's attached to device
 204   (root qdisc, all its children, children of children etc.)
 205 */
 206
 207static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
 208{
 209        struct Qdisc *q;
 210
 211        if (!(root->flags & TCQ_F_BUILTIN) &&
 212            root->handle == handle)
 213                return root;
 214
 215        list_for_each_entry(q, &root->list, list) {
 216                if (q->handle == handle)
 217                        return q;
 218        }
 219        return NULL;
 220}
 221
 222static void qdisc_list_add(struct Qdisc *q)
 223{
 224        if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
 225                list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
 226}
 227
 228void qdisc_list_del(struct Qdisc *q)
 229{
 230        if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
 231                list_del(&q->list);
 232}
 233EXPORT_SYMBOL(qdisc_list_del);
 234
 235struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
 236{
 237        struct Qdisc *q;
 238
 239        q = qdisc_match_from_root(dev->qdisc, handle);
 240        if (q)
 241                goto out;
 242
 243        if (dev_ingress_queue(dev))
 244                q = qdisc_match_from_root(
 245                        dev_ingress_queue(dev)->qdisc_sleeping,
 246                        handle);
 247out:
 248        return q;
 249}
 250
 251static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
 252{
 253        unsigned long cl;
 254        struct Qdisc *leaf;
 255        const struct Qdisc_class_ops *cops = p->ops->cl_ops;
 256
 257        if (cops == NULL)
 258                return NULL;
 259        cl = cops->get(p, classid);
 260
 261        if (cl == 0)
 262                return NULL;
 263        leaf = cops->leaf(p, cl);
 264        cops->put(p, cl);
 265        return leaf;
 266}
 267
 268/* Find queueing discipline by name */
 269
 270static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
 271{
 272        struct Qdisc_ops *q = NULL;
 273
 274        if (kind) {
 275                read_lock(&qdisc_mod_lock);
 276                for (q = qdisc_base; q; q = q->next) {
 277                        if (nla_strcmp(kind, q->id) == 0) {
 278                                if (!try_module_get(q->owner))
 279                                        q = NULL;
 280                                break;
 281                        }
 282                }
 283                read_unlock(&qdisc_mod_lock);
 284        }
 285        return q;
 286}
 287
 /* Head of the singly linked list (chained through ->next) of rate tables
  * handed out by qdisc_get_rtab() and released by qdisc_put_rtab().
  */
 288static struct qdisc_rate_table *qdisc_rtab_list;
 289
 290struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
 291{
 292        struct qdisc_rate_table *rtab;
 293
 294        for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
 295                if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
 296                        rtab->refcnt++;
 297                        return rtab;
 298                }
 299        }
 300
 301        if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
 302            nla_len(tab) != TC_RTAB_SIZE)
 303                return NULL;
 304
 305        rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
 306        if (rtab) {
 307                rtab->rate = *r;
 308                rtab->refcnt = 1;
 309                memcpy(rtab->data, nla_data(tab), 1024);
 310                rtab->next = qdisc_rtab_list;
 311                qdisc_rtab_list = rtab;
 312        }
 313        return rtab;
 314}
 315EXPORT_SYMBOL(qdisc_get_rtab);
 316
 317void qdisc_put_rtab(struct qdisc_rate_table *tab)
 318{
 319        struct qdisc_rate_table *rtab, **rtabp;
 320
 321        if (!tab || --tab->refcnt)
 322                return;
 323
 324        for (rtabp = &qdisc_rtab_list;
 325             (rtab = *rtabp) != NULL;
 326             rtabp = &rtab->next) {
 327                if (rtab == tab) {
 328                        *rtabp = rtab->next;
 329                        kfree(rtab);
 330                        return;
 331                }
 332        }
 333}
 334EXPORT_SYMBOL(qdisc_put_rtab);
 335
 /* List of size tables created by qdisc_get_stab(), and the spinlock that
  * qdisc_get_stab()/qdisc_put_stab() hold while walking or modifying it.
  */
 336static LIST_HEAD(qdisc_stab_list);
 337static DEFINE_SPINLOCK(qdisc_stab_lock);
 338
 /* Attribute policy passed to nla_parse_nested() by qdisc_get_stab():
  * TCA_STAB_BASE is read there as a struct tc_sizespec, TCA_STAB_DATA as an
  * array of u16 table entries.
  */
 339static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
 340        [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
 341        [TCA_STAB_DATA] = { .type = NLA_BINARY },
 342};
 343
 344static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
 345{
 346        struct nlattr *tb[TCA_STAB_MAX + 1];
 347        struct qdisc_size_table *stab;
 348        struct tc_sizespec *s;
 349        unsigned int tsize = 0;
 350        u16 *tab = NULL;
 351        int err;
 352
 353        err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
 354        if (err < 0)
 355                return ERR_PTR(err);
 356        if (!tb[TCA_STAB_BASE])
 357                return ERR_PTR(-EINVAL);
 358
 359        s = nla_data(tb[TCA_STAB_BASE]);
 360
 361        if (s->tsize > 0) {
 362                if (!tb[TCA_STAB_DATA])
 363                        return ERR_PTR(-EINVAL);
 364                tab = nla_data(tb[TCA_STAB_DATA]);
 365                tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
 366        }
 367
 368        if (tsize != s->tsize || (!tab && tsize > 0))
 369                return ERR_PTR(-EINVAL);
 370
 371        spin_lock(&qdisc_stab_lock);
 372
 373        list_for_each_entry(stab, &qdisc_stab_list, list) {
 374                if (memcmp(&stab->szopts, s, sizeof(*s)))
 375                        continue;
 376                if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
 377                        continue;
 378                stab->refcnt++;
 379                spin_unlock(&qdisc_stab_lock);
 380                return stab;
 381        }
 382
 383        spin_unlock(&qdisc_stab_lock);
 384
 385        stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
 386        if (!stab)
 387                return ERR_PTR(-ENOMEM);
 388
 389        stab->refcnt = 1;
 390        stab->szopts = *s;
 391        if (tsize > 0)
 392                memcpy(stab->data, tab, tsize * sizeof(u16));
 393
 394        spin_lock(&qdisc_stab_lock);
 395        list_add_tail(&stab->list, &qdisc_stab_list);
 396        spin_unlock(&qdisc_stab_lock);
 397
 398        return stab;
 399}
 400
 401static void stab_kfree_rcu(struct rcu_head *head)
 402{
 403        kfree(container_of(head, struct qdisc_size_table, rcu));
 404}
 405
 406void qdisc_put_stab(struct qdisc_size_table *tab)
 407{
 408        if (!tab)
 409                return;
 410
 411        spin_lock(&qdisc_stab_lock);
 412
 413        if (--tab->refcnt == 0) {
 414                list_del(&tab->list);
 415                call_rcu_bh(&tab->rcu, stab_kfree_rcu);
 416        }
 417
 418        spin_unlock(&qdisc_stab_lock);
 419}
 420EXPORT_SYMBOL(qdisc_put_stab);
 421
 422static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
 423{
 424        struct nlattr *nest;
 425
 426        nest = nla_nest_start(skb, TCA_STAB);
 427        if (nest == NULL)
 428                goto nla_put_failure;
 429        if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
 430                goto nla_put_failure;
 431        nla_nest_end(skb, nest);
 432
 433        return skb->len;
 434
 435nla_put_failure:
 436        return -1;
 437}
 438
 439void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
 440{
 441        int pkt_len, slot;
 442
 443        pkt_len = skb->len + stab->szopts.overhead;
 444        if (unlikely(!stab->szopts.tsize))
 445                goto out;
 446
 447        slot = pkt_len + stab->szopts.cell_align;
 448        if (unlikely(slot < 0))
 449                slot = 0;
 450
 451        slot >>= stab->szopts.cell_log;
 452        if (likely(slot < stab->szopts.tsize))
 453                pkt_len = stab->data[slot];
 454        else
 455                pkt_len = stab->data[stab->szopts.tsize - 1] *
 456                                (slot / stab->szopts.tsize) +
 457                                stab->data[slot % stab->szopts.tsize];
 458
 459        pkt_len <<= stab->szopts.size_log;
 460out:
 461        if (unlikely(pkt_len < 1))
 462                pkt_len = 1;
 463        qdisc_skb_cb(skb)->pkt_len = pkt_len;
 464}
 465EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
 466
 467void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
 468{
 469        if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
 470                pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
 471                        txt, qdisc->ops->id, qdisc->handle >> 16);
 472                qdisc->flags |= TCQ_F_WARN_NONWC;
 473        }
 474}
 475EXPORT_SYMBOL(qdisc_warn_nonwc);
 476
 477static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
 478{
 479        struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
 480                                                 timer);
 481
 482        qdisc_unthrottled(wd->qdisc);
 483        __netif_schedule(qdisc_root(wd->qdisc));
 484
 485        return HRTIMER_NORESTART;
 486}
 487
 488void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
 489{
 490        hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
 491        wd->timer.function = qdisc_watchdog;
 492        wd->qdisc = qdisc;
 493}
 494EXPORT_SYMBOL(qdisc_watchdog_init);
 495
 496void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
 497{
 498        if (test_bit(__QDISC_STATE_DEACTIVATED,
 499                     &qdisc_root_sleeping(wd->qdisc)->state))
 500                return;
 501
 502        qdisc_throttled(wd->qdisc);
 503
 504        hrtimer_start(&wd->timer,
 505                      ns_to_ktime(PSCHED_TICKS2NS(expires)),
 506                      HRTIMER_MODE_ABS);
 507}
 508EXPORT_SYMBOL(qdisc_watchdog_schedule);
 509
 510void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
 511{
 512        hrtimer_cancel(&wd->timer);
 513        qdisc_unthrottled(wd->qdisc);
 514}
 515EXPORT_SYMBOL(qdisc_watchdog_cancel);
 516
 517static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
 518{
 519        unsigned int size = n * sizeof(struct hlist_head), i;
 520        struct hlist_head *h;
 521
 522        if (size <= PAGE_SIZE)
 523                h = kmalloc(size, GFP_KERNEL);
 524        else
 525                h = (struct hlist_head *)
 526                        __get_free_pages(GFP_KERNEL, get_order(size));
 527
 528        if (h != NULL) {
 529                for (i = 0; i < n; i++)
 530                        INIT_HLIST_HEAD(&h[i]);
 531        }
 532        return h;
 533}
 534
 535static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
 536{
 537        unsigned int size = n * sizeof(struct hlist_head);
 538
 539        if (size <= PAGE_SIZE)
 540                kfree(h);
 541        else
 542                free_pages((unsigned long)h, get_order(size));
 543}
 544
 545void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
 546{
 547        struct Qdisc_class_common *cl;
 548        struct hlist_node *n, *next;
 549        struct hlist_head *nhash, *ohash;
 550        unsigned int nsize, nmask, osize;
 551        unsigned int i, h;
 552
 553        /* Rehash when load factor exceeds 0.75 */
 554        if (clhash->hashelems * 4 <= clhash->hashsize * 3)
 555                return;
 556        nsize = clhash->hashsize * 2;
 557        nmask = nsize - 1;
 558        nhash = qdisc_class_hash_alloc(nsize);
 559        if (nhash == NULL)
 560                return;
 561
 562        ohash = clhash->hash;
 563        osize = clhash->hashsize;
 564
 565        sch_tree_lock(sch);
 566        for (i = 0; i < osize; i++) {
 567                hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
 568                        h = qdisc_class_hash(cl->classid, nmask);
 569                        hlist_add_head(&cl->hnode, &nhash[h]);
 570                }
 571        }
 572        clhash->hash     = nhash;
 573        clhash->hashsize = nsize;
 574        clhash->hashmask = nmask;
 575        sch_tree_unlock(sch);
 576
 577        qdisc_class_hash_free(ohash, osize);
 578}
 579EXPORT_SYMBOL(qdisc_class_hash_grow);
 580
 581int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
 582{
 583        unsigned int size = 4;
 584
 585        clhash->hash = qdisc_class_hash_alloc(size);
 586        if (clhash->hash == NULL)
 587                return -ENOMEM;
 588        clhash->hashsize  = size;
 589        clhash->hashmask  = size - 1;
 590        clhash->hashelems = 0;
 591        return 0;
 592}
 593EXPORT_SYMBOL(qdisc_class_hash_init);
 594
 595void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
 596{
 597        qdisc_class_hash_free(clhash->hash, clhash->hashsize);
 598}
 599EXPORT_SYMBOL(qdisc_class_hash_destroy);
 600
 601void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
 602                             struct Qdisc_class_common *cl)
 603{
 604        unsigned int h;
 605
 606        INIT_HLIST_NODE(&cl->hnode);
 607        h = qdisc_class_hash(cl->classid, clhash->hashmask);
 608        hlist_add_head(&cl->hnode, &clhash->hash[h]);
 609        clhash->hashelems++;
 610}
 611EXPORT_SYMBOL(qdisc_class_hash_insert);
 612
 613void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
 614                             struct Qdisc_class_common *cl)
 615{
 616        hlist_del(&cl->hnode);
 617        clhash->hashelems--;
 618}
 619EXPORT_SYMBOL(qdisc_class_hash_remove);
 620
 621/* Allocate an unique handle from space managed by kernel
 622 * Possible range is [8000-FFFF]:0000 (0x8000 values)
 623 */
 624static u32 qdisc_alloc_handle(struct net_device *dev)
 625{
 626        int i = 0x8000;
 627        static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
 628
 629        do {
 630                autohandle += TC_H_MAKE(0x10000U, 0);
 631                if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
 632                        autohandle = TC_H_MAKE(0x80000000U, 0);
 633                if (!qdisc_lookup(dev, autohandle))
 634                        return autohandle;
 635                cond_resched();
 636        } while (--i > 0);
 637
 638        return 0;
 639}
 640
 641void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
 642{
 643        const struct Qdisc_class_ops *cops;
 644        unsigned long cl;
 645        u32 parentid;
 646
 647        if (n == 0)
 648                return;
 649        while ((parentid = sch->parent)) {
 650                if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
 651                        return;
 652
 653                sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
 654                if (sch == NULL) {
 655                        WARN_ON(parentid != TC_H_ROOT);
 656                        return;
 657                }
 658                cops = sch->ops->cl_ops;
 659                if (cops->qlen_notify) {
 660                        cl = cops->get(sch, parentid);
 661                        cops->qlen_notify(sch, cl);
 662                        cops->put(sch, cl);
 663                }
 664                sch->q.qlen -= n;
 665        }
 666}
 667EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
 668
 669static void notify_and_destroy(struct net *net, struct sk_buff *skb,
 670                               struct nlmsghdr *n, u32 clid,
 671                               struct Qdisc *old, struct Qdisc *new)
 672{
 673        if (new || old)
 674                qdisc_notify(net, skb, n, clid, old, new);
 675
 676        if (old)
 677                qdisc_destroy(old);
 678}
 679
 680/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 681 * to device "dev".
 682 *
 683 * When appropriate send a netlink notification using 'skb'
 684 * and "n".
 685 *
 686 * On success, destroy old qdisc.
 687 */
 688
 /*
  * Two cases, selected by @parent:
  *
  *  - @parent == NULL: graft at device level. Either every TX queue of @dev
  *    is updated, or - when @old or @new carries TCQ_F_INGRESS - only the
  *    ingress queue (-ENOENT if the device has none). A device that is up is
  *    deactivated before and re-activated after the change.
  *  - @parent != NULL: graft onto class @classid of @parent through its
  *    class ops; -EOPNOTSUPP without a graft() callback, -ENOENT if get()
  *    does not find the class, otherwise the result of graft().
  *
  * Returns 0 on success or a negative errno as described above.
  */
 689static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
 690                       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
 691                       struct Qdisc *new, struct Qdisc *old)
 692{
 693        struct Qdisc *q = old;
 694        struct net *net = dev_net(dev);
 695        int err = 0;
 696
 697        if (parent == NULL) {
 698                unsigned int i, num_q, ingress;
 699
 700                ingress = 0;
 701                num_q = dev->num_tx_queues;
 702                if ((q && q->flags & TCQ_F_INGRESS) ||
 703                    (new && new->flags & TCQ_F_INGRESS)) {
 704                        num_q = 1;
 705                        ingress = 1;
 706                        if (!dev_ingress_queue(dev))
 707                                return -ENOENT;
 708                }
 709
 710                if (dev->flags & IFF_UP)
 711                        dev_deactivate(dev);
 712
                 /* A qdisc with an attach() callback installs itself, so the
                  * per-queue loop below is skipped for it (num_q = 0).
                  */
 713                if (new && new->ops->attach) {
 714                        new->ops->attach(new);
 715                        num_q = 0;
 716                }
 717
 718                for (i = 0; i < num_q; i++) {
 719                        struct netdev_queue *dev_queue = dev_ingress_queue(dev);
 720
 721                        if (!ingress)
 722                                dev_queue = netdev_get_tx_queue(dev, i);
 723
 724                        old = dev_graft_qdisc(dev_queue, new);
                         /* One extra reference on @new per additional queue
                          * it is grafted to.
                          */
 725                        if (new && i > 0)
 726                                atomic_inc(&new->refcnt);
 727
 728                        if (!ingress)
 729                                qdisc_destroy(old);
 730                }
 731
 732                if (!ingress) {
                         /* Egress: the notification reports dev->qdisc as
                          * the old qdisc, and dev->qdisc then becomes @new,
                          * or &noop_qdisc when @new is NULL.
                          */
 733                        notify_and_destroy(net, skb, n, classid,
 734                                           dev->qdisc, new);
 735                        if (new && !new->ops->attach)
 736                                atomic_inc(&new->refcnt);
 737                        dev->qdisc = new ? : &noop_qdisc;
 738                } else {
 739                        notify_and_destroy(net, skb, n, classid, old, new);
 740                }
 741
 742                if (dev->flags & IFF_UP)
 743                        dev_activate(dev);
 744        } else {
 745                const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
 746
 747                err = -EOPNOTSUPP;
 748                if (cops && cops->graft) {
 749                        unsigned long cl = cops->get(parent, classid);
 750                        if (cl) {
                                 /* graft() receives &old and may update it;
                                  * the value it leaves there is what gets
                                  * notified and destroyed below.
                                  */
 751                                err = cops->graft(parent, cl, new, &old);
 752                                cops->put(parent, cl);
 753                        } else
 754                                err = -ENOENT;
 755                }
 756                if (!err)
 757                        notify_and_destroy(net, skb, n, classid, old, new);
 758        }
 759        return err;
 760}
 761
 762/* lockdep annotation is needed for ingress; egress gets it only for name */
 /* qdisc_create() assigns qdisc_rx_lock to the lock of a qdisc created with
  * the TC_H_INGRESS handle and qdisc_tx_lock to the lock of any other qdisc.
  */
 763static struct lock_class_key qdisc_tx_lock;
 764static struct lock_class_key qdisc_rx_lock;
 765
 766/*
 767   Allocate and initialize new qdisc.
 768
 769   Parameters are passed via opt.
 770 */
 771
 /*
  * Steps, in order: look up the discipline named by tca[TCA_KIND] (trying a
  * module load when CONFIG_MODULES is set), allocate the qdisc, choose its
  * handle, run the discipline's init(), then attach the optional size table
  * (TCA_STAB) and rate estimator (TCA_RATE) and add the qdisc to the list.
  *
  * Returns the new qdisc, or NULL with the reason stored in *@errp:
  *   -EAGAIN      the module was loaded; the caller must replay the request
  *   -ENOENT      no such discipline
  *   -ENOMEM      no free handle could be allocated
  *   -EOPNOTSUPP  TCA_RATE given for a qdisc flagged TCQ_F_MQROOT
  *   or the error from qdisc_alloc(), init(), qdisc_get_stab() or
  *   gen_new_estimator().
  */
 772static struct Qdisc *
 773qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
 774             struct Qdisc *p, u32 parent, u32 handle,
 775             struct nlattr **tca, int *errp)
 776{
 777        int err;
 778        struct nlattr *kind = tca[TCA_KIND];
 779        struct Qdisc *sch;
 780        struct Qdisc_ops *ops;
 781        struct qdisc_size_table *stab;
 782
 783        ops = qdisc_lookup_ops(kind);
 784#ifdef CONFIG_MODULES
 785        if (ops == NULL && kind != NULL) {
 786                char name[IFNAMSIZ];
 787                if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
 788                        /* We dropped the RTNL semaphore in order to
 789                         * perform the module load.  So, even if we
 790                         * succeeded in loading the module we have to
 791                         * tell the caller to replay the request.  We
 792                         * indicate this using -EAGAIN.
 793                         * We replay the request because the device may
 794                         * go away in the mean time.
 795                         */
 796                        rtnl_unlock();
 797                        request_module("sch_%s", name);
 798                        rtnl_lock();
 799                        ops = qdisc_lookup_ops(kind);
 800                        if (ops != NULL) {
 801                                /* We will try again qdisc_lookup_ops,
 802                                 * so don't keep a reference.
 803                                 */
 804                                module_put(ops->owner);
 805                                err = -EAGAIN;
 806                                goto err_out;
 807                        }
 808                }
 809        }
 810#endif
 811
 812        err = -ENOENT;
 813        if (ops == NULL)
 814                goto err_out;
 815
 816        sch = qdisc_alloc(dev_queue, ops);
 817        if (IS_ERR(sch)) {
 818                err = PTR_ERR(sch);
 819                goto err_out2;
 820        }
 821
 822        sch->parent = parent;
 823
         /* Ingress qdiscs get the fixed ingress handle; any other qdisc
          * keeps the requested handle or, for handle 0, gets an
          * automatically allocated one.
          */
 824        if (handle == TC_H_INGRESS) {
 825                sch->flags |= TCQ_F_INGRESS;
 826                handle = TC_H_MAKE(TC_H_INGRESS, 0);
 827                lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
 828        } else {
 829                if (handle == 0) {
 830                        handle = qdisc_alloc_handle(dev);
 831                        err = -ENOMEM;
 832                        if (handle == 0)
 833                                goto err_out3;
 834                }
 835                lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
 836                if (!netif_is_multiqueue(dev))
 837                        sch->flags |= TCQ_F_ONETXQUEUE;
 838        }
 839
 840        sch->handle = handle;
 841
         /* If init() fails, control falls out of this if-block straight into
          * err_out3 below, i.e. without calling ops->destroy().
          */
 842        if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
 843                if (tca[TCA_STAB]) {
 844                        stab = qdisc_get_stab(tca[TCA_STAB]);
 845                        if (IS_ERR(stab)) {
 846                                err = PTR_ERR(stab);
 847                                goto err_out4;
 848                        }
 849                        rcu_assign_pointer(sch->stab, stab);
 850                }
 851                if (tca[TCA_RATE]) {
 852                        spinlock_t *root_lock;
 853
 854                        err = -EOPNOTSUPP;
 855                        if (sch->flags & TCQ_F_MQROOT)
 856                                goto err_out4;
 857
                         /* The estimator uses the sleeping root's lock,
                          * except for a root qdisc, an ingress qdisc, or a
                          * child of a TCQ_F_MQROOT parent, which use the
                          * qdisc's own lock.
                          */
 858                        if ((sch->parent != TC_H_ROOT) &&
 859                            !(sch->flags & TCQ_F_INGRESS) &&
 860                            (!p || !(p->flags & TCQ_F_MQROOT)))
 861                                root_lock = qdisc_root_sleeping_lock(sch);
 862                        else
 863                                root_lock = qdisc_lock(sch);
 864
 865                        err = gen_new_estimator(&sch->bstats, &sch->rate_est,
 866                                                root_lock, tca[TCA_RATE]);
 867                        if (err)
 868                                goto err_out4;
 869                }
 870
 871                qdisc_list_add(sch);
 872
 873                return sch;
 874        }
         /* Unwind in reverse order of acquisition; each label falls through
          * into the next one.
          */
 875err_out3:
         /* NOTE(review): presumably pairs with a device reference taken in
          * qdisc_alloc() - confirm.
          */
 876        dev_put(dev);
 877        kfree((char *) sch - sch->padded);
 878err_out2:
 879        module_put(ops->owner);
 880err_out:
 881        *errp = err;
 882        return NULL;
 883
 884err_out4:
 885        /*
 886         * Any broken qdiscs that would require a ops->reset() here?
 887         * The qdisc was never in action so it shouldn't be necessary.
 888         */
 889        qdisc_put_stab(rtnl_dereference(sch->stab));
 890        if (ops->destroy)
 891                ops->destroy(sch);
 892        goto err_out3;
 893}
 894
 895static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
 896{
 897        struct qdisc_size_table *ostab, *stab = NULL;
 898        int err = 0;
 899
 900        if (tca[TCA_OPTIONS]) {
 901                if (sch->ops->change == NULL)
 902                        return -EINVAL;
 903                err = sch->ops->change(sch, tca[TCA_OPTIONS]);
 904                if (err)
 905                        return err;
 906        }
 907
 908        if (tca[TCA_STAB]) {
 909                stab = qdisc_get_stab(tca[TCA_STAB]);
 910                if (IS_ERR(stab))
 911                        return PTR_ERR(stab);
 912        }
 913
 914        ostab = rtnl_dereference(sch->stab);
 915        rcu_assign_pointer(sch->stab, stab);
 916        qdisc_put_stab(ostab);
 917
 918        if (tca[TCA_RATE]) {
 919                /* NB: ignores errors from replace_estimator
 920                   because change can't be undone. */
 921                if (sch->flags & TCQ_F_MQROOT)
 922                        goto out;
 923                gen_replace_estimator(&sch->bstats, &sch->rate_est,
 924                                            qdisc_root_sleeping_lock(sch),
 925                                            tca[TCA_RATE]);
 926        }
 927out:
 928        return 0;
 929}
 930
 931struct check_loop_arg {
 932        struct qdisc_walker     w;
 933        struct Qdisc            *p;
 934        int                     depth;
 935};
 936
 937static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
 938
 939static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
 940{
 941        struct check_loop_arg   arg;
 942
 943        if (q->ops->cl_ops == NULL)
 944                return 0;
 945
 946        arg.w.stop = arg.w.skip = arg.w.count = 0;
 947        arg.w.fn = check_loop_fn;
 948        arg.depth = depth;
 949        arg.p = p;
 950        q->ops->cl_ops->walk(q, &arg.w);
 951        return arg.w.stop ? -ELOOP : 0;
 952}
 953
 954static int
 955check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
 956{
 957        struct Qdisc *leaf;
 958        const struct Qdisc_class_ops *cops = q->ops->cl_ops;
 959        struct check_loop_arg *arg = (struct check_loop_arg *)w;
 960
 961        leaf = cops->leaf(q, cl);
 962        if (leaf) {
 963                if (leaf == arg->p || arg->depth > 7)
 964                        return -ELOOP;
 965                return check_loop(leaf, arg->p, arg->depth + 1);
 966        }
 967        return 0;
 968}
 969
 970/*
 971 * Delete/get qdisc.
 972 */
 973
 974static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 975{
 976        struct net *net = sock_net(skb->sk);
 977        struct tcmsg *tcm = nlmsg_data(n);
 978        struct nlattr *tca[TCA_MAX + 1];
 979        struct net_device *dev;
 980        u32 clid = tcm->tcm_parent;
 981        struct Qdisc *q = NULL;
 982        struct Qdisc *p = NULL;
 983        int err;
 984
 985        if ((n->nlmsg_type != RTM_GETQDISC) && !capable(CAP_NET_ADMIN))
 986                return -EPERM;
 987
 988        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
 989        if (!dev)
 990                return -ENODEV;
 991
 992        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
 993        if (err < 0)
 994                return err;
 995
 996        if (clid) {
 997                if (clid != TC_H_ROOT) {
 998                        if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
 999                                p = qdisc_lookup(dev, TC_H_MAJ(clid));
1000                                if (!p)
1001                                        return -ENOENT;
1002                                q = qdisc_leaf(p, clid);
1003                        } else if (dev_ingress_queue(dev)) {
1004                                q = dev_ingress_queue(dev)->qdisc_sleeping;
1005                        }
1006                } else {
1007                        q = dev->qdisc;
1008                }
1009                if (!q)
1010                        return -ENOENT;
1011
1012                if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
1013                        return -EINVAL;
1014        } else {
1015                q = qdisc_lookup(dev, tcm->tcm_handle);
1016                if (!q)
1017                        return -ENOENT;
1018        }
1019
1020        if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1021                return -EINVAL;
1022
1023        if (n->nlmsg_type == RTM_DELQDISC) {
1024                if (!clid)
1025                        return -EINVAL;
1026                if (q->handle == 0)
1027                        return -ENOENT;
1028                err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
1029                if (err != 0)
1030                        return err;
1031        } else {
1032                qdisc_notify(net, skb, n, clid, NULL, q);
1033        }
1034        return 0;
1035}
1036
1037/*
1038 * Create/change qdisc.
1039 */
1040
1041static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1042{
1043        struct net *net = sock_net(skb->sk);
1044        struct tcmsg *tcm;
1045        struct nlattr *tca[TCA_MAX + 1];
1046        struct net_device *dev;
1047        u32 clid;
1048        struct Qdisc *q, *p;
1049        int err;
1050
1051        if (!capable(CAP_NET_ADMIN))
1052                return -EPERM;
1053
1054replay:
1055        /* Reinit, just in case something touches this. */
1056        tcm = nlmsg_data(n);
1057        clid = tcm->tcm_parent;
1058        q = p = NULL;
1059
1060        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1061        if (!dev)
1062                return -ENODEV;
1063
1064        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1065        if (err < 0)
1066                return err;
1067
1068        if (clid) {
1069                if (clid != TC_H_ROOT) {
1070                        if (clid != TC_H_INGRESS) {
1071                                p = qdisc_lookup(dev, TC_H_MAJ(clid));
1072                                if (!p)
1073                                        return -ENOENT;
1074                                q = qdisc_leaf(p, clid);
1075                        } else if (dev_ingress_queue_create(dev)) {
1076                                q = dev_ingress_queue(dev)->qdisc_sleeping;
1077                        }
1078                } else {
1079                        q = dev->qdisc;
1080                }
1081
1082                /* It may be default qdisc, ignore it */
1083                if (q && q->handle == 0)
1084                        q = NULL;
1085
1086                if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1087                        if (tcm->tcm_handle) {
1088                                if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
1089                                        return -EEXIST;
1090                                if (TC_H_MIN(tcm->tcm_handle))
1091                                        return -EINVAL;
1092                                q = qdisc_lookup(dev, tcm->tcm_handle);
1093                                if (!q)
1094                                        goto create_n_graft;
1095                                if (n->nlmsg_flags & NLM_F_EXCL)
1096                                        return -EEXIST;
1097                                if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1098                                        return -EINVAL;
1099                                if (q == p ||
1100                                    (p && check_loop(q, p, 0)))
1101                                        return -ELOOP;
1102                                atomic_inc(&q->refcnt);
1103                                goto graft;
1104                        } else {
1105                                if (!q)
1106                                        goto create_n_graft;
1107
1108                                /* This magic test requires explanation.
1109                                 *
1110                                 *   We know, that some child q is already
1111                                 *   attached to this parent and have choice:
1112                                 *   either to change it or to create/graft new one.
1113                                 *
1114                                 *   1. We are allowed to create/graft only
1115                                 *   if CREATE and REPLACE flags are set.
1116                                 *
1117                                 *   2. If EXCL is set, requestor wanted to say,
1118                                 *   that qdisc tcm_handle is not expected
1119                                 *   to exist, so that we choose create/graft too.
1120                                 *
1121                                 *   3. The last case is when no flags are set.
1122                                 *   Alas, it is sort of hole in API, we
1123                                 *   cannot decide what to do unambiguously.
1124                                 *   For now we select create/graft, if
1125                                 *   user gave KIND, which does not match existing.
1126                                 */
1127                                if ((n->nlmsg_flags & NLM_F_CREATE) &&
1128                                    (n->nlmsg_flags & NLM_F_REPLACE) &&
1129                                    ((n->nlmsg_flags & NLM_F_EXCL) ||
1130                                     (tca[TCA_KIND] &&
1131                                      nla_strcmp(tca[TCA_KIND], q->ops->id))))
1132                                        goto create_n_graft;
1133                        }
1134                }
1135        } else {
1136                if (!tcm->tcm_handle)
1137                        return -EINVAL;
1138                q = qdisc_lookup(dev, tcm->tcm_handle);
1139        }
1140
1141        /* Change qdisc parameters */
1142        if (q == NULL)
1143                return -ENOENT;
1144        if (n->nlmsg_flags & NLM_F_EXCL)
1145                return -EEXIST;
1146        if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1147                return -EINVAL;
1148        err = qdisc_change(q, tca);
1149        if (err == 0)
1150                qdisc_notify(net, skb, n, clid, NULL, q);
1151        return err;
1152
1153create_n_graft:
1154        if (!(n->nlmsg_flags & NLM_F_CREATE))
1155                return -ENOENT;
1156        if (clid == TC_H_INGRESS) {
1157                if (dev_ingress_queue(dev))
1158                        q = qdisc_create(dev, dev_ingress_queue(dev), p,
1159                                         tcm->tcm_parent, tcm->tcm_parent,
1160                                         tca, &err);
1161                else
1162                        err = -ENOENT;
1163        } else {
1164                struct netdev_queue *dev_queue;
1165
1166                if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1167                        dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1168                else if (p)
1169                        dev_queue = p->dev_queue;
1170                else
1171                        dev_queue = netdev_get_tx_queue(dev, 0);
1172
1173                q = qdisc_create(dev, dev_queue, p,
1174                                 tcm->tcm_parent, tcm->tcm_handle,
1175                                 tca, &err);
1176        }
1177        if (q == NULL) {
1178                if (err == -EAGAIN)
1179                        goto replay;
1180                return err;
1181        }
1182
1183graft:
1184        err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
1185        if (err) {
1186                if (q)
1187                        qdisc_destroy(q);
1188                return err;
1189        }
1190
1191        return 0;
1192}
1193
1194static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
1195                         u32 portid, u32 seq, u16 flags, int event)
1196{
1197        struct tcmsg *tcm;
1198        struct nlmsghdr  *nlh;
1199        unsigned char *b = skb_tail_pointer(skb);
1200        struct gnet_dump d;
1201        struct qdisc_size_table *stab;
1202
1203        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1204        if (!nlh)
1205                goto out_nlmsg_trim;
1206        tcm = nlmsg_data(nlh);
1207        tcm->tcm_family = AF_UNSPEC;
1208        tcm->tcm__pad1 = 0;
1209        tcm->tcm__pad2 = 0;
1210        tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1211        tcm->tcm_parent = clid;
1212        tcm->tcm_handle = q->handle;
1213        tcm->tcm_info = atomic_read(&q->refcnt);
1214        if (nla_put_string(skb, TCA_KIND, q->ops->id))
1215                goto nla_put_failure;
1216        if (q->ops->dump && q->ops->dump(q, skb) < 0)
1217                goto nla_put_failure;
1218        q->qstats.qlen = q->q.qlen;
1219
1220        stab = rtnl_dereference(q->stab);
1221        if (stab && qdisc_dump_stab(skb, stab) < 0)
1222                goto nla_put_failure;
1223
1224        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1225                                         qdisc_root_sleeping_lock(q), &d) < 0)
1226                goto nla_put_failure;
1227
1228        if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
1229                goto nla_put_failure;
1230
1231        if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
1232            gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
1233            gnet_stats_copy_queue(&d, &q->qstats) < 0)
1234                goto nla_put_failure;
1235
1236        if (gnet_stats_finish_copy(&d) < 0)
1237                goto nla_put_failure;
1238
1239        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1240        return skb->len;
1241
1242out_nlmsg_trim:
1243nla_put_failure:
1244        nlmsg_trim(skb, b);
1245        return -1;
1246}
1247
1248static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1249{
1250        return (q->flags & TCQ_F_BUILTIN) ? true : false;
1251}
1252
1253static int qdisc_notify(struct net *net, struct sk_buff *oskb,
1254                        struct nlmsghdr *n, u32 clid,
1255                        struct Qdisc *old, struct Qdisc *new)
1256{
1257        struct sk_buff *skb;
1258        u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1259
1260        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1261        if (!skb)
1262                return -ENOBUFS;
1263
1264        if (old && !tc_qdisc_dump_ignore(old)) {
1265                if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
1266                                  0, RTM_DELQDISC) < 0)
1267                        goto err_out;
1268        }
1269        if (new && !tc_qdisc_dump_ignore(new)) {
1270                if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
1271                                  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1272                        goto err_out;
1273        }
1274
1275        if (skb->len)
1276                return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1277                                      n->nlmsg_flags & NLM_F_ECHO);
1278
1279err_out:
1280        kfree_skb(skb);
1281        return -EINVAL;
1282}
1283
1284static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1285                              struct netlink_callback *cb,
1286                              int *q_idx_p, int s_q_idx)
1287{
1288        int ret = 0, q_idx = *q_idx_p;
1289        struct Qdisc *q;
1290
1291        if (!root)
1292                return 0;
1293
1294        q = root;
1295        if (q_idx < s_q_idx) {
1296                q_idx++;
1297        } else {
1298                if (!tc_qdisc_dump_ignore(q) &&
1299                    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1300                                  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1301                        goto done;
1302                q_idx++;
1303        }
1304        list_for_each_entry(q, &root->list, list) {
1305                if (q_idx < s_q_idx) {
1306                        q_idx++;
1307                        continue;
1308                }
1309                if (!tc_qdisc_dump_ignore(q) &&
1310                    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1311                                  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1312                        goto done;
1313                q_idx++;
1314        }
1315
1316out:
1317        *q_idx_p = q_idx;
1318        return ret;
1319done:
1320        ret = -1;
1321        goto out;
1322}
1323
1324static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1325{
1326        struct net *net = sock_net(skb->sk);
1327        int idx, q_idx;
1328        int s_idx, s_q_idx;
1329        struct net_device *dev;
1330
1331        s_idx = cb->args[0];
1332        s_q_idx = q_idx = cb->args[1];
1333
1334        rcu_read_lock();
1335        idx = 0;
1336        for_each_netdev_rcu(net, dev) {
1337                struct netdev_queue *dev_queue;
1338
1339                if (idx < s_idx)
1340                        goto cont;
1341                if (idx > s_idx)
1342                        s_q_idx = 0;
1343                q_idx = 0;
1344
1345                if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
1346                        goto done;
1347
1348                dev_queue = dev_ingress_queue(dev);
1349                if (dev_queue &&
1350                    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1351                                       &q_idx, s_q_idx) < 0)
1352                        goto done;
1353
1354cont:
1355                idx++;
1356        }
1357
1358done:
1359        rcu_read_unlock();
1360
1361        cb->args[0] = idx;
1362        cb->args[1] = q_idx;
1363
1364        return skb->len;
1365}
1366
1367
1368
1369/************************************************
1370 *      Traffic classes manipulation.           *
1371 ************************************************/
1372
1373
1374
1375static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1376{
1377        struct net *net = sock_net(skb->sk);
1378        struct tcmsg *tcm = nlmsg_data(n);
1379        struct nlattr *tca[TCA_MAX + 1];
1380        struct net_device *dev;
1381        struct Qdisc *q = NULL;
1382        const struct Qdisc_class_ops *cops;
1383        unsigned long cl = 0;
1384        unsigned long new_cl;
1385        u32 portid = tcm->tcm_parent;
1386        u32 clid = tcm->tcm_handle;
1387        u32 qid = TC_H_MAJ(clid);
1388        int err;
1389
1390        if ((n->nlmsg_type != RTM_GETTCLASS) && !capable(CAP_NET_ADMIN))
1391                return -EPERM;
1392
1393        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1394        if (!dev)
1395                return -ENODEV;
1396
1397        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1398        if (err < 0)
1399                return err;
1400
1401        /*
1402           parent == TC_H_UNSPEC - unspecified parent.
1403           parent == TC_H_ROOT   - class is root, which has no parent.
1404           parent == X:0         - parent is root class.
1405           parent == X:Y         - parent is a node in hierarchy.
1406           parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
1407
1408           handle == 0:0         - generate handle from kernel pool.
1409           handle == 0:Y         - class is X:Y, where X:0 is qdisc.
1410           handle == X:Y         - clear.
1411           handle == X:0         - root class.
1412         */
1413
1414        /* Step 1. Determine qdisc handle X:0 */
1415
1416        if (portid != TC_H_ROOT) {
1417                u32 qid1 = TC_H_MAJ(portid);
1418
1419                if (qid && qid1) {
1420                        /* If both majors are known, they must be identical. */
1421                        if (qid != qid1)
1422                                return -EINVAL;
1423                } else if (qid1) {
1424                        qid = qid1;
1425                } else if (qid == 0)
1426                        qid = dev->qdisc->handle;
1427
1428                /* Now qid is genuine qdisc handle consistent
1429                 * both with parent and child.
1430                 *
1431                 * TC_H_MAJ(portid) still may be unspecified, complete it now.
1432                 */
1433                if (portid)
1434                        portid = TC_H_MAKE(qid, portid);
1435        } else {
1436                if (qid == 0)
1437                        qid = dev->qdisc->handle;
1438        }
1439
1440        /* OK. Locate qdisc */
1441        q = qdisc_lookup(dev, qid);
1442        if (!q)
1443                return -ENOENT;
1444
1445        /* An check that it supports classes */
1446        cops = q->ops->cl_ops;
1447        if (cops == NULL)
1448                return -EINVAL;
1449
1450        /* Now try to get class */
1451        if (clid == 0) {
1452                if (portid == TC_H_ROOT)
1453                        clid = qid;
1454        } else
1455                clid = TC_H_MAKE(qid, clid);
1456
1457        if (clid)
1458                cl = cops->get(q, clid);
1459
1460        if (cl == 0) {
1461                err = -ENOENT;
1462                if (n->nlmsg_type != RTM_NEWTCLASS ||
1463                    !(n->nlmsg_flags & NLM_F_CREATE))
1464                        goto out;
1465        } else {
1466                switch (n->nlmsg_type) {
1467                case RTM_NEWTCLASS:
1468                        err = -EEXIST;
1469                        if (n->nlmsg_flags & NLM_F_EXCL)
1470                                goto out;
1471                        break;
1472                case RTM_DELTCLASS:
1473                        err = -EOPNOTSUPP;
1474                        if (cops->delete)
1475                                err = cops->delete(q, cl);
1476                        if (err == 0)
1477                                tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
1478                        goto out;
1479                case RTM_GETTCLASS:
1480                        err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
1481                        goto out;
1482                default:
1483                        err = -EINVAL;
1484                        goto out;
1485                }
1486        }
1487
1488        new_cl = cl;
1489        err = -EOPNOTSUPP;
1490        if (cops->change)
1491                err = cops->change(q, clid, portid, tca, &new_cl);
1492        if (err == 0)
1493                tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
1494
1495out:
1496        if (cl)
1497                cops->put(q, cl);
1498
1499        return err;
1500}
1501
1502
1503static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1504                          unsigned long cl,
1505                          u32 portid, u32 seq, u16 flags, int event)
1506{
1507        struct tcmsg *tcm;
1508        struct nlmsghdr  *nlh;
1509        unsigned char *b = skb_tail_pointer(skb);
1510        struct gnet_dump d;
1511        const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1512
1513        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1514        if (!nlh)
1515                goto out_nlmsg_trim;
1516        tcm = nlmsg_data(nlh);
1517        tcm->tcm_family = AF_UNSPEC;
1518        tcm->tcm__pad1 = 0;
1519        tcm->tcm__pad2 = 0;
1520        tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1521        tcm->tcm_parent = q->handle;
1522        tcm->tcm_handle = q->handle;
1523        tcm->tcm_info = 0;
1524        if (nla_put_string(skb, TCA_KIND, q->ops->id))
1525                goto nla_put_failure;
1526        if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1527                goto nla_put_failure;
1528
1529        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1530                                         qdisc_root_sleeping_lock(q), &d) < 0)
1531                goto nla_put_failure;
1532
1533        if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1534                goto nla_put_failure;
1535
1536        if (gnet_stats_finish_copy(&d) < 0)
1537                goto nla_put_failure;
1538
1539        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1540        return skb->len;
1541
1542out_nlmsg_trim:
1543nla_put_failure:
1544        nlmsg_trim(skb, b);
1545        return -1;
1546}
1547
1548static int tclass_notify(struct net *net, struct sk_buff *oskb,
1549                         struct nlmsghdr *n, struct Qdisc *q,
1550                         unsigned long cl, int event)
1551{
1552        struct sk_buff *skb;
1553        u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1554
1555        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1556        if (!skb)
1557                return -ENOBUFS;
1558
1559        if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1560                kfree_skb(skb);
1561                return -EINVAL;
1562        }
1563
1564        return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1565                              n->nlmsg_flags & NLM_F_ECHO);
1566}
1567
1568struct qdisc_dump_args {
1569        struct qdisc_walker     w;
1570        struct sk_buff          *skb;
1571        struct netlink_callback *cb;
1572};
1573
1574static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1575{
1576        struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1577
1578        return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
1579                              a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1580}
1581
1582static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1583                                struct tcmsg *tcm, struct netlink_callback *cb,
1584                                int *t_p, int s_t)
1585{
1586        struct qdisc_dump_args arg;
1587
1588        if (tc_qdisc_dump_ignore(q) ||
1589            *t_p < s_t || !q->ops->cl_ops ||
1590            (tcm->tcm_parent &&
1591             TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1592                (*t_p)++;
1593                return 0;
1594        }
1595        if (*t_p > s_t)
1596                memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1597        arg.w.fn = qdisc_class_dump;
1598        arg.skb = skb;
1599        arg.cb = cb;
1600        arg.w.stop  = 0;
1601        arg.w.skip = cb->args[1];
1602        arg.w.count = 0;
1603        q->ops->cl_ops->walk(q, &arg.w);
1604        cb->args[1] = arg.w.count;
1605        if (arg.w.stop)
1606                return -1;
1607        (*t_p)++;
1608        return 0;
1609}
1610
1611static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1612                               struct tcmsg *tcm, struct netlink_callback *cb,
1613                               int *t_p, int s_t)
1614{
1615        struct Qdisc *q;
1616
1617        if (!root)
1618                return 0;
1619
1620        if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1621                return -1;
1622
1623        list_for_each_entry(q, &root->list, list) {
1624                if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1625                        return -1;
1626        }
1627
1628        return 0;
1629}
1630
1631static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1632{
1633        struct tcmsg *tcm = nlmsg_data(cb->nlh);
1634        struct net *net = sock_net(skb->sk);
1635        struct netdev_queue *dev_queue;
1636        struct net_device *dev;
1637        int t, s_t;
1638
1639        if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1640                return 0;
1641        dev = dev_get_by_index(net, tcm->tcm_ifindex);
1642        if (!dev)
1643                return 0;
1644
1645        s_t = cb->args[0];
1646        t = 0;
1647
1648        if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
1649                goto done;
1650
1651        dev_queue = dev_ingress_queue(dev);
1652        if (dev_queue &&
1653            tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
1654                                &t, s_t) < 0)
1655                goto done;
1656
1657done:
1658        cb->args[0] = t;
1659
1660        dev_put(dev);
1661        return skb->len;
1662}
1663
1664/* Main classifier routine: scans classifier chain attached
1665 * to this qdisc, (optionally) tests for protocol and asks
1666 * specific classifiers.
1667 */
1668int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
1669                       struct tcf_result *res)
1670{
1671        __be16 protocol = skb->protocol;
1672        int err;
1673
1674        for (; tp; tp = tp->next) {
1675                if (tp->protocol != protocol &&
1676                    tp->protocol != htons(ETH_P_ALL))
1677                        continue;
1678                err = tp->classify(skb, tp, res);
1679
1680                if (err >= 0) {
1681#ifdef CONFIG_NET_CLS_ACT
1682                        if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1683                                skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1684#endif
1685                        return err;
1686                }
1687        }
1688        return -1;
1689}
1690EXPORT_SYMBOL(tc_classify_compat);
1691
/*
 * Classify @skb against the chain starting at @tp.
 *
 * Without CONFIG_NET_CLS_ACT this is tc_classify_compat().  With it, a
 * TC_ACT_RECLASSIFY result restarts classification from the head of the
 * chain; the number of restarts is tracked in skb->tc_verd, and once it has
 * reached MAX_REC_LOOP a rate-limited notice is logged and TC_ACT_SHOT is
 * returned.
 */
int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
		struct tcf_result *res)
{
#ifdef CONFIG_NET_CLS_ACT
	for (;;) {
		int err = tc_classify_compat(skb, tp, res);
		u32 verd;

		if (err != TC_ACT_RECLASSIFY)
			return err;

		verd = G_TC_VERD(skb->tc_verd);
		if (verd >= MAX_REC_LOOP) {
			net_notice_ratelimited("%s: packet reclassify loop rule prio %u protocol %02x\n",
					       tp->q->ops->id,
					       tp->prio & 0xffff,
					       ntohs(tp->protocol));
			return TC_ACT_SHOT;
		}

		/* Record one more pass and go round again from the head. */
		verd++;
		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
	}
#else
	return tc_classify_compat(skb, tp, res);
#endif
}
1720EXPORT_SYMBOL(tc_classify);
1721
1722void tcf_destroy(struct tcf_proto *tp)
1723{
1724        tp->ops->destroy(tp);
1725        module_put(tp->ops->owner);
1726        kfree(tp);
1727}
1728
1729void tcf_destroy_chain(struct tcf_proto **fl)
1730{
1731        struct tcf_proto *tp;
1732
1733        while ((tp = *fl) != NULL) {
1734                *fl = tp->next;
1735                tcf_destroy(tp);
1736        }
1737}
1738EXPORT_SYMBOL(tcf_destroy_chain);
1739
1740#ifdef CONFIG_PROC_FS
1741static int psched_show(struct seq_file *seq, void *v)
1742{
1743        struct timespec ts;
1744
1745        hrtimer_get_res(CLOCK_MONOTONIC, &ts);
1746        seq_printf(seq, "%08x %08x %08x %08x\n",
1747                   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
1748                   1000000,
1749                   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
1750
1751        return 0;
1752}
1753
1754static int psched_open(struct inode *inode, struct file *file)
1755{
1756        return single_open(file, psched_show, NULL);
1757}
1758
1759static const struct file_operations psched_fops = {
1760        .owner = THIS_MODULE,
1761        .open = psched_open,
1762        .read  = seq_read,
1763        .llseek = seq_lseek,
1764        .release = single_release,
1765};
1766
1767static int __net_init psched_net_init(struct net *net)
1768{
1769        struct proc_dir_entry *e;
1770
1771        e = proc_net_fops_create(net, "psched", 0, &psched_fops);
1772        if (e == NULL)
1773                return -ENOMEM;
1774
1775        return 0;
1776}
1777
1778static void __net_exit psched_net_exit(struct net *net)
1779{
1780        proc_net_remove(net, "psched");
1781}
1782#else
1783static int __net_init psched_net_init(struct net *net)
1784{
1785        return 0;
1786}
1787
1788static void __net_exit psched_net_exit(struct net *net)
1789{
1790}
1791#endif
1792
1793static struct pernet_operations psched_net_ops = {
1794        .init = psched_net_init,
1795        .exit = psched_net_exit,
1796};
1797
1798static int __init pktsched_init(void)
1799{
1800        int err;
1801
1802        err = register_pernet_subsys(&psched_net_ops);
1803        if (err) {
1804                pr_err("pktsched_init: "
1805                       "cannot initialize per netns operations\n");
1806                return err;
1807        }
1808
1809        register_qdisc(&pfifo_qdisc_ops);
1810        register_qdisc(&bfifo_qdisc_ops);
1811        register_qdisc(&pfifo_head_drop_qdisc_ops);
1812        register_qdisc(&mq_qdisc_ops);
1813
1814        rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
1815        rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
1816        rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL);
1817        rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
1818        rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
1819        rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL);
1820
1821        return 0;
1822}
1823
1824subsys_initcall(pktsched_init);
1825