linux/net/sched/sch_api.c
<<
>>
Prefs
   1/*
   2 * net/sched/sch_api.c  Packet scheduler API.
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10 *
  11 * Fixes:
  12 *
  13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
  14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
  15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
  16 */
  17
  18#include <linux/module.h>
  19#include <linux/types.h>
  20#include <linux/kernel.h>
  21#include <linux/string.h>
  22#include <linux/errno.h>
  23#include <linux/skbuff.h>
  24#include <linux/init.h>
  25#include <linux/proc_fs.h>
  26#include <linux/seq_file.h>
  27#include <linux/kmod.h>
  28#include <linux/list.h>
  29#include <linux/hrtimer.h>
  30#include <linux/lockdep.h>
  31
  32#include <net/net_namespace.h>
  33#include <net/sock.h>
  34#include <net/netlink.h>
  35#include <net/pkt_sched.h>
  36
  37static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
  38                        struct Qdisc *old, struct Qdisc *new);
  39static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
  40                         struct Qdisc *q, unsigned long cl, int event);
  41
  42/*
  43
  44   Short review.
  45   -------------
  46
  47   This file consists of two interrelated parts:
  48
  49   1. queueing disciplines manager frontend.
  50   2. traffic classes manager frontend.
  51
  52   Generally, queueing discipline ("qdisc") is a black box,
  53   which is able to enqueue packets and to dequeue them (when
  54   device is ready to send something) in order and at times
  55   determined by algorithm hidden in it.
  56
   57   Qdiscs are divided into two categories:
  58   - "queues", which have no internal structure visible from outside.
  59   - "schedulers", which split all the packets to "traffic classes",
  60     using "packet classifiers" (look at cls_api.c)
  61
  62   In turn, classes may have child qdiscs (as rule, queues)
  63   attached to them etc. etc. etc.
  64
  65   The goal of the routines in this file is to translate
  66   information supplied by user in the form of handles
  67   to more intelligible for kernel form, to make some sanity
  68   checks and part of work, which is common to all qdiscs
  69   and to provide rtnetlink notifications.
  70
  71   All real intelligent work is done inside qdisc modules.
  72
  73
  74
  75   Every discipline has two major routines: enqueue and dequeue.
  76
  77   ---dequeue
  78
  79   dequeue usually returns a skb to send. It is allowed to return NULL,
  80   but it does not mean that queue is empty, it just means that
  81   discipline does not want to send anything this time.
  82   Queue is really empty if q->q.qlen == 0.
  83   For complicated disciplines with multiple queues q->q is not
  84   real packet queue, but however q->q.qlen must be valid.
  85
  86   ---enqueue
  87
   88   enqueue returns 0 if the packet was enqueued successfully.
   89   If a packet (this one or another one) was dropped, it returns
   90   a non-zero error code.
  91   NET_XMIT_DROP        - this packet dropped
  92     Expected action: do not backoff, but wait until queue will clear.
  93   NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
  94     Expected action: backoff or ignore
  95   NET_XMIT_POLICED     - dropped by police.
  96     Expected action: backoff or error to real-time apps.
  97
  98   Auxiliary routines:
  99
 100   ---peek
 101
 102   like dequeue but without removing a packet from the queue
 103
 104   ---reset
 105
 106   returns qdisc to initial state: purge all buffers, clear all
 107   timers, counters (except for statistics) etc.
 108
 109   ---init
 110
 111   initializes newly created qdisc.
 112
 113   ---destroy
 114
 115   destroys resources allocated by init and during lifetime of qdisc.
 116
 117   ---change
 118
 119   changes qdisc parameters.
 120 */
 121
 122/* Protects list of registered TC modules. It is pure SMP lock. */
 123static DEFINE_RWLOCK(qdisc_mod_lock);
 124
 125
 126/************************************************
 127 *      Queueing disciplines manipulation.      *
 128 ************************************************/
 129
 130
 131/* The list of all installed queueing disciplines. */
 132
 133static struct Qdisc_ops *qdisc_base;
 134
  135/* Register/unregister queueing discipline */
 136
 137int register_qdisc(struct Qdisc_ops *qops)
 138{
 139        struct Qdisc_ops *q, **qp;
 140        int rc = -EEXIST;
 141
 142        write_lock(&qdisc_mod_lock);
 143        for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
 144                if (!strcmp(qops->id, q->id))
 145                        goto out;
 146
 147        if (qops->enqueue == NULL)
 148                qops->enqueue = noop_qdisc_ops.enqueue;
 149        if (qops->peek == NULL) {
 150                if (qops->dequeue == NULL) {
 151                        qops->peek = noop_qdisc_ops.peek;
 152                } else {
 153                        rc = -EINVAL;
 154                        goto out;
 155                }
 156        }
 157        if (qops->dequeue == NULL)
 158                qops->dequeue = noop_qdisc_ops.dequeue;
 159
 160        qops->next = NULL;
 161        *qp = qops;
 162        rc = 0;
 163out:
 164        write_unlock(&qdisc_mod_lock);
 165        return rc;
 166}
 167EXPORT_SYMBOL(register_qdisc);
 168
 169int unregister_qdisc(struct Qdisc_ops *qops)
 170{
 171        struct Qdisc_ops *q, **qp;
 172        int err = -ENOENT;
 173
 174        write_lock(&qdisc_mod_lock);
 175        for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
 176                if (q == qops)
 177                        break;
 178        if (q) {
 179                *qp = q->next;
 180                q->next = NULL;
 181                err = 0;
 182        }
 183        write_unlock(&qdisc_mod_lock);
 184        return err;
 185}
 186EXPORT_SYMBOL(unregister_qdisc);
 187
 188/* We know handle. Find qdisc among all qdisc's attached to device
 189   (root qdisc, all its children, children of children etc.)
 190 */
 191
 192static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
 193{
 194        struct Qdisc *q;
 195
 196        if (!(root->flags & TCQ_F_BUILTIN) &&
 197            root->handle == handle)
 198                return root;
 199
 200        list_for_each_entry(q, &root->list, list) {
 201                if (q->handle == handle)
 202                        return q;
 203        }
 204        return NULL;
 205}
 206
 207static void qdisc_list_add(struct Qdisc *q)
 208{
 209        if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
 210                list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
 211}
 212
 213void qdisc_list_del(struct Qdisc *q)
 214{
 215        if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
 216                list_del(&q->list);
 217}
 218EXPORT_SYMBOL(qdisc_list_del);
 219
 220struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
 221{
 222        struct Qdisc *q;
 223
 224        q = qdisc_match_from_root(dev->qdisc, handle);
 225        if (q)
 226                goto out;
 227
 228        q = qdisc_match_from_root(dev->rx_queue.qdisc_sleeping, handle);
 229out:
 230        return q;
 231}
 232
 233static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
 234{
 235        unsigned long cl;
 236        struct Qdisc *leaf;
 237        const struct Qdisc_class_ops *cops = p->ops->cl_ops;
 238
 239        if (cops == NULL)
 240                return NULL;
 241        cl = cops->get(p, classid);
 242
 243        if (cl == 0)
 244                return NULL;
 245        leaf = cops->leaf(p, cl);
 246        cops->put(p, cl);
 247        return leaf;
 248}
 249
 250/* Find queueing discipline by name */
 251
 252static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
 253{
 254        struct Qdisc_ops *q = NULL;
 255
 256        if (kind) {
 257                read_lock(&qdisc_mod_lock);
 258                for (q = qdisc_base; q; q = q->next) {
 259                        if (nla_strcmp(kind, q->id) == 0) {
 260                                if (!try_module_get(q->owner))
 261                                        q = NULL;
 262                                break;
 263                        }
 264                }
 265                read_unlock(&qdisc_mod_lock);
 266        }
 267        return q;
 268}
 269
 270static struct qdisc_rate_table *qdisc_rtab_list;
 271
 272struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
 273{
 274        struct qdisc_rate_table *rtab;
 275
 276        for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
 277                if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
 278                        rtab->refcnt++;
 279                        return rtab;
 280                }
 281        }
 282
 283        if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
 284            nla_len(tab) != TC_RTAB_SIZE)
 285                return NULL;
 286
 287        rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
 288        if (rtab) {
 289                rtab->rate = *r;
 290                rtab->refcnt = 1;
 291                memcpy(rtab->data, nla_data(tab), 1024);
 292                rtab->next = qdisc_rtab_list;
 293                qdisc_rtab_list = rtab;
 294        }
 295        return rtab;
 296}
 297EXPORT_SYMBOL(qdisc_get_rtab);
 298
 299void qdisc_put_rtab(struct qdisc_rate_table *tab)
 300{
 301        struct qdisc_rate_table *rtab, **rtabp;
 302
 303        if (!tab || --tab->refcnt)
 304                return;
 305
 306        for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
 307                if (rtab == tab) {
 308                        *rtabp = rtab->next;
 309                        kfree(rtab);
 310                        return;
 311                }
 312        }
 313}
 314EXPORT_SYMBOL(qdisc_put_rtab);
 315
 316static LIST_HEAD(qdisc_stab_list);
 317static DEFINE_SPINLOCK(qdisc_stab_lock);
 318
 319static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
 320        [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
 321        [TCA_STAB_DATA] = { .type = NLA_BINARY },
 322};
 323
 324static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
 325{
 326        struct nlattr *tb[TCA_STAB_MAX + 1];
 327        struct qdisc_size_table *stab;
 328        struct tc_sizespec *s;
 329        unsigned int tsize = 0;
 330        u16 *tab = NULL;
 331        int err;
 332
 333        err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
 334        if (err < 0)
 335                return ERR_PTR(err);
 336        if (!tb[TCA_STAB_BASE])
 337                return ERR_PTR(-EINVAL);
 338
 339        s = nla_data(tb[TCA_STAB_BASE]);
 340
 341        if (s->tsize > 0) {
 342                if (!tb[TCA_STAB_DATA])
 343                        return ERR_PTR(-EINVAL);
 344                tab = nla_data(tb[TCA_STAB_DATA]);
 345                tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
 346        }
 347
 348        if (!s || tsize != s->tsize || (!tab && tsize > 0))
 349                return ERR_PTR(-EINVAL);
 350
 351        spin_lock(&qdisc_stab_lock);
 352
 353        list_for_each_entry(stab, &qdisc_stab_list, list) {
 354                if (memcmp(&stab->szopts, s, sizeof(*s)))
 355                        continue;
 356                if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
 357                        continue;
 358                stab->refcnt++;
 359                spin_unlock(&qdisc_stab_lock);
 360                return stab;
 361        }
 362
 363        spin_unlock(&qdisc_stab_lock);
 364
 365        stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
 366        if (!stab)
 367                return ERR_PTR(-ENOMEM);
 368
 369        stab->refcnt = 1;
 370        stab->szopts = *s;
 371        if (tsize > 0)
 372                memcpy(stab->data, tab, tsize * sizeof(u16));
 373
 374        spin_lock(&qdisc_stab_lock);
 375        list_add_tail(&stab->list, &qdisc_stab_list);
 376        spin_unlock(&qdisc_stab_lock);
 377
 378        return stab;
 379}
 380
 381void qdisc_put_stab(struct qdisc_size_table *tab)
 382{
 383        if (!tab)
 384                return;
 385
 386        spin_lock(&qdisc_stab_lock);
 387
 388        if (--tab->refcnt == 0) {
 389                list_del(&tab->list);
 390                kfree(tab);
 391        }
 392
 393        spin_unlock(&qdisc_stab_lock);
 394}
 395EXPORT_SYMBOL(qdisc_put_stab);
 396
 397static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
 398{
 399        struct nlattr *nest;
 400
 401        nest = nla_nest_start(skb, TCA_STAB);
 402        if (nest == NULL)
 403                goto nla_put_failure;
 404        NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
 405        nla_nest_end(skb, nest);
 406
 407        return skb->len;
 408
 409nla_put_failure:
 410        return -1;
 411}
 412
 413void qdisc_calculate_pkt_len(struct sk_buff *skb, struct qdisc_size_table *stab)
 414{
 415        int pkt_len, slot;
 416
 417        pkt_len = skb->len + stab->szopts.overhead;
 418        if (unlikely(!stab->szopts.tsize))
 419                goto out;
 420
 421        slot = pkt_len + stab->szopts.cell_align;
 422        if (unlikely(slot < 0))
 423                slot = 0;
 424
 425        slot >>= stab->szopts.cell_log;
 426        if (likely(slot < stab->szopts.tsize))
 427                pkt_len = stab->data[slot];
 428        else
 429                pkt_len = stab->data[stab->szopts.tsize - 1] *
 430                                (slot / stab->szopts.tsize) +
 431                                stab->data[slot % stab->szopts.tsize];
 432
 433        pkt_len <<= stab->szopts.size_log;
 434out:
 435        if (unlikely(pkt_len < 1))
 436                pkt_len = 1;
 437        qdisc_skb_cb(skb)->pkt_len = pkt_len;
 438}
 439EXPORT_SYMBOL(qdisc_calculate_pkt_len);
 440
 441void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
 442{
 443        if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
 444                printk(KERN_WARNING
 445                       "%s: %s qdisc %X: is non-work-conserving?\n",
 446                       txt, qdisc->ops->id, qdisc->handle >> 16);
 447                qdisc->flags |= TCQ_F_WARN_NONWC;
 448        }
 449}
 450EXPORT_SYMBOL(qdisc_warn_nonwc);
 451
 452static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
 453{
 454        struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
 455                                                 timer);
 456
 457        wd->qdisc->flags &= ~TCQ_F_THROTTLED;
 458        __netif_schedule(qdisc_root(wd->qdisc));
 459
 460        return HRTIMER_NORESTART;
 461}
 462
 463void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
 464{
 465        hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
 466        wd->timer.function = qdisc_watchdog;
 467        wd->qdisc = qdisc;
 468}
 469EXPORT_SYMBOL(qdisc_watchdog_init);
 470
 471void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
 472{
 473        ktime_t time;
 474
 475        if (test_bit(__QDISC_STATE_DEACTIVATED,
 476                     &qdisc_root_sleeping(wd->qdisc)->state))
 477                return;
 478
 479        wd->qdisc->flags |= TCQ_F_THROTTLED;
 480        time = ktime_set(0, 0);
 481        time = ktime_add_ns(time, PSCHED_TICKS2NS(expires));
 482        hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
 483}
 484EXPORT_SYMBOL(qdisc_watchdog_schedule);
 485
 486void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
 487{
 488        hrtimer_cancel(&wd->timer);
 489        wd->qdisc->flags &= ~TCQ_F_THROTTLED;
 490}
 491EXPORT_SYMBOL(qdisc_watchdog_cancel);
 492
 493static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
 494{
 495        unsigned int size = n * sizeof(struct hlist_head), i;
 496        struct hlist_head *h;
 497
 498        if (size <= PAGE_SIZE)
 499                h = kmalloc(size, GFP_KERNEL);
 500        else
 501                h = (struct hlist_head *)
 502                        __get_free_pages(GFP_KERNEL, get_order(size));
 503
 504        if (h != NULL) {
 505                for (i = 0; i < n; i++)
 506                        INIT_HLIST_HEAD(&h[i]);
 507        }
 508        return h;
 509}
 510
 511static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
 512{
 513        unsigned int size = n * sizeof(struct hlist_head);
 514
 515        if (size <= PAGE_SIZE)
 516                kfree(h);
 517        else
 518                free_pages((unsigned long)h, get_order(size));
 519}
 520
 521void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
 522{
 523        struct Qdisc_class_common *cl;
 524        struct hlist_node *n, *next;
 525        struct hlist_head *nhash, *ohash;
 526        unsigned int nsize, nmask, osize;
 527        unsigned int i, h;
 528
 529        /* Rehash when load factor exceeds 0.75 */
 530        if (clhash->hashelems * 4 <= clhash->hashsize * 3)
 531                return;
 532        nsize = clhash->hashsize * 2;
 533        nmask = nsize - 1;
 534        nhash = qdisc_class_hash_alloc(nsize);
 535        if (nhash == NULL)
 536                return;
 537
 538        ohash = clhash->hash;
 539        osize = clhash->hashsize;
 540
 541        sch_tree_lock(sch);
 542        for (i = 0; i < osize; i++) {
 543                hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
 544                        h = qdisc_class_hash(cl->classid, nmask);
 545                        hlist_add_head(&cl->hnode, &nhash[h]);
 546                }
 547        }
 548        clhash->hash     = nhash;
 549        clhash->hashsize = nsize;
 550        clhash->hashmask = nmask;
 551        sch_tree_unlock(sch);
 552
 553        qdisc_class_hash_free(ohash, osize);
 554}
 555EXPORT_SYMBOL(qdisc_class_hash_grow);
 556
 557int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
 558{
 559        unsigned int size = 4;
 560
 561        clhash->hash = qdisc_class_hash_alloc(size);
 562        if (clhash->hash == NULL)
 563                return -ENOMEM;
 564        clhash->hashsize  = size;
 565        clhash->hashmask  = size - 1;
 566        clhash->hashelems = 0;
 567        return 0;
 568}
 569EXPORT_SYMBOL(qdisc_class_hash_init);
 570
 571void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
 572{
 573        qdisc_class_hash_free(clhash->hash, clhash->hashsize);
 574}
 575EXPORT_SYMBOL(qdisc_class_hash_destroy);
 576
 577void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
 578                             struct Qdisc_class_common *cl)
 579{
 580        unsigned int h;
 581
 582        INIT_HLIST_NODE(&cl->hnode);
 583        h = qdisc_class_hash(cl->classid, clhash->hashmask);
 584        hlist_add_head(&cl->hnode, &clhash->hash[h]);
 585        clhash->hashelems++;
 586}
 587EXPORT_SYMBOL(qdisc_class_hash_insert);
 588
 589void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
 590                             struct Qdisc_class_common *cl)
 591{
 592        hlist_del(&cl->hnode);
 593        clhash->hashelems--;
 594}
 595EXPORT_SYMBOL(qdisc_class_hash_remove);
 596
 597/* Allocate an unique handle from space managed by kernel */
 598
 599static u32 qdisc_alloc_handle(struct net_device *dev)
 600{
 601        int i = 0x10000;
 602        static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
 603
 604        do {
 605                autohandle += TC_H_MAKE(0x10000U, 0);
 606                if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
 607                        autohandle = TC_H_MAKE(0x80000000U, 0);
 608        } while (qdisc_lookup(dev, autohandle) && --i > 0);
 609
 610        return i>0 ? autohandle : 0;
 611}
 612
 613void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
 614{
 615        const struct Qdisc_class_ops *cops;
 616        unsigned long cl;
 617        u32 parentid;
 618
 619        if (n == 0)
 620                return;
 621        while ((parentid = sch->parent)) {
 622                if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
 623                        return;
 624
 625                sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
 626                if (sch == NULL) {
 627                        WARN_ON(parentid != TC_H_ROOT);
 628                        return;
 629                }
 630                cops = sch->ops->cl_ops;
 631                if (cops->qlen_notify) {
 632                        cl = cops->get(sch, parentid);
 633                        cops->qlen_notify(sch, cl);
 634                        cops->put(sch, cl);
 635                }
 636                sch->q.qlen -= n;
 637        }
 638}
 639EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
 640
 641static void notify_and_destroy(struct sk_buff *skb, struct nlmsghdr *n, u32 clid,
 642                               struct Qdisc *old, struct Qdisc *new)
 643{
 644        if (new || old)
 645                qdisc_notify(skb, n, clid, old, new);
 646
 647        if (old)
 648                qdisc_destroy(old);
 649}
 650
 651/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 652 * to device "dev".
 653 *
 654 * When appropriate send a netlink notification using 'skb'
 655 * and "n".
 656 *
 657 * On success, destroy old qdisc.
 658 */
 659
 660static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
 661                       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
 662                       struct Qdisc *new, struct Qdisc *old)
 663{
 664        struct Qdisc *q = old;
 665        int err = 0;
 666
 667        if (parent == NULL) {
 668                unsigned int i, num_q, ingress;
 669
 670                ingress = 0;
 671                num_q = dev->num_tx_queues;
 672                if ((q && q->flags & TCQ_F_INGRESS) ||
 673                    (new && new->flags & TCQ_F_INGRESS)) {
 674                        num_q = 1;
 675                        ingress = 1;
 676                }
 677
 678                if (dev->flags & IFF_UP)
 679                        dev_deactivate(dev);
 680
 681                if (new && new->ops->attach) {
 682                        new->ops->attach(new);
 683                        num_q = 0;
 684                }
 685
 686                for (i = 0; i < num_q; i++) {
 687                        struct netdev_queue *dev_queue = &dev->rx_queue;
 688
 689                        if (!ingress)
 690                                dev_queue = netdev_get_tx_queue(dev, i);
 691
 692                        old = dev_graft_qdisc(dev_queue, new);
 693                        if (new && i > 0)
 694                                atomic_inc(&new->refcnt);
 695
 696                        if (!ingress)
 697                                qdisc_destroy(old);
 698                }
 699
 700                if (!ingress) {
 701                        notify_and_destroy(skb, n, classid, dev->qdisc, new);
 702                        if (new && !new->ops->attach)
 703                                atomic_inc(&new->refcnt);
 704                        dev->qdisc = new ? : &noop_qdisc;
 705                } else {
 706                        notify_and_destroy(skb, n, classid, old, new);
 707                }
 708
 709                if (dev->flags & IFF_UP)
 710                        dev_activate(dev);
 711        } else {
 712                const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
 713
 714                err = -EOPNOTSUPP;
 715                if (cops && cops->graft) {
 716                        unsigned long cl = cops->get(parent, classid);
 717                        if (cl) {
 718                                err = cops->graft(parent, cl, new, &old);
 719                                cops->put(parent, cl);
 720                        } else
 721                                err = -ENOENT;
 722                }
 723                if (!err)
 724                        notify_and_destroy(skb, n, classid, old, new);
 725        }
 726        return err;
 727}
 728
 729/* lockdep annotation is needed for ingress; egress gets it only for name */
 730static struct lock_class_key qdisc_tx_lock;
 731static struct lock_class_key qdisc_rx_lock;
 732
 733/*
 734   Allocate and initialize new qdisc.
 735
 736   Parameters are passed via opt.
 737 */
 738
 739static struct Qdisc *
 740qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
 741             struct Qdisc *p, u32 parent, u32 handle,
 742             struct nlattr **tca, int *errp)
 743{
 744        int err;
 745        struct nlattr *kind = tca[TCA_KIND];
 746        struct Qdisc *sch;
 747        struct Qdisc_ops *ops;
 748        struct qdisc_size_table *stab;
 749
 750        ops = qdisc_lookup_ops(kind);
 751#ifdef CONFIG_MODULES
 752        if (ops == NULL && kind != NULL) {
 753                char name[IFNAMSIZ];
 754                if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
 755                        /* We dropped the RTNL semaphore in order to
 756                         * perform the module load.  So, even if we
 757                         * succeeded in loading the module we have to
 758                         * tell the caller to replay the request.  We
 759                         * indicate this using -EAGAIN.
 760                         * We replay the request because the device may
 761                         * go away in the mean time.
 762                         */
 763                        rtnl_unlock();
 764                        request_module("sch_%s", name);
 765                        rtnl_lock();
 766                        ops = qdisc_lookup_ops(kind);
 767                        if (ops != NULL) {
 768                                /* We will try again qdisc_lookup_ops,
 769                                 * so don't keep a reference.
 770                                 */
 771                                module_put(ops->owner);
 772                                err = -EAGAIN;
 773                                goto err_out;
 774                        }
 775                }
 776        }
 777#endif
 778
 779        err = -ENOENT;
 780        if (ops == NULL)
 781                goto err_out;
 782
 783        sch = qdisc_alloc(dev_queue, ops);
 784        if (IS_ERR(sch)) {
 785                err = PTR_ERR(sch);
 786                goto err_out2;
 787        }
 788
 789        sch->parent = parent;
 790
 791        if (handle == TC_H_INGRESS) {
 792                sch->flags |= TCQ_F_INGRESS;
 793                handle = TC_H_MAKE(TC_H_INGRESS, 0);
 794                lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
 795        } else {
 796                if (handle == 0) {
 797                        handle = qdisc_alloc_handle(dev);
 798                        err = -ENOMEM;
 799                        if (handle == 0)
 800                                goto err_out3;
 801                }
 802                lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
 803        }
 804
 805        sch->handle = handle;
 806
 807        if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
 808                if (tca[TCA_STAB]) {
 809                        stab = qdisc_get_stab(tca[TCA_STAB]);
 810                        if (IS_ERR(stab)) {
 811                                err = PTR_ERR(stab);
 812                                goto err_out4;
 813                        }
 814                        sch->stab = stab;
 815                }
 816                if (tca[TCA_RATE]) {
 817                        spinlock_t *root_lock;
 818
 819                        err = -EOPNOTSUPP;
 820                        if (sch->flags & TCQ_F_MQROOT)
 821                                goto err_out4;
 822
 823                        if ((sch->parent != TC_H_ROOT) &&
 824                            !(sch->flags & TCQ_F_INGRESS) &&
 825                            (!p || !(p->flags & TCQ_F_MQROOT)))
 826                                root_lock = qdisc_root_sleeping_lock(sch);
 827                        else
 828                                root_lock = qdisc_lock(sch);
 829
 830                        err = gen_new_estimator(&sch->bstats, &sch->rate_est,
 831                                                root_lock, tca[TCA_RATE]);
 832                        if (err)
 833                                goto err_out4;
 834                }
 835
 836                qdisc_list_add(sch);
 837
 838                return sch;
 839        }
 840err_out3:
 841        dev_put(dev);
 842        kfree((char *) sch - sch->padded);
 843err_out2:
 844        module_put(ops->owner);
 845err_out:
 846        *errp = err;
 847        return NULL;
 848
 849err_out4:
 850        /*
 851         * Any broken qdiscs that would require a ops->reset() here?
 852         * The qdisc was never in action so it shouldn't be necessary.
 853         */
 854        qdisc_put_stab(sch->stab);
 855        if (ops->destroy)
 856                ops->destroy(sch);
 857        goto err_out3;
 858}
 859
 860static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
 861{
 862        struct qdisc_size_table *stab = NULL;
 863        int err = 0;
 864
 865        if (tca[TCA_OPTIONS]) {
 866                if (sch->ops->change == NULL)
 867                        return -EINVAL;
 868                err = sch->ops->change(sch, tca[TCA_OPTIONS]);
 869                if (err)
 870                        return err;
 871        }
 872
 873        if (tca[TCA_STAB]) {
 874                stab = qdisc_get_stab(tca[TCA_STAB]);
 875                if (IS_ERR(stab))
 876                        return PTR_ERR(stab);
 877        }
 878
 879        qdisc_put_stab(sch->stab);
 880        sch->stab = stab;
 881
 882        if (tca[TCA_RATE]) {
 883                /* NB: ignores errors from replace_estimator
 884                   because change can't be undone. */
 885                if (sch->flags & TCQ_F_MQROOT)
 886                        goto out;
 887                gen_replace_estimator(&sch->bstats, &sch->rate_est,
 888                                            qdisc_root_sleeping_lock(sch),
 889                                            tca[TCA_RATE]);
 890        }
 891out:
 892        return 0;
 893}
 894
 895struct check_loop_arg
 896{
 897        struct qdisc_walker     w;
 898        struct Qdisc            *p;
 899        int                     depth;
 900};
 901
 902static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
 903
 904static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
 905{
 906        struct check_loop_arg   arg;
 907
 908        if (q->ops->cl_ops == NULL)
 909                return 0;
 910
 911        arg.w.stop = arg.w.skip = arg.w.count = 0;
 912        arg.w.fn = check_loop_fn;
 913        arg.depth = depth;
 914        arg.p = p;
 915        q->ops->cl_ops->walk(q, &arg.w);
 916        return arg.w.stop ? -ELOOP : 0;
 917}
 918
 919static int
 920check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
 921{
 922        struct Qdisc *leaf;
 923        const struct Qdisc_class_ops *cops = q->ops->cl_ops;
 924        struct check_loop_arg *arg = (struct check_loop_arg *)w;
 925
 926        leaf = cops->leaf(q, cl);
 927        if (leaf) {
 928                if (leaf == arg->p || arg->depth > 7)
 929                        return -ELOOP;
 930                return check_loop(leaf, arg->p, arg->depth + 1);
 931        }
 932        return 0;
 933}
 934
 935/*
 936 * Delete/get qdisc.
 937 */
 938
 939static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 940{
 941        struct net *net = sock_net(skb->sk);
 942        struct tcmsg *tcm = NLMSG_DATA(n);
 943        struct nlattr *tca[TCA_MAX + 1];
 944        struct net_device *dev;
 945        u32 clid = tcm->tcm_parent;
 946        struct Qdisc *q = NULL;
 947        struct Qdisc *p = NULL;
 948        int err;
 949
 950        if (net != &init_net)
 951                return -EINVAL;
 952
 953        if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
 954                return -ENODEV;
 955
 956        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
 957        if (err < 0)
 958                return err;
 959
 960        if (clid) {
 961                if (clid != TC_H_ROOT) {
 962                        if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
 963                                if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
 964                                        return -ENOENT;
 965                                q = qdisc_leaf(p, clid);
 966                        } else { /* ingress */
 967                                q = dev->rx_queue.qdisc_sleeping;
 968                        }
 969                } else {
 970                        q = dev->qdisc;
 971                }
 972                if (!q)
 973                        return -ENOENT;
 974
 975                if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
 976                        return -EINVAL;
 977        } else {
 978                if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
 979                        return -ENOENT;
 980        }
 981
 982        if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
 983                return -EINVAL;
 984
 985        if (n->nlmsg_type == RTM_DELQDISC) {
 986                if (!clid)
 987                        return -EINVAL;
 988                if (q->handle == 0)
 989                        return -ENOENT;
 990                if ((err = qdisc_graft(dev, p, skb, n, clid, NULL, q)) != 0)
 991                        return err;
 992        } else {
 993                qdisc_notify(skb, n, clid, NULL, q);
 994        }
 995        return 0;
 996}
 997
 998/*
 999   Create/change qdisc.
1000 */
1001
1002static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1003{
1004        struct net *net = sock_net(skb->sk);
1005        struct tcmsg *tcm;
1006        struct nlattr *tca[TCA_MAX + 1];
1007        struct net_device *dev;
1008        u32 clid;
1009        struct Qdisc *q, *p;
1010        int err;
1011
1012        if (net != &init_net)
1013                return -EINVAL;
1014
1015replay:
1016        /* Reinit, just in case something touches this. */
1017        tcm = NLMSG_DATA(n);
1018        clid = tcm->tcm_parent;
1019        q = p = NULL;
1020
1021        if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
1022                return -ENODEV;
1023
1024        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1025        if (err < 0)
1026                return err;
1027
1028        if (clid) {
1029                if (clid != TC_H_ROOT) {
1030                        if (clid != TC_H_INGRESS) {
1031                                if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
1032                                        return -ENOENT;
1033                                q = qdisc_leaf(p, clid);
1034                        } else { /*ingress */
1035                                q = dev->rx_queue.qdisc_sleeping;
1036                        }
1037                } else {
1038                        q = dev->qdisc;
1039                }
1040
1041                /* It may be default qdisc, ignore it */
1042                if (q && q->handle == 0)
1043                        q = NULL;
1044
1045                if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1046                        if (tcm->tcm_handle) {
1047                                if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
1048                                        return -EEXIST;
1049                                if (TC_H_MIN(tcm->tcm_handle))
1050                                        return -EINVAL;
1051                                if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
1052                                        goto create_n_graft;
1053                                if (n->nlmsg_flags&NLM_F_EXCL)
1054                                        return -EEXIST;
1055                                if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1056                                        return -EINVAL;
1057                                if (q == p ||
1058                                    (p && check_loop(q, p, 0)))
1059                                        return -ELOOP;
1060                                atomic_inc(&q->refcnt);
1061                                goto graft;
1062                        } else {
1063                                if (q == NULL)
1064                                        goto create_n_graft;
1065
1066                                /* This magic test requires explanation.
1067                                 *
1068                                 *   We know, that some child q is already
1069                                 *   attached to this parent and have choice:
1070                                 *   either to change it or to create/graft new one.
1071                                 *
1072                                 *   1. We are allowed to create/graft only
1073                                 *   if CREATE and REPLACE flags are set.
1074                                 *
1075                                 *   2. If EXCL is set, requestor wanted to say,
1076                                 *   that qdisc tcm_handle is not expected
1077                                 *   to exist, so that we choose create/graft too.
1078                                 *
1079                                 *   3. The last case is when no flags are set.
1080                                 *   Alas, it is sort of hole in API, we
1081                                 *   cannot decide what to do unambiguously.
1082                                 *   For now we select create/graft, if
1083                                 *   user gave KIND, which does not match existing.
1084                                 */
1085                                if ((n->nlmsg_flags&NLM_F_CREATE) &&
1086                                    (n->nlmsg_flags&NLM_F_REPLACE) &&
1087                                    ((n->nlmsg_flags&NLM_F_EXCL) ||
1088                                     (tca[TCA_KIND] &&
1089                                      nla_strcmp(tca[TCA_KIND], q->ops->id))))
1090                                        goto create_n_graft;
1091                        }
1092                }
1093        } else {
1094                if (!tcm->tcm_handle)
1095                        return -EINVAL;
1096                q = qdisc_lookup(dev, tcm->tcm_handle);
1097        }
1098
1099        /* Change qdisc parameters */
1100        if (q == NULL)
1101                return -ENOENT;
1102        if (n->nlmsg_flags&NLM_F_EXCL)
1103                return -EEXIST;
1104        if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1105                return -EINVAL;
1106        err = qdisc_change(q, tca);
1107        if (err == 0)
1108                qdisc_notify(skb, n, clid, NULL, q);
1109        return err;
1110
1111create_n_graft:
1112        if (!(n->nlmsg_flags&NLM_F_CREATE))
1113                return -ENOENT;
1114        if (clid == TC_H_INGRESS)
1115                q = qdisc_create(dev, &dev->rx_queue, p,
1116                                 tcm->tcm_parent, tcm->tcm_parent,
1117                                 tca, &err);
1118        else {
1119                struct netdev_queue *dev_queue;
1120
1121                if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1122                        dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1123                else if (p)
1124                        dev_queue = p->dev_queue;
1125                else
1126                        dev_queue = netdev_get_tx_queue(dev, 0);
1127
1128                q = qdisc_create(dev, dev_queue, p,
1129                                 tcm->tcm_parent, tcm->tcm_handle,
1130                                 tca, &err);
1131        }
1132        if (q == NULL) {
1133                if (err == -EAGAIN)
1134                        goto replay;
1135                return err;
1136        }
1137
1138graft:
1139        err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
1140        if (err) {
1141                if (q)
1142                        qdisc_destroy(q);
1143                return err;
1144        }
1145
1146        return 0;
1147}
1148
1149static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
1150                         u32 pid, u32 seq, u16 flags, int event)
1151{
1152        struct tcmsg *tcm;
1153        struct nlmsghdr  *nlh;
1154        unsigned char *b = skb_tail_pointer(skb);
1155        struct gnet_dump d;
1156
1157        nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1158        tcm = NLMSG_DATA(nlh);
1159        tcm->tcm_family = AF_UNSPEC;
1160        tcm->tcm__pad1 = 0;
1161        tcm->tcm__pad2 = 0;
1162        tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1163        tcm->tcm_parent = clid;
1164        tcm->tcm_handle = q->handle;
1165        tcm->tcm_info = atomic_read(&q->refcnt);
1166        NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
1167        if (q->ops->dump && q->ops->dump(q, skb) < 0)
1168                goto nla_put_failure;
1169        q->qstats.qlen = q->q.qlen;
1170
1171        if (q->stab && qdisc_dump_stab(skb, q->stab) < 0)
1172                goto nla_put_failure;
1173
1174        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1175                                         qdisc_root_sleeping_lock(q), &d) < 0)
1176                goto nla_put_failure;
1177
1178        if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
1179                goto nla_put_failure;
1180
1181        if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
1182            gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
1183            gnet_stats_copy_queue(&d, &q->qstats) < 0)
1184                goto nla_put_failure;
1185
1186        if (gnet_stats_finish_copy(&d) < 0)
1187                goto nla_put_failure;
1188
1189        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1190        return skb->len;
1191
1192nlmsg_failure:
1193nla_put_failure:
1194        nlmsg_trim(skb, b);
1195        return -1;
1196}
1197
1198static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1199                        u32 clid, struct Qdisc *old, struct Qdisc *new)
1200{
1201        struct sk_buff *skb;
1202        u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1203
1204        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1205        if (!skb)
1206                return -ENOBUFS;
1207
1208        if (old && old->handle) {
1209                if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
1210                        goto err_out;
1211        }
1212        if (new) {
1213                if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1214                        goto err_out;
1215        }
1216
1217        if (skb->len)
1218                return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1219
1220err_out:
1221        kfree_skb(skb);
1222        return -EINVAL;
1223}
1224
1225static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1226{
1227        return (q->flags & TCQ_F_BUILTIN) ? true : false;
1228}
1229
1230static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1231                              struct netlink_callback *cb,
1232                              int *q_idx_p, int s_q_idx)
1233{
1234        int ret = 0, q_idx = *q_idx_p;
1235        struct Qdisc *q;
1236
1237        if (!root)
1238                return 0;
1239
1240        q = root;
1241        if (q_idx < s_q_idx) {
1242                q_idx++;
1243        } else {
1244                if (!tc_qdisc_dump_ignore(q) &&
1245                    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1246                                  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1247                        goto done;
1248                q_idx++;
1249        }
1250        list_for_each_entry(q, &root->list, list) {
1251                if (q_idx < s_q_idx) {
1252                        q_idx++;
1253                        continue;
1254                }
1255                if (!tc_qdisc_dump_ignore(q) && 
1256                    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1257                                  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1258                        goto done;
1259                q_idx++;
1260        }
1261
1262out:
1263        *q_idx_p = q_idx;
1264        return ret;
1265done:
1266        ret = -1;
1267        goto out;
1268}
1269
1270static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1271{
1272        struct net *net = sock_net(skb->sk);
1273        int idx, q_idx;
1274        int s_idx, s_q_idx;
1275        struct net_device *dev;
1276
1277        if (net != &init_net)
1278                return 0;
1279
1280        s_idx = cb->args[0];
1281        s_q_idx = q_idx = cb->args[1];
1282        read_lock(&dev_base_lock);
1283        idx = 0;
1284        for_each_netdev(&init_net, dev) {
1285                struct netdev_queue *dev_queue;
1286
1287                if (idx < s_idx)
1288                        goto cont;
1289                if (idx > s_idx)
1290                        s_q_idx = 0;
1291                q_idx = 0;
1292
1293                if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
1294                        goto done;
1295
1296                dev_queue = &dev->rx_queue;
1297                if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
1298                        goto done;
1299
1300cont:
1301                idx++;
1302        }
1303
1304done:
1305        read_unlock(&dev_base_lock);
1306
1307        cb->args[0] = idx;
1308        cb->args[1] = q_idx;
1309
1310        return skb->len;
1311}
1312
1313
1314
1315/************************************************
1316 *      Traffic classes manipulation.           *
1317 ************************************************/
1318
1319
1320
1321static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1322{
1323        struct net *net = sock_net(skb->sk);
1324        struct tcmsg *tcm = NLMSG_DATA(n);
1325        struct nlattr *tca[TCA_MAX + 1];
1326        struct net_device *dev;
1327        struct Qdisc *q = NULL;
1328        const struct Qdisc_class_ops *cops;
1329        unsigned long cl = 0;
1330        unsigned long new_cl;
1331        u32 pid = tcm->tcm_parent;
1332        u32 clid = tcm->tcm_handle;
1333        u32 qid = TC_H_MAJ(clid);
1334        int err;
1335
1336        if (net != &init_net)
1337                return -EINVAL;
1338
1339        if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
1340                return -ENODEV;
1341
1342        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1343        if (err < 0)
1344                return err;
1345
1346        /*
1347           parent == TC_H_UNSPEC - unspecified parent.
1348           parent == TC_H_ROOT   - class is root, which has no parent.
1349           parent == X:0         - parent is root class.
1350           parent == X:Y         - parent is a node in hierarchy.
1351           parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
1352
1353           handle == 0:0         - generate handle from kernel pool.
1354           handle == 0:Y         - class is X:Y, where X:0 is qdisc.
1355           handle == X:Y         - clear.
1356           handle == X:0         - root class.
1357         */
1358
1359        /* Step 1. Determine qdisc handle X:0 */
1360
1361        if (pid != TC_H_ROOT) {
1362                u32 qid1 = TC_H_MAJ(pid);
1363
1364                if (qid && qid1) {
1365                        /* If both majors are known, they must be identical. */
1366                        if (qid != qid1)
1367                                return -EINVAL;
1368                } else if (qid1) {
1369                        qid = qid1;
1370                } else if (qid == 0)
1371                        qid = dev->qdisc->handle;
1372
1373                /* Now qid is genuine qdisc handle consistent
1374                   both with parent and child.
1375
1376                   TC_H_MAJ(pid) still may be unspecified, complete it now.
1377                 */
1378                if (pid)
1379                        pid = TC_H_MAKE(qid, pid);
1380        } else {
1381                if (qid == 0)
1382                        qid = dev->qdisc->handle;
1383        }
1384
1385        /* OK. Locate qdisc */
1386        if ((q = qdisc_lookup(dev, qid)) == NULL)
1387                return -ENOENT;
1388
1389        /* An check that it supports classes */
1390        cops = q->ops->cl_ops;
1391        if (cops == NULL)
1392                return -EINVAL;
1393
1394        /* Now try to get class */
1395        if (clid == 0) {
1396                if (pid == TC_H_ROOT)
1397                        clid = qid;
1398        } else
1399                clid = TC_H_MAKE(qid, clid);
1400
1401        if (clid)
1402                cl = cops->get(q, clid);
1403
1404        if (cl == 0) {
1405                err = -ENOENT;
1406                if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
1407                        goto out;
1408        } else {
1409                switch (n->nlmsg_type) {
1410                case RTM_NEWTCLASS:
1411                        err = -EEXIST;
1412                        if (n->nlmsg_flags&NLM_F_EXCL)
1413                                goto out;
1414                        break;
1415                case RTM_DELTCLASS:
1416                        err = -EOPNOTSUPP;
1417                        if (cops->delete)
1418                                err = cops->delete(q, cl);
1419                        if (err == 0)
1420                                tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
1421                        goto out;
1422                case RTM_GETTCLASS:
1423                        err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
1424                        goto out;
1425                default:
1426                        err = -EINVAL;
1427                        goto out;
1428                }
1429        }
1430
1431        new_cl = cl;
1432        err = -EOPNOTSUPP;
1433        if (cops->change)
1434                err = cops->change(q, clid, pid, tca, &new_cl);
1435        if (err == 0)
1436                tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);
1437
1438out:
1439        if (cl)
1440                cops->put(q, cl);
1441
1442        return err;
1443}
1444
1445
1446static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1447                          unsigned long cl,
1448                          u32 pid, u32 seq, u16 flags, int event)
1449{
1450        struct tcmsg *tcm;
1451        struct nlmsghdr  *nlh;
1452        unsigned char *b = skb_tail_pointer(skb);
1453        struct gnet_dump d;
1454        const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1455
1456        nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1457        tcm = NLMSG_DATA(nlh);
1458        tcm->tcm_family = AF_UNSPEC;
1459        tcm->tcm__pad1 = 0;
1460        tcm->tcm__pad2 = 0;
1461        tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1462        tcm->tcm_parent = q->handle;
1463        tcm->tcm_handle = q->handle;
1464        tcm->tcm_info = 0;
1465        NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
1466        if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1467                goto nla_put_failure;
1468
1469        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1470                                         qdisc_root_sleeping_lock(q), &d) < 0)
1471                goto nla_put_failure;
1472
1473        if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1474                goto nla_put_failure;
1475
1476        if (gnet_stats_finish_copy(&d) < 0)
1477                goto nla_put_failure;
1478
1479        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1480        return skb->len;
1481
1482nlmsg_failure:
1483nla_put_failure:
1484        nlmsg_trim(skb, b);
1485        return -1;
1486}
1487
1488static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1489                          struct Qdisc *q, unsigned long cl, int event)
1490{
1491        struct sk_buff *skb;
1492        u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1493
1494        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1495        if (!skb)
1496                return -ENOBUFS;
1497
1498        if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1499                kfree_skb(skb);
1500                return -EINVAL;
1501        }
1502
1503        return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1504}
1505
1506struct qdisc_dump_args
1507{
1508        struct qdisc_walker w;
1509        struct sk_buff *skb;
1510        struct netlink_callback *cb;
1511};
1512
1513static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1514{
1515        struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1516
1517        return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1518                              a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1519}
1520
1521static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1522                                struct tcmsg *tcm, struct netlink_callback *cb,
1523                                int *t_p, int s_t)
1524{
1525        struct qdisc_dump_args arg;
1526
1527        if (tc_qdisc_dump_ignore(q) ||
1528            *t_p < s_t || !q->ops->cl_ops ||
1529            (tcm->tcm_parent &&
1530             TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1531                (*t_p)++;
1532                return 0;
1533        }
1534        if (*t_p > s_t)
1535                memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1536        arg.w.fn = qdisc_class_dump;
1537        arg.skb = skb;
1538        arg.cb = cb;
1539        arg.w.stop  = 0;
1540        arg.w.skip = cb->args[1];
1541        arg.w.count = 0;
1542        q->ops->cl_ops->walk(q, &arg.w);
1543        cb->args[1] = arg.w.count;
1544        if (arg.w.stop)
1545                return -1;
1546        (*t_p)++;
1547        return 0;
1548}
1549
1550static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1551                               struct tcmsg *tcm, struct netlink_callback *cb,
1552                               int *t_p, int s_t)
1553{
1554        struct Qdisc *q;
1555
1556        if (!root)
1557                return 0;
1558
1559        if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1560                return -1;
1561
1562        list_for_each_entry(q, &root->list, list) {
1563                if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1564                        return -1;
1565        }
1566
1567        return 0;
1568}
1569
1570static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1571{
1572        struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
1573        struct net *net = sock_net(skb->sk);
1574        struct netdev_queue *dev_queue;
1575        struct net_device *dev;
1576        int t, s_t;
1577
1578        if (net != &init_net)
1579                return 0;
1580
1581        if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1582                return 0;
1583        if ((dev = dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
1584                return 0;
1585
1586        s_t = cb->args[0];
1587        t = 0;
1588
1589        if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
1590                goto done;
1591
1592        dev_queue = &dev->rx_queue;
1593        if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
1594                goto done;
1595
1596done:
1597        cb->args[0] = t;
1598
1599        dev_put(dev);
1600        return skb->len;
1601}
1602
1603/* Main classifier routine: scans classifier chain attached
1604   to this qdisc, (optionally) tests for protocol and asks
1605   specific classifiers.
1606 */
1607int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
1608                       struct tcf_result *res)
1609{
1610        __be16 protocol = skb->protocol;
1611        int err = 0;
1612
1613        for (; tp; tp = tp->next) {
1614                if ((tp->protocol == protocol ||
1615                     tp->protocol == htons(ETH_P_ALL)) &&
1616                    (err = tp->classify(skb, tp, res)) >= 0) {
1617#ifdef CONFIG_NET_CLS_ACT
1618                        if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1619                                skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1620#endif
1621                        return err;
1622                }
1623        }
1624        return -1;
1625}
1626EXPORT_SYMBOL(tc_classify_compat);
1627
/*
 * Classify @skb against the chain starting at @tp.
 *
 * Thin wrapper around tc_classify_compat().  With CONFIG_NET_CLS_ACT,
 * a TC_ACT_RECLASSIFY result restarts classification from the head of
 * the chain; the restart count is kept in skb->tc_verd and once it
 * reaches MAX_REC_LOOP the function returns TC_ACT_SHOT.
 *
 * Returns the result of tc_classify_compat(), or TC_ACT_SHOT when the
 * reclassify limit is hit.
 */
int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
		struct tcf_result *res)
{
	int err = 0;
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *otp = tp;
reclassify:
#endif
	err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
	if (err == TC_ACT_RECLASSIFY) {
		u32 verd = G_TC_VERD(skb->tc_verd);
		tp = otp;

		if (verd++ >= MAX_REC_LOOP) {
			/*
			 * This runs per packet: give the message an explicit
			 * log level and rate-limit it so a misconfigured
			 * rule cannot flood the kernel log.
			 */
			if (net_ratelimit())
				printk(KERN_NOTICE
				       "rule prio %u protocol %02x reclassify loop, "
				       "packet dropped\n",
				       tp->prio&0xffff, ntohs(tp->protocol));
			return TC_ACT_SHOT;
		}
		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
		goto reclassify;
	}
#endif
	return err;
}
EXPORT_SYMBOL(tc_classify);
1658
1659void tcf_destroy(struct tcf_proto *tp)
1660{
1661        tp->ops->destroy(tp);
1662        module_put(tp->ops->owner);
1663        kfree(tp);
1664}
1665
1666void tcf_destroy_chain(struct tcf_proto **fl)
1667{
1668        struct tcf_proto *tp;
1669
1670        while ((tp = *fl) != NULL) {
1671                *fl = tp->next;
1672                tcf_destroy(tp);
1673        }
1674}
1675EXPORT_SYMBOL(tcf_destroy_chain);
1676
1677#ifdef CONFIG_PROC_FS
1678static int psched_show(struct seq_file *seq, void *v)
1679{
1680        struct timespec ts;
1681
1682        hrtimer_get_res(CLOCK_MONOTONIC, &ts);
1683        seq_printf(seq, "%08x %08x %08x %08x\n",
1684                   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
1685                   1000000,
1686                   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
1687
1688        return 0;
1689}
1690
1691static int psched_open(struct inode *inode, struct file *file)
1692{
1693        return single_open(file, psched_show, PDE(inode)->data);
1694}
1695
1696static const struct file_operations psched_fops = {
1697        .owner = THIS_MODULE,
1698        .open = psched_open,
1699        .read  = seq_read,
1700        .llseek = seq_lseek,
1701        .release = single_release,
1702};
1703#endif
1704
1705static int __init pktsched_init(void)
1706{
1707        register_qdisc(&pfifo_qdisc_ops);
1708        register_qdisc(&bfifo_qdisc_ops);
1709        register_qdisc(&mq_qdisc_ops);
1710        proc_net_fops_create(&init_net, "psched", 0, &psched_fops);
1711
1712        rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
1713        rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
1714        rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
1715        rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
1716        rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
1717        rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);
1718
1719        return 0;
1720}
1721
1722subsys_initcall(pktsched_init);
1723