linux/net/sched/sch_api.c
<<
>>
Prefs
   1/*
   2 * net/sched/sch_api.c  Packet scheduler API.
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10 *
  11 * Fixes:
  12 *
  13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
  14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
  15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
  16 */
  17
  18#include <linux/module.h>
  19#include <linux/types.h>
  20#include <linux/kernel.h>
  21#include <linux/string.h>
  22#include <linux/errno.h>
  23#include <linux/skbuff.h>
  24#include <linux/init.h>
  25#include <linux/proc_fs.h>
  26#include <linux/seq_file.h>
  27#include <linux/kmod.h>
  28#include <linux/list.h>
  29#include <linux/hrtimer.h>
  30#include <linux/lockdep.h>
  31#include <linux/slab.h>
  32
  33#include <net/net_namespace.h>
  34#include <net/sock.h>
  35#include <net/netlink.h>
  36#include <net/pkt_sched.h>
  37
  38static int qdisc_notify(struct net *net, struct sk_buff *oskb,
  39                        struct nlmsghdr *n, u32 clid,
  40                        struct Qdisc *old, struct Qdisc *new);
  41static int tclass_notify(struct net *net, struct sk_buff *oskb,
  42                         struct nlmsghdr *n, struct Qdisc *q,
  43                         unsigned long cl, int event);
  44
  45/*
  46
  47   Short review.
  48   -------------
  49
  50   This file consists of two interrelated parts:
  51
  52   1. queueing disciplines manager frontend.
  53   2. traffic classes manager frontend.
  54
  55   Generally, queueing discipline ("qdisc") is a black box,
  56   which is able to enqueue packets and to dequeue them (when
  57   device is ready to send something) in order and at times
  58   determined by algorithm hidden in it.
  59
  60   qdisc's are divided to two categories:
  61   - "queues", which have no internal structure visible from outside.
  62   - "schedulers", which split all the packets to "traffic classes",
  63     using "packet classifiers" (look at cls_api.c)
  64
  65   In turn, classes may have child qdiscs (as rule, queues)
  66   attached to them etc. etc. etc.
  67
   The goal of the routines in this file is to translate the
   information supplied by the user in the form of handles into a
   form more intelligible to the kernel, to perform sanity checks
   and the part of the work that is common to all qdiscs, and to
   provide rtnetlink notifications.
  73
  74   All real intelligent work is done inside qdisc modules.
  75
  76
  77
  78   Every discipline has two major routines: enqueue and dequeue.
  79
  80   ---dequeue
  81
  82   dequeue usually returns a skb to send. It is allowed to return NULL,
  83   but it does not mean that queue is empty, it just means that
  84   discipline does not want to send anything this time.
  85   Queue is really empty if q->q.qlen == 0.
  86   For complicated disciplines with multiple queues q->q is not
  87   real packet queue, but however q->q.qlen must be valid.
  88
  89   ---enqueue
  90
   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code.
  94   NET_XMIT_DROP        - this packet dropped
  95     Expected action: do not backoff, but wait until queue will clear.
  96   NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
  97     Expected action: backoff or ignore
  98   NET_XMIT_POLICED     - dropped by police.
  99     Expected action: backoff or error to real-time apps.
 100
 101   Auxiliary routines:
 102
 103   ---peek
 104
 105   like dequeue but without removing a packet from the queue
 106
 107   ---reset
 108
 109   returns qdisc to initial state: purge all buffers, clear all
 110   timers, counters (except for statistics) etc.
 111
 112   ---init
 113
 114   initializes newly created qdisc.
 115
 116   ---destroy
 117
 118   destroys resources allocated by init and during lifetime of qdisc.
 119
 120   ---change
 121
 122   changes qdisc parameters.
 123 */
 124
 125/* Protects list of registered TC modules. It is pure SMP lock. */
 126static DEFINE_RWLOCK(qdisc_mod_lock);
 127
 128
 129/************************************************
 130 *      Queueing disciplines manipulation.      *
 131 ************************************************/
 132
 133
 134/* The list of all installed queueing disciplines. */
 135
 136static struct Qdisc_ops *qdisc_base;
 137
/* Register/unregister queueing discipline */
 139
 140int register_qdisc(struct Qdisc_ops *qops)
 141{
 142        struct Qdisc_ops *q, **qp;
 143        int rc = -EEXIST;
 144
 145        write_lock(&qdisc_mod_lock);
 146        for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
 147                if (!strcmp(qops->id, q->id))
 148                        goto out;
 149
 150        if (qops->enqueue == NULL)
 151                qops->enqueue = noop_qdisc_ops.enqueue;
 152        if (qops->peek == NULL) {
 153                if (qops->dequeue == NULL)
 154                        qops->peek = noop_qdisc_ops.peek;
 155                else
 156                        goto out_einval;
 157        }
 158        if (qops->dequeue == NULL)
 159                qops->dequeue = noop_qdisc_ops.dequeue;
 160
 161        if (qops->cl_ops) {
 162                const struct Qdisc_class_ops *cops = qops->cl_ops;
 163
 164                if (!(cops->get && cops->put && cops->walk && cops->leaf))
 165                        goto out_einval;
 166
 167                if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
 168                        goto out_einval;
 169        }
 170
 171        qops->next = NULL;
 172        *qp = qops;
 173        rc = 0;
 174out:
 175        write_unlock(&qdisc_mod_lock);
 176        return rc;
 177
 178out_einval:
 179        rc = -EINVAL;
 180        goto out;
 181}
 182EXPORT_SYMBOL(register_qdisc);
 183
 184int unregister_qdisc(struct Qdisc_ops *qops)
 185{
 186        struct Qdisc_ops *q, **qp;
 187        int err = -ENOENT;
 188
 189        write_lock(&qdisc_mod_lock);
 190        for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
 191                if (q == qops)
 192                        break;
 193        if (q) {
 194                *qp = q->next;
 195                q->next = NULL;
 196                err = 0;
 197        }
 198        write_unlock(&qdisc_mod_lock);
 199        return err;
 200}
 201EXPORT_SYMBOL(unregister_qdisc);
 202
 203/* We know handle. Find qdisc among all qdisc's attached to device
 204   (root qdisc, all its children, children of children etc.)
 205 */
 206
 207static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
 208{
 209        struct Qdisc *q;
 210
 211        if (!(root->flags & TCQ_F_BUILTIN) &&
 212            root->handle == handle)
 213                return root;
 214
 215        list_for_each_entry(q, &root->list, list) {
 216                if (q->handle == handle)
 217                        return q;
 218        }
 219        return NULL;
 220}
 221
 222static void qdisc_list_add(struct Qdisc *q)
 223{
 224        if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
 225                list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
 226}
 227
 228void qdisc_list_del(struct Qdisc *q)
 229{
 230        if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
 231                list_del(&q->list);
 232}
 233EXPORT_SYMBOL(qdisc_list_del);
 234
 235struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
 236{
 237        struct Qdisc *q;
 238
 239        q = qdisc_match_from_root(dev->qdisc, handle);
 240        if (q)
 241                goto out;
 242
 243        if (dev_ingress_queue(dev))
 244                q = qdisc_match_from_root(
 245                        dev_ingress_queue(dev)->qdisc_sleeping,
 246                        handle);
 247out:
 248        return q;
 249}
 250
 251static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
 252{
 253        unsigned long cl;
 254        struct Qdisc *leaf;
 255        const struct Qdisc_class_ops *cops = p->ops->cl_ops;
 256
 257        if (cops == NULL)
 258                return NULL;
 259        cl = cops->get(p, classid);
 260
 261        if (cl == 0)
 262                return NULL;
 263        leaf = cops->leaf(p, cl);
 264        cops->put(p, cl);
 265        return leaf;
 266}
 267
 268/* Find queueing discipline by name */
 269
 270static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
 271{
 272        struct Qdisc_ops *q = NULL;
 273
 274        if (kind) {
 275                read_lock(&qdisc_mod_lock);
 276                for (q = qdisc_base; q; q = q->next) {
 277                        if (nla_strcmp(kind, q->id) == 0) {
 278                                if (!try_module_get(q->owner))
 279                                        q = NULL;
 280                                break;
 281                        }
 282                }
 283                read_unlock(&qdisc_mod_lock);
 284        }
 285        return q;
 286}
 287
 288static struct qdisc_rate_table *qdisc_rtab_list;
 289
 290struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
 291{
 292        struct qdisc_rate_table *rtab;
 293
 294        for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
 295                if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
 296                        rtab->refcnt++;
 297                        return rtab;
 298                }
 299        }
 300
 301        if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
 302            nla_len(tab) != TC_RTAB_SIZE)
 303                return NULL;
 304
 305        rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
 306        if (rtab) {
 307                rtab->rate = *r;
 308                rtab->refcnt = 1;
 309                memcpy(rtab->data, nla_data(tab), 1024);
 310                rtab->next = qdisc_rtab_list;
 311                qdisc_rtab_list = rtab;
 312        }
 313        return rtab;
 314}
 315EXPORT_SYMBOL(qdisc_get_rtab);
 316
 317void qdisc_put_rtab(struct qdisc_rate_table *tab)
 318{
 319        struct qdisc_rate_table *rtab, **rtabp;
 320
 321        if (!tab || --tab->refcnt)
 322                return;
 323
 324        for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
 325                if (rtab == tab) {
 326                        *rtabp = rtab->next;
 327                        kfree(rtab);
 328                        return;
 329                }
 330        }
 331}
 332EXPORT_SYMBOL(qdisc_put_rtab);
 333
 334static LIST_HEAD(qdisc_stab_list);
 335static DEFINE_SPINLOCK(qdisc_stab_lock);
 336
 337static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
 338        [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
 339        [TCA_STAB_DATA] = { .type = NLA_BINARY },
 340};
 341
 342static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
 343{
 344        struct nlattr *tb[TCA_STAB_MAX + 1];
 345        struct qdisc_size_table *stab;
 346        struct tc_sizespec *s;
 347        unsigned int tsize = 0;
 348        u16 *tab = NULL;
 349        int err;
 350
 351        err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
 352        if (err < 0)
 353                return ERR_PTR(err);
 354        if (!tb[TCA_STAB_BASE])
 355                return ERR_PTR(-EINVAL);
 356
 357        s = nla_data(tb[TCA_STAB_BASE]);
 358
 359        if (s->tsize > 0) {
 360                if (!tb[TCA_STAB_DATA])
 361                        return ERR_PTR(-EINVAL);
 362                tab = nla_data(tb[TCA_STAB_DATA]);
 363                tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
 364        }
 365
 366        if (tsize != s->tsize || (!tab && tsize > 0))
 367                return ERR_PTR(-EINVAL);
 368
 369        spin_lock(&qdisc_stab_lock);
 370
 371        list_for_each_entry(stab, &qdisc_stab_list, list) {
 372                if (memcmp(&stab->szopts, s, sizeof(*s)))
 373                        continue;
 374                if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
 375                        continue;
 376                stab->refcnt++;
 377                spin_unlock(&qdisc_stab_lock);
 378                return stab;
 379        }
 380
 381        spin_unlock(&qdisc_stab_lock);
 382
 383        stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
 384        if (!stab)
 385                return ERR_PTR(-ENOMEM);
 386
 387        stab->refcnt = 1;
 388        stab->szopts = *s;
 389        if (tsize > 0)
 390                memcpy(stab->data, tab, tsize * sizeof(u16));
 391
 392        spin_lock(&qdisc_stab_lock);
 393        list_add_tail(&stab->list, &qdisc_stab_list);
 394        spin_unlock(&qdisc_stab_lock);
 395
 396        return stab;
 397}
 398
 399void qdisc_put_stab(struct qdisc_size_table *tab)
 400{
 401        if (!tab)
 402                return;
 403
 404        spin_lock(&qdisc_stab_lock);
 405
 406        if (--tab->refcnt == 0) {
 407                list_del(&tab->list);
 408                kfree(tab);
 409        }
 410
 411        spin_unlock(&qdisc_stab_lock);
 412}
 413EXPORT_SYMBOL(qdisc_put_stab);
 414
 415static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
 416{
 417        struct nlattr *nest;
 418
 419        nest = nla_nest_start(skb, TCA_STAB);
 420        if (nest == NULL)
 421                goto nla_put_failure;
 422        NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
 423        nla_nest_end(skb, nest);
 424
 425        return skb->len;
 426
 427nla_put_failure:
 428        return -1;
 429}
 430
 431void qdisc_calculate_pkt_len(struct sk_buff *skb, struct qdisc_size_table *stab)
 432{
 433        int pkt_len, slot;
 434
 435        pkt_len = skb->len + stab->szopts.overhead;
 436        if (unlikely(!stab->szopts.tsize))
 437                goto out;
 438
 439        slot = pkt_len + stab->szopts.cell_align;
 440        if (unlikely(slot < 0))
 441                slot = 0;
 442
 443        slot >>= stab->szopts.cell_log;
 444        if (likely(slot < stab->szopts.tsize))
 445                pkt_len = stab->data[slot];
 446        else
 447                pkt_len = stab->data[stab->szopts.tsize - 1] *
 448                                (slot / stab->szopts.tsize) +
 449                                stab->data[slot % stab->szopts.tsize];
 450
 451        pkt_len <<= stab->szopts.size_log;
 452out:
 453        if (unlikely(pkt_len < 1))
 454                pkt_len = 1;
 455        qdisc_skb_cb(skb)->pkt_len = pkt_len;
 456}
 457EXPORT_SYMBOL(qdisc_calculate_pkt_len);
 458
 459void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
 460{
 461        if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
 462                printk(KERN_WARNING
 463                       "%s: %s qdisc %X: is non-work-conserving?\n",
 464                       txt, qdisc->ops->id, qdisc->handle >> 16);
 465                qdisc->flags |= TCQ_F_WARN_NONWC;
 466        }
 467}
 468EXPORT_SYMBOL(qdisc_warn_nonwc);
 469
 470static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
 471{
 472        struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
 473                                                 timer);
 474
 475        wd->qdisc->flags &= ~TCQ_F_THROTTLED;
 476        __netif_schedule(qdisc_root(wd->qdisc));
 477
 478        return HRTIMER_NORESTART;
 479}
 480
 481void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
 482{
 483        hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
 484        wd->timer.function = qdisc_watchdog;
 485        wd->qdisc = qdisc;
 486}
 487EXPORT_SYMBOL(qdisc_watchdog_init);
 488
 489void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
 490{
 491        ktime_t time;
 492
 493        if (test_bit(__QDISC_STATE_DEACTIVATED,
 494                     &qdisc_root_sleeping(wd->qdisc)->state))
 495                return;
 496
 497        wd->qdisc->flags |= TCQ_F_THROTTLED;
 498        time = ktime_set(0, 0);
 499        time = ktime_add_ns(time, PSCHED_TICKS2NS(expires));
 500        hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
 501}
 502EXPORT_SYMBOL(qdisc_watchdog_schedule);
 503
 504void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
 505{
 506        hrtimer_cancel(&wd->timer);
 507        wd->qdisc->flags &= ~TCQ_F_THROTTLED;
 508}
 509EXPORT_SYMBOL(qdisc_watchdog_cancel);
 510
 511static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
 512{
 513        unsigned int size = n * sizeof(struct hlist_head), i;
 514        struct hlist_head *h;
 515
 516        if (size <= PAGE_SIZE)
 517                h = kmalloc(size, GFP_KERNEL);
 518        else
 519                h = (struct hlist_head *)
 520                        __get_free_pages(GFP_KERNEL, get_order(size));
 521
 522        if (h != NULL) {
 523                for (i = 0; i < n; i++)
 524                        INIT_HLIST_HEAD(&h[i]);
 525        }
 526        return h;
 527}
 528
 529static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
 530{
 531        unsigned int size = n * sizeof(struct hlist_head);
 532
 533        if (size <= PAGE_SIZE)
 534                kfree(h);
 535        else
 536                free_pages((unsigned long)h, get_order(size));
 537}
 538
 539void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
 540{
 541        struct Qdisc_class_common *cl;
 542        struct hlist_node *n, *next;
 543        struct hlist_head *nhash, *ohash;
 544        unsigned int nsize, nmask, osize;
 545        unsigned int i, h;
 546
 547        /* Rehash when load factor exceeds 0.75 */
 548        if (clhash->hashelems * 4 <= clhash->hashsize * 3)
 549                return;
 550        nsize = clhash->hashsize * 2;
 551        nmask = nsize - 1;
 552        nhash = qdisc_class_hash_alloc(nsize);
 553        if (nhash == NULL)
 554                return;
 555
 556        ohash = clhash->hash;
 557        osize = clhash->hashsize;
 558
 559        sch_tree_lock(sch);
 560        for (i = 0; i < osize; i++) {
 561                hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
 562                        h = qdisc_class_hash(cl->classid, nmask);
 563                        hlist_add_head(&cl->hnode, &nhash[h]);
 564                }
 565        }
 566        clhash->hash     = nhash;
 567        clhash->hashsize = nsize;
 568        clhash->hashmask = nmask;
 569        sch_tree_unlock(sch);
 570
 571        qdisc_class_hash_free(ohash, osize);
 572}
 573EXPORT_SYMBOL(qdisc_class_hash_grow);
 574
 575int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
 576{
 577        unsigned int size = 4;
 578
 579        clhash->hash = qdisc_class_hash_alloc(size);
 580        if (clhash->hash == NULL)
 581                return -ENOMEM;
 582        clhash->hashsize  = size;
 583        clhash->hashmask  = size - 1;
 584        clhash->hashelems = 0;
 585        return 0;
 586}
 587EXPORT_SYMBOL(qdisc_class_hash_init);
 588
 589void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
 590{
 591        qdisc_class_hash_free(clhash->hash, clhash->hashsize);
 592}
 593EXPORT_SYMBOL(qdisc_class_hash_destroy);
 594
 595void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
 596                             struct Qdisc_class_common *cl)
 597{
 598        unsigned int h;
 599
 600        INIT_HLIST_NODE(&cl->hnode);
 601        h = qdisc_class_hash(cl->classid, clhash->hashmask);
 602        hlist_add_head(&cl->hnode, &clhash->hash[h]);
 603        clhash->hashelems++;
 604}
 605EXPORT_SYMBOL(qdisc_class_hash_insert);
 606
 607void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
 608                             struct Qdisc_class_common *cl)
 609{
 610        hlist_del(&cl->hnode);
 611        clhash->hashelems--;
 612}
 613EXPORT_SYMBOL(qdisc_class_hash_remove);
 614
 615/* Allocate an unique handle from space managed by kernel */
 616
 617static u32 qdisc_alloc_handle(struct net_device *dev)
 618{
 619        int i = 0x10000;
 620        static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
 621
 622        do {
 623                autohandle += TC_H_MAKE(0x10000U, 0);
 624                if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
 625                        autohandle = TC_H_MAKE(0x80000000U, 0);
 626        } while (qdisc_lookup(dev, autohandle) && --i > 0);
 627
 628        return i>0 ? autohandle : 0;
 629}
 630
 631void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
 632{
 633        const struct Qdisc_class_ops *cops;
 634        unsigned long cl;
 635        u32 parentid;
 636
 637        if (n == 0)
 638                return;
 639        while ((parentid = sch->parent)) {
 640                if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
 641                        return;
 642
 643                sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
 644                if (sch == NULL) {
 645                        WARN_ON(parentid != TC_H_ROOT);
 646                        return;
 647                }
 648                cops = sch->ops->cl_ops;
 649                if (cops->qlen_notify) {
 650                        cl = cops->get(sch, parentid);
 651                        cops->qlen_notify(sch, cl);
 652                        cops->put(sch, cl);
 653                }
 654                sch->q.qlen -= n;
 655        }
 656}
 657EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
 658
 659static void notify_and_destroy(struct net *net, struct sk_buff *skb,
 660                               struct nlmsghdr *n, u32 clid,
 661                               struct Qdisc *old, struct Qdisc *new)
 662{
 663        if (new || old)
 664                qdisc_notify(net, skb, n, clid, old, new);
 665
 666        if (old)
 667                qdisc_destroy(old);
 668}
 669
 670/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 671 * to device "dev".
 672 *
 673 * When appropriate send a netlink notification using 'skb'
 674 * and "n".
 675 *
 676 * On success, destroy old qdisc.
 677 */
 678
 679static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
 680                       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
 681                       struct Qdisc *new, struct Qdisc *old)
 682{
 683        struct Qdisc *q = old;
 684        struct net *net = dev_net(dev);
 685        int err = 0;
 686
 687        if (parent == NULL) {
 688                unsigned int i, num_q, ingress;
 689
 690                ingress = 0;
 691                num_q = dev->num_tx_queues;
 692                if ((q && q->flags & TCQ_F_INGRESS) ||
 693                    (new && new->flags & TCQ_F_INGRESS)) {
 694                        num_q = 1;
 695                        ingress = 1;
 696                        if (!dev_ingress_queue(dev))
 697                                return -ENOENT;
 698                }
 699
 700                if (dev->flags & IFF_UP)
 701                        dev_deactivate(dev);
 702
 703                if (new && new->ops->attach) {
 704                        new->ops->attach(new);
 705                        num_q = 0;
 706                }
 707
 708                for (i = 0; i < num_q; i++) {
 709                        struct netdev_queue *dev_queue = dev_ingress_queue(dev);
 710
 711                        if (!ingress)
 712                                dev_queue = netdev_get_tx_queue(dev, i);
 713
 714                        old = dev_graft_qdisc(dev_queue, new);
 715                        if (new && i > 0)
 716                                atomic_inc(&new->refcnt);
 717
 718                        if (!ingress)
 719                                qdisc_destroy(old);
 720                }
 721
 722                if (!ingress) {
 723                        notify_and_destroy(net, skb, n, classid,
 724                                           dev->qdisc, new);
 725                        if (new && !new->ops->attach)
 726                                atomic_inc(&new->refcnt);
 727                        dev->qdisc = new ? : &noop_qdisc;
 728                } else {
 729                        notify_and_destroy(net, skb, n, classid, old, new);
 730                }
 731
 732                if (dev->flags & IFF_UP)
 733                        dev_activate(dev);
 734        } else {
 735                const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
 736
 737                err = -EOPNOTSUPP;
 738                if (cops && cops->graft) {
 739                        unsigned long cl = cops->get(parent, classid);
 740                        if (cl) {
 741                                err = cops->graft(parent, cl, new, &old);
 742                                cops->put(parent, cl);
 743                        } else
 744                                err = -ENOENT;
 745                }
 746                if (!err)
 747                        notify_and_destroy(net, skb, n, classid, old, new);
 748        }
 749        return err;
 750}
 751
 752/* lockdep annotation is needed for ingress; egress gets it only for name */
 753static struct lock_class_key qdisc_tx_lock;
 754static struct lock_class_key qdisc_rx_lock;
 755
 756/*
 757   Allocate and initialize new qdisc.
 758
 759   Parameters are passed via opt.
 760 */
 761
 762static struct Qdisc *
 763qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
 764             struct Qdisc *p, u32 parent, u32 handle,
 765             struct nlattr **tca, int *errp)
 766{
 767        int err;
 768        struct nlattr *kind = tca[TCA_KIND];
 769        struct Qdisc *sch;
 770        struct Qdisc_ops *ops;
 771        struct qdisc_size_table *stab;
 772
 773        ops = qdisc_lookup_ops(kind);
 774#ifdef CONFIG_MODULES
 775        if (ops == NULL && kind != NULL) {
 776                char name[IFNAMSIZ];
 777                if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
 778                        /* We dropped the RTNL semaphore in order to
 779                         * perform the module load.  So, even if we
 780                         * succeeded in loading the module we have to
 781                         * tell the caller to replay the request.  We
 782                         * indicate this using -EAGAIN.
 783                         * We replay the request because the device may
 784                         * go away in the mean time.
 785                         */
 786                        rtnl_unlock();
 787                        request_module("sch_%s", name);
 788                        rtnl_lock();
 789                        ops = qdisc_lookup_ops(kind);
 790                        if (ops != NULL) {
 791                                /* We will try again qdisc_lookup_ops,
 792                                 * so don't keep a reference.
 793                                 */
 794                                module_put(ops->owner);
 795                                err = -EAGAIN;
 796                                goto err_out;
 797                        }
 798                }
 799        }
 800#endif
 801
 802        err = -ENOENT;
 803        if (ops == NULL)
 804                goto err_out;
 805
 806        sch = qdisc_alloc(dev_queue, ops);
 807        if (IS_ERR(sch)) {
 808                err = PTR_ERR(sch);
 809                goto err_out2;
 810        }
 811
 812        sch->parent = parent;
 813
 814        if (handle == TC_H_INGRESS) {
 815                sch->flags |= TCQ_F_INGRESS;
 816                handle = TC_H_MAKE(TC_H_INGRESS, 0);
 817                lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
 818        } else {
 819                if (handle == 0) {
 820                        handle = qdisc_alloc_handle(dev);
 821                        err = -ENOMEM;
 822                        if (handle == 0)
 823                                goto err_out3;
 824                }
 825                lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
 826        }
 827
 828        sch->handle = handle;
 829
 830        if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
 831                if (tca[TCA_STAB]) {
 832                        stab = qdisc_get_stab(tca[TCA_STAB]);
 833                        if (IS_ERR(stab)) {
 834                                err = PTR_ERR(stab);
 835                                goto err_out4;
 836                        }
 837                        sch->stab = stab;
 838                }
 839                if (tca[TCA_RATE]) {
 840                        spinlock_t *root_lock;
 841
 842                        err = -EOPNOTSUPP;
 843                        if (sch->flags & TCQ_F_MQROOT)
 844                                goto err_out4;
 845
 846                        if ((sch->parent != TC_H_ROOT) &&
 847                            !(sch->flags & TCQ_F_INGRESS) &&
 848                            (!p || !(p->flags & TCQ_F_MQROOT)))
 849                                root_lock = qdisc_root_sleeping_lock(sch);
 850                        else
 851                                root_lock = qdisc_lock(sch);
 852
 853                        err = gen_new_estimator(&sch->bstats, &sch->rate_est,
 854                                                root_lock, tca[TCA_RATE]);
 855                        if (err)
 856                                goto err_out4;
 857                }
 858
 859                qdisc_list_add(sch);
 860
 861                return sch;
 862        }
 863err_out3:
 864        dev_put(dev);
 865        kfree((char *) sch - sch->padded);
 866err_out2:
 867        module_put(ops->owner);
 868err_out:
 869        *errp = err;
 870        return NULL;
 871
 872err_out4:
 873        /*
 874         * Any broken qdiscs that would require a ops->reset() here?
 875         * The qdisc was never in action so it shouldn't be necessary.
 876         */
 877        qdisc_put_stab(sch->stab);
 878        if (ops->destroy)
 879                ops->destroy(sch);
 880        goto err_out3;
 881}
 882
 883static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
 884{
 885        struct qdisc_size_table *stab = NULL;
 886        int err = 0;
 887
 888        if (tca[TCA_OPTIONS]) {
 889                if (sch->ops->change == NULL)
 890                        return -EINVAL;
 891                err = sch->ops->change(sch, tca[TCA_OPTIONS]);
 892                if (err)
 893                        return err;
 894        }
 895
 896        if (tca[TCA_STAB]) {
 897                stab = qdisc_get_stab(tca[TCA_STAB]);
 898                if (IS_ERR(stab))
 899                        return PTR_ERR(stab);
 900        }
 901
 902        qdisc_put_stab(sch->stab);
 903        sch->stab = stab;
 904
 905        if (tca[TCA_RATE]) {
 906                /* NB: ignores errors from replace_estimator
 907                   because change can't be undone. */
 908                if (sch->flags & TCQ_F_MQROOT)
 909                        goto out;
 910                gen_replace_estimator(&sch->bstats, &sch->rate_est,
 911                                            qdisc_root_sleeping_lock(sch),
 912                                            tca[TCA_RATE]);
 913        }
 914out:
 915        return 0;
 916}
 917
 918struct check_loop_arg
 919{
 920        struct qdisc_walker     w;
 921        struct Qdisc            *p;
 922        int                     depth;
 923};
 924
 925static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
 926
 927static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
 928{
 929        struct check_loop_arg   arg;
 930
 931        if (q->ops->cl_ops == NULL)
 932                return 0;
 933
 934        arg.w.stop = arg.w.skip = arg.w.count = 0;
 935        arg.w.fn = check_loop_fn;
 936        arg.depth = depth;
 937        arg.p = p;
 938        q->ops->cl_ops->walk(q, &arg.w);
 939        return arg.w.stop ? -ELOOP : 0;
 940}
 941
 942static int
 943check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
 944{
 945        struct Qdisc *leaf;
 946        const struct Qdisc_class_ops *cops = q->ops->cl_ops;
 947        struct check_loop_arg *arg = (struct check_loop_arg *)w;
 948
 949        leaf = cops->leaf(q, cl);
 950        if (leaf) {
 951                if (leaf == arg->p || arg->depth > 7)
 952                        return -ELOOP;
 953                return check_loop(leaf, arg->p, arg->depth + 1);
 954        }
 955        return 0;
 956}
 957
 958/*
 959 * Delete/get qdisc.
 960 */
 961
 962static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 963{
 964        struct net *net = sock_net(skb->sk);
 965        struct tcmsg *tcm = NLMSG_DATA(n);
 966        struct nlattr *tca[TCA_MAX + 1];
 967        struct net_device *dev;
 968        u32 clid = tcm->tcm_parent;
 969        struct Qdisc *q = NULL;
 970        struct Qdisc *p = NULL;
 971        int err;
 972
 973        if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
 974                return -ENODEV;
 975
 976        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
 977        if (err < 0)
 978                return err;
 979
 980        if (clid) {
 981                if (clid != TC_H_ROOT) {
 982                        if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
 983                                if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
 984                                        return -ENOENT;
 985                                q = qdisc_leaf(p, clid);
 986                        } else { /* ingress */
 987                                if (dev_ingress_queue(dev))
 988                                        q = dev_ingress_queue(dev)->qdisc_sleeping;
 989                        }
 990                } else {
 991                        q = dev->qdisc;
 992                }
 993                if (!q)
 994                        return -ENOENT;
 995
 996                if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
 997                        return -EINVAL;
 998        } else {
 999                if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
1000                        return -ENOENT;
1001        }
1002
1003        if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1004                return -EINVAL;
1005
1006        if (n->nlmsg_type == RTM_DELQDISC) {
1007                if (!clid)
1008                        return -EINVAL;
1009                if (q->handle == 0)
1010                        return -ENOENT;
1011                if ((err = qdisc_graft(dev, p, skb, n, clid, NULL, q)) != 0)
1012                        return err;
1013        } else {
1014                qdisc_notify(net, skb, n, clid, NULL, q);
1015        }
1016        return 0;
1017}
1018
1019/*
1020   Create/change qdisc.
1021 */
1022
1023static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1024{
1025        struct net *net = sock_net(skb->sk);
1026        struct tcmsg *tcm;
1027        struct nlattr *tca[TCA_MAX + 1];
1028        struct net_device *dev;
1029        u32 clid;
1030        struct Qdisc *q, *p;
1031        int err;
1032
1033replay:
1034        /* Reinit, just in case something touches this. */
1035        tcm = NLMSG_DATA(n);
1036        clid = tcm->tcm_parent;
1037        q = p = NULL;
1038
1039        if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
1040                return -ENODEV;
1041
1042        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1043        if (err < 0)
1044                return err;
1045
1046        if (clid) {
1047                if (clid != TC_H_ROOT) {
1048                        if (clid != TC_H_INGRESS) {
1049                                if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
1050                                        return -ENOENT;
1051                                q = qdisc_leaf(p, clid);
1052                        } else { /* ingress */
1053                                if (dev_ingress_queue_create(dev))
1054                                        q = dev_ingress_queue(dev)->qdisc_sleeping;
1055                        }
1056                } else {
1057                        q = dev->qdisc;
1058                }
1059
1060                /* It may be default qdisc, ignore it */
1061                if (q && q->handle == 0)
1062                        q = NULL;
1063
1064                if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1065                        if (tcm->tcm_handle) {
1066                                if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
1067                                        return -EEXIST;
1068                                if (TC_H_MIN(tcm->tcm_handle))
1069                                        return -EINVAL;
1070                                if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
1071                                        goto create_n_graft;
1072                                if (n->nlmsg_flags&NLM_F_EXCL)
1073                                        return -EEXIST;
1074                                if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1075                                        return -EINVAL;
1076                                if (q == p ||
1077                                    (p && check_loop(q, p, 0)))
1078                                        return -ELOOP;
1079                                atomic_inc(&q->refcnt);
1080                                goto graft;
1081                        } else {
1082                                if (q == NULL)
1083                                        goto create_n_graft;
1084
1085                                /* This magic test requires explanation.
1086                                 *
1087                                 *   We know, that some child q is already
1088                                 *   attached to this parent and have choice:
1089                                 *   either to change it or to create/graft new one.
1090                                 *
1091                                 *   1. We are allowed to create/graft only
1092                                 *   if CREATE and REPLACE flags are set.
1093                                 *
1094                                 *   2. If EXCL is set, requestor wanted to say,
1095                                 *   that qdisc tcm_handle is not expected
1096                                 *   to exist, so that we choose create/graft too.
1097                                 *
1098                                 *   3. The last case is when no flags are set.
1099                                 *   Alas, it is sort of hole in API, we
1100                                 *   cannot decide what to do unambiguously.
1101                                 *   For now we select create/graft, if
1102                                 *   user gave KIND, which does not match existing.
1103                                 */
1104                                if ((n->nlmsg_flags&NLM_F_CREATE) &&
1105                                    (n->nlmsg_flags&NLM_F_REPLACE) &&
1106                                    ((n->nlmsg_flags&NLM_F_EXCL) ||
1107                                     (tca[TCA_KIND] &&
1108                                      nla_strcmp(tca[TCA_KIND], q->ops->id))))
1109                                        goto create_n_graft;
1110                        }
1111                }
1112        } else {
1113                if (!tcm->tcm_handle)
1114                        return -EINVAL;
1115                q = qdisc_lookup(dev, tcm->tcm_handle);
1116        }
1117
1118        /* Change qdisc parameters */
1119        if (q == NULL)
1120                return -ENOENT;
1121        if (n->nlmsg_flags&NLM_F_EXCL)
1122                return -EEXIST;
1123        if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1124                return -EINVAL;
1125        err = qdisc_change(q, tca);
1126        if (err == 0)
1127                qdisc_notify(net, skb, n, clid, NULL, q);
1128        return err;
1129
1130create_n_graft:
1131        if (!(n->nlmsg_flags&NLM_F_CREATE))
1132                return -ENOENT;
1133        if (clid == TC_H_INGRESS) {
1134                if (dev_ingress_queue(dev))
1135                        q = qdisc_create(dev, dev_ingress_queue(dev), p,
1136                                         tcm->tcm_parent, tcm->tcm_parent,
1137                                         tca, &err);
1138                else
1139                        err = -ENOENT;
1140        } else {
1141                struct netdev_queue *dev_queue;
1142
1143                if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1144                        dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1145                else if (p)
1146                        dev_queue = p->dev_queue;
1147                else
1148                        dev_queue = netdev_get_tx_queue(dev, 0);
1149
1150                q = qdisc_create(dev, dev_queue, p,
1151                                 tcm->tcm_parent, tcm->tcm_handle,
1152                                 tca, &err);
1153        }
1154        if (q == NULL) {
1155                if (err == -EAGAIN)
1156                        goto replay;
1157                return err;
1158        }
1159
1160graft:
1161        err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
1162        if (err) {
1163                if (q)
1164                        qdisc_destroy(q);
1165                return err;
1166        }
1167
1168        return 0;
1169}
1170
1171static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
1172                         u32 pid, u32 seq, u16 flags, int event)
1173{
1174        struct tcmsg *tcm;
1175        struct nlmsghdr  *nlh;
1176        unsigned char *b = skb_tail_pointer(skb);
1177        struct gnet_dump d;
1178
1179        nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1180        tcm = NLMSG_DATA(nlh);
1181        tcm->tcm_family = AF_UNSPEC;
1182        tcm->tcm__pad1 = 0;
1183        tcm->tcm__pad2 = 0;
1184        tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1185        tcm->tcm_parent = clid;
1186        tcm->tcm_handle = q->handle;
1187        tcm->tcm_info = atomic_read(&q->refcnt);
1188        NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
1189        if (q->ops->dump && q->ops->dump(q, skb) < 0)
1190                goto nla_put_failure;
1191        q->qstats.qlen = q->q.qlen;
1192
1193        if (q->stab && qdisc_dump_stab(skb, q->stab) < 0)
1194                goto nla_put_failure;
1195
1196        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1197                                         qdisc_root_sleeping_lock(q), &d) < 0)
1198                goto nla_put_failure;
1199
1200        if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
1201                goto nla_put_failure;
1202
1203        if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
1204            gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
1205            gnet_stats_copy_queue(&d, &q->qstats) < 0)
1206                goto nla_put_failure;
1207
1208        if (gnet_stats_finish_copy(&d) < 0)
1209                goto nla_put_failure;
1210
1211        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1212        return skb->len;
1213
1214nlmsg_failure:
1215nla_put_failure:
1216        nlmsg_trim(skb, b);
1217        return -1;
1218}
1219
1220static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1221{
1222        return (q->flags & TCQ_F_BUILTIN) ? true : false;
1223}
1224
1225static int qdisc_notify(struct net *net, struct sk_buff *oskb,
1226                        struct nlmsghdr *n, u32 clid,
1227                        struct Qdisc *old, struct Qdisc *new)
1228{
1229        struct sk_buff *skb;
1230        u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1231
1232        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1233        if (!skb)
1234                return -ENOBUFS;
1235
1236        if (old && !tc_qdisc_dump_ignore(old)) {
1237                if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
1238                        goto err_out;
1239        }
1240        if (new && !tc_qdisc_dump_ignore(new)) {
1241                if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1242                        goto err_out;
1243        }
1244
1245        if (skb->len)
1246                return rtnetlink_send(skb, net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1247
1248err_out:
1249        kfree_skb(skb);
1250        return -EINVAL;
1251}
1252
1253static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1254                              struct netlink_callback *cb,
1255                              int *q_idx_p, int s_q_idx)
1256{
1257        int ret = 0, q_idx = *q_idx_p;
1258        struct Qdisc *q;
1259
1260        if (!root)
1261                return 0;
1262
1263        q = root;
1264        if (q_idx < s_q_idx) {
1265                q_idx++;
1266        } else {
1267                if (!tc_qdisc_dump_ignore(q) &&
1268                    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1269                                  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1270                        goto done;
1271                q_idx++;
1272        }
1273        list_for_each_entry(q, &root->list, list) {
1274                if (q_idx < s_q_idx) {
1275                        q_idx++;
1276                        continue;
1277                }
1278                if (!tc_qdisc_dump_ignore(q) && 
1279                    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1280                                  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1281                        goto done;
1282                q_idx++;
1283        }
1284
1285out:
1286        *q_idx_p = q_idx;
1287        return ret;
1288done:
1289        ret = -1;
1290        goto out;
1291}
1292
1293static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1294{
1295        struct net *net = sock_net(skb->sk);
1296        int idx, q_idx;
1297        int s_idx, s_q_idx;
1298        struct net_device *dev;
1299
1300        s_idx = cb->args[0];
1301        s_q_idx = q_idx = cb->args[1];
1302
1303        rcu_read_lock();
1304        idx = 0;
1305        for_each_netdev_rcu(net, dev) {
1306                struct netdev_queue *dev_queue;
1307
1308                if (idx < s_idx)
1309                        goto cont;
1310                if (idx > s_idx)
1311                        s_q_idx = 0;
1312                q_idx = 0;
1313
1314                if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
1315                        goto done;
1316
1317                dev_queue = dev_ingress_queue(dev);
1318                if (dev_queue &&
1319                    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1320                                       &q_idx, s_q_idx) < 0)
1321                        goto done;
1322
1323cont:
1324                idx++;
1325        }
1326
1327done:
1328        rcu_read_unlock();
1329
1330        cb->args[0] = idx;
1331        cb->args[1] = q_idx;
1332
1333        return skb->len;
1334}
1335
1336
1337
1338/************************************************
1339 *      Traffic classes manipulation.           *
1340 ************************************************/
1341
1342
1343
1344static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1345{
1346        struct net *net = sock_net(skb->sk);
1347        struct tcmsg *tcm = NLMSG_DATA(n);
1348        struct nlattr *tca[TCA_MAX + 1];
1349        struct net_device *dev;
1350        struct Qdisc *q = NULL;
1351        const struct Qdisc_class_ops *cops;
1352        unsigned long cl = 0;
1353        unsigned long new_cl;
1354        u32 pid = tcm->tcm_parent;
1355        u32 clid = tcm->tcm_handle;
1356        u32 qid = TC_H_MAJ(clid);
1357        int err;
1358
1359        if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
1360                return -ENODEV;
1361
1362        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1363        if (err < 0)
1364                return err;
1365
1366        /*
1367           parent == TC_H_UNSPEC - unspecified parent.
1368           parent == TC_H_ROOT   - class is root, which has no parent.
1369           parent == X:0         - parent is root class.
1370           parent == X:Y         - parent is a node in hierarchy.
1371           parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
1372
1373           handle == 0:0         - generate handle from kernel pool.
1374           handle == 0:Y         - class is X:Y, where X:0 is qdisc.
1375           handle == X:Y         - clear.
1376           handle == X:0         - root class.
1377         */
1378
1379        /* Step 1. Determine qdisc handle X:0 */
1380
1381        if (pid != TC_H_ROOT) {
1382                u32 qid1 = TC_H_MAJ(pid);
1383
1384                if (qid && qid1) {
1385                        /* If both majors are known, they must be identical. */
1386                        if (qid != qid1)
1387                                return -EINVAL;
1388                } else if (qid1) {
1389                        qid = qid1;
1390                } else if (qid == 0)
1391                        qid = dev->qdisc->handle;
1392
1393                /* Now qid is genuine qdisc handle consistent
1394                   both with parent and child.
1395
1396                   TC_H_MAJ(pid) still may be unspecified, complete it now.
1397                 */
1398                if (pid)
1399                        pid = TC_H_MAKE(qid, pid);
1400        } else {
1401                if (qid == 0)
1402                        qid = dev->qdisc->handle;
1403        }
1404
1405        /* OK. Locate qdisc */
1406        if ((q = qdisc_lookup(dev, qid)) == NULL)
1407                return -ENOENT;
1408
1409        /* An check that it supports classes */
1410        cops = q->ops->cl_ops;
1411        if (cops == NULL)
1412                return -EINVAL;
1413
1414        /* Now try to get class */
1415        if (clid == 0) {
1416                if (pid == TC_H_ROOT)
1417                        clid = qid;
1418        } else
1419                clid = TC_H_MAKE(qid, clid);
1420
1421        if (clid)
1422                cl = cops->get(q, clid);
1423
1424        if (cl == 0) {
1425                err = -ENOENT;
1426                if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
1427                        goto out;
1428        } else {
1429                switch (n->nlmsg_type) {
1430                case RTM_NEWTCLASS:
1431                        err = -EEXIST;
1432                        if (n->nlmsg_flags&NLM_F_EXCL)
1433                                goto out;
1434                        break;
1435                case RTM_DELTCLASS:
1436                        err = -EOPNOTSUPP;
1437                        if (cops->delete)
1438                                err = cops->delete(q, cl);
1439                        if (err == 0)
1440                                tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
1441                        goto out;
1442                case RTM_GETTCLASS:
1443                        err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
1444                        goto out;
1445                default:
1446                        err = -EINVAL;
1447                        goto out;
1448                }
1449        }
1450
1451        new_cl = cl;
1452        err = -EOPNOTSUPP;
1453        if (cops->change)
1454                err = cops->change(q, clid, pid, tca, &new_cl);
1455        if (err == 0)
1456                tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
1457
1458out:
1459        if (cl)
1460                cops->put(q, cl);
1461
1462        return err;
1463}
1464
1465
1466static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1467                          unsigned long cl,
1468                          u32 pid, u32 seq, u16 flags, int event)
1469{
1470        struct tcmsg *tcm;
1471        struct nlmsghdr  *nlh;
1472        unsigned char *b = skb_tail_pointer(skb);
1473        struct gnet_dump d;
1474        const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1475
1476        nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1477        tcm = NLMSG_DATA(nlh);
1478        tcm->tcm_family = AF_UNSPEC;
1479        tcm->tcm__pad1 = 0;
1480        tcm->tcm__pad2 = 0;
1481        tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1482        tcm->tcm_parent = q->handle;
1483        tcm->tcm_handle = q->handle;
1484        tcm->tcm_info = 0;
1485        NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
1486        if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1487                goto nla_put_failure;
1488
1489        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1490                                         qdisc_root_sleeping_lock(q), &d) < 0)
1491                goto nla_put_failure;
1492
1493        if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1494                goto nla_put_failure;
1495
1496        if (gnet_stats_finish_copy(&d) < 0)
1497                goto nla_put_failure;
1498
1499        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1500        return skb->len;
1501
1502nlmsg_failure:
1503nla_put_failure:
1504        nlmsg_trim(skb, b);
1505        return -1;
1506}
1507
1508static int tclass_notify(struct net *net, struct sk_buff *oskb,
1509                         struct nlmsghdr *n, struct Qdisc *q,
1510                         unsigned long cl, int event)
1511{
1512        struct sk_buff *skb;
1513        u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1514
1515        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1516        if (!skb)
1517                return -ENOBUFS;
1518
1519        if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1520                kfree_skb(skb);
1521                return -EINVAL;
1522        }
1523
1524        return rtnetlink_send(skb, net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1525}
1526
1527struct qdisc_dump_args
1528{
1529        struct qdisc_walker w;
1530        struct sk_buff *skb;
1531        struct netlink_callback *cb;
1532};
1533
1534static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1535{
1536        struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1537
1538        return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1539                              a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1540}
1541
1542static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1543                                struct tcmsg *tcm, struct netlink_callback *cb,
1544                                int *t_p, int s_t)
1545{
1546        struct qdisc_dump_args arg;
1547
1548        if (tc_qdisc_dump_ignore(q) ||
1549            *t_p < s_t || !q->ops->cl_ops ||
1550            (tcm->tcm_parent &&
1551             TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1552                (*t_p)++;
1553                return 0;
1554        }
1555        if (*t_p > s_t)
1556                memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1557        arg.w.fn = qdisc_class_dump;
1558        arg.skb = skb;
1559        arg.cb = cb;
1560        arg.w.stop  = 0;
1561        arg.w.skip = cb->args[1];
1562        arg.w.count = 0;
1563        q->ops->cl_ops->walk(q, &arg.w);
1564        cb->args[1] = arg.w.count;
1565        if (arg.w.stop)
1566                return -1;
1567        (*t_p)++;
1568        return 0;
1569}
1570
1571static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1572                               struct tcmsg *tcm, struct netlink_callback *cb,
1573                               int *t_p, int s_t)
1574{
1575        struct Qdisc *q;
1576
1577        if (!root)
1578                return 0;
1579
1580        if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1581                return -1;
1582
1583        list_for_each_entry(q, &root->list, list) {
1584                if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1585                        return -1;
1586        }
1587
1588        return 0;
1589}
1590
1591static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1592{
1593        struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
1594        struct net *net = sock_net(skb->sk);
1595        struct netdev_queue *dev_queue;
1596        struct net_device *dev;
1597        int t, s_t;
1598
1599        if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1600                return 0;
1601        if ((dev = dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
1602                return 0;
1603
1604        s_t = cb->args[0];
1605        t = 0;
1606
1607        if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
1608                goto done;
1609
1610        dev_queue = dev_ingress_queue(dev);
1611        if (dev_queue &&
1612            tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
1613                                &t, s_t) < 0)
1614                goto done;
1615
1616done:
1617        cb->args[0] = t;
1618
1619        dev_put(dev);
1620        return skb->len;
1621}
1622
1623/* Main classifier routine: scans classifier chain attached
1624   to this qdisc, (optionally) tests for protocol and asks
1625   specific classifiers.
1626 */
1627int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
1628                       struct tcf_result *res)
1629{
1630        __be16 protocol = skb->protocol;
1631        int err = 0;
1632
1633        for (; tp; tp = tp->next) {
1634                if ((tp->protocol == protocol ||
1635                     tp->protocol == htons(ETH_P_ALL)) &&
1636                    (err = tp->classify(skb, tp, res)) >= 0) {
1637#ifdef CONFIG_NET_CLS_ACT
1638                        if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1639                                skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1640#endif
1641                        return err;
1642                }
1643        }
1644        return -1;
1645}
1646EXPORT_SYMBOL(tc_classify_compat);
1647
/*
 * Classify @skb against the classifier chain starting at @tp.
 *
 * Thin wrapper around tc_classify_compat().  With CONFIG_NET_CLS_ACT a
 * TC_ACT_RECLASSIFY result restarts classification from the head of the
 * chain; the number of restarts is tracked in skb->tc_verd and once it
 * reaches MAX_REC_LOOP the packet is reported (rate limited) and
 * TC_ACT_SHOT is returned.
 *
 * Returns the result of tc_classify_compat(), or TC_ACT_SHOT when the
 * reclassify limit is hit.
 */
int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
		struct tcf_result *res)
{
	int err;
#ifdef CONFIG_NET_CLS_ACT
	/* Head of the chain, kept so a reclassify can start over. */
	struct tcf_proto *otp = tp;
reclassify:
#endif
	err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
	if (err == TC_ACT_RECLASSIFY) {
		u32 verd = G_TC_VERD(skb->tc_verd);

		tp = otp;

		if (verd++ >= MAX_REC_LOOP) {
			if (net_ratelimit())
				printk(KERN_NOTICE
				       "%s: packet reclassify loop"
					  " rule prio %u protocol %02x\n",
				       tp->q->ops->id,
				       tp->prio & 0xffff, ntohs(tp->protocol));
			return TC_ACT_SHOT;
		}
		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
		goto reclassify;
	}
#endif
	return err;
}
EXPORT_SYMBOL(tc_classify);
1681
1682void tcf_destroy(struct tcf_proto *tp)
1683{
1684        tp->ops->destroy(tp);
1685        module_put(tp->ops->owner);
1686        kfree(tp);
1687}
1688
1689void tcf_destroy_chain(struct tcf_proto **fl)
1690{
1691        struct tcf_proto *tp;
1692
1693        while ((tp = *fl) != NULL) {
1694                *fl = tp->next;
1695                tcf_destroy(tp);
1696        }
1697}
1698EXPORT_SYMBOL(tcf_destroy_chain);
1699
1700#ifdef CONFIG_PROC_FS
1701static int psched_show(struct seq_file *seq, void *v)
1702{
1703        struct timespec ts;
1704
1705        hrtimer_get_res(CLOCK_MONOTONIC, &ts);
1706        seq_printf(seq, "%08x %08x %08x %08x\n",
1707                   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
1708                   1000000,
1709                   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
1710
1711        return 0;
1712}
1713
1714static int psched_open(struct inode *inode, struct file *file)
1715{
1716        return single_open(file, psched_show, NULL);
1717}
1718
1719static const struct file_operations psched_fops = {
1720        .owner = THIS_MODULE,
1721        .open = psched_open,
1722        .read  = seq_read,
1723        .llseek = seq_lseek,
1724        .release = single_release,
1725};
1726
1727static int __net_init psched_net_init(struct net *net)
1728{
1729        struct proc_dir_entry *e;
1730
1731        e = proc_net_fops_create(net, "psched", 0, &psched_fops);
1732        if (e == NULL)
1733                return -ENOMEM;
1734
1735        return 0;
1736}
1737
1738static void __net_exit psched_net_exit(struct net *net)
1739{
1740        proc_net_remove(net, "psched");
1741}
1742#else
1743static int __net_init psched_net_init(struct net *net)
1744{
1745        return 0;
1746}
1747
1748static void __net_exit psched_net_exit(struct net *net)
1749{
1750}
1751#endif
1752
1753static struct pernet_operations psched_net_ops = {
1754        .init = psched_net_init,
1755        .exit = psched_net_exit,
1756};
1757
1758static int __init pktsched_init(void)
1759{
1760        int err;
1761
1762        err = register_pernet_subsys(&psched_net_ops);
1763        if (err) {
1764                printk(KERN_ERR "pktsched_init: "
1765                       "cannot initialize per netns operations\n");
1766                return err;
1767        }
1768
1769        register_qdisc(&pfifo_qdisc_ops);
1770        register_qdisc(&bfifo_qdisc_ops);
1771        register_qdisc(&pfifo_head_drop_qdisc_ops);
1772        register_qdisc(&mq_qdisc_ops);
1773
1774        rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
1775        rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
1776        rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
1777        rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
1778        rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
1779        rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);
1780
1781        return 0;
1782}
1783
1784subsys_initcall(pktsched_init);
1785