/* linux/net/sched/sch_api.c */
   1/*
   2 * net/sched/sch_api.c  Packet scheduler API.
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10 *
  11 * Fixes:
  12 *
  13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
  14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
  15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
  16 */
  17
  18#include <linux/module.h>
  19#include <linux/types.h>
  20#include <linux/kernel.h>
  21#include <linux/string.h>
  22#include <linux/errno.h>
  23#include <linux/skbuff.h>
  24#include <linux/init.h>
  25#include <linux/proc_fs.h>
  26#include <linux/seq_file.h>
  27#include <linux/kmod.h>
  28#include <linux/list.h>
  29#include <linux/hrtimer.h>
  30#include <linux/lockdep.h>
  31#include <linux/slab.h>
  32
  33#include <net/net_namespace.h>
  34#include <net/sock.h>
  35#include <net/netlink.h>
  36#include <net/pkt_sched.h>
  37
  38static int qdisc_notify(struct net *net, struct sk_buff *oskb,
  39                        struct nlmsghdr *n, u32 clid,
  40                        struct Qdisc *old, struct Qdisc *new);
  41static int tclass_notify(struct net *net, struct sk_buff *oskb,
  42                         struct nlmsghdr *n, struct Qdisc *q,
  43                         unsigned long cl, int event);
  44
  45/*
  46
  47   Short review.
  48   -------------
  49
  50   This file consists of two interrelated parts:
  51
  52   1. queueing disciplines manager frontend.
  53   2. traffic classes manager frontend.
  54
  55   Generally, queueing discipline ("qdisc") is a black box,
  56   which is able to enqueue packets and to dequeue them (when
  57   device is ready to send something) in order and at times
  58   determined by algorithm hidden in it.
  59
   60   qdiscs are divided into two categories:
  61   - "queues", which have no internal structure visible from outside.
  62   - "schedulers", which split all the packets to "traffic classes",
  63     using "packet classifiers" (look at cls_api.c)
  64
   65   In turn, classes may have child qdiscs (as a rule, queues)
  66   attached to them etc. etc. etc.
  67
  68   The goal of the routines in this file is to translate
  69   information supplied by user in the form of handles
  70   to more intelligible for kernel form, to make some sanity
  71   checks and part of work, which is common to all qdiscs
  72   and to provide rtnetlink notifications.
  73
  74   All real intelligent work is done inside qdisc modules.
  75
  76
  77
  78   Every discipline has two major routines: enqueue and dequeue.
  79
  80   ---dequeue
  81
  82   dequeue usually returns a skb to send. It is allowed to return NULL,
  83   but it does not mean that queue is empty, it just means that
  84   discipline does not want to send anything this time.
  85   Queue is really empty if q->q.qlen == 0.
  86   For complicated disciplines with multiple queues q->q is not
  87   real packet queue, but however q->q.qlen must be valid.
  88
  89   ---enqueue
  90
  91   enqueue returns 0, if packet was enqueued successfully.
  92   If packet (this one or another one) was dropped, it returns
  93   not zero error code.
  94   NET_XMIT_DROP        - this packet dropped
  95     Expected action: do not backoff, but wait until queue will clear.
  96   NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
  97     Expected action: backoff or ignore
  98   NET_XMIT_POLICED     - dropped by police.
  99     Expected action: backoff or error to real-time apps.
 100
 101   Auxiliary routines:
 102
 103   ---peek
 104
 105   like dequeue but without removing a packet from the queue
 106
 107   ---reset
 108
 109   returns qdisc to initial state: purge all buffers, clear all
 110   timers, counters (except for statistics) etc.
 111
 112   ---init
 113
 114   initializes newly created qdisc.
 115
 116   ---destroy
 117
 118   destroys resources allocated by init and during lifetime of qdisc.
 119
 120   ---change
 121
 122   changes qdisc parameters.
 123 */
 124
 125/* Protects list of registered TC modules. It is pure SMP lock. */
 126static DEFINE_RWLOCK(qdisc_mod_lock);
 127
 128
 129/************************************************
 130 *      Queueing disciplines manipulation.      *
 131 ************************************************/
 132
 133
 134/* The list of all installed queueing disciplines. */
 135
 136static struct Qdisc_ops *qdisc_base;
 137
  138/* Register/unregister queueing discipline */
 139
 140int register_qdisc(struct Qdisc_ops *qops)
 141{
 142        struct Qdisc_ops *q, **qp;
 143        int rc = -EEXIST;
 144
 145        write_lock(&qdisc_mod_lock);
 146        for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
 147                if (!strcmp(qops->id, q->id))
 148                        goto out;
 149
 150        if (qops->enqueue == NULL)
 151                qops->enqueue = noop_qdisc_ops.enqueue;
 152        if (qops->peek == NULL) {
 153                if (qops->dequeue == NULL)
 154                        qops->peek = noop_qdisc_ops.peek;
 155                else
 156                        goto out_einval;
 157        }
 158        if (qops->dequeue == NULL)
 159                qops->dequeue = noop_qdisc_ops.dequeue;
 160
 161        if (qops->cl_ops) {
 162                const struct Qdisc_class_ops *cops = qops->cl_ops;
 163
 164                if (!(cops->get && cops->put && cops->walk && cops->leaf))
 165                        goto out_einval;
 166
 167                if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
 168                        goto out_einval;
 169        }
 170
 171        qops->next = NULL;
 172        *qp = qops;
 173        rc = 0;
 174out:
 175        write_unlock(&qdisc_mod_lock);
 176        return rc;
 177
 178out_einval:
 179        rc = -EINVAL;
 180        goto out;
 181}
 182EXPORT_SYMBOL(register_qdisc);
 183
 184int unregister_qdisc(struct Qdisc_ops *qops)
 185{
 186        struct Qdisc_ops *q, **qp;
 187        int err = -ENOENT;
 188
 189        write_lock(&qdisc_mod_lock);
 190        for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
 191                if (q == qops)
 192                        break;
 193        if (q) {
 194                *qp = q->next;
 195                q->next = NULL;
 196                err = 0;
 197        }
 198        write_unlock(&qdisc_mod_lock);
 199        return err;
 200}
 201EXPORT_SYMBOL(unregister_qdisc);
 202
 203/* We know handle. Find qdisc among all qdisc's attached to device
 204   (root qdisc, all its children, children of children etc.)
 205 */
 206
 207static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
 208{
 209        struct Qdisc *q;
 210
 211        if (!(root->flags & TCQ_F_BUILTIN) &&
 212            root->handle == handle)
 213                return root;
 214
 215        list_for_each_entry(q, &root->list, list) {
 216                if (q->handle == handle)
 217                        return q;
 218        }
 219        return NULL;
 220}
 221
 222static void qdisc_list_add(struct Qdisc *q)
 223{
 224        if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
 225                list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
 226}
 227
 228void qdisc_list_del(struct Qdisc *q)
 229{
 230        if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
 231                list_del(&q->list);
 232}
 233EXPORT_SYMBOL(qdisc_list_del);
 234
 235struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
 236{
 237        struct Qdisc *q;
 238
 239        q = qdisc_match_from_root(dev->qdisc, handle);
 240        if (q)
 241                goto out;
 242
 243        if (dev_ingress_queue(dev))
 244                q = qdisc_match_from_root(
 245                        dev_ingress_queue(dev)->qdisc_sleeping,
 246                        handle);
 247out:
 248        return q;
 249}
 250
 251static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
 252{
 253        unsigned long cl;
 254        struct Qdisc *leaf;
 255        const struct Qdisc_class_ops *cops = p->ops->cl_ops;
 256
 257        if (cops == NULL)
 258                return NULL;
 259        cl = cops->get(p, classid);
 260
 261        if (cl == 0)
 262                return NULL;
 263        leaf = cops->leaf(p, cl);
 264        cops->put(p, cl);
 265        return leaf;
 266}
 267
 268/* Find queueing discipline by name */
 269
 270static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
 271{
 272        struct Qdisc_ops *q = NULL;
 273
 274        if (kind) {
 275                read_lock(&qdisc_mod_lock);
 276                for (q = qdisc_base; q; q = q->next) {
 277                        if (nla_strcmp(kind, q->id) == 0) {
 278                                if (!try_module_get(q->owner))
 279                                        q = NULL;
 280                                break;
 281                        }
 282                }
 283                read_unlock(&qdisc_mod_lock);
 284        }
 285        return q;
 286}
 287
 288/* The linklayer setting were not transferred from iproute2, in older
 289 * versions, and the rate tables lookup systems have been dropped in
 290 * the kernel. To keep backward compatible with older iproute2 tc
 291 * utils, we detect the linklayer setting by detecting if the rate
 292 * table were modified.
 293 *
 294 * For linklayer ATM table entries, the rate table will be aligned to
 295 * 48 bytes, thus some table entries will contain the same value.  The
 296 * mpu (min packet unit) is also encoded into the old rate table, thus
 297 * starting from the mpu, we find low and high table entries for
 298 * mapping this cell.  If these entries contain the same value, when
 299 * the rate tables have been modified for linklayer ATM.
 300 *
 301 * This is done by rounding mpu to the nearest 48 bytes cell/entry,
 302 * and then roundup to the next cell, calc the table entry one below,
 303 * and compare.
 304 */
 305static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
 306{
 307        int low       = roundup(r->mpu, 48);
 308        int high      = roundup(low+1, 48);
 309        int cell_low  = low >> r->cell_log;
 310        int cell_high = (high >> r->cell_log) - 1;
 311
 312        /* rtab is too inaccurate at rates > 100Mbit/s */
 313        if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
 314                pr_debug("TC linklayer: Giving up ATM detection\n");
 315                return TC_LINKLAYER_ETHERNET;
 316        }
 317
 318        if ((cell_high > cell_low) && (cell_high < 256)
 319            && (rtab[cell_low] == rtab[cell_high])) {
 320                pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
 321                         cell_low, cell_high, rtab[cell_high]);
 322                return TC_LINKLAYER_ATM;
 323        }
 324        return TC_LINKLAYER_ETHERNET;
 325}
 326
 327static struct qdisc_rate_table *qdisc_rtab_list;
 328
 329struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
 330{
 331        struct qdisc_rate_table *rtab;
 332
 333        if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
 334            nla_len(tab) != TC_RTAB_SIZE)
 335                return NULL;
 336
 337        for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
 338                if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
 339                    !memcmp(&rtab->data, nla_data(tab), 1024)) {
 340                        rtab->refcnt++;
 341                        return rtab;
 342                }
 343        }
 344
 345        rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
 346        if (rtab) {
 347                rtab->rate = *r;
 348                rtab->refcnt = 1;
 349                memcpy(rtab->data, nla_data(tab), 1024);
 350                if (r->linklayer == TC_LINKLAYER_UNAWARE)
 351                        r->linklayer = __detect_linklayer(r, rtab->data);
 352                rtab->next = qdisc_rtab_list;
 353                qdisc_rtab_list = rtab;
 354        }
 355        return rtab;
 356}
 357EXPORT_SYMBOL(qdisc_get_rtab);
 358
 359void qdisc_put_rtab(struct qdisc_rate_table *tab)
 360{
 361        struct qdisc_rate_table *rtab, **rtabp;
 362
 363        if (!tab || --tab->refcnt)
 364                return;
 365
 366        for (rtabp = &qdisc_rtab_list;
 367             (rtab = *rtabp) != NULL;
 368             rtabp = &rtab->next) {
 369                if (rtab == tab) {
 370                        *rtabp = rtab->next;
 371                        kfree(rtab);
 372                        return;
 373                }
 374        }
 375}
 376EXPORT_SYMBOL(qdisc_put_rtab);
 377
/* All size tables in use, shared by refcount under qdisc_stab_lock. */
static LIST_HEAD(qdisc_stab_list);
static DEFINE_SPINLOCK(qdisc_stab_lock);

/* Netlink validation policy for TCA_STAB_* attributes. */
static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};
 385
/* Parse a TCA_STAB attribute into a size table.  An existing table
 * with identical options and data is shared via refcount; otherwise a
 * new one is allocated.  Returns ERR_PTR() on malformed input or
 * allocation failure.
 */
static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
{
	struct nlattr *tb[TCA_STAB_MAX + 1];
	struct qdisc_size_table *stab;
	struct tc_sizespec *s;
	unsigned int tsize = 0;
	u16 *tab = NULL;
	int err;

	err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
	if (err < 0)
		return ERR_PTR(err);
	if (!tb[TCA_STAB_BASE])
		return ERR_PTR(-EINVAL);

	s = nla_data(tb[TCA_STAB_BASE]);

	if (s->tsize > 0) {
		/* A non-empty table must carry its data attribute. */
		if (!tb[TCA_STAB_DATA])
			return ERR_PTR(-EINVAL);
		tab = nla_data(tb[TCA_STAB_DATA]);
		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
	}

	/* The declared size must match the attribute payload. */
	if (tsize != s->tsize || (!tab && tsize > 0))
		return ERR_PTR(-EINVAL);

	spin_lock(&qdisc_stab_lock);

	/* Reuse an existing table with identical options and data. */
	list_for_each_entry(stab, &qdisc_stab_list, list) {
		if (memcmp(&stab->szopts, s, sizeof(*s)))
			continue;
		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
			continue;
		stab->refcnt++;
		spin_unlock(&qdisc_stab_lock);
		return stab;
	}

	spin_unlock(&qdisc_stab_lock);

	/* Allocate outside the lock; the data tail is variable-sized. */
	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	stab->refcnt = 1;
	stab->szopts = *s;
	if (tsize > 0)
		memcpy(stab->data, tab, tsize * sizeof(u16));

	spin_lock(&qdisc_stab_lock);
	list_add_tail(&stab->list, &qdisc_stab_list);
	spin_unlock(&qdisc_stab_lock);

	return stab;
}
 442
 443static void stab_kfree_rcu(struct rcu_head *head)
 444{
 445        kfree(container_of(head, struct qdisc_size_table, rcu));
 446}
 447
 448void qdisc_put_stab(struct qdisc_size_table *tab)
 449{
 450        if (!tab)
 451                return;
 452
 453        spin_lock(&qdisc_stab_lock);
 454
 455        if (--tab->refcnt == 0) {
 456                list_del(&tab->list);
 457                call_rcu_bh(&tab->rcu, stab_kfree_rcu);
 458        }
 459
 460        spin_unlock(&qdisc_stab_lock);
 461}
 462EXPORT_SYMBOL(qdisc_put_stab);
 463
 464static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
 465{
 466        struct nlattr *nest;
 467
 468        nest = nla_nest_start(skb, TCA_STAB);
 469        if (nest == NULL)
 470                goto nla_put_failure;
 471        if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
 472                goto nla_put_failure;
 473        nla_nest_end(skb, nest);
 474
 475        return skb->len;
 476
 477nla_put_failure:
 478        return -1;
 479}
 480
/* Recompute the scheduler-visible packet length of @skb from size
 * table @stab and store it (never below 1) in qdisc_skb_cb->pkt_len.
 */
void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	if (unlikely(!stab->szopts.tsize))
		goto out;

	/* Map the aligned length to a table slot index. */
	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		/* Past the table end: extrapolate from the last entry
		 * plus the wrapped-around remainder slot.
		 */
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
 508
 509void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
 510{
 511        if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
 512                pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
 513                        txt, qdisc->ops->id, qdisc->handle >> 16);
 514                qdisc->flags |= TCQ_F_WARN_NONWC;
 515        }
 516}
 517EXPORT_SYMBOL(qdisc_warn_nonwc);
 518
 519static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
 520{
 521        struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
 522                                                 timer);
 523
 524        qdisc_unthrottled(wd->qdisc);
 525        __netif_schedule(qdisc_root(wd->qdisc));
 526
 527        return HRTIMER_NORESTART;
 528}
 529
 530void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
 531{
 532        hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
 533        wd->timer.function = qdisc_watchdog;
 534        wd->qdisc = qdisc;
 535}
 536EXPORT_SYMBOL(qdisc_watchdog_init);
 537
 538void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
 539{
 540        if (test_bit(__QDISC_STATE_DEACTIVATED,
 541                     &qdisc_root_sleeping(wd->qdisc)->state))
 542                return;
 543
 544        qdisc_throttled(wd->qdisc);
 545
 546        hrtimer_start(&wd->timer,
 547                      ns_to_ktime(expires),
 548                      HRTIMER_MODE_ABS);
 549}
 550EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
 551
/* Stop a pending watchdog timer and clear the qdisc's throttled state. */
void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
	qdisc_unthrottled(wd->qdisc);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
 558
 559static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
 560{
 561        unsigned int size = n * sizeof(struct hlist_head), i;
 562        struct hlist_head *h;
 563
 564        if (size <= PAGE_SIZE)
 565                h = kmalloc(size, GFP_KERNEL);
 566        else
 567                h = (struct hlist_head *)
 568                        __get_free_pages(GFP_KERNEL, get_order(size));
 569
 570        if (h != NULL) {
 571                for (i = 0; i < n; i++)
 572                        INIT_HLIST_HEAD(&h[i]);
 573        }
 574        return h;
 575}
 576
 577static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
 578{
 579        unsigned int size = n * sizeof(struct hlist_head);
 580
 581        if (size <= PAGE_SIZE)
 582                kfree(h);
 583        else
 584                free_pages((unsigned long)h, get_order(size));
 585}
 586
/* Double the class hash table once its load factor exceeds 0.75.
 * The new table is allocated outside the tree lock; the rehash and
 * table switch happen under sch_tree_lock().  Allocation failure is
 * silently tolerated (the old table keeps working).
 */
void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	/* Move every class into its bucket in the larger table. */
	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash     = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	qdisc_class_hash_free(ohash, osize);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);
 622
 623int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
 624{
 625        unsigned int size = 4;
 626
 627        clhash->hash = qdisc_class_hash_alloc(size);
 628        if (clhash->hash == NULL)
 629                return -ENOMEM;
 630        clhash->hashsize  = size;
 631        clhash->hashmask  = size - 1;
 632        clhash->hashelems = 0;
 633        return 0;
 634}
 635EXPORT_SYMBOL(qdisc_class_hash_init);
 636
/* Release the bucket array allocated by qdisc_class_hash_init(). */
void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	qdisc_class_hash_free(clhash->hash, clhash->hashsize);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);
 642
 643void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
 644                             struct Qdisc_class_common *cl)
 645{
 646        unsigned int h;
 647
 648        INIT_HLIST_NODE(&cl->hnode);
 649        h = qdisc_class_hash(cl->classid, clhash->hashmask);
 650        hlist_add_head(&cl->hnode, &clhash->hash[h]);
 651        clhash->hashelems++;
 652}
 653EXPORT_SYMBOL(qdisc_class_hash_insert);
 654
/* Unlink @cl from its hash bucket and update the element count. */
void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
 662
 663/* Allocate an unique handle from space managed by kernel
 664 * Possible range is [8000-FFFF]:0000 (0x8000 values)
 665 */
 666static u32 qdisc_alloc_handle(struct net_device *dev)
 667{
 668        int i = 0x8000;
 669        static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
 670
 671        do {
 672                autohandle += TC_H_MAKE(0x10000U, 0);
 673                if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
 674                        autohandle = TC_H_MAKE(0x80000000U, 0);
 675                if (!qdisc_lookup(dev, autohandle))
 676                        return autohandle;
 677                cond_resched();
 678        } while (--i > 0);
 679
 680        return 0;
 681}
 682
/* Propagate a queue-length decrease of @n packets from @sch up through
 * all of its ancestors, letting each classful parent react through
 * qlen_notify (e.g. to deactivate a now-empty class).
 */
void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
{
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;

	if (n == 0)
		return;
	while ((parentid = sch->parent)) {
		/* The ingress pseudo-hierarchy keeps no parent counts. */
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			return;

		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			/* Only the root may legitimately have no qdisc. */
			WARN_ON(parentid != TC_H_ROOT);
			return;
		}
		cops = sch->ops->cl_ops;
		if (cops->qlen_notify) {
			cl = cops->get(sch, parentid);
			cops->qlen_notify(sch, cl);
			cops->put(sch, cl);
		}
		sch->q.qlen -= n;
	}
}
EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
 710
 711static void notify_and_destroy(struct net *net, struct sk_buff *skb,
 712                               struct nlmsghdr *n, u32 clid,
 713                               struct Qdisc *old, struct Qdisc *new)
 714{
 715        if (new || old)
 716                qdisc_notify(net, skb, n, clid, old, new);
 717
 718        if (old)
 719                qdisc_destroy(old);
 720}
 721
/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate send a netlink notification using 'skb'
 * and "n".
 *
 * On success, destroy old qdisc.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old)
{
	struct Qdisc *q = old;
	struct net *net = dev_net(dev);
	int err = 0;

	if (parent == NULL) {
		/* Replacing a device root (egress or ingress) qdisc. */
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;
			ingress = 1;
			if (!dev_ingress_queue(dev))
				return -ENOENT;
		}

		/* Quiesce the device while its qdiscs are swapped. */
		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		/* ->attach() distributes "new" across queues itself;
		 * skip the per-queue grafting loop in that case.
		 */
		if (new && new->ops->attach) {
			new->ops->attach(new);
			num_q = 0;
		}

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = dev_ingress_queue(dev);

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			/* One extra reference per additional tx queue
			 * sharing "new".
			 */
			if (new && i > 0)
				atomic_inc(&new->refcnt);

			if (!ingress)
				qdisc_destroy(old);
		}

		if (!ingress) {
			notify_and_destroy(net, skb, n, classid,
					   dev->qdisc, new);
			if (new && !new->ops->attach)
				atomic_inc(&new->refcnt);
			dev->qdisc = new ? : &noop_qdisc;
		} else {
			notify_and_destroy(net, skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		/* Grafting into a class of a classful parent qdisc. */
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		err = -EOPNOTSUPP;
		if (cops && cops->graft) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, &old);
				cops->put(parent, cl);
			} else
				err = -ENOENT;
		}
		if (!err)
			notify_and_destroy(net, skb, n, classid, old, new);
	}
	return err;
}
 803
 804/* lockdep annotation is needed for ingress; egress gets it only for name */
 805static struct lock_class_key qdisc_tx_lock;
 806static struct lock_class_key qdisc_rx_lock;
 807
/*
   Allocate and initialize new qdisc.

   Parameters are passed via opt.  On failure NULL is returned and the
   error code is stored through @errp; -EAGAIN asks the caller to
   replay the request after a module load.
 */

static struct Qdisc *
qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
	     struct Qdisc *p, u32 parent, u32 handle,
	     struct nlattr **tca, int *errp)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the mean time.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try again qdisc_lookup_ops,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (ops == NULL)
		goto err_out;

	sch = qdisc_alloc(dev_queue, ops);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	/* Ingress qdiscs use a fixed handle and their own lockdep class;
	 * egress qdiscs may need a kernel-allocated handle.
	 */
	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
		lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
	} else {
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			err = -ENOMEM;
			if (handle == 0)
				goto err_out3;
		}
		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
		if (!netif_is_multiqueue(dev))
			sch->flags |= TCQ_F_ONETXQUEUE;
	}

	sch->handle = handle;

	/* Run the qdisc's own init, then attach the optional size table
	 * and rate estimator.
	 */
	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
		if (tca[TCA_STAB]) {
			stab = qdisc_get_stab(tca[TCA_STAB]);
			if (IS_ERR(stab)) {
				err = PTR_ERR(stab);
				goto err_out4;
			}
			rcu_assign_pointer(sch->stab, stab);
		}
		if (tca[TCA_RATE]) {
			spinlock_t *root_lock;

			err = -EOPNOTSUPP;
			if (sch->flags & TCQ_F_MQROOT)
				goto err_out4;

			if ((sch->parent != TC_H_ROOT) &&
			    !(sch->flags & TCQ_F_INGRESS) &&
			    (!p || !(p->flags & TCQ_F_MQROOT)))
				root_lock = qdisc_root_sleeping_lock(sch);
			else
				root_lock = qdisc_lock(sch);

			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
						root_lock, tca[TCA_RATE]);
			if (err)
				goto err_out4;
		}

		qdisc_list_add(sch);

		return sch;
	}
err_out3:
	dev_put(dev);
	kfree((char *) sch - sch->padded);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;

err_out4:
	/*
	 * Any broken qdiscs that would require a ops->reset() here?
	 * The qdisc was never in action so it shouldn't be necessary.
	 */
	qdisc_put_stab(rtnl_dereference(sch->stab));
	if (ops->destroy)
		ops->destroy(sch);
	goto err_out3;
}
 936
/* Apply an RTM_NEWQDISC change request to an existing qdisc: forward
 * TCA_OPTIONS to the qdisc's ->change() op, swap in a new size table
 * (TCA_STAB) and replace the rate estimator (TCA_RATE).
 * Order matters: the change op runs first so a failure there leaves
 * the old stab in place.  Called with RTNL held.
 */
static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
{
	struct qdisc_size_table *ostab, *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		/* A qdisc without a ->change() op cannot be reconfigured. */
		if (sch->ops->change == NULL)
			return -EINVAL;
		err = sch->ops->change(sch, tca[TCA_OPTIONS]);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB]);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	/* Publish the new stab (may be NULL) and drop the old reference. */
	ostab = rtnl_dereference(sch->stab);
	rcu_assign_pointer(sch->stab, stab);
	qdisc_put_stab(ostab);

	if (tca[TCA_RATE]) {
		/* NB: ignores errors from replace_estimator
		   because change can't be undone. */
		if (sch->flags & TCQ_F_MQROOT)
			goto out;
		gen_replace_estimator(&sch->bstats, &sch->rate_est,
					    qdisc_root_sleeping_lock(sch),
					    tca[TCA_RATE]);
	}
out:
	return 0;
}
 972
/* Walker state used by check_loop()/check_loop_fn() to detect whether
 * grafting qdisc 'p' somewhere below another qdisc would form a cycle.
 */
struct check_loop_arg {
	struct qdisc_walker	w;	/* embedded walker; must be first (cast in check_loop_fn) */
	struct Qdisc		*p;	/* the qdisc about to be grafted */
	int			depth;	/* current recursion depth */
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
 980
 981static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
 982{
 983        struct check_loop_arg   arg;
 984
 985        if (q->ops->cl_ops == NULL)
 986                return 0;
 987
 988        arg.w.stop = arg.w.skip = arg.w.count = 0;
 989        arg.w.fn = check_loop_fn;
 990        arg.depth = depth;
 991        arg.p = p;
 992        q->ops->cl_ops->walk(q, &arg.w);
 993        return arg.w.stop ? -ELOOP : 0;
 994}
 995
 996static int
 997check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
 998{
 999        struct Qdisc *leaf;
1000        const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1001        struct check_loop_arg *arg = (struct check_loop_arg *)w;
1002
1003        leaf = cops->leaf(q, cl);
1004        if (leaf) {
1005                if (leaf == arg->p || arg->depth > 7)
1006                        return -ELOOP;
1007                return check_loop(leaf, arg->p, arg->depth + 1);
1008        }
1009        return 0;
1010}
1011
1012/*
1013 * Delete/get qdisc.
1014 */
1015
/* Handle RTM_DELQDISC (delete a qdisc) and RTM_GETQDISC (report one).
 * Called with RTNL held.
 */
static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	/* Only GET is allowed without CAP_NET_ADMIN. */
	if ((n->nlmsg_type != RTM_GETQDISC) && !capable(CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	clid = tcm->tcm_parent;
	if (clid) {
		/* A parent was named: resolve the qdisc through it. */
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				/* Regular parent class: find the parent
				 * qdisc, then the leaf attached to the class.
				 */
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}
		if (!q)
			return -ENOENT;

		/* If the caller also named a handle, it must match. */
		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		/* No parent: look the qdisc up directly by handle. */
		q = qdisc_lookup(dev, tcm->tcm_handle);
		if (!q)
			return -ENOENT;
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		/* Refuse to delete builtin/default qdiscs (handle 0). */
		if (q->handle == 0)
			return -ENOENT;
		/* Graft NULL in q's place; this notifies and drops q. */
		err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
		if (err != 0)
			return err;
	} else {
		/* GET: echo the qdisc back to the requester. */
		qdisc_notify(net, skb, n, clid, NULL, q);
	}
	return 0;
}
1079
1080/*
1081 * Create/change qdisc.
1082 */
1083
/* Handle RTM_NEWQDISC: create a new qdisc, replace an existing one, or
 * change its parameters, depending on the NLM_F_{CREATE,REPLACE,EXCL}
 * flags.  May restart from 'replay' when qdisc_create() returns -EAGAIN
 * (module autoload).  Called with RTNL held.
 */
static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

replay:
	/* Reinit, just in case something touches this. */
	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	tcm = nlmsg_data(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;


	if (clid) {
		if (clid != TC_H_ROOT) {
			/* NOTE(review): tc_get_qdisc() compares
			 * TC_H_MAJ(clid) against TC_H_MAJ(TC_H_INGRESS)
			 * while this path compares the full handle —
			 * confirm the asymmetry is intentional.
			 */
			if (clid != TC_H_INGRESS) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue_create(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}

		/* It may be default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				/* Caller named an explicit handle. */
				if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
					return -EEXIST;
				/* A qdisc handle must have a zero minor. */
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				q = qdisc_lookup(dev, tcm->tcm_handle);
				if (!q)
					goto create_n_graft;
				if (n->nlmsg_flags & NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
					return -EINVAL;
				/* Re-grafting an existing qdisc elsewhere:
				 * refuse any arrangement that forms a cycle.
				 */
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (!q)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know, that some child q is already
				 *   attached to this parent and have choice:
				 *   either to change it or to create/graft new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if CREATE and REPLACE flags are set.
				 *
				 *   2. If EXCL is set, requestor wanted to say,
				 *   that qdisc tcm_handle is not expected
				 *   to exist, so that we choose create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   Alas, it is sort of hole in API, we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft, if
				 *   user gave KIND, which does not match existing.
				 */
				if ((n->nlmsg_flags & NLM_F_CREATE) &&
				    (n->nlmsg_flags & NLM_F_REPLACE) &&
				    ((n->nlmsg_flags & NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags & NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(net, skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags & NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS) {
		if (dev_ingress_queue(dev))
			q = qdisc_create(dev, dev_ingress_queue(dev), p,
					 tcm->tcm_parent, tcm->tcm_parent,
					 tca, &err);
		else
			err = -ENOENT;
	} else {
		struct netdev_queue *dev_queue;

		/* Let a classful parent pick the tx queue if it can. */
		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
		else if (p)
			dev_queue = p->dev_queue;
		else
			dev_queue = netdev_get_tx_queue(dev, 0);

		q = qdisc_create(dev, dev_queue, p,
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err);
	}
	if (q == NULL) {
		/* -EAGAIN: qdisc_create() loaded a module; retry lookup. */
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
	if (err) {
		/* Grafting failed: drop our reference/new qdisc. */
		if (q)
			qdisc_destroy(q);
		return err;
	}

	return 0;
}
1237
/* Fill one netlink message describing qdisc @q into @skb.
 * Returns skb->len on success, or -1 if the skb ran out of room, in
 * which case the partial message is trimmed back off.
 */
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 portid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);	/* rollback point on failure */
	struct gnet_dump d;
	struct qdisc_size_table *stab;

	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = atomic_read(&q->refcnt);	/* refcount is reported via tcm_info */
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	/* Snapshot the live queue length for the stats copy below. */
	q->qstats.qlen = q->q.qlen;

	stab = rtnl_dereference(q->stab);
	if (stab && qdisc_dump_stab(skb, stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	/* Patch the final message length now that all attributes are in. */
	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
1291
1292static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1293{
1294        return (q->flags & TCQ_F_BUILTIN) ? true : false;
1295}
1296
/* Send RTM_DELQDISC for @old and/or RTM_NEWQDISC for @new to the
 * requester and the RTNLGRP_TC multicast group.  Builtin qdiscs are
 * never announced.
 */
static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && !tc_qdisc_dump_ignore(old)) {
		if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
				  0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new && !tc_qdisc_dump_ignore(new)) {
		/* Replacing an old qdisc is flagged as NLM_F_REPLACE. */
		if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	/* An empty skb means both qdiscs were ignored/absent: error out. */
	if (skb->len)
		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
				      n->nlmsg_flags & NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}
1327
/* Dump @root and every qdisc on its ->list into a dump skb.
 * *q_idx_p counts qdiscs visited across calls; entries below s_q_idx
 * were already sent in a previous dump round and are skipped.
 * Returns -1 once tc_fill_qdisc() reports the skb is full so the
 * caller can stop and record the resume position.
 */
static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;

	if (!root)
		return 0;

	/* The root qdisc itself is entry zero. */
	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}
	/* Then every qdisc linked off the root's list. */
	list_for_each_entry(q, &root->list, list) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;
done:
	ret = -1;
	goto out;
}
1367
/* Netlink dump handler for RTM_GETQDISC: walk every device in the
 * namespace under RCU and dump each device's root and ingress qdisc
 * trees.  Resume state: cb->args[0] = device index, cb->args[1] =
 * qdisc index within that device.
 */
static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];

	rcu_read_lock();
	idx = 0;
	for_each_netdev_rcu(net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		/* Past the resume device: restart qdisc numbering. */
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;

		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
			goto done;

		dev_queue = dev_ingress_queue(dev);
		if (dev_queue &&
		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
				       &q_idx, s_q_idx) < 0)
			goto done;

cont:
		idx++;
	}

done:
	rcu_read_unlock();

	/* Record where to resume on the next dump call. */
	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}
1410
1411
1412
1413/************************************************
1414 *      Traffic classes manipulation.           *
1415 ************************************************/
1416
1417
1418
/* Handle RTM_{NEW,DEL,GET}TCLASS: resolve the owning qdisc from the
 * partially-specified parent/handle pair, then create/change, delete
 * or report the class.  Called with RTNL held.
 */
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 portid;	/* holds tcm_parent, completed to a full class id below */
	u32 clid;
	u32 qid;
	int err;

	/* Only GET is allowed without CAP_NET_ADMIN. */
	if ((n->nlmsg_type != RTM_GETTCLASS) && !capable(CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0         - parent is root class.
	   parent == X:Y         - parent is a node in hierarchy.
	   parent == 0:Y         - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0         - generate handle from kernel pool.
	   handle == 0:Y         - class is X:Y, where X:0 is qdisc.
	   handle == X:Y         - clear.
	   handle == X:0         - root class.
	 */

	/* Step 1. Determine qdisc handle X:0 */

	portid = tcm->tcm_parent;
	clid = tcm->tcm_handle;
	qid = TC_H_MAJ(clid);

	if (portid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(portid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc->handle;

		/* Now qid is genuine qdisc handle consistent
		 * both with parent and child.
		 *
		 * TC_H_MAJ(portid) still may be unspecified, complete it now.
		 */
		if (portid)
			portid = TC_H_MAKE(qid, portid);
	} else {
		if (qid == 0)
			qid = dev->qdisc->handle;
	}

	/* OK. Locate qdisc */
	q = qdisc_lookup(dev, qid);
	if (!q)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (portid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	/* cops->get() takes a class reference; released at 'out'. */
	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		/* Class not found: only NEW with NLM_F_CREATE may proceed. */
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS ||
		    !(n->nlmsg_flags & NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags & NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = -EOPNOTSUPP;
			if (cops->delete)
				err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	/* Create a new class or change an existing one. */
	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, portid, tca, &new_cl);
	if (err == 0)
		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	/* Drop the reference taken by cops->get() above. */
	if (cl)
		cops->put(q, cl);

	return err;
}
1549
1550
/* Fill one netlink message describing class @cl of qdisc @q.
 * Returns skb->len on success, or -1 if the skb overflowed (the
 * partial message is trimmed back off).
 */
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 portid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);	/* rollback point on failure */
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	/* Defaults; cl_ops->dump() below may refine parent/handle. */
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	/* Patch the final message length now that all attributes are in. */
	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
1595
1596static int tclass_notify(struct net *net, struct sk_buff *oskb,
1597                         struct nlmsghdr *n, struct Qdisc *q,
1598                         unsigned long cl, int event)
1599{
1600        struct sk_buff *skb;
1601        u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1602
1603        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1604        if (!skb)
1605                return -ENOBUFS;
1606
1607        if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1608                kfree_skb(skb);
1609                return -EINVAL;
1610        }
1611
1612        return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1613                              n->nlmsg_flags & NLM_F_ECHO);
1614}
1615
/* State threaded through the class walker while dumping classes. */
struct qdisc_dump_args {
	struct qdisc_walker	w;	/* embedded walker; must be first (cast in qdisc_class_dump) */
	struct sk_buff		*skb;	/* dump skb being filled */
	struct netlink_callback *cb;	/* netlink dump context */
};
1621
1622static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1623{
1624        struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1625
1626        return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
1627                              a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1628}
1629
/* Dump the classes of a single qdisc @q.  *t_p is the qdisc index
 * within this dump, s_t the index to resume from; cb->args[1] holds
 * the class count already dumped for the qdisc being resumed.
 * Returns -1 when the walker stopped early (skb full).
 */
static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	/* Skip builtin qdiscs, already-dumped ones, classless qdiscs,
	 * and qdiscs not matching a requested parent filter.
	 */
	if (tc_qdisc_dump_ignore(q) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	/* First qdisc past the resume point: clear the per-qdisc state
	 * stored in cb->args[1..].
	 */
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop  = 0;
	arg.w.skip = cb->args[1];	/* classes already sent this qdisc */
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}
1658
1659static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1660                               struct tcmsg *tcm, struct netlink_callback *cb,
1661                               int *t_p, int s_t)
1662{
1663        struct Qdisc *q;
1664
1665        if (!root)
1666                return 0;
1667
1668        if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1669                return -1;
1670
1671        list_for_each_entry(q, &root->list, list) {
1672                if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1673                        return -1;
1674        }
1675
1676        return 0;
1677}
1678
/* Netlink dump handler for RTM_GETTCLASS: dump the classes of every
 * qdisc (root tree and ingress tree) of the requested device.
 * Resume state: cb->args[0] = qdisc index; cb->args[1] is managed by
 * tc_dump_tclass_qdisc().
 */
static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = nlmsg_data(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
		return 0;
	/* dev_get_by_index() takes a device reference; dropped below. */
	dev = dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return 0;

	s_t = cb->args[0];
	t = 0;

	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
		goto done;

	dev_queue = dev_ingress_queue(dev);
	if (dev_queue &&
	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
				&t, s_t) < 0)
		goto done;

done:
	/* Record where to resume on the next dump call. */
	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}
1711
1712/* Main classifier routine: scans classifier chain attached
1713 * to this qdisc, (optionally) tests for protocol and asks
1714 * specific classifiers.
1715 */
1716int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
1717                       struct tcf_result *res)
1718{
1719        __be16 protocol = skb->protocol;
1720        int err;
1721
1722        for (; tp; tp = tp->next) {
1723                if (tp->protocol != protocol &&
1724                    tp->protocol != htons(ETH_P_ALL))
1725                        continue;
1726                err = tp->classify(skb, tp, res);
1727
1728                if (err >= 0) {
1729#ifdef CONFIG_NET_CLS_ACT
1730                        if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1731                                skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1732#endif
1733                        return err;
1734                }
1735        }
1736        return -1;
1737}
1738EXPORT_SYMBOL(tc_classify_compat);
1739
/* Full classifier entry point: like tc_classify_compat(), but with
 * CONFIG_NET_CLS_ACT it honours TC_ACT_RECLASSIFY by restarting at
 * the head of the chain, bounded by MAX_REC_LOOP iterations.
 */
int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
		struct tcf_result *res)
{
	int err = 0;
#ifdef CONFIG_NET_CLS_ACT
	const struct tcf_proto *otp = tp;	/* remember chain head for restarts */
reclassify:
#endif

	err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
	if (err == TC_ACT_RECLASSIFY) {
		u32 verd = G_TC_VERD(skb->tc_verd);
		tp = otp;

		/* Loop breaker: too many reclassifications — drop. */
		if (verd++ >= MAX_REC_LOOP) {
			net_notice_ratelimited("%s: packet reclassify loop rule prio %u protocol %02x\n",
					       tp->q->ops->id,
					       tp->prio & 0xffff,
					       ntohs(tp->protocol));
			return TC_ACT_SHOT;
		}
		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
		goto reclassify;
	}
#endif
	return err;
}
EXPORT_SYMBOL(tc_classify);
1769
1770void tcf_destroy(struct tcf_proto *tp)
1771{
1772        tp->ops->destroy(tp);
1773        module_put(tp->ops->owner);
1774        kfree(tp);
1775}
1776
1777void tcf_destroy_chain(struct tcf_proto **fl)
1778{
1779        struct tcf_proto *tp;
1780
1781        while ((tp = *fl) != NULL) {
1782                *fl = tp->next;
1783                tcf_destroy(tp);
1784        }
1785}
1786EXPORT_SYMBOL(tcf_destroy_chain);
1787
1788#ifdef CONFIG_PROC_FS
1789static int psched_show(struct seq_file *seq, void *v)
1790{
1791        struct timespec ts;
1792
1793        hrtimer_get_res(CLOCK_MONOTONIC, &ts);
1794        seq_printf(seq, "%08x %08x %08x %08x\n",
1795                   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
1796                   1000000,
1797                   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
1798
1799        return 0;
1800}
1801
/*
 * open() hook for /proc/net/psched; single_open() suffices because
 * the whole report is a single short line from psched_show().
 */
static int psched_open(struct inode *inode, struct file *file)
{
	return single_open(file, psched_show, NULL);
}
1806
/* File operations for /proc/net/psched: read-only seq_file. */
static const struct file_operations psched_fops = {
	.owner = THIS_MODULE,
	.open = psched_open,
	.read  = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
1814
1815static int __net_init psched_net_init(struct net *net)
1816{
1817        struct proc_dir_entry *e;
1818
1819        e = proc_create("psched", 0, net->proc_net, &psched_fops);
1820        if (e == NULL)
1821                return -ENOMEM;
1822
1823        return 0;
1824}
1825
/* Per-namespace teardown: remove the /proc/net/psched entry. */
static void __net_exit psched_net_exit(struct net *net)
{
	remove_proc_entry("psched", net->proc_net);
}
1830#else
/* No /proc support configured: nothing to create, always succeed. */
static int __net_init psched_net_init(struct net *net)
{
	return 0;
}
1835
/* No /proc support configured: nothing to remove. */
static void __net_exit psched_net_exit(struct net *net)
{
}
1839#endif
1840
/* Hook /proc/net/psched creation/removal into netns lifetime. */
static struct pernet_operations psched_net_ops = {
	.init = psched_net_init,
	.exit = psched_net_exit,
};
1845
1846static int __init pktsched_init(void)
1847{
1848        int err;
1849
1850        err = register_pernet_subsys(&psched_net_ops);
1851        if (err) {
1852                pr_err("pktsched_init: "
1853                       "cannot initialize per netns operations\n");
1854                return err;
1855        }
1856
1857        register_qdisc(&pfifo_qdisc_ops);
1858        register_qdisc(&bfifo_qdisc_ops);
1859        register_qdisc(&pfifo_head_drop_qdisc_ops);
1860        register_qdisc(&mq_qdisc_ops);
1861
1862        rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
1863        rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
1864        rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL);
1865        rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
1866        rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
1867        rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL);
1868
1869        return 0;
1870}
1871
1872subsys_initcall(pktsched_init);
1873