linux/net/sched/sch_api.c
<<
>>
Prefs
   1/*
   2 * net/sched/sch_api.c  Packet scheduler API.
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10 *
  11 * Fixes:
  12 *
  13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
  14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
  15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
  16 */
  17
  18#include <linux/module.h>
  19#include <linux/types.h>
  20#include <linux/kernel.h>
  21#include <linux/string.h>
  22#include <linux/errno.h>
  23#include <linux/skbuff.h>
  24#include <linux/init.h>
  25#include <linux/proc_fs.h>
  26#include <linux/seq_file.h>
  27#include <linux/kmod.h>
  28#include <linux/list.h>
  29#include <linux/hrtimer.h>
  30#include <linux/lockdep.h>
  31#include <linux/slab.h>
  32
  33#include <net/net_namespace.h>
  34#include <net/sock.h>
  35#include <net/netlink.h>
  36#include <net/pkt_sched.h>
  37
  38static int qdisc_notify(struct net *net, struct sk_buff *oskb,
  39                        struct nlmsghdr *n, u32 clid,
  40                        struct Qdisc *old, struct Qdisc *new);
  41static int tclass_notify(struct net *net, struct sk_buff *oskb,
  42                         struct nlmsghdr *n, struct Qdisc *q,
  43                         unsigned long cl, int event);
  44
  45/*
  46
  47   Short review.
  48   -------------
  49
  50   This file consists of two interrelated parts:
  51
  52   1. queueing disciplines manager frontend.
  53   2. traffic classes manager frontend.
  54
  55   Generally, queueing discipline ("qdisc") is a black box,
  56   which is able to enqueue packets and to dequeue them (when
  57   device is ready to send something) in order and at times
  58   determined by algorithm hidden in it.
  59
  60   qdiscs are divided into two categories:
  61   - "queues", which have no internal structure visible from outside.
  62   - "schedulers", which split all the packets into "traffic classes",
  63     using "packet classifiers" (see cls_api.c)
  64
  65   In turn, classes may have child qdiscs (as rule, queues)
  66   attached to them etc. etc. etc.
  67
  68   The goal of the routines in this file is to translate
  69   information supplied by the user in the form of handles
  70   into a form more intelligible to the kernel, to perform sanity
  71   checks and the part of the work that is common to all qdiscs,
  72   and to provide rtnetlink notifications.
  73
  74   All real intelligent work is done inside qdisc modules.
  75
  76
  77
  78   Every discipline has two major routines: enqueue and dequeue.
  79
  80   ---dequeue
  81
  82   dequeue usually returns a skb to send. It is allowed to return NULL,
  83   but it does not mean that queue is empty, it just means that
  84   discipline does not want to send anything this time.
  85   Queue is really empty if q->q.qlen == 0.
  86   For complicated disciplines with multiple queues, q->q is not
  87   a real packet queue; however, q->q.qlen must still be valid.
  88
  89   ---enqueue
  90
  91   enqueue returns 0 if the packet was enqueued successfully.
  92   If a packet (this one or another one) was dropped, it returns
  93   a non-zero error code.
  94   NET_XMIT_DROP        - this packet dropped
  95     Expected action: do not backoff, but wait until queue will clear.
  96   NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
  97     Expected action: backoff or ignore
  98   NET_XMIT_POLICED     - dropped by police.
  99     Expected action: backoff or error to real-time apps.
 100
 101   Auxiliary routines:
 102
 103   ---peek
 104
 105   like dequeue but without removing a packet from the queue
 106
 107   ---reset
 108
 109   returns qdisc to initial state: purge all buffers, clear all
 110   timers, counters (except for statistics) etc.
 111
 112   ---init
 113
 114   initializes newly created qdisc.
 115
 116   ---destroy
 117
 118   destroys resources allocated by init and during lifetime of qdisc.
 119
 120   ---change
 121
 122   changes qdisc parameters.
 123 */
 124
 125/* Protects the list of registered TC modules. It is a pure SMP lock. */
 126static DEFINE_RWLOCK(qdisc_mod_lock);
 127
 128
 129/************************************************
 130 *      Queueing disciplines manipulation.      *
 131 ************************************************/
 132
 133
 134/* The list of all installed queueing disciplines. */
 135
 136static struct Qdisc_ops *qdisc_base;
 137
 138/* Register/uregister queueing discipline */
 139
 140int register_qdisc(struct Qdisc_ops *qops)
 141{
 142        struct Qdisc_ops *q, **qp;
 143        int rc = -EEXIST;
 144
 145        write_lock(&qdisc_mod_lock);
 146        for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
 147                if (!strcmp(qops->id, q->id))
 148                        goto out;
 149
 150        if (qops->enqueue == NULL)
 151                qops->enqueue = noop_qdisc_ops.enqueue;
 152        if (qops->peek == NULL) {
 153                if (qops->dequeue == NULL)
 154                        qops->peek = noop_qdisc_ops.peek;
 155                else
 156                        goto out_einval;
 157        }
 158        if (qops->dequeue == NULL)
 159                qops->dequeue = noop_qdisc_ops.dequeue;
 160
 161        if (qops->cl_ops) {
 162                const struct Qdisc_class_ops *cops = qops->cl_ops;
 163
 164                if (!(cops->get && cops->put && cops->walk && cops->leaf))
 165                        goto out_einval;
 166
 167                if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
 168                        goto out_einval;
 169        }
 170
 171        qops->next = NULL;
 172        *qp = qops;
 173        rc = 0;
 174out:
 175        write_unlock(&qdisc_mod_lock);
 176        return rc;
 177
 178out_einval:
 179        rc = -EINVAL;
 180        goto out;
 181}
 182EXPORT_SYMBOL(register_qdisc);
 183
 184int unregister_qdisc(struct Qdisc_ops *qops)
 185{
 186        struct Qdisc_ops *q, **qp;
 187        int err = -ENOENT;
 188
 189        write_lock(&qdisc_mod_lock);
 190        for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
 191                if (q == qops)
 192                        break;
 193        if (q) {
 194                *qp = q->next;
 195                q->next = NULL;
 196                err = 0;
 197        }
 198        write_unlock(&qdisc_mod_lock);
 199        return err;
 200}
 201EXPORT_SYMBOL(unregister_qdisc);
 202
 203/* We know handle. Find qdisc among all qdisc's attached to device
 204   (root qdisc, all its children, children of children etc.)
 205 */
 206
 207static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
 208{
 209        struct Qdisc *q;
 210
 211        if (!(root->flags & TCQ_F_BUILTIN) &&
 212            root->handle == handle)
 213                return root;
 214
 215        list_for_each_entry(q, &root->list, list) {
 216                if (q->handle == handle)
 217                        return q;
 218        }
 219        return NULL;
 220}
 221
 222static void qdisc_list_add(struct Qdisc *q)
 223{
 224        if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
 225                list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
 226}
 227
 228void qdisc_list_del(struct Qdisc *q)
 229{
 230        if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
 231                list_del(&q->list);
 232}
 233EXPORT_SYMBOL(qdisc_list_del);
 234
 235struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
 236{
 237        struct Qdisc *q;
 238
 239        q = qdisc_match_from_root(dev->qdisc, handle);
 240        if (q)
 241                goto out;
 242
 243        if (dev_ingress_queue(dev))
 244                q = qdisc_match_from_root(
 245                        dev_ingress_queue(dev)->qdisc_sleeping,
 246                        handle);
 247out:
 248        return q;
 249}
 250
 251static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
 252{
 253        unsigned long cl;
 254        struct Qdisc *leaf;
 255        const struct Qdisc_class_ops *cops = p->ops->cl_ops;
 256
 257        if (cops == NULL)
 258                return NULL;
 259        cl = cops->get(p, classid);
 260
 261        if (cl == 0)
 262                return NULL;
 263        leaf = cops->leaf(p, cl);
 264        cops->put(p, cl);
 265        return leaf;
 266}
 267
 268/* Find queueing discipline by name */
 269
 270static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
 271{
 272        struct Qdisc_ops *q = NULL;
 273
 274        if (kind) {
 275                read_lock(&qdisc_mod_lock);
 276                for (q = qdisc_base; q; q = q->next) {
 277                        if (nla_strcmp(kind, q->id) == 0) {
 278                                if (!try_module_get(q->owner))
 279                                        q = NULL;
 280                                break;
 281                        }
 282                }
 283                read_unlock(&qdisc_mod_lock);
 284        }
 285        return q;
 286}
 287
 288static struct qdisc_rate_table *qdisc_rtab_list;
 289
 290struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
 291{
 292        struct qdisc_rate_table *rtab;
 293
 294        for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
 295                if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
 296                        rtab->refcnt++;
 297                        return rtab;
 298                }
 299        }
 300
 301        if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
 302            nla_len(tab) != TC_RTAB_SIZE)
 303                return NULL;
 304
 305        rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
 306        if (rtab) {
 307                rtab->rate = *r;
 308                rtab->refcnt = 1;
 309                memcpy(rtab->data, nla_data(tab), 1024);
 310                rtab->next = qdisc_rtab_list;
 311                qdisc_rtab_list = rtab;
 312        }
 313        return rtab;
 314}
 315EXPORT_SYMBOL(qdisc_get_rtab);
 316
 317void qdisc_put_rtab(struct qdisc_rate_table *tab)
 318{
 319        struct qdisc_rate_table *rtab, **rtabp;
 320
 321        if (!tab || --tab->refcnt)
 322                return;
 323
 324        for (rtabp = &qdisc_rtab_list;
 325             (rtab = *rtabp) != NULL;
 326             rtabp = &rtab->next) {
 327                if (rtab == tab) {
 328                        *rtabp = rtab->next;
 329                        kfree(rtab);
 330                        return;
 331                }
 332        }
 333}
 334EXPORT_SYMBOL(qdisc_put_rtab);
 335
 336static LIST_HEAD(qdisc_stab_list);
 337static DEFINE_SPINLOCK(qdisc_stab_lock);
 338
 339static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
 340        [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
 341        [TCA_STAB_DATA] = { .type = NLA_BINARY },
 342};
 343
 344static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
 345{
 346        struct nlattr *tb[TCA_STAB_MAX + 1];
 347        struct qdisc_size_table *stab;
 348        struct tc_sizespec *s;
 349        unsigned int tsize = 0;
 350        u16 *tab = NULL;
 351        int err;
 352
 353        err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
 354        if (err < 0)
 355                return ERR_PTR(err);
 356        if (!tb[TCA_STAB_BASE])
 357                return ERR_PTR(-EINVAL);
 358
 359        s = nla_data(tb[TCA_STAB_BASE]);
 360
 361        if (s->tsize > 0) {
 362                if (!tb[TCA_STAB_DATA])
 363                        return ERR_PTR(-EINVAL);
 364                tab = nla_data(tb[TCA_STAB_DATA]);
 365                tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
 366        }
 367
 368        if (tsize != s->tsize || (!tab && tsize > 0))
 369                return ERR_PTR(-EINVAL);
 370
 371        spin_lock(&qdisc_stab_lock);
 372
 373        list_for_each_entry(stab, &qdisc_stab_list, list) {
 374                if (memcmp(&stab->szopts, s, sizeof(*s)))
 375                        continue;
 376                if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
 377                        continue;
 378                stab->refcnt++;
 379                spin_unlock(&qdisc_stab_lock);
 380                return stab;
 381        }
 382
 383        spin_unlock(&qdisc_stab_lock);
 384
 385        stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
 386        if (!stab)
 387                return ERR_PTR(-ENOMEM);
 388
 389        stab->refcnt = 1;
 390        stab->szopts = *s;
 391        if (tsize > 0)
 392                memcpy(stab->data, tab, tsize * sizeof(u16));
 393
 394        spin_lock(&qdisc_stab_lock);
 395        list_add_tail(&stab->list, &qdisc_stab_list);
 396        spin_unlock(&qdisc_stab_lock);
 397
 398        return stab;
 399}
 400
 401static void stab_kfree_rcu(struct rcu_head *head)
 402{
 403        kfree(container_of(head, struct qdisc_size_table, rcu));
 404}
 405
 406void qdisc_put_stab(struct qdisc_size_table *tab)
 407{
 408        if (!tab)
 409                return;
 410
 411        spin_lock(&qdisc_stab_lock);
 412
 413        if (--tab->refcnt == 0) {
 414                list_del(&tab->list);
 415                call_rcu_bh(&tab->rcu, stab_kfree_rcu);
 416        }
 417
 418        spin_unlock(&qdisc_stab_lock);
 419}
 420EXPORT_SYMBOL(qdisc_put_stab);
 421
 422static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
 423{
 424        struct nlattr *nest;
 425
 426        nest = nla_nest_start(skb, TCA_STAB);
 427        if (nest == NULL)
 428                goto nla_put_failure;
 429        if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
 430                goto nla_put_failure;
 431        nla_nest_end(skb, nest);
 432
 433        return skb->len;
 434
 435nla_put_failure:
 436        return -1;
 437}
 438
 439void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
 440{
 441        int pkt_len, slot;
 442
 443        pkt_len = skb->len + stab->szopts.overhead;
 444        if (unlikely(!stab->szopts.tsize))
 445                goto out;
 446
 447        slot = pkt_len + stab->szopts.cell_align;
 448        if (unlikely(slot < 0))
 449                slot = 0;
 450
 451        slot >>= stab->szopts.cell_log;
 452        if (likely(slot < stab->szopts.tsize))
 453                pkt_len = stab->data[slot];
 454        else
 455                pkt_len = stab->data[stab->szopts.tsize - 1] *
 456                                (slot / stab->szopts.tsize) +
 457                                stab->data[slot % stab->szopts.tsize];
 458
 459        pkt_len <<= stab->szopts.size_log;
 460out:
 461        if (unlikely(pkt_len < 1))
 462                pkt_len = 1;
 463        qdisc_skb_cb(skb)->pkt_len = pkt_len;
 464}
 465EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
 466
 467void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
 468{
 469        if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
 470                pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
 471                        txt, qdisc->ops->id, qdisc->handle >> 16);
 472                qdisc->flags |= TCQ_F_WARN_NONWC;
 473        }
 474}
 475EXPORT_SYMBOL(qdisc_warn_nonwc);
 476
 477static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
 478{
 479        struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
 480                                                 timer);
 481
 482        qdisc_unthrottled(wd->qdisc);
 483        __netif_schedule(qdisc_root(wd->qdisc));
 484
 485        return HRTIMER_NORESTART;
 486}
 487
 488void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
 489{
 490        hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
 491        wd->timer.function = qdisc_watchdog;
 492        wd->qdisc = qdisc;
 493}
 494EXPORT_SYMBOL(qdisc_watchdog_init);
 495
 496void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
 497{
 498        ktime_t time;
 499
 500        if (test_bit(__QDISC_STATE_DEACTIVATED,
 501                     &qdisc_root_sleeping(wd->qdisc)->state))
 502                return;
 503
 504        qdisc_throttled(wd->qdisc);
 505        time = ktime_set(0, 0);
 506        time = ktime_add_ns(time, PSCHED_TICKS2NS(expires));
 507        hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
 508}
 509EXPORT_SYMBOL(qdisc_watchdog_schedule);
 510
 511void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
 512{
 513        hrtimer_cancel(&wd->timer);
 514        qdisc_unthrottled(wd->qdisc);
 515}
 516EXPORT_SYMBOL(qdisc_watchdog_cancel);
 517
 518static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
 519{
 520        unsigned int size = n * sizeof(struct hlist_head), i;
 521        struct hlist_head *h;
 522
 523        if (size <= PAGE_SIZE)
 524                h = kmalloc(size, GFP_KERNEL);
 525        else
 526                h = (struct hlist_head *)
 527                        __get_free_pages(GFP_KERNEL, get_order(size));
 528
 529        if (h != NULL) {
 530                for (i = 0; i < n; i++)
 531                        INIT_HLIST_HEAD(&h[i]);
 532        }
 533        return h;
 534}
 535
 536static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
 537{
 538        unsigned int size = n * sizeof(struct hlist_head);
 539
 540        if (size <= PAGE_SIZE)
 541                kfree(h);
 542        else
 543                free_pages((unsigned long)h, get_order(size));
 544}
 545
 546void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
 547{
 548        struct Qdisc_class_common *cl;
 549        struct hlist_node *n, *next;
 550        struct hlist_head *nhash, *ohash;
 551        unsigned int nsize, nmask, osize;
 552        unsigned int i, h;
 553
 554        /* Rehash when load factor exceeds 0.75 */
 555        if (clhash->hashelems * 4 <= clhash->hashsize * 3)
 556                return;
 557        nsize = clhash->hashsize * 2;
 558        nmask = nsize - 1;
 559        nhash = qdisc_class_hash_alloc(nsize);
 560        if (nhash == NULL)
 561                return;
 562
 563        ohash = clhash->hash;
 564        osize = clhash->hashsize;
 565
 566        sch_tree_lock(sch);
 567        for (i = 0; i < osize; i++) {
 568                hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
 569                        h = qdisc_class_hash(cl->classid, nmask);
 570                        hlist_add_head(&cl->hnode, &nhash[h]);
 571                }
 572        }
 573        clhash->hash     = nhash;
 574        clhash->hashsize = nsize;
 575        clhash->hashmask = nmask;
 576        sch_tree_unlock(sch);
 577
 578        qdisc_class_hash_free(ohash, osize);
 579}
 580EXPORT_SYMBOL(qdisc_class_hash_grow);
 581
 582int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
 583{
 584        unsigned int size = 4;
 585
 586        clhash->hash = qdisc_class_hash_alloc(size);
 587        if (clhash->hash == NULL)
 588                return -ENOMEM;
 589        clhash->hashsize  = size;
 590        clhash->hashmask  = size - 1;
 591        clhash->hashelems = 0;
 592        return 0;
 593}
 594EXPORT_SYMBOL(qdisc_class_hash_init);
 595
 596void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
 597{
 598        qdisc_class_hash_free(clhash->hash, clhash->hashsize);
 599}
 600EXPORT_SYMBOL(qdisc_class_hash_destroy);
 601
 602void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
 603                             struct Qdisc_class_common *cl)
 604{
 605        unsigned int h;
 606
 607        INIT_HLIST_NODE(&cl->hnode);
 608        h = qdisc_class_hash(cl->classid, clhash->hashmask);
 609        hlist_add_head(&cl->hnode, &clhash->hash[h]);
 610        clhash->hashelems++;
 611}
 612EXPORT_SYMBOL(qdisc_class_hash_insert);
 613
 614void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
 615                             struct Qdisc_class_common *cl)
 616{
 617        hlist_del(&cl->hnode);
 618        clhash->hashelems--;
 619}
 620EXPORT_SYMBOL(qdisc_class_hash_remove);
 621
 622/* Allocate an unique handle from space managed by kernel
 623 * Possible range is [8000-FFFF]:0000 (0x8000 values)
 624 */
 625static u32 qdisc_alloc_handle(struct net_device *dev)
 626{
 627        int i = 0x8000;
 628        static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
 629
 630        do {
 631                autohandle += TC_H_MAKE(0x10000U, 0);
 632                if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
 633                        autohandle = TC_H_MAKE(0x80000000U, 0);
 634                if (!qdisc_lookup(dev, autohandle))
 635                        return autohandle;
 636                cond_resched();
 637        } while (--i > 0);
 638
 639        return 0;
 640}
 641
 642void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
 643{
 644        const struct Qdisc_class_ops *cops;
 645        unsigned long cl;
 646        u32 parentid;
 647
 648        if (n == 0)
 649                return;
 650        while ((parentid = sch->parent)) {
 651                if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
 652                        return;
 653
 654                sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
 655                if (sch == NULL) {
 656                        WARN_ON(parentid != TC_H_ROOT);
 657                        return;
 658                }
 659                cops = sch->ops->cl_ops;
 660                if (cops->qlen_notify) {
 661                        cl = cops->get(sch, parentid);
 662                        cops->qlen_notify(sch, cl);
 663                        cops->put(sch, cl);
 664                }
 665                sch->q.qlen -= n;
 666        }
 667}
 668EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
 669
 670static void notify_and_destroy(struct net *net, struct sk_buff *skb,
 671                               struct nlmsghdr *n, u32 clid,
 672                               struct Qdisc *old, struct Qdisc *new)
 673{
 674        if (new || old)
 675                qdisc_notify(net, skb, n, clid, old, new);
 676
 677        if (old)
 678                qdisc_destroy(old);
 679}
 680
 681/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 682 * to device "dev".
 683 *
 684 * When appropriate send a netlink notification using 'skb'
 685 * and "n".
 686 *
 687 * On success, destroy old qdisc.
 688 */
 689
 690static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
 691                       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
 692                       struct Qdisc *new, struct Qdisc *old)
 693{
 694        struct Qdisc *q = old;
 695        struct net *net = dev_net(dev);
 696        int err = 0;
 697
 698        if (parent == NULL) {
 699                unsigned int i, num_q, ingress;
 700
 701                ingress = 0;
 702                num_q = dev->num_tx_queues;
 703                if ((q && q->flags & TCQ_F_INGRESS) ||
 704                    (new && new->flags & TCQ_F_INGRESS)) {
 705                        num_q = 1;
 706                        ingress = 1;
 707                        if (!dev_ingress_queue(dev))
 708                                return -ENOENT;
 709                }
 710
 711                if (dev->flags & IFF_UP)
 712                        dev_deactivate(dev);
 713
 714                if (new && new->ops->attach) {
 715                        new->ops->attach(new);
 716                        num_q = 0;
 717                }
 718
 719                for (i = 0; i < num_q; i++) {
 720                        struct netdev_queue *dev_queue = dev_ingress_queue(dev);
 721
 722                        if (!ingress)
 723                                dev_queue = netdev_get_tx_queue(dev, i);
 724
 725                        old = dev_graft_qdisc(dev_queue, new);
 726                        if (new && i > 0)
 727                                atomic_inc(&new->refcnt);
 728
 729                        if (!ingress)
 730                                qdisc_destroy(old);
 731                }
 732
 733                if (!ingress) {
 734                        notify_and_destroy(net, skb, n, classid,
 735                                           dev->qdisc, new);
 736                        if (new && !new->ops->attach)
 737                                atomic_inc(&new->refcnt);
 738                        dev->qdisc = new ? : &noop_qdisc;
 739                } else {
 740                        notify_and_destroy(net, skb, n, classid, old, new);
 741                }
 742
 743                if (dev->flags & IFF_UP)
 744                        dev_activate(dev);
 745        } else {
 746                const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
 747
 748                err = -EOPNOTSUPP;
 749                if (cops && cops->graft) {
 750                        unsigned long cl = cops->get(parent, classid);
 751                        if (cl) {
 752                                err = cops->graft(parent, cl, new, &old);
 753                                cops->put(parent, cl);
 754                        } else
 755                                err = -ENOENT;
 756                }
 757                if (!err)
 758                        notify_and_destroy(net, skb, n, classid, old, new);
 759        }
 760        return err;
 761}
 762
 763/* lockdep annotation is needed for ingress; egress gets it only for name */
 764static struct lock_class_key qdisc_tx_lock;
 765static struct lock_class_key qdisc_rx_lock;
 766
 767/*
 768   Allocate and initialize new qdisc.
 769
 770   Parameters are passed via opt.
 771 */
 772
 773static struct Qdisc *
 774qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
 775             struct Qdisc *p, u32 parent, u32 handle,
 776             struct nlattr **tca, int *errp)
 777{
 778        int err;
 779        struct nlattr *kind = tca[TCA_KIND];
 780        struct Qdisc *sch;
 781        struct Qdisc_ops *ops;
 782        struct qdisc_size_table *stab;
 783
 784        ops = qdisc_lookup_ops(kind);
 785#ifdef CONFIG_MODULES
 786        if (ops == NULL && kind != NULL) {
 787                char name[IFNAMSIZ];
 788                if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
 789                        /* We dropped the RTNL semaphore in order to
 790                         * perform the module load.  So, even if we
 791                         * succeeded in loading the module we have to
 792                         * tell the caller to replay the request.  We
 793                         * indicate this using -EAGAIN.
 794                         * We replay the request because the device may
 795                         * go away in the mean time.
 796                         */
 797                        rtnl_unlock();
 798                        request_module("sch_%s", name);
 799                        rtnl_lock();
 800                        ops = qdisc_lookup_ops(kind);
 801                        if (ops != NULL) {
 802                                /* We will try again qdisc_lookup_ops,
 803                                 * so don't keep a reference.
 804                                 */
 805                                module_put(ops->owner);
 806                                err = -EAGAIN;
 807                                goto err_out;
 808                        }
 809                }
 810        }
 811#endif
 812
 813        err = -ENOENT;
 814        if (ops == NULL)
 815                goto err_out;
 816
 817        sch = qdisc_alloc(dev_queue, ops);
 818        if (IS_ERR(sch)) {
 819                err = PTR_ERR(sch);
 820                goto err_out2;
 821        }
 822
 823        sch->parent = parent;
 824
 825        if (handle == TC_H_INGRESS) {
 826                sch->flags |= TCQ_F_INGRESS;
 827                handle = TC_H_MAKE(TC_H_INGRESS, 0);
 828                lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
 829        } else {
 830                if (handle == 0) {
 831                        handle = qdisc_alloc_handle(dev);
 832                        err = -ENOMEM;
 833                        if (handle == 0)
 834                                goto err_out3;
 835                }
 836                lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
 837        }
 838
 839        sch->handle = handle;
 840
 841        if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
 842                if (tca[TCA_STAB]) {
 843                        stab = qdisc_get_stab(tca[TCA_STAB]);
 844                        if (IS_ERR(stab)) {
 845                                err = PTR_ERR(stab);
 846                                goto err_out4;
 847                        }
 848                        rcu_assign_pointer(sch->stab, stab);
 849                }
 850                if (tca[TCA_RATE]) {
 851                        spinlock_t *root_lock;
 852
 853                        err = -EOPNOTSUPP;
 854                        if (sch->flags & TCQ_F_MQROOT)
 855                                goto err_out4;
 856
 857                        if ((sch->parent != TC_H_ROOT) &&
 858                            !(sch->flags & TCQ_F_INGRESS) &&
 859                            (!p || !(p->flags & TCQ_F_MQROOT)))
 860                                root_lock = qdisc_root_sleeping_lock(sch);
 861                        else
 862                                root_lock = qdisc_lock(sch);
 863
 864                        err = gen_new_estimator(&sch->bstats, &sch->rate_est,
 865                                                root_lock, tca[TCA_RATE]);
 866                        if (err)
 867                                goto err_out4;
 868                }
 869
 870                qdisc_list_add(sch);
 871
 872                return sch;
 873        }
 874err_out3:
 875        dev_put(dev);
 876        kfree((char *) sch - sch->padded);
 877err_out2:
 878        module_put(ops->owner);
 879err_out:
 880        *errp = err;
 881        return NULL;
 882
 883err_out4:
 884        /*
 885         * Any broken qdiscs that would require a ops->reset() here?
 886         * The qdisc was never in action so it shouldn't be necessary.
 887         */
 888        qdisc_put_stab(rtnl_dereference(sch->stab));
 889        if (ops->destroy)
 890                ops->destroy(sch);
 891        goto err_out3;
 892}
 893
 894static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
 895{
 896        struct qdisc_size_table *ostab, *stab = NULL;
 897        int err = 0;
 898
 899        if (tca[TCA_OPTIONS]) {
 900                if (sch->ops->change == NULL)
 901                        return -EINVAL;
 902                err = sch->ops->change(sch, tca[TCA_OPTIONS]);
 903                if (err)
 904                        return err;
 905        }
 906
 907        if (tca[TCA_STAB]) {
 908                stab = qdisc_get_stab(tca[TCA_STAB]);
 909                if (IS_ERR(stab))
 910                        return PTR_ERR(stab);
 911        }
 912
 913        ostab = rtnl_dereference(sch->stab);
 914        rcu_assign_pointer(sch->stab, stab);
 915        qdisc_put_stab(ostab);
 916
 917        if (tca[TCA_RATE]) {
 918                /* NB: ignores errors from replace_estimator
 919                   because change can't be undone. */
 920                if (sch->flags & TCQ_F_MQROOT)
 921                        goto out;
 922                gen_replace_estimator(&sch->bstats, &sch->rate_est,
 923                                            qdisc_root_sleeping_lock(sch),
 924                                            tca[TCA_RATE]);
 925        }
 926out:
 927        return 0;
 928}
 929
 930struct check_loop_arg {
 931        struct qdisc_walker     w;
 932        struct Qdisc            *p;
 933        int                     depth;
 934};
 935
 936static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
 937
 938static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
 939{
 940        struct check_loop_arg   arg;
 941
 942        if (q->ops->cl_ops == NULL)
 943                return 0;
 944
 945        arg.w.stop = arg.w.skip = arg.w.count = 0;
 946        arg.w.fn = check_loop_fn;
 947        arg.depth = depth;
 948        arg.p = p;
 949        q->ops->cl_ops->walk(q, &arg.w);
 950        return arg.w.stop ? -ELOOP : 0;
 951}
 952
 953static int
 954check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
 955{
 956        struct Qdisc *leaf;
 957        const struct Qdisc_class_ops *cops = q->ops->cl_ops;
 958        struct check_loop_arg *arg = (struct check_loop_arg *)w;
 959
 960        leaf = cops->leaf(q, cl);
 961        if (leaf) {
 962                if (leaf == arg->p || arg->depth > 7)
 963                        return -ELOOP;
 964                return check_loop(leaf, arg->p, arg->depth + 1);
 965        }
 966        return 0;
 967}
 968
 969/*
 970 * Delete/get qdisc.
 971 */
 972
 973static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 974{
 975        struct net *net = sock_net(skb->sk);
 976        struct tcmsg *tcm = nlmsg_data(n);
 977        struct nlattr *tca[TCA_MAX + 1];
 978        struct net_device *dev;
 979        u32 clid = tcm->tcm_parent;
 980        struct Qdisc *q = NULL;
 981        struct Qdisc *p = NULL;
 982        int err;
 983
 984        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
 985        if (!dev)
 986                return -ENODEV;
 987
 988        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
 989        if (err < 0)
 990                return err;
 991
 992        if (clid) {
 993                if (clid != TC_H_ROOT) {
 994                        if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
 995                                p = qdisc_lookup(dev, TC_H_MAJ(clid));
 996                                if (!p)
 997                                        return -ENOENT;
 998                                q = qdisc_leaf(p, clid);
 999                        } else if (dev_ingress_queue(dev)) {
1000                                q = dev_ingress_queue(dev)->qdisc_sleeping;
1001                        }
1002                } else {
1003                        q = dev->qdisc;
1004                }
1005                if (!q)
1006                        return -ENOENT;
1007
1008                if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
1009                        return -EINVAL;
1010        } else {
1011                q = qdisc_lookup(dev, tcm->tcm_handle);
1012                if (!q)
1013                        return -ENOENT;
1014        }
1015
1016        if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1017                return -EINVAL;
1018
1019        if (n->nlmsg_type == RTM_DELQDISC) {
1020                if (!clid)
1021                        return -EINVAL;
1022                if (q->handle == 0)
1023                        return -ENOENT;
1024                err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
1025                if (err != 0)
1026                        return err;
1027        } else {
1028                qdisc_notify(net, skb, n, clid, NULL, q);
1029        }
1030        return 0;
1031}
1032
1033/*
1034 * Create/change qdisc.
1035 */
1036
1037static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1038{
1039        struct net *net = sock_net(skb->sk);
1040        struct tcmsg *tcm;
1041        struct nlattr *tca[TCA_MAX + 1];
1042        struct net_device *dev;
1043        u32 clid;
1044        struct Qdisc *q, *p;
1045        int err;
1046
1047replay:
1048        /* Reinit, just in case something touches this. */
1049        tcm = nlmsg_data(n);
1050        clid = tcm->tcm_parent;
1051        q = p = NULL;
1052
1053        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1054        if (!dev)
1055                return -ENODEV;
1056
1057        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1058        if (err < 0)
1059                return err;
1060
1061        if (clid) {
1062                if (clid != TC_H_ROOT) {
1063                        if (clid != TC_H_INGRESS) {
1064                                p = qdisc_lookup(dev, TC_H_MAJ(clid));
1065                                if (!p)
1066                                        return -ENOENT;
1067                                q = qdisc_leaf(p, clid);
1068                        } else if (dev_ingress_queue_create(dev)) {
1069                                q = dev_ingress_queue(dev)->qdisc_sleeping;
1070                        }
1071                } else {
1072                        q = dev->qdisc;
1073                }
1074
1075                /* It may be default qdisc, ignore it */
1076                if (q && q->handle == 0)
1077                        q = NULL;
1078
1079                if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1080                        if (tcm->tcm_handle) {
1081                                if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
1082                                        return -EEXIST;
1083                                if (TC_H_MIN(tcm->tcm_handle))
1084                                        return -EINVAL;
1085                                q = qdisc_lookup(dev, tcm->tcm_handle);
1086                                if (!q)
1087                                        goto create_n_graft;
1088                                if (n->nlmsg_flags & NLM_F_EXCL)
1089                                        return -EEXIST;
1090                                if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1091                                        return -EINVAL;
1092                                if (q == p ||
1093                                    (p && check_loop(q, p, 0)))
1094                                        return -ELOOP;
1095                                atomic_inc(&q->refcnt);
1096                                goto graft;
1097                        } else {
1098                                if (!q)
1099                                        goto create_n_graft;
1100
1101                                /* This magic test requires explanation.
1102                                 *
1103                                 *   We know, that some child q is already
1104                                 *   attached to this parent and have choice:
1105                                 *   either to change it or to create/graft new one.
1106                                 *
1107                                 *   1. We are allowed to create/graft only
1108                                 *   if CREATE and REPLACE flags are set.
1109                                 *
1110                                 *   2. If EXCL is set, requestor wanted to say,
1111                                 *   that qdisc tcm_handle is not expected
1112                                 *   to exist, so that we choose create/graft too.
1113                                 *
1114                                 *   3. The last case is when no flags are set.
1115                                 *   Alas, it is sort of hole in API, we
1116                                 *   cannot decide what to do unambiguously.
1117                                 *   For now we select create/graft, if
1118                                 *   user gave KIND, which does not match existing.
1119                                 */
1120                                if ((n->nlmsg_flags & NLM_F_CREATE) &&
1121                                    (n->nlmsg_flags & NLM_F_REPLACE) &&
1122                                    ((n->nlmsg_flags & NLM_F_EXCL) ||
1123                                     (tca[TCA_KIND] &&
1124                                      nla_strcmp(tca[TCA_KIND], q->ops->id))))
1125                                        goto create_n_graft;
1126                        }
1127                }
1128        } else {
1129                if (!tcm->tcm_handle)
1130                        return -EINVAL;
1131                q = qdisc_lookup(dev, tcm->tcm_handle);
1132        }
1133
1134        /* Change qdisc parameters */
1135        if (q == NULL)
1136                return -ENOENT;
1137        if (n->nlmsg_flags & NLM_F_EXCL)
1138                return -EEXIST;
1139        if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1140                return -EINVAL;
1141        err = qdisc_change(q, tca);
1142        if (err == 0)
1143                qdisc_notify(net, skb, n, clid, NULL, q);
1144        return err;
1145
1146create_n_graft:
1147        if (!(n->nlmsg_flags & NLM_F_CREATE))
1148                return -ENOENT;
1149        if (clid == TC_H_INGRESS) {
1150                if (dev_ingress_queue(dev))
1151                        q = qdisc_create(dev, dev_ingress_queue(dev), p,
1152                                         tcm->tcm_parent, tcm->tcm_parent,
1153                                         tca, &err);
1154                else
1155                        err = -ENOENT;
1156        } else {
1157                struct netdev_queue *dev_queue;
1158
1159                if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1160                        dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1161                else if (p)
1162                        dev_queue = p->dev_queue;
1163                else
1164                        dev_queue = netdev_get_tx_queue(dev, 0);
1165
1166                q = qdisc_create(dev, dev_queue, p,
1167                                 tcm->tcm_parent, tcm->tcm_handle,
1168                                 tca, &err);
1169        }
1170        if (q == NULL) {
1171                if (err == -EAGAIN)
1172                        goto replay;
1173                return err;
1174        }
1175
1176graft:
1177        err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
1178        if (err) {
1179                if (q)
1180                        qdisc_destroy(q);
1181                return err;
1182        }
1183
1184        return 0;
1185}
1186
1187static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
1188                         u32 portid, u32 seq, u16 flags, int event)
1189{
1190        struct tcmsg *tcm;
1191        struct nlmsghdr  *nlh;
1192        unsigned char *b = skb_tail_pointer(skb);
1193        struct gnet_dump d;
1194        struct qdisc_size_table *stab;
1195
1196        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1197        if (!nlh)
1198                goto out_nlmsg_trim;
1199        tcm = nlmsg_data(nlh);
1200        tcm->tcm_family = AF_UNSPEC;
1201        tcm->tcm__pad1 = 0;
1202        tcm->tcm__pad2 = 0;
1203        tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1204        tcm->tcm_parent = clid;
1205        tcm->tcm_handle = q->handle;
1206        tcm->tcm_info = atomic_read(&q->refcnt);
1207        if (nla_put_string(skb, TCA_KIND, q->ops->id))
1208                goto nla_put_failure;
1209        if (q->ops->dump && q->ops->dump(q, skb) < 0)
1210                goto nla_put_failure;
1211        q->qstats.qlen = q->q.qlen;
1212
1213        stab = rtnl_dereference(q->stab);
1214        if (stab && qdisc_dump_stab(skb, stab) < 0)
1215                goto nla_put_failure;
1216
1217        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1218                                         qdisc_root_sleeping_lock(q), &d) < 0)
1219                goto nla_put_failure;
1220
1221        if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
1222                goto nla_put_failure;
1223
1224        if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
1225            gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
1226            gnet_stats_copy_queue(&d, &q->qstats) < 0)
1227                goto nla_put_failure;
1228
1229        if (gnet_stats_finish_copy(&d) < 0)
1230                goto nla_put_failure;
1231
1232        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1233        return skb->len;
1234
1235out_nlmsg_trim:
1236nla_put_failure:
1237        nlmsg_trim(skb, b);
1238        return -1;
1239}
1240
1241static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1242{
1243        return (q->flags & TCQ_F_BUILTIN) ? true : false;
1244}
1245
1246static int qdisc_notify(struct net *net, struct sk_buff *oskb,
1247                        struct nlmsghdr *n, u32 clid,
1248                        struct Qdisc *old, struct Qdisc *new)
1249{
1250        struct sk_buff *skb;
1251        u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1252
1253        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1254        if (!skb)
1255                return -ENOBUFS;
1256
1257        if (old && !tc_qdisc_dump_ignore(old)) {
1258                if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
1259                                  0, RTM_DELQDISC) < 0)
1260                        goto err_out;
1261        }
1262        if (new && !tc_qdisc_dump_ignore(new)) {
1263                if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
1264                                  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1265                        goto err_out;
1266        }
1267
1268        if (skb->len)
1269                return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1270                                      n->nlmsg_flags & NLM_F_ECHO);
1271
1272err_out:
1273        kfree_skb(skb);
1274        return -EINVAL;
1275}
1276
1277static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1278                              struct netlink_callback *cb,
1279                              int *q_idx_p, int s_q_idx)
1280{
1281        int ret = 0, q_idx = *q_idx_p;
1282        struct Qdisc *q;
1283
1284        if (!root)
1285                return 0;
1286
1287        q = root;
1288        if (q_idx < s_q_idx) {
1289                q_idx++;
1290        } else {
1291                if (!tc_qdisc_dump_ignore(q) &&
1292                    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1293                                  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1294                        goto done;
1295                q_idx++;
1296        }
1297        list_for_each_entry(q, &root->list, list) {
1298                if (q_idx < s_q_idx) {
1299                        q_idx++;
1300                        continue;
1301                }
1302                if (!tc_qdisc_dump_ignore(q) &&
1303                    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1304                                  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1305                        goto done;
1306                q_idx++;
1307        }
1308
1309out:
1310        *q_idx_p = q_idx;
1311        return ret;
1312done:
1313        ret = -1;
1314        goto out;
1315}
1316
1317static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1318{
1319        struct net *net = sock_net(skb->sk);
1320        int idx, q_idx;
1321        int s_idx, s_q_idx;
1322        struct net_device *dev;
1323
1324        s_idx = cb->args[0];
1325        s_q_idx = q_idx = cb->args[1];
1326
1327        rcu_read_lock();
1328        idx = 0;
1329        for_each_netdev_rcu(net, dev) {
1330                struct netdev_queue *dev_queue;
1331
1332                if (idx < s_idx)
1333                        goto cont;
1334                if (idx > s_idx)
1335                        s_q_idx = 0;
1336                q_idx = 0;
1337
1338                if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
1339                        goto done;
1340
1341                dev_queue = dev_ingress_queue(dev);
1342                if (dev_queue &&
1343                    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1344                                       &q_idx, s_q_idx) < 0)
1345                        goto done;
1346
1347cont:
1348                idx++;
1349        }
1350
1351done:
1352        rcu_read_unlock();
1353
1354        cb->args[0] = idx;
1355        cb->args[1] = q_idx;
1356
1357        return skb->len;
1358}
1359
1360
1361
1362/************************************************
1363 *      Traffic classes manipulation.           *
1364 ************************************************/
1365
1366
1367
1368static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1369{
1370        struct net *net = sock_net(skb->sk);
1371        struct tcmsg *tcm = nlmsg_data(n);
1372        struct nlattr *tca[TCA_MAX + 1];
1373        struct net_device *dev;
1374        struct Qdisc *q = NULL;
1375        const struct Qdisc_class_ops *cops;
1376        unsigned long cl = 0;
1377        unsigned long new_cl;
1378        u32 portid = tcm->tcm_parent;
1379        u32 clid = tcm->tcm_handle;
1380        u32 qid = TC_H_MAJ(clid);
1381        int err;
1382
1383        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1384        if (!dev)
1385                return -ENODEV;
1386
1387        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1388        if (err < 0)
1389                return err;
1390
1391        /*
1392           parent == TC_H_UNSPEC - unspecified parent.
1393           parent == TC_H_ROOT   - class is root, which has no parent.
1394           parent == X:0         - parent is root class.
1395           parent == X:Y         - parent is a node in hierarchy.
1396           parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
1397
1398           handle == 0:0         - generate handle from kernel pool.
1399           handle == 0:Y         - class is X:Y, where X:0 is qdisc.
1400           handle == X:Y         - clear.
1401           handle == X:0         - root class.
1402         */
1403
1404        /* Step 1. Determine qdisc handle X:0 */
1405
1406        if (portid != TC_H_ROOT) {
1407                u32 qid1 = TC_H_MAJ(portid);
1408
1409                if (qid && qid1) {
1410                        /* If both majors are known, they must be identical. */
1411                        if (qid != qid1)
1412                                return -EINVAL;
1413                } else if (qid1) {
1414                        qid = qid1;
1415                } else if (qid == 0)
1416                        qid = dev->qdisc->handle;
1417
1418                /* Now qid is genuine qdisc handle consistent
1419                 * both with parent and child.
1420                 *
1421                 * TC_H_MAJ(portid) still may be unspecified, complete it now.
1422                 */
1423                if (portid)
1424                        portid = TC_H_MAKE(qid, portid);
1425        } else {
1426                if (qid == 0)
1427                        qid = dev->qdisc->handle;
1428        }
1429
1430        /* OK. Locate qdisc */
1431        q = qdisc_lookup(dev, qid);
1432        if (!q)
1433                return -ENOENT;
1434
1435        /* An check that it supports classes */
1436        cops = q->ops->cl_ops;
1437        if (cops == NULL)
1438                return -EINVAL;
1439
1440        /* Now try to get class */
1441        if (clid == 0) {
1442                if (portid == TC_H_ROOT)
1443                        clid = qid;
1444        } else
1445                clid = TC_H_MAKE(qid, clid);
1446
1447        if (clid)
1448                cl = cops->get(q, clid);
1449
1450        if (cl == 0) {
1451                err = -ENOENT;
1452                if (n->nlmsg_type != RTM_NEWTCLASS ||
1453                    !(n->nlmsg_flags & NLM_F_CREATE))
1454                        goto out;
1455        } else {
1456                switch (n->nlmsg_type) {
1457                case RTM_NEWTCLASS:
1458                        err = -EEXIST;
1459                        if (n->nlmsg_flags & NLM_F_EXCL)
1460                                goto out;
1461                        break;
1462                case RTM_DELTCLASS:
1463                        err = -EOPNOTSUPP;
1464                        if (cops->delete)
1465                                err = cops->delete(q, cl);
1466                        if (err == 0)
1467                                tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
1468                        goto out;
1469                case RTM_GETTCLASS:
1470                        err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
1471                        goto out;
1472                default:
1473                        err = -EINVAL;
1474                        goto out;
1475                }
1476        }
1477
1478        new_cl = cl;
1479        err = -EOPNOTSUPP;
1480        if (cops->change)
1481                err = cops->change(q, clid, portid, tca, &new_cl);
1482        if (err == 0)
1483                tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
1484
1485out:
1486        if (cl)
1487                cops->put(q, cl);
1488
1489        return err;
1490}
1491
1492
1493static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1494                          unsigned long cl,
1495                          u32 portid, u32 seq, u16 flags, int event)
1496{
1497        struct tcmsg *tcm;
1498        struct nlmsghdr  *nlh;
1499        unsigned char *b = skb_tail_pointer(skb);
1500        struct gnet_dump d;
1501        const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1502
1503        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1504        if (!nlh)
1505                goto out_nlmsg_trim;
1506        tcm = nlmsg_data(nlh);
1507        tcm->tcm_family = AF_UNSPEC;
1508        tcm->tcm__pad1 = 0;
1509        tcm->tcm__pad2 = 0;
1510        tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1511        tcm->tcm_parent = q->handle;
1512        tcm->tcm_handle = q->handle;
1513        tcm->tcm_info = 0;
1514        if (nla_put_string(skb, TCA_KIND, q->ops->id))
1515                goto nla_put_failure;
1516        if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1517                goto nla_put_failure;
1518
1519        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1520                                         qdisc_root_sleeping_lock(q), &d) < 0)
1521                goto nla_put_failure;
1522
1523        if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1524                goto nla_put_failure;
1525
1526        if (gnet_stats_finish_copy(&d) < 0)
1527                goto nla_put_failure;
1528
1529        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1530        return skb->len;
1531
1532out_nlmsg_trim:
1533nla_put_failure:
1534        nlmsg_trim(skb, b);
1535        return -1;
1536}
1537
1538static int tclass_notify(struct net *net, struct sk_buff *oskb,
1539                         struct nlmsghdr *n, struct Qdisc *q,
1540                         unsigned long cl, int event)
1541{
1542        struct sk_buff *skb;
1543        u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1544
1545        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1546        if (!skb)
1547                return -ENOBUFS;
1548
1549        if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1550                kfree_skb(skb);
1551                return -EINVAL;
1552        }
1553
1554        return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1555                              n->nlmsg_flags & NLM_F_ECHO);
1556}
1557
1558struct qdisc_dump_args {
1559        struct qdisc_walker     w;
1560        struct sk_buff          *skb;
1561        struct netlink_callback *cb;
1562};
1563
1564static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1565{
1566        struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1567
1568        return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
1569                              a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1570}
1571
1572static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1573                                struct tcmsg *tcm, struct netlink_callback *cb,
1574                                int *t_p, int s_t)
1575{
1576        struct qdisc_dump_args arg;
1577
1578        if (tc_qdisc_dump_ignore(q) ||
1579            *t_p < s_t || !q->ops->cl_ops ||
1580            (tcm->tcm_parent &&
1581             TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1582                (*t_p)++;
1583                return 0;
1584        }
1585        if (*t_p > s_t)
1586                memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1587        arg.w.fn = qdisc_class_dump;
1588        arg.skb = skb;
1589        arg.cb = cb;
1590        arg.w.stop  = 0;
1591        arg.w.skip = cb->args[1];
1592        arg.w.count = 0;
1593        q->ops->cl_ops->walk(q, &arg.w);
1594        cb->args[1] = arg.w.count;
1595        if (arg.w.stop)
1596                return -1;
1597        (*t_p)++;
1598        return 0;
1599}
1600
1601static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1602                               struct tcmsg *tcm, struct netlink_callback *cb,
1603                               int *t_p, int s_t)
1604{
1605        struct Qdisc *q;
1606
1607        if (!root)
1608                return 0;
1609
1610        if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1611                return -1;
1612
1613        list_for_each_entry(q, &root->list, list) {
1614                if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1615                        return -1;
1616        }
1617
1618        return 0;
1619}
1620
1621static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1622{
1623        struct tcmsg *tcm = nlmsg_data(cb->nlh);
1624        struct net *net = sock_net(skb->sk);
1625        struct netdev_queue *dev_queue;
1626        struct net_device *dev;
1627        int t, s_t;
1628
1629        if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1630                return 0;
1631        dev = dev_get_by_index(net, tcm->tcm_ifindex);
1632        if (!dev)
1633                return 0;
1634
1635        s_t = cb->args[0];
1636        t = 0;
1637
1638        if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
1639                goto done;
1640
1641        dev_queue = dev_ingress_queue(dev);
1642        if (dev_queue &&
1643            tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
1644                                &t, s_t) < 0)
1645                goto done;
1646
1647done:
1648        cb->args[0] = t;
1649
1650        dev_put(dev);
1651        return skb->len;
1652}
1653
1654/* Main classifier routine: scans classifier chain attached
1655 * to this qdisc, (optionally) tests for protocol and asks
1656 * specific classifiers.
1657 */
1658int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
1659                       struct tcf_result *res)
1660{
1661        __be16 protocol = skb->protocol;
1662        int err;
1663
1664        for (; tp; tp = tp->next) {
1665                if (tp->protocol != protocol &&
1666                    tp->protocol != htons(ETH_P_ALL))
1667                        continue;
1668                err = tp->classify(skb, tp, res);
1669
1670                if (err >= 0) {
1671#ifdef CONFIG_NET_CLS_ACT
1672                        if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1673                                skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1674#endif
1675                        return err;
1676                }
1677        }
1678        return -1;
1679}
1680EXPORT_SYMBOL(tc_classify_compat);
1681
/*
 * Classify @skb against the chain starting at @tp.
 *
 * Without CONFIG_NET_CLS_ACT this is tc_classify_compat().  With it, a
 * TC_ACT_RECLASSIFY result restarts classification from the head of the
 * chain, counting the restarts in skb->tc_verd.  Once the count has
 * reached MAX_REC_LOOP a rate-limited notice is logged and TC_ACT_SHOT is
 * returned instead.
 */
int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
                struct tcf_result *res)
{
#ifdef CONFIG_NET_CLS_ACT
        for (;;) {
                int err = tc_classify_compat(skb, tp, res);
                u32 verd;

                if (err != TC_ACT_RECLASSIFY)
                        return err;

                verd = G_TC_VERD(skb->tc_verd);
                if (verd >= MAX_REC_LOOP) {
                        net_notice_ratelimited("%s: packet reclassify loop rule prio %u protocol %02x\n",
                                               tp->q->ops->id,
                                               tp->prio & 0xffff,
                                               ntohs(tp->protocol));
                        return TC_ACT_SHOT;
                }

                /* Record one more pass before going round again. */
                verd++;
                skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
        }
#else
        return tc_classify_compat(skb, tp, res);
#endif
}
EXPORT_SYMBOL(tc_classify);
1711
1712void tcf_destroy(struct tcf_proto *tp)
1713{
1714        tp->ops->destroy(tp);
1715        module_put(tp->ops->owner);
1716        kfree(tp);
1717}
1718
1719void tcf_destroy_chain(struct tcf_proto **fl)
1720{
1721        struct tcf_proto *tp;
1722
1723        while ((tp = *fl) != NULL) {
1724                *fl = tp->next;
1725                tcf_destroy(tp);
1726        }
1727}
1728EXPORT_SYMBOL(tcf_destroy_chain);
1729
1730#ifdef CONFIG_PROC_FS
1731static int psched_show(struct seq_file *seq, void *v)
1732{
1733        struct timespec ts;
1734
1735        hrtimer_get_res(CLOCK_MONOTONIC, &ts);
1736        seq_printf(seq, "%08x %08x %08x %08x\n",
1737                   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
1738                   1000000,
1739                   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
1740
1741        return 0;
1742}
1743
1744static int psched_open(struct inode *inode, struct file *file)
1745{
1746        return single_open(file, psched_show, NULL);
1747}
1748
1749static const struct file_operations psched_fops = {
1750        .owner = THIS_MODULE,
1751        .open = psched_open,
1752        .read  = seq_read,
1753        .llseek = seq_lseek,
1754        .release = single_release,
1755};
1756
1757static int __net_init psched_net_init(struct net *net)
1758{
1759        struct proc_dir_entry *e;
1760
1761        e = proc_net_fops_create(net, "psched", 0, &psched_fops);
1762        if (e == NULL)
1763                return -ENOMEM;
1764
1765        return 0;
1766}
1767
1768static void __net_exit psched_net_exit(struct net *net)
1769{
1770        proc_net_remove(net, "psched");
1771}
1772#else
1773static int __net_init psched_net_init(struct net *net)
1774{
1775        return 0;
1776}
1777
1778static void __net_exit psched_net_exit(struct net *net)
1779{
1780}
1781#endif
1782
1783static struct pernet_operations psched_net_ops = {
1784        .init = psched_net_init,
1785        .exit = psched_net_exit,
1786};
1787
1788static int __init pktsched_init(void)
1789{
1790        int err;
1791
1792        err = register_pernet_subsys(&psched_net_ops);
1793        if (err) {
1794                pr_err("pktsched_init: "
1795                       "cannot initialize per netns operations\n");
1796                return err;
1797        }
1798
1799        register_qdisc(&pfifo_qdisc_ops);
1800        register_qdisc(&bfifo_qdisc_ops);
1801        register_qdisc(&pfifo_head_drop_qdisc_ops);
1802        register_qdisc(&mq_qdisc_ops);
1803
1804        rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
1805        rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
1806        rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL);
1807        rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
1808        rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
1809        rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL);
1810
1811        return 0;
1812}
1813
1814subsys_initcall(pktsched_init);
1815