linux/net/sched/sch_api.c
<<
>>
Prefs
   1/*
   2 * net/sched/sch_api.c  Packet scheduler API.
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10 *
  11 * Fixes:
  12 *
  13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
  14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
  15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
  16 */
  17
  18#include <linux/module.h>
  19#include <linux/types.h>
  20#include <linux/kernel.h>
  21#include <linux/string.h>
  22#include <linux/errno.h>
  23#include <linux/skbuff.h>
  24#include <linux/init.h>
  25#include <linux/proc_fs.h>
  26#include <linux/seq_file.h>
  27#include <linux/kmod.h>
  28#include <linux/list.h>
  29#include <linux/hrtimer.h>
  30#include <linux/lockdep.h>
  31#include <linux/slab.h>
  32
  33#include <net/net_namespace.h>
  34#include <net/sock.h>
  35#include <net/netlink.h>
  36#include <net/pkt_sched.h>
  37
  38static int qdisc_notify(struct net *net, struct sk_buff *oskb,
  39                        struct nlmsghdr *n, u32 clid,
  40                        struct Qdisc *old, struct Qdisc *new);
  41static int tclass_notify(struct net *net, struct sk_buff *oskb,
  42                         struct nlmsghdr *n, struct Qdisc *q,
  43                         unsigned long cl, int event);
  44
  45/*
  46
  47   Short review.
  48   -------------
  49
  50   This file consists of two interrelated parts:
  51
  52   1. queueing disciplines manager frontend.
  53   2. traffic classes manager frontend.
  54
  55   Generally, queueing discipline ("qdisc") is a black box,
  56   which is able to enqueue packets and to dequeue them (when
  57   device is ready to send something) in order and at times
  58   determined by algorithm hidden in it.
  59
  60   qdiscs are divided into two categories:
  61   - "queues", which have no internal structure visible from outside.
  62   - "schedulers", which split all the packets to "traffic classes",
  63     using "packet classifiers" (look at cls_api.c)
  64
  65   In turn, classes may have child qdiscs (as rule, queues)
  66   attached to them etc. etc. etc.
  67
  68   The goal of the routines in this file is to translate
  69   information supplied by the user in the form of handles
  70   into a form more intelligible to the kernel, to perform sanity
  71   checks and the part of the work that is common to all qdiscs,
  72   and to provide rtnetlink notifications.
  73
  74   All real intelligent work is done inside qdisc modules.
  75
  76
  77
  78   Every discipline has two major routines: enqueue and dequeue.
  79
  80   ---dequeue
  81
  82   dequeue usually returns a skb to send. It is allowed to return NULL,
  83   but it does not mean that queue is empty, it just means that
  84   discipline does not want to send anything this time.
  85   Queue is really empty if q->q.qlen == 0.
  86   For complicated disciplines with multiple queues q->q is not
  87   real packet queue, but however q->q.qlen must be valid.
  88
  89   ---enqueue
  90
  91   enqueue returns 0, if packet was enqueued successfully.
  92   If packet (this one or another one) was dropped, it returns
  93   not zero error code.
  94   NET_XMIT_DROP        - this packet dropped
  95     Expected action: do not backoff, but wait until queue will clear.
  96   NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
  97     Expected action: backoff or ignore
  98   NET_XMIT_POLICED     - dropped by police.
  99     Expected action: backoff or error to real-time apps.
 100
 101   Auxiliary routines:
 102
 103   ---peek
 104
 105   like dequeue but without removing a packet from the queue
 106
 107   ---reset
 108
 109   returns qdisc to initial state: purge all buffers, clear all
 110   timers, counters (except for statistics) etc.
 111
 112   ---init
 113
 114   initializes newly created qdisc.
 115
 116   ---destroy
 117
 118   destroys resources allocated by init and during lifetime of qdisc.
 119
 120   ---change
 121
 122   changes qdisc parameters.
 123 */
 124
 /*
  * Guards the list of registered TC modules (the chain headed by
  * qdisc_base). It is a pure SMP lock.
  */
static DEFINE_RWLOCK(qdisc_mod_lock);
 127
 128
 129/************************************************
 130 *      Queueing disciplines manipulation.      *
 131 ************************************************/
 132
 133
 /*
  * Head of the list of all installed queueing disciplines; entries are
  * chained through Qdisc_ops::next.
  */
static struct Qdisc_ops *qdisc_base;
 137
 138/* Register/unregister queueing discipline */
 139
 140int register_qdisc(struct Qdisc_ops *qops)
 141{
 142        struct Qdisc_ops *q, **qp;
 143        int rc = -EEXIST;
 144
 145        write_lock(&qdisc_mod_lock);
 146        for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
 147                if (!strcmp(qops->id, q->id))
 148                        goto out;
 149
 150        if (qops->enqueue == NULL)
 151                qops->enqueue = noop_qdisc_ops.enqueue;
 152        if (qops->peek == NULL) {
 153                if (qops->dequeue == NULL)
 154                        qops->peek = noop_qdisc_ops.peek;
 155                else
 156                        goto out_einval;
 157        }
 158        if (qops->dequeue == NULL)
 159                qops->dequeue = noop_qdisc_ops.dequeue;
 160
 161        if (qops->cl_ops) {
 162                const struct Qdisc_class_ops *cops = qops->cl_ops;
 163
 164                if (!(cops->get && cops->put && cops->walk && cops->leaf))
 165                        goto out_einval;
 166
 167                if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
 168                        goto out_einval;
 169        }
 170
 171        qops->next = NULL;
 172        *qp = qops;
 173        rc = 0;
 174out:
 175        write_unlock(&qdisc_mod_lock);
 176        return rc;
 177
 178out_einval:
 179        rc = -EINVAL;
 180        goto out;
 181}
 182EXPORT_SYMBOL(register_qdisc);
 183
 184int unregister_qdisc(struct Qdisc_ops *qops)
 185{
 186        struct Qdisc_ops *q, **qp;
 187        int err = -ENOENT;
 188
 189        write_lock(&qdisc_mod_lock);
 190        for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
 191                if (q == qops)
 192                        break;
 193        if (q) {
 194                *qp = q->next;
 195                q->next = NULL;
 196                err = 0;
 197        }
 198        write_unlock(&qdisc_mod_lock);
 199        return err;
 200}
 201EXPORT_SYMBOL(unregister_qdisc);
 202
 203/* We know handle. Find qdisc among all qdisc's attached to device
 204   (root qdisc, all its children, children of children etc.)
 205 */
 206
 207static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
 208{
 209        struct Qdisc *q;
 210
 211        if (!(root->flags & TCQ_F_BUILTIN) &&
 212            root->handle == handle)
 213                return root;
 214
 215        list_for_each_entry(q, &root->list, list) {
 216                if (q->handle == handle)
 217                        return q;
 218        }
 219        return NULL;
 220}
 221
 222static void qdisc_list_add(struct Qdisc *q)
 223{
 224        if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
 225                list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
 226}
 227
 228void qdisc_list_del(struct Qdisc *q)
 229{
 230        if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
 231                list_del(&q->list);
 232}
 233EXPORT_SYMBOL(qdisc_list_del);
 234
 235struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
 236{
 237        struct Qdisc *q;
 238
 239        q = qdisc_match_from_root(dev->qdisc, handle);
 240        if (q)
 241                goto out;
 242
 243        if (dev_ingress_queue(dev))
 244                q = qdisc_match_from_root(
 245                        dev_ingress_queue(dev)->qdisc_sleeping,
 246                        handle);
 247out:
 248        return q;
 249}
 250
 251static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
 252{
 253        unsigned long cl;
 254        struct Qdisc *leaf;
 255        const struct Qdisc_class_ops *cops = p->ops->cl_ops;
 256
 257        if (cops == NULL)
 258                return NULL;
 259        cl = cops->get(p, classid);
 260
 261        if (cl == 0)
 262                return NULL;
 263        leaf = cops->leaf(p, cl);
 264        cops->put(p, cl);
 265        return leaf;
 266}
 267
 268/* Find queueing discipline by name */
 269
 270static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
 271{
 272        struct Qdisc_ops *q = NULL;
 273
 274        if (kind) {
 275                read_lock(&qdisc_mod_lock);
 276                for (q = qdisc_base; q; q = q->next) {
 277                        if (nla_strcmp(kind, q->id) == 0) {
 278                                if (!try_module_get(q->owner))
 279                                        q = NULL;
 280                                break;
 281                        }
 282                }
 283                read_unlock(&qdisc_mod_lock);
 284        }
 285        return q;
 286}
 287
 /* Head of the list of shared, reference-counted rate tables. */
static struct qdisc_rate_table *qdisc_rtab_list;
 289
 290struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
 291{
 292        struct qdisc_rate_table *rtab;
 293
 294        if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
 295            nla_len(tab) != TC_RTAB_SIZE)
 296                return NULL;
 297
 298        for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
 299                if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
 300                    !memcmp(&rtab->data, nla_data(tab), 1024)) {
 301                        rtab->refcnt++;
 302                        return rtab;
 303                }
 304        }
 305
 306        rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
 307        if (rtab) {
 308                rtab->rate = *r;
 309                rtab->refcnt = 1;
 310                memcpy(rtab->data, nla_data(tab), 1024);
 311                rtab->next = qdisc_rtab_list;
 312                qdisc_rtab_list = rtab;
 313        }
 314        return rtab;
 315}
 316EXPORT_SYMBOL(qdisc_get_rtab);
 317
 318void qdisc_put_rtab(struct qdisc_rate_table *tab)
 319{
 320        struct qdisc_rate_table *rtab, **rtabp;
 321
 322        if (!tab || --tab->refcnt)
 323                return;
 324
 325        for (rtabp = &qdisc_rtab_list;
 326             (rtab = *rtabp) != NULL;
 327             rtabp = &rtab->next) {
 328                if (rtab == tab) {
 329                        *rtabp = rtab->next;
 330                        kfree(rtab);
 331                        return;
 332                }
 333        }
 334}
 335EXPORT_SYMBOL(qdisc_put_rtab);
 336
 337static LIST_HEAD(qdisc_stab_list);
 338static DEFINE_SPINLOCK(qdisc_stab_lock);
 339
 340static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
 341        [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
 342        [TCA_STAB_DATA] = { .type = NLA_BINARY },
 343};
 344
 345static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
 346{
 347        struct nlattr *tb[TCA_STAB_MAX + 1];
 348        struct qdisc_size_table *stab;
 349        struct tc_sizespec *s;
 350        unsigned int tsize = 0;
 351        u16 *tab = NULL;
 352        int err;
 353
 354        err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
 355        if (err < 0)
 356                return ERR_PTR(err);
 357        if (!tb[TCA_STAB_BASE])
 358                return ERR_PTR(-EINVAL);
 359
 360        s = nla_data(tb[TCA_STAB_BASE]);
 361
 362        if (s->tsize > 0) {
 363                if (!tb[TCA_STAB_DATA])
 364                        return ERR_PTR(-EINVAL);
 365                tab = nla_data(tb[TCA_STAB_DATA]);
 366                tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
 367        }
 368
 369        if (tsize != s->tsize || (!tab && tsize > 0))
 370                return ERR_PTR(-EINVAL);
 371
 372        spin_lock(&qdisc_stab_lock);
 373
 374        list_for_each_entry(stab, &qdisc_stab_list, list) {
 375                if (memcmp(&stab->szopts, s, sizeof(*s)))
 376                        continue;
 377                if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
 378                        continue;
 379                stab->refcnt++;
 380                spin_unlock(&qdisc_stab_lock);
 381                return stab;
 382        }
 383
 384        spin_unlock(&qdisc_stab_lock);
 385
 386        stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
 387        if (!stab)
 388                return ERR_PTR(-ENOMEM);
 389
 390        stab->refcnt = 1;
 391        stab->szopts = *s;
 392        if (tsize > 0)
 393                memcpy(stab->data, tab, tsize * sizeof(u16));
 394
 395        spin_lock(&qdisc_stab_lock);
 396        list_add_tail(&stab->list, &qdisc_stab_list);
 397        spin_unlock(&qdisc_stab_lock);
 398
 399        return stab;
 400}
 401
 402static void stab_kfree_rcu(struct rcu_head *head)
 403{
 404        kfree(container_of(head, struct qdisc_size_table, rcu));
 405}
 406
 407void qdisc_put_stab(struct qdisc_size_table *tab)
 408{
 409        if (!tab)
 410                return;
 411
 412        spin_lock(&qdisc_stab_lock);
 413
 414        if (--tab->refcnt == 0) {
 415                list_del(&tab->list);
 416                call_rcu_bh(&tab->rcu, stab_kfree_rcu);
 417        }
 418
 419        spin_unlock(&qdisc_stab_lock);
 420}
 421EXPORT_SYMBOL(qdisc_put_stab);
 422
 423static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
 424{
 425        struct nlattr *nest;
 426
 427        nest = nla_nest_start(skb, TCA_STAB);
 428        if (nest == NULL)
 429                goto nla_put_failure;
 430        if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
 431                goto nla_put_failure;
 432        nla_nest_end(skb, nest);
 433
 434        return skb->len;
 435
 436nla_put_failure:
 437        return -1;
 438}
 439
 440void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
 441{
 442        int pkt_len, slot;
 443
 444        pkt_len = skb->len + stab->szopts.overhead;
 445        if (unlikely(!stab->szopts.tsize))
 446                goto out;
 447
 448        slot = pkt_len + stab->szopts.cell_align;
 449        if (unlikely(slot < 0))
 450                slot = 0;
 451
 452        slot >>= stab->szopts.cell_log;
 453        if (likely(slot < stab->szopts.tsize))
 454                pkt_len = stab->data[slot];
 455        else
 456                pkt_len = stab->data[stab->szopts.tsize - 1] *
 457                                (slot / stab->szopts.tsize) +
 458                                stab->data[slot % stab->szopts.tsize];
 459
 460        pkt_len <<= stab->szopts.size_log;
 461out:
 462        if (unlikely(pkt_len < 1))
 463                pkt_len = 1;
 464        qdisc_skb_cb(skb)->pkt_len = pkt_len;
 465}
 466EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
 467
 468void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
 469{
 470        if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
 471                pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
 472                        txt, qdisc->ops->id, qdisc->handle >> 16);
 473                qdisc->flags |= TCQ_F_WARN_NONWC;
 474        }
 475}
 476EXPORT_SYMBOL(qdisc_warn_nonwc);
 477
 478static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
 479{
 480        struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
 481                                                 timer);
 482
 483        qdisc_unthrottled(wd->qdisc);
 484        __netif_schedule(qdisc_root(wd->qdisc));
 485
 486        return HRTIMER_NORESTART;
 487}
 488
 489void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
 490{
 491        hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
 492        wd->timer.function = qdisc_watchdog;
 493        wd->qdisc = qdisc;
 494}
 495EXPORT_SYMBOL(qdisc_watchdog_init);
 496
 497void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
 498{
 499        if (test_bit(__QDISC_STATE_DEACTIVATED,
 500                     &qdisc_root_sleeping(wd->qdisc)->state))
 501                return;
 502
 503        qdisc_throttled(wd->qdisc);
 504
 505        hrtimer_start(&wd->timer,
 506                      ns_to_ktime(expires),
 507                      HRTIMER_MODE_ABS);
 508}
 509EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
 510
 511void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
 512{
 513        hrtimer_cancel(&wd->timer);
 514        qdisc_unthrottled(wd->qdisc);
 515}
 516EXPORT_SYMBOL(qdisc_watchdog_cancel);
 517
 518static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
 519{
 520        unsigned int size = n * sizeof(struct hlist_head), i;
 521        struct hlist_head *h;
 522
 523        if (size <= PAGE_SIZE)
 524                h = kmalloc(size, GFP_KERNEL);
 525        else
 526                h = (struct hlist_head *)
 527                        __get_free_pages(GFP_KERNEL, get_order(size));
 528
 529        if (h != NULL) {
 530                for (i = 0; i < n; i++)
 531                        INIT_HLIST_HEAD(&h[i]);
 532        }
 533        return h;
 534}
 535
 536static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
 537{
 538        unsigned int size = n * sizeof(struct hlist_head);
 539
 540        if (size <= PAGE_SIZE)
 541                kfree(h);
 542        else
 543                free_pages((unsigned long)h, get_order(size));
 544}
 545
 546void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
 547{
 548        struct Qdisc_class_common *cl;
 549        struct hlist_node *next;
 550        struct hlist_head *nhash, *ohash;
 551        unsigned int nsize, nmask, osize;
 552        unsigned int i, h;
 553
 554        /* Rehash when load factor exceeds 0.75 */
 555        if (clhash->hashelems * 4 <= clhash->hashsize * 3)
 556                return;
 557        nsize = clhash->hashsize * 2;
 558        nmask = nsize - 1;
 559        nhash = qdisc_class_hash_alloc(nsize);
 560        if (nhash == NULL)
 561                return;
 562
 563        ohash = clhash->hash;
 564        osize = clhash->hashsize;
 565
 566        sch_tree_lock(sch);
 567        for (i = 0; i < osize; i++) {
 568                hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
 569                        h = qdisc_class_hash(cl->classid, nmask);
 570                        hlist_add_head(&cl->hnode, &nhash[h]);
 571                }
 572        }
 573        clhash->hash     = nhash;
 574        clhash->hashsize = nsize;
 575        clhash->hashmask = nmask;
 576        sch_tree_unlock(sch);
 577
 578        qdisc_class_hash_free(ohash, osize);
 579}
 580EXPORT_SYMBOL(qdisc_class_hash_grow);
 581
 582int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
 583{
 584        unsigned int size = 4;
 585
 586        clhash->hash = qdisc_class_hash_alloc(size);
 587        if (clhash->hash == NULL)
 588                return -ENOMEM;
 589        clhash->hashsize  = size;
 590        clhash->hashmask  = size - 1;
 591        clhash->hashelems = 0;
 592        return 0;
 593}
 594EXPORT_SYMBOL(qdisc_class_hash_init);
 595
 596void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
 597{
 598        qdisc_class_hash_free(clhash->hash, clhash->hashsize);
 599}
 600EXPORT_SYMBOL(qdisc_class_hash_destroy);
 601
 602void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
 603                             struct Qdisc_class_common *cl)
 604{
 605        unsigned int h;
 606
 607        INIT_HLIST_NODE(&cl->hnode);
 608        h = qdisc_class_hash(cl->classid, clhash->hashmask);
 609        hlist_add_head(&cl->hnode, &clhash->hash[h]);
 610        clhash->hashelems++;
 611}
 612EXPORT_SYMBOL(qdisc_class_hash_insert);
 613
 614void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
 615                             struct Qdisc_class_common *cl)
 616{
 617        hlist_del(&cl->hnode);
 618        clhash->hashelems--;
 619}
 620EXPORT_SYMBOL(qdisc_class_hash_remove);
 621
 622/* Allocate an unique handle from space managed by kernel
 623 * Possible range is [8000-FFFF]:0000 (0x8000 values)
 624 */
 625static u32 qdisc_alloc_handle(struct net_device *dev)
 626{
 627        int i = 0x8000;
 628        static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
 629
 630        do {
 631                autohandle += TC_H_MAKE(0x10000U, 0);
 632                if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
 633                        autohandle = TC_H_MAKE(0x80000000U, 0);
 634                if (!qdisc_lookup(dev, autohandle))
 635                        return autohandle;
 636                cond_resched();
 637        } while (--i > 0);
 638
 639        return 0;
 640}
 641
 642void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
 643{
 644        const struct Qdisc_class_ops *cops;
 645        unsigned long cl;
 646        u32 parentid;
 647
 648        if (n == 0)
 649                return;
 650        while ((parentid = sch->parent)) {
 651                if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
 652                        return;
 653
 654                sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
 655                if (sch == NULL) {
 656                        WARN_ON(parentid != TC_H_ROOT);
 657                        return;
 658                }
 659                cops = sch->ops->cl_ops;
 660                if (cops->qlen_notify) {
 661                        cl = cops->get(sch, parentid);
 662                        cops->qlen_notify(sch, cl);
 663                        cops->put(sch, cl);
 664                }
 665                sch->q.qlen -= n;
 666        }
 667}
 668EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
 669
 670static void notify_and_destroy(struct net *net, struct sk_buff *skb,
 671                               struct nlmsghdr *n, u32 clid,
 672                               struct Qdisc *old, struct Qdisc *new)
 673{
 674        if (new || old)
 675                qdisc_notify(net, skb, n, clid, old, new);
 676
 677        if (old)
 678                qdisc_destroy(old);
 679}
 680
 681/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 682 * to device "dev".
 683 *
 684 * When appropriate send a netlink notification using 'skb'
 685 * and "n".
 686 *
 687 * On success, destroy old qdisc.
 688 */
 689
 690static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
 691                       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
 692                       struct Qdisc *new, struct Qdisc *old)
 693{
 694        struct Qdisc *q = old;
 695        struct net *net = dev_net(dev);
 696        int err = 0;
 697
 698        if (parent == NULL) {
 699                unsigned int i, num_q, ingress;
 700
 701                ingress = 0;
 702                num_q = dev->num_tx_queues;
 703                if ((q && q->flags & TCQ_F_INGRESS) ||
 704                    (new && new->flags & TCQ_F_INGRESS)) {
 705                        num_q = 1;
 706                        ingress = 1;
 707                        if (!dev_ingress_queue(dev))
 708                                return -ENOENT;
 709                }
 710
 711                if (dev->flags & IFF_UP)
 712                        dev_deactivate(dev);
 713
 714                if (new && new->ops->attach) {
 715                        new->ops->attach(new);
 716                        num_q = 0;
 717                }
 718
 719                for (i = 0; i < num_q; i++) {
 720                        struct netdev_queue *dev_queue = dev_ingress_queue(dev);
 721
 722                        if (!ingress)
 723                                dev_queue = netdev_get_tx_queue(dev, i);
 724
 725                        old = dev_graft_qdisc(dev_queue, new);
 726                        if (new && i > 0)
 727                                atomic_inc(&new->refcnt);
 728
 729                        if (!ingress)
 730                                qdisc_destroy(old);
 731                }
 732
 733                if (!ingress) {
 734                        notify_and_destroy(net, skb, n, classid,
 735                                           dev->qdisc, new);
 736                        if (new && !new->ops->attach)
 737                                atomic_inc(&new->refcnt);
 738                        dev->qdisc = new ? : &noop_qdisc;
 739                } else {
 740                        notify_and_destroy(net, skb, n, classid, old, new);
 741                }
 742
 743                if (dev->flags & IFF_UP)
 744                        dev_activate(dev);
 745        } else {
 746                const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
 747
 748                err = -EOPNOTSUPP;
 749                if (cops && cops->graft) {
 750                        unsigned long cl = cops->get(parent, classid);
 751                        if (cl) {
 752                                err = cops->graft(parent, cl, new, &old);
 753                                cops->put(parent, cl);
 754                        } else
 755                                err = -ENOENT;
 756                }
 757                if (!err)
 758                        notify_and_destroy(net, skb, n, classid, old, new);
 759        }
 760        return err;
 761}
 762
 763/* lockdep annotation is needed for ingress; egress gets it only for name */
 764static struct lock_class_key qdisc_tx_lock;
 765static struct lock_class_key qdisc_rx_lock;
 766
 767/*
 768   Allocate and initialize new qdisc.
 769
 770   Parameters are passed via opt.
 771 */
 772
 773static struct Qdisc *
 774qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
 775             struct Qdisc *p, u32 parent, u32 handle,
 776             struct nlattr **tca, int *errp)
 777{
 778        int err;
 779        struct nlattr *kind = tca[TCA_KIND];
 780        struct Qdisc *sch;
 781        struct Qdisc_ops *ops;
 782        struct qdisc_size_table *stab;
 783
 784        ops = qdisc_lookup_ops(kind);
 785#ifdef CONFIG_MODULES
 786        if (ops == NULL && kind != NULL) {
 787                char name[IFNAMSIZ];
 788                if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
 789                        /* We dropped the RTNL semaphore in order to
 790                         * perform the module load.  So, even if we
 791                         * succeeded in loading the module we have to
 792                         * tell the caller to replay the request.  We
 793                         * indicate this using -EAGAIN.
 794                         * We replay the request because the device may
 795                         * go away in the mean time.
 796                         */
 797                        rtnl_unlock();
 798                        request_module("sch_%s", name);
 799                        rtnl_lock();
 800                        ops = qdisc_lookup_ops(kind);
 801                        if (ops != NULL) {
 802                                /* We will try again qdisc_lookup_ops,
 803                                 * so don't keep a reference.
 804                                 */
 805                                module_put(ops->owner);
 806                                err = -EAGAIN;
 807                                goto err_out;
 808                        }
 809                }
 810        }
 811#endif
 812
 813        err = -ENOENT;
 814        if (ops == NULL)
 815                goto err_out;
 816
 817        sch = qdisc_alloc(dev_queue, ops);
 818        if (IS_ERR(sch)) {
 819                err = PTR_ERR(sch);
 820                goto err_out2;
 821        }
 822
 823        sch->parent = parent;
 824
 825        if (handle == TC_H_INGRESS) {
 826                sch->flags |= TCQ_F_INGRESS;
 827                handle = TC_H_MAKE(TC_H_INGRESS, 0);
 828                lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
 829        } else {
 830                if (handle == 0) {
 831                        handle = qdisc_alloc_handle(dev);
 832                        err = -ENOMEM;
 833                        if (handle == 0)
 834                                goto err_out3;
 835                }
 836                lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
 837                if (!netif_is_multiqueue(dev))
 838                        sch->flags |= TCQ_F_ONETXQUEUE;
 839        }
 840
 841        sch->handle = handle;
 842
 843        if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
 844                if (tca[TCA_STAB]) {
 845                        stab = qdisc_get_stab(tca[TCA_STAB]);
 846                        if (IS_ERR(stab)) {
 847                                err = PTR_ERR(stab);
 848                                goto err_out4;
 849                        }
 850                        rcu_assign_pointer(sch->stab, stab);
 851                }
 852                if (tca[TCA_RATE]) {
 853                        spinlock_t *root_lock;
 854
 855                        err = -EOPNOTSUPP;
 856                        if (sch->flags & TCQ_F_MQROOT)
 857                                goto err_out4;
 858
 859                        if ((sch->parent != TC_H_ROOT) &&
 860                            !(sch->flags & TCQ_F_INGRESS) &&
 861                            (!p || !(p->flags & TCQ_F_MQROOT)))
 862                                root_lock = qdisc_root_sleeping_lock(sch);
 863                        else
 864                                root_lock = qdisc_lock(sch);
 865
 866                        err = gen_new_estimator(&sch->bstats, &sch->rate_est,
 867                                                root_lock, tca[TCA_RATE]);
 868                        if (err)
 869                                goto err_out4;
 870                }
 871
 872                qdisc_list_add(sch);
 873
 874                return sch;
 875        }
 876err_out3:
 877        dev_put(dev);
 878        kfree((char *) sch - sch->padded);
 879err_out2:
 880        module_put(ops->owner);
 881err_out:
 882        *errp = err;
 883        return NULL;
 884
 885err_out4:
 886        /*
 887         * Any broken qdiscs that would require a ops->reset() here?
 888         * The qdisc was never in action so it shouldn't be necessary.
 889         */
 890        qdisc_put_stab(rtnl_dereference(sch->stab));
 891        if (ops->destroy)
 892                ops->destroy(sch);
 893        goto err_out3;
 894}
 895
 896static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
 897{
 898        struct qdisc_size_table *ostab, *stab = NULL;
 899        int err = 0;
 900
 901        if (tca[TCA_OPTIONS]) {
 902                if (sch->ops->change == NULL)
 903                        return -EINVAL;
 904                err = sch->ops->change(sch, tca[TCA_OPTIONS]);
 905                if (err)
 906                        return err;
 907        }
 908
 909        if (tca[TCA_STAB]) {
 910                stab = qdisc_get_stab(tca[TCA_STAB]);
 911                if (IS_ERR(stab))
 912                        return PTR_ERR(stab);
 913        }
 914
 915        ostab = rtnl_dereference(sch->stab);
 916        rcu_assign_pointer(sch->stab, stab);
 917        qdisc_put_stab(ostab);
 918
 919        if (tca[TCA_RATE]) {
 920                /* NB: ignores errors from replace_estimator
 921                   because change can't be undone. */
 922                if (sch->flags & TCQ_F_MQROOT)
 923                        goto out;
 924                gen_replace_estimator(&sch->bstats, &sch->rate_est,
 925                                            qdisc_root_sleeping_lock(sch),
 926                                            tca[TCA_RATE]);
 927        }
 928out:
 929        return 0;
 930}
 931
 932struct check_loop_arg {
 933        struct qdisc_walker     w;
 934        struct Qdisc            *p;
 935        int                     depth;
 936};
 937
 938static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
 939
 940static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
 941{
 942        struct check_loop_arg   arg;
 943
 944        if (q->ops->cl_ops == NULL)
 945                return 0;
 946
 947        arg.w.stop = arg.w.skip = arg.w.count = 0;
 948        arg.w.fn = check_loop_fn;
 949        arg.depth = depth;
 950        arg.p = p;
 951        q->ops->cl_ops->walk(q, &arg.w);
 952        return arg.w.stop ? -ELOOP : 0;
 953}
 954
 955static int
 956check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
 957{
 958        struct Qdisc *leaf;
 959        const struct Qdisc_class_ops *cops = q->ops->cl_ops;
 960        struct check_loop_arg *arg = (struct check_loop_arg *)w;
 961
 962        leaf = cops->leaf(q, cl);
 963        if (leaf) {
 964                if (leaf == arg->p || arg->depth > 7)
 965                        return -ELOOP;
 966                return check_loop(leaf, arg->p, arg->depth + 1);
 967        }
 968        return 0;
 969}
 970
 971/*
 972 * Delete/get qdisc.
 973 */
 974
 975static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
 976{
 977        struct net *net = sock_net(skb->sk);
 978        struct tcmsg *tcm = nlmsg_data(n);
 979        struct nlattr *tca[TCA_MAX + 1];
 980        struct net_device *dev;
 981        u32 clid;
 982        struct Qdisc *q = NULL;
 983        struct Qdisc *p = NULL;
 984        int err;
 985
 986        if ((n->nlmsg_type != RTM_GETQDISC) && !capable(CAP_NET_ADMIN))
 987                return -EPERM;
 988
 989        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
 990        if (err < 0)
 991                return err;
 992
 993        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
 994        if (!dev)
 995                return -ENODEV;
 996
 997        clid = tcm->tcm_parent;
 998        if (clid) {
 999                if (clid != TC_H_ROOT) {
1000                        if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1001                                p = qdisc_lookup(dev, TC_H_MAJ(clid));
1002                                if (!p)
1003                                        return -ENOENT;
1004                                q = qdisc_leaf(p, clid);
1005                        } else if (dev_ingress_queue(dev)) {
1006                                q = dev_ingress_queue(dev)->qdisc_sleeping;
1007                        }
1008                } else {
1009                        q = dev->qdisc;
1010                }
1011                if (!q)
1012                        return -ENOENT;
1013
1014                if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
1015                        return -EINVAL;
1016        } else {
1017                q = qdisc_lookup(dev, tcm->tcm_handle);
1018                if (!q)
1019                        return -ENOENT;
1020        }
1021
1022        if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1023                return -EINVAL;
1024
1025        if (n->nlmsg_type == RTM_DELQDISC) {
1026                if (!clid)
1027                        return -EINVAL;
1028                if (q->handle == 0)
1029                        return -ENOENT;
1030                err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
1031                if (err != 0)
1032                        return err;
1033        } else {
1034                qdisc_notify(net, skb, n, clid, NULL, q);
1035        }
1036        return 0;
1037}
1038
1039/*
1040 * Create/change qdisc.
1041 */
1042
1043static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
1044{
1045        struct net *net = sock_net(skb->sk);
1046        struct tcmsg *tcm;
1047        struct nlattr *tca[TCA_MAX + 1];
1048        struct net_device *dev;
1049        u32 clid;
1050        struct Qdisc *q, *p;
1051        int err;
1052
1053        if (!capable(CAP_NET_ADMIN))
1054                return -EPERM;
1055
1056replay:
1057        /* Reinit, just in case something touches this. */
1058        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1059        if (err < 0)
1060                return err;
1061
1062        tcm = nlmsg_data(n);
1063        clid = tcm->tcm_parent;
1064        q = p = NULL;
1065
1066        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1067        if (!dev)
1068                return -ENODEV;
1069
1070
1071        if (clid) {
1072                if (clid != TC_H_ROOT) {
1073                        if (clid != TC_H_INGRESS) {
1074                                p = qdisc_lookup(dev, TC_H_MAJ(clid));
1075                                if (!p)
1076                                        return -ENOENT;
1077                                q = qdisc_leaf(p, clid);
1078                        } else if (dev_ingress_queue_create(dev)) {
1079                                q = dev_ingress_queue(dev)->qdisc_sleeping;
1080                        }
1081                } else {
1082                        q = dev->qdisc;
1083                }
1084
1085                /* It may be default qdisc, ignore it */
1086                if (q && q->handle == 0)
1087                        q = NULL;
1088
1089                if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1090                        if (tcm->tcm_handle) {
1091                                if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
1092                                        return -EEXIST;
1093                                if (TC_H_MIN(tcm->tcm_handle))
1094                                        return -EINVAL;
1095                                q = qdisc_lookup(dev, tcm->tcm_handle);
1096                                if (!q)
1097                                        goto create_n_graft;
1098                                if (n->nlmsg_flags & NLM_F_EXCL)
1099                                        return -EEXIST;
1100                                if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1101                                        return -EINVAL;
1102                                if (q == p ||
1103                                    (p && check_loop(q, p, 0)))
1104                                        return -ELOOP;
1105                                atomic_inc(&q->refcnt);
1106                                goto graft;
1107                        } else {
1108                                if (!q)
1109                                        goto create_n_graft;
1110
1111                                /* This magic test requires explanation.
1112                                 *
1113                                 *   We know, that some child q is already
1114                                 *   attached to this parent and have choice:
1115                                 *   either to change it or to create/graft new one.
1116                                 *
1117                                 *   1. We are allowed to create/graft only
1118                                 *   if CREATE and REPLACE flags are set.
1119                                 *
1120                                 *   2. If EXCL is set, requestor wanted to say,
1121                                 *   that qdisc tcm_handle is not expected
1122                                 *   to exist, so that we choose create/graft too.
1123                                 *
1124                                 *   3. The last case is when no flags are set.
1125                                 *   Alas, it is sort of hole in API, we
1126                                 *   cannot decide what to do unambiguously.
1127                                 *   For now we select create/graft, if
1128                                 *   user gave KIND, which does not match existing.
1129                                 */
1130                                if ((n->nlmsg_flags & NLM_F_CREATE) &&
1131                                    (n->nlmsg_flags & NLM_F_REPLACE) &&
1132                                    ((n->nlmsg_flags & NLM_F_EXCL) ||
1133                                     (tca[TCA_KIND] &&
1134                                      nla_strcmp(tca[TCA_KIND], q->ops->id))))
1135                                        goto create_n_graft;
1136                        }
1137                }
1138        } else {
1139                if (!tcm->tcm_handle)
1140                        return -EINVAL;
1141                q = qdisc_lookup(dev, tcm->tcm_handle);
1142        }
1143
1144        /* Change qdisc parameters */
1145        if (q == NULL)
1146                return -ENOENT;
1147        if (n->nlmsg_flags & NLM_F_EXCL)
1148                return -EEXIST;
1149        if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1150                return -EINVAL;
1151        err = qdisc_change(q, tca);
1152        if (err == 0)
1153                qdisc_notify(net, skb, n, clid, NULL, q);
1154        return err;
1155
1156create_n_graft:
1157        if (!(n->nlmsg_flags & NLM_F_CREATE))
1158                return -ENOENT;
1159        if (clid == TC_H_INGRESS) {
1160                if (dev_ingress_queue(dev))
1161                        q = qdisc_create(dev, dev_ingress_queue(dev), p,
1162                                         tcm->tcm_parent, tcm->tcm_parent,
1163                                         tca, &err);
1164                else
1165                        err = -ENOENT;
1166        } else {
1167                struct netdev_queue *dev_queue;
1168
1169                if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1170                        dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1171                else if (p)
1172                        dev_queue = p->dev_queue;
1173                else
1174                        dev_queue = netdev_get_tx_queue(dev, 0);
1175
1176                q = qdisc_create(dev, dev_queue, p,
1177                                 tcm->tcm_parent, tcm->tcm_handle,
1178                                 tca, &err);
1179        }
1180        if (q == NULL) {
1181                if (err == -EAGAIN)
1182                        goto replay;
1183                return err;
1184        }
1185
1186graft:
1187        err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
1188        if (err) {
1189                if (q)
1190                        qdisc_destroy(q);
1191                return err;
1192        }
1193
1194        return 0;
1195}
1196
1197static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
1198                         u32 portid, u32 seq, u16 flags, int event)
1199{
1200        struct tcmsg *tcm;
1201        struct nlmsghdr  *nlh;
1202        unsigned char *b = skb_tail_pointer(skb);
1203        struct gnet_dump d;
1204        struct qdisc_size_table *stab;
1205
1206        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1207        if (!nlh)
1208                goto out_nlmsg_trim;
1209        tcm = nlmsg_data(nlh);
1210        tcm->tcm_family = AF_UNSPEC;
1211        tcm->tcm__pad1 = 0;
1212        tcm->tcm__pad2 = 0;
1213        tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1214        tcm->tcm_parent = clid;
1215        tcm->tcm_handle = q->handle;
1216        tcm->tcm_info = atomic_read(&q->refcnt);
1217        if (nla_put_string(skb, TCA_KIND, q->ops->id))
1218                goto nla_put_failure;
1219        if (q->ops->dump && q->ops->dump(q, skb) < 0)
1220                goto nla_put_failure;
1221        q->qstats.qlen = q->q.qlen;
1222
1223        stab = rtnl_dereference(q->stab);
1224        if (stab && qdisc_dump_stab(skb, stab) < 0)
1225                goto nla_put_failure;
1226
1227        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1228                                         qdisc_root_sleeping_lock(q), &d) < 0)
1229                goto nla_put_failure;
1230
1231        if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
1232                goto nla_put_failure;
1233
1234        if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
1235            gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
1236            gnet_stats_copy_queue(&d, &q->qstats) < 0)
1237                goto nla_put_failure;
1238
1239        if (gnet_stats_finish_copy(&d) < 0)
1240                goto nla_put_failure;
1241
1242        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1243        return skb->len;
1244
1245out_nlmsg_trim:
1246nla_put_failure:
1247        nlmsg_trim(skb, b);
1248        return -1;
1249}
1250
1251static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1252{
1253        return (q->flags & TCQ_F_BUILTIN) ? true : false;
1254}
1255
1256static int qdisc_notify(struct net *net, struct sk_buff *oskb,
1257                        struct nlmsghdr *n, u32 clid,
1258                        struct Qdisc *old, struct Qdisc *new)
1259{
1260        struct sk_buff *skb;
1261        u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1262
1263        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1264        if (!skb)
1265                return -ENOBUFS;
1266
1267        if (old && !tc_qdisc_dump_ignore(old)) {
1268                if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
1269                                  0, RTM_DELQDISC) < 0)
1270                        goto err_out;
1271        }
1272        if (new && !tc_qdisc_dump_ignore(new)) {
1273                if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
1274                                  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1275                        goto err_out;
1276        }
1277
1278        if (skb->len)
1279                return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1280                                      n->nlmsg_flags & NLM_F_ECHO);
1281
1282err_out:
1283        kfree_skb(skb);
1284        return -EINVAL;
1285}
1286
1287static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1288                              struct netlink_callback *cb,
1289                              int *q_idx_p, int s_q_idx)
1290{
1291        int ret = 0, q_idx = *q_idx_p;
1292        struct Qdisc *q;
1293
1294        if (!root)
1295                return 0;
1296
1297        q = root;
1298        if (q_idx < s_q_idx) {
1299                q_idx++;
1300        } else {
1301                if (!tc_qdisc_dump_ignore(q) &&
1302                    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1303                                  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1304                        goto done;
1305                q_idx++;
1306        }
1307        list_for_each_entry(q, &root->list, list) {
1308                if (q_idx < s_q_idx) {
1309                        q_idx++;
1310                        continue;
1311                }
1312                if (!tc_qdisc_dump_ignore(q) &&
1313                    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1314                                  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1315                        goto done;
1316                q_idx++;
1317        }
1318
1319out:
1320        *q_idx_p = q_idx;
1321        return ret;
1322done:
1323        ret = -1;
1324        goto out;
1325}
1326
1327static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1328{
1329        struct net *net = sock_net(skb->sk);
1330        int idx, q_idx;
1331        int s_idx, s_q_idx;
1332        struct net_device *dev;
1333
1334        s_idx = cb->args[0];
1335        s_q_idx = q_idx = cb->args[1];
1336
1337        rcu_read_lock();
1338        idx = 0;
1339        for_each_netdev_rcu(net, dev) {
1340                struct netdev_queue *dev_queue;
1341
1342                if (idx < s_idx)
1343                        goto cont;
1344                if (idx > s_idx)
1345                        s_q_idx = 0;
1346                q_idx = 0;
1347
1348                if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
1349                        goto done;
1350
1351                dev_queue = dev_ingress_queue(dev);
1352                if (dev_queue &&
1353                    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1354                                       &q_idx, s_q_idx) < 0)
1355                        goto done;
1356
1357cont:
1358                idx++;
1359        }
1360
1361done:
1362        rcu_read_unlock();
1363
1364        cb->args[0] = idx;
1365        cb->args[1] = q_idx;
1366
1367        return skb->len;
1368}
1369
1370
1371
1372/************************************************
1373 *      Traffic classes manipulation.           *
1374 ************************************************/
1375
1376
1377
1378static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n)
1379{
1380        struct net *net = sock_net(skb->sk);
1381        struct tcmsg *tcm = nlmsg_data(n);
1382        struct nlattr *tca[TCA_MAX + 1];
1383        struct net_device *dev;
1384        struct Qdisc *q = NULL;
1385        const struct Qdisc_class_ops *cops;
1386        unsigned long cl = 0;
1387        unsigned long new_cl;
1388        u32 portid;
1389        u32 clid;
1390        u32 qid;
1391        int err;
1392
1393        if ((n->nlmsg_type != RTM_GETTCLASS) && !capable(CAP_NET_ADMIN))
1394                return -EPERM;
1395
1396        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1397        if (err < 0)
1398                return err;
1399
1400        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1401        if (!dev)
1402                return -ENODEV;
1403
1404        /*
1405           parent == TC_H_UNSPEC - unspecified parent.
1406           parent == TC_H_ROOT   - class is root, which has no parent.
1407           parent == X:0         - parent is root class.
1408           parent == X:Y         - parent is a node in hierarchy.
1409           parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
1410
1411           handle == 0:0         - generate handle from kernel pool.
1412           handle == 0:Y         - class is X:Y, where X:0 is qdisc.
1413           handle == X:Y         - clear.
1414           handle == X:0         - root class.
1415         */
1416
1417        /* Step 1. Determine qdisc handle X:0 */
1418
1419        portid = tcm->tcm_parent;
1420        clid = tcm->tcm_handle;
1421        qid = TC_H_MAJ(clid);
1422
1423        if (portid != TC_H_ROOT) {
1424                u32 qid1 = TC_H_MAJ(portid);
1425
1426                if (qid && qid1) {
1427                        /* If both majors are known, they must be identical. */
1428                        if (qid != qid1)
1429                                return -EINVAL;
1430                } else if (qid1) {
1431                        qid = qid1;
1432                } else if (qid == 0)
1433                        qid = dev->qdisc->handle;
1434
1435                /* Now qid is genuine qdisc handle consistent
1436                 * both with parent and child.
1437                 *
1438                 * TC_H_MAJ(portid) still may be unspecified, complete it now.
1439                 */
1440                if (portid)
1441                        portid = TC_H_MAKE(qid, portid);
1442        } else {
1443                if (qid == 0)
1444                        qid = dev->qdisc->handle;
1445        }
1446
1447        /* OK. Locate qdisc */
1448        q = qdisc_lookup(dev, qid);
1449        if (!q)
1450                return -ENOENT;
1451
1452        /* An check that it supports classes */
1453        cops = q->ops->cl_ops;
1454        if (cops == NULL)
1455                return -EINVAL;
1456
1457        /* Now try to get class */
1458        if (clid == 0) {
1459                if (portid == TC_H_ROOT)
1460                        clid = qid;
1461        } else
1462                clid = TC_H_MAKE(qid, clid);
1463
1464        if (clid)
1465                cl = cops->get(q, clid);
1466
1467        if (cl == 0) {
1468                err = -ENOENT;
1469                if (n->nlmsg_type != RTM_NEWTCLASS ||
1470                    !(n->nlmsg_flags & NLM_F_CREATE))
1471                        goto out;
1472        } else {
1473                switch (n->nlmsg_type) {
1474                case RTM_NEWTCLASS:
1475                        err = -EEXIST;
1476                        if (n->nlmsg_flags & NLM_F_EXCL)
1477                                goto out;
1478                        break;
1479                case RTM_DELTCLASS:
1480                        err = -EOPNOTSUPP;
1481                        if (cops->delete)
1482                                err = cops->delete(q, cl);
1483                        if (err == 0)
1484                                tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
1485                        goto out;
1486                case RTM_GETTCLASS:
1487                        err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
1488                        goto out;
1489                default:
1490                        err = -EINVAL;
1491                        goto out;
1492                }
1493        }
1494
1495        new_cl = cl;
1496        err = -EOPNOTSUPP;
1497        if (cops->change)
1498                err = cops->change(q, clid, portid, tca, &new_cl);
1499        if (err == 0)
1500                tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
1501
1502out:
1503        if (cl)
1504                cops->put(q, cl);
1505
1506        return err;
1507}
1508
1509
1510static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1511                          unsigned long cl,
1512                          u32 portid, u32 seq, u16 flags, int event)
1513{
1514        struct tcmsg *tcm;
1515        struct nlmsghdr  *nlh;
1516        unsigned char *b = skb_tail_pointer(skb);
1517        struct gnet_dump d;
1518        const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1519
1520        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1521        if (!nlh)
1522                goto out_nlmsg_trim;
1523        tcm = nlmsg_data(nlh);
1524        tcm->tcm_family = AF_UNSPEC;
1525        tcm->tcm__pad1 = 0;
1526        tcm->tcm__pad2 = 0;
1527        tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1528        tcm->tcm_parent = q->handle;
1529        tcm->tcm_handle = q->handle;
1530        tcm->tcm_info = 0;
1531        if (nla_put_string(skb, TCA_KIND, q->ops->id))
1532                goto nla_put_failure;
1533        if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1534                goto nla_put_failure;
1535
1536        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1537                                         qdisc_root_sleeping_lock(q), &d) < 0)
1538                goto nla_put_failure;
1539
1540        if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1541                goto nla_put_failure;
1542
1543        if (gnet_stats_finish_copy(&d) < 0)
1544                goto nla_put_failure;
1545
1546        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1547        return skb->len;
1548
1549out_nlmsg_trim:
1550nla_put_failure:
1551        nlmsg_trim(skb, b);
1552        return -1;
1553}
1554
1555static int tclass_notify(struct net *net, struct sk_buff *oskb,
1556                         struct nlmsghdr *n, struct Qdisc *q,
1557                         unsigned long cl, int event)
1558{
1559        struct sk_buff *skb;
1560        u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1561
1562        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1563        if (!skb)
1564                return -ENOBUFS;
1565
1566        if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1567                kfree_skb(skb);
1568                return -EINVAL;
1569        }
1570
1571        return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1572                              n->nlmsg_flags & NLM_F_ECHO);
1573}
1574
1575struct qdisc_dump_args {
1576        struct qdisc_walker     w;
1577        struct sk_buff          *skb;
1578        struct netlink_callback *cb;
1579};
1580
1581static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1582{
1583        struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1584
1585        return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
1586                              a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1587}
1588
1589static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1590                                struct tcmsg *tcm, struct netlink_callback *cb,
1591                                int *t_p, int s_t)
1592{
1593        struct qdisc_dump_args arg;
1594
1595        if (tc_qdisc_dump_ignore(q) ||
1596            *t_p < s_t || !q->ops->cl_ops ||
1597            (tcm->tcm_parent &&
1598             TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1599                (*t_p)++;
1600                return 0;
1601        }
1602        if (*t_p > s_t)
1603                memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1604        arg.w.fn = qdisc_class_dump;
1605        arg.skb = skb;
1606        arg.cb = cb;
1607        arg.w.stop  = 0;
1608        arg.w.skip = cb->args[1];
1609        arg.w.count = 0;
1610        q->ops->cl_ops->walk(q, &arg.w);
1611        cb->args[1] = arg.w.count;
1612        if (arg.w.stop)
1613                return -1;
1614        (*t_p)++;
1615        return 0;
1616}
1617
1618static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1619                               struct tcmsg *tcm, struct netlink_callback *cb,
1620                               int *t_p, int s_t)
1621{
1622        struct Qdisc *q;
1623
1624        if (!root)
1625                return 0;
1626
1627        if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1628                return -1;
1629
1630        list_for_each_entry(q, &root->list, list) {
1631                if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1632                        return -1;
1633        }
1634
1635        return 0;
1636}
1637
1638static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1639{
1640        struct tcmsg *tcm = nlmsg_data(cb->nlh);
1641        struct net *net = sock_net(skb->sk);
1642        struct netdev_queue *dev_queue;
1643        struct net_device *dev;
1644        int t, s_t;
1645
1646        if (nlmsg_len(cb->nlh) < sizeof(*tcm))
1647                return 0;
1648        dev = dev_get_by_index(net, tcm->tcm_ifindex);
1649        if (!dev)
1650                return 0;
1651
1652        s_t = cb->args[0];
1653        t = 0;
1654
1655        if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
1656                goto done;
1657
1658        dev_queue = dev_ingress_queue(dev);
1659        if (dev_queue &&
1660            tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
1661                                &t, s_t) < 0)
1662                goto done;
1663
1664done:
1665        cb->args[0] = t;
1666
1667        dev_put(dev);
1668        return skb->len;
1669}
1670
1671/* Main classifier routine: scans classifier chain attached
1672 * to this qdisc, (optionally) tests for protocol and asks
1673 * specific classifiers.
1674 */
1675int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
1676                       struct tcf_result *res)
1677{
1678        __be16 protocol = skb->protocol;
1679        int err;
1680
1681        for (; tp; tp = tp->next) {
1682                if (tp->protocol != protocol &&
1683                    tp->protocol != htons(ETH_P_ALL))
1684                        continue;
1685                err = tp->classify(skb, tp, res);
1686
1687                if (err >= 0) {
1688#ifdef CONFIG_NET_CLS_ACT
1689                        if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1690                                skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1691#endif
1692                        return err;
1693                }
1694        }
1695        return -1;
1696}
1697EXPORT_SYMBOL(tc_classify_compat);
1698
/*
 * Classify @skb against the chain starting at @tp.
 *
 * Without CONFIG_NET_CLS_ACT this is a plain call to tc_classify_compat().
 * With it, a TC_ACT_RECLASSIFY result restarts classification from the
 * head of the chain; the number of restarts is counted in skb->tc_verd and
 * once it has reached MAX_REC_LOOP a rate-limited notice is logged and
 * TC_ACT_SHOT is returned.
 *
 * Returns the classification result otherwise.
 */
int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
		struct tcf_result *res)
{
	int err;
#ifdef CONFIG_NET_CLS_ACT
	const struct tcf_proto *head = tp;
#endif

	for (;;) {
		err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
		if (err == TC_ACT_RECLASSIFY) {
			u32 verd = G_TC_VERD(skb->tc_verd);

			tp = head;

			if (verd >= MAX_REC_LOOP) {
				net_notice_ratelimited("%s: packet reclassify loop rule prio %u protocol %02x\n",
						       tp->q->ops->id,
						       tp->prio & 0xffff,
						       ntohs(tp->protocol));
				return TC_ACT_SHOT;
			}

			/* Record one more pass and go around again. */
			verd++;
			skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
			continue;
		}
#endif
		return err;
	}
}
EXPORT_SYMBOL(tc_classify);
1728
1729void tcf_destroy(struct tcf_proto *tp)
1730{
1731        tp->ops->destroy(tp);
1732        module_put(tp->ops->owner);
1733        kfree(tp);
1734}
1735
1736void tcf_destroy_chain(struct tcf_proto **fl)
1737{
1738        struct tcf_proto *tp;
1739
1740        while ((tp = *fl) != NULL) {
1741                *fl = tp->next;
1742                tcf_destroy(tp);
1743        }
1744}
1745EXPORT_SYMBOL(tcf_destroy_chain);
1746
1747#ifdef CONFIG_PROC_FS
1748static int psched_show(struct seq_file *seq, void *v)
1749{
1750        struct timespec ts;
1751
1752        hrtimer_get_res(CLOCK_MONOTONIC, &ts);
1753        seq_printf(seq, "%08x %08x %08x %08x\n",
1754                   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
1755                   1000000,
1756                   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
1757
1758        return 0;
1759}
1760
1761static int psched_open(struct inode *inode, struct file *file)
1762{
1763        return single_open(file, psched_show, NULL);
1764}
1765
1766static const struct file_operations psched_fops = {
1767        .owner = THIS_MODULE,
1768        .open = psched_open,
1769        .read  = seq_read,
1770        .llseek = seq_lseek,
1771        .release = single_release,
1772};
1773
1774static int __net_init psched_net_init(struct net *net)
1775{
1776        struct proc_dir_entry *e;
1777
1778        e = proc_create("psched", 0, net->proc_net, &psched_fops);
1779        if (e == NULL)
1780                return -ENOMEM;
1781
1782        return 0;
1783}
1784
1785static void __net_exit psched_net_exit(struct net *net)
1786{
1787        remove_proc_entry("psched", net->proc_net);
1788}
1789#else
1790static int __net_init psched_net_init(struct net *net)
1791{
1792        return 0;
1793}
1794
1795static void __net_exit psched_net_exit(struct net *net)
1796{
1797}
1798#endif
1799
1800static struct pernet_operations psched_net_ops = {
1801        .init = psched_net_init,
1802        .exit = psched_net_exit,
1803};
1804
1805static int __init pktsched_init(void)
1806{
1807        int err;
1808
1809        err = register_pernet_subsys(&psched_net_ops);
1810        if (err) {
1811                pr_err("pktsched_init: "
1812                       "cannot initialize per netns operations\n");
1813                return err;
1814        }
1815
1816        register_qdisc(&pfifo_qdisc_ops);
1817        register_qdisc(&bfifo_qdisc_ops);
1818        register_qdisc(&pfifo_head_drop_qdisc_ops);
1819        register_qdisc(&mq_qdisc_ops);
1820
1821        rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
1822        rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
1823        rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL);
1824        rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
1825        rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
1826        rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL);
1827
1828        return 0;
1829}
1830
1831subsys_initcall(pktsched_init);
1832