linux/net/sched/sch_api.c
<<
>>
Prefs
   1/*
   2 * net/sched/sch_api.c  Packet scheduler API.
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10 *
  11 * Fixes:
  12 *
  13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
  14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
  15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
  16 */
  17
  18#include <linux/module.h>
  19#include <linux/types.h>
  20#include <linux/kernel.h>
  21#include <linux/string.h>
  22#include <linux/errno.h>
  23#include <linux/skbuff.h>
  24#include <linux/init.h>
  25#include <linux/proc_fs.h>
  26#include <linux/seq_file.h>
  27#include <linux/kmod.h>
  28#include <linux/list.h>
  29#include <linux/hrtimer.h>
  30#include <linux/lockdep.h>
  31#include <linux/slab.h>
  32
  33#include <net/net_namespace.h>
  34#include <net/sock.h>
  35#include <net/netlink.h>
  36#include <net/pkt_sched.h>
  37
/* Forward declarations of the netlink notification helpers.  They are
 * needed because code below (e.g. notify_and_destroy()) calls
 * qdisc_notify() before its definition appears.
 */
static int qdisc_notify(struct net *net, struct sk_buff *oskb,
                        struct nlmsghdr *n, u32 clid,
                        struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct net *net, struct sk_buff *oskb,
                         struct nlmsghdr *n, struct Qdisc *q,
                         unsigned long cl, int event);
  44
  45/*
  46
  47   Short review.
  48   -------------
  49
  50   This file consists of two interrelated parts:
  51
  52   1. queueing disciplines manager frontend.
  53   2. traffic classes manager frontend.
  54
  55   Generally, queueing discipline ("qdisc") is a black box,
  56   which is able to enqueue packets and to dequeue them (when
  57   device is ready to send something) in order and at times
  58   determined by algorithm hidden in it.
  59
   qdiscs are divided into two categories:
  61   - "queues", which have no internal structure visible from outside.
  62   - "schedulers", which split all the packets to "traffic classes",
  63     using "packet classifiers" (look at cls_api.c)
  64
  65   In turn, classes may have child qdiscs (as rule, queues)
  66   attached to them etc. etc. etc.
  67
   The goal of the routines in this file is to translate the
   information supplied by the user in the form of handles into
   a form more intelligible to the kernel, to perform some sanity
   checks and the part of the work that is common to all qdiscs,
   and to provide rtnetlink notifications.
  73
  74   All real intelligent work is done inside qdisc modules.
  75
  76
  77
  78   Every discipline has two major routines: enqueue and dequeue.
  79
  80   ---dequeue
  81
   dequeue usually returns an skb to send. It is allowed to return NULL,
   but that does not mean that the queue is empty; it just means that
   the discipline does not want to send anything this time.
   The queue is really empty if q->q.qlen == 0.
   For complicated disciplines with multiple queues q->q is not
   a real packet queue; however, q->q.qlen must still be valid.
  88
  89   ---enqueue
  90
   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code.
  94   NET_XMIT_DROP        - this packet dropped
  95     Expected action: do not backoff, but wait until queue will clear.
  96   NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
  97     Expected action: backoff or ignore
  98
  99   Auxiliary routines:
 100
 101   ---peek
 102
 103   like dequeue but without removing a packet from the queue
 104
 105   ---reset
 106
 107   returns qdisc to initial state: purge all buffers, clear all
 108   timers, counters (except for statistics) etc.
 109
 110   ---init
 111
 112   initializes newly created qdisc.
 113
 114   ---destroy
 115
 116   destroys resources allocated by init and during lifetime of qdisc.
 117
 118   ---change
 119
 120   changes qdisc parameters.
 121 */
 122
/* Protects the list of registered TC modules (qdisc_base); it is a pure
 * SMP lock.  Registration and unregistration take it for writing,
 * qdisc_lookup_ops() takes it for reading, and the default-qdisc
 * accessors (qdisc_get_default()/qdisc_set_default()) also run under it.
 */
static DEFINE_RWLOCK(qdisc_mod_lock);
 125
 126
 127/************************************************
 128 *      Queueing disciplines manipulation.      *
 129 ************************************************/
 130
 131
/* The list of all installed queueing disciplines: a NULL-terminated,
 * singly linked chain through Qdisc_ops::next.  register_qdisc() appends
 * at the tail, unregister_qdisc() unlinks.
 */

static struct Qdisc_ops *qdisc_base;
 135
 136/* Register/unregister queueing discipline */
 137
 138int register_qdisc(struct Qdisc_ops *qops)
 139{
 140        struct Qdisc_ops *q, **qp;
 141        int rc = -EEXIST;
 142
 143        write_lock(&qdisc_mod_lock);
 144        for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
 145                if (!strcmp(qops->id, q->id))
 146                        goto out;
 147
 148        if (qops->enqueue == NULL)
 149                qops->enqueue = noop_qdisc_ops.enqueue;
 150        if (qops->peek == NULL) {
 151                if (qops->dequeue == NULL)
 152                        qops->peek = noop_qdisc_ops.peek;
 153                else
 154                        goto out_einval;
 155        }
 156        if (qops->dequeue == NULL)
 157                qops->dequeue = noop_qdisc_ops.dequeue;
 158
 159        if (qops->cl_ops) {
 160                const struct Qdisc_class_ops *cops = qops->cl_ops;
 161
 162                if (!(cops->get && cops->put && cops->walk && cops->leaf))
 163                        goto out_einval;
 164
 165                if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
 166                        goto out_einval;
 167        }
 168
 169        qops->next = NULL;
 170        *qp = qops;
 171        rc = 0;
 172out:
 173        write_unlock(&qdisc_mod_lock);
 174        return rc;
 175
 176out_einval:
 177        rc = -EINVAL;
 178        goto out;
 179}
 180EXPORT_SYMBOL(register_qdisc);
 181
 182int unregister_qdisc(struct Qdisc_ops *qops)
 183{
 184        struct Qdisc_ops *q, **qp;
 185        int err = -ENOENT;
 186
 187        write_lock(&qdisc_mod_lock);
 188        for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
 189                if (q == qops)
 190                        break;
 191        if (q) {
 192                *qp = q->next;
 193                q->next = NULL;
 194                err = 0;
 195        }
 196        write_unlock(&qdisc_mod_lock);
 197        return err;
 198}
 199EXPORT_SYMBOL(unregister_qdisc);
 200
 201/* Get default qdisc if not otherwise specified */
 202void qdisc_get_default(char *name, size_t len)
 203{
 204        read_lock(&qdisc_mod_lock);
 205        strlcpy(name, default_qdisc_ops->id, len);
 206        read_unlock(&qdisc_mod_lock);
 207}
 208
 209static struct Qdisc_ops *qdisc_lookup_default(const char *name)
 210{
 211        struct Qdisc_ops *q = NULL;
 212
 213        for (q = qdisc_base; q; q = q->next) {
 214                if (!strcmp(name, q->id)) {
 215                        if (!try_module_get(q->owner))
 216                                q = NULL;
 217                        break;
 218                }
 219        }
 220
 221        return q;
 222}
 223
 224/* Set new default qdisc to use */
 225int qdisc_set_default(const char *name)
 226{
 227        const struct Qdisc_ops *ops;
 228
 229        if (!capable(CAP_NET_ADMIN))
 230                return -EPERM;
 231
 232        write_lock(&qdisc_mod_lock);
 233        ops = qdisc_lookup_default(name);
 234        if (!ops) {
 235                /* Not found, drop lock and try to load module */
 236                write_unlock(&qdisc_mod_lock);
 237                request_module("sch_%s", name);
 238                write_lock(&qdisc_mod_lock);
 239
 240                ops = qdisc_lookup_default(name);
 241        }
 242
 243        if (ops) {
 244                /* Set new default */
 245                module_put(default_qdisc_ops->owner);
 246                default_qdisc_ops = ops;
 247        }
 248        write_unlock(&qdisc_mod_lock);
 249
 250        return ops ? 0 : -ENOENT;
 251}
 252
 253/* We know handle. Find qdisc among all qdisc's attached to device
 254 * (root qdisc, all its children, children of children etc.)
 255 * Note: caller either uses rtnl or rcu_read_lock()
 256 */
 257
 258static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
 259{
 260        struct Qdisc *q;
 261
 262        if (!(root->flags & TCQ_F_BUILTIN) &&
 263            root->handle == handle)
 264                return root;
 265
 266        list_for_each_entry_rcu(q, &root->list, list) {
 267                if (q->handle == handle)
 268                        return q;
 269        }
 270        return NULL;
 271}
 272
 273void qdisc_list_add(struct Qdisc *q)
 274{
 275        if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
 276                struct Qdisc *root = qdisc_dev(q)->qdisc;
 277
 278                WARN_ON_ONCE(root == &noop_qdisc);
 279                ASSERT_RTNL();
 280                list_add_tail_rcu(&q->list, &root->list);
 281        }
 282}
 283EXPORT_SYMBOL(qdisc_list_add);
 284
 285void qdisc_list_del(struct Qdisc *q)
 286{
 287        if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
 288                ASSERT_RTNL();
 289                list_del_rcu(&q->list);
 290        }
 291}
 292EXPORT_SYMBOL(qdisc_list_del);
 293
 294struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
 295{
 296        struct Qdisc *q;
 297
 298        q = qdisc_match_from_root(dev->qdisc, handle);
 299        if (q)
 300                goto out;
 301
 302        if (dev_ingress_queue(dev))
 303                q = qdisc_match_from_root(
 304                        dev_ingress_queue(dev)->qdisc_sleeping,
 305                        handle);
 306out:
 307        return q;
 308}
 309
 310static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
 311{
 312        unsigned long cl;
 313        struct Qdisc *leaf;
 314        const struct Qdisc_class_ops *cops = p->ops->cl_ops;
 315
 316        if (cops == NULL)
 317                return NULL;
 318        cl = cops->get(p, classid);
 319
 320        if (cl == 0)
 321                return NULL;
 322        leaf = cops->leaf(p, cl);
 323        cops->put(p, cl);
 324        return leaf;
 325}
 326
 327/* Find queueing discipline by name */
 328
 329static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
 330{
 331        struct Qdisc_ops *q = NULL;
 332
 333        if (kind) {
 334                read_lock(&qdisc_mod_lock);
 335                for (q = qdisc_base; q; q = q->next) {
 336                        if (nla_strcmp(kind, q->id) == 0) {
 337                                if (!try_module_get(q->owner))
 338                                        q = NULL;
 339                                break;
 340                        }
 341                }
 342                read_unlock(&qdisc_mod_lock);
 343        }
 344        return q;
 345}
 346
 347/* The linklayer setting were not transferred from iproute2, in older
 348 * versions, and the rate tables lookup systems have been dropped in
 349 * the kernel. To keep backward compatible with older iproute2 tc
 350 * utils, we detect the linklayer setting by detecting if the rate
 351 * table were modified.
 352 *
 353 * For linklayer ATM table entries, the rate table will be aligned to
 354 * 48 bytes, thus some table entries will contain the same value.  The
 355 * mpu (min packet unit) is also encoded into the old rate table, thus
 356 * starting from the mpu, we find low and high table entries for
 * mapping this cell.  If these entries contain the same value, then
 * the rate tables have been modified for linklayer ATM.
 359 *
 360 * This is done by rounding mpu to the nearest 48 bytes cell/entry,
 361 * and then roundup to the next cell, calc the table entry one below,
 362 * and compare.
 363 */
 364static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
 365{
 366        int low       = roundup(r->mpu, 48);
 367        int high      = roundup(low+1, 48);
 368        int cell_low  = low >> r->cell_log;
 369        int cell_high = (high >> r->cell_log) - 1;
 370
 371        /* rtab is too inaccurate at rates > 100Mbit/s */
 372        if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
 373                pr_debug("TC linklayer: Giving up ATM detection\n");
 374                return TC_LINKLAYER_ETHERNET;
 375        }
 376
 377        if ((cell_high > cell_low) && (cell_high < 256)
 378            && (rtab[cell_low] == rtab[cell_high])) {
 379                pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
 380                         cell_low, cell_high, rtab[cell_high]);
 381                return TC_LINKLAYER_ATM;
 382        }
 383        return TC_LINKLAYER_ETHERNET;
 384}
 385
/* Head of the singly linked list (via ->next) of shared, reference-counted
 * rate tables.  qdisc_get_rtab() pushes new tables at the head and
 * qdisc_put_rtab() unlinks them when the last reference goes away.
 * NOTE(review): neither function takes a lock here; presumably callers
 * are serialized by RTNL -- confirm.
 */
static struct qdisc_rate_table *qdisc_rtab_list;
 387
 388struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
 389{
 390        struct qdisc_rate_table *rtab;
 391
 392        if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
 393            nla_len(tab) != TC_RTAB_SIZE)
 394                return NULL;
 395
 396        for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
 397                if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
 398                    !memcmp(&rtab->data, nla_data(tab), 1024)) {
 399                        rtab->refcnt++;
 400                        return rtab;
 401                }
 402        }
 403
 404        rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
 405        if (rtab) {
 406                rtab->rate = *r;
 407                rtab->refcnt = 1;
 408                memcpy(rtab->data, nla_data(tab), 1024);
 409                if (r->linklayer == TC_LINKLAYER_UNAWARE)
 410                        r->linklayer = __detect_linklayer(r, rtab->data);
 411                rtab->next = qdisc_rtab_list;
 412                qdisc_rtab_list = rtab;
 413        }
 414        return rtab;
 415}
 416EXPORT_SYMBOL(qdisc_get_rtab);
 417
 418void qdisc_put_rtab(struct qdisc_rate_table *tab)
 419{
 420        struct qdisc_rate_table *rtab, **rtabp;
 421
 422        if (!tab || --tab->refcnt)
 423                return;
 424
 425        for (rtabp = &qdisc_rtab_list;
 426             (rtab = *rtabp) != NULL;
 427             rtabp = &rtab->next) {
 428                if (rtab == tab) {
 429                        *rtabp = rtab->next;
 430                        kfree(rtab);
 431                        return;
 432                }
 433        }
 434}
 435EXPORT_SYMBOL(qdisc_put_rtab);
 436
/* All shared size tables, linked through qdisc_size_table::list. */
static LIST_HEAD(qdisc_stab_list);
/* Protects qdisc_stab_list and the refcnt of the tables on it. */
static DEFINE_SPINLOCK(qdisc_stab_lock);

/* Netlink policy for the attributes nested inside TCA_STAB. */
static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
        [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
        [TCA_STAB_DATA] = { .type = NLA_BINARY },
};
 444
 445static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
 446{
 447        struct nlattr *tb[TCA_STAB_MAX + 1];
 448        struct qdisc_size_table *stab;
 449        struct tc_sizespec *s;
 450        unsigned int tsize = 0;
 451        u16 *tab = NULL;
 452        int err;
 453
 454        err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
 455        if (err < 0)
 456                return ERR_PTR(err);
 457        if (!tb[TCA_STAB_BASE])
 458                return ERR_PTR(-EINVAL);
 459
 460        s = nla_data(tb[TCA_STAB_BASE]);
 461
 462        if (s->tsize > 0) {
 463                if (!tb[TCA_STAB_DATA])
 464                        return ERR_PTR(-EINVAL);
 465                tab = nla_data(tb[TCA_STAB_DATA]);
 466                tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
 467        }
 468
 469        if (tsize != s->tsize || (!tab && tsize > 0))
 470                return ERR_PTR(-EINVAL);
 471
 472        spin_lock(&qdisc_stab_lock);
 473
 474        list_for_each_entry(stab, &qdisc_stab_list, list) {
 475                if (memcmp(&stab->szopts, s, sizeof(*s)))
 476                        continue;
 477                if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
 478                        continue;
 479                stab->refcnt++;
 480                spin_unlock(&qdisc_stab_lock);
 481                return stab;
 482        }
 483
 484        spin_unlock(&qdisc_stab_lock);
 485
 486        stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
 487        if (!stab)
 488                return ERR_PTR(-ENOMEM);
 489
 490        stab->refcnt = 1;
 491        stab->szopts = *s;
 492        if (tsize > 0)
 493                memcpy(stab->data, tab, tsize * sizeof(u16));
 494
 495        spin_lock(&qdisc_stab_lock);
 496        list_add_tail(&stab->list, &qdisc_stab_list);
 497        spin_unlock(&qdisc_stab_lock);
 498
 499        return stab;
 500}
 501
 502static void stab_kfree_rcu(struct rcu_head *head)
 503{
 504        kfree(container_of(head, struct qdisc_size_table, rcu));
 505}
 506
 507void qdisc_put_stab(struct qdisc_size_table *tab)
 508{
 509        if (!tab)
 510                return;
 511
 512        spin_lock(&qdisc_stab_lock);
 513
 514        if (--tab->refcnt == 0) {
 515                list_del(&tab->list);
 516                call_rcu_bh(&tab->rcu, stab_kfree_rcu);
 517        }
 518
 519        spin_unlock(&qdisc_stab_lock);
 520}
 521EXPORT_SYMBOL(qdisc_put_stab);
 522
 523static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
 524{
 525        struct nlattr *nest;
 526
 527        nest = nla_nest_start(skb, TCA_STAB);
 528        if (nest == NULL)
 529                goto nla_put_failure;
 530        if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
 531                goto nla_put_failure;
 532        nla_nest_end(skb, nest);
 533
 534        return skb->len;
 535
 536nla_put_failure:
 537        return -1;
 538}
 539
 540void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
 541{
 542        int pkt_len, slot;
 543
 544        pkt_len = skb->len + stab->szopts.overhead;
 545        if (unlikely(!stab->szopts.tsize))
 546                goto out;
 547
 548        slot = pkt_len + stab->szopts.cell_align;
 549        if (unlikely(slot < 0))
 550                slot = 0;
 551
 552        slot >>= stab->szopts.cell_log;
 553        if (likely(slot < stab->szopts.tsize))
 554                pkt_len = stab->data[slot];
 555        else
 556                pkt_len = stab->data[stab->szopts.tsize - 1] *
 557                                (slot / stab->szopts.tsize) +
 558                                stab->data[slot % stab->szopts.tsize];
 559
 560        pkt_len <<= stab->szopts.size_log;
 561out:
 562        if (unlikely(pkt_len < 1))
 563                pkt_len = 1;
 564        qdisc_skb_cb(skb)->pkt_len = pkt_len;
 565}
 566EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
 567
 568void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
 569{
 570        if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
 571                pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
 572                        txt, qdisc->ops->id, qdisc->handle >> 16);
 573                qdisc->flags |= TCQ_F_WARN_NONWC;
 574        }
 575}
 576EXPORT_SYMBOL(qdisc_warn_nonwc);
 577
 578static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
 579{
 580        struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
 581                                                 timer);
 582
 583        rcu_read_lock();
 584        __netif_schedule(qdisc_root(wd->qdisc));
 585        rcu_read_unlock();
 586
 587        return HRTIMER_NORESTART;
 588}
 589
 590void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
 591{
 592        hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
 593        wd->timer.function = qdisc_watchdog;
 594        wd->qdisc = qdisc;
 595}
 596EXPORT_SYMBOL(qdisc_watchdog_init);
 597
 598void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
 599{
 600        if (test_bit(__QDISC_STATE_DEACTIVATED,
 601                     &qdisc_root_sleeping(wd->qdisc)->state))
 602                return;
 603
 604        if (wd->last_expires == expires)
 605                return;
 606
 607        wd->last_expires = expires;
 608        hrtimer_start(&wd->timer,
 609                      ns_to_ktime(expires),
 610                      HRTIMER_MODE_ABS_PINNED);
 611}
 612EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
 613
 614void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
 615{
 616        hrtimer_cancel(&wd->timer);
 617}
 618EXPORT_SYMBOL(qdisc_watchdog_cancel);
 619
 620static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
 621{
 622        unsigned int size = n * sizeof(struct hlist_head), i;
 623        struct hlist_head *h;
 624
 625        if (size <= PAGE_SIZE)
 626                h = kmalloc(size, GFP_KERNEL);
 627        else
 628                h = (struct hlist_head *)
 629                        __get_free_pages(GFP_KERNEL, get_order(size));
 630
 631        if (h != NULL) {
 632                for (i = 0; i < n; i++)
 633                        INIT_HLIST_HEAD(&h[i]);
 634        }
 635        return h;
 636}
 637
 638static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
 639{
 640        unsigned int size = n * sizeof(struct hlist_head);
 641
 642        if (size <= PAGE_SIZE)
 643                kfree(h);
 644        else
 645                free_pages((unsigned long)h, get_order(size));
 646}
 647
 648void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
 649{
 650        struct Qdisc_class_common *cl;
 651        struct hlist_node *next;
 652        struct hlist_head *nhash, *ohash;
 653        unsigned int nsize, nmask, osize;
 654        unsigned int i, h;
 655
 656        /* Rehash when load factor exceeds 0.75 */
 657        if (clhash->hashelems * 4 <= clhash->hashsize * 3)
 658                return;
 659        nsize = clhash->hashsize * 2;
 660        nmask = nsize - 1;
 661        nhash = qdisc_class_hash_alloc(nsize);
 662        if (nhash == NULL)
 663                return;
 664
 665        ohash = clhash->hash;
 666        osize = clhash->hashsize;
 667
 668        sch_tree_lock(sch);
 669        for (i = 0; i < osize; i++) {
 670                hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
 671                        h = qdisc_class_hash(cl->classid, nmask);
 672                        hlist_add_head(&cl->hnode, &nhash[h]);
 673                }
 674        }
 675        clhash->hash     = nhash;
 676        clhash->hashsize = nsize;
 677        clhash->hashmask = nmask;
 678        sch_tree_unlock(sch);
 679
 680        qdisc_class_hash_free(ohash, osize);
 681}
 682EXPORT_SYMBOL(qdisc_class_hash_grow);
 683
 684int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
 685{
 686        unsigned int size = 4;
 687
 688        clhash->hash = qdisc_class_hash_alloc(size);
 689        if (clhash->hash == NULL)
 690                return -ENOMEM;
 691        clhash->hashsize  = size;
 692        clhash->hashmask  = size - 1;
 693        clhash->hashelems = 0;
 694        return 0;
 695}
 696EXPORT_SYMBOL(qdisc_class_hash_init);
 697
 698void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
 699{
 700        qdisc_class_hash_free(clhash->hash, clhash->hashsize);
 701}
 702EXPORT_SYMBOL(qdisc_class_hash_destroy);
 703
 704void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
 705                             struct Qdisc_class_common *cl)
 706{
 707        unsigned int h;
 708
 709        INIT_HLIST_NODE(&cl->hnode);
 710        h = qdisc_class_hash(cl->classid, clhash->hashmask);
 711        hlist_add_head(&cl->hnode, &clhash->hash[h]);
 712        clhash->hashelems++;
 713}
 714EXPORT_SYMBOL(qdisc_class_hash_insert);
 715
 716void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
 717                             struct Qdisc_class_common *cl)
 718{
 719        hlist_del(&cl->hnode);
 720        clhash->hashelems--;
 721}
 722EXPORT_SYMBOL(qdisc_class_hash_remove);
 723
 724/* Allocate an unique handle from space managed by kernel
 725 * Possible range is [8000-FFFF]:0000 (0x8000 values)
 726 */
 727static u32 qdisc_alloc_handle(struct net_device *dev)
 728{
 729        int i = 0x8000;
 730        static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
 731
 732        do {
 733                autohandle += TC_H_MAKE(0x10000U, 0);
 734                if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
 735                        autohandle = TC_H_MAKE(0x80000000U, 0);
 736                if (!qdisc_lookup(dev, autohandle))
 737                        return autohandle;
 738                cond_resched();
 739        } while (--i > 0);
 740
 741        return 0;
 742}
 743
 744void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n,
 745                               unsigned int len)
 746{
 747        const struct Qdisc_class_ops *cops;
 748        unsigned long cl;
 749        u32 parentid;
 750        int drops;
 751
 752        if (n == 0 && len == 0)
 753                return;
 754        drops = max_t(int, n, 0);
 755        rcu_read_lock();
 756        while ((parentid = sch->parent)) {
 757                if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
 758                        break;
 759
 760                if (sch->flags & TCQ_F_NOPARENT)
 761                        break;
 762                /* TODO: perform the search on a per txq basis */
 763                sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
 764                if (sch == NULL) {
 765                        WARN_ON_ONCE(parentid != TC_H_ROOT);
 766                        break;
 767                }
 768                cops = sch->ops->cl_ops;
 769                if (cops->qlen_notify) {
 770                        cl = cops->get(sch, parentid);
 771                        cops->qlen_notify(sch, cl);
 772                        cops->put(sch, cl);
 773                }
 774                sch->q.qlen -= n;
 775                sch->qstats.backlog -= len;
 776                __qdisc_qstats_drop(sch, drops);
 777        }
 778        rcu_read_unlock();
 779}
 780EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
 781
 782static void notify_and_destroy(struct net *net, struct sk_buff *skb,
 783                               struct nlmsghdr *n, u32 clid,
 784                               struct Qdisc *old, struct Qdisc *new)
 785{
 786        if (new || old)
 787                qdisc_notify(net, skb, n, clid, old, new);
 788
 789        if (old)
 790                qdisc_destroy(old);
 791}
 792
/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate send a netlink notification using 'skb'
 * and "n".
 *
 * On success, destroy old qdisc.
 *
 * With parent == NULL the qdisc is attached to the device itself: to the
 * single ingress queue if either "old" or "new" is an ingress qdisc,
 * otherwise to the TX queues.  With a parent, the work is delegated to
 * the parent's class ops.
 *
 * Returns 0 on success; -ENOENT if an ingress graft is requested on a
 * device without ingress queue or if the parent has no class "classid";
 * -EOPNOTSUPP if the parent cannot graft; otherwise the error returned
 * by the parent's ->graft().
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
                       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
                       struct Qdisc *new, struct Qdisc *old)
{
        struct Qdisc *q = old;
        struct net *net = dev_net(dev);
        int err = 0;

        if (parent == NULL) {
                /* Device-level graft. */
                unsigned int i, num_q, ingress;

                ingress = 0;
                num_q = dev->num_tx_queues;
                if ((q && q->flags & TCQ_F_INGRESS) ||
                    (new && new->flags & TCQ_F_INGRESS)) {
                        /* Ingress uses exactly one queue. */
                        num_q = 1;
                        ingress = 1;
                        if (!dev_ingress_queue(dev))
                                return -ENOENT;
                }

                /* Quiesce a running device while qdiscs are swapped;
                 * it is re-activated at the end of this branch.
                 */
                if (dev->flags & IFF_UP)
                        dev_deactivate(dev);

                /* A qdisc with an ->attach() op is not grafted onto the
                 * queues here; ->attach() is invoked further down.
                 */
                if (new && new->ops->attach)
                        goto skip;

                for (i = 0; i < num_q; i++) {
                        struct netdev_queue *dev_queue = dev_ingress_queue(dev);

                        if (!ingress)
                                dev_queue = netdev_get_tx_queue(dev, i);

                        old = dev_graft_qdisc(dev_queue, new);
                        /* "new" is shared by all queues: one extra
                         * reference per queue beyond the first.
                         */
                        if (new && i > 0)
                                atomic_inc(&new->refcnt);

                        /* Egress: drop what this queue held before.  For
                         * ingress, "old" is destroyed by
                         * notify_and_destroy() below.
                         */
                        if (!ingress)
                                qdisc_destroy(old);
                }

skip:
                if (!ingress) {
                        /* The previous device root (dev->qdisc) is the one
                         * reported as replaced and then destroyed.
                         */
                        notify_and_destroy(net, skb, n, classid,
                                           dev->qdisc, new);
                        /* Reference held through dev->qdisc. */
                        if (new && !new->ops->attach)
                                atomic_inc(&new->refcnt);
                        dev->qdisc = new ? : &noop_qdisc;

                        if (new && new->ops->attach)
                                new->ops->attach(new);
                } else {
                        notify_and_destroy(net, skb, n, classid, old, new);
                }

                if (dev->flags & IFF_UP)
                        dev_activate(dev);
        } else {
                /* Class-level graft: let the parent qdisc do it. */
                const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

                err = -EOPNOTSUPP;
                if (cops && cops->graft) {
                        unsigned long cl = cops->get(parent, classid);
                        if (cl) {
                                /* ->graft() hands back the replaced qdisc
                                 * through &old.
                                 */
                                err = cops->graft(parent, cl, new, &old);
                                cops->put(parent, cl);
                        } else
                                err = -ENOENT;
                }
                if (!err)
                        notify_and_destroy(net, skb, n, classid, old, new);
        }
        return err;
}
 876
/* lockdep annotation is needed for ingress; egress gets it only for name.
 * qdisc_create() assigns qdisc_rx_lock to the lock of ingress qdiscs and
 * qdisc_tx_lock to the lock of all others via lockdep_set_class().
 */
static struct lock_class_key qdisc_tx_lock;
static struct lock_class_key qdisc_rx_lock;
 880
 881/*
 882   Allocate and initialize new qdisc.
 883
 884   Parameters are passed via opt.
 885 */
 886
 887static struct Qdisc *
 888qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
 889             struct Qdisc *p, u32 parent, u32 handle,
 890             struct nlattr **tca, int *errp)
 891{
 892        int err;
 893        struct nlattr *kind = tca[TCA_KIND];
 894        struct Qdisc *sch;
 895        struct Qdisc_ops *ops;
 896        struct qdisc_size_table *stab;
 897
 898        ops = qdisc_lookup_ops(kind);
 899#ifdef CONFIG_MODULES
 900        if (ops == NULL && kind != NULL) {
 901                char name[IFNAMSIZ];
 902                if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
 903                        /* We dropped the RTNL semaphore in order to
 904                         * perform the module load.  So, even if we
 905                         * succeeded in loading the module we have to
 906                         * tell the caller to replay the request.  We
 907                         * indicate this using -EAGAIN.
 908                         * We replay the request because the device may
 909                         * go away in the mean time.
 910                         */
 911                        rtnl_unlock();
 912                        request_module("sch_%s", name);
 913                        rtnl_lock();
 914                        ops = qdisc_lookup_ops(kind);
 915                        if (ops != NULL) {
 916                                /* We will try again qdisc_lookup_ops,
 917                                 * so don't keep a reference.
 918                                 */
 919                                module_put(ops->owner);
 920                                err = -EAGAIN;
 921                                goto err_out;
 922                        }
 923                }
 924        }
 925#endif
 926
 927        err = -ENOENT;
 928        if (ops == NULL)
 929                goto err_out;
 930
 931        sch = qdisc_alloc(dev_queue, ops);
 932        if (IS_ERR(sch)) {
 933                err = PTR_ERR(sch);
 934                goto err_out2;
 935        }
 936
 937        sch->parent = parent;
 938
 939        if (handle == TC_H_INGRESS) {
 940                sch->flags |= TCQ_F_INGRESS;
 941                handle = TC_H_MAKE(TC_H_INGRESS, 0);
 942                lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
 943        } else {
 944                if (handle == 0) {
 945                        handle = qdisc_alloc_handle(dev);
 946                        err = -ENOMEM;
 947                        if (handle == 0)
 948                                goto err_out3;
 949                }
 950                lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
 951                if (!netif_is_multiqueue(dev))
 952                        sch->flags |= TCQ_F_ONETXQUEUE;
 953        }
 954
 955        sch->handle = handle;
 956
 957        if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
 958                if (qdisc_is_percpu_stats(sch)) {
 959                        sch->cpu_bstats =
 960                                netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
 961                        if (!sch->cpu_bstats)
 962                                goto err_out4;
 963
 964                        sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
 965                        if (!sch->cpu_qstats)
 966                                goto err_out4;
 967                }
 968
 969                if (tca[TCA_STAB]) {
 970                        stab = qdisc_get_stab(tca[TCA_STAB]);
 971                        if (IS_ERR(stab)) {
 972                                err = PTR_ERR(stab);
 973                                goto err_out4;
 974                        }
 975                        rcu_assign_pointer(sch->stab, stab);
 976                }
 977                if (tca[TCA_RATE]) {
 978                        seqcount_t *running;
 979
 980                        err = -EOPNOTSUPP;
 981                        if (sch->flags & TCQ_F_MQROOT)
 982                                goto err_out4;
 983
 984                        if ((sch->parent != TC_H_ROOT) &&
 985                            !(sch->flags & TCQ_F_INGRESS) &&
 986                            (!p || !(p->flags & TCQ_F_MQROOT)))
 987                                running = qdisc_root_sleeping_running(sch);
 988                        else
 989                                running = &sch->running;
 990
 991                        err = gen_new_estimator(&sch->bstats,
 992                                                sch->cpu_bstats,
 993                                                &sch->rate_est,
 994                                                NULL,
 995                                                running,
 996                                                tca[TCA_RATE]);
 997                        if (err)
 998                                goto err_out4;
 999                }
1000
1001                qdisc_list_add(sch);
1002
1003                return sch;
1004        }
1005err_out3:
1006        dev_put(dev);
1007        kfree((char *) sch - sch->padded);
1008err_out2:
1009        module_put(ops->owner);
1010err_out:
1011        *errp = err;
1012        return NULL;
1013
1014err_out4:
1015        free_percpu(sch->cpu_bstats);
1016        free_percpu(sch->cpu_qstats);
1017        /*
1018         * Any broken qdiscs that would require a ops->reset() here?
1019         * The qdisc was never in action so it shouldn't be necessary.
1020         */
1021        qdisc_put_stab(rtnl_dereference(sch->stab));
1022        if (ops->destroy)
1023                ops->destroy(sch);
1024        goto err_out3;
1025}
1026
1027static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
1028{
1029        struct qdisc_size_table *ostab, *stab = NULL;
1030        int err = 0;
1031
1032        if (tca[TCA_OPTIONS]) {
1033                if (sch->ops->change == NULL)
1034                        return -EINVAL;
1035                err = sch->ops->change(sch, tca[TCA_OPTIONS]);
1036                if (err)
1037                        return err;
1038        }
1039
1040        if (tca[TCA_STAB]) {
1041                stab = qdisc_get_stab(tca[TCA_STAB]);
1042                if (IS_ERR(stab))
1043                        return PTR_ERR(stab);
1044        }
1045
1046        ostab = rtnl_dereference(sch->stab);
1047        rcu_assign_pointer(sch->stab, stab);
1048        qdisc_put_stab(ostab);
1049
1050        if (tca[TCA_RATE]) {
1051                /* NB: ignores errors from replace_estimator
1052                   because change can't be undone. */
1053                if (sch->flags & TCQ_F_MQROOT)
1054                        goto out;
1055                gen_replace_estimator(&sch->bstats,
1056                                      sch->cpu_bstats,
1057                                      &sch->rate_est,
1058                                      NULL,
1059                                      qdisc_root_sleeping_running(sch),
1060                                      tca[TCA_RATE]);
1061        }
1062out:
1063        return 0;
1064}
1065
/*
 * Walker state for check_loop(). The qdisc_walker must stay the first
 * member: check_loop_fn() casts the walker pointer back to this struct.
 */
struct check_loop_arg {
        struct qdisc_walker     w;
        struct Qdisc            *p;     /* qdisc that must not appear below q */
        int                     depth;  /* current recursion depth */
};

/* Declared ahead of check_loop(): the two functions call each other. */
static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
1073
1074static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1075{
1076        struct check_loop_arg   arg;
1077
1078        if (q->ops->cl_ops == NULL)
1079                return 0;
1080
1081        arg.w.stop = arg.w.skip = arg.w.count = 0;
1082        arg.w.fn = check_loop_fn;
1083        arg.depth = depth;
1084        arg.p = p;
1085        q->ops->cl_ops->walk(q, &arg.w);
1086        return arg.w.stop ? -ELOOP : 0;
1087}
1088
1089static int
1090check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1091{
1092        struct Qdisc *leaf;
1093        const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1094        struct check_loop_arg *arg = (struct check_loop_arg *)w;
1095
1096        leaf = cops->leaf(q, cl);
1097        if (leaf) {
1098                if (leaf == arg->p || arg->depth > 7)
1099                        return -ELOOP;
1100                return check_loop(leaf, arg->p, arg->depth + 1);
1101        }
1102        return 0;
1103}
1104
1105/*
1106 * Delete/get qdisc.
1107 */
1108
1109static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
1110{
1111        struct net *net = sock_net(skb->sk);
1112        struct tcmsg *tcm = nlmsg_data(n);
1113        struct nlattr *tca[TCA_MAX + 1];
1114        struct net_device *dev;
1115        u32 clid;
1116        struct Qdisc *q = NULL;
1117        struct Qdisc *p = NULL;
1118        int err;
1119
1120        if ((n->nlmsg_type != RTM_GETQDISC) &&
1121            !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1122                return -EPERM;
1123
1124        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1125        if (err < 0)
1126                return err;
1127
1128        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1129        if (!dev)
1130                return -ENODEV;
1131
1132        clid = tcm->tcm_parent;
1133        if (clid) {
1134                if (clid != TC_H_ROOT) {
1135                        if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1136                                p = qdisc_lookup(dev, TC_H_MAJ(clid));
1137                                if (!p)
1138                                        return -ENOENT;
1139                                q = qdisc_leaf(p, clid);
1140                        } else if (dev_ingress_queue(dev)) {
1141                                q = dev_ingress_queue(dev)->qdisc_sleeping;
1142                        }
1143                } else {
1144                        q = dev->qdisc;
1145                }
1146                if (!q)
1147                        return -ENOENT;
1148
1149                if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
1150                        return -EINVAL;
1151        } else {
1152                q = qdisc_lookup(dev, tcm->tcm_handle);
1153                if (!q)
1154                        return -ENOENT;
1155        }
1156
1157        if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1158                return -EINVAL;
1159
1160        if (n->nlmsg_type == RTM_DELQDISC) {
1161                if (!clid)
1162                        return -EINVAL;
1163                if (q->handle == 0)
1164                        return -ENOENT;
1165                err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
1166                if (err != 0)
1167                        return err;
1168        } else {
1169                qdisc_notify(net, skb, n, clid, NULL, q);
1170        }
1171        return 0;
1172}
1173
/*
 * Create/change qdisc.
 *
 * Requires CAP_NET_ADMIN. Depending on tcm_parent, tcm_handle and the
 * NLM_F_* flags of the request this either changes the parameters of
 * an existing qdisc (qdisc_change()), grafts an existing qdisc to a
 * new parent, or creates a new qdisc and grafts it.
 *
 * Returns 0 on success or a negative errno.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
{
        struct net *net = sock_net(skb->sk);
        struct tcmsg *tcm;
        struct nlattr *tca[TCA_MAX + 1];
        struct net_device *dev;
        u32 clid;
        struct Qdisc *q, *p;
        int err;

        if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
                return -EPERM;

replay:
        /*
         * Reinit, just in case something touches this.
         * Control returns here when qdisc_create() reports -EAGAIN, so
         * the message is parsed and the device looked up again.
         */
        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
        if (err < 0)
                return err;

        tcm = nlmsg_data(n);
        clid = tcm->tcm_parent;
        q = p = NULL;

        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
        if (!dev)
                return -ENODEV;


        if (clid) {
                /* Find the qdisc currently attached at the parent, if any. */
                if (clid != TC_H_ROOT) {
                        if (clid != TC_H_INGRESS) {
                                p = qdisc_lookup(dev, TC_H_MAJ(clid));
                                if (!p)
                                        return -ENOENT;
                                q = qdisc_leaf(p, clid);
                        } else if (dev_ingress_queue_create(dev)) {
                                q = dev_ingress_queue(dev)->qdisc_sleeping;
                        }
                } else {
                        q = dev->qdisc;
                }

                /* It may be default qdisc, ignore it */
                if (q && q->handle == 0)
                        q = NULL;

                if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
                        if (tcm->tcm_handle) {
                                /* Something else sits at the parent: needs REPLACE. */
                                if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
                                        return -EEXIST;
                                /* A qdisc handle has no minor part. */
                                if (TC_H_MIN(tcm->tcm_handle))
                                        return -EINVAL;
                                q = qdisc_lookup(dev, tcm->tcm_handle);
                                if (!q)
                                        goto create_n_graft;
                                if (n->nlmsg_flags & NLM_F_EXCL)
                                        return -EEXIST;
                                if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
                                        return -EINVAL;
                                /* Refuse to graft a qdisc underneath itself. */
                                if (q == p ||
                                    (p && check_loop(q, p, 0)))
                                        return -ELOOP;
                                /* Extra reference for the additional attachment. */
                                atomic_inc(&q->refcnt);
                                goto graft;
                        } else {
                                if (!q)
                                        goto create_n_graft;

                                /* This magic test requires explanation.
                                 *
                                 *   We know, that some child q is already
                                 *   attached to this parent and have choice:
                                 *   either to change it or to create/graft new one.
                                 *
                                 *   1. We are allowed to create/graft only
                                 *   if CREATE and REPLACE flags are set.
                                 *
                                 *   2. If EXCL is set, requestor wanted to say,
                                 *   that qdisc tcm_handle is not expected
                                 *   to exist, so that we choose create/graft too.
                                 *
                                 *   3. The last case is when no flags are set.
                                 *   Alas, it is sort of hole in API, we
                                 *   cannot decide what to do unambiguously.
                                 *   For now we select create/graft, if
                                 *   user gave KIND, which does not match existing.
                                 */
                                if ((n->nlmsg_flags & NLM_F_CREATE) &&
                                    (n->nlmsg_flags & NLM_F_REPLACE) &&
                                    ((n->nlmsg_flags & NLM_F_EXCL) ||
                                     (tca[TCA_KIND] &&
                                      nla_strcmp(tca[TCA_KIND], q->ops->id))))
                                        goto create_n_graft;
                        }
                }
        } else {
                /* No parent given: the qdisc must be named by its handle. */
                if (!tcm->tcm_handle)
                        return -EINVAL;
                q = qdisc_lookup(dev, tcm->tcm_handle);
        }

        /* Change qdisc parameters */
        if (q == NULL)
                return -ENOENT;
        if (n->nlmsg_flags & NLM_F_EXCL)
                return -EEXIST;
        if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
                return -EINVAL;
        err = qdisc_change(q, tca);
        if (err == 0)
                qdisc_notify(net, skb, n, clid, NULL, q);
        return err;

create_n_graft:
        if (!(n->nlmsg_flags & NLM_F_CREATE))
                return -ENOENT;
        if (clid == TC_H_INGRESS) {
                /* Ingress: tcm_parent is passed as both parent and handle. */
                if (dev_ingress_queue(dev))
                        q = qdisc_create(dev, dev_ingress_queue(dev), p,
                                         tcm->tcm_parent, tcm->tcm_parent,
                                         tca, &err);
                else
                        err = -ENOENT;
        } else {
                struct netdev_queue *dev_queue;

                /*
                 * Pick the queue: ask the parent's class ops if they
                 * provide select_queue, else use the parent's queue,
                 * else tx queue 0 of the device.
                 */
                if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
                        dev_queue = p->ops->cl_ops->select_queue(p, tcm);
                else if (p)
                        dev_queue = p->dev_queue;
                else
                        dev_queue = netdev_get_tx_queue(dev, 0);

                q = qdisc_create(dev, dev_queue, p,
                                 tcm->tcm_parent, tcm->tcm_handle,
                                 tca, &err);
        }
        if (q == NULL) {
                /* -EAGAIN: qdisc_create() loaded a module; start over. */
                if (err == -EAGAIN)
                        goto replay;
                return err;
        }

graft:
        err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
        if (err) {
                /* Drop the qdisc (or the reference taken above) again. */
                if (q)
                        qdisc_destroy(q);
                return err;
        }

        return 0;
}
1331
1332static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
1333                         u32 portid, u32 seq, u16 flags, int event)
1334{
1335        struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
1336        struct gnet_stats_queue __percpu *cpu_qstats = NULL;
1337        struct tcmsg *tcm;
1338        struct nlmsghdr  *nlh;
1339        unsigned char *b = skb_tail_pointer(skb);
1340        struct gnet_dump d;
1341        struct qdisc_size_table *stab;
1342        __u32 qlen;
1343
1344        cond_resched();
1345        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1346        if (!nlh)
1347                goto out_nlmsg_trim;
1348        tcm = nlmsg_data(nlh);
1349        tcm->tcm_family = AF_UNSPEC;
1350        tcm->tcm__pad1 = 0;
1351        tcm->tcm__pad2 = 0;
1352        tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1353        tcm->tcm_parent = clid;
1354        tcm->tcm_handle = q->handle;
1355        tcm->tcm_info = atomic_read(&q->refcnt);
1356        if (nla_put_string(skb, TCA_KIND, q->ops->id))
1357                goto nla_put_failure;
1358        if (q->ops->dump && q->ops->dump(q, skb) < 0)
1359                goto nla_put_failure;
1360        qlen = q->q.qlen;
1361
1362        stab = rtnl_dereference(q->stab);
1363        if (stab && qdisc_dump_stab(skb, stab) < 0)
1364                goto nla_put_failure;
1365
1366        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1367                                         NULL, &d, TCA_PAD) < 0)
1368                goto nla_put_failure;
1369
1370        if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
1371                goto nla_put_failure;
1372
1373        if (qdisc_is_percpu_stats(q)) {
1374                cpu_bstats = q->cpu_bstats;
1375                cpu_qstats = q->cpu_qstats;
1376        }
1377
1378        if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
1379                                  &d, cpu_bstats, &q->bstats) < 0 ||
1380            gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
1381            gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
1382                goto nla_put_failure;
1383
1384        if (gnet_stats_finish_copy(&d) < 0)
1385                goto nla_put_failure;
1386
1387        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1388        return skb->len;
1389
1390out_nlmsg_trim:
1391nla_put_failure:
1392        nlmsg_trim(skb, b);
1393        return -1;
1394}
1395
1396static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1397{
1398        return (q->flags & TCQ_F_BUILTIN) ? true : false;
1399}
1400
1401static int qdisc_notify(struct net *net, struct sk_buff *oskb,
1402                        struct nlmsghdr *n, u32 clid,
1403                        struct Qdisc *old, struct Qdisc *new)
1404{
1405        struct sk_buff *skb;
1406        u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1407
1408        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1409        if (!skb)
1410                return -ENOBUFS;
1411
1412        if (old && !tc_qdisc_dump_ignore(old)) {
1413                if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
1414                                  0, RTM_DELQDISC) < 0)
1415                        goto err_out;
1416        }
1417        if (new && !tc_qdisc_dump_ignore(new)) {
1418                if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
1419                                  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1420                        goto err_out;
1421        }
1422
1423        if (skb->len)
1424                return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1425                                      n->nlmsg_flags & NLM_F_ECHO);
1426
1427err_out:
1428        kfree_skb(skb);
1429        return -EINVAL;
1430}
1431
1432static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1433                              struct netlink_callback *cb,
1434                              int *q_idx_p, int s_q_idx)
1435{
1436        int ret = 0, q_idx = *q_idx_p;
1437        struct Qdisc *q;
1438
1439        if (!root)
1440                return 0;
1441
1442        q = root;
1443        if (q_idx < s_q_idx) {
1444                q_idx++;
1445        } else {
1446                if (!tc_qdisc_dump_ignore(q) &&
1447                    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1448                                  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1449                        goto done;
1450                q_idx++;
1451        }
1452        list_for_each_entry(q, &root->list, list) {
1453                if (q_idx < s_q_idx) {
1454                        q_idx++;
1455                        continue;
1456                }
1457                if (!tc_qdisc_dump_ignore(q) &&
1458                    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1459                                  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1460                        goto done;
1461                q_idx++;
1462        }
1463
1464out:
1465        *q_idx_p = q_idx;
1466        return ret;
1467done:
1468        ret = -1;
1469        goto out;
1470}
1471
1472static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1473{
1474        struct net *net = sock_net(skb->sk);
1475        int idx, q_idx;
1476        int s_idx, s_q_idx;
1477        struct net_device *dev;
1478
1479        s_idx = cb->args[0];
1480        s_q_idx = q_idx = cb->args[1];
1481
1482        idx = 0;
1483        ASSERT_RTNL();
1484        for_each_netdev(net, dev) {
1485                struct netdev_queue *dev_queue;
1486
1487                if (idx < s_idx)
1488                        goto cont;
1489                if (idx > s_idx)
1490                        s_q_idx = 0;
1491                q_idx = 0;
1492
1493                if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
1494                        goto done;
1495
1496                dev_queue = dev_ingress_queue(dev);
1497                if (dev_queue &&
1498                    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1499                                       &q_idx, s_q_idx) < 0)
1500                        goto done;
1501
1502cont:
1503                idx++;
1504        }
1505
1506done:
1507        cb->args[0] = idx;
1508        cb->args[1] = q_idx;
1509
1510        return skb->len;
1511}
1512
1513
1514
1515/************************************************
1516 *      Traffic classes manipulation.           *
1517 ************************************************/
1518
1519
1520
/*
 * Handle a traffic class request: RTM_NEWTCLASS, RTM_DELTCLASS or
 * RTM_GETTCLASS. Every type other than RTM_GETTCLASS requires
 * CAP_NET_ADMIN.
 *
 * The owning qdisc is derived from tcm_parent / tcm_handle, the class
 * is looked up through the qdisc's class ops, and the request is then
 * dispatched to ->delete(), ->change() or answered by a notification.
 *
 * Returns 0 or a negative errno; for RTM_GETTCLASS the result of
 * tclass_notify() is returned as is.
 */
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n)
{
        struct net *net = sock_net(skb->sk);
        struct tcmsg *tcm = nlmsg_data(n);
        struct nlattr *tca[TCA_MAX + 1];
        struct net_device *dev;
        struct Qdisc *q = NULL;
        const struct Qdisc_class_ops *cops;
        unsigned long cl = 0;
        unsigned long new_cl;
        u32 portid;     /* NOTE: despite the name this holds the parent handle */
        u32 clid;
        u32 qid;
        int err;

        if ((n->nlmsg_type != RTM_GETTCLASS) &&
            !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
                return -EPERM;

        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
        if (err < 0)
                return err;

        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
        if (!dev)
                return -ENODEV;

        /*
           parent == TC_H_UNSPEC - unspecified parent.
           parent == TC_H_ROOT   - class is root, which has no parent.
           parent == X:0         - parent is root class.
           parent == X:Y         - parent is a node in hierarchy.
           parent == 0:Y         - parent is X:Y, where X:0 is qdisc.

           handle == 0:0         - generate handle from kernel pool.
           handle == 0:Y         - class is X:Y, where X:0 is qdisc.
           handle == X:Y         - clear.
           handle == X:0         - root class.
         */

        /* Step 1. Determine qdisc handle X:0 */

        portid = tcm->tcm_parent;
        clid = tcm->tcm_handle;
        qid = TC_H_MAJ(clid);

        if (portid != TC_H_ROOT) {
                u32 qid1 = TC_H_MAJ(portid);

                if (qid && qid1) {
                        /* If both majors are known, they must be identical. */
                        if (qid != qid1)
                                return -EINVAL;
                } else if (qid1) {
                        qid = qid1;
                } else if (qid == 0)
                        /* Neither major given: use the device's root qdisc. */
                        qid = dev->qdisc->handle;

                /* Now qid is genuine qdisc handle consistent
                 * both with parent and child.
                 *
                 * TC_H_MAJ(portid) still may be unspecified, complete it now.
                 */
                if (portid)
                        portid = TC_H_MAKE(qid, portid);
        } else {
                if (qid == 0)
                        qid = dev->qdisc->handle;
        }

        /* OK. Locate qdisc */
        q = qdisc_lookup(dev, qid);
        if (!q)
                return -ENOENT;

        /* And check that it supports classes */
        cops = q->ops->cl_ops;
        if (cops == NULL)
                return -EINVAL;

        /* Now try to get class */
        if (clid == 0) {
                if (portid == TC_H_ROOT)
                        clid = qid;
        } else
                clid = TC_H_MAKE(qid, clid);

        if (clid)
                cl = cops->get(q, clid);

        if (cl == 0) {
                /* No such class: only a NEWTCLASS with CREATE may go on. */
                err = -ENOENT;
                if (n->nlmsg_type != RTM_NEWTCLASS ||
                    !(n->nlmsg_flags & NLM_F_CREATE))
                        goto out;
        } else {
                switch (n->nlmsg_type) {
                case RTM_NEWTCLASS:
                        err = -EEXIST;
                        if (n->nlmsg_flags & NLM_F_EXCL)
                                goto out;
                        break;
                case RTM_DELTCLASS:
                        err = -EOPNOTSUPP;
                        if (cops->delete)
                                err = cops->delete(q, cl);
                        if (err == 0)
                                tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
                        goto out;
                case RTM_GETTCLASS:
                        err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
                        goto out;
                default:
                        err = -EINVAL;
                        goto out;
                }
        }

        /* Create a new class or change the existing one. */
        new_cl = cl;
        err = -EOPNOTSUPP;
        if (cops->change)
                err = cops->change(q, clid, portid, tca, &new_cl);
        if (err == 0)
                tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);

out:
        /* Release the class obtained from cops->get() above. */
        if (cl)
                cops->put(q, cl);

        return err;
}
1652
1653
1654static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1655                          unsigned long cl,
1656                          u32 portid, u32 seq, u16 flags, int event)
1657{
1658        struct tcmsg *tcm;
1659        struct nlmsghdr  *nlh;
1660        unsigned char *b = skb_tail_pointer(skb);
1661        struct gnet_dump d;
1662        const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1663
1664        cond_resched();
1665        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1666        if (!nlh)
1667                goto out_nlmsg_trim;
1668        tcm = nlmsg_data(nlh);
1669        tcm->tcm_family = AF_UNSPEC;
1670        tcm->tcm__pad1 = 0;
1671        tcm->tcm__pad2 = 0;
1672        tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1673        tcm->tcm_parent = q->handle;
1674        tcm->tcm_handle = q->handle;
1675        tcm->tcm_info = 0;
1676        if (nla_put_string(skb, TCA_KIND, q->ops->id))
1677                goto nla_put_failure;
1678        if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1679                goto nla_put_failure;
1680
1681        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1682                                         NULL, &d, TCA_PAD) < 0)
1683                goto nla_put_failure;
1684
1685        if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1686                goto nla_put_failure;
1687
1688        if (gnet_stats_finish_copy(&d) < 0)
1689                goto nla_put_failure;
1690
1691        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1692        return skb->len;
1693
1694out_nlmsg_trim:
1695nla_put_failure:
1696        nlmsg_trim(skb, b);
1697        return -1;
1698}
1699
1700static int tclass_notify(struct net *net, struct sk_buff *oskb,
1701                         struct nlmsghdr *n, struct Qdisc *q,
1702                         unsigned long cl, int event)
1703{
1704        struct sk_buff *skb;
1705        u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1706
1707        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1708        if (!skb)
1709                return -ENOBUFS;
1710
1711        if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1712                kfree_skb(skb);
1713                return -EINVAL;
1714        }
1715
1716        return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1717                              n->nlmsg_flags & NLM_F_ECHO);
1718}
1719
/*
 * Walker state for class dumps. The qdisc_walker must stay the first
 * member: qdisc_class_dump() casts the walker pointer back to this
 * struct.
 */
struct qdisc_dump_args {
        struct qdisc_walker     w;
        struct sk_buff          *skb;   /* message buffer being filled */
        struct netlink_callback *cb;    /* dump request being answered */
};
1725
1726static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1727{
1728        struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1729
1730        return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
1731                              a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1732}
1733
1734static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1735                                struct tcmsg *tcm, struct netlink_callback *cb,
1736                                int *t_p, int s_t)
1737{
1738        struct qdisc_dump_args arg;
1739
1740        if (tc_qdisc_dump_ignore(q) ||
1741            *t_p < s_t || !q->ops->cl_ops ||
1742            (tcm->tcm_parent &&
1743             TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1744                (*t_p)++;
1745                return 0;
1746        }
1747        if (*t_p > s_t)
1748                memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1749        arg.w.fn = qdisc_class_dump;
1750        arg.skb = skb;
1751        arg.cb = cb;
1752        arg.w.stop  = 0;
1753        arg.w.skip = cb->args[1];
1754        arg.w.count = 0;
1755        q->ops->cl_ops->walk(q, &arg.w);
1756        cb->args[1] = arg.w.count;
1757        if (arg.w.stop)
1758                return -1;
1759        (*t_p)++;
1760        return 0;
1761}
1762
1763static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1764                               struct tcmsg *tcm, struct netlink_callback *cb,
1765                               int *t_p, int s_t)
1766{
1767        struct Qdisc *q;
1768
1769        if (!root)
1770                return 0;
1771
1772        if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1773                return -1;
1774
1775        list_for_each_entry(q, &root->list, list) {
1776                if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1777                        return -1;
1778        }
1779
1780        return 0;
1781}
1782
1783static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1784{
1785        struct tcmsg *tcm = nlmsg_data(cb->nlh);
1786        struct net *net = sock_net(skb->sk);
1787        struct netdev_queue *dev_queue;
1788        struct net_device *dev;
1789        int t, s_t;
1790
1791        if (nlmsg_len(cb->nlh) < sizeof(*tcm))
1792                return 0;
1793        dev = dev_get_by_index(net, tcm->tcm_ifindex);
1794        if (!dev)
1795                return 0;
1796
1797        s_t = cb->args[0];
1798        t = 0;
1799
1800        if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
1801                goto done;
1802
1803        dev_queue = dev_ingress_queue(dev);
1804        if (dev_queue &&
1805            tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
1806                                &t, s_t) < 0)
1807                goto done;
1808
1809done:
1810        cb->args[0] = t;
1811
1812        dev_put(dev);
1813        return skb->len;
1814}
1815
1816/* Main classifier routine: scans classifier chain attached
1817 * to this qdisc, (optionally) tests for protocol and asks
1818 * specific classifiers.
1819 */
1820int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
1821                struct tcf_result *res, bool compat_mode)
1822{
1823        __be16 protocol = tc_skb_protocol(skb);
1824#ifdef CONFIG_NET_CLS_ACT
1825        const struct tcf_proto *old_tp = tp;
1826        int limit = 0;
1827
1828reclassify:
1829#endif
1830        for (; tp; tp = rcu_dereference_bh(tp->next)) {
1831                int err;
1832
1833                if (tp->protocol != protocol &&
1834                    tp->protocol != htons(ETH_P_ALL))
1835                        continue;
1836
1837                err = tp->classify(skb, tp, res);
1838#ifdef CONFIG_NET_CLS_ACT
1839                if (unlikely(err == TC_ACT_RECLASSIFY && !compat_mode))
1840                        goto reset;
1841#endif
1842                if (err >= 0)
1843                        return err;
1844        }
1845
1846        return TC_ACT_UNSPEC; /* signal: continue lookup */
1847#ifdef CONFIG_NET_CLS_ACT
1848reset:
1849        if (unlikely(limit++ >= MAX_REC_LOOP)) {
1850                net_notice_ratelimited("%s: reclassify loop, rule prio %u, protocol %02x\n",
1851                                       tp->q->ops->id, tp->prio & 0xffff,
1852                                       ntohs(tp->protocol));
1853                return TC_ACT_SHOT;
1854        }
1855
1856        tp = old_tp;
1857        protocol = tc_skb_protocol(skb);
1858        goto reclassify;
1859#endif
1860}
1861EXPORT_SYMBOL(tc_classify);
1862
1863bool tcf_destroy(struct tcf_proto *tp, bool force)
1864{
1865        if (tp->ops->destroy(tp, force)) {
1866                module_put(tp->ops->owner);
1867                kfree_rcu(tp, rcu);
1868                return true;
1869        }
1870
1871        return false;
1872}
1873
1874void tcf_destroy_chain(struct tcf_proto __rcu **fl)
1875{
1876        struct tcf_proto *tp;
1877
1878        while ((tp = rtnl_dereference(*fl)) != NULL) {
1879                RCU_INIT_POINTER(*fl, tp->next);
1880                tcf_destroy(tp, true);
1881        }
1882}
1883EXPORT_SYMBOL(tcf_destroy_chain);
1884
1885#ifdef CONFIG_PROC_FS
1886static int psched_show(struct seq_file *seq, void *v)
1887{
1888        seq_printf(seq, "%08x %08x %08x %08x\n",
1889                   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
1890                   1000000,
1891                   (u32)NSEC_PER_SEC / hrtimer_resolution);
1892
1893        return 0;
1894}
1895
1896static int psched_open(struct inode *inode, struct file *file)
1897{
1898        return single_open(file, psched_show, NULL);
1899}
1900
1901static const struct file_operations psched_fops = {
1902        .owner = THIS_MODULE,
1903        .open = psched_open,
1904        .read  = seq_read,
1905        .llseek = seq_lseek,
1906        .release = single_release,
1907};
1908
1909static int __net_init psched_net_init(struct net *net)
1910{
1911        struct proc_dir_entry *e;
1912
1913        e = proc_create("psched", 0, net->proc_net, &psched_fops);
1914        if (e == NULL)
1915                return -ENOMEM;
1916
1917        return 0;
1918}
1919
1920static void __net_exit psched_net_exit(struct net *net)
1921{
1922        remove_proc_entry("psched", net->proc_net);
1923}
1924#else
1925static int __net_init psched_net_init(struct net *net)
1926{
1927        return 0;
1928}
1929
1930static void __net_exit psched_net_exit(struct net *net)
1931{
1932}
1933#endif
1934
1935static struct pernet_operations psched_net_ops = {
1936        .init = psched_net_init,
1937        .exit = psched_net_exit,
1938};
1939
1940static int __init pktsched_init(void)
1941{
1942        int err;
1943
1944        err = register_pernet_subsys(&psched_net_ops);
1945        if (err) {
1946                pr_err("pktsched_init: "
1947                       "cannot initialize per netns operations\n");
1948                return err;
1949        }
1950
1951        register_qdisc(&pfifo_fast_ops);
1952        register_qdisc(&pfifo_qdisc_ops);
1953        register_qdisc(&bfifo_qdisc_ops);
1954        register_qdisc(&pfifo_head_drop_qdisc_ops);
1955        register_qdisc(&mq_qdisc_ops);
1956        register_qdisc(&noqueue_qdisc_ops);
1957
1958        rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
1959        rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
1960        rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL);
1961        rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
1962        rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
1963        rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL);
1964
1965        return 0;
1966}
1967
1968subsys_initcall(pktsched_init);
1969