/*
 * net/sched/sch_cbq.c  Class-Based Queueing discipline.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>


/*      Class-Based Queueing (CBQ) algorithm.
        =======================================

        Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
                 Management Models for Packet Networks",
                 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995

                 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995

                 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
                 Parameters", 1996

                 [4] Sally Floyd and Michael Speer, "Experimental Results
                 for Class-Based Queueing", 1998, not published.

        -----------------------------------------------------------------------

        The algorithm skeleton was taken from the NS simulator cbq.cc.
        If someone wants to check this code against the LBL version,
        they should take into account that ONLY the skeleton was borrowed;
        the implementation is different. In particular:

        --- The WRR algorithm is different. Our version looks more
        reasonable (I hope) and works when quanta are allowed to be
        less than MTU, which is always the case when real-time classes
        have small rates. Note that the statement in [3] is
        incomplete: delay may actually be estimated even if the class
        per-round allotment is less than MTU. Namely, if the per-round
        allotment is W*r_i, and r_1+...+r_k = r < 1

        delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B

        In the worst case we have an IntServ estimate with D = W*r+k*MTU
        and C = MTU*r. The proof (if correct at all) is trivial.


        --- It seems that cbq-2.0 is not very accurate. At least, I cannot
        interpret some places, which look like wrong translations
        from NS. Anyone is welcome to find these differences
        and explain to me why I am wrong 8).

        --- Linux has no EOI event, so we cannot estimate true class
        idle time. The workaround is to treat the next dequeue event
        as a sign that the previous packet has finished. This is wrong
        because of internal device queueing, but on a permanently loaded
        link it is true. Moreover, combined with the clock integrator,
        this scheme looks very close to an ideal solution.  */

struct cbq_sched_data;


struct cbq_class {
        struct Qdisc_class_common common;
        struct cbq_class        *next_alive;    /* next class with backlog in this priority band */

/* Parameters */
        unsigned char           priority;       /* class priority */
        unsigned char           priority2;      /* priority to be used after overlimit */
        unsigned char           ewma_log;       /* time constant for idle time calculation */

        u32                     defmap;

        /* Link-sharing scheduler parameters */
        long                    maxidle;        /* Class parameters: see below. */
        long                    offtime;
        long                    minidle;
        u32                     avpkt;
        struct qdisc_rate_table *R_tab;

        /* General scheduler (WRR) parameters */
        long                    allot;
        long                    quantum;        /* Allotment per WRR round */
        long                    weight;         /* Relative allotment: see below */

        struct Qdisc            *qdisc;         /* Ptr to CBQ discipline */
        struct cbq_class        *split;         /* Ptr to split node */
        struct cbq_class        *share;         /* Ptr to LS parent in the class tree */
        struct cbq_class        *tparent;       /* Ptr to tree parent in the class tree */
        struct cbq_class        *borrow;        /* NULL if class is bandwidth limited;
                                                   parent otherwise */
        struct cbq_class        *sibling;       /* Sibling chain */
        struct cbq_class        *children;      /* Pointer to children chain */

        struct Qdisc            *q;             /* Elementary queueing discipline */


/* Variables */
        unsigned char           cpriority;      /* Effective priority */
        unsigned char           delayed;
        unsigned char           level;          /* level of the class in hierarchy:
                                                   0 for leaf classes, and maximal
                                                   level of children + 1 for nodes.
                                                 */

        psched_time_t           last;           /* Last end of service */
        psched_time_t           undertime;
        long                    avgidle;
        long                    deficit;        /* Saved deficit for WRR */
        psched_time_t           penalized;
        struct gnet_stats_basic_packed bstats;
        struct gnet_stats_queue qstats;
        struct gnet_stats_rate_est64 rate_est;
        struct tc_cbq_xstats    xstats;

        struct tcf_proto __rcu  *filter_list;

        int                     refcnt;
        int                     filters;

        struct cbq_class        *defaults[TC_PRIO_MAX + 1];
};

struct cbq_sched_data {
        struct Qdisc_class_hash clhash;                 /* Hash table of all classes */
        int                     nclasses[TC_CBQ_MAXPRIO + 1];
        unsigned int            quanta[TC_CBQ_MAXPRIO + 1];

        struct cbq_class        link;

        unsigned int            activemask;
        struct cbq_class        *active[TC_CBQ_MAXPRIO + 1];    /* List of all classes
                                                                   with backlog */

#ifdef CONFIG_NET_CLS_ACT
        struct cbq_class        *rx_class;
#endif
        struct cbq_class        *tx_class;
        struct cbq_class        *tx_borrowed;
        int                     tx_len;
        psched_time_t           now;            /* Cached timestamp */
        unsigned int            pmask;

        struct hrtimer          delay_timer;
        struct qdisc_watchdog   watchdog;       /* Watchdog timer,
                                                   started when CBQ has
                                                   backlog, but cannot
                                                   transmit just now */
        psched_tdiff_t          wd_expires;
        int                     toplevel;
        u32                     hgenerator;
};

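/* L2T ("length to time"): transmission time, in psched ticks, of a
 * len-byte packet at the class's configured rate, via its rate table.
 */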
#define L2T(cl, len)    qdisc_l2t((cl)->R_tab, len)

static inline struct cbq_class *
cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
{
        struct Qdisc_class_common *clc;

        clc = qdisc_class_find(&q->clhash, classid);
        if (clc == NULL)
                return NULL;
        return container_of(clc, struct cbq_class, common);
}

#ifdef CONFIG_NET_CLS_ACT

static struct cbq_class *
cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
{
        struct cbq_class *cl;

        for (cl = this->tparent; cl; cl = cl->tparent) {
                struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];

                if (new != NULL && new != this)
                        return new;
        }
        return NULL;
}

#endif

/* Classify packet. The procedure is pretty complicated, but
 * it allows us to combine link sharing and priority scheduling
 * transparently.
 *
 * Namely, you can put link-sharing rules (e.g. route based) at the root
 * of CBQ, so that it resolves to split nodes. Then packets are classified
 * by logical priority, or a more specific classifier may be attached
 * to the split node.
 */

static struct cbq_class *
cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
{
        struct cbq_sched_data *q = qdisc_priv(sch);
        struct cbq_class *head = &q->link;
        struct cbq_class **defmap;
        struct cbq_class *cl = NULL;
        u32 prio = skb->priority;
        struct tcf_proto *fl;
        struct tcf_result res;

        /*
         *  Step 1. If skb->priority points to one of our classes, use it.
         */
        if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
            (cl = cbq_class_lookup(q, prio)) != NULL)
                return cl;

        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
        for (;;) {
                int result = 0;
                defmap = head->defaults;

                fl = rcu_dereference_bh(head->filter_list);
                /*
                 * Step 2+n. Apply classifier.
                 */
                result = tc_classify(skb, fl, &res, true);
                if (!fl || result < 0)
                        goto fallback;

                cl = (void *)res.class;
                if (!cl) {
                        if (TC_H_MAJ(res.classid))
                                cl = cbq_class_lookup(q, res.classid);
                        else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
                                cl = defmap[TC_PRIO_BESTEFFORT];

                        if (cl == NULL)
                                goto fallback;
                }
                if (cl->level >= head->level)
                        goto fallback;
#ifdef CONFIG_NET_CLS_ACT
                switch (result) {
                case TC_ACT_QUEUED:
                case TC_ACT_STOLEN:
                        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
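                        /* fall through */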
                case TC_ACT_SHOT:
                        return NULL;
                case TC_ACT_RECLASSIFY:
                        return cbq_reclassify(skb, cl);
                }
#endif
                if (cl->level == 0)
                        return cl;

                /*
                 * Step 3+n. If the classifier selected a link-sharing class,
                 *         apply the agency-specific classifier.
                 *         Repeat this procedure until we hit a leaf node.
                 */
                head = cl;
        }

fallback:
        cl = head;

        /*
         * Step 4. No success...
         */
        if (TC_H_MAJ(prio) == 0 &&
            !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
            !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
                return head;

        return cl;
}

/*
 * A packet has just been enqueued to an empty class.
 * cbq_activate_class adds it to the tail of the active class list
 * of its priority band.
 */

static inline void cbq_activate_class(struct cbq_class *cl)
{
        struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
        int prio = cl->cpriority;
        struct cbq_class *cl_tail;

        cl_tail = q->active[prio];
        q->active[prio] = cl;

        if (cl_tail != NULL) {
                cl->next_alive = cl_tail->next_alive;
                cl_tail->next_alive = cl;
        } else {
                cl->next_alive = cl;
                q->activemask |= (1<<prio);
        }
}

/*
 * Unlink class from the active chain.
 * Note that the same unlinking is also done directly in cbq_dequeue*
 * during the round-robin procedure.
 */

static void cbq_deactivate_class(struct cbq_class *this)
{
        struct cbq_sched_data *q = qdisc_priv(this->qdisc);
        int prio = this->cpriority;
        struct cbq_class *cl;
        struct cbq_class *cl_prev = q->active[prio];

        do {
                cl = cl_prev->next_alive;
                if (cl == this) {
                        cl_prev->next_alive = cl->next_alive;
                        cl->next_alive = NULL;

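                        /* If this was the tail of the band, pull the
                         * tail back to cl_prev; if it was also the only
                         * class in the band, the band becomes empty.
                         */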
                        if (cl == q->active[prio]) {
                                q->active[prio] = cl_prev;
                                if (cl == q->active[prio]) {
                                        q->active[prio] = NULL;
                                        q->activemask &= ~(1<<prio);
                                        return;
                                }
                        }
                        return;
                }
        } while ((cl_prev = cl) != q->active[prio]);
}

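/* Top-level link-sharing guideline (cf. [1]): q->toplevel caches the
 * lowest tree level at which borrowing is currently permitted. On
 * enqueue we walk up the borrow chain from the target class; the first
 * class found that is not over limit lowers the cutoff to its level.
 */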
static void
cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
{
        int toplevel = q->toplevel;

        if (toplevel > cl->level) {
                psched_time_t now = psched_get_time();

                do {
                        if (cl->undertime < now) {
                                q->toplevel = cl->level;
                                return;
                        }
                } while ((cl = cl->borrow) != NULL && toplevel > cl->level);
        }
}

static int
cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
            struct sk_buff **to_free)
{
        struct cbq_sched_data *q = qdisc_priv(sch);
        int uninitialized_var(ret);
        struct cbq_class *cl = cbq_classify(skb, sch, &ret);

#ifdef CONFIG_NET_CLS_ACT
        q->rx_class = cl;
#endif
        if (cl == NULL) {
                if (ret & __NET_XMIT_BYPASS)
                        qdisc_qstats_drop(sch);
                __qdisc_drop(skb, to_free);
                return ret;
        }

        ret = qdisc_enqueue(skb, cl->q, to_free);
        if (ret == NET_XMIT_SUCCESS) {
                sch->q.qlen++;
                cbq_mark_toplevel(q, cl);
                if (!cl->next_alive)
                        cbq_activate_class(cl);
                return ret;
        }

        if (net_xmit_drop_count(ret)) {
                qdisc_qstats_drop(sch);
                cbq_mark_toplevel(q, cl);
                cl->qstats.drops++;
        }
        return ret;
}

/* Overlimit action: penalize leaf class by adding offtime */
static void cbq_overlimit(struct cbq_class *cl)
{
        struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
        psched_tdiff_t delay = cl->undertime - q->now;

        if (!cl->delayed) {
                delay += cl->offtime;

                /*
                 * The class goes to sleep, so that it will have no
                 * chance to work off its avgidle. Let's forgive it 8)
                 *
                 * BTW cbq-2.0 has a bug in this place; apparently the
                 * authors forgot to shift the term by cl->ewma_log.
                 */
                if (cl->avgidle < 0)
                        delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
                if (cl->avgidle < cl->minidle)
                        cl->avgidle = cl->minidle;
                if (delay <= 0)
                        delay = 1;
                cl->undertime = q->now + delay;

                cl->xstats.overactions++;
                cl->delayed = 1;
        }
        if (q->wd_expires == 0 || q->wd_expires > delay)
                q->wd_expires = delay;

        /* Dirty work! We must schedule wakeups based on the
         * real available rate, rather than the leaf rate,
         * which may be tiny (even zero).
         */
        if (q->toplevel == TC_CBQ_MAXLEVEL) {
                struct cbq_class *b;
                psched_tdiff_t base_delay = q->wd_expires;

                for (b = cl->borrow; b; b = b->borrow) {
                        delay = b->undertime - q->now;
                        if (delay < base_delay) {
                                if (delay <= 0)
                                        delay = 1;
                                base_delay = delay;
                        }
                }

                q->wd_expires = base_delay;
        }
}

static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
                                       psched_time_t now)
{
        struct cbq_class *cl;
        struct cbq_class *cl_prev = q->active[prio];
        psched_time_t sched = now;

        if (cl_prev == NULL)
                return 0;

        do {
                cl = cl_prev->next_alive;
                if (now - cl->penalized > 0) {
                        cl_prev->next_alive = cl->next_alive;
                        cl->next_alive = NULL;
                        cl->cpriority = cl->priority;
                        cl->delayed = 0;
                        cbq_activate_class(cl);

                        if (cl == q->active[prio]) {
                                q->active[prio] = cl_prev;
                                if (cl == q->active[prio]) {
                                        q->active[prio] = NULL;
                                        return 0;
                                }
                        }

                        cl = cl_prev->next_alive;
                } else if (sched - cl->penalized > 0)
                        sched = cl->penalized;
        } while ((cl_prev = cl) != q->active[prio]);

        return sched - now;
}

static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
{
        struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
                                                delay_timer);
        struct Qdisc *sch = q->watchdog.qdisc;
        psched_time_t now;
        psched_tdiff_t delay = 0;
        unsigned int pmask;

        now = psched_get_time();

        pmask = q->pmask;
        q->pmask = 0;

        while (pmask) {
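                /* ffz(~pmask) is the index of the lowest set bit in pmask,
                 * i.e. the highest-priority band with penalized classes.
                 */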
                int prio = ffz(~pmask);
                psched_tdiff_t tmp;

                pmask &= ~(1<<prio);

                tmp = cbq_undelay_prio(q, prio, now);
                if (tmp > 0) {
                        q->pmask |= 1<<prio;
                        if (tmp < delay || delay == 0)
                                delay = tmp;
                }
        }

        if (delay) {
                ktime_t time;

                time = ktime_set(0, 0);
                time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
                hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
        }

        __netif_schedule(qdisc_root(sch));
        return HRTIMER_NORESTART;
}

/*
 * This is a mission-critical procedure.
 *
 * We "regenerate" the toplevel cutoff if the transmitting class
 * has backlog and is not regulated. This is not part of the
 * original CBQ description, but it looks more reasonable.
 * Probably it is wrong; this question needs further investigation.
 */

static inline void
cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
                    struct cbq_class *borrowed)
{
        if (cl && q->toplevel >= borrowed->level) {
                if (cl->q->q.qlen > 1) {
                        do {
                                if (borrowed->undertime == PSCHED_PASTPERFECT) {
                                        q->toplevel = borrowed->level;
                                        return;
                                }
                        } while ((borrowed = borrowed->borrow) != NULL);
                }
#if 0
        /* It is not necessary now. Uncommenting it
           will save CPU cycles, but decrease fairness.
         */
                q->toplevel = TC_CBQ_MAXLEVEL;
#endif
        }
}

static void
cbq_update(struct cbq_sched_data *q)
{
        struct cbq_class *this = q->tx_class;
        struct cbq_class *cl = this;
        int len = q->tx_len;
        psched_time_t now;

        q->tx_class = NULL;
        /* Time integrator. We calculate EOS time
         * by adding the expected packet transmission time.
         */
        now = q->now + L2T(&q->link, len);

        for ( ; cl; cl = cl->share) {
                long avgidle = cl->avgidle;
                long idle;

                cl->bstats.packets++;
                cl->bstats.bytes += len;

                /*
                 * (now - last) is the total time between packet right edges.
                 * (last_pktlen/rate) is "virtual" busy time, so that
                 *
                 *      idle = (now - last) - last_pktlen/rate
                 */

                idle = now - cl->last;
                if ((unsigned long)idle > 128*1024*1024) {
                        avgidle = cl->maxidle;
                } else {
                        idle -= L2T(cl, len);

                /* true_avgidle := (1-W)*true_avgidle + W*idle,
                 * where W=2^{-ewma_log}. But cl->avgidle is scaled:
                 * cl->avgidle == true_avgidle/W,
                 * hence:
                 */
                        avgidle += idle - (avgidle>>cl->ewma_log);
                }
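
                /* Illustrative example (not in the original source):
                 * with ewma_log = 5 (W = 1/32), scaled avgidle = 3200 and
                 * measured idle = -160, the update yields
                 * 3200 + (-160) - (3200 >> 5) = 2940.
                 */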

                if (avgidle <= 0) {
                        /* Overlimit or at-limit */

                        if (avgidle < cl->minidle)
                                avgidle = cl->minidle;

                        cl->avgidle = avgidle;

                        /* Calculate the expected time when this class
                         * will be allowed to send.
                         * It will occur when:
                         * (1-W)*true_avgidle + W*delay = 0, i.e.
                         * idle = (1/W - 1)*(-true_avgidle)
                         * or
                         * idle = (1 - W)*(-cl->avgidle);
                         */
                        idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);

                        /*
                         * That is not all.
                         * To maintain the rate allocated to the class,
                         * we add to undertime the virtual clock time
                         * needed to complete the transmitted packet.
                         * (len/phys_bandwidth has already elapsed by
                         * the moment cbq_update is called.)
                         */

                        idle -= L2T(&q->link, len);
                        idle += L2T(cl, len);

                        cl->undertime = now + idle;
                } else {
                        /* Underlimit */

                        cl->undertime = PSCHED_PASTPERFECT;
                        if (avgidle > cl->maxidle)
                                cl->avgidle = cl->maxidle;
                        else
                                cl->avgidle = avgidle;
                }
                if ((s64)(now - cl->last) > 0)
                        cl->last = now;
        }

        cbq_update_toplevel(q, this, q->tx_borrowed);
}

static inline struct cbq_class *
cbq_under_limit(struct cbq_class *cl)
{
        struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
        struct cbq_class *this_cl = cl;

        if (cl->tparent == NULL)
                return cl;

        if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
                cl->delayed = 0;
                return cl;
        }

        do {
                /* This is a very suspicious place. Currently the overlimit
                 * action is generated for non-bounded classes
                 * only if the link is completely congested.
                 * Though this agrees with the ancestor-only paradigm,
                 * it looks very stupid. In particular,
                 * it means that this chunk of code will either
                 * never be called or will result in strong amplification
                 * of burstiness. Dangerous, silly, and yet
                 * no other solution exists.
                 */
                cl = cl->borrow;
                if (!cl) {
                        this_cl->qstats.overlimits++;
                        cbq_overlimit(this_cl);
                        return NULL;
                }
                if (cl->level > q->toplevel)
                        return NULL;
        } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);

        cl->delayed = 0;
        return cl;
}

static inline struct sk_buff *
cbq_dequeue_prio(struct Qdisc *sch, int prio)
{
        struct cbq_sched_data *q = qdisc_priv(sch);
        struct cbq_class *cl_tail, *cl_prev, *cl;
        struct sk_buff *skb;
        int deficit;

        cl_tail = cl_prev = q->active[prio];
        cl = cl_prev->next_alive;

        do {
                deficit = 0;

                /* Start round */
                do {
                        struct cbq_class *borrow = cl;

                        if (cl->q->q.qlen &&
                            (borrow = cbq_under_limit(cl)) == NULL)
                                goto skip_class;

                        if (cl->deficit <= 0) {
                                /* Class exhausted its allotment for
                                 * this round. Switch to the next one.
                                 */
                                deficit = 1;
                                cl->deficit += cl->quantum;
                                goto next_class;
                        }

                        skb = cl->q->dequeue(cl->q);

                        /* Class did not give us any skb :-(
                         * This can occur even if cl->q->q.qlen != 0,
                         * e.g. if cl->q is a TBF qdisc.
                         */
                        if (skb == NULL)
                                goto skip_class;

                        cl->deficit -= qdisc_pkt_len(skb);
                        q->tx_class = cl;
                        q->tx_borrowed = borrow;
                        if (borrow != cl) {
#ifndef CBQ_XSTATS_BORROWS_BYTES
                                borrow->xstats.borrows++;
                                cl->xstats.borrows++;
#else
                                borrow->xstats.borrows += qdisc_pkt_len(skb);
                                cl->xstats.borrows += qdisc_pkt_len(skb);
#endif
                        }
                        q->tx_len = qdisc_pkt_len(skb);

                        if (cl->deficit <= 0) {
                                q->active[prio] = cl;
                                cl = cl->next_alive;
                                cl->deficit += cl->quantum;
                        }
                        return skb;

skip_class:
                        if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
                                /* Class is empty or penalized.
                                 * Unlink it from the active chain.
                                 */
                                cl_prev->next_alive = cl->next_alive;
                                cl->next_alive = NULL;

                                /* Did cl_tail point to it? */
                                if (cl == cl_tail) {
                                        /* Repair it! */
                                        cl_tail = cl_prev;

                                        /* Was it the last class in this band? */
                                        if (cl == cl_tail) {
                                                /* Kill the band! */
                                                q->active[prio] = NULL;
                                                q->activemask &= ~(1<<prio);
                                                if (cl->q->q.qlen)
                                                        cbq_activate_class(cl);
                                                return NULL;
                                        }

                                        q->active[prio] = cl_tail;
                                }
                                if (cl->q->q.qlen)
                                        cbq_activate_class(cl);

                                cl = cl_prev;
                        }

next_class:
                        cl_prev = cl;
                        cl = cl->next_alive;
                } while (cl_prev != cl_tail);
        } while (deficit);

        q->active[prio] = cl_prev;

        return NULL;
}

static inline struct sk_buff *
cbq_dequeue_1(struct Qdisc *sch)
{
        struct cbq_sched_data *q = qdisc_priv(sch);
        struct sk_buff *skb;
        unsigned int activemask;

        activemask = q->activemask & 0xFF;
        while (activemask) {
                int prio = ffz(~activemask);
                activemask &= ~(1<<prio);
                skb = cbq_dequeue_prio(sch, prio);
                if (skb)
                        return skb;
        }
        return NULL;
}

static struct sk_buff *
cbq_dequeue(struct Qdisc *sch)
{
        struct sk_buff *skb;
        struct cbq_sched_data *q = qdisc_priv(sch);
        psched_time_t now;

        now = psched_get_time();

        if (q->tx_class)
                cbq_update(q);

        q->now = now;

        for (;;) {
                q->wd_expires = 0;

                skb = cbq_dequeue_1(sch);
                if (skb) {
                        qdisc_bstats_update(sch, skb);
                        sch->q.qlen--;
                        return skb;
                }

                /* All the classes are overlimit.
                 *
                 * This is possible if:
                 *
                 * 1. The scheduler is empty.
                 * 2. The toplevel cutoff inhibited borrowing.
                 * 3. The root class is overlimit.
                 *
                 * Reset the 2nd and 3rd conditions and retry.
                 *
                 * Note that NS and cbq-2.0 are buggy here: peeking
                 * an arbitrary class is appropriate for ancestor-only
                 * sharing, but not for the toplevel algorithm.
                 *
                 * Our version is better but slower, because it requires
                 * two passes; that is unavoidable with top-level sharing.
                 */

                if (q->toplevel == TC_CBQ_MAXLEVEL &&
                    q->link.undertime == PSCHED_PASTPERFECT)
                        break;

                q->toplevel = TC_CBQ_MAXLEVEL;
                q->link.undertime = PSCHED_PASTPERFECT;
        }

        /* No packets in the scheduler, or nobody wants to give them to us :-(
         * Sigh... start the watchdog timer in the latter case.
         */

        if (sch->q.qlen) {
                qdisc_qstats_overlimit(sch);
                if (q->wd_expires)
                        qdisc_watchdog_schedule(&q->watchdog,
                                                now + q->wd_expires);
        }
        return NULL;
}

/* CBQ class maintenance routines */

static void cbq_adjust_levels(struct cbq_class *this)
{
        if (this == NULL)
                return;

        do {
                int level = 0;
                struct cbq_class *cl;

                cl = this->children;
                if (cl) {
                        do {
                                if (cl->level > level)
                                        level = cl->level;
                        } while ((cl = cl->sibling) != this->children);
                }
                this->level = level + 1;
        } while ((this = this->tparent) != NULL);
}

static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
{
        struct cbq_class *cl;
        unsigned int h;

        if (q->quanta[prio] == 0)
                return;

        for (h = 0; h < q->clhash.hashsize; h++) {
                hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
                        /* BEWARE! This expression suffers from
                         * arithmetic overflows!
                         */
                        if (cl->priority == prio) {
                                cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
                                        q->quanta[prio];
                        }
                        if (cl->quantum <= 0 ||
                            cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
                                pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
                                        cl->common.classid, cl->quantum);
                                cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
                        }
                }
        }
}

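/* defmap semantics: for each TOS priority i, a split node's defaults[i]
 * points to the most specific (lowest-level) descendant class that has
 * bit i set in its defmap. cbq_sync_defmap rebuilds that table.
 */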
static void cbq_sync_defmap(struct cbq_class *cl)
{
        struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
        struct cbq_class *split = cl->split;
        unsigned int h;
        int i;

        if (split == NULL)
                return;

        for (i = 0; i <= TC_PRIO_MAX; i++) {
                if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
                        split->defaults[i] = NULL;
        }

        for (i = 0; i <= TC_PRIO_MAX; i++) {
                int level = split->level;

                if (split->defaults[i])
                        continue;

                for (h = 0; h < q->clhash.hashsize; h++) {
                        struct cbq_class *c;

                        hlist_for_each_entry(c, &q->clhash.hash[h],
                                             common.hnode) {
                                if (c->split == split && c->level < level &&
                                    c->defmap & (1<<i)) {
                                        split->defaults[i] = c;
                                        level = c->level;
                                }
                        }
                }
        }
}

static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
{
        struct cbq_class *split = NULL;

        if (splitid == 0) {
                split = cl->split;
                if (!split)
                        return;
                splitid = split->common.classid;
        }

        if (split == NULL || split->common.classid != splitid) {
                for (split = cl->tparent; split; split = split->tparent)
                        if (split->common.classid == splitid)
                                break;
        }

        if (split == NULL)
                return;

        if (cl->split != split) {
                cl->defmap = 0;
                cbq_sync_defmap(cl);
                cl->split = split;
                cl->defmap = def & mask;
        } else
                cl->defmap = (cl->defmap & ~mask) | (def & mask);

        cbq_sync_defmap(cl);
}

static void cbq_unlink_class(struct cbq_class *this)
{
        struct cbq_class *cl, **clp;
        struct cbq_sched_data *q = qdisc_priv(this->qdisc);

        qdisc_class_hash_remove(&q->clhash, &this->common);

        if (this->tparent) {
                clp = &this->sibling;
                cl = *clp;
                do {
                        if (cl == this) {
                                *clp = cl->sibling;
                                break;
                        }
                        clp = &cl->sibling;
                } while ((cl = *clp) != this->sibling);

                if (this->tparent->children == this) {
                        this->tparent->children = this->sibling;
                        if (this->sibling == this)
                                this->tparent->children = NULL;
                }
        } else {
                WARN_ON(this->sibling != this);
        }
}

static void cbq_link_class(struct cbq_class *this)
{
        struct cbq_sched_data *q = qdisc_priv(this->qdisc);
        struct cbq_class *parent = this->tparent;

        this->sibling = this;
        qdisc_class_hash_insert(&q->clhash, &this->common);

        if (parent == NULL)
                return;

        if (parent->children == NULL) {
                parent->children = this;
        } else {
                this->sibling = parent->children->sibling;
                parent->children->sibling = this;
        }
}

static void
cbq_reset(struct Qdisc *sch)
{
        struct cbq_sched_data *q = qdisc_priv(sch);
        struct cbq_class *cl;
        int prio;
        unsigned int h;

        q->activemask = 0;
        q->pmask = 0;
        q->tx_class = NULL;
        q->tx_borrowed = NULL;
        qdisc_watchdog_cancel(&q->watchdog);
        hrtimer_cancel(&q->delay_timer);
        q->toplevel = TC_CBQ_MAXLEVEL;
        q->now = psched_get_time();

        for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
                q->active[prio] = NULL;

        for (h = 0; h < q->clhash.hashsize; h++) {
                hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
                        qdisc_reset(cl->q);

                        cl->next_alive = NULL;
                        cl->undertime = PSCHED_PASTPERFECT;
                        cl->avgidle = cl->maxidle;
                        cl->deficit = cl->quantum;
                        cl->cpriority = cl->priority;
                }
        }
        sch->q.qlen = 0;
}


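/* An ISOLATED class does not lend bandwidth to siblings (no link-sharing
 * parent); a BOUNDED class never borrows from its parent when over limit.
 */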
static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
{
        if (lss->change & TCF_CBQ_LSS_FLAGS) {
                cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
                cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
        }
        if (lss->change & TCF_CBQ_LSS_EWMA)
                cl->ewma_log = lss->ewma_log;
        if (lss->change & TCF_CBQ_LSS_AVPKT)
                cl->avpkt = lss->avpkt;
        if (lss->change & TCF_CBQ_LSS_MINIDLE)
                cl->minidle = -(long)lss->minidle;
        if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
                cl->maxidle = lss->maxidle;
                cl->avgidle = lss->maxidle;
        }
        if (lss->change & TCF_CBQ_LSS_OFFTIME)
                cl->offtime = lss->offtime;
        return 0;
}

static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
{
        q->nclasses[cl->priority]--;
        q->quanta[cl->priority] -= cl->weight;
        cbq_normalize_quanta(q, cl->priority);
}

static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
{
        q->nclasses[cl->priority]++;
        q->quanta[cl->priority] += cl->weight;
        cbq_normalize_quanta(q, cl->priority);
}

static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
{
        struct cbq_sched_data *q = qdisc_priv(cl->qdisc);

        if (wrr->allot)
                cl->allot = wrr->allot;
        if (wrr->weight)
                cl->weight = wrr->weight;
        if (wrr->priority) {
                cl->priority = wrr->priority - 1;
                cl->cpriority = cl->priority;
                if (cl->priority >= cl->priority2)
                        cl->priority2 = TC_CBQ_MAXPRIO - 1;
        }

        cbq_addprio(q, cl);
        return 0;
}

static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
{
        cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
        return 0;
}

static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
        [TCA_CBQ_LSSOPT]        = { .len = sizeof(struct tc_cbq_lssopt) },
        [TCA_CBQ_WRROPT]        = { .len = sizeof(struct tc_cbq_wrropt) },
        [TCA_CBQ_FOPT]          = { .len = sizeof(struct tc_cbq_fopt) },
        [TCA_CBQ_OVL_STRATEGY]  = { .len = sizeof(struct tc_cbq_ovl) },
        [TCA_CBQ_RATE]          = { .len = sizeof(struct tc_ratespec) },
        [TCA_CBQ_RTAB]          = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
        [TCA_CBQ_POLICE]        = { .len = sizeof(struct tc_cbq_police) },
};

static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
{
        struct cbq_sched_data *q = qdisc_priv(sch);
        struct nlattr *tb[TCA_CBQ_MAX + 1];
        struct tc_ratespec *r;
        int err;

        err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
        if (err < 0)
                return err;

        if (tb[TCA_CBQ_RTAB] == NULL || tb[TCA_CBQ_RATE] == NULL)
                return -EINVAL;

        r = nla_data(tb[TCA_CBQ_RATE]);

        if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL)
                return -EINVAL;

        err = qdisc_class_hash_init(&q->clhash);
        if (err < 0)
                goto put_rtab;

        q->link.refcnt = 1;
        q->link.sibling = &q->link;
        q->link.common.classid = sch->handle;
        q->link.qdisc = sch;
        q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
                                      sch->handle);
        if (!q->link.q)
                q->link.q = &noop_qdisc;

        q->link.priority = TC_CBQ_MAXPRIO - 1;
        q->link.priority2 = TC_CBQ_MAXPRIO - 1;
        q->link.cpriority = TC_CBQ_MAXPRIO - 1;
        q->link.allot = psched_mtu(qdisc_dev(sch));
        q->link.quantum = q->link.allot;
        q->link.weight = q->link.R_tab->rate.rate;

        q->link.ewma_log = TC_CBQ_DEF_EWMA;
        q->link.avpkt = q->link.allot/2;
        q->link.minidle = -0x7FFFFFFF;

        qdisc_watchdog_init(&q->watchdog, sch);
        hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
        q->delay_timer.function = cbq_undelay;
        q->toplevel = TC_CBQ_MAXLEVEL;
        q->now = psched_get_time();

        cbq_link_class(&q->link);

        if (tb[TCA_CBQ_LSSOPT])
                cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));

        cbq_addprio(q, &q->link);
        return 0;

put_rtab:
        qdisc_put_rtab(q->link.R_tab);
        return err;
}

static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
{
        unsigned char *b = skb_tail_pointer(skb);

        if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
                goto nla_put_failure;
        return skb->len;

nla_put_failure:
        nlmsg_trim(skb, b);
        return -1;
}

static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
{
        unsigned char *b = skb_tail_pointer(skb);
        struct tc_cbq_lssopt opt;

        opt.flags = 0;
        if (cl->borrow == NULL)
                opt.flags |= TCF_CBQ_LSS_BOUNDED;
        if (cl->share == NULL)
                opt.flags |= TCF_CBQ_LSS_ISOLATED;
        opt.ewma_log = cl->ewma_log;
        opt.level = cl->level;
        opt.avpkt = cl->avpkt;
        opt.maxidle = cl->maxidle;
        opt.minidle = (u32)(-cl->minidle);
        opt.offtime = cl->offtime;
        opt.change = ~0;
        if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
                goto nla_put_failure;
        return skb->len;

nla_put_failure:
        nlmsg_trim(skb, b);
        return -1;
}

static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
{
        unsigned char *b = skb_tail_pointer(skb);
        struct tc_cbq_wrropt opt;

        memset(&opt, 0, sizeof(opt));
        opt.flags = 0;
        opt.allot = cl->allot;
        opt.priority = cl->priority + 1;
        opt.cpriority = cl->cpriority + 1;
        opt.weight = cl->weight;
        if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
                goto nla_put_failure;
        return skb->len;

nla_put_failure:
        nlmsg_trim(skb, b);
        return -1;
}

static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
{
        unsigned char *b = skb_tail_pointer(skb);
        struct tc_cbq_fopt opt;

        if (cl->split || cl->defmap) {
                opt.split = cl->split ? cl->split->common.classid : 0;
                opt.defmap = cl->defmap;
                opt.defchange = ~0;
                if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
                        goto nla_put_failure;
        }
        return skb->len;

nla_put_failure:
        nlmsg_trim(skb, b);
        return -1;
}

static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
{
        if (cbq_dump_lss(skb, cl) < 0 ||
            cbq_dump_rate(skb, cl) < 0 ||
            cbq_dump_wrr(skb, cl) < 0 ||
            cbq_dump_fopt(skb, cl) < 0)
                return -1;
        return 0;
}

static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
        struct cbq_sched_data *q = qdisc_priv(sch);
        struct nlattr *nest;

        nest = nla_nest_start(skb, TCA_OPTIONS);
        if (nest == NULL)
                goto nla_put_failure;
        if (cbq_dump_attr(skb, &q->link) < 0)
                goto nla_put_failure;
        return nla_nest_end(skb, nest);

nla_put_failure:
        nla_nest_cancel(skb, nest);
        return -1;
}

static int
cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
        struct cbq_sched_data *q = qdisc_priv(sch);

        q->link.xstats.avgidle = q->link.avgidle;
        return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
}

static int
cbq_dump_class(struct Qdisc *sch, unsigned long arg,
               struct sk_buff *skb, struct tcmsg *tcm)
{
        struct cbq_class *cl = (struct cbq_class *)arg;
        struct nlattr *nest;

        if (cl->tparent)
                tcm->tcm_parent = cl->tparent->common.classid;
        else
                tcm->tcm_parent = TC_H_ROOT;
        tcm->tcm_handle = cl->common.classid;
        tcm->tcm_info = cl->q->handle;

        nest = nla_nest_start(skb, TCA_OPTIONS);
        if (nest == NULL)
                goto nla_put_failure;
        if (cbq_dump_attr(skb, cl) < 0)
                goto nla_put_failure;
        return nla_nest_end(skb, nest);

nla_put_failure:
        nla_nest_cancel(skb, nest);
        return -1;
}

static int
cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
        struct gnet_dump *d)
{
        struct cbq_sched_data *q = qdisc_priv(sch);
        struct cbq_class *cl = (struct cbq_class *)arg;

        cl->xstats.avgidle = cl->avgidle;
        cl->xstats.undertime = 0;

        if (cl->undertime != PSCHED_PASTPERFECT)
                cl->xstats.undertime = cl->undertime - q->now;

        if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
                                  d, NULL, &cl->bstats) < 0 ||
            gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
            gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0)
                return -1;

        return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
}

static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
                     struct Qdisc **old)
{
        struct cbq_class *cl = (struct cbq_class *)arg;

        if (new == NULL) {
                new = qdisc_create_dflt(sch->dev_queue,
                                        &pfifo_qdisc_ops, cl->common.classid);
                if (new == NULL)
                        return -ENOBUFS;
        }

        *old = qdisc_replace(sch, new, &cl->q);
        return 0;
}

static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
{
        struct cbq_class *cl = (struct cbq_class *)arg;

        return cl->q;
}

static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
{
        struct cbq_class *cl = (struct cbq_class *)arg;

        if (cl->q->q.qlen == 0)
                cbq_deactivate_class(cl);
}

static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
{
        struct cbq_sched_data *q = qdisc_priv(sch);
        struct cbq_class *cl = cbq_class_lookup(q, classid);

        if (cl) {
                cl->refcnt++;
                return (unsigned long)cl;
        }
        return 0;
}

static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
{
        struct cbq_sched_data *q = qdisc_priv(sch);

        WARN_ON(cl->filters);

        tcf_destroy_chain(&cl->filter_list);
        qdisc_destroy(cl->q);
        qdisc_put_rtab(cl->R_tab);
        gen_kill_estimator(&cl->bstats, &cl->rate_est);
        if (cl != &q->link)
                kfree(cl);
}

static void cbq_destroy(struct Qdisc *sch)
{
        struct cbq_sched_data *q = qdisc_priv(sch);
        struct hlist_node *next;
        struct cbq_class *cl;
        unsigned int h;

#ifdef CONFIG_NET_CLS_ACT
        q->rx_class = NULL;
#endif
        /*
         * Filters must be destroyed first because we don't destroy the
         * classes from root to leaves, which means that filters can still
         * be bound to classes which have been destroyed already. --TGR '04
         */
1428        for (h = 0; h < q->clhash.hashsize; h++) {
1429                hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode)
1430                        tcf_destroy_chain(&cl->filter_list);
1431        }
1432        for (h = 0; h < q->clhash.hashsize; h++) {
1433                hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
1434                                          common.hnode)
1435                        cbq_destroy_class(sch, cl);
1436        }
1437        qdisc_class_hash_destroy(&q->clhash);
1438}
1439
1440static void cbq_put(struct Qdisc *sch, unsigned long arg)
1441{
1442        struct cbq_class *cl = (struct cbq_class *)arg;
1443
1444        if (--cl->refcnt == 0) {
1445#ifdef CONFIG_NET_CLS_ACT
1446                spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
1447                struct cbq_sched_data *q = qdisc_priv(sch);
1448
1449                spin_lock_bh(root_lock);
1450                if (q->rx_class == cl)
1451                        q->rx_class = NULL;
1452                spin_unlock_bh(root_lock);
1453#endif
1454
1455                cbq_destroy_class(sch, cl);
1456        }
1457}
1458
1459static int
1460cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
1461                 unsigned long *arg)
1462{
1463        int err;
1464        struct cbq_sched_data *q = qdisc_priv(sch);
1465        struct cbq_class *cl = (struct cbq_class *)*arg;
1466        struct nlattr *opt = tca[TCA_OPTIONS];
1467        struct nlattr *tb[TCA_CBQ_MAX + 1];
1468        struct cbq_class *parent;
1469        struct qdisc_rate_table *rtab = NULL;
1470
1471        if (opt == NULL)
1472                return -EINVAL;
1473
1474        err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
1475        if (err < 0)
1476                return err;
1477
1478        if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE])
1479                return -EOPNOTSUPP;
1480
1481        if (cl) {
1482                /* Check parent */
1483                if (parentid) {
1484                        if (cl->tparent &&
1485                            cl->tparent->common.classid != parentid)
1486                                return -EINVAL;
1487                        if (!cl->tparent && parentid != TC_H_ROOT)
1488                                return -EINVAL;
1489                }
1490
1491                if (tb[TCA_CBQ_RATE]) {
1492                        rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
1493                                              tb[TCA_CBQ_RTAB]);
1494                        if (rtab == NULL)
1495                                return -EINVAL;
1496                }
1497
1498                if (tca[TCA_RATE]) {
1499                        err = gen_replace_estimator(&cl->bstats, NULL,
1500                                                    &cl->rate_est,
1501                                                    NULL,
1502                                                    qdisc_root_sleeping_running(sch),
1503                                                    tca[TCA_RATE]);
1504                        if (err) {
1505                                qdisc_put_rtab(rtab);
1506                                return err;
1507                        }
1508                }
1509
1510                /* Change class parameters */
1511                sch_tree_lock(sch);
1512
1513                if (cl->next_alive != NULL)
1514                        cbq_deactivate_class(cl);
1515
1516                if (rtab) {
1517                        qdisc_put_rtab(cl->R_tab);
1518                        cl->R_tab = rtab;
1519                }
1520
1521                if (tb[TCA_CBQ_LSSOPT])
1522                        cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1523
1524                if (tb[TCA_CBQ_WRROPT]) {
1525                        cbq_rmprio(q, cl);
1526                        cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1527                }
1528
1529                if (tb[TCA_CBQ_FOPT])
1530                        cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1531
1532                if (cl->q->q.qlen)
1533                        cbq_activate_class(cl);
1534
1535                sch_tree_unlock(sch);
1536
1537                return 0;
1538        }
1539
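            /*
             * No existing class: create a new one. The root class is set
             * up by cbq_init() and cannot be (re)created from here.
             */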
1540        if (parentid == TC_H_ROOT)
1541                return -EINVAL;
1542
1543        if (tb[TCA_CBQ_WRROPT] == NULL || tb[TCA_CBQ_RATE] == NULL ||
1544            tb[TCA_CBQ_LSSOPT] == NULL)
1545                return -EINVAL;
1546
1547        rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]);
1548        if (rtab == NULL)
1549                return -EINVAL;
1550
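            /*
             * Validate a user supplied classid against our major handle,
             * or auto-allocate a free minor above 0x8000.
             */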
1551        if (classid) {
1552                err = -EINVAL;
1553                if (TC_H_MAJ(classid ^ sch->handle) ||
1554                    cbq_class_lookup(q, classid))
1555                        goto failure;
1556        } else {
1557                int i;
1558                classid = TC_H_MAKE(sch->handle, 0x8000);
1559
1560                for (i = 0; i < 0x8000; i++) {
1561                        if (++q->hgenerator >= 0x8000)
1562                                q->hgenerator = 1;
1563                        if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1564                                break;
1565                }
1566                err = -ENOSR;
1567                if (i >= 0x8000)
1568                        goto failure;
1569                classid = classid|q->hgenerator;
1570        }
1571
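            /* The parent defaults to the root (link) class unless given. */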
1572        parent = &q->link;
1573        if (parentid) {
1574                parent = cbq_class_lookup(q, parentid);
1575                err = -EINVAL;
1576                if (parent == NULL)
1577                        goto failure;
1578        }
1579
1580        err = -ENOBUFS;
1581        cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1582        if (cl == NULL)
1583                goto failure;
1584
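            /* Attach an optional rate estimator before the class is linked. */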
1585        if (tca[TCA_RATE]) {
1586                err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
1587                                        NULL,
1588                                        qdisc_root_sleeping_running(sch),
1589                                        tca[TCA_RATE]);
1590                if (err) {
1591                        kfree(cl);
1592                        goto failure;
1593                }
1594        }
1595
1596        cl->R_tab = rtab;
1597        rtab = NULL;
1598        cl->refcnt = 1;
1599        cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
1600        if (!cl->q)
1601                cl->q = &noop_qdisc;
1602        cl->common.classid = classid;
1603        cl->tparent = parent;
1604        cl->qdisc = sch;
1605        cl->allot = parent->allot;
1606        cl->quantum = cl->allot;
1607        cl->weight = cl->R_tab->rate.rate;
1608
1609        sch_tree_lock(sch);
1610        cbq_link_class(cl);
1611        cl->borrow = cl->tparent;
1612        if (cl->tparent != &q->link)
1613                cl->share = cl->tparent;
1614        cbq_adjust_levels(parent);
1615        cl->minidle = -0x7FFFFFFF;
1616        cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1617        cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
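            /* Fall back to the root class defaults for unset LSS values. */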
1618        if (cl->ewma_log == 0)
1619                cl->ewma_log = q->link.ewma_log;
1620        if (cl->maxidle == 0)
1621                cl->maxidle = q->link.maxidle;
1622        if (cl->avpkt == 0)
1623                cl->avpkt = q->link.avpkt;
1624        if (tb[TCA_CBQ_FOPT])
1625                cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1626        sch_tree_unlock(sch);
1627
1628        qdisc_class_hash_grow(sch, &q->clhash);
1629
1630        *arg = (unsigned long)cl;
1631        return 0;
1632
1633failure:
1634        qdisc_put_rtab(rtab);
1635        return err;
1636}
1637
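    /*
     * Delete a class. Classes which still have filters or children
     * attached, and the built-in root class, cannot be removed.
     */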
1638static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1639{
1640        struct cbq_sched_data *q = qdisc_priv(sch);
1641        struct cbq_class *cl = (struct cbq_class *)arg;
1642        unsigned int qlen, backlog;
1643
1644        if (cl->filters || cl->children || cl == &q->link)
1645                return -EBUSY;
1646
1647        sch_tree_lock(sch);
1648
1649        qlen = cl->q->q.qlen;
1650        backlog = cl->q->qstats.backlog;
1651        qdisc_reset(cl->q);
1652        qdisc_tree_reduce_backlog(cl->q, qlen, backlog);
1653
1654        if (cl->next_alive)
1655                cbq_deactivate_class(cl);
1656
1657        if (q->tx_borrowed == cl)
1658                q->tx_borrowed = q->tx_class;
1659        if (q->tx_class == cl) {
1660                q->tx_class = NULL;
1661                q->tx_borrowed = NULL;
1662        }
1663#ifdef CONFIG_NET_CLS_ACT
1664        if (q->rx_class == cl)
1665                q->rx_class = NULL;
1666#endif
1667
1668        cbq_unlink_class(cl);
1669        cbq_adjust_levels(cl->tparent);
1670        cl->defmap = 0;
1671        cbq_sync_defmap(cl);
1672
1673        cbq_rmprio(q, cl);
1674        sch_tree_unlock(sch);
1675
1676        BUG_ON(--cl->refcnt == 0);
1677        /*
1678         * This should not happen: we still hold the cops->get() reference
1679         * taken by tc_ctl_tclass; the class is destroyed from cops->put().
1680         */
1681
1682        return 0;
1683}
1684
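    /*
     * Return the anchor of the class' filter chain; with no class given,
     * the filters of the root (link) class are meant.
     */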
1685static struct tcf_proto __rcu **cbq_find_tcf(struct Qdisc *sch,
1686                                             unsigned long arg)
1687{
1688        struct cbq_sched_data *q = qdisc_priv(sch);
1689        struct cbq_class *cl = (struct cbq_class *)arg;
1690
1691        if (cl == NULL)
1692                cl = &q->link;
1693
1694        return &cl->filter_list;
1695}
1696
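    /*
     * Bind a filter result to a class. A filter attached at 'parent'
     * may only point to a class strictly below it in the hierarchy,
     * hence the level check.
     */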
1697static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1698                                     u32 classid)
1699{
1700        struct cbq_sched_data *q = qdisc_priv(sch);
1701        struct cbq_class *p = (struct cbq_class *)parent;
1702        struct cbq_class *cl = cbq_class_lookup(q, classid);
1703
1704        if (cl) {
1705                if (p && p->level <= cl->level)
1706                        return 0;
1707                cl->filters++;
1708                return (unsigned long)cl;
1709        }
1710        return 0;
1711}
1712
1713static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
1714{
1715        struct cbq_class *cl = (struct cbq_class *)arg;
1716
1717        cl->filters--;
1718}
1719
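    /*
     * Iterate over all classes for a dump, honouring the generic
     * walker skip/count/stop protocol.
     */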
1720static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1721{
1722        struct cbq_sched_data *q = qdisc_priv(sch);
1723        struct cbq_class *cl;
1724        unsigned int h;
1725
1726        if (arg->stop)
1727                return;
1728
1729        for (h = 0; h < q->clhash.hashsize; h++) {
1730                hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1731                        if (arg->count < arg->skip) {
1732                                arg->count++;
1733                                continue;
1734                        }
1735                        if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1736                                arg->stop = 1;
1737                                return;
1738                        }
1739                        arg->count++;
1740                }
1741        }
1742}
1743
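    /* Hooks used by the generic classful qdisc infrastructure. */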
1744static const struct Qdisc_class_ops cbq_class_ops = {
1745        .graft          =       cbq_graft,
1746        .leaf           =       cbq_leaf,
1747        .qlen_notify    =       cbq_qlen_notify,
1748        .get            =       cbq_get,
1749        .put            =       cbq_put,
1750        .change         =       cbq_change_class,
1751        .delete         =       cbq_delete,
1752        .walk           =       cbq_walk,
1753        .tcf_chain      =       cbq_find_tcf,
1754        .bind_tcf       =       cbq_bind_filter,
1755        .unbind_tcf     =       cbq_unbind_filter,
1756        .dump           =       cbq_dump_class,
1757        .dump_stats     =       cbq_dump_class_stats,
1758};
1759
1760static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
1761        .next           =       NULL,
1762        .cl_ops         =       &cbq_class_ops,
1763        .id             =       "cbq",
1764        .priv_size      =       sizeof(struct cbq_sched_data),
1765        .enqueue        =       cbq_enqueue,
1766        .dequeue        =       cbq_dequeue,
1767        .peek           =       qdisc_peek_dequeued,
1768        .init           =       cbq_init,
1769        .reset          =       cbq_reset,
1770        .destroy        =       cbq_destroy,
1771        .change         =       NULL,
1772        .dump           =       cbq_dump,
1773        .dump_stats     =       cbq_dump_stats,
1774        .owner          =       THIS_MODULE,
1775};
1776
1777static int __init cbq_module_init(void)
1778{
1779        return register_qdisc(&cbq_qdisc_ops);
1780}
1781static void __exit cbq_module_exit(void)
1782{
1783        unregister_qdisc(&cbq_qdisc_ops);
1784}
1785module_init(cbq_module_init)
1786module_exit(cbq_module_exit)
1787MODULE_LICENSE("GPL");
1788
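    /*
     * Illustrative user space setup (iproute2 syntax; the interface
     * name and rates below are examples only):
     *
     *	tc qdisc add dev eth0 root handle 1: cbq bandwidth 100Mbit \
     *		avpkt 1000
     *	tc class add dev eth0 parent 1: classid 1:1 cbq bandwidth 100Mbit \
     *		rate 6Mbit allot 1514 prio 5 avpkt 1000 bounded
     */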