linux/net/sched/sch_cbq.c
   1/*
   2 * net/sched/sch_cbq.c  Class-Based Queueing discipline.
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10 *
  11 */
  12
  13#include <linux/module.h>
  14#include <linux/slab.h>
  15#include <linux/types.h>
  16#include <linux/kernel.h>
  17#include <linux/string.h>
  18#include <linux/errno.h>
  19#include <linux/skbuff.h>
  20#include <net/netlink.h>
  21#include <net/pkt_sched.h>
  22#include <net/pkt_cls.h>
  23
  24
  25/*      Class-Based Queueing (CBQ) algorithm.
  26        =======================================
  27
  28        Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
  29                 Management Models for Packet Networks",
  30                 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
  31
  32                 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
  33
  34                 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
  35                 Parameters", 1996
  36
  37                 [4] Sally Floyd and Michael Speer, "Experimental Results
  38                 for Class-Based Queueing", 1998, not published.
  39
  40        -----------------------------------------------------------------------
  41
  42        Algorithm skeleton was taken from NS simulator cbq.cc.
   43        If someone wants to check this code against the LBL version,
   44        they should take into account that ONLY the skeleton was borrowed;
   45        the implementation is different. In particular:
  46
  47        --- The WRR algorithm is different. Our version looks more
  48        reasonable (I hope) and works when quanta are allowed to be
  49        less than MTU, which is always the case when real time classes
   50        have small rates. Note that the statement in [3] is
   51        incomplete: delay may actually be estimated even if the class
   52        per-round allotment is less than MTU. Namely, if the per-round
  53        allotment is W*r_i, and r_1+...+r_k = r < 1
  54
  55        delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
  56
  57        In the worst case we have IntServ estimate with D = W*r+k*MTU
  58        and C = MTU*r. The proof (if correct at all) is trivial.
  59
  60
  61        --- It seems that cbq-2.0 is not very accurate. At least, I cannot
   62        interpret some places that look like wrong translations
   63        from NS. Anyone is advised to find these differences
   64        and explain to me why I am wrong 8).
  65
   66        --- Linux has no EOI event, so we cannot estimate the true class
   67        idle time. The workaround is to treat the next dequeue event
   68        as a sign that the previous packet has finished. This is wrong
   69        because of internal device queueing, but on a permanently loaded
   70        link it is true. Moreover, combined with the clock integrator,
   71        this scheme looks very close to an ideal solution.  */
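
     /* Illustrative configuration (a sketch only; the device, handles and
      * rates below are arbitrary examples, assuming the iproute2 "tc"
      * front end):
      *
      *   tc qdisc add dev eth0 root handle 1: cbq \
      *           bandwidth 100Mbit avpkt 1000 cell 8
      *   tc class add dev eth0 parent 1: classid 1:1 cbq \
      *           bandwidth 100Mbit rate 10Mbit allot 1514 \
      *           prio 5 avpkt 1000 bounded
      *   tc filter add dev eth0 parent 1: protocol ip u32 \
      *           match ip dport 80 0xffff flowid 1:1
      */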
  72
  73struct cbq_sched_data;
  74
  75
  76struct cbq_class {
  77        struct Qdisc_class_common common;
  78        struct cbq_class        *next_alive;    /* next class with backlog in this priority band */
  79
  80/* Parameters */
  81        unsigned char           priority;       /* class priority */
  82        unsigned char           priority2;      /* priority to be used after overlimit */
  83        unsigned char           ewma_log;       /* time constant for idle time calculation */
  84
  85        u32                     defmap;
  86
  87        /* Link-sharing scheduler parameters */
  88        long                    maxidle;        /* Class parameters: see below. */
  89        long                    offtime;
  90        long                    minidle;
  91        u32                     avpkt;
  92        struct qdisc_rate_table *R_tab;
  93
  94        /* General scheduler (WRR) parameters */
  95        long                    allot;
  96        long                    quantum;        /* Allotment per WRR round */
  97        long                    weight;         /* Relative allotment: see below */
  98
  99        struct Qdisc            *qdisc;         /* Ptr to CBQ discipline */
 100        struct cbq_class        *split;         /* Ptr to split node */
 101        struct cbq_class        *share;         /* Ptr to LS parent in the class tree */
 102        struct cbq_class        *tparent;       /* Ptr to tree parent in the class tree */
 103        struct cbq_class        *borrow;        /* NULL if class is bandwidth limited;
 104                                                   parent otherwise */
 105        struct cbq_class        *sibling;       /* Sibling chain */
 106        struct cbq_class        *children;      /* Pointer to children chain */
 107
 108        struct Qdisc            *q;             /* Elementary queueing discipline */
 109
 110
 111/* Variables */
 112        unsigned char           cpriority;      /* Effective priority */
 113        unsigned char           delayed;
 114        unsigned char           level;          /* level of the class in hierarchy:
 115                                                   0 for leaf classes, and maximal
 116                                                   level of children + 1 for nodes.
 117                                                 */
 118
 119        psched_time_t           last;           /* Last end of service */
 120        psched_time_t           undertime;
 121        long                    avgidle;
 122        long                    deficit;        /* Saved deficit for WRR */
 123        psched_time_t           penalized;
 124        struct gnet_stats_basic_packed bstats;
 125        struct gnet_stats_queue qstats;
 126        struct net_rate_estimator __rcu *rate_est;
 127        struct tc_cbq_xstats    xstats;
 128
 129        struct tcf_proto __rcu  *filter_list;
 130        struct tcf_block        *block;
 131
 132        int                     filters;
 133
 134        struct cbq_class        *defaults[TC_PRIO_MAX + 1];
 135};
 136
 137struct cbq_sched_data {
 138        struct Qdisc_class_hash clhash;                 /* Hash table of all classes */
 139        int                     nclasses[TC_CBQ_MAXPRIO + 1];
 140        unsigned int            quanta[TC_CBQ_MAXPRIO + 1];
 141
 142        struct cbq_class        link;
 143
 144        unsigned int            activemask;
 145        struct cbq_class        *active[TC_CBQ_MAXPRIO + 1];    /* List of all classes
 146                                                                   with backlog */
 147
 148#ifdef CONFIG_NET_CLS_ACT
 149        struct cbq_class        *rx_class;
 150#endif
 151        struct cbq_class        *tx_class;
 152        struct cbq_class        *tx_borrowed;
 153        int                     tx_len;
 154        psched_time_t           now;            /* Cached timestamp */
 155        unsigned int            pmask;
 156
 157        struct hrtimer          delay_timer;
 158        struct qdisc_watchdog   watchdog;       /* Watchdog timer,
 159                                                   started when CBQ has
 160                                                   backlog, but cannot
 161                                                   transmit just now */
 162        psched_tdiff_t          wd_expires;
 163        int                     toplevel;
 164        u32                     hgenerator;
 165};
 166
 167
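     /* L2T ("length to time"): transmission time, in psched ticks, of a
      * len-byte packet at the class rate, looked up in the class rate table.
      */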
 168#define L2T(cl, len)    qdisc_l2t((cl)->R_tab, len)
 169
 170static inline struct cbq_class *
 171cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
 172{
 173        struct Qdisc_class_common *clc;
 174
 175        clc = qdisc_class_find(&q->clhash, classid);
 176        if (clc == NULL)
 177                return NULL;
 178        return container_of(clc, struct cbq_class, common);
 179}
 180
 181#ifdef CONFIG_NET_CLS_ACT
 182
 183static struct cbq_class *
 184cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
 185{
 186        struct cbq_class *cl;
 187
 188        for (cl = this->tparent; cl; cl = cl->tparent) {
 189                struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
 190
 191                if (new != NULL && new != this)
 192                        return new;
 193        }
 194        return NULL;
 195}
 196
 197#endif
 198
 199/* Classify packet. The procedure is pretty complicated, but
 200 * it allows us to combine link sharing and priority scheduling
 201 * transparently.
 202 *
  203 * Namely, you can put link sharing rules (e.g. route based) at the root of CBQ,
 204 * so that it resolves to split nodes. Then packets are classified
 205 * by logical priority, or a more specific classifier may be attached
 206 * to the split node.
 207 */
 208
 209static struct cbq_class *
 210cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 211{
 212        struct cbq_sched_data *q = qdisc_priv(sch);
 213        struct cbq_class *head = &q->link;
 214        struct cbq_class **defmap;
 215        struct cbq_class *cl = NULL;
 216        u32 prio = skb->priority;
 217        struct tcf_proto *fl;
 218        struct tcf_result res;
 219
 220        /*
 221         *  Step 1. If skb->priority points to one of our classes, use it.
 222         */
 223        if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
 224            (cl = cbq_class_lookup(q, prio)) != NULL)
 225                return cl;
 226
 227        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 228        for (;;) {
 229                int result = 0;
 230                defmap = head->defaults;
 231
 232                fl = rcu_dereference_bh(head->filter_list);
 233                /*
 234                 * Step 2+n. Apply classifier.
 235                 */
 236                result = tcf_classify(skb, fl, &res, true);
 237                if (!fl || result < 0)
 238                        goto fallback;
 239
 240                cl = (void *)res.class;
 241                if (!cl) {
 242                        if (TC_H_MAJ(res.classid))
 243                                cl = cbq_class_lookup(q, res.classid);
 244                        else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
 245                                cl = defmap[TC_PRIO_BESTEFFORT];
 246
 247                        if (cl == NULL)
 248                                goto fallback;
 249                }
 250                if (cl->level >= head->level)
 251                        goto fallback;
 252#ifdef CONFIG_NET_CLS_ACT
 253                switch (result) {
 254                case TC_ACT_QUEUED:
 255                case TC_ACT_STOLEN:
 256                case TC_ACT_TRAP:
 257                        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
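                             /* fall through */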
 258                case TC_ACT_SHOT:
 259                        return NULL;
 260                case TC_ACT_RECLASSIFY:
 261                        return cbq_reclassify(skb, cl);
 262                }
 263#endif
 264                if (cl->level == 0)
 265                        return cl;
 266
 267                /*
 268                 * Step 3+n. If classifier selected a link sharing class,
 269                 *         apply agency specific classifier.
  270                 *         Repeat this procedure until we hit a leaf node.
 271                 */
 272                head = cl;
 273        }
 274
 275fallback:
 276        cl = head;
 277
 278        /*
 279         * Step 4. No success...
 280         */
 281        if (TC_H_MAJ(prio) == 0 &&
 282            !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
 283            !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
 284                return head;
 285
 286        return cl;
 287}
 288
 289/*
  290 * A packet has just been enqueued to an empty class.
  291 * cbq_activate_class adds the class to the tail of the circular active list
  292 * (threaded via next_alive) of its priority band; q->active[prio] is the tail.
 293 */
 294
 295static inline void cbq_activate_class(struct cbq_class *cl)
 296{
 297        struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 298        int prio = cl->cpriority;
 299        struct cbq_class *cl_tail;
 300
 301        cl_tail = q->active[prio];
 302        q->active[prio] = cl;
 303
 304        if (cl_tail != NULL) {
 305                cl->next_alive = cl_tail->next_alive;
 306                cl_tail->next_alive = cl;
 307        } else {
 308                cl->next_alive = cl;
 309                q->activemask |= (1<<prio);
 310        }
 311}
 312
 313/*
  314 * Unlink the class from the active chain.
  315 * Note that the same unlinking is also done directly in cbq_dequeue_prio()
  316 * during the round-robin procedure.
 317 */
 318
 319static void cbq_deactivate_class(struct cbq_class *this)
 320{
 321        struct cbq_sched_data *q = qdisc_priv(this->qdisc);
 322        int prio = this->cpriority;
 323        struct cbq_class *cl;
 324        struct cbq_class *cl_prev = q->active[prio];
 325
 326        do {
 327                cl = cl_prev->next_alive;
 328                if (cl == this) {
 329                        cl_prev->next_alive = cl->next_alive;
 330                        cl->next_alive = NULL;
 331
 332                        if (cl == q->active[prio]) {
 333                                q->active[prio] = cl_prev;
 334                                if (cl == q->active[prio]) {
 335                                        q->active[prio] = NULL;
 336                                        q->activemask &= ~(1<<prio);
 337                                        return;
 338                                }
 339                        }
 340                        return;
 341                }
 342        } while ((cl_prev = cl) != q->active[prio]);
 343}
 344
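     /* Called at enqueue time: walk the borrow chain of the target class and,
      * if a class below the current toplevel is not regulated (its undertime
      * has already passed), lower the toplevel cutoff to that class's level
      * so that borrowing through it stays permitted.
      */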
 345static void
 346cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
 347{
 348        int toplevel = q->toplevel;
 349
 350        if (toplevel > cl->level) {
 351                psched_time_t now = psched_get_time();
 352
 353                do {
 354                        if (cl->undertime < now) {
 355                                q->toplevel = cl->level;
 356                                return;
 357                        }
 358                } while ((cl = cl->borrow) != NULL && toplevel > cl->level);
 359        }
 360}
 361
 362static int
 363cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 364            struct sk_buff **to_free)
 365{
 366        struct cbq_sched_data *q = qdisc_priv(sch);
 367        int uninitialized_var(ret);
 368        struct cbq_class *cl = cbq_classify(skb, sch, &ret);
 369
 370#ifdef CONFIG_NET_CLS_ACT
 371        q->rx_class = cl;
 372#endif
 373        if (cl == NULL) {
 374                if (ret & __NET_XMIT_BYPASS)
 375                        qdisc_qstats_drop(sch);
 376                __qdisc_drop(skb, to_free);
 377                return ret;
 378        }
 379
 380        ret = qdisc_enqueue(skb, cl->q, to_free);
 381        if (ret == NET_XMIT_SUCCESS) {
 382                sch->q.qlen++;
 383                cbq_mark_toplevel(q, cl);
 384                if (!cl->next_alive)
 385                        cbq_activate_class(cl);
 386                return ret;
 387        }
 388
 389        if (net_xmit_drop_count(ret)) {
 390                qdisc_qstats_drop(sch);
 391                cbq_mark_toplevel(q, cl);
 392                cl->qstats.drops++;
 393        }
 394        return ret;
 395}
 396
 397/* Overlimit action: penalize leaf class by adding offtime */
 398static void cbq_overlimit(struct cbq_class *cl)
 399{
 400        struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 401        psched_tdiff_t delay = cl->undertime - q->now;
 402
 403        if (!cl->delayed) {
 404                delay += cl->offtime;
 405
 406                /*
  407                 * The class goes to sleep, so it will have no
  408                 * chance to work off avgidle. Let's forgive it 8)
  409                 *
  410                 * BTW cbq-2.0 has a bug in this
  411                 * place: apparently they forgot to shift it by cl->ewma_log.
 412                 */
 413                if (cl->avgidle < 0)
 414                        delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
 415                if (cl->avgidle < cl->minidle)
 416                        cl->avgidle = cl->minidle;
 417                if (delay <= 0)
 418                        delay = 1;
 419                cl->undertime = q->now + delay;
 420
 421                cl->xstats.overactions++;
 422                cl->delayed = 1;
 423        }
 424        if (q->wd_expires == 0 || q->wd_expires > delay)
 425                q->wd_expires = delay;
 426
 427        /* Dirty work! We must schedule wakeups based on
 428         * real available rate, rather than leaf rate,
 429         * which may be tiny (even zero).
 430         */
 431        if (q->toplevel == TC_CBQ_MAXLEVEL) {
 432                struct cbq_class *b;
 433                psched_tdiff_t base_delay = q->wd_expires;
 434
 435                for (b = cl->borrow; b; b = b->borrow) {
 436                        delay = b->undertime - q->now;
 437                        if (delay < base_delay) {
 438                                if (delay <= 0)
 439                                        delay = 1;
 440                                base_delay = delay;
 441                        }
 442                }
 443
 444                q->wd_expires = base_delay;
 445        }
 446}
 447
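     /* Scan the active ring of one priority band and re-activate, at their
      * normal priority, any classes whose penalty time has expired. The
      * return value is used by cbq_undelay() to decide whether to re-arm
      * the delay timer for this band.
      */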
 448static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
 449                                       psched_time_t now)
 450{
 451        struct cbq_class *cl;
 452        struct cbq_class *cl_prev = q->active[prio];
 453        psched_time_t sched = now;
 454
 455        if (cl_prev == NULL)
 456                return 0;
 457
 458        do {
 459                cl = cl_prev->next_alive;
 460                if (now - cl->penalized > 0) {
 461                        cl_prev->next_alive = cl->next_alive;
 462                        cl->next_alive = NULL;
 463                        cl->cpriority = cl->priority;
 464                        cl->delayed = 0;
 465                        cbq_activate_class(cl);
 466
 467                        if (cl == q->active[prio]) {
 468                                q->active[prio] = cl_prev;
 469                                if (cl == q->active[prio]) {
 470                                        q->active[prio] = NULL;
 471                                        return 0;
 472                                }
 473                        }
 474
 475                        cl = cl_prev->next_alive;
 476                } else if (sched - cl->penalized > 0)
 477                        sched = cl->penalized;
 478        } while ((cl_prev = cl) != q->active[prio]);
 479
 480        return sched - now;
 481}
 482
 483static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
 484{
 485        struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
 486                                                delay_timer);
 487        struct Qdisc *sch = q->watchdog.qdisc;
 488        psched_time_t now;
 489        psched_tdiff_t delay = 0;
 490        unsigned int pmask;
 491
 492        now = psched_get_time();
 493
 494        pmask = q->pmask;
 495        q->pmask = 0;
 496
 497        while (pmask) {
 498                int prio = ffz(~pmask);
 499                psched_tdiff_t tmp;
 500
 501                pmask &= ~(1<<prio);
 502
 503                tmp = cbq_undelay_prio(q, prio, now);
 504                if (tmp > 0) {
 505                        q->pmask |= 1<<prio;
 506                        if (tmp < delay || delay == 0)
 507                                delay = tmp;
 508                }
 509        }
 510
 511        if (delay) {
 512                ktime_t time;
 513
 514                time = 0;
 515                time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
 516                hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
 517        }
 518
 519        __netif_schedule(qdisc_root(sch));
 520        return HRTIMER_NORESTART;
 521}
 522
 523/*
  524 * This is a mission-critical procedure.
  525 *
  526 * We "regenerate" the toplevel cutoff if the transmitting class
  527 * has backlog and is not regulated. It is not part of the
  528 * original CBQ description, but looks more reasonable.
  529 * Probably it is wrong. This question needs further investigation.
 530 */
 531
 532static inline void
 533cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
 534                    struct cbq_class *borrowed)
 535{
 536        if (cl && q->toplevel >= borrowed->level) {
 537                if (cl->q->q.qlen > 1) {
 538                        do {
 539                                if (borrowed->undertime == PSCHED_PASTPERFECT) {
 540                                        q->toplevel = borrowed->level;
 541                                        return;
 542                                }
 543                        } while ((borrowed = borrowed->borrow) != NULL);
 544                }
 545#if 0
  546        /* It is not necessary now. Uncommenting it
  547           would save CPU cycles, but decrease fairness.
 548         */
 549                q->toplevel = TC_CBQ_MAXLEVEL;
 550#endif
 551        }
 552}
 553
 554static void
 555cbq_update(struct cbq_sched_data *q)
 556{
 557        struct cbq_class *this = q->tx_class;
 558        struct cbq_class *cl = this;
 559        int len = q->tx_len;
 560        psched_time_t now;
 561
 562        q->tx_class = NULL;
 563        /* Time integrator. We calculate EOS time
 564         * by adding expected packet transmission time.
 565         */
 566        now = q->now + L2T(&q->link, len);
 567
 568        for ( ; cl; cl = cl->share) {
 569                long avgidle = cl->avgidle;
 570                long idle;
 571
 572                cl->bstats.packets++;
 573                cl->bstats.bytes += len;
 574
 575                /*
 576                 * (now - last) is total time between packet right edges.
 577                 * (last_pktlen/rate) is "virtual" busy time, so that
 578                 *
 579                 *      idle = (now - last) - last_pktlen/rate
 580                 */
 581
 582                idle = now - cl->last;
 583                if ((unsigned long)idle > 128*1024*1024) {
 584                        avgidle = cl->maxidle;
 585                } else {
 586                        idle -= L2T(cl, len);
 587
 588                /* true_avgidle := (1-W)*true_avgidle + W*idle,
 589                 * where W=2^{-ewma_log}. But cl->avgidle is scaled:
 590                 * cl->avgidle == true_avgidle/W,
 591                 * hence:
 592                 */
 593                        avgidle += idle - (avgidle>>cl->ewma_log);
 594                }
 595
 596                if (avgidle <= 0) {
 597                        /* Overlimit or at-limit */
 598
 599                        if (avgidle < cl->minidle)
 600                                avgidle = cl->minidle;
 601
 602                        cl->avgidle = avgidle;
 603
  604                        /* Calculate the expected time when this class
  605                         * will be allowed to send again.
  606                         * It will occur when:
  607                         * (1-W)*true_avgidle + W*idle = 0, i.e.
  608                         * idle = (1/W - 1)*(-true_avgidle)
  609                         * or, since cl->avgidle == true_avgidle/W,
  610                         * idle = (1 - W)*(-cl->avgidle);
 611                         */
 612                        idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
 613
 614                        /*
 615                         * That is not all.
  616                         * To maintain the rate allocated to the class,
  617                         * we add to undertime the virtual clock time
  618                         * needed to complete the transmitted packet.
  619                         * (len/phys_bandwidth has already elapsed by
  620                         * the time cbq_update runs.)
 621                         */
 622
 623                        idle -= L2T(&q->link, len);
 624                        idle += L2T(cl, len);
 625
 626                        cl->undertime = now + idle;
 627                } else {
 628                        /* Underlimit */
 629
 630                        cl->undertime = PSCHED_PASTPERFECT;
 631                        if (avgidle > cl->maxidle)
 632                                cl->avgidle = cl->maxidle;
 633                        else
 634                                cl->avgidle = avgidle;
 635                }
 636                if ((s64)(now - cl->last) > 0)
 637                        cl->last = now;
 638        }
 639
 640        cbq_update_toplevel(q, this, q->tx_borrowed);
 641}
 642
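     /* Return the first class on the borrow chain that may send now (i.e. is
      * underlimit), or NULL if the chain is exhausted (in which case the
      * overlimit action is applied to the original class) or crosses the
      * toplevel cutoff.
      */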
 643static inline struct cbq_class *
 644cbq_under_limit(struct cbq_class *cl)
 645{
 646        struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 647        struct cbq_class *this_cl = cl;
 648
 649        if (cl->tparent == NULL)
 650                return cl;
 651
 652        if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
 653                cl->delayed = 0;
 654                return cl;
 655        }
 656
 657        do {
  658                /* This is a very suspicious place. Currently the overlimit
  659                 * action is generated for non-bounded classes
  660                 * only if the link is completely congested.
  661                 * Though this agrees with the ancestor-only paradigm,
  662                 * it looks very stupid. In particular,
  663                 * it means that this chunk of code will either
  664                 * never be called or result in strong amplification
  665                 * of burstiness. Dangerous, silly, and, however,
  666                 * no other solution exists.
 667                 */
 668                cl = cl->borrow;
 669                if (!cl) {
 670                        this_cl->qstats.overlimits++;
 671                        cbq_overlimit(this_cl);
 672                        return NULL;
 673                }
 674                if (cl->level > q->toplevel)
 675                        return NULL;
 676        } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
 677
 678        cl->delayed = 0;
 679        return cl;
 680}
 681
 682static inline struct sk_buff *
 683cbq_dequeue_prio(struct Qdisc *sch, int prio)
 684{
 685        struct cbq_sched_data *q = qdisc_priv(sch);
 686        struct cbq_class *cl_tail, *cl_prev, *cl;
 687        struct sk_buff *skb;
 688        int deficit;
 689
 690        cl_tail = cl_prev = q->active[prio];
 691        cl = cl_prev->next_alive;
 692
 693        do {
 694                deficit = 0;
 695
 696                /* Start round */
 697                do {
 698                        struct cbq_class *borrow = cl;
 699
 700                        if (cl->q->q.qlen &&
 701                            (borrow = cbq_under_limit(cl)) == NULL)
 702                                goto skip_class;
 703
 704                        if (cl->deficit <= 0) {
 705                                /* Class exhausted its allotment per
 706                                 * this round. Switch to the next one.
 707                                 */
 708                                deficit = 1;
 709                                cl->deficit += cl->quantum;
 710                                goto next_class;
 711                        }
 712
 713                        skb = cl->q->dequeue(cl->q);
 714
 715                        /* Class did not give us any skb :-(
  716                         * This can occur even if cl->q->q.qlen != 0,
  717                         * e.g. if cl->q == "tbf"
 718                         */
 719                        if (skb == NULL)
 720                                goto skip_class;
 721
 722                        cl->deficit -= qdisc_pkt_len(skb);
 723                        q->tx_class = cl;
 724                        q->tx_borrowed = borrow;
 725                        if (borrow != cl) {
 726#ifndef CBQ_XSTATS_BORROWS_BYTES
 727                                borrow->xstats.borrows++;
 728                                cl->xstats.borrows++;
 729#else
 730                                borrow->xstats.borrows += qdisc_pkt_len(skb);
 731                                cl->xstats.borrows += qdisc_pkt_len(skb);
 732#endif
 733                        }
 734                        q->tx_len = qdisc_pkt_len(skb);
 735
 736                        if (cl->deficit <= 0) {
 737                                q->active[prio] = cl;
 738                                cl = cl->next_alive;
 739                                cl->deficit += cl->quantum;
 740                        }
 741                        return skb;
 742
 743skip_class:
 744                        if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
 745                                /* Class is empty or penalized.
 746                                 * Unlink it from active chain.
 747                                 */
 748                                cl_prev->next_alive = cl->next_alive;
 749                                cl->next_alive = NULL;
 750
 751                                /* Did cl_tail point to it? */
 752                                if (cl == cl_tail) {
 753                                        /* Repair it! */
 754                                        cl_tail = cl_prev;
 755
 756                                        /* Was it the last class in this band? */
 757                                        if (cl == cl_tail) {
 758                                                /* Kill the band! */
 759                                                q->active[prio] = NULL;
 760                                                q->activemask &= ~(1<<prio);
 761                                                if (cl->q->q.qlen)
 762                                                        cbq_activate_class(cl);
 763                                                return NULL;
 764                                        }
 765
 766                                        q->active[prio] = cl_tail;
 767                                }
 768                                if (cl->q->q.qlen)
 769                                        cbq_activate_class(cl);
 770
 771                                cl = cl_prev;
 772                        }
 773
 774next_class:
 775                        cl_prev = cl;
 776                        cl = cl->next_alive;
 777                } while (cl_prev != cl_tail);
 778        } while (deficit);
 779
 780        q->active[prio] = cl_prev;
 781
 782        return NULL;
 783}
 784
 785static inline struct sk_buff *
 786cbq_dequeue_1(struct Qdisc *sch)
 787{
 788        struct cbq_sched_data *q = qdisc_priv(sch);
 789        struct sk_buff *skb;
 790        unsigned int activemask;
 791
 792        activemask = q->activemask & 0xFF;
 793        while (activemask) {
 794                int prio = ffz(~activemask);
 795                activemask &= ~(1<<prio);
 796                skb = cbq_dequeue_prio(sch, prio);
 797                if (skb)
 798                        return skb;
 799        }
 800        return NULL;
 801}
 802
 803static struct sk_buff *
 804cbq_dequeue(struct Qdisc *sch)
 805{
 806        struct sk_buff *skb;
 807        struct cbq_sched_data *q = qdisc_priv(sch);
 808        psched_time_t now;
 809
 810        now = psched_get_time();
 811
 812        if (q->tx_class)
 813                cbq_update(q);
 814
 815        q->now = now;
 816
 817        for (;;) {
 818                q->wd_expires = 0;
 819
 820                skb = cbq_dequeue_1(sch);
 821                if (skb) {
 822                        qdisc_bstats_update(sch, skb);
 823                        sch->q.qlen--;
 824                        return skb;
 825                }
 826
 827                /* All the classes are overlimit.
 828                 *
  829                 * This is possible if:
  830                 *
  831                 * 1. The scheduler is empty.
  832                 * 2. The toplevel cutoff inhibited borrowing.
  833                 * 3. The root class is overlimit.
  834                 *
  835                 * Reset the 2nd and 3rd conditions and retry.
  836                 *
  837                 * Note that NS and cbq-2.0 are buggy: picking
  838                 * an arbitrary class is appropriate for ancestor-only
  839                 * sharing, but not for the toplevel algorithm.
  840                 *
  841                 * Our version is better but slower, because it requires
  842                 * two passes; that is unavoidable with top-level sharing.
 843                 */
 844
 845                if (q->toplevel == TC_CBQ_MAXLEVEL &&
 846                    q->link.undertime == PSCHED_PASTPERFECT)
 847                        break;
 848
 849                q->toplevel = TC_CBQ_MAXLEVEL;
 850                q->link.undertime = PSCHED_PASTPERFECT;
 851        }
 852
  853        /* No packets in the scheduler, or nobody wants to give them to us :-(
  854         * Sigh... Start the watchdog timer in the latter case.
 855         */
 856
 857        if (sch->q.qlen) {
 858                qdisc_qstats_overlimit(sch);
 859                if (q->wd_expires)
 860                        qdisc_watchdog_schedule(&q->watchdog,
 861                                                now + q->wd_expires);
 862        }
 863        return NULL;
 864}
 865
  866/* CBQ class maintenance routines */
 867
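     /* Recompute levels along the tparent chain: each node's level becomes
      * one more than the maximum level of its children.
      */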
 868static void cbq_adjust_levels(struct cbq_class *this)
 869{
 870        if (this == NULL)
 871                return;
 872
 873        do {
 874                int level = 0;
 875                struct cbq_class *cl;
 876
 877                cl = this->children;
 878                if (cl) {
 879                        do {
 880                                if (cl->level > level)
 881                                        level = cl->level;
 882                        } while ((cl = cl->sibling) != this->children);
 883                }
 884                this->level = level + 1;
 885        } while ((this = this->tparent) != NULL);
 886}
 887
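     /* Recompute the WRR quantum of every class in the given priority band:
      * quantum = allot scaled by the class weight relative to the band's mean
      * weight (q->quanta[prio] is the sum of weights, q->nclasses[prio] the
      * number of classes). Out-of-range results are repaired to mtu/2 + 1.
      */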
 888static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
 889{
 890        struct cbq_class *cl;
 891        unsigned int h;
 892
 893        if (q->quanta[prio] == 0)
 894                return;
 895
 896        for (h = 0; h < q->clhash.hashsize; h++) {
 897                hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
  898                        /* BUGGGG... Beware! This expression suffers from
  899                         * arithmetic overflow!
 900                         */
 901                        if (cl->priority == prio) {
 902                                cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
 903                                        q->quanta[prio];
 904                        }
 905                        if (cl->quantum <= 0 ||
 906                            cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
 907                                pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
 908                                        cl->common.classid, cl->quantum);
 909                                cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
 910                        }
 911                }
 912        }
 913}
 914
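     /* Rebuild the default-class table of this class's split node: for every
      * priority slot left empty, install the lowest-level class under the same
      * split whose defmap covers that priority.
      */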
 915static void cbq_sync_defmap(struct cbq_class *cl)
 916{
 917        struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 918        struct cbq_class *split = cl->split;
 919        unsigned int h;
 920        int i;
 921
 922        if (split == NULL)
 923                return;
 924
 925        for (i = 0; i <= TC_PRIO_MAX; i++) {
 926                if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
 927                        split->defaults[i] = NULL;
 928        }
 929
 930        for (i = 0; i <= TC_PRIO_MAX; i++) {
 931                int level = split->level;
 932
 933                if (split->defaults[i])
 934                        continue;
 935
 936                for (h = 0; h < q->clhash.hashsize; h++) {
 937                        struct cbq_class *c;
 938
 939                        hlist_for_each_entry(c, &q->clhash.hash[h],
 940                                             common.hnode) {
 941                                if (c->split == split && c->level < level &&
 942                                    c->defmap & (1<<i)) {
 943                                        split->defaults[i] = c;
 944                                        level = c->level;
 945                                }
 946                        }
 947                }
 948        }
 949}
 950
 951static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
 952{
 953        struct cbq_class *split = NULL;
 954
 955        if (splitid == 0) {
 956                split = cl->split;
 957                if (!split)
 958                        return;
 959                splitid = split->common.classid;
 960        }
 961
 962        if (split == NULL || split->common.classid != splitid) {
 963                for (split = cl->tparent; split; split = split->tparent)
 964                        if (split->common.classid == splitid)
 965                                break;
 966        }
 967
 968        if (split == NULL)
 969                return;
 970
 971        if (cl->split != split) {
 972                cl->defmap = 0;
 973                cbq_sync_defmap(cl);
 974                cl->split = split;
 975                cl->defmap = def & mask;
 976        } else
 977                cl->defmap = (cl->defmap & ~mask) | (def & mask);
 978
 979        cbq_sync_defmap(cl);
 980}
 981
 982static void cbq_unlink_class(struct cbq_class *this)
 983{
 984        struct cbq_class *cl, **clp;
 985        struct cbq_sched_data *q = qdisc_priv(this->qdisc);
 986
 987        qdisc_class_hash_remove(&q->clhash, &this->common);
 988
 989        if (this->tparent) {
 990                clp = &this->sibling;
 991                cl = *clp;
 992                do {
 993                        if (cl == this) {
 994                                *clp = cl->sibling;
 995                                break;
 996                        }
 997                        clp = &cl->sibling;
 998                } while ((cl = *clp) != this->sibling);
 999
1000                if (this->tparent->children == this) {
1001                        this->tparent->children = this->sibling;
1002                        if (this->sibling == this)
1003                                this->tparent->children = NULL;
1004                }
1005        } else {
1006                WARN_ON(this->sibling != this);
1007        }
1008}
1009
1010static void cbq_link_class(struct cbq_class *this)
1011{
1012        struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1013        struct cbq_class *parent = this->tparent;
1014
1015        this->sibling = this;
1016        qdisc_class_hash_insert(&q->clhash, &this->common);
1017
1018        if (parent == NULL)
1019                return;
1020
1021        if (parent->children == NULL) {
1022                parent->children = this;
1023        } else {
1024                this->sibling = parent->children->sibling;
1025                parent->children->sibling = this;
1026        }
1027}
1028
1029static void
1030cbq_reset(struct Qdisc *sch)
1031{
1032        struct cbq_sched_data *q = qdisc_priv(sch);
1033        struct cbq_class *cl;
1034        int prio;
1035        unsigned int h;
1036
1037        q->activemask = 0;
1038        q->pmask = 0;
1039        q->tx_class = NULL;
1040        q->tx_borrowed = NULL;
1041        qdisc_watchdog_cancel(&q->watchdog);
1042        hrtimer_cancel(&q->delay_timer);
1043        q->toplevel = TC_CBQ_MAXLEVEL;
1044        q->now = psched_get_time();
1045
1046        for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1047                q->active[prio] = NULL;
1048
1049        for (h = 0; h < q->clhash.hashsize; h++) {
1050                hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1051                        qdisc_reset(cl->q);
1052
1053                        cl->next_alive = NULL;
1054                        cl->undertime = PSCHED_PASTPERFECT;
1055                        cl->avgidle = cl->maxidle;
1056                        cl->deficit = cl->quantum;
1057                        cl->cpriority = cl->priority;
1058                }
1059        }
1060        sch->q.qlen = 0;
1061}
1062
1063
1064static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1065{
1066        if (lss->change & TCF_CBQ_LSS_FLAGS) {
1067                cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1068                cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1069        }
1070        if (lss->change & TCF_CBQ_LSS_EWMA)
1071                cl->ewma_log = lss->ewma_log;
1072        if (lss->change & TCF_CBQ_LSS_AVPKT)
1073                cl->avpkt = lss->avpkt;
1074        if (lss->change & TCF_CBQ_LSS_MINIDLE)
1075                cl->minidle = -(long)lss->minidle;
1076        if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
1077                cl->maxidle = lss->maxidle;
1078                cl->avgidle = lss->maxidle;
1079        }
1080        if (lss->change & TCF_CBQ_LSS_OFFTIME)
1081                cl->offtime = lss->offtime;
1082        return 0;
1083}
1084
1085static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1086{
1087        q->nclasses[cl->priority]--;
1088        q->quanta[cl->priority] -= cl->weight;
1089        cbq_normalize_quanta(q, cl->priority);
1090}
1091
1092static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1093{
1094        q->nclasses[cl->priority]++;
1095        q->quanta[cl->priority] += cl->weight;
1096        cbq_normalize_quanta(q, cl->priority);
1097}
1098
1099static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1100{
1101        struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1102
1103        if (wrr->allot)
1104                cl->allot = wrr->allot;
1105        if (wrr->weight)
1106                cl->weight = wrr->weight;
1107        if (wrr->priority) {
1108                cl->priority = wrr->priority - 1;
1109                cl->cpriority = cl->priority;
1110                if (cl->priority >= cl->priority2)
1111                        cl->priority2 = TC_CBQ_MAXPRIO - 1;
1112        }
1113
1114        cbq_addprio(q, cl);
1115        return 0;
1116}
1117
1118static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1119{
1120        cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1121        return 0;
1122}
1123
1124static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
1125        [TCA_CBQ_LSSOPT]        = { .len = sizeof(struct tc_cbq_lssopt) },
1126        [TCA_CBQ_WRROPT]        = { .len = sizeof(struct tc_cbq_wrropt) },
1127        [TCA_CBQ_FOPT]          = { .len = sizeof(struct tc_cbq_fopt) },
1128        [TCA_CBQ_OVL_STRATEGY]  = { .len = sizeof(struct tc_cbq_ovl) },
1129        [TCA_CBQ_RATE]          = { .len = sizeof(struct tc_ratespec) },
1130        [TCA_CBQ_RTAB]          = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1131        [TCA_CBQ_POLICE]        = { .len = sizeof(struct tc_cbq_police) },
1132};
1133
1134static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
1135{
1136        struct cbq_sched_data *q = qdisc_priv(sch);
1137        struct nlattr *tb[TCA_CBQ_MAX + 1];
1138        struct tc_ratespec *r;
1139        int err;
1140
1141        qdisc_watchdog_init(&q->watchdog, sch);
1142        hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1143        q->delay_timer.function = cbq_undelay;
1144
1145        if (!opt)
1146                return -EINVAL;
1147
1148        err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy, NULL);
1149        if (err < 0)
1150                return err;
1151
1152        if (tb[TCA_CBQ_RTAB] == NULL || tb[TCA_CBQ_RATE] == NULL)
1153                return -EINVAL;
1154
1155        r = nla_data(tb[TCA_CBQ_RATE]);
1156
1157        if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL)
1158                return -EINVAL;
1159
1160        err = qdisc_class_hash_init(&q->clhash);
1161        if (err < 0)
1162                goto put_rtab;
1163
1164        q->link.sibling = &q->link;
1165        q->link.common.classid = sch->handle;
1166        q->link.qdisc = sch;
1167        q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1168                                      sch->handle);
1169        if (!q->link.q)
1170                q->link.q = &noop_qdisc;
1171        else
1172                qdisc_hash_add(q->link.q, true);
1173
1174        q->link.priority = TC_CBQ_MAXPRIO - 1;
1175        q->link.priority2 = TC_CBQ_MAXPRIO - 1;
1176        q->link.cpriority = TC_CBQ_MAXPRIO - 1;
1177        q->link.allot = psched_mtu(qdisc_dev(sch));
1178        q->link.quantum = q->link.allot;
1179        q->link.weight = q->link.R_tab->rate.rate;
1180
1181        q->link.ewma_log = TC_CBQ_DEF_EWMA;
1182        q->link.avpkt = q->link.allot/2;
1183        q->link.minidle = -0x7FFFFFFF;
1184
1185        q->toplevel = TC_CBQ_MAXLEVEL;
1186        q->now = psched_get_time();
1187
1188        cbq_link_class(&q->link);
1189
1190        if (tb[TCA_CBQ_LSSOPT])
1191                cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
1192
1193        cbq_addprio(q, &q->link);
1194        return 0;
1195
1196put_rtab:
1197        qdisc_put_rtab(q->link.R_tab);
1198        return err;
1199}
1200
1201static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1202{
1203        unsigned char *b = skb_tail_pointer(skb);
1204
1205        if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
1206                goto nla_put_failure;
1207        return skb->len;
1208
1209nla_put_failure:
1210        nlmsg_trim(skb, b);
1211        return -1;
1212}
1213
1214static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1215{
1216        unsigned char *b = skb_tail_pointer(skb);
1217        struct tc_cbq_lssopt opt;
1218
1219        opt.flags = 0;
1220        if (cl->borrow == NULL)
1221                opt.flags |= TCF_CBQ_LSS_BOUNDED;
1222        if (cl->share == NULL)
1223                opt.flags |= TCF_CBQ_LSS_ISOLATED;
1224        opt.ewma_log = cl->ewma_log;
1225        opt.level = cl->level;
1226        opt.avpkt = cl->avpkt;
1227        opt.maxidle = cl->maxidle;
1228        opt.minidle = (u32)(-cl->minidle);
1229        opt.offtime = cl->offtime;
1230        opt.change = ~0;
1231        if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
1232                goto nla_put_failure;
1233        return skb->len;
1234
1235nla_put_failure:
1236        nlmsg_trim(skb, b);
1237        return -1;
1238}
1239
1240static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1241{
1242        unsigned char *b = skb_tail_pointer(skb);
1243        struct tc_cbq_wrropt opt;
1244
1245        memset(&opt, 0, sizeof(opt));
1246        opt.flags = 0;
1247        opt.allot = cl->allot;
1248        opt.priority = cl->priority + 1;
1249        opt.cpriority = cl->cpriority + 1;
1250        opt.weight = cl->weight;
1251        if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
1252                goto nla_put_failure;
1253        return skb->len;
1254
1255nla_put_failure:
1256        nlmsg_trim(skb, b);
1257        return -1;
1258}
1259
1260static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1261{
1262        unsigned char *b = skb_tail_pointer(skb);
1263        struct tc_cbq_fopt opt;
1264
1265        if (cl->split || cl->defmap) {
1266                opt.split = cl->split ? cl->split->common.classid : 0;
1267                opt.defmap = cl->defmap;
1268                opt.defchange = ~0;
1269                if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
1270                        goto nla_put_failure;
1271        }
1272        return skb->len;
1273
1274nla_put_failure:
1275        nlmsg_trim(skb, b);
1276        return -1;
1277}
1278
1279static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1280{
1281        if (cbq_dump_lss(skb, cl) < 0 ||
1282            cbq_dump_rate(skb, cl) < 0 ||
1283            cbq_dump_wrr(skb, cl) < 0 ||
1284            cbq_dump_fopt(skb, cl) < 0)
1285                return -1;
1286        return 0;
1287}
1288
1289static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1290{
1291        struct cbq_sched_data *q = qdisc_priv(sch);
1292        struct nlattr *nest;
1293
1294        nest = nla_nest_start(skb, TCA_OPTIONS);
1295        if (nest == NULL)
1296                goto nla_put_failure;
1297        if (cbq_dump_attr(skb, &q->link) < 0)
1298                goto nla_put_failure;
1299        return nla_nest_end(skb, nest);
1300
1301nla_put_failure:
1302        nla_nest_cancel(skb, nest);
1303        return -1;
1304}
1305
1306static int
1307cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1308{
1309        struct cbq_sched_data *q = qdisc_priv(sch);
1310
1311        q->link.xstats.avgidle = q->link.avgidle;
1312        return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1313}
1314
1315static int
1316cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1317               struct sk_buff *skb, struct tcmsg *tcm)
1318{
1319        struct cbq_class *cl = (struct cbq_class *)arg;
1320        struct nlattr *nest;
1321
1322        if (cl->tparent)
1323                tcm->tcm_parent = cl->tparent->common.classid;
1324        else
1325                tcm->tcm_parent = TC_H_ROOT;
1326        tcm->tcm_handle = cl->common.classid;
1327        tcm->tcm_info = cl->q->handle;
1328
1329        nest = nla_nest_start(skb, TCA_OPTIONS);
1330        if (nest == NULL)
1331                goto nla_put_failure;
1332        if (cbq_dump_attr(skb, cl) < 0)
1333                goto nla_put_failure;
1334        return nla_nest_end(skb, nest);
1335
1336nla_put_failure:
1337        nla_nest_cancel(skb, nest);
1338        return -1;
1339}
1340
1341static int
1342cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1343        struct gnet_dump *d)
1344{
1345        struct cbq_sched_data *q = qdisc_priv(sch);
1346        struct cbq_class *cl = (struct cbq_class *)arg;
1347
1348        cl->xstats.avgidle = cl->avgidle;
1349        cl->xstats.undertime = 0;
1350
1351        if (cl->undertime != PSCHED_PASTPERFECT)
1352                cl->xstats.undertime = cl->undertime - q->now;
1353
1354        if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
1355                                  d, NULL, &cl->bstats) < 0 ||
1356            gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1357            gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0)
1358                return -1;
1359
1360        return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1361}
1362
1363static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1364                     struct Qdisc **old)
1365{
1366        struct cbq_class *cl = (struct cbq_class *)arg;
1367
1368        if (new == NULL) {
1369                new = qdisc_create_dflt(sch->dev_queue,
1370                                        &pfifo_qdisc_ops, cl->common.classid);
1371                if (new == NULL)
1372                        return -ENOBUFS;
1373        }
1374
1375        *old = qdisc_replace(sch, new, &cl->q);
1376        return 0;
1377}
1378
1379static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
1380{
1381        struct cbq_class *cl = (struct cbq_class *)arg;
1382
1383        return cl->q;
1384}
1385
1386static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1387{
1388        struct cbq_class *cl = (struct cbq_class *)arg;
1389
1390        cbq_deactivate_class(cl);
1391}
1392
1393static unsigned long cbq_find(struct Qdisc *sch, u32 classid)
1394{
1395        struct cbq_sched_data *q = qdisc_priv(sch);
1396
1397        return (unsigned long)cbq_class_lookup(q, classid);
1398}
1399
1400static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1401{
1402        struct cbq_sched_data *q = qdisc_priv(sch);
1403
1404        WARN_ON(cl->filters);
1405
1406        tcf_block_put(cl->block);
1407        qdisc_destroy(cl->q);
1408        qdisc_put_rtab(cl->R_tab);
1409        gen_kill_estimator(&cl->rate_est);
1410        if (cl != &q->link)
1411                kfree(cl);
1412}
1413
1414static void cbq_destroy(struct Qdisc *sch)
1415{
1416        struct cbq_sched_data *q = qdisc_priv(sch);
1417        struct hlist_node *next;
1418        struct cbq_class *cl;
1419        unsigned int h;
1420
1421#ifdef CONFIG_NET_CLS_ACT
1422        q->rx_class = NULL;
1423#endif
1424        /*
1425         * Filters must be destroyed first because we don't destroy the
 1426         * classes from root to leaves, which means that filters can still
 1427         * be bound to classes that have already been destroyed. --TGR '04
1428         */
1429        for (h = 0; h < q->clhash.hashsize; h++) {
1430                hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1431                        tcf_block_put(cl->block);
1432                        cl->block = NULL;
1433                }
1434        }
1435        for (h = 0; h < q->clhash.hashsize; h++) {
1436                hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
1437                                          common.hnode)
1438                        cbq_destroy_class(sch, cl);
1439        }
1440        qdisc_class_hash_destroy(&q->clhash);
1441}
1442
1443static int
1444cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
1445                 unsigned long *arg)
1446{
1447        int err;
1448        struct cbq_sched_data *q = qdisc_priv(sch);
1449        struct cbq_class *cl = (struct cbq_class *)*arg;
1450        struct nlattr *opt = tca[TCA_OPTIONS];
1451        struct nlattr *tb[TCA_CBQ_MAX + 1];
1452        struct cbq_class *parent;
1453        struct qdisc_rate_table *rtab = NULL;
1454
1455        if (opt == NULL)
1456                return -EINVAL;
1457
1458        err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy, NULL);
1459        if (err < 0)
1460                return err;
1461
1462        if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE])
1463                return -EOPNOTSUPP;
1464
1465        if (cl) {
1466                /* Check parent */
1467                if (parentid) {
1468                        if (cl->tparent &&
1469                            cl->tparent->common.classid != parentid)
1470                                return -EINVAL;
1471                        if (!cl->tparent && parentid != TC_H_ROOT)
1472                                return -EINVAL;
1473                }
1474
1475                if (tb[TCA_CBQ_RATE]) {
1476                        rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
1477                                              tb[TCA_CBQ_RTAB]);
1478                        if (rtab == NULL)
1479                                return -EINVAL;
1480                }
1481
1482                if (tca[TCA_RATE]) {
1483                        err = gen_replace_estimator(&cl->bstats, NULL,
1484                                                    &cl->rate_est,
1485                                                    NULL,
1486                                                    qdisc_root_sleeping_running(sch),
1487                                                    tca[TCA_RATE]);
1488                        if (err) {
1489                                qdisc_put_rtab(rtab);
1490                                return err;
1491                        }
1492                }
1493
1494                /* Change class parameters */
1495                sch_tree_lock(sch);
1496
1497                if (cl->next_alive != NULL)
1498                        cbq_deactivate_class(cl);
1499
1500                if (rtab) {
1501                        qdisc_put_rtab(cl->R_tab);
1502                        cl->R_tab = rtab;
1503                }
1504
1505                if (tb[TCA_CBQ_LSSOPT])
1506                        cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1507
1508                if (tb[TCA_CBQ_WRROPT]) {
1509                        cbq_rmprio(q, cl);
1510                        cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1511                }
1512
1513                if (tb[TCA_CBQ_FOPT])
1514                        cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1515
1516                if (cl->q->q.qlen)
1517                        cbq_activate_class(cl);
1518
1519                sch_tree_unlock(sch);
1520
1521                return 0;
1522        }
1523
1524        if (parentid == TC_H_ROOT)
1525                return -EINVAL;
1526
1527        if (tb[TCA_CBQ_WRROPT] == NULL || tb[TCA_CBQ_RATE] == NULL ||
1528            tb[TCA_CBQ_LSSOPT] == NULL)
1529                return -EINVAL;
1530
1531        rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]);
1532        if (rtab == NULL)
1533                return -EINVAL;
1534
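        /*
         * A caller-supplied classid must lie under this qdisc's major
         * handle and be unused.  Otherwise an unused minor id in the
         * 0x8001..0xFFFF range is generated via the hgenerator counter.
         */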
1535        if (classid) {
1536                err = -EINVAL;
1537                if (TC_H_MAJ(classid ^ sch->handle) ||
1538                    cbq_class_lookup(q, classid))
1539                        goto failure;
1540        } else {
1541                int i;
1542                classid = TC_H_MAKE(sch->handle, 0x8000);
1543
1544                for (i = 0; i < 0x8000; i++) {
1545                        if (++q->hgenerator >= 0x8000)
1546                                q->hgenerator = 1;
1547                        if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1548                                break;
1549                }
1550                err = -ENOSR;
1551                if (i >= 0x8000)
1552                        goto failure;
1553                classid = classid|q->hgenerator;
1554        }
1555
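        /*
         * New classes hang off the root "link" class unless an explicit
         * parent was given, in which case that parent must already exist.
         */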
1556        parent = &q->link;
1557        if (parentid) {
1558                parent = cbq_class_lookup(q, parentid);
1559                err = -EINVAL;
1560                if (parent == NULL)
1561                        goto failure;
1562        }
1563
1564        err = -ENOBUFS;
1565        cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1566        if (cl == NULL)
1567                goto failure;
1568
1569        err = tcf_block_get(&cl->block, &cl->filter_list);
1570        if (err) {
1571                kfree(cl);
1572                goto failure;   /* also frees rtab acquired above */
1573        }
1574
1575        if (tca[TCA_RATE]) {
1576                err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
1577                                        NULL,
1578                                        qdisc_root_sleeping_running(sch),
1579                                        tca[TCA_RATE]);
1580                if (err) {
1581                        tcf_block_put(cl->block);
1582                        kfree(cl);
1583                        goto failure;
1584                }
1585        }
1586
1587        cl->R_tab = rtab;
1588        rtab = NULL;
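        /*
         * Each class gets its own leaf qdisc, a default pfifo.  If that
         * allocation fails the class falls back to noop_qdisc, which
         * drops everything until the user grafts a real qdisc.
         */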
1589        cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
1590        if (!cl->q)
1591                cl->q = &noop_qdisc;
1592        else
1593                qdisc_hash_add(cl->q, true);
1594
1595        cl->common.classid = classid;
1596        cl->tparent = parent;
1597        cl->qdisc = sch;
1598        cl->allot = parent->allot;
1599        cl->quantum = cl->allot;
1600        cl->weight = cl->R_tab->rate.rate;
1601
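        /*
         * Link the class into the hash and the link-sharing tree under
         * the qdisc lock: "borrow" always points at the parent, "share"
         * only for non-top-level classes.  LSS parameters the user left
         * at zero are inherited from the root link class.
         */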
1602        sch_tree_lock(sch);
1603        cbq_link_class(cl);
1604        cl->borrow = cl->tparent;
1605        if (cl->tparent != &q->link)
1606                cl->share = cl->tparent;
1607        cbq_adjust_levels(parent);
1608        cl->minidle = -0x7FFFFFFF;
1609        cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1610        cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1611        if (cl->ewma_log == 0)
1612                cl->ewma_log = q->link.ewma_log;
1613        if (cl->maxidle == 0)
1614                cl->maxidle = q->link.maxidle;
1615        if (cl->avpkt == 0)
1616                cl->avpkt = q->link.avpkt;
1617        if (tb[TCA_CBQ_FOPT])
1618                cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1619        sch_tree_unlock(sch);
1620
1621        qdisc_class_hash_grow(sch, &q->clhash);
1622
1623        *arg = (unsigned long)cl;
1624        return 0;
1625
1626failure:
1627        qdisc_put_rtab(rtab);
1628        return err;
1629}
1630
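/*
 * cbq_delete() backs "tc class del".  A class can only be removed once
 * nothing references it: bound filters, child classes, or being the
 * built-in root class all yield -EBUSY.  Its queue is flushed, the
 * backlog change is propagated up the tree, and cached tx/rx class
 * pointers are cleared before the class itself is destroyed.
 */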
1631static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1632{
1633        struct cbq_sched_data *q = qdisc_priv(sch);
1634        struct cbq_class *cl = (struct cbq_class *)arg;
1635        unsigned int qlen, backlog;
1636
1637        if (cl->filters || cl->children || cl == &q->link)
1638                return -EBUSY;
1639
1640        sch_tree_lock(sch);
1641
1642        qlen = cl->q->q.qlen;
1643        backlog = cl->q->qstats.backlog;
1644        qdisc_reset(cl->q);
1645        qdisc_tree_reduce_backlog(cl->q, qlen, backlog);
1646
1647        if (cl->next_alive)
1648                cbq_deactivate_class(cl);
1649
1650        if (q->tx_borrowed == cl)
1651                q->tx_borrowed = q->tx_class;
1652        if (q->tx_class == cl) {
1653                q->tx_class = NULL;
1654                q->tx_borrowed = NULL;
1655        }
1656#ifdef CONFIG_NET_CLS_ACT
1657        if (q->rx_class == cl)
1658                q->rx_class = NULL;
1659#endif
1660
1661        cbq_unlink_class(cl);
1662        cbq_adjust_levels(cl->tparent);
1663        cl->defmap = 0;
1664        cbq_sync_defmap(cl);
1665
1666        cbq_rmprio(q, cl);
1667        sch_tree_unlock(sch);
1668
1669        cbq_destroy_class(sch, cl);
1670        return 0;
1671}
1672
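/*
 * Return the filter block of a class, or of the root link class when
 * filters are attached to the qdisc itself (arg == 0).
 */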
1673static struct tcf_block *cbq_tcf_block(struct Qdisc *sch, unsigned long arg)
1674{
1675        struct cbq_sched_data *q = qdisc_priv(sch);
1676        struct cbq_class *cl = (struct cbq_class *)arg;
1677
1678        if (cl == NULL)
1679                cl = &q->link;
1680
1681        return cl->block;
1682}
1683
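/*
 * Filters are bound to classes by reference count.  A filter attached
 * at class p may only select classes strictly below p in the hierarchy
 * (p->level > cl->level); otherwise the bind is refused, so
 * classification can never point back up the tree.
 */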
1684static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1685                                     u32 classid)
1686{
1687        struct cbq_sched_data *q = qdisc_priv(sch);
1688        struct cbq_class *p = (struct cbq_class *)parent;
1689        struct cbq_class *cl = cbq_class_lookup(q, classid);
1690
1691        if (cl) {
1692                if (p && p->level <= cl->level)
1693                        return 0;
1694                cl->filters++;
1695                return (unsigned long)cl;
1696        }
1697        return 0;
1698}
1699
1700static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
1701{
1702        struct cbq_class *cl = (struct cbq_class *)arg;
1703
1704        cl->filters--;
1705}
1706
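/*
 * Walk all classes for a dump.  The walker's skip/count bookkeeping
 * lets userspace resume an interrupted dump; arg->fn() returning a
 * negative value stops the walk.
 */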
1707static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1708{
1709        struct cbq_sched_data *q = qdisc_priv(sch);
1710        struct cbq_class *cl;
1711        unsigned int h;
1712
1713        if (arg->stop)
1714                return;
1715
1716        for (h = 0; h < q->clhash.hashsize; h++) {
1717                hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1718                        if (arg->count < arg->skip) {
1719                                arg->count++;
1720                                continue;
1721                        }
1722                        if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1723                                arg->stop = 1;
1724                                return;
1725                        }
1726                        arg->count++;
1727                }
1728        }
1729}
1730
1731static const struct Qdisc_class_ops cbq_class_ops = {
1732        .graft          =       cbq_graft,
1733        .leaf           =       cbq_leaf,
1734        .qlen_notify    =       cbq_qlen_notify,
1735        .find           =       cbq_find,
1736        .change         =       cbq_change_class,
1737        .delete         =       cbq_delete,
1738        .walk           =       cbq_walk,
1739        .tcf_block      =       cbq_tcf_block,
1740        .bind_tcf       =       cbq_bind_filter,
1741        .unbind_tcf     =       cbq_unbind_filter,
1742        .dump           =       cbq_dump_class,
1743        .dump_stats     =       cbq_dump_class_stats,
1744};
1745
1746static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
1747        .next           =       NULL,
1748        .cl_ops         =       &cbq_class_ops,
1749        .id             =       "cbq",
1750        .priv_size      =       sizeof(struct cbq_sched_data),
1751        .enqueue        =       cbq_enqueue,
1752        .dequeue        =       cbq_dequeue,
1753        .peek           =       qdisc_peek_dequeued,
1754        .init           =       cbq_init,
1755        .reset          =       cbq_reset,
1756        .destroy        =       cbq_destroy,
1757        .change         =       NULL,
1758        .dump           =       cbq_dump,
1759        .dump_stats     =       cbq_dump_stats,
1760        .owner          =       THIS_MODULE,
1761};
1762
1763static int __init cbq_module_init(void)
1764{
1765        return register_qdisc(&cbq_qdisc_ops);
1766}
1767static void __exit cbq_module_exit(void)
1768{
1769        unregister_qdisc(&cbq_qdisc_ops);
1770}
1771module_init(cbq_module_init)
1772module_exit(cbq_module_exit)
1773MODULE_LICENSE("GPL");
1774