linux/net/sched/sch_cbq.c
   1/*
   2 * net/sched/sch_cbq.c  Class-Based Queueing discipline.
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10 *
  11 */
  12
  13#include <linux/module.h>
  14#include <linux/slab.h>
  15#include <linux/types.h>
  16#include <linux/kernel.h>
  17#include <linux/string.h>
  18#include <linux/errno.h>
  19#include <linux/skbuff.h>
  20#include <net/netlink.h>
  21#include <net/pkt_sched.h>
  22#include <net/pkt_cls.h>
  23
  24
  25/*      Class-Based Queueing (CBQ) algorithm.
  26        =======================================
  27
  28        Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
  29                 Management Models for Packet Networks",
  30                 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
  31
  32                 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
  33
  34                 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
  35                 Parameters", 1996
  36
  37                 [4] Sally Floyd and Michael Speer, "Experimental Results
  38                 for Class-Based Queueing", 1998, not published.
  39
  40        -----------------------------------------------------------------------
  41
   42        The algorithm skeleton was taken from the NS simulator cbq.cc.
   43        If someone wants to check this code against the LBL version,
   44        they should take into account that ONLY the skeleton was borrowed;
   45        the implementation is different. In particular:
  46
   47        --- The WRR algorithm is different. Our version looks more
   48        reasonable (I hope) and works when quanta are allowed to be
   49        less than the MTU, which is always the case when real-time classes
   50        have small rates. Note that the statement in [3] is
   51        incomplete: the delay can actually be estimated even if the class
   52        per-round allotment is less than the MTU. Namely, if the per-round
   53        allotment is W*r_i, and r_1+...+r_k = r < 1, then
  54
  55        delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
  56
  57        In the worst case we have IntServ estimate with D = W*r+k*MTU
  58        and C = MTU*r. The proof (if correct at all) is trivial.
  59
  60
  61        --- It seems that cbq-2.0 is not very accurate. At least, I cannot
  62        interpret some places, which look like wrong translations
   63        from NS. Anyone is advised to find these differences
   64        and explain to me why I am wrong 8).
  65
   66        --- Linux has no EOI event, so we cannot estimate the true class
   67        idle time. The workaround is to treat the next dequeue event
   68        as a sign that the previous packet has finished. This is wrong because of
   69        internal device queueing, but on a permanently loaded link it is true.
   70        Moreover, combined with the clock integrator, this scheme looks
   71        very close to an ideal solution.  */
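
/* An illustrative instance of the bound above (numbers are assumed here,
 * not taken from [1]-[4]): let MTU = 1500 bytes, let class i get
 * r_i = 0.1 of the link, choose the round scale W = 5000 bytes so that
 * its per-round allotment is W*r_i = 500 bytes, and let k = 4 classes
 * share r = 0.5.  Then
 *
 *      delay_i <= (ceil(1500/500)*5000*0.5 + 5000*0.5 + 4*1500)/B
 *               = (7500 + 2500 + 6000)/B = 16000 bytes / B,
 *
 * so the delay stays bounded even though the per-round allotment
 * (500 bytes) is smaller than the MTU.
 */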
  72
  73struct cbq_sched_data;
  74
  75
  76struct cbq_class {
  77        struct Qdisc_class_common common;
  78        struct cbq_class        *next_alive;    /* next class with backlog in this priority band */
  79
  80/* Parameters */
  81        unsigned char           priority;       /* class priority */
  82        unsigned char           priority2;      /* priority to be used after overlimit */
  83        unsigned char           ewma_log;       /* time constant for idle time calculation */
  84
  85        u32                     defmap;
  86
  87        /* Link-sharing scheduler parameters */
  88        long                    maxidle;        /* Class parameters: see below. */
  89        long                    offtime;
  90        long                    minidle;
  91        u32                     avpkt;
  92        struct qdisc_rate_table *R_tab;
  93
  94        /* General scheduler (WRR) parameters */
  95        long                    allot;
  96        long                    quantum;        /* Allotment per WRR round */
  97        long                    weight;         /* Relative allotment: see below */
  98
  99        struct Qdisc            *qdisc;         /* Ptr to CBQ discipline */
 100        struct cbq_class        *split;         /* Ptr to split node */
 101        struct cbq_class        *share;         /* Ptr to LS parent in the class tree */
 102        struct cbq_class        *tparent;       /* Ptr to tree parent in the class tree */
 103        struct cbq_class        *borrow;        /* NULL if class is bandwidth limited;
 104                                                   parent otherwise */
 105        struct cbq_class        *sibling;       /* Sibling chain */
 106        struct cbq_class        *children;      /* Pointer to children chain */
 107
 108        struct Qdisc            *q;             /* Elementary queueing discipline */
 109
 110
 111/* Variables */
 112        unsigned char           cpriority;      /* Effective priority */
 113        unsigned char           delayed;
 114        unsigned char           level;          /* level of the class in hierarchy:
 115                                                   0 for leaf classes, and maximal
 116                                                   level of children + 1 for nodes.
 117                                                 */
 118
 119        psched_time_t           last;           /* Last end of service */
 120        psched_time_t           undertime;
 121        long                    avgidle;
 122        long                    deficit;        /* Saved deficit for WRR */
 123        psched_time_t           penalized;
 124        struct gnet_stats_basic_packed bstats;
 125        struct gnet_stats_queue qstats;
 126        struct net_rate_estimator __rcu *rate_est;
 127        struct tc_cbq_xstats    xstats;
 128
 129        struct tcf_proto __rcu  *filter_list;
 130        struct tcf_block        *block;
 131
 132        int                     filters;
 133
 134        struct cbq_class        *defaults[TC_PRIO_MAX + 1];
 135};
 136
 137struct cbq_sched_data {
 138        struct Qdisc_class_hash clhash;                 /* Hash table of all classes */
 139        int                     nclasses[TC_CBQ_MAXPRIO + 1];
 140        unsigned int            quanta[TC_CBQ_MAXPRIO + 1];
 141
 142        struct cbq_class        link;
 143
 144        unsigned int            activemask;
 145        struct cbq_class        *active[TC_CBQ_MAXPRIO + 1];    /* List of all classes
 146                                                                   with backlog */
 147
 148#ifdef CONFIG_NET_CLS_ACT
 149        struct cbq_class        *rx_class;
 150#endif
 151        struct cbq_class        *tx_class;
 152        struct cbq_class        *tx_borrowed;
 153        int                     tx_len;
 154        psched_time_t           now;            /* Cached timestamp */
 155        unsigned int            pmask;
 156
 157        struct hrtimer          delay_timer;
 158        struct qdisc_watchdog   watchdog;       /* Watchdog timer,
 159                                                   started when CBQ has
 160                                                   backlog, but cannot
 161                                                   transmit just now */
 162        psched_tdiff_t          wd_expires;
 163        int                     toplevel;
 164        u32                     hgenerator;
 165};
 166
 167
 168#define L2T(cl, len)    qdisc_l2t((cl)->R_tab, len)
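/* L2T ("length to time"): look up, in the class's rate table, the time in
 * psched ticks needed to transmit 'len' bytes at the class's configured rate.
 */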
 169
 170static inline struct cbq_class *
 171cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
 172{
 173        struct Qdisc_class_common *clc;
 174
 175        clc = qdisc_class_find(&q->clhash, classid);
 176        if (clc == NULL)
 177                return NULL;
 178        return container_of(clc, struct cbq_class, common);
 179}
 180
 181#ifdef CONFIG_NET_CLS_ACT
 182
 183static struct cbq_class *
 184cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
 185{
 186        struct cbq_class *cl;
 187
 188        for (cl = this->tparent; cl; cl = cl->tparent) {
 189                struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
 190
 191                if (new != NULL && new != this)
 192                        return new;
 193        }
 194        return NULL;
 195}
 196
 197#endif
 198
 199/* Classify packet. The procedure is pretty complicated, but
 200 * it allows us to combine link sharing and priority scheduling
 201 * transparently.
 202 *
  203 * Namely, you can put link-sharing rules (e.g. route based) at the root of CBQ,
  204 * so that they resolve to split nodes. Then packets are classified
  205 * by logical priority, or a more specific classifier may be attached
  206 * to the split node.
 207 */
 208
 209static struct cbq_class *
 210cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 211{
 212        struct cbq_sched_data *q = qdisc_priv(sch);
 213        struct cbq_class *head = &q->link;
 214        struct cbq_class **defmap;
 215        struct cbq_class *cl = NULL;
 216        u32 prio = skb->priority;
 217        struct tcf_proto *fl;
 218        struct tcf_result res;
 219
 220        /*
 221         *  Step 1. If skb->priority points to one of our classes, use it.
 222         */
 223        if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
 224            (cl = cbq_class_lookup(q, prio)) != NULL)
 225                return cl;
 226
 227        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 228        for (;;) {
 229                int result = 0;
 230                defmap = head->defaults;
 231
 232                fl = rcu_dereference_bh(head->filter_list);
 233                /*
 234                 * Step 2+n. Apply classifier.
 235                 */
 236                result = tcf_classify(skb, fl, &res, true);
 237                if (!fl || result < 0)
 238                        goto fallback;
 239
 240                cl = (void *)res.class;
 241                if (!cl) {
 242                        if (TC_H_MAJ(res.classid))
 243                                cl = cbq_class_lookup(q, res.classid);
 244                        else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
 245                                cl = defmap[TC_PRIO_BESTEFFORT];
 246
 247                        if (cl == NULL)
 248                                goto fallback;
 249                }
 250                if (cl->level >= head->level)
 251                        goto fallback;
 252#ifdef CONFIG_NET_CLS_ACT
 253                switch (result) {
 254                case TC_ACT_QUEUED:
 255                case TC_ACT_STOLEN:
 256                case TC_ACT_TRAP:
 257                        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
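                        /* fall through */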
 258                case TC_ACT_SHOT:
 259                        return NULL;
 260                case TC_ACT_RECLASSIFY:
 261                        return cbq_reclassify(skb, cl);
 262                }
 263#endif
 264                if (cl->level == 0)
 265                        return cl;
 266
 267                /*
  268                 * Step 3+n. If the classifier selected a link-sharing class,
  269                 *         apply the agency-specific classifier.
  270                 *         Repeat this procedure until we hit a leaf node.
 271                 */
 272                head = cl;
 273        }
 274
 275fallback:
 276        cl = head;
 277
 278        /*
 279         * Step 4. No success...
 280         */
 281        if (TC_H_MAJ(prio) == 0 &&
 282            !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
 283            !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
 284                return head;
 285
 286        return cl;
 287}
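
/* Worked example of the walk above (handles are hypothetical): with the
 * qdisc installed as 1:, an skb whose skb->priority is 0x0001000A ("1:10")
 * is matched in step 1 and returned directly, provided class 1:10 exists.
 * Otherwise the filters of the root are consulted; if they resolve to a
 * link-sharing (split) node, that node's own filters are consulted in turn,
 * until a leaf (level 0) class, a defmap default, or the fallback is hit.
 */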
 288
 289/*
  290 * A packet has just been enqueued on an empty class.
  291 * cbq_activate_class adds it to the tail of the active class list
  292 * of its priority band.
 293 */
 294
 295static inline void cbq_activate_class(struct cbq_class *cl)
 296{
 297        struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 298        int prio = cl->cpriority;
 299        struct cbq_class *cl_tail;
 300
 301        cl_tail = q->active[prio];
 302        q->active[prio] = cl;
 303
 304        if (cl_tail != NULL) {
 305                cl->next_alive = cl_tail->next_alive;
 306                cl_tail->next_alive = cl;
 307        } else {
 308                cl->next_alive = cl;
 309                q->activemask |= (1<<prio);
 310        }
 311}
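
/* Note on the list layout used above: q->active[prio] points at the TAIL of
 * a circular singly linked list threaded through ->next_alive; the head is
 * q->active[prio]->next_alive, and a newly activated class is linked in
 * right after the old tail and becomes the new tail.
 */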
 312
 313/*
  314 * Unlink a class from the active chain.
  315 * Note that this same procedure is done directly in cbq_dequeue*
  316 * during the round-robin procedure.
 317 */
 318
 319static void cbq_deactivate_class(struct cbq_class *this)
 320{
 321        struct cbq_sched_data *q = qdisc_priv(this->qdisc);
 322        int prio = this->cpriority;
 323        struct cbq_class *cl;
 324        struct cbq_class *cl_prev = q->active[prio];
 325
 326        do {
 327                cl = cl_prev->next_alive;
 328                if (cl == this) {
 329                        cl_prev->next_alive = cl->next_alive;
 330                        cl->next_alive = NULL;
 331
 332                        if (cl == q->active[prio]) {
 333                                q->active[prio] = cl_prev;
 334                                if (cl == q->active[prio]) {
 335                                        q->active[prio] = NULL;
 336                                        q->activemask &= ~(1<<prio);
 337                                        return;
 338                                }
 339                        }
 340                        return;
 341                }
 342        } while ((cl_prev = cl) != q->active[prio]);
 343}
 344
 345static void
 346cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
 347{
 348        int toplevel = q->toplevel;
 349
 350        if (toplevel > cl->level) {
 351                psched_time_t now = psched_get_time();
 352
 353                do {
 354                        if (cl->undertime < now) {
 355                                q->toplevel = cl->level;
 356                                return;
 357                        }
 358                } while ((cl = cl->borrow) != NULL && toplevel > cl->level);
 359        }
 360}
 361
 362static int
 363cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 364            struct sk_buff **to_free)
 365{
 366        struct cbq_sched_data *q = qdisc_priv(sch);
 367        int uninitialized_var(ret);
 368        struct cbq_class *cl = cbq_classify(skb, sch, &ret);
 369
 370#ifdef CONFIG_NET_CLS_ACT
 371        q->rx_class = cl;
 372#endif
 373        if (cl == NULL) {
 374                if (ret & __NET_XMIT_BYPASS)
 375                        qdisc_qstats_drop(sch);
 376                __qdisc_drop(skb, to_free);
 377                return ret;
 378        }
 379
 380        ret = qdisc_enqueue(skb, cl->q, to_free);
 381        if (ret == NET_XMIT_SUCCESS) {
 382                sch->q.qlen++;
 383                cbq_mark_toplevel(q, cl);
 384                if (!cl->next_alive)
 385                        cbq_activate_class(cl);
 386                return ret;
 387        }
 388
 389        if (net_xmit_drop_count(ret)) {
 390                qdisc_qstats_drop(sch);
 391                cbq_mark_toplevel(q, cl);
 392                cl->qstats.drops++;
 393        }
 394        return ret;
 395}
 396
 397/* Overlimit action: penalize leaf class by adding offtime */
 398static void cbq_overlimit(struct cbq_class *cl)
 399{
 400        struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 401        psched_tdiff_t delay = cl->undertime - q->now;
 402
 403        if (!cl->delayed) {
 404                delay += cl->offtime;
 405
  406                /*
  407                 * The class goes to sleep, so it will have no
  408                 * chance to work off its avgidle. Let's forgive it 8)
  409                 *
  410                 * BTW cbq-2.0 has a bug in this
  411                 * place; apparently they forgot to shift it by cl->ewma_log.
  412                 */
 413                if (cl->avgidle < 0)
 414                        delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
 415                if (cl->avgidle < cl->minidle)
 416                        cl->avgidle = cl->minidle;
 417                if (delay <= 0)
 418                        delay = 1;
 419                cl->undertime = q->now + delay;
 420
 421                cl->xstats.overactions++;
 422                cl->delayed = 1;
 423        }
 424        if (q->wd_expires == 0 || q->wd_expires > delay)
 425                q->wd_expires = delay;
 426
  427        /* Dirty work! We must schedule wakeups based on the
  428         * real available rate, rather than the leaf rate,
  429         * which may be tiny (even zero).
  430         */
 431        if (q->toplevel == TC_CBQ_MAXLEVEL) {
 432                struct cbq_class *b;
 433                psched_tdiff_t base_delay = q->wd_expires;
 434
 435                for (b = cl->borrow; b; b = b->borrow) {
 436                        delay = b->undertime - q->now;
 437                        if (delay < base_delay) {
 438                                if (delay <= 0)
 439                                        delay = 1;
 440                                base_delay = delay;
 441                        }
 442                }
 443
 444                q->wd_expires = base_delay;
 445        }
 446}
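
/* Illustrative numbers (assumed, not derived from real traffic): with
 * offtime = 10000 ticks, undertime - now = 3000 ticks, avgidle = -2000 and
 * ewma_log = 5, the forgiven part is 2000 - (2000 >> 5) = 1938 ticks, so the
 * class is put to sleep for 3000 + 10000 - 1938 = 11062 ticks.
 */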
 447
 448static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
 449                                       psched_time_t now)
 450{
 451        struct cbq_class *cl;
 452        struct cbq_class *cl_prev = q->active[prio];
 453        psched_time_t sched = now;
 454
 455        if (cl_prev == NULL)
 456                return 0;
 457
 458        do {
 459                cl = cl_prev->next_alive;
 460                if (now - cl->penalized > 0) {
 461                        cl_prev->next_alive = cl->next_alive;
 462                        cl->next_alive = NULL;
 463                        cl->cpriority = cl->priority;
 464                        cl->delayed = 0;
 465                        cbq_activate_class(cl);
 466
 467                        if (cl == q->active[prio]) {
 468                                q->active[prio] = cl_prev;
 469                                if (cl == q->active[prio]) {
 470                                        q->active[prio] = NULL;
 471                                        return 0;
 472                                }
 473                        }
 474
 475                        cl = cl_prev->next_alive;
 476                } else if (sched - cl->penalized > 0)
 477                        sched = cl->penalized;
 478        } while ((cl_prev = cl) != q->active[prio]);
 479
 480        return sched - now;
 481}
 482
 483static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
 484{
 485        struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
 486                                                delay_timer);
 487        struct Qdisc *sch = q->watchdog.qdisc;
 488        psched_time_t now;
 489        psched_tdiff_t delay = 0;
 490        unsigned int pmask;
 491
 492        now = psched_get_time();
 493
 494        pmask = q->pmask;
 495        q->pmask = 0;
 496
 497        while (pmask) {
 498                int prio = ffz(~pmask);
 499                psched_tdiff_t tmp;
 500
 501                pmask &= ~(1<<prio);
 502
 503                tmp = cbq_undelay_prio(q, prio, now);
 504                if (tmp > 0) {
 505                        q->pmask |= 1<<prio;
 506                        if (tmp < delay || delay == 0)
 507                                delay = tmp;
 508                }
 509        }
 510
 511        if (delay) {
 512                ktime_t time;
 513
 514                time = ktime_set(0, 0);
 515                time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
 516                hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
 517        }
 518
 519        __netif_schedule(qdisc_root(sch));
 520        return HRTIMER_NORESTART;
 521}
 522
  523/*
  524 * This is a mission-critical procedure.
  525 *
  526 * We "regenerate" the toplevel cutoff if the transmitting class
  527 * has backlog and is not regulated. This is not part of the
  528 * original CBQ description, but it looks more reasonable.
  529 * Probably it is wrong; this question needs further investigation.
  530 */
 531
 532static inline void
 533cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
 534                    struct cbq_class *borrowed)
 535{
 536        if (cl && q->toplevel >= borrowed->level) {
 537                if (cl->q->q.qlen > 1) {
 538                        do {
 539                                if (borrowed->undertime == PSCHED_PASTPERFECT) {
 540                                        q->toplevel = borrowed->level;
 541                                        return;
 542                                }
 543                        } while ((borrowed = borrowed->borrow) != NULL);
 544                }
 545#if 0
 546        /* It is not necessary now. Uncommenting it
 547           will save CPU cycles, but decrease fairness.
 548         */
 549                q->toplevel = TC_CBQ_MAXLEVEL;
 550#endif
 551        }
 552}
 553
 554static void
 555cbq_update(struct cbq_sched_data *q)
 556{
 557        struct cbq_class *this = q->tx_class;
 558        struct cbq_class *cl = this;
 559        int len = q->tx_len;
 560        psched_time_t now;
 561
 562        q->tx_class = NULL;
  563        /* Time integrator. We calculate the EOS (end-of-service) time
  564         * by adding the expected packet transmission time.
  565         */
 566        now = q->now + L2T(&q->link, len);
 567
 568        for ( ; cl; cl = cl->share) {
 569                long avgidle = cl->avgidle;
 570                long idle;
 571
 572                cl->bstats.packets++;
 573                cl->bstats.bytes += len;
 574
  575                /*
  576                 * (now - last) is the total time between packet right edges.
  577                 * (last_pktlen/rate) is the "virtual" busy time, so that
  578                 *
  579                 *      idle = (now - last) - last_pktlen/rate
  580                 */
 581
 582                idle = now - cl->last;
 583                if ((unsigned long)idle > 128*1024*1024) {
 584                        avgidle = cl->maxidle;
 585                } else {
 586                        idle -= L2T(cl, len);
 587
 588                /* true_avgidle := (1-W)*true_avgidle + W*idle,
 589                 * where W=2^{-ewma_log}. But cl->avgidle is scaled:
 590                 * cl->avgidle == true_avgidle/W,
 591                 * hence:
 592                 */
 593                        avgidle += idle - (avgidle>>cl->ewma_log);
 594                }
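                /* Illustrative check of the scaling (assumed ewma_log = 5,
                 * i.e. W = 1/32): the true update
                 *      true_avgidle' = (31/32)*true_avgidle + (1/32)*idle
                 * becomes, in the scaled variable avgidle = true_avgidle/W,
                 *      avgidle' = avgidle + idle - (avgidle >> 5),
                 * which is exactly the statement above.
                 */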
 595
 596                if (avgidle <= 0) {
 597                        /* Overlimit or at-limit */
 598
 599                        if (avgidle < cl->minidle)
 600                                avgidle = cl->minidle;
 601
 602                        cl->avgidle = avgidle;
 603
  604                        /* Calculate the expected time when this class
  605                         * will be allowed to send.
  606                         * It will occur when:
  607                         * (1-W)*true_avgidle + W*idle = 0, i.e.
  608                         * idle = (1/W - 1)*(-true_avgidle)
  609                         * or
  610                         * idle = (1 - W)*(-cl->avgidle);
  611                         */
 612                        idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
 613
  614                        /*
  615                         * That is not all.
  616                         * To maintain the rate allocated to the class,
  617                         * we add to undertime the virtual clock time
  618                         * necessary to complete the transmitted packet
  619                         * (len/phys_bandwidth has already passed by the
  620                         * moment cbq_update is called).
  621                         */
 622
 623                        idle -= L2T(&q->link, len);
 624                        idle += L2T(cl, len);
 625
 626                        cl->undertime = now + idle;
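                        /* Illustrative numbers (assumed): with avgidle = -2000
                         * and ewma_log = 5, idle = 2000 - (2000 >> 5) = 1938
                         * ticks; after swapping the link transmission time for
                         * the class transmission time, the class is throttled
                         * until now + idle.
                         */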
 627                } else {
 628                        /* Underlimit */
 629
 630                        cl->undertime = PSCHED_PASTPERFECT;
 631                        if (avgidle > cl->maxidle)
 632                                cl->avgidle = cl->maxidle;
 633                        else
 634                                cl->avgidle = avgidle;
 635                }
 636                if ((s64)(now - cl->last) > 0)
 637                        cl->last = now;
 638        }
 639
 640        cbq_update_toplevel(q, this, q->tx_borrowed);
 641}
 642
 643static inline struct cbq_class *
 644cbq_under_limit(struct cbq_class *cl)
 645{
 646        struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 647        struct cbq_class *this_cl = cl;
 648
 649        if (cl->tparent == NULL)
 650                return cl;
 651
 652        if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
 653                cl->delayed = 0;
 654                return cl;
 655        }
 656
 657        do {
  658                /* This is a very suspicious place. Currently the overlimit
  659                 * action is generated for non-bounded classes
  660                 * only if the link is completely congested.
  661                 * Though this agrees with the ancestor-only paradigm,
  662                 * it looks very stupid. In particular,
  663                 * it means that this chunk of code will either
  664                 * never be called or result in strong amplification
  665                 * of burstiness. Dangerous and silly, but
  666                 * no other solution exists.
  667                 */
 668                cl = cl->borrow;
 669                if (!cl) {
 670                        this_cl->qstats.overlimits++;
 671                        cbq_overlimit(this_cl);
 672                        return NULL;
 673                }
 674                if (cl->level > q->toplevel)
 675                        return NULL;
 676        } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
 677
 678        cl->delayed = 0;
 679        return cl;
 680}
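
/* Note on the q->toplevel check above: it acts as a borrowing cutoff.
 * For example, if a level-0 leaf is overlimit and q->toplevel == 1, it may
 * borrow from its level-1 parent, but the walk stops before a level-2
 * grandparent; with q->toplevel == TC_CBQ_MAXLEVEL borrowing is unrestricted.
 */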
 681
 682static inline struct sk_buff *
 683cbq_dequeue_prio(struct Qdisc *sch, int prio)
 684{
 685        struct cbq_sched_data *q = qdisc_priv(sch);
 686        struct cbq_class *cl_tail, *cl_prev, *cl;
 687        struct sk_buff *skb;
 688        int deficit;
 689
 690        cl_tail = cl_prev = q->active[prio];
 691        cl = cl_prev->next_alive;
 692
 693        do {
 694                deficit = 0;
 695
 696                /* Start round */
 697                do {
 698                        struct cbq_class *borrow = cl;
 699
 700                        if (cl->q->q.qlen &&
 701                            (borrow = cbq_under_limit(cl)) == NULL)
 702                                goto skip_class;
 703
 704                        if (cl->deficit <= 0) {
 705                                /* Class exhausted its allotment per
 706                                 * this round. Switch to the next one.
 707                                 */
 708                                deficit = 1;
 709                                cl->deficit += cl->quantum;
 710                                goto next_class;
 711                        }
 712
 713                        skb = cl->q->dequeue(cl->q);
 714
  715                        /* The class did not give us any skb :-(
  716                         * It could occur even if cl->q->q.qlen != 0,
  717                         * e.g. if cl->q == "tbf"
  718                         */
 719                        if (skb == NULL)
 720                                goto skip_class;
 721
 722                        cl->deficit -= qdisc_pkt_len(skb);
 723                        q->tx_class = cl;
 724                        q->tx_borrowed = borrow;
 725                        if (borrow != cl) {
 726#ifndef CBQ_XSTATS_BORROWS_BYTES
 727                                borrow->xstats.borrows++;
 728                                cl->xstats.borrows++;
 729#else
 730                                borrow->xstats.borrows += qdisc_pkt_len(skb);
 731                                cl->xstats.borrows += qdisc_pkt_len(skb);
 732#endif
 733                        }
 734                        q->tx_len = qdisc_pkt_len(skb);
 735
 736                        if (cl->deficit <= 0) {
 737                                q->active[prio] = cl;
 738                                cl = cl->next_alive;
 739                                cl->deficit += cl->quantum;
 740                        }
 741                        return skb;
 742
 743skip_class:
 744                        if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
 745                                /* Class is empty or penalized.
 746                                 * Unlink it from active chain.
 747                                 */
 748                                cl_prev->next_alive = cl->next_alive;
 749                                cl->next_alive = NULL;
 750
 751                                /* Did cl_tail point to it? */
 752                                if (cl == cl_tail) {
 753                                        /* Repair it! */
 754                                        cl_tail = cl_prev;
 755
 756                                        /* Was it the last class in this band? */
 757                                        if (cl == cl_tail) {
 758                                                /* Kill the band! */
 759                                                q->active[prio] = NULL;
 760                                                q->activemask &= ~(1<<prio);
 761                                                if (cl->q->q.qlen)
 762                                                        cbq_activate_class(cl);
 763                                                return NULL;
 764                                        }
 765
 766                                        q->active[prio] = cl_tail;
 767                                }
 768                                if (cl->q->q.qlen)
 769                                        cbq_activate_class(cl);
 770
 771                                cl = cl_prev;
 772                        }
 773
 774next_class:
 775                        cl_prev = cl;
 776                        cl = cl->next_alive;
 777                } while (cl_prev != cl_tail);
 778        } while (deficit);
 779
 780        q->active[prio] = cl_prev;
 781
 782        return NULL;
 783}
 784
 785static inline struct sk_buff *
 786cbq_dequeue_1(struct Qdisc *sch)
 787{
 788        struct cbq_sched_data *q = qdisc_priv(sch);
 789        struct sk_buff *skb;
 790        unsigned int activemask;
 791
 792        activemask = q->activemask & 0xFF;
 793        while (activemask) {
 794                int prio = ffz(~activemask);
 795                activemask &= ~(1<<prio);
 796                skb = cbq_dequeue_prio(sch, prio);
 797                if (skb)
 798                        return skb;
 799        }
 800        return NULL;
 801}
 802
 803static struct sk_buff *
 804cbq_dequeue(struct Qdisc *sch)
 805{
 806        struct sk_buff *skb;
 807        struct cbq_sched_data *q = qdisc_priv(sch);
 808        psched_time_t now;
 809
 810        now = psched_get_time();
 811
 812        if (q->tx_class)
 813                cbq_update(q);
 814
 815        q->now = now;
 816
 817        for (;;) {
 818                q->wd_expires = 0;
 819
 820                skb = cbq_dequeue_1(sch);
 821                if (skb) {
 822                        qdisc_bstats_update(sch, skb);
 823                        sch->q.qlen--;
 824                        return skb;
 825                }
 826
  827                /* All the classes are overlimit.
  828                 *
  829                 * This is possible if:
  830                 *
  831                 * 1. The scheduler is empty.
  832                 * 2. The toplevel cutoff inhibited borrowing.
  833                 * 3. The root class is overlimit.
  834                 *
  835                 * Reset the 2nd and 3rd conditions and retry.
  836                 *
  837                 * Note that NS and cbq-2.0 are buggy here: peeking
  838                 * an arbitrary class is appropriate for ancestor-only
  839                 * sharing, but not for the top-level algorithm.
  840                 *
  841                 * Our version is better but slower, because it requires
  842                 * two passes; this is unavoidable with top-level sharing.
  843                 */
 844
 845                if (q->toplevel == TC_CBQ_MAXLEVEL &&
 846                    q->link.undertime == PSCHED_PASTPERFECT)
 847                        break;
 848
 849                q->toplevel = TC_CBQ_MAXLEVEL;
 850                q->link.undertime = PSCHED_PASTPERFECT;
 851        }
 852
  853        /* No packets in the scheduler, or nobody wants to give them to us :-(
  854         * Sigh... start the watchdog timer in the latter case.
  855         */
 856
 857        if (sch->q.qlen) {
 858                qdisc_qstats_overlimit(sch);
 859                if (q->wd_expires)
 860                        qdisc_watchdog_schedule(&q->watchdog,
 861                                                now + q->wd_expires);
 862        }
 863        return NULL;
 864}
 865
  866/* CBQ class maintenance routines */
 867
 868static void cbq_adjust_levels(struct cbq_class *this)
 869{
 870        if (this == NULL)
 871                return;
 872
 873        do {
 874                int level = 0;
 875                struct cbq_class *cl;
 876
 877                cl = this->children;
 878                if (cl) {
 879                        do {
 880                                if (cl->level > level)
 881                                        level = cl->level;
 882                        } while ((cl = cl->sibling) != this->children);
 883                }
 884                this->level = level + 1;
 885        } while ((this = this->tparent) != NULL);
 886}
 887
 888static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
 889{
 890        struct cbq_class *cl;
 891        unsigned int h;
 892
 893        if (q->quanta[prio] == 0)
 894                return;
 895
 896        for (h = 0; h < q->clhash.hashsize; h++) {
 897                hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
  898                        /* BUGGGG... Beware! This expression suffers from
  899                         * arithmetic overflow!
  900                         */
 901                        if (cl->priority == prio) {
 902                                cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
 903                                        q->quanta[prio];
 904                        }
 905                        if (cl->quantum <= 0 ||
 906                            cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
 907                                pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
 908                                        cl->common.classid, cl->quantum);
 909                                cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
 910                        }
 911                }
 912        }
 913}
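
/* Illustrative numbers (assumed): two classes at the same priority with
 * weights 2 and 1 and allot 1514 give q->quanta[prio] = 3 and
 * q->nclasses[prio] = 2, so the quanta become 2*1514*2/3 = 2018 and
 * 1*1514*2/3 = 1009 bytes: proportional to weight, averaging to the allot.
 */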
 914
 915static void cbq_sync_defmap(struct cbq_class *cl)
 916{
 917        struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 918        struct cbq_class *split = cl->split;
 919        unsigned int h;
 920        int i;
 921
 922        if (split == NULL)
 923                return;
 924
 925        for (i = 0; i <= TC_PRIO_MAX; i++) {
 926                if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
 927                        split->defaults[i] = NULL;
 928        }
 929
 930        for (i = 0; i <= TC_PRIO_MAX; i++) {
 931                int level = split->level;
 932
 933                if (split->defaults[i])
 934                        continue;
 935
 936                for (h = 0; h < q->clhash.hashsize; h++) {
 937                        struct cbq_class *c;
 938
 939                        hlist_for_each_entry(c, &q->clhash.hash[h],
 940                                             common.hnode) {
 941                                if (c->split == split && c->level < level &&
 942                                    c->defmap & (1<<i)) {
 943                                        split->defaults[i] = c;
 944                                        level = c->level;
 945                                }
 946                        }
 947                }
 948        }
 949}
 950
 951static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
 952{
 953        struct cbq_class *split = NULL;
 954
 955        if (splitid == 0) {
 956                split = cl->split;
 957                if (!split)
 958                        return;
 959                splitid = split->common.classid;
 960        }
 961
 962        if (split == NULL || split->common.classid != splitid) {
 963                for (split = cl->tparent; split; split = split->tparent)
 964                        if (split->common.classid == splitid)
 965                                break;
 966        }
 967
 968        if (split == NULL)
 969                return;
 970
 971        if (cl->split != split) {
 972                cl->defmap = 0;
 973                cbq_sync_defmap(cl);
 974                cl->split = split;
 975                cl->defmap = def & mask;
 976        } else
 977                cl->defmap = (cl->defmap & ~mask) | (def & mask);
 978
 979        cbq_sync_defmap(cl);
 980}
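
/* Reading of the two functions above: a class whose defmap has bit i set
 * volunteers to be the default destination for TC_PRIO value i under its
 * split node; cbq_sync_defmap() then records, for each priority, the
 * lowest-level volunteering class below the split in split->defaults[].
 */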
 981
 982static void cbq_unlink_class(struct cbq_class *this)
 983{
 984        struct cbq_class *cl, **clp;
 985        struct cbq_sched_data *q = qdisc_priv(this->qdisc);
 986
 987        qdisc_class_hash_remove(&q->clhash, &this->common);
 988
 989        if (this->tparent) {
 990                clp = &this->sibling;
 991                cl = *clp;
 992                do {
 993                        if (cl == this) {
 994                                *clp = cl->sibling;
 995                                break;
 996                        }
 997                        clp = &cl->sibling;
 998                } while ((cl = *clp) != this->sibling);
 999
1000                if (this->tparent->children == this) {
1001                        this->tparent->children = this->sibling;
1002                        if (this->sibling == this)
1003                                this->tparent->children = NULL;
1004                }
1005        } else {
1006                WARN_ON(this->sibling != this);
1007        }
1008}
1009
1010static void cbq_link_class(struct cbq_class *this)
1011{
1012        struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1013        struct cbq_class *parent = this->tparent;
1014
1015        this->sibling = this;
1016        qdisc_class_hash_insert(&q->clhash, &this->common);
1017
1018        if (parent == NULL)
1019                return;
1020
1021        if (parent->children == NULL) {
1022                parent->children = this;
1023        } else {
1024                this->sibling = parent->children->sibling;
1025                parent->children->sibling = this;
1026        }
1027}
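
/* Note on the layout maintained above: the children of a parent form a
 * circular singly linked list through ->sibling; a class with no siblings
 * points back to itself, and parent->children is merely one entry point
 * into that ring.
 */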
1028
1029static void
1030cbq_reset(struct Qdisc *sch)
1031{
1032        struct cbq_sched_data *q = qdisc_priv(sch);
1033        struct cbq_class *cl;
1034        int prio;
1035        unsigned int h;
1036
1037        q->activemask = 0;
1038        q->pmask = 0;
1039        q->tx_class = NULL;
1040        q->tx_borrowed = NULL;
1041        qdisc_watchdog_cancel(&q->watchdog);
1042        hrtimer_cancel(&q->delay_timer);
1043        q->toplevel = TC_CBQ_MAXLEVEL;
1044        q->now = psched_get_time();
1045
1046        for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1047                q->active[prio] = NULL;
1048
1049        for (h = 0; h < q->clhash.hashsize; h++) {
1050                hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1051                        qdisc_reset(cl->q);
1052
1053                        cl->next_alive = NULL;
1054                        cl->undertime = PSCHED_PASTPERFECT;
1055                        cl->avgidle = cl->maxidle;
1056                        cl->deficit = cl->quantum;
1057                        cl->cpriority = cl->priority;
1058                }
1059        }
1060        sch->q.qlen = 0;
1061}
1062
1063
1064static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1065{
1066        if (lss->change & TCF_CBQ_LSS_FLAGS) {
1067                cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1068                cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1069        }
1070        if (lss->change & TCF_CBQ_LSS_EWMA)
1071                cl->ewma_log = lss->ewma_log;
1072        if (lss->change & TCF_CBQ_LSS_AVPKT)
1073                cl->avpkt = lss->avpkt;
1074        if (lss->change & TCF_CBQ_LSS_MINIDLE)
1075                cl->minidle = -(long)lss->minidle;
1076        if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
1077                cl->maxidle = lss->maxidle;
1078                cl->avgidle = lss->maxidle;
1079        }
1080        if (lss->change & TCF_CBQ_LSS_OFFTIME)
1081                cl->offtime = lss->offtime;
1082        return 0;
1083}
1084
1085static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1086{
1087        q->nclasses[cl->priority]--;
1088        q->quanta[cl->priority] -= cl->weight;
1089        cbq_normalize_quanta(q, cl->priority);
1090}
1091
1092static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1093{
1094        q->nclasses[cl->priority]++;
1095        q->quanta[cl->priority] += cl->weight;
1096        cbq_normalize_quanta(q, cl->priority);
1097}
1098
1099static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1100{
1101        struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1102
1103        if (wrr->allot)
1104                cl->allot = wrr->allot;
1105        if (wrr->weight)
1106                cl->weight = wrr->weight;
1107        if (wrr->priority) {
1108                cl->priority = wrr->priority - 1;
1109                cl->cpriority = cl->priority;
1110                if (cl->priority >= cl->priority2)
1111                        cl->priority2 = TC_CBQ_MAXPRIO - 1;
1112        }
1113
1114        cbq_addprio(q, cl);
1115        return 0;
1116}
1117
1118static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1119{
1120        cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1121        return 0;
1122}
1123
1124static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
1125        [TCA_CBQ_LSSOPT]        = { .len = sizeof(struct tc_cbq_lssopt) },
1126        [TCA_CBQ_WRROPT]        = { .len = sizeof(struct tc_cbq_wrropt) },
1127        [TCA_CBQ_FOPT]          = { .len = sizeof(struct tc_cbq_fopt) },
1128        [TCA_CBQ_OVL_STRATEGY]  = { .len = sizeof(struct tc_cbq_ovl) },
1129        [TCA_CBQ_RATE]          = { .len = sizeof(struct tc_ratespec) },
1130        [TCA_CBQ_RTAB]          = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1131        [TCA_CBQ_POLICE]        = { .len = sizeof(struct tc_cbq_police) },
1132};
1133
1134static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
1135{
1136        struct cbq_sched_data *q = qdisc_priv(sch);
1137        struct nlattr *tb[TCA_CBQ_MAX + 1];
1138        struct tc_ratespec *r;
1139        int err;
1140
1141        qdisc_watchdog_init(&q->watchdog, sch);
1142        hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1143        q->delay_timer.function = cbq_undelay;
1144
1145        if (!opt)
1146                return -EINVAL;
1147
 1148        err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy, NULL);
1149        if (err < 0)
1150                return err;
1151
1152        if (tb[TCA_CBQ_RTAB] == NULL || tb[TCA_CBQ_RATE] == NULL)
1153                return -EINVAL;
1154
1155        r = nla_data(tb[TCA_CBQ_RATE]);
1156
1157        if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL)
1158                return -EINVAL;
1159
1160        err = tcf_block_get(&q->link.block, &q->link.filter_list);
1161        if (err)
1162                goto put_rtab;
1163
1164        err = qdisc_class_hash_init(&q->clhash);
1165        if (err < 0)
1166                goto put_block;
1167
1168        q->link.sibling = &q->link;
1169        q->link.common.classid = sch->handle;
1170        q->link.qdisc = sch;
1171        q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1172                                      sch->handle);
1173        if (!q->link.q)
1174                q->link.q = &noop_qdisc;
1175        else
1176                qdisc_hash_add(q->link.q, true);
1177
1178        q->link.priority = TC_CBQ_MAXPRIO - 1;
1179        q->link.priority2 = TC_CBQ_MAXPRIO - 1;
1180        q->link.cpriority = TC_CBQ_MAXPRIO - 1;
1181        q->link.allot = psched_mtu(qdisc_dev(sch));
1182        q->link.quantum = q->link.allot;
1183        q->link.weight = q->link.R_tab->rate.rate;
1184
1185        q->link.ewma_log = TC_CBQ_DEF_EWMA;
1186        q->link.avpkt = q->link.allot/2;
1187        q->link.minidle = -0x7FFFFFFF;
1188
1189        q->toplevel = TC_CBQ_MAXLEVEL;
1190        q->now = psched_get_time();
1191
1192        cbq_link_class(&q->link);
1193
1194        if (tb[TCA_CBQ_LSSOPT])
1195                cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
1196
1197        cbq_addprio(q, &q->link);
1198        return 0;
1199
1200put_block:
1201        tcf_block_put(q->link.block);
1202
1203put_rtab:
1204        qdisc_put_rtab(q->link.R_tab);
1205        return err;
1206}
1207
1208static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1209{
1210        unsigned char *b = skb_tail_pointer(skb);
1211
1212        if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
1213                goto nla_put_failure;
1214        return skb->len;
1215
1216nla_put_failure:
1217        nlmsg_trim(skb, b);
1218        return -1;
1219}
1220
1221static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1222{
1223        unsigned char *b = skb_tail_pointer(skb);
1224        struct tc_cbq_lssopt opt;
1225
1226        opt.flags = 0;
1227        if (cl->borrow == NULL)
1228                opt.flags |= TCF_CBQ_LSS_BOUNDED;
1229        if (cl->share == NULL)
1230                opt.flags |= TCF_CBQ_LSS_ISOLATED;
1231        opt.ewma_log = cl->ewma_log;
1232        opt.level = cl->level;
1233        opt.avpkt = cl->avpkt;
1234        opt.maxidle = cl->maxidle;
1235        opt.minidle = (u32)(-cl->minidle);
1236        opt.offtime = cl->offtime;
1237        opt.change = ~0;
1238        if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
1239                goto nla_put_failure;
1240        return skb->len;
1241
1242nla_put_failure:
1243        nlmsg_trim(skb, b);
1244        return -1;
1245}
1246
1247static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1248{
1249        unsigned char *b = skb_tail_pointer(skb);
1250        struct tc_cbq_wrropt opt;
1251
1252        memset(&opt, 0, sizeof(opt));
1253        opt.flags = 0;
1254        opt.allot = cl->allot;
1255        opt.priority = cl->priority + 1;
1256        opt.cpriority = cl->cpriority + 1;
1257        opt.weight = cl->weight;
1258        if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
1259                goto nla_put_failure;
1260        return skb->len;
1261
1262nla_put_failure:
1263        nlmsg_trim(skb, b);
1264        return -1;
1265}
1266
1267static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1268{
1269        unsigned char *b = skb_tail_pointer(skb);
1270        struct tc_cbq_fopt opt;
1271
1272        if (cl->split || cl->defmap) {
1273                opt.split = cl->split ? cl->split->common.classid : 0;
1274                opt.defmap = cl->defmap;
1275                opt.defchange = ~0;
1276                if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
1277                        goto nla_put_failure;
1278        }
1279        return skb->len;
1280
1281nla_put_failure:
1282        nlmsg_trim(skb, b);
1283        return -1;
1284}
1285
1286static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1287{
1288        if (cbq_dump_lss(skb, cl) < 0 ||
1289            cbq_dump_rate(skb, cl) < 0 ||
1290            cbq_dump_wrr(skb, cl) < 0 ||
1291            cbq_dump_fopt(skb, cl) < 0)
1292                return -1;
1293        return 0;
1294}
1295
1296static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1297{
1298        struct cbq_sched_data *q = qdisc_priv(sch);
1299        struct nlattr *nest;
1300
1301        nest = nla_nest_start(skb, TCA_OPTIONS);
1302        if (nest == NULL)
1303                goto nla_put_failure;
1304        if (cbq_dump_attr(skb, &q->link) < 0)
1305                goto nla_put_failure;
1306        return nla_nest_end(skb, nest);
1307
1308nla_put_failure:
1309        nla_nest_cancel(skb, nest);
1310        return -1;
1311}
1312
1313static int
1314cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1315{
1316        struct cbq_sched_data *q = qdisc_priv(sch);
1317
1318        q->link.xstats.avgidle = q->link.avgidle;
1319        return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1320}
1321
1322static int
1323cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1324               struct sk_buff *skb, struct tcmsg *tcm)
1325{
1326        struct cbq_class *cl = (struct cbq_class *)arg;
1327        struct nlattr *nest;
1328
1329        if (cl->tparent)
1330                tcm->tcm_parent = cl->tparent->common.classid;
1331        else
1332                tcm->tcm_parent = TC_H_ROOT;
1333        tcm->tcm_handle = cl->common.classid;
1334        tcm->tcm_info = cl->q->handle;
1335
1336        nest = nla_nest_start(skb, TCA_OPTIONS);
1337        if (nest == NULL)
1338                goto nla_put_failure;
1339        if (cbq_dump_attr(skb, cl) < 0)
1340                goto nla_put_failure;
1341        return nla_nest_end(skb, nest);
1342
1343nla_put_failure:
1344        nla_nest_cancel(skb, nest);
1345        return -1;
1346}
1347
1348static int
1349cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1350        struct gnet_dump *d)
1351{
1352        struct cbq_sched_data *q = qdisc_priv(sch);
1353        struct cbq_class *cl = (struct cbq_class *)arg;
1354
1355        cl->xstats.avgidle = cl->avgidle;
1356        cl->xstats.undertime = 0;
1357
1358        if (cl->undertime != PSCHED_PASTPERFECT)
1359                cl->xstats.undertime = cl->undertime - q->now;
1360
1361        if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
1362                                  d, NULL, &cl->bstats) < 0 ||
1363            gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1364            gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0)
1365                return -1;
1366
1367        return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1368}
1369
1370static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1371                     struct Qdisc **old)
1372{
1373        struct cbq_class *cl = (struct cbq_class *)arg;
1374
1375        if (new == NULL) {
1376                new = qdisc_create_dflt(sch->dev_queue,
1377                                        &pfifo_qdisc_ops, cl->common.classid);
1378                if (new == NULL)
1379                        return -ENOBUFS;
1380        }
1381
1382        *old = qdisc_replace(sch, new, &cl->q);
1383        return 0;
1384}
1385
1386static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
1387{
1388        struct cbq_class *cl = (struct cbq_class *)arg;
1389
1390        return cl->q;
1391}
1392
1393static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1394{
1395        struct cbq_class *cl = (struct cbq_class *)arg;
1396
1397        cbq_deactivate_class(cl);
1398}
1399
1400static unsigned long cbq_find(struct Qdisc *sch, u32 classid)
1401{
1402        struct cbq_sched_data *q = qdisc_priv(sch);
1403
1404        return (unsigned long)cbq_class_lookup(q, classid);
1405}
1406
1407static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1408{
1409        struct cbq_sched_data *q = qdisc_priv(sch);
1410
1411        WARN_ON(cl->filters);
1412
1413        tcf_block_put(cl->block);
1414        qdisc_destroy(cl->q);
1415        qdisc_put_rtab(cl->R_tab);
1416        gen_kill_estimator(&cl->rate_est);
1417        if (cl != &q->link)
1418                kfree(cl);
1419}
1420
1421static void cbq_destroy(struct Qdisc *sch)
1422{
1423        struct cbq_sched_data *q = qdisc_priv(sch);
1424        struct hlist_node *next;
1425        struct cbq_class *cl;
1426        unsigned int h;
1427
1428#ifdef CONFIG_NET_CLS_ACT
1429        q->rx_class = NULL;
1430#endif
 1431        /*
 1432         * Filters must be destroyed first because we don't destroy the
 1433         * classes from root to leaves, which means that filters can still
 1434         * be bound to classes which have been destroyed already. --TGR '04
 1435         */
1436        for (h = 0; h < q->clhash.hashsize; h++) {
1437                hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1438                        tcf_block_put(cl->block);
1439                        cl->block = NULL;
1440                }
1441        }
1442        for (h = 0; h < q->clhash.hashsize; h++) {
1443                hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
1444                                          common.hnode)
1445                        cbq_destroy_class(sch, cl);
1446        }
1447        qdisc_class_hash_destroy(&q->clhash);
1448}
1449
1450static int
1451cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
1452                 unsigned long *arg)
1453{
1454        int err;
1455        struct cbq_sched_data *q = qdisc_priv(sch);
1456        struct cbq_class *cl = (struct cbq_class *)*arg;
1457        struct nlattr *opt = tca[TCA_OPTIONS];
1458        struct nlattr *tb[TCA_CBQ_MAX + 1];
1459        struct cbq_class *parent;
1460        struct qdisc_rate_table *rtab = NULL;
1461
1462        if (opt == NULL)
1463                return -EINVAL;
1464
 1465        err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy, NULL);
1466        if (err < 0)
1467                return err;
1468
1469        if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE])
1470                return -EOPNOTSUPP;
1471
1472        if (cl) {
1473                /* Check parent */
1474                if (parentid) {
1475                        if (cl->tparent &&
1476                            cl->tparent->common.classid != parentid)
1477                                return -EINVAL;
1478                        if (!cl->tparent && parentid != TC_H_ROOT)
1479                                return -EINVAL;
1480                }
1481
1482                if (tb[TCA_CBQ_RATE]) {
1483                        rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
1484                                              tb[TCA_CBQ_RTAB]);
1485                        if (rtab == NULL)
1486                                return -EINVAL;
1487                }
1488
1489                if (tca[TCA_RATE]) {
1490                        err = gen_replace_estimator(&cl->bstats, NULL,
1491                                                    &cl->rate_est,
1492                                                    NULL,
1493                                                    qdisc_root_sleeping_running(sch),
1494                                                    tca[TCA_RATE]);
1495                        if (err) {
1496                                qdisc_put_rtab(rtab);
1497                                return err;
1498                        }
1499                }
1500
1501                /* Change class parameters */
1502                sch_tree_lock(sch);
1503
1504                if (cl->next_alive != NULL)
1505                        cbq_deactivate_class(cl);
1506
1507                if (rtab) {
1508                        qdisc_put_rtab(cl->R_tab);
1509                        cl->R_tab = rtab;
1510                }
1511
1512                if (tb[TCA_CBQ_LSSOPT])
1513                        cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1514
1515                if (tb[TCA_CBQ_WRROPT]) {
1516                        cbq_rmprio(q, cl);
1517                        cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1518                }
1519
1520                if (tb[TCA_CBQ_FOPT])
1521                        cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1522
1523                if (cl->q->q.qlen)
1524                        cbq_activate_class(cl);
1525
1526                sch_tree_unlock(sch);
1527
1528                return 0;
1529        }
1530
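            /* No existing class: create a new one.  Rate, WRR and link-sharing
             * options are mandatory here, and a class cannot be attached
             * directly at TC_H_ROOT.
             */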
1531        if (parentid == TC_H_ROOT)
1532                return -EINVAL;
1533
1534        if (tb[TCA_CBQ_WRROPT] == NULL || tb[TCA_CBQ_RATE] == NULL ||
1535            tb[TCA_CBQ_LSSOPT] == NULL)
1536                return -EINVAL;
1537
1538        rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]);
1539        if (rtab == NULL)
1540                return -EINVAL;
1541
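            /* Either validate the classid supplied by the user (it must carry
             * this qdisc's major number and be unused), or auto-allocate an
             * unused minor in the 0x8001-0xffff range via the hgenerator
             * counter.
             */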
1542        if (classid) {
1543                err = -EINVAL;
1544                if (TC_H_MAJ(classid ^ sch->handle) ||
1545                    cbq_class_lookup(q, classid))
1546                        goto failure;
1547        } else {
1548                int i;
1549                classid = TC_H_MAKE(sch->handle, 0x8000);
1550
1551                for (i = 0; i < 0x8000; i++) {
1552                        if (++q->hgenerator >= 0x8000)
1553                                q->hgenerator = 1;
1554                        if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1555                                break;
1556                }
1557                err = -ENOSR;
1558                if (i >= 0x8000)
1559                        goto failure;
1560                classid = classid|q->hgenerator;
1561        }
1562
1563        parent = &q->link;
1564        if (parentid) {
1565                parent = cbq_class_lookup(q, parentid);
1566                err = -EINVAL;
1567                if (parent == NULL)
1568                        goto failure;
1569        }
1570
1571        err = -ENOBUFS;
1572        cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1573        if (cl == NULL)
1574                goto failure;
1575
1576        err = tcf_block_get(&cl->block, &cl->filter_list);
1577        if (err) {
1578                kfree(cl);
1579                goto failure;   /* also drops the rtab obtained above */
1580        }
1581
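            /* Optional rate estimator, requested via TCA_RATE. */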
1582        if (tca[TCA_RATE]) {
1583                err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
1584                                        NULL,
1585                                        qdisc_root_sleeping_running(sch),
1586                                        tca[TCA_RATE]);
1587                if (err) {
1588                        tcf_block_put(cl->block);
1589                        kfree(cl);
1590                        goto failure;
1591                }
1592        }
1593
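            /* The class takes over the rate table; clearing the local pointer
             * keeps the failure label from releasing it a second time.
             */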
1594        cl->R_tab = rtab;
1595        rtab = NULL;
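            /* Attach a default pfifo child qdisc; fall back to noop_qdisc so
             * the class never has a NULL child.
             */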
1596        cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
1597        if (!cl->q)
1598                cl->q = &noop_qdisc;
1599        else
1600                qdisc_hash_add(cl->q, true);
1601
1602        cl->common.classid = classid;
1603        cl->tparent = parent;
1604        cl->qdisc = sch;
1605        cl->allot = parent->allot;
1606        cl->quantum = cl->allot;
1607        cl->weight = cl->R_tab->rate.rate;
1608
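            /* Link the class into the tree under the qdisc lock.  LSS
             * parameters left at zero are inherited from the root link class.
             */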
1609        sch_tree_lock(sch);
1610        cbq_link_class(cl);
1611        cl->borrow = cl->tparent;
1612        if (cl->tparent != &q->link)
1613                cl->share = cl->tparent;
1614        cbq_adjust_levels(parent);
1615        cl->minidle = -0x7FFFFFFF;
1616        cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1617        cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1618        if (cl->ewma_log == 0)
1619                cl->ewma_log = q->link.ewma_log;
1620        if (cl->maxidle == 0)
1621                cl->maxidle = q->link.maxidle;
1622        if (cl->avpkt == 0)
1623                cl->avpkt = q->link.avpkt;
1624        if (tb[TCA_CBQ_FOPT])
1625                cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1626        sch_tree_unlock(sch);
1627
1628        qdisc_class_hash_grow(sch, &q->clhash);
1629
1630        *arg = (unsigned long)cl;
1631        return 0;
1632
1633failure:
1634        qdisc_put_rtab(rtab);
1635        return err;
1636}
1637
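    /* cbq_delete - remove a class.  Refused with -EBUSY while filters are
     * still bound to the class, it still has children, or it is the root
     * link class itself.
     */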
1638static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1639{
1640        struct cbq_sched_data *q = qdisc_priv(sch);
1641        struct cbq_class *cl = (struct cbq_class *)arg;
1642        unsigned int qlen, backlog;
1643
1644        if (cl->filters || cl->children || cl == &q->link)
1645                return -EBUSY;
1646
1647        sch_tree_lock(sch);
1648
1649        qlen = cl->q->q.qlen;
1650        backlog = cl->q->qstats.backlog;
1651        qdisc_reset(cl->q);
1652        qdisc_tree_reduce_backlog(cl->q, qlen, backlog);
1653
1654        if (cl->next_alive)
1655                cbq_deactivate_class(cl);
1656
1657        if (q->tx_borrowed == cl)
1658                q->tx_borrowed = q->tx_class;
1659        if (q->tx_class == cl) {
1660                q->tx_class = NULL;
1661                q->tx_borrowed = NULL;
1662        }
1663#ifdef CONFIG_NET_CLS_ACT
1664        if (q->rx_class == cl)
1665                q->rx_class = NULL;
1666#endif
1667
1668        cbq_unlink_class(cl);
1669        cbq_adjust_levels(cl->tparent);
1670        cl->defmap = 0;
1671        cbq_sync_defmap(cl);
1672
1673        cbq_rmprio(q, cl);
1674        sch_tree_unlock(sch);
1675
1676        cbq_destroy_class(sch, cl);
1677        return 0;
1678}
1679
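    /* Return the filter block that classifiers attach to; with no class
     * given, the root link class' block is used.
     */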
1680static struct tcf_block *cbq_tcf_block(struct Qdisc *sch, unsigned long arg)
1681{
1682        struct cbq_sched_data *q = qdisc_priv(sch);
1683        struct cbq_class *cl = (struct cbq_class *)arg;
1684
1685        if (cl == NULL)
1686                cl = &q->link;
1687
1688        return cl->block;
1689}
1690
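    /* Binding a filter to a class is only allowed when the target sits
     * strictly below the filter's parent class in the hierarchy; a
     * successful bind is counted in cl->filters, which cbq_delete()
     * checks before removing the class.
     */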
1691static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1692                                     u32 classid)
1693{
1694        struct cbq_sched_data *q = qdisc_priv(sch);
1695        struct cbq_class *p = (struct cbq_class *)parent;
1696        struct cbq_class *cl = cbq_class_lookup(q, classid);
1697
1698        if (cl) {
1699                if (p && p->level <= cl->level)
1700                        return 0;
1701                cl->filters++;
1702                return (unsigned long)cl;
1703        }
1704        return 0;
1705}
1706
1707static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
1708{
1709        struct cbq_class *cl = (struct cbq_class *)arg;
1710
1711        cl->filters--;
1712}
1713
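    /* Iterate over all classes in hash order, honouring the walker's
     * skip/count/stop protocol.
     */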
1714static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1715{
1716        struct cbq_sched_data *q = qdisc_priv(sch);
1717        struct cbq_class *cl;
1718        unsigned int h;
1719
1720        if (arg->stop)
1721                return;
1722
1723        for (h = 0; h < q->clhash.hashsize; h++) {
1724                hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1725                        if (arg->count < arg->skip) {
1726                                arg->count++;
1727                                continue;
1728                        }
1729                        if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1730                                arg->stop = 1;
1731                                return;
1732                        }
1733                        arg->count++;
1734                }
1735        }
1736}
1737
1738static const struct Qdisc_class_ops cbq_class_ops = {
1739        .graft          =       cbq_graft,
1740        .leaf           =       cbq_leaf,
1741        .qlen_notify    =       cbq_qlen_notify,
1742        .find           =       cbq_find,
1743        .change         =       cbq_change_class,
1744        .delete         =       cbq_delete,
1745        .walk           =       cbq_walk,
1746        .tcf_block      =       cbq_tcf_block,
1747        .bind_tcf       =       cbq_bind_filter,
1748        .unbind_tcf     =       cbq_unbind_filter,
1749        .dump           =       cbq_dump_class,
1750        .dump_stats     =       cbq_dump_class_stats,
1751};
1752
1753static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
1754        .next           =       NULL,
1755        .cl_ops         =       &cbq_class_ops,
1756        .id             =       "cbq",
1757        .priv_size      =       sizeof(struct cbq_sched_data),
1758        .enqueue        =       cbq_enqueue,
1759        .dequeue        =       cbq_dequeue,
1760        .peek           =       qdisc_peek_dequeued,
1761        .init           =       cbq_init,
1762        .reset          =       cbq_reset,
1763        .destroy        =       cbq_destroy,
1764        .change         =       NULL,
1765        .dump           =       cbq_dump,
1766        .dump_stats     =       cbq_dump_stats,
1767        .owner          =       THIS_MODULE,
1768};
1769
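    /* Register the "cbq" scheduler with the packet scheduler core on module
     * load and remove it again on unload.
     */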
1770static int __init cbq_module_init(void)
1771{
1772        return register_qdisc(&cbq_qdisc_ops);
1773}
1774static void __exit cbq_module_exit(void)
1775{
1776        unregister_qdisc(&cbq_qdisc_ops);
1777}
1778module_init(cbq_module_init)
1779module_exit(cbq_module_exit)
1780MODULE_LICENSE("GPL");
1781