linux/net/sched/sch_cbq.c
   1/*
   2 * net/sched/sch_cbq.c  Class-Based Queueing discipline.
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10 *
  11 */
  12
  13#include <linux/module.h>
  14#include <linux/slab.h>
  15#include <linux/types.h>
  16#include <linux/kernel.h>
  17#include <linux/string.h>
  18#include <linux/errno.h>
  19#include <linux/skbuff.h>
  20#include <net/netlink.h>
  21#include <net/pkt_sched.h>
  22#include <net/pkt_cls.h>
  23
  24
  25/*      Class-Based Queueing (CBQ) algorithm.
  26        =======================================
  27
  28        Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
  29                 Management Models for Packet Networks",
  30                 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
  31
  32                 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
  33
  34                 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
  35                 Parameters", 1996
  36
  37                 [4] Sally Floyd and Michael Speer, "Experimental Results
  38                 for Class-Based Queueing", 1998, not published.
  39
  40        -----------------------------------------------------------------------
  41
   42        The algorithm skeleton was taken from the NS simulator's cbq.cc.
   43        If someone wants to check this code against the LBL version,
   44        they should take into account that ONLY the skeleton was borrowed;
   45        the implementation is different. Particularly:
  46
   47        --- The WRR algorithm is different. Our version looks more
   48        reasonable (I hope) and works when quanta are allowed to be
   49        less than MTU, which is always the case when real-time classes
   50        have small rates. Note that the statement of [3] is
   51        incomplete: delay may actually be estimated even if the class's
   52        per-round allotment is less than MTU. Namely, if the per-round
   53        allotment is W*r_i, and r_1+...+r_k = r < 1, then
  54
  55        delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
  56
  57        In the worst case we have IntServ estimate with D = W*r+k*MTU
  58        and C = MTU*r. The proof (if correct at all) is trivial.
  59
  60
   61        --- It seems that cbq-2.0 is not very accurate. At least, I cannot
   62        interpret some places, which look like incorrect translations
   63        from NS. Anyone is welcome to find these differences
   64        and explain to me why I am wrong 8).
  65
   66        --- Linux has no EOI event, so we cannot measure the true class
   67        idle time. The workaround is to treat the next dequeue event
   68        as a sign that the previous packet has finished. This is wrong
   69        because of internal device queueing, but on a permanently loaded
   70        link it holds. Moreover, combined with the clock integrator,
   71        this scheme looks very close to an ideal solution.  */
  72
  73struct cbq_sched_data;
  74
  75
  76struct cbq_class {
  77        struct Qdisc_class_common common;
  78        struct cbq_class        *next_alive;    /* next class with backlog in this priority band */
  79
  80/* Parameters */
  81        unsigned char           priority;       /* class priority */
  82        unsigned char           priority2;      /* priority to be used after overlimit */
  83        unsigned char           ewma_log;       /* time constant for idle time calculation */
  84
  85        u32                     defmap;
  86
  87        /* Link-sharing scheduler parameters */
  88        long                    maxidle;        /* Class parameters: see below. */
  89        long                    offtime;
  90        long                    minidle;
  91        u32                     avpkt;
  92        struct qdisc_rate_table *R_tab;
  93
  94        /* General scheduler (WRR) parameters */
  95        long                    allot;
  96        long                    quantum;        /* Allotment per WRR round */
  97        long                    weight;         /* Relative allotment: see below */
  98
  99        struct Qdisc            *qdisc;         /* Ptr to CBQ discipline */
 100        struct cbq_class        *split;         /* Ptr to split node */
 101        struct cbq_class        *share;         /* Ptr to LS parent in the class tree */
 102        struct cbq_class        *tparent;       /* Ptr to tree parent in the class tree */
 103        struct cbq_class        *borrow;        /* NULL if class is bandwidth limited;
 104                                                   parent otherwise */
 105        struct cbq_class        *sibling;       /* Sibling chain */
 106        struct cbq_class        *children;      /* Pointer to children chain */
 107
 108        struct Qdisc            *q;             /* Elementary queueing discipline */
 109
 110
 111/* Variables */
 112        unsigned char           cpriority;      /* Effective priority */
 113        unsigned char           delayed;
 114        unsigned char           level;          /* level of the class in hierarchy:
 115                                                   0 for leaf classes, and maximal
 116                                                   level of children + 1 for nodes.
 117                                                 */
 118
 119        psched_time_t           last;           /* Last end of service */
 120        psched_time_t           undertime;
 121        long                    avgidle;
 122        long                    deficit;        /* Saved deficit for WRR */
 123        psched_time_t           penalized;
 124        struct gnet_stats_basic_packed bstats;
 125        struct gnet_stats_queue qstats;
 126        struct net_rate_estimator __rcu *rate_est;
 127        struct tc_cbq_xstats    xstats;
 128
 129        struct tcf_proto __rcu  *filter_list;
 130        struct tcf_block        *block;
 131
 132        int                     filters;
 133
 134        struct cbq_class        *defaults[TC_PRIO_MAX + 1];
 135};
 136
 137struct cbq_sched_data {
 138        struct Qdisc_class_hash clhash;                 /* Hash table of all classes */
 139        int                     nclasses[TC_CBQ_MAXPRIO + 1];
 140        unsigned int            quanta[TC_CBQ_MAXPRIO + 1];
 141
 142        struct cbq_class        link;
 143
 144        unsigned int            activemask;
 145        struct cbq_class        *active[TC_CBQ_MAXPRIO + 1];    /* List of all classes
 146                                                                   with backlog */
 147
 148#ifdef CONFIG_NET_CLS_ACT
 149        struct cbq_class        *rx_class;
 150#endif
 151        struct cbq_class        *tx_class;
 152        struct cbq_class        *tx_borrowed;
 153        int                     tx_len;
 154        psched_time_t           now;            /* Cached timestamp */
 155        unsigned int            pmask;
 156
 157        struct hrtimer          delay_timer;
 158        struct qdisc_watchdog   watchdog;       /* Watchdog timer,
 159                                                   started when CBQ has
 160                                                   backlog, but cannot
 161                                                   transmit just now */
 162        psched_tdiff_t          wd_expires;
 163        int                     toplevel;
 164        u32                     hgenerator;
 165};
 166
 167
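     /* L2T ("length to time") looks up, in the class's user-supplied rate
      * table, the time in psched ticks needed to transmit a 'len'-byte
      * packet at the class's configured rate.  It is the "virtual busy
      * time" used by the avgidle estimator in cbq_update() below.
      */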
 168#define L2T(cl, len)    qdisc_l2t((cl)->R_tab, len)
 169
 170static inline struct cbq_class *
 171cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
 172{
 173        struct Qdisc_class_common *clc;
 174
 175        clc = qdisc_class_find(&q->clhash, classid);
 176        if (clc == NULL)
 177                return NULL;
 178        return container_of(clc, struct cbq_class, common);
 179}
 180
 181#ifdef CONFIG_NET_CLS_ACT
 182
 183static struct cbq_class *
 184cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
 185{
 186        struct cbq_class *cl;
 187
 188        for (cl = this->tparent; cl; cl = cl->tparent) {
 189                struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
 190
 191                if (new != NULL && new != this)
 192                        return new;
 193        }
 194        return NULL;
 195}
 196
 197#endif
 198
 199/* Classify packet. The procedure is pretty complicated, but
 200 * it allows us to combine link sharing and priority scheduling
 201 * transparently.
 202 *
  203 * Namely, you can put link-sharing rules (e.g. route based) at the root of
  204 * CBQ, so that they resolve to split nodes. Then packets are classified
 205 * by logical priority, or a more specific classifier may be attached
 206 * to the split node.
 207 */
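
     /* A small worked example (illustrative handles only): suppose this
      * qdisc has handle 1:0 (sch->handle == 0x00010000) and a class 1:2
      * exists.
      *
      *  - If skb->priority == 0x00010002, Step 1 matches the handle's
      *    major number (TC_H_MAJ(prio ^ sch->handle) == 0) and
      *    cbq_class_lookup() returns class 1:2 directly, without
      *    consulting any filters.
      *
      *  - Otherwise the filter chain of the current head class is run;
      *    a result with no major part indexes the head's defaults[]
      *    table by logical priority, falling back to TC_PRIO_BESTEFFORT.
      */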
 208
 209static struct cbq_class *
 210cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 211{
 212        struct cbq_sched_data *q = qdisc_priv(sch);
 213        struct cbq_class *head = &q->link;
 214        struct cbq_class **defmap;
 215        struct cbq_class *cl = NULL;
 216        u32 prio = skb->priority;
 217        struct tcf_proto *fl;
 218        struct tcf_result res;
 219
 220        /*
 221         *  Step 1. If skb->priority points to one of our classes, use it.
 222         */
 223        if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
 224            (cl = cbq_class_lookup(q, prio)) != NULL)
 225                return cl;
 226
 227        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 228        for (;;) {
 229                int result = 0;
 230                defmap = head->defaults;
 231
 232                fl = rcu_dereference_bh(head->filter_list);
 233                /*
 234                 * Step 2+n. Apply classifier.
 235                 */
 236                result = tcf_classify(skb, fl, &res, true);
 237                if (!fl || result < 0)
 238                        goto fallback;
 239
 240                cl = (void *)res.class;
 241                if (!cl) {
 242                        if (TC_H_MAJ(res.classid))
 243                                cl = cbq_class_lookup(q, res.classid);
 244                        else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
 245                                cl = defmap[TC_PRIO_BESTEFFORT];
 246
 247                        if (cl == NULL)
 248                                goto fallback;
 249                }
 250                if (cl->level >= head->level)
 251                        goto fallback;
 252#ifdef CONFIG_NET_CLS_ACT
 253                switch (result) {
 254                case TC_ACT_QUEUED:
 255                case TC_ACT_STOLEN:
 256                case TC_ACT_TRAP:
 257                        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
 258                        /* fall through */
 259                case TC_ACT_SHOT:
 260                        return NULL;
 261                case TC_ACT_RECLASSIFY:
 262                        return cbq_reclassify(skb, cl);
 263                }
 264#endif
 265                if (cl->level == 0)
 266                        return cl;
 267
 268                /*
  269                 * Step 3+n. If the classifier selected a link-sharing class,
  270                 *         apply the agency-specific classifier.
  271                 *         Repeat this procedure until we hit a leaf node.
 272                 */
 273                head = cl;
 274        }
 275
 276fallback:
 277        cl = head;
 278
 279        /*
 280         * Step 4. No success...
 281         */
 282        if (TC_H_MAJ(prio) == 0 &&
 283            !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
 284            !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
 285                return head;
 286
 287        return cl;
 288}
 289
 290/*
  291 * A packet has just been enqueued on an empty class.
  292 * cbq_activate_class adds the class to the tail of the active class list
  293 * of its priority band.
 294 */
 295
 296static inline void cbq_activate_class(struct cbq_class *cl)
 297{
 298        struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 299        int prio = cl->cpriority;
 300        struct cbq_class *cl_tail;
 301
 302        cl_tail = q->active[prio];
 303        q->active[prio] = cl;
 304
 305        if (cl_tail != NULL) {
 306                cl->next_alive = cl_tail->next_alive;
 307                cl_tail->next_alive = cl;
 308        } else {
 309                cl->next_alive = cl;
 310                q->activemask |= (1<<prio);
 311        }
 312}
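
     /* The active list of a band is a circular singly-linked ring threaded
      * through next_alive, with q->active[prio] pointing at the current
      * tail (so tail->next_alive is the head).  For illustration:
      * activating A and then B on an empty band leaves active[prio] == B,
      * B->next_alive == A and A->next_alive == B.
      */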
 313
 314/*
 315 * Unlink class from active chain.
  316 * Note that this same unlinking is also done directly in cbq_dequeue_prio()
  317 * during the round-robin procedure.
 318 */
 319
 320static void cbq_deactivate_class(struct cbq_class *this)
 321{
 322        struct cbq_sched_data *q = qdisc_priv(this->qdisc);
 323        int prio = this->cpriority;
 324        struct cbq_class *cl;
 325        struct cbq_class *cl_prev = q->active[prio];
 326
 327        do {
 328                cl = cl_prev->next_alive;
 329                if (cl == this) {
 330                        cl_prev->next_alive = cl->next_alive;
 331                        cl->next_alive = NULL;
 332
 333                        if (cl == q->active[prio]) {
 334                                q->active[prio] = cl_prev;
 335                                if (cl == q->active[prio]) {
 336                                        q->active[prio] = NULL;
 337                                        q->activemask &= ~(1<<prio);
 338                                        return;
 339                                }
 340                        }
 341                        return;
 342                }
 343        } while ((cl_prev = cl) != q->active[prio]);
 344}
 345
 346static void
 347cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
 348{
 349        int toplevel = q->toplevel;
 350
 351        if (toplevel > cl->level) {
 352                psched_time_t now = psched_get_time();
 353
 354                do {
 355                        if (cl->undertime < now) {
 356                                q->toplevel = cl->level;
 357                                return;
 358                        }
 359                } while ((cl = cl->borrow) != NULL && toplevel > cl->level);
 360        }
 361}
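
     /* cbq_mark_toplevel() implements the top-level link-sharing guideline:
      * on enqueue, q->toplevel is lowered to the level of the lowest
      * underlimit class on the enqueuing class's borrow chain (the class
      * itself or one of its ancestors).  cbq_under_limit() then refuses
      * to borrow from ancestors sitting above that level.
      */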
 362
 363static int
 364cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 365            struct sk_buff **to_free)
 366{
 367        struct cbq_sched_data *q = qdisc_priv(sch);
 368        int uninitialized_var(ret);
 369        struct cbq_class *cl = cbq_classify(skb, sch, &ret);
 370
 371#ifdef CONFIG_NET_CLS_ACT
 372        q->rx_class = cl;
 373#endif
 374        if (cl == NULL) {
 375                if (ret & __NET_XMIT_BYPASS)
 376                        qdisc_qstats_drop(sch);
 377                __qdisc_drop(skb, to_free);
 378                return ret;
 379        }
 380
 381        ret = qdisc_enqueue(skb, cl->q, to_free);
 382        if (ret == NET_XMIT_SUCCESS) {
 383                sch->q.qlen++;
 384                cbq_mark_toplevel(q, cl);
 385                if (!cl->next_alive)
 386                        cbq_activate_class(cl);
 387                return ret;
 388        }
 389
 390        if (net_xmit_drop_count(ret)) {
 391                qdisc_qstats_drop(sch);
 392                cbq_mark_toplevel(q, cl);
 393                cl->qstats.drops++;
 394        }
 395        return ret;
 396}
 397
 398/* Overlimit action: penalize leaf class by adding offtime */
 399static void cbq_overlimit(struct cbq_class *cl)
 400{
 401        struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 402        psched_tdiff_t delay = cl->undertime - q->now;
 403
 404        if (!cl->delayed) {
 405                delay += cl->offtime;
 406
 407                /*
  408                 * The class goes to sleep, so that it will have no
  409                 * chance to work off its avgidle. Let's forgive it 8)
  410                 *
  411                 * BTW cbq-2.0 has a bug in this
  412                 * place; apparently they forgot to shift it by cl->ewma_log.
 413                 */
 414                if (cl->avgidle < 0)
 415                        delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
 416                if (cl->avgidle < cl->minidle)
 417                        cl->avgidle = cl->minidle;
 418                if (delay <= 0)
 419                        delay = 1;
 420                cl->undertime = q->now + delay;
 421
 422                cl->xstats.overactions++;
 423                cl->delayed = 1;
 424        }
 425        if (q->wd_expires == 0 || q->wd_expires > delay)
 426                q->wd_expires = delay;
 427
 428        /* Dirty work! We must schedule wakeups based on
 429         * real available rate, rather than leaf rate,
 430         * which may be tiny (even zero).
 431         */
 432        if (q->toplevel == TC_CBQ_MAXLEVEL) {
 433                struct cbq_class *b;
 434                psched_tdiff_t base_delay = q->wd_expires;
 435
 436                for (b = cl->borrow; b; b = b->borrow) {
 437                        delay = b->undertime - q->now;
 438                        if (delay < base_delay) {
 439                                if (delay <= 0)
 440                                        delay = 1;
 441                                base_delay = delay;
 442                        }
 443                }
 444
 445                q->wd_expires = base_delay;
 446        }
 447}
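
     /* Numerical illustration of the "forgiveness" above (made-up values):
      * with ewma_log = 5 (W = 1/32) and avgidle = -3200 ticks, the term
      * subtracted from the delay is (-avgidle) - ((-avgidle) >> ewma_log)
      * = 3200 - 100 = 3100 ticks, i.e. (1 - W)*(-avgidle) -- exactly the
      * avgidle-based penalty that cbq_update() folded into undertime, so
      * the sleeping class is charged roughly offtime alone.
      */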
 448
 449static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
 450                                       psched_time_t now)
 451{
 452        struct cbq_class *cl;
 453        struct cbq_class *cl_prev = q->active[prio];
 454        psched_time_t sched = now;
 455
 456        if (cl_prev == NULL)
 457                return 0;
 458
 459        do {
 460                cl = cl_prev->next_alive;
 461                if (now - cl->penalized > 0) {
 462                        cl_prev->next_alive = cl->next_alive;
 463                        cl->next_alive = NULL;
 464                        cl->cpriority = cl->priority;
 465                        cl->delayed = 0;
 466                        cbq_activate_class(cl);
 467
 468                        if (cl == q->active[prio]) {
 469                                q->active[prio] = cl_prev;
 470                                if (cl == q->active[prio]) {
 471                                        q->active[prio] = NULL;
 472                                        return 0;
 473                                }
 474                        }
 475
 476                        cl = cl_prev->next_alive;
 477                } else if (sched - cl->penalized > 0)
 478                        sched = cl->penalized;
 479        } while ((cl_prev = cl) != q->active[prio]);
 480
 481        return sched - now;
 482}
 483
 484static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
 485{
 486        struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
 487                                                delay_timer);
 488        struct Qdisc *sch = q->watchdog.qdisc;
 489        psched_time_t now;
 490        psched_tdiff_t delay = 0;
 491        unsigned int pmask;
 492
 493        now = psched_get_time();
 494
 495        pmask = q->pmask;
 496        q->pmask = 0;
 497
 498        while (pmask) {
 499                int prio = ffz(~pmask);
 500                psched_tdiff_t tmp;
 501
 502                pmask &= ~(1<<prio);
 503
 504                tmp = cbq_undelay_prio(q, prio, now);
 505                if (tmp > 0) {
 506                        q->pmask |= 1<<prio;
 507                        if (tmp < delay || delay == 0)
 508                                delay = tmp;
 509                }
 510        }
 511
 512        if (delay) {
 513                ktime_t time;
 514
 515                time = 0;
 516                time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
 517                hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
 518        }
 519
 520        __netif_schedule(qdisc_root(sch));
 521        return HRTIMER_NORESTART;
 522}
 523
 524/*
  525 * This is a mission-critical procedure.
  526 *
  527 * We "regenerate" the toplevel cutoff if the transmitting class
  528 * has backlog and is not regulated. This is not part of the
  529 * original CBQ description, but it looks more reasonable.
  530 * It may be wrong; this question needs further investigation.
 531 */
 532
 533static inline void
 534cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
 535                    struct cbq_class *borrowed)
 536{
 537        if (cl && q->toplevel >= borrowed->level) {
 538                if (cl->q->q.qlen > 1) {
 539                        do {
 540                                if (borrowed->undertime == PSCHED_PASTPERFECT) {
 541                                        q->toplevel = borrowed->level;
 542                                        return;
 543                                }
 544                        } while ((borrowed = borrowed->borrow) != NULL);
 545                }
 546#if 0
 547        /* It is not necessary now. Uncommenting it
 548           will save CPU cycles, but decrease fairness.
 549         */
 550                q->toplevel = TC_CBQ_MAXLEVEL;
 551#endif
 552        }
 553}
 554
 555static void
 556cbq_update(struct cbq_sched_data *q)
 557{
 558        struct cbq_class *this = q->tx_class;
 559        struct cbq_class *cl = this;
 560        int len = q->tx_len;
 561        psched_time_t now;
 562
 563        q->tx_class = NULL;
  564        /* Time integrator. We estimate the end-of-service (EOS) time
  565         * by adding the expected packet transmission time.
  566         */
 567        now = q->now + L2T(&q->link, len);
 568
 569        for ( ; cl; cl = cl->share) {
 570                long avgidle = cl->avgidle;
 571                long idle;
 572
 573                cl->bstats.packets++;
 574                cl->bstats.bytes += len;
 575
 576                /*
 577                 * (now - last) is total time between packet right edges.
 578                 * (last_pktlen/rate) is "virtual" busy time, so that
 579                 *
 580                 *      idle = (now - last) - last_pktlen/rate
 581                 */
 582
 583                idle = now - cl->last;
 584                if ((unsigned long)idle > 128*1024*1024) {
 585                        avgidle = cl->maxidle;
 586                } else {
 587                        idle -= L2T(cl, len);
 588
 589                /* true_avgidle := (1-W)*true_avgidle + W*idle,
 590                 * where W=2^{-ewma_log}. But cl->avgidle is scaled:
 591                 * cl->avgidle == true_avgidle/W,
 592                 * hence:
 593                 */
 594                        avgidle += idle - (avgidle>>cl->ewma_log);
 595                }
 596
 597                if (avgidle <= 0) {
 598                        /* Overlimit or at-limit */
 599
 600                        if (avgidle < cl->minidle)
 601                                avgidle = cl->minidle;
 602
 603                        cl->avgidle = avgidle;
 604
  605                        /* Calculate the expected time when this class
  606                         * will be allowed to send again.
  607                         * That happens when:
  608                         * (1-W)*true_avgidle + W*idle = 0, i.e.
  609                         * idle = (1/W - 1)*(-true_avgidle)
  610                         * or, since cl->avgidle == true_avgidle/W,
  611                         * idle = (1 - W)*(-cl->avgidle);
  612                         */
 613                        idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
 614
 615                        /*
  616                         * That is not all.
  617                         * To maintain the rate allocated to the class,
  618                         * we add to undertime the virtual clock time
  619                         * needed to complete the transmitted packet
  620                         * (len/phys_bandwidth has already elapsed by
  621                         * the time cbq_update runs).
  622                         */
 623
 624                        idle -= L2T(&q->link, len);
 625                        idle += L2T(cl, len);
 626
 627                        cl->undertime = now + idle;
 628                } else {
 629                        /* Underlimit */
 630
 631                        cl->undertime = PSCHED_PASTPERFECT;
 632                        if (avgidle > cl->maxidle)
 633                                cl->avgidle = cl->maxidle;
 634                        else
 635                                cl->avgidle = avgidle;
 636                }
 637                if ((s64)(now - cl->last) > 0)
 638                        cl->last = now;
 639        }
 640
 641        cbq_update_toplevel(q, this, q->tx_borrowed);
 642}
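
     /* The scaled EWMA used by cbq_update() above, restated as a standalone
      * helper.  Illustrative sketch only (hence the #if 0): the helper name
      * is hypothetical, but the arithmetic is the same as in the loop above.
      */
     #if 0
     static long cbq_scaled_ewma(long scaled_avgidle, long idle,
                                 unsigned char ewma_log)
     {
             /* true_avgidle' = (1 - W)*true_avgidle + W*idle, W = 2^-ewma_log.
              * With scaled_avgidle == true_avgidle/W this becomes:
              */
             return scaled_avgidle + idle - (scaled_avgidle >> ewma_log);
     }
     #endif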
 643
 644static inline struct cbq_class *
 645cbq_under_limit(struct cbq_class *cl)
 646{
 647        struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 648        struct cbq_class *this_cl = cl;
 649
 650        if (cl->tparent == NULL)
 651                return cl;
 652
 653        if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
 654                cl->delayed = 0;
 655                return cl;
 656        }
 657
 658        do {
  659                /* This is a very suspicious place. Currently the
  660                 * overlimit action is generated for non-bounded classes
  661                 * only if the link is completely congested.
  662                 * Though this agrees with the ancestor-only paradigm,
  663                 * it looks very questionable. In particular,
  664                 * it means that this chunk of code will either
  665                 * never be called or will strongly amplify
  666                 * burstiness. Dangerous and silly, but
  667                 * no other solution exists.
  668                 */
 669                cl = cl->borrow;
 670                if (!cl) {
 671                        this_cl->qstats.overlimits++;
 672                        cbq_overlimit(this_cl);
 673                        return NULL;
 674                }
 675                if (cl->level > q->toplevel)
 676                        return NULL;
 677        } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
 678
 679        cl->delayed = 0;
 680        return cl;
 681}
 682
 683static inline struct sk_buff *
 684cbq_dequeue_prio(struct Qdisc *sch, int prio)
 685{
 686        struct cbq_sched_data *q = qdisc_priv(sch);
 687        struct cbq_class *cl_tail, *cl_prev, *cl;
 688        struct sk_buff *skb;
 689        int deficit;
 690
 691        cl_tail = cl_prev = q->active[prio];
 692        cl = cl_prev->next_alive;
 693
 694        do {
 695                deficit = 0;
 696
 697                /* Start round */
 698                do {
 699                        struct cbq_class *borrow = cl;
 700
 701                        if (cl->q->q.qlen &&
 702                            (borrow = cbq_under_limit(cl)) == NULL)
 703                                goto skip_class;
 704
 705                        if (cl->deficit <= 0) {
 706                                /* Class exhausted its allotment per
 707                                 * this round. Switch to the next one.
 708                                 */
 709                                deficit = 1;
 710                                cl->deficit += cl->quantum;
 711                                goto next_class;
 712                        }
 713
 714                        skb = cl->q->dequeue(cl->q);
 715
  716                        /* Class did not give us any skb :-(
  717                         * This can occur even if cl->q->q.qlen != 0,
  718                         * e.g. if cl->q is a "tbf" qdisc.
  719                         */
 720                        if (skb == NULL)
 721                                goto skip_class;
 722
 723                        cl->deficit -= qdisc_pkt_len(skb);
 724                        q->tx_class = cl;
 725                        q->tx_borrowed = borrow;
 726                        if (borrow != cl) {
 727#ifndef CBQ_XSTATS_BORROWS_BYTES
 728                                borrow->xstats.borrows++;
 729                                cl->xstats.borrows++;
 730#else
 731                                borrow->xstats.borrows += qdisc_pkt_len(skb);
 732                                cl->xstats.borrows += qdisc_pkt_len(skb);
 733#endif
 734                        }
 735                        q->tx_len = qdisc_pkt_len(skb);
 736
 737                        if (cl->deficit <= 0) {
 738                                q->active[prio] = cl;
 739                                cl = cl->next_alive;
 740                                cl->deficit += cl->quantum;
 741                        }
 742                        return skb;
 743
 744skip_class:
 745                        if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
 746                                /* Class is empty or penalized.
 747                                 * Unlink it from active chain.
 748                                 */
 749                                cl_prev->next_alive = cl->next_alive;
 750                                cl->next_alive = NULL;
 751
 752                                /* Did cl_tail point to it? */
 753                                if (cl == cl_tail) {
 754                                        /* Repair it! */
 755                                        cl_tail = cl_prev;
 756
 757                                        /* Was it the last class in this band? */
 758                                        if (cl == cl_tail) {
 759                                                /* Kill the band! */
 760                                                q->active[prio] = NULL;
 761                                                q->activemask &= ~(1<<prio);
 762                                                if (cl->q->q.qlen)
 763                                                        cbq_activate_class(cl);
 764                                                return NULL;
 765                                        }
 766
 767                                        q->active[prio] = cl_tail;
 768                                }
 769                                if (cl->q->q.qlen)
 770                                        cbq_activate_class(cl);
 771
 772                                cl = cl_prev;
 773                        }
 774
 775next_class:
 776                        cl_prev = cl;
 777                        cl = cl->next_alive;
 778                } while (cl_prev != cl_tail);
 779        } while (deficit);
 780
 781        q->active[prio] = cl_prev;
 782
 783        return NULL;
 784}
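
     /* The loop above is a deficit-style weighted round robin: a class may
      * keep sending while its deficit is positive, each transmitted packet's
      * length is subtracted from its deficit, and when a class is visited
      * with a non-positive deficit its quantum is added back and the scan
      * moves on to the next class in the ring (possibly triggering another
      * pass over the band).  This is what allows quanta smaller than the
      * MTU, as discussed in the header comment.
      */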
 785
 786static inline struct sk_buff *
 787cbq_dequeue_1(struct Qdisc *sch)
 788{
 789        struct cbq_sched_data *q = qdisc_priv(sch);
 790        struct sk_buff *skb;
 791        unsigned int activemask;
 792
 793        activemask = q->activemask & 0xFF;
 794        while (activemask) {
 795                int prio = ffz(~activemask);
 796                activemask &= ~(1<<prio);
 797                skb = cbq_dequeue_prio(sch, prio);
 798                if (skb)
 799                        return skb;
 800        }
 801        return NULL;
 802}
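
     /* Bit i of q->activemask is set iff priority band i has at least one
      * active (backlogged) class; ffz(~activemask) yields the lowest set
      * bit, so bands are always served in strict priority order, band 0
      * first.
      */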
 803
 804static struct sk_buff *
 805cbq_dequeue(struct Qdisc *sch)
 806{
 807        struct sk_buff *skb;
 808        struct cbq_sched_data *q = qdisc_priv(sch);
 809        psched_time_t now;
 810
 811        now = psched_get_time();
 812
 813        if (q->tx_class)
 814                cbq_update(q);
 815
 816        q->now = now;
 817
 818        for (;;) {
 819                q->wd_expires = 0;
 820
 821                skb = cbq_dequeue_1(sch);
 822                if (skb) {
 823                        qdisc_bstats_update(sch, skb);
 824                        sch->q.qlen--;
 825                        return skb;
 826                }
 827
 828                /* All the classes are overlimit.
 829                 *
 830                 * It is possible, if:
 831                 *
 832                 * 1. Scheduler is empty.
 833                 * 2. Toplevel cutoff inhibited borrowing.
 834                 * 3. Root class is overlimit.
 835                 *
  836                 * Reset the 2nd and 3rd conditions and retry.
  837                 *
  838                 * Note that NS and cbq-2.0 are buggy here: peeking
  839                 * an arbitrary class is appropriate for ancestor-only
  840                 * sharing, but not for the top-level algorithm.
  841                 *
  842                 * Our version is better but slower, because it requires
  843                 * two passes; that is unavoidable with top-level sharing.
 844                 */
 845
 846                if (q->toplevel == TC_CBQ_MAXLEVEL &&
 847                    q->link.undertime == PSCHED_PASTPERFECT)
 848                        break;
 849
 850                q->toplevel = TC_CBQ_MAXLEVEL;
 851                q->link.undertime = PSCHED_PASTPERFECT;
 852        }
 853
  854        /* No packets in the scheduler, or nobody wants to give them to us :-(
  855         * Sigh... start the watchdog timer in the latter case.
  856         */
 857
 858        if (sch->q.qlen) {
 859                qdisc_qstats_overlimit(sch);
 860                if (q->wd_expires)
 861                        qdisc_watchdog_schedule(&q->watchdog,
 862                                                now + q->wd_expires);
 863        }
 864        return NULL;
 865}
 866
  867/* CBQ class maintenance routines */
 868
 869static void cbq_adjust_levels(struct cbq_class *this)
 870{
 871        if (this == NULL)
 872                return;
 873
 874        do {
 875                int level = 0;
 876                struct cbq_class *cl;
 877
 878                cl = this->children;
 879                if (cl) {
 880                        do {
 881                                if (cl->level > level)
 882                                        level = cl->level;
 883                        } while ((cl = cl->sibling) != this->children);
 884                }
 885                this->level = level + 1;
 886        } while ((this = this->tparent) != NULL);
 887}
 888
 889static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
 890{
 891        struct cbq_class *cl;
 892        unsigned int h;
 893
 894        if (q->quanta[prio] == 0)
 895                return;
 896
 897        for (h = 0; h < q->clhash.hashsize; h++) {
 898                hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
  899                        /* BUGGGG... Beware! This expression suffers from
  900                         * arithmetic overflows!
  901                         */
 902                        if (cl->priority == prio) {
 903                                cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
 904                                        q->quanta[prio];
 905                        }
 906                        if (cl->quantum <= 0 ||
 907                            cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
 908                                pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
 909                                        cl->common.classid, cl->quantum);
 910                                cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
 911                        }
 912                }
 913        }
 914}
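
     /* Worked example (illustrative values): two classes at prio 1, both
      * with allot 1514 and with weights 1 and 3, give nclasses[1] = 2 and
      * quanta[1] = 4, so their quanta become (1*1514*2)/4 = 757 and
      * (3*1514*2)/4 = 2271 bytes -- proportional to weight and averaging
      * to the configured allot.
      */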
 915
 916static void cbq_sync_defmap(struct cbq_class *cl)
 917{
 918        struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 919        struct cbq_class *split = cl->split;
 920        unsigned int h;
 921        int i;
 922
 923        if (split == NULL)
 924                return;
 925
 926        for (i = 0; i <= TC_PRIO_MAX; i++) {
 927                if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
 928                        split->defaults[i] = NULL;
 929        }
 930
 931        for (i = 0; i <= TC_PRIO_MAX; i++) {
 932                int level = split->level;
 933
 934                if (split->defaults[i])
 935                        continue;
 936
 937                for (h = 0; h < q->clhash.hashsize; h++) {
 938                        struct cbq_class *c;
 939
 940                        hlist_for_each_entry(c, &q->clhash.hash[h],
 941                                             common.hnode) {
 942                                if (c->split == split && c->level < level &&
 943                                    c->defmap & (1<<i)) {
 944                                        split->defaults[i] = c;
 945                                        level = c->level;
 946                                }
 947                        }
 948                }
 949        }
 950}
 951
 952static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
 953{
 954        struct cbq_class *split = NULL;
 955
 956        if (splitid == 0) {
 957                split = cl->split;
 958                if (!split)
 959                        return;
 960                splitid = split->common.classid;
 961        }
 962
 963        if (split == NULL || split->common.classid != splitid) {
 964                for (split = cl->tparent; split; split = split->tparent)
 965                        if (split->common.classid == splitid)
 966                                break;
 967        }
 968
 969        if (split == NULL)
 970                return;
 971
 972        if (cl->split != split) {
 973                cl->defmap = 0;
 974                cbq_sync_defmap(cl);
 975                cl->split = split;
 976                cl->defmap = def & mask;
 977        } else
 978                cl->defmap = (cl->defmap & ~mask) | (def & mask);
 979
 980        cbq_sync_defmap(cl);
 981}
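
     /* defmap is a bitmap over the TC_PRIO_* values: bit p set in
      * cl->defmap means cl volunteers as the default class for logical
      * priority p within its split node.  cbq_change_defmap() only touches
      * the bits selected by 'mask'; for example def = 0x0a with mask = 0x0f
      * sets priorities 1 and 3, clears 0 and 2, and leaves bits above 3
      * unchanged.
      */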
 982
 983static void cbq_unlink_class(struct cbq_class *this)
 984{
 985        struct cbq_class *cl, **clp;
 986        struct cbq_sched_data *q = qdisc_priv(this->qdisc);
 987
 988        qdisc_class_hash_remove(&q->clhash, &this->common);
 989
 990        if (this->tparent) {
 991                clp = &this->sibling;
 992                cl = *clp;
 993                do {
 994                        if (cl == this) {
 995                                *clp = cl->sibling;
 996                                break;
 997                        }
 998                        clp = &cl->sibling;
 999                } while ((cl = *clp) != this->sibling);
1000
1001                if (this->tparent->children == this) {
1002                        this->tparent->children = this->sibling;
1003                        if (this->sibling == this)
1004                                this->tparent->children = NULL;
1005                }
1006        } else {
1007                WARN_ON(this->sibling != this);
1008        }
1009}
1010
1011static void cbq_link_class(struct cbq_class *this)
1012{
1013        struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1014        struct cbq_class *parent = this->tparent;
1015
1016        this->sibling = this;
1017        qdisc_class_hash_insert(&q->clhash, &this->common);
1018
1019        if (parent == NULL)
1020                return;
1021
1022        if (parent->children == NULL) {
1023                parent->children = this;
1024        } else {
1025                this->sibling = parent->children->sibling;
1026                parent->children->sibling = this;
1027        }
1028}
1029
1030static void
1031cbq_reset(struct Qdisc *sch)
1032{
1033        struct cbq_sched_data *q = qdisc_priv(sch);
1034        struct cbq_class *cl;
1035        int prio;
1036        unsigned int h;
1037
1038        q->activemask = 0;
1039        q->pmask = 0;
1040        q->tx_class = NULL;
1041        q->tx_borrowed = NULL;
1042        qdisc_watchdog_cancel(&q->watchdog);
1043        hrtimer_cancel(&q->delay_timer);
1044        q->toplevel = TC_CBQ_MAXLEVEL;
1045        q->now = psched_get_time();
1046
1047        for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1048                q->active[prio] = NULL;
1049
1050        for (h = 0; h < q->clhash.hashsize; h++) {
1051                hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1052                        qdisc_reset(cl->q);
1053
1054                        cl->next_alive = NULL;
1055                        cl->undertime = PSCHED_PASTPERFECT;
1056                        cl->avgidle = cl->maxidle;
1057                        cl->deficit = cl->quantum;
1058                        cl->cpriority = cl->priority;
1059                }
1060        }
1061        sch->q.qlen = 0;
1062}
1063
1064
1065static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1066{
1067        if (lss->change & TCF_CBQ_LSS_FLAGS) {
1068                cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1069                cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1070        }
1071        if (lss->change & TCF_CBQ_LSS_EWMA)
1072                cl->ewma_log = lss->ewma_log;
1073        if (lss->change & TCF_CBQ_LSS_AVPKT)
1074                cl->avpkt = lss->avpkt;
1075        if (lss->change & TCF_CBQ_LSS_MINIDLE)
1076                cl->minidle = -(long)lss->minidle;
1077        if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
1078                cl->maxidle = lss->maxidle;
1079                cl->avgidle = lss->maxidle;
1080        }
1081        if (lss->change & TCF_CBQ_LSS_OFFTIME)
1082                cl->offtime = lss->offtime;
1083        return 0;
1084}
1085
1086static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1087{
1088        q->nclasses[cl->priority]--;
1089        q->quanta[cl->priority] -= cl->weight;
1090        cbq_normalize_quanta(q, cl->priority);
1091}
1092
1093static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1094{
1095        q->nclasses[cl->priority]++;
1096        q->quanta[cl->priority] += cl->weight;
1097        cbq_normalize_quanta(q, cl->priority);
1098}
1099
1100static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1101{
1102        struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1103
1104        if (wrr->allot)
1105                cl->allot = wrr->allot;
1106        if (wrr->weight)
1107                cl->weight = wrr->weight;
1108        if (wrr->priority) {
1109                cl->priority = wrr->priority - 1;
1110                cl->cpriority = cl->priority;
1111                if (cl->priority >= cl->priority2)
1112                        cl->priority2 = TC_CBQ_MAXPRIO - 1;
1113        }
1114
1115        cbq_addprio(q, cl);
1116        return 0;
1117}
1118
1119static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1120{
1121        cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1122        return 0;
1123}
1124
1125static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
1126        [TCA_CBQ_LSSOPT]        = { .len = sizeof(struct tc_cbq_lssopt) },
1127        [TCA_CBQ_WRROPT]        = { .len = sizeof(struct tc_cbq_wrropt) },
1128        [TCA_CBQ_FOPT]          = { .len = sizeof(struct tc_cbq_fopt) },
1129        [TCA_CBQ_OVL_STRATEGY]  = { .len = sizeof(struct tc_cbq_ovl) },
1130        [TCA_CBQ_RATE]          = { .len = sizeof(struct tc_ratespec) },
1131        [TCA_CBQ_RTAB]          = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1132        [TCA_CBQ_POLICE]        = { .len = sizeof(struct tc_cbq_police) },
1133};
1134
1135static int cbq_init(struct Qdisc *sch, struct nlattr *opt,
1136                    struct netlink_ext_ack *extack)
1137{
1138        struct cbq_sched_data *q = qdisc_priv(sch);
1139        struct nlattr *tb[TCA_CBQ_MAX + 1];
1140        struct tc_ratespec *r;
1141        int err;
1142
1143        qdisc_watchdog_init(&q->watchdog, sch);
1144        hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1145        q->delay_timer.function = cbq_undelay;
1146
1147        if (!opt) {
1148                NL_SET_ERR_MSG(extack, "CBQ options are required for this operation");
1149                return -EINVAL;
1150        }
1151
1152        err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy, extack);
1153        if (err < 0)
1154                return err;
1155
1156        if (!tb[TCA_CBQ_RTAB] || !tb[TCA_CBQ_RATE]) {
1157                NL_SET_ERR_MSG(extack, "Rate specification missing or incomplete");
1158                return -EINVAL;
1159        }
1160
1161        r = nla_data(tb[TCA_CBQ_RATE]);
1162
1163        q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB], extack);
1164        if (!q->link.R_tab)
1165                return -EINVAL;
1166
1167        err = tcf_block_get(&q->link.block, &q->link.filter_list, sch, extack);
1168        if (err)
1169                goto put_rtab;
1170
1171        err = qdisc_class_hash_init(&q->clhash);
1172        if (err < 0)
1173                goto put_block;
1174
1175        q->link.sibling = &q->link;
1176        q->link.common.classid = sch->handle;
1177        q->link.qdisc = sch;
1178        q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1179                                      sch->handle, NULL);
1180        if (!q->link.q)
1181                q->link.q = &noop_qdisc;
1182        else
1183                qdisc_hash_add(q->link.q, true);
1184
1185        q->link.priority = TC_CBQ_MAXPRIO - 1;
1186        q->link.priority2 = TC_CBQ_MAXPRIO - 1;
1187        q->link.cpriority = TC_CBQ_MAXPRIO - 1;
1188        q->link.allot = psched_mtu(qdisc_dev(sch));
1189        q->link.quantum = q->link.allot;
1190        q->link.weight = q->link.R_tab->rate.rate;
1191
1192        q->link.ewma_log = TC_CBQ_DEF_EWMA;
1193        q->link.avpkt = q->link.allot/2;
1194        q->link.minidle = -0x7FFFFFFF;
1195
1196        q->toplevel = TC_CBQ_MAXLEVEL;
1197        q->now = psched_get_time();
1198
1199        cbq_link_class(&q->link);
1200
1201        if (tb[TCA_CBQ_LSSOPT])
1202                cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
1203
1204        cbq_addprio(q, &q->link);
1205        return 0;
1206
1207put_block:
1208        tcf_block_put(q->link.block);
1209
1210put_rtab:
1211        qdisc_put_rtab(q->link.R_tab);
1212        return err;
1213}
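
     /* For reference, a typical userspace setup exercising cbq_init() and
      * cbq_change_class() looks roughly like this (device name and numbers
      * are illustrative only):
      *
      *   tc qdisc add dev eth0 root handle 1: cbq bandwidth 10Mbit avpkt 1000
      *   tc class add dev eth0 parent 1: classid 1:1 cbq bandwidth 10Mbit \
      *      rate 1Mbit allot 1514 prio 5 avpkt 1000 bounded
      *
      * The rate/bandwidth parameters arrive here as TCA_CBQ_RATE plus the
      * TCA_CBQ_RTAB table, allot/prio/weight as TCA_CBQ_WRROPT, and
      * avpkt/bounded/isolated as TCA_CBQ_LSSOPT.
      */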
1214
1215static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1216{
1217        unsigned char *b = skb_tail_pointer(skb);
1218
1219        if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
1220                goto nla_put_failure;
1221        return skb->len;
1222
1223nla_put_failure:
1224        nlmsg_trim(skb, b);
1225        return -1;
1226}
1227
1228static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1229{
1230        unsigned char *b = skb_tail_pointer(skb);
1231        struct tc_cbq_lssopt opt;
1232
1233        opt.flags = 0;
1234        if (cl->borrow == NULL)
1235                opt.flags |= TCF_CBQ_LSS_BOUNDED;
1236        if (cl->share == NULL)
1237                opt.flags |= TCF_CBQ_LSS_ISOLATED;
1238        opt.ewma_log = cl->ewma_log;
1239        opt.level = cl->level;
1240        opt.avpkt = cl->avpkt;
1241        opt.maxidle = cl->maxidle;
1242        opt.minidle = (u32)(-cl->minidle);
1243        opt.offtime = cl->offtime;
1244        opt.change = ~0;
1245        if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
1246                goto nla_put_failure;
1247        return skb->len;
1248
1249nla_put_failure:
1250        nlmsg_trim(skb, b);
1251        return -1;
1252}
1253
1254static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1255{
1256        unsigned char *b = skb_tail_pointer(skb);
1257        struct tc_cbq_wrropt opt;
1258
1259        memset(&opt, 0, sizeof(opt));
1260        opt.flags = 0;
1261        opt.allot = cl->allot;
1262        opt.priority = cl->priority + 1;
1263        opt.cpriority = cl->cpriority + 1;
1264        opt.weight = cl->weight;
1265        if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
1266                goto nla_put_failure;
1267        return skb->len;
1268
1269nla_put_failure:
1270        nlmsg_trim(skb, b);
1271        return -1;
1272}
1273
1274static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1275{
1276        unsigned char *b = skb_tail_pointer(skb);
1277        struct tc_cbq_fopt opt;
1278
1279        if (cl->split || cl->defmap) {
1280                opt.split = cl->split ? cl->split->common.classid : 0;
1281                opt.defmap = cl->defmap;
1282                opt.defchange = ~0;
1283                if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
1284                        goto nla_put_failure;
1285        }
1286        return skb->len;
1287
1288nla_put_failure:
1289        nlmsg_trim(skb, b);
1290        return -1;
1291}
1292
1293static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1294{
1295        if (cbq_dump_lss(skb, cl) < 0 ||
1296            cbq_dump_rate(skb, cl) < 0 ||
1297            cbq_dump_wrr(skb, cl) < 0 ||
1298            cbq_dump_fopt(skb, cl) < 0)
1299                return -1;
1300        return 0;
1301}
1302
1303static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1304{
1305        struct cbq_sched_data *q = qdisc_priv(sch);
1306        struct nlattr *nest;
1307
1308        nest = nla_nest_start(skb, TCA_OPTIONS);
1309        if (nest == NULL)
1310                goto nla_put_failure;
1311        if (cbq_dump_attr(skb, &q->link) < 0)
1312                goto nla_put_failure;
1313        return nla_nest_end(skb, nest);
1314
1315nla_put_failure:
1316        nla_nest_cancel(skb, nest);
1317        return -1;
1318}
1319
1320static int
1321cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1322{
1323        struct cbq_sched_data *q = qdisc_priv(sch);
1324
1325        q->link.xstats.avgidle = q->link.avgidle;
1326        return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1327}
1328
1329static int
1330cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1331               struct sk_buff *skb, struct tcmsg *tcm)
1332{
1333        struct cbq_class *cl = (struct cbq_class *)arg;
1334        struct nlattr *nest;
1335
1336        if (cl->tparent)
1337                tcm->tcm_parent = cl->tparent->common.classid;
1338        else
1339                tcm->tcm_parent = TC_H_ROOT;
1340        tcm->tcm_handle = cl->common.classid;
1341        tcm->tcm_info = cl->q->handle;
1342
1343        nest = nla_nest_start(skb, TCA_OPTIONS);
1344        if (nest == NULL)
1345                goto nla_put_failure;
1346        if (cbq_dump_attr(skb, cl) < 0)
1347                goto nla_put_failure;
1348        return nla_nest_end(skb, nest);
1349
1350nla_put_failure:
1351        nla_nest_cancel(skb, nest);
1352        return -1;
1353}
1354
1355static int
1356cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1357        struct gnet_dump *d)
1358{
1359        struct cbq_sched_data *q = qdisc_priv(sch);
1360        struct cbq_class *cl = (struct cbq_class *)arg;
1361        __u32 qlen;
1362
1363        cl->xstats.avgidle = cl->avgidle;
1364        cl->xstats.undertime = 0;
1365        qdisc_qstats_qlen_backlog(cl->q, &qlen, &cl->qstats.backlog);
1366
1367        if (cl->undertime != PSCHED_PASTPERFECT)
1368                cl->xstats.undertime = cl->undertime - q->now;
1369
1370        if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
1371                                  d, NULL, &cl->bstats) < 0 ||
1372            gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1373            gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0)
1374                return -1;
1375
1376        return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1377}
1378
1379static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1380                     struct Qdisc **old, struct netlink_ext_ack *extack)
1381{
1382        struct cbq_class *cl = (struct cbq_class *)arg;
1383
1384        if (new == NULL) {
1385                new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1386                                        cl->common.classid, extack);
1387                if (new == NULL)
1388                        return -ENOBUFS;
1389        }
1390
1391        *old = qdisc_replace(sch, new, &cl->q);
1392        return 0;
1393}
1394
1395static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
1396{
1397        struct cbq_class *cl = (struct cbq_class *)arg;
1398
1399        return cl->q;
1400}
1401
1402static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1403{
1404        struct cbq_class *cl = (struct cbq_class *)arg;
1405
1406        cbq_deactivate_class(cl);
1407}
1408
1409static unsigned long cbq_find(struct Qdisc *sch, u32 classid)
1410{
1411        struct cbq_sched_data *q = qdisc_priv(sch);
1412
1413        return (unsigned long)cbq_class_lookup(q, classid);
1414}
1415
1416static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1417{
1418        struct cbq_sched_data *q = qdisc_priv(sch);
1419
1420        WARN_ON(cl->filters);
1421
1422        tcf_block_put(cl->block);
1423        qdisc_put(cl->q);
1424        qdisc_put_rtab(cl->R_tab);
1425        gen_kill_estimator(&cl->rate_est);
1426        if (cl != &q->link)
1427                kfree(cl);
1428}
1429
1430static void cbq_destroy(struct Qdisc *sch)
1431{
1432        struct cbq_sched_data *q = qdisc_priv(sch);
1433        struct hlist_node *next;
1434        struct cbq_class *cl;
1435        unsigned int h;
1436
1437#ifdef CONFIG_NET_CLS_ACT
1438        q->rx_class = NULL;
1439#endif
1440        /*
1441         * Filters must be destroyed first because we don't destroy the
 1442         * classes from root to leaves, which means that filters can still
 1443         * be bound to classes which have been destroyed already. --TGR '04
1444         */
1445        for (h = 0; h < q->clhash.hashsize; h++) {
1446                hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1447                        tcf_block_put(cl->block);
1448                        cl->block = NULL;
1449                }
1450        }
1451        for (h = 0; h < q->clhash.hashsize; h++) {
1452                hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
1453                                          common.hnode)
1454                        cbq_destroy_class(sch, cl);
1455        }
1456        qdisc_class_hash_destroy(&q->clhash);
1457}
1458
1459static int
1460cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
1461                 unsigned long *arg, struct netlink_ext_ack *extack)
1462{
1463        int err;
1464        struct cbq_sched_data *q = qdisc_priv(sch);
1465        struct cbq_class *cl = (struct cbq_class *)*arg;
1466        struct nlattr *opt = tca[TCA_OPTIONS];
1467        struct nlattr *tb[TCA_CBQ_MAX + 1];
1468        struct cbq_class *parent;
1469        struct qdisc_rate_table *rtab = NULL;
1470
1471        if (!opt) {
1472                NL_SET_ERR_MSG(extack, "Mandatory qdisc options missing");
1473                return -EINVAL;
1474        }
1475
1476        err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy, extack);
1477        if (err < 0)
1478                return err;
1479
1480        if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE]) {
1481                NL_SET_ERR_MSG(extack, "Neither overlimit strategy nor policing attributes can be used for changing class params");
1482                return -EOPNOTSUPP;
1483        }
1484
1485        if (cl) {
1486                /* Check parent */
1487                if (parentid) {
1488                        if (cl->tparent &&
1489                            cl->tparent->common.classid != parentid) {
1490                                NL_SET_ERR_MSG(extack, "Invalid parent id");
1491                                return -EINVAL;
1492                        }
1493                        if (!cl->tparent && parentid != TC_H_ROOT) {
1494                                NL_SET_ERR_MSG(extack, "Parent must be root");
1495                                return -EINVAL;
1496                        }
1497                }
1498
1499                if (tb[TCA_CBQ_RATE]) {
1500                        rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
1501                                              tb[TCA_CBQ_RTAB], extack);
1502                        if (rtab == NULL)
1503                                return -EINVAL;
1504                }
1505
1506                if (tca[TCA_RATE]) {
1507                        err = gen_replace_estimator(&cl->bstats, NULL,
1508                                                    &cl->rate_est,
1509                                                    NULL,
1510                                                    qdisc_root_sleeping_running(sch),
1511                                                    tca[TCA_RATE]);
1512                        if (err) {
1513                                NL_SET_ERR_MSG(extack, "Failed to replace specified rate estimator");
1514                                qdisc_put_rtab(rtab);
1515                                return err;
1516                        }
1517                }
1518
1519                /* Change class parameters */
1520                sch_tree_lock(sch);
1521
1522                if (cl->next_alive != NULL)
1523                        cbq_deactivate_class(cl);
1524
1525                if (rtab) {
1526                        qdisc_put_rtab(cl->R_tab);
1527                        cl->R_tab = rtab;
1528                }
1529
1530                if (tb[TCA_CBQ_LSSOPT])
1531                        cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1532
1533                if (tb[TCA_CBQ_WRROPT]) {
1534                        cbq_rmprio(q, cl);
1535                        cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1536                }
1537
1538                if (tb[TCA_CBQ_FOPT])
1539                        cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1540
1541                if (cl->q->q.qlen)
1542                        cbq_activate_class(cl);
1543
1544                sch_tree_unlock(sch);
1545
1546                return 0;
1547        }
1548
1549        if (parentid == TC_H_ROOT)
1550                return -EINVAL;
1551
1552        if (!tb[TCA_CBQ_WRROPT] || !tb[TCA_CBQ_RATE] || !tb[TCA_CBQ_LSSOPT]) {
1553                NL_SET_ERR_MSG(extack, "One of the following attributes MUST be specified: WRR, rate or link sharing");
1554                return -EINVAL;
1555        }
1556
1557        rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB],
1558                              extack);
1559        if (rtab == NULL)
1560                return -EINVAL;
1561
1562        if (classid) {
1563                err = -EINVAL;
1564                if (TC_H_MAJ(classid ^ sch->handle) ||
1565                    cbq_class_lookup(q, classid)) {
1566                        NL_SET_ERR_MSG(extack, "Specified class not found");
1567                        goto failure;
1568                }
1569        } else {
1570                int i;
1571                classid = TC_H_MAKE(sch->handle, 0x8000);
1572
1573                for (i = 0; i < 0x8000; i++) {
1574                        if (++q->hgenerator >= 0x8000)
1575                                q->hgenerator = 1;
1576                        if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1577                                break;
1578                }
1579                err = -ENOSR;
1580                if (i >= 0x8000) {
1581                        NL_SET_ERR_MSG(extack, "Unable to generate classid");
1582                        goto failure;
1583                }
1584                classid = classid|q->hgenerator;
1585        }
1586
1587        parent = &q->link;
1588        if (parentid) {
1589                parent = cbq_class_lookup(q, parentid);
1590                err = -EINVAL;
1591                if (!parent) {
1592                        NL_SET_ERR_MSG(extack, "Failed to find parentid");
1593                        goto failure;
1594                }
1595        }
1596
1597        err = -ENOBUFS;
1598        cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1599        if (cl == NULL)
1600                goto failure;
1601
1602        err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
1603        if (err) {
1604                kfree(cl);
1605                return err;
1606        }
1607
1608        if (tca[TCA_RATE]) {
1609                err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
1610                                        NULL,
1611                                        qdisc_root_sleeping_running(sch),
1612                                        tca[TCA_RATE]);
1613                if (err) {
1614                        NL_SET_ERR_MSG(extack, "Couldn't create new estimator");
1615                        tcf_block_put(cl->block);
1616                        kfree(cl);
1617                        goto failure;
1618                }
1619        }
1620
1621        cl->R_tab = rtab;
1622        rtab = NULL;
1623        cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid,
1624                                  NULL);
1625        if (!cl->q)
1626                cl->q = &noop_qdisc;
1627        else
1628                qdisc_hash_add(cl->q, true);
1629
1630        cl->common.classid = classid;
1631        cl->tparent = parent;
1632        cl->qdisc = sch;
1633        cl->allot = parent->allot;
1634        cl->quantum = cl->allot;
1635        cl->weight = cl->R_tab->rate.rate;
1636
1637        sch_tree_lock(sch);
1638        cbq_link_class(cl);
1639        cl->borrow = cl->tparent;
1640        if (cl->tparent != &q->link)
1641                cl->share = cl->tparent;
1642        cbq_adjust_levels(parent);
1643        cl->minidle = -0x7FFFFFFF;
1644        cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1645        cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1646        if (cl->ewma_log == 0)
1647                cl->ewma_log = q->link.ewma_log;
1648        if (cl->maxidle == 0)
1649                cl->maxidle = q->link.maxidle;
1650        if (cl->avpkt == 0)
1651                cl->avpkt = q->link.avpkt;
1652        if (tb[TCA_CBQ_FOPT])
1653                cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1654        sch_tree_unlock(sch);
1655
1656        qdisc_class_hash_grow(sch, &q->clhash);
1657
1658        *arg = (unsigned long)cl;
1659        return 0;
1660
1661failure:
1662        qdisc_put_rtab(rtab);
1663        return err;
1664}
1665
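/*
 * Delete a class.  Refuse with -EBUSY while filters are still bound to it,
 * while it has child classes, or when it is the built-in link (root) class.
 * Otherwise purge its queue, drop it from the active/WRR lists and the
 * transmit state, unlink it from the hierarchy and free it.
 */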
1666static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1667{
1668        struct cbq_sched_data *q = qdisc_priv(sch);
1669        struct cbq_class *cl = (struct cbq_class *)arg;
1670
1671        if (cl->filters || cl->children || cl == &q->link)
1672                return -EBUSY;
1673
1674        sch_tree_lock(sch);
1675
1676        qdisc_purge_queue(cl->q);
1677
1678        if (cl->next_alive)
1679                cbq_deactivate_class(cl);
1680
1681        if (q->tx_borrowed == cl)
1682                q->tx_borrowed = q->tx_class;
1683        if (q->tx_class == cl) {
1684                q->tx_class = NULL;
1685                q->tx_borrowed = NULL;
1686        }
1687#ifdef CONFIG_NET_CLS_ACT
1688        if (q->rx_class == cl)
1689                q->rx_class = NULL;
1690#endif
1691
1692        cbq_unlink_class(cl);
1693        cbq_adjust_levels(cl->tparent);
1694        cl->defmap = 0;
1695        cbq_sync_defmap(cl);
1696
1697        cbq_rmprio(q, cl);
1698        sch_tree_unlock(sch);
1699
1700        cbq_destroy_class(sch, cl);
1701        return 0;
1702}
1703
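/* Return the filter block of the given class, or of the root link class
 * when no class is specified.
 */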
1704static struct tcf_block *cbq_tcf_block(struct Qdisc *sch, unsigned long arg,
1705                                       struct netlink_ext_ack *extack)
1706{
1707        struct cbq_sched_data *q = qdisc_priv(sch);
1708        struct cbq_class *cl = (struct cbq_class *)arg;
1709
1710        if (cl == NULL)
1711                cl = &q->link;
1712
1713        return cl->block;
1714}
1715
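/*
 * Filter binding: a filter attached at class 'parent' may only select a
 * class strictly below it in the hierarchy, so binding is refused unless
 * cl->level is smaller than the parent's level.  The cl->filters count
 * keeps cbq_delete()/cbq_destroy_class() aware of classes that still have
 * filters pointing at them.
 */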
1716static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1717                                     u32 classid)
1718{
1719        struct cbq_sched_data *q = qdisc_priv(sch);
1720        struct cbq_class *p = (struct cbq_class *)parent;
1721        struct cbq_class *cl = cbq_class_lookup(q, classid);
1722
1723        if (cl) {
1724                if (p && p->level <= cl->level)
1725                        return 0;
1726                cl->filters++;
1727                return (unsigned long)cl;
1728        }
1729        return 0;
1730}
1731
1732static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
1733{
1734        struct cbq_class *cl = (struct cbq_class *)arg;
1735
1736        cl->filters--;
1737}
1738
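/* Walk all classes in the hash for netlink dumps, honouring the
 * skip/count/stop bookkeeping in struct qdisc_walker.
 */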
1739static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1740{
1741        struct cbq_sched_data *q = qdisc_priv(sch);
1742        struct cbq_class *cl;
1743        unsigned int h;
1744
1745        if (arg->stop)
1746                return;
1747
1748        for (h = 0; h < q->clhash.hashsize; h++) {
1749                hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1750                        if (arg->count < arg->skip) {
1751                                arg->count++;
1752                                continue;
1753                        }
1754                        if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1755                                arg->stop = 1;
1756                                return;
1757                        }
1758                        arg->count++;
1759                }
1760        }
1761}
1762
1763static const struct Qdisc_class_ops cbq_class_ops = {
1764        .graft          =       cbq_graft,
1765        .leaf           =       cbq_leaf,
1766        .qlen_notify    =       cbq_qlen_notify,
1767        .find           =       cbq_find,
1768        .change         =       cbq_change_class,
1769        .delete         =       cbq_delete,
1770        .walk           =       cbq_walk,
1771        .tcf_block      =       cbq_tcf_block,
1772        .bind_tcf       =       cbq_bind_filter,
1773        .unbind_tcf     =       cbq_unbind_filter,
1774        .dump           =       cbq_dump_class,
1775        .dump_stats     =       cbq_dump_class_stats,
1776};
1777
1778static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
1779        .next           =       NULL,
1780        .cl_ops         =       &cbq_class_ops,
1781        .id             =       "cbq",
1782        .priv_size      =       sizeof(struct cbq_sched_data),
1783        .enqueue        =       cbq_enqueue,
1784        .dequeue        =       cbq_dequeue,
1785        .peek           =       qdisc_peek_dequeued,
1786        .init           =       cbq_init,
1787        .reset          =       cbq_reset,
1788        .destroy        =       cbq_destroy,
1789        .change         =       NULL,
1790        .dump           =       cbq_dump,
1791        .dump_stats     =       cbq_dump_stats,
1792        .owner          =       THIS_MODULE,
1793};
1794
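/*
 * Usage sketch (not part of this file): with the iproute2 tc utility, a
 * root cbq qdisc and one bounded class would typically be set up roughly as
 *
 *   tc qdisc add dev eth0 root handle 1: cbq bandwidth 100Mbit avpkt 1000
 *   tc class add dev eth0 parent 1: classid 1:1 cbq bandwidth 100Mbit \
 *        rate 10Mbit allot 1514 prio 5 avpkt 1000 bounded
 *
 * The device name and numbers are illustrative only; these commands end up
 * in cbq_init() and cbq_change_class() above.
 */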
1795static int __init cbq_module_init(void)
1796{
1797        return register_qdisc(&cbq_qdisc_ops);
1798}
1799static void __exit cbq_module_exit(void)
1800{
1801        unregister_qdisc(&cbq_qdisc_ops);
1802}
1803module_init(cbq_module_init)
1804module_exit(cbq_module_exit)
1805MODULE_LICENSE("GPL");
1806