linux/net/sched/sch_cbq.c
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * net/sched/sch_cbq.c  Class-Based Queueing discipline.
   4 *
   5 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
   6 */
   7
   8#include <linux/module.h>
   9#include <linux/slab.h>
  10#include <linux/types.h>
  11#include <linux/kernel.h>
  12#include <linux/string.h>
  13#include <linux/errno.h>
  14#include <linux/skbuff.h>
  15#include <net/netlink.h>
  16#include <net/pkt_sched.h>
  17#include <net/pkt_cls.h>
  18
  19
  20/*      Class-Based Queueing (CBQ) algorithm.
  21        =======================================
  22
  23        Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
  24                 Management Models for Packet Networks",
  25                 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
  26
  27                 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
  28
  29                 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
  30                 Parameters", 1996
  31
  32                 [4] Sally Floyd and Michael Speer, "Experimental Results
  33                 for Class-Based Queueing", 1998, not published.
  34
  35        -----------------------------------------------------------------------
  36
  37        Algorithm skeleton was taken from NS simulator cbq.cc.
  38        Anyone who wants to check this code against the LBL version
  39        should take into account that ONLY the skeleton was borrowed;
  40        the implementation is different. Particularly:
  41
  42        --- The WRR algorithm is different. Our version looks more
  43        reasonable (I hope) and works when quanta are allowed to be
  44        less than MTU, which is always the case when real-time classes
  45        have small rates. Note that the statement in [3] is
  46        incomplete: the delay may actually be estimated even if the class
  47        per-round allotment is less than MTU. Namely, if the per-round
  48        allotment is W*r_i, and r_1+...+r_k = r < 1, then
  49
  50        delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
  51
  52        In the worst case we have IntServ estimate with D = W*r+k*MTU
  53        and C = MTU*r. The proof (if correct at all) is trivial.
  54
  55
  56        --- It seems that cbq-2.0 is not very accurate. At least, I cannot
  57        interpret some places, which look like incorrect translations
  58        from NS. Anyone is advised to find these differences
  59        and explain to me why I am wrong 8).
  60
  61        --- Linux has no EOI event, so we cannot estimate true class
  62        idle time. The workaround is to treat the next dequeue event
  63        as a sign that the previous packet has finished. This is wrong because of
  64        internal device queueing, but on a permanently loaded link it is true.
  65        Moreover, combined with the clock integrator, this scheme looks
  66        very close to an ideal solution.  */
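    /* A minimal configuration sketch (assuming iproute2 and an eth0 device;
     * the numbers are illustrative only, not taken from this file):
     *
     *   tc qdisc add dev eth0 root handle 1: cbq bandwidth 10Mbit \
     *        avpkt 1000 cell 8
     *   tc class add dev eth0 parent 1: classid 1:1 cbq bandwidth 10Mbit \
     *        rate 1Mbit allot 1514 prio 5 avpkt 1000 maxburst 20 bounded
     *
     * "bounded" corresponds to TCF_CBQ_LSS_BOUNDED below: the class gets no
     * borrow pointer, so it may not borrow bandwidth from its parent when it
     * is overlimit.
     */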
  67
  68struct cbq_sched_data;
  69
  70
  71struct cbq_class {
  72        struct Qdisc_class_common common;
  73        struct cbq_class        *next_alive;    /* next class with backlog in this priority band */
  74
  75/* Parameters */
  76        unsigned char           priority;       /* class priority */
  77        unsigned char           priority2;      /* priority to be used after overlimit */
  78        unsigned char           ewma_log;       /* time constant for idle time calculation */
  79
  80        u32                     defmap;
  81
  82        /* Link-sharing scheduler parameters */
  83        long                    maxidle;        /* Class parameters: see below. */
  84        long                    offtime;
  85        long                    minidle;
  86        u32                     avpkt;
  87        struct qdisc_rate_table *R_tab;
  88
  89        /* General scheduler (WRR) parameters */
  90        long                    allot;
  91        long                    quantum;        /* Allotment per WRR round */
  92        long                    weight;         /* Relative allotment: see below */
  93
  94        struct Qdisc            *qdisc;         /* Ptr to CBQ discipline */
  95        struct cbq_class        *split;         /* Ptr to split node */
  96        struct cbq_class        *share;         /* Ptr to LS parent in the class tree */
  97        struct cbq_class        *tparent;       /* Ptr to tree parent in the class tree */
  98        struct cbq_class        *borrow;        /* NULL if class is bandwidth limited;
  99                                                   parent otherwise */
 100        struct cbq_class        *sibling;       /* Sibling chain */
 101        struct cbq_class        *children;      /* Pointer to children chain */
 102
 103        struct Qdisc            *q;             /* Elementary queueing discipline */
 104
 105
 106/* Variables */
 107        unsigned char           cpriority;      /* Effective priority */
 108        unsigned char           delayed;
 109        unsigned char           level;          /* level of the class in hierarchy:
 110                                                   0 for leaf classes, and maximal
 111                                                   level of children + 1 for nodes.
 112                                                 */
 113
 114        psched_time_t           last;           /* Last end of service */
 115        psched_time_t           undertime;
 116        long                    avgidle;
 117        long                    deficit;        /* Saved deficit for WRR */
 118        psched_time_t           penalized;
 119        struct gnet_stats_basic_packed bstats;
 120        struct gnet_stats_queue qstats;
 121        struct net_rate_estimator __rcu *rate_est;
 122        struct tc_cbq_xstats    xstats;
 123
 124        struct tcf_proto __rcu  *filter_list;
 125        struct tcf_block        *block;
 126
 127        int                     filters;
 128
 129        struct cbq_class        *defaults[TC_PRIO_MAX + 1];
 130};
 131
 132struct cbq_sched_data {
 133        struct Qdisc_class_hash clhash;                 /* Hash table of all classes */
 134        int                     nclasses[TC_CBQ_MAXPRIO + 1];
 135        unsigned int            quanta[TC_CBQ_MAXPRIO + 1];
 136
 137        struct cbq_class        link;
 138
 139        unsigned int            activemask;
 140        struct cbq_class        *active[TC_CBQ_MAXPRIO + 1];    /* List of all classes
 141                                                                   with backlog */
 142
 143#ifdef CONFIG_NET_CLS_ACT
 144        struct cbq_class        *rx_class;
 145#endif
 146        struct cbq_class        *tx_class;
 147        struct cbq_class        *tx_borrowed;
 148        int                     tx_len;
 149        psched_time_t           now;            /* Cached timestamp */
 150        unsigned int            pmask;
 151
 152        struct hrtimer          delay_timer;
 153        struct qdisc_watchdog   watchdog;       /* Watchdog timer,
 154                                                   started when CBQ has
 155                                                   backlog, but cannot
 156                                                   transmit just now */
 157        psched_tdiff_t          wd_expires;
 158        int                     toplevel;
 159        u32                     hgenerator;
 160};
 161
 162
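    /* L2T ("length to time"): transmission time, in psched ticks, of a packet
     * of the given length, looked up in the class rate table via qdisc_l2t().
     */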
 163#define L2T(cl, len)    qdisc_l2t((cl)->R_tab, len)
 164
 165static inline struct cbq_class *
 166cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
 167{
 168        struct Qdisc_class_common *clc;
 169
 170        clc = qdisc_class_find(&q->clhash, classid);
 171        if (clc == NULL)
 172                return NULL;
 173        return container_of(clc, struct cbq_class, common);
 174}
 175
 176#ifdef CONFIG_NET_CLS_ACT
 177
 178static struct cbq_class *
 179cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
 180{
 181        struct cbq_class *cl;
 182
 183        for (cl = this->tparent; cl; cl = cl->tparent) {
 184                struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
 185
 186                if (new != NULL && new != this)
 187                        return new;
 188        }
 189        return NULL;
 190}
 191
 192#endif
 193
 194/* Classify packet. The procedure is pretty complicated, but
 195 * it allows us to combine link sharing and priority scheduling
 196 * transparently.
 197 *
  198 * Namely, you can put link sharing rules (e.g. route based) at the root of CBQ,
 199 * so that it resolves to split nodes. Then packets are classified
 200 * by logical priority, or a more specific classifier may be attached
 201 * to the split node.
 202 */
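/* Example (a hypothetical configuration): a leaf class created with
 * "split 1:0 defmap 3/3" sets bits 0 and 1 in cl->defmap, so
 * cbq_sync_defmap() makes it the default destination for logical
 * priorities TC_PRIO_BESTEFFORT (0) and TC_PRIO_FILLER (1) whenever the
 * classifier attached to the split node does not select a class itself.
 */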
 203
 204static struct cbq_class *
 205cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 206{
 207        struct cbq_sched_data *q = qdisc_priv(sch);
 208        struct cbq_class *head = &q->link;
 209        struct cbq_class **defmap;
 210        struct cbq_class *cl = NULL;
 211        u32 prio = skb->priority;
 212        struct tcf_proto *fl;
 213        struct tcf_result res;
 214
 215        /*
 216         *  Step 1. If skb->priority points to one of our classes, use it.
 217         */
 218        if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
 219            (cl = cbq_class_lookup(q, prio)) != NULL)
 220                return cl;
 221
 222        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 223        for (;;) {
 224                int result = 0;
 225                defmap = head->defaults;
 226
 227                fl = rcu_dereference_bh(head->filter_list);
 228                /*
 229                 * Step 2+n. Apply classifier.
 230                 */
 231                result = tcf_classify(skb, fl, &res, true);
 232                if (!fl || result < 0)
 233                        goto fallback;
 234
 235                cl = (void *)res.class;
 236                if (!cl) {
 237                        if (TC_H_MAJ(res.classid))
 238                                cl = cbq_class_lookup(q, res.classid);
 239                        else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
 240                                cl = defmap[TC_PRIO_BESTEFFORT];
 241
 242                        if (cl == NULL)
 243                                goto fallback;
 244                }
 245                if (cl->level >= head->level)
 246                        goto fallback;
 247#ifdef CONFIG_NET_CLS_ACT
 248                switch (result) {
 249                case TC_ACT_QUEUED:
 250                case TC_ACT_STOLEN:
 251                case TC_ACT_TRAP:
 252                        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
 253                        /* fall through */
 254                case TC_ACT_SHOT:
 255                        return NULL;
 256                case TC_ACT_RECLASSIFY:
 257                        return cbq_reclassify(skb, cl);
 258                }
 259#endif
 260                if (cl->level == 0)
 261                        return cl;
 262
 263                /*
 264                 * Step 3+n. If classifier selected a link sharing class,
 265                 *         apply agency specific classifier.
  266                 *         Repeat this procedure until we hit a leaf node.
 267                 */
 268                head = cl;
 269        }
 270
 271fallback:
 272        cl = head;
 273
 274        /*
 275         * Step 4. No success...
 276         */
 277        if (TC_H_MAJ(prio) == 0 &&
 278            !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
 279            !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
 280                return head;
 281
 282        return cl;
 283}
 284
 285/*
  286 * A packet has just been enqueued on an empty class.
  287 * cbq_activate_class adds it to the tail of the active class list
 288 * of its priority band.
 289 */
 290
 291static inline void cbq_activate_class(struct cbq_class *cl)
 292{
 293        struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 294        int prio = cl->cpriority;
 295        struct cbq_class *cl_tail;
 296
 297        cl_tail = q->active[prio];
 298        q->active[prio] = cl;
 299
 300        if (cl_tail != NULL) {
 301                cl->next_alive = cl_tail->next_alive;
 302                cl_tail->next_alive = cl;
 303        } else {
 304                cl->next_alive = cl;
 305                q->activemask |= (1<<prio);
 306        }
 307}
 308
 309/*
 310 * Unlink class from active chain.
 311 * Note that this same procedure is done directly in cbq_dequeue*
  312 * during the round-robin procedure.
 313 */
 314
 315static void cbq_deactivate_class(struct cbq_class *this)
 316{
 317        struct cbq_sched_data *q = qdisc_priv(this->qdisc);
 318        int prio = this->cpriority;
 319        struct cbq_class *cl;
 320        struct cbq_class *cl_prev = q->active[prio];
 321
 322        do {
 323                cl = cl_prev->next_alive;
 324                if (cl == this) {
 325                        cl_prev->next_alive = cl->next_alive;
 326                        cl->next_alive = NULL;
 327
 328                        if (cl == q->active[prio]) {
 329                                q->active[prio] = cl_prev;
 330                                if (cl == q->active[prio]) {
 331                                        q->active[prio] = NULL;
 332                                        q->activemask &= ~(1<<prio);
 333                                        return;
 334                                }
 335                        }
 336                        return;
 337                }
 338        } while ((cl_prev = cl) != q->active[prio]);
 339}
 340
 341static void
 342cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
 343{
 344        int toplevel = q->toplevel;
 345
 346        if (toplevel > cl->level) {
 347                psched_time_t now = psched_get_time();
 348
 349                do {
 350                        if (cl->undertime < now) {
 351                                q->toplevel = cl->level;
 352                                return;
 353                        }
 354                } while ((cl = cl->borrow) != NULL && toplevel > cl->level);
 355        }
 356}
 357
 358static int
 359cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 360            struct sk_buff **to_free)
 361{
 362        struct cbq_sched_data *q = qdisc_priv(sch);
 363        int uninitialized_var(ret);
 364        struct cbq_class *cl = cbq_classify(skb, sch, &ret);
 365
 366#ifdef CONFIG_NET_CLS_ACT
 367        q->rx_class = cl;
 368#endif
 369        if (cl == NULL) {
 370                if (ret & __NET_XMIT_BYPASS)
 371                        qdisc_qstats_drop(sch);
 372                __qdisc_drop(skb, to_free);
 373                return ret;
 374        }
 375
 376        ret = qdisc_enqueue(skb, cl->q, to_free);
 377        if (ret == NET_XMIT_SUCCESS) {
 378                sch->q.qlen++;
 379                cbq_mark_toplevel(q, cl);
 380                if (!cl->next_alive)
 381                        cbq_activate_class(cl);
 382                return ret;
 383        }
 384
 385        if (net_xmit_drop_count(ret)) {
 386                qdisc_qstats_drop(sch);
 387                cbq_mark_toplevel(q, cl);
 388                cl->qstats.drops++;
 389        }
 390        return ret;
 391}
 392
 393/* Overlimit action: penalize leaf class by adding offtime */
 394static void cbq_overlimit(struct cbq_class *cl)
 395{
 396        struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 397        psched_tdiff_t delay = cl->undertime - q->now;
 398
 399        if (!cl->delayed) {
 400                delay += cl->offtime;
 401
 402                /*
  403                 * Class goes to sleep, so that it will have no
  404                 * chance to work off its avgidle. Let's forgive it 8)
  405                 *
  406                 * BTW cbq-2.0 has a bug in this
  407                 * place; apparently they forgot to shift it by cl->ewma_log.
 408                 */
 409                if (cl->avgidle < 0)
 410                        delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
 411                if (cl->avgidle < cl->minidle)
 412                        cl->avgidle = cl->minidle;
 413                if (delay <= 0)
 414                        delay = 1;
 415                cl->undertime = q->now + delay;
 416
 417                cl->xstats.overactions++;
 418                cl->delayed = 1;
 419        }
 420        if (q->wd_expires == 0 || q->wd_expires > delay)
 421                q->wd_expires = delay;
 422
 423        /* Dirty work! We must schedule wakeups based on
 424         * real available rate, rather than leaf rate,
 425         * which may be tiny (even zero).
 426         */
 427        if (q->toplevel == TC_CBQ_MAXLEVEL) {
 428                struct cbq_class *b;
 429                psched_tdiff_t base_delay = q->wd_expires;
 430
 431                for (b = cl->borrow; b; b = b->borrow) {
 432                        delay = b->undertime - q->now;
 433                        if (delay < base_delay) {
 434                                if (delay <= 0)
 435                                        delay = 1;
 436                                base_delay = delay;
 437                        }
 438                }
 439
 440                q->wd_expires = base_delay;
 441        }
 442}
 443
 444static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
 445                                       psched_time_t now)
 446{
 447        struct cbq_class *cl;
 448        struct cbq_class *cl_prev = q->active[prio];
 449        psched_time_t sched = now;
 450
 451        if (cl_prev == NULL)
 452                return 0;
 453
 454        do {
 455                cl = cl_prev->next_alive;
 456                if (now - cl->penalized > 0) {
 457                        cl_prev->next_alive = cl->next_alive;
 458                        cl->next_alive = NULL;
 459                        cl->cpriority = cl->priority;
 460                        cl->delayed = 0;
 461                        cbq_activate_class(cl);
 462
 463                        if (cl == q->active[prio]) {
 464                                q->active[prio] = cl_prev;
 465                                if (cl == q->active[prio]) {
 466                                        q->active[prio] = NULL;
 467                                        return 0;
 468                                }
 469                        }
 470
 471                        cl = cl_prev->next_alive;
 472                } else if (sched - cl->penalized > 0)
 473                        sched = cl->penalized;
 474        } while ((cl_prev = cl) != q->active[prio]);
 475
 476        return sched - now;
 477}
 478
 479static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
 480{
 481        struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
 482                                                delay_timer);
 483        struct Qdisc *sch = q->watchdog.qdisc;
 484        psched_time_t now;
 485        psched_tdiff_t delay = 0;
 486        unsigned int pmask;
 487
 488        now = psched_get_time();
 489
 490        pmask = q->pmask;
 491        q->pmask = 0;
 492
 493        while (pmask) {
 494                int prio = ffz(~pmask);
 495                psched_tdiff_t tmp;
 496
 497                pmask &= ~(1<<prio);
 498
 499                tmp = cbq_undelay_prio(q, prio, now);
 500                if (tmp > 0) {
 501                        q->pmask |= 1<<prio;
 502                        if (tmp < delay || delay == 0)
 503                                delay = tmp;
 504                }
 505        }
 506
 507        if (delay) {
 508                ktime_t time;
 509
 510                time = 0;
 511                time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
 512                hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
 513        }
 514
 515        __netif_schedule(qdisc_root(sch));
 516        return HRTIMER_NORESTART;
 517}
 518
 519/*
  520 * This is a mission-critical procedure.
  521 *
  522 * We "regenerate" the toplevel cutoff if the transmitting class
  523 * has backlog and is not regulated. This is not part of the
  524 * original CBQ description, but it looks more reasonable.
  525 * Probably, it is wrong. This question needs further investigation.
 526 */
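/* Background note: q->toplevel bounds how far up the tree borrowing may
 * reach. cbq_under_limit() refuses to borrow from an ancestor whose level
 * exceeds q->toplevel, and cbq_mark_toplevel() lowers it to the level of
 * the first under-limit class on the enqueued class's borrow chain.
 */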
 527
 528static inline void
 529cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
 530                    struct cbq_class *borrowed)
 531{
 532        if (cl && q->toplevel >= borrowed->level) {
 533                if (cl->q->q.qlen > 1) {
 534                        do {
 535                                if (borrowed->undertime == PSCHED_PASTPERFECT) {
 536                                        q->toplevel = borrowed->level;
 537                                        return;
 538                                }
 539                        } while ((borrowed = borrowed->borrow) != NULL);
 540                }
 541#if 0
 542        /* It is not necessary now. Uncommenting it
 543           will save CPU cycles, but decrease fairness.
 544         */
 545                q->toplevel = TC_CBQ_MAXLEVEL;
 546#endif
 547        }
 548}
 549
 550static void
 551cbq_update(struct cbq_sched_data *q)
 552{
 553        struct cbq_class *this = q->tx_class;
 554        struct cbq_class *cl = this;
 555        int len = q->tx_len;
 556        psched_time_t now;
 557
 558        q->tx_class = NULL;
 559        /* Time integrator. We calculate EOS time
 560         * by adding expected packet transmission time.
 561         */
 562        now = q->now + L2T(&q->link, len);
 563
 564        for ( ; cl; cl = cl->share) {
 565                long avgidle = cl->avgidle;
 566                long idle;
 567
 568                cl->bstats.packets++;
 569                cl->bstats.bytes += len;
 570
 571                /*
 572                 * (now - last) is total time between packet right edges.
 573                 * (last_pktlen/rate) is "virtual" busy time, so that
 574                 *
 575                 *      idle = (now - last) - last_pktlen/rate
 576                 */
 577
 578                idle = now - cl->last;
 579                if ((unsigned long)idle > 128*1024*1024) {
 580                        avgidle = cl->maxidle;
 581                } else {
 582                        idle -= L2T(cl, len);
 583
 584                /* true_avgidle := (1-W)*true_avgidle + W*idle,
 585                 * where W=2^{-ewma_log}. But cl->avgidle is scaled:
 586                 * cl->avgidle == true_avgidle/W,
 587                 * hence:
 588                 */
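                    /* Substituting true_avgidle = W*cl->avgidle into the
                     * EWMA above:
                     *   W*avgidle' = (1-W)*W*avgidle + W*idle
                     *     avgidle' = avgidle + idle - W*avgidle
                     *              = avgidle + idle - (avgidle >> ewma_log)
                     * which is exactly the update below.
                     */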
 589                        avgidle += idle - (avgidle>>cl->ewma_log);
 590                }
 591
 592                if (avgidle <= 0) {
 593                        /* Overlimit or at-limit */
 594
 595                        if (avgidle < cl->minidle)
 596                                avgidle = cl->minidle;
 597
 598                        cl->avgidle = avgidle;
 599
  600                        /* Calculate the expected time when this class
  601                         * will be allowed to send.
  602                         * It will occur when:
 603                         * (1-W)*true_avgidle + W*delay = 0, i.e.
 604                         * idle = (1/W - 1)*(-true_avgidle)
 605                         * or
 606                         * idle = (1 - W)*(-cl->avgidle);
 607                         */
 608                        idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
 609
 610                        /*
 611                         * That is not all.
 612                         * To maintain the rate allocated to the class,
  613                         * we add to the undertime the virtual clock time
  614                         * needed to complete the transmitted packet.
  615                         * (len/phys_bandwidth has already elapsed by the
  616                         * time cbq_update is called.)
 617                         */
 618
 619                        idle -= L2T(&q->link, len);
 620                        idle += L2T(cl, len);
 621
 622                        cl->undertime = now + idle;
 623                } else {
 624                        /* Underlimit */
 625
 626                        cl->undertime = PSCHED_PASTPERFECT;
 627                        if (avgidle > cl->maxidle)
 628                                cl->avgidle = cl->maxidle;
 629                        else
 630                                cl->avgidle = avgidle;
 631                }
 632                if ((s64)(now - cl->last) > 0)
 633                        cl->last = now;
 634        }
 635
 636        cbq_update_toplevel(q, this, q->tx_borrowed);
 637}
 638
 639static inline struct cbq_class *
 640cbq_under_limit(struct cbq_class *cl)
 641{
 642        struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 643        struct cbq_class *this_cl = cl;
 644
 645        if (cl->tparent == NULL)
 646                return cl;
 647
 648        if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
 649                cl->delayed = 0;
 650                return cl;
 651        }
 652
 653        do {
  654                /* This is a very suspicious place. Currently, the overlimit
  655                 * action is generated for unbounded classes
  656                 * only if the link is completely congested.
  657                 * Though this agrees with the ancestor-only paradigm,
  658                 * it looks very stupid. In particular,
  659                 * it means that this chunk of code will either
  660                 * never be called or result in strong amplification
  661                 * of burstiness. Dangerous, silly, and yet
  662                 * no other solution exists.
 663                 */
 664                cl = cl->borrow;
 665                if (!cl) {
 666                        this_cl->qstats.overlimits++;
 667                        cbq_overlimit(this_cl);
 668                        return NULL;
 669                }
 670                if (cl->level > q->toplevel)
 671                        return NULL;
 672        } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
 673
 674        cl->delayed = 0;
 675        return cl;
 676}
 677
 678static inline struct sk_buff *
 679cbq_dequeue_prio(struct Qdisc *sch, int prio)
 680{
 681        struct cbq_sched_data *q = qdisc_priv(sch);
 682        struct cbq_class *cl_tail, *cl_prev, *cl;
 683        struct sk_buff *skb;
 684        int deficit;
 685
 686        cl_tail = cl_prev = q->active[prio];
 687        cl = cl_prev->next_alive;
 688
 689        do {
 690                deficit = 0;
 691
 692                /* Start round */
 693                do {
 694                        struct cbq_class *borrow = cl;
 695
 696                        if (cl->q->q.qlen &&
 697                            (borrow = cbq_under_limit(cl)) == NULL)
 698                                goto skip_class;
 699
 700                        if (cl->deficit <= 0) {
 701                                /* Class exhausted its allotment per
 702                                 * this round. Switch to the next one.
 703                                 */
 704                                deficit = 1;
 705                                cl->deficit += cl->quantum;
 706                                goto next_class;
 707                        }
 708
 709                        skb = cl->q->dequeue(cl->q);
 710
 711                        /* Class did not give us any skb :-(
 712                         * It could occur even if cl->q->q.qlen != 0
  713                         * e.g. if cl->q == "tbf"
 714                         */
 715                        if (skb == NULL)
 716                                goto skip_class;
 717
 718                        cl->deficit -= qdisc_pkt_len(skb);
 719                        q->tx_class = cl;
 720                        q->tx_borrowed = borrow;
 721                        if (borrow != cl) {
 722#ifndef CBQ_XSTATS_BORROWS_BYTES
 723                                borrow->xstats.borrows++;
 724                                cl->xstats.borrows++;
 725#else
 726                                borrow->xstats.borrows += qdisc_pkt_len(skb);
 727                                cl->xstats.borrows += qdisc_pkt_len(skb);
 728#endif
 729                        }
 730                        q->tx_len = qdisc_pkt_len(skb);
 731
 732                        if (cl->deficit <= 0) {
 733                                q->active[prio] = cl;
 734                                cl = cl->next_alive;
 735                                cl->deficit += cl->quantum;
 736                        }
 737                        return skb;
 738
 739skip_class:
 740                        if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
 741                                /* Class is empty or penalized.
 742                                 * Unlink it from active chain.
 743                                 */
 744                                cl_prev->next_alive = cl->next_alive;
 745                                cl->next_alive = NULL;
 746
 747                                /* Did cl_tail point to it? */
 748                                if (cl == cl_tail) {
 749                                        /* Repair it! */
 750                                        cl_tail = cl_prev;
 751
 752                                        /* Was it the last class in this band? */
 753                                        if (cl == cl_tail) {
 754                                                /* Kill the band! */
 755                                                q->active[prio] = NULL;
 756                                                q->activemask &= ~(1<<prio);
 757                                                if (cl->q->q.qlen)
 758                                                        cbq_activate_class(cl);
 759                                                return NULL;
 760                                        }
 761
 762                                        q->active[prio] = cl_tail;
 763                                }
 764                                if (cl->q->q.qlen)
 765                                        cbq_activate_class(cl);
 766
 767                                cl = cl_prev;
 768                        }
 769
 770next_class:
 771                        cl_prev = cl;
 772                        cl = cl->next_alive;
 773                } while (cl_prev != cl_tail);
 774        } while (deficit);
 775
 776        q->active[prio] = cl_prev;
 777
 778        return NULL;
 779}
 780
 781static inline struct sk_buff *
 782cbq_dequeue_1(struct Qdisc *sch)
 783{
 784        struct cbq_sched_data *q = qdisc_priv(sch);
 785        struct sk_buff *skb;
 786        unsigned int activemask;
 787
 788        activemask = q->activemask & 0xFF;
 789        while (activemask) {
 790                int prio = ffz(~activemask);
 791                activemask &= ~(1<<prio);
 792                skb = cbq_dequeue_prio(sch, prio);
 793                if (skb)
 794                        return skb;
 795        }
 796        return NULL;
 797}
 798
 799static struct sk_buff *
 800cbq_dequeue(struct Qdisc *sch)
 801{
 802        struct sk_buff *skb;
 803        struct cbq_sched_data *q = qdisc_priv(sch);
 804        psched_time_t now;
 805
 806        now = psched_get_time();
 807
 808        if (q->tx_class)
 809                cbq_update(q);
 810
 811        q->now = now;
 812
 813        for (;;) {
 814                q->wd_expires = 0;
 815
 816                skb = cbq_dequeue_1(sch);
 817                if (skb) {
 818                        qdisc_bstats_update(sch, skb);
 819                        sch->q.qlen--;
 820                        return skb;
 821                }
 822
 823                /* All the classes are overlimit.
 824                 *
  825                 * This can happen if:
 826                 *
 827                 * 1. Scheduler is empty.
 828                 * 2. Toplevel cutoff inhibited borrowing.
 829                 * 3. Root class is overlimit.
 830                 *
  831                 * Reset the 2nd and 3rd conditions and retry.
 832                 *
  833                 * Note that NS and cbq-2.0 are buggy: picking
  834                 * an arbitrary class is appropriate for ancestor-only
  835                 * sharing, but not for the toplevel algorithm.
  836                 *
  837                 * Our version is better but slower, because it requires
  838                 * two passes; that is unavoidable with top-level sharing.
 839                 */
 840
 841                if (q->toplevel == TC_CBQ_MAXLEVEL &&
 842                    q->link.undertime == PSCHED_PASTPERFECT)
 843                        break;
 844
 845                q->toplevel = TC_CBQ_MAXLEVEL;
 846                q->link.undertime = PSCHED_PASTPERFECT;
 847        }
 848
 849        /* No packets in scheduler or nobody wants to give them to us :-(
  850         * Sigh... start the watchdog timer in the latter case.
 851         */
 852
 853        if (sch->q.qlen) {
 854                qdisc_qstats_overlimit(sch);
 855                if (q->wd_expires)
 856                        qdisc_watchdog_schedule(&q->watchdog,
 857                                                now + q->wd_expires);
 858        }
 859        return NULL;
 860}
 861
  862/* CBQ class maintenance routines */
 863
 864static void cbq_adjust_levels(struct cbq_class *this)
 865{
 866        if (this == NULL)
 867                return;
 868
 869        do {
 870                int level = 0;
 871                struct cbq_class *cl;
 872
 873                cl = this->children;
 874                if (cl) {
 875                        do {
 876                                if (cl->level > level)
 877                                        level = cl->level;
 878                        } while ((cl = cl->sibling) != this->children);
 879                }
 880                this->level = level + 1;
 881        } while ((this = this->tparent) != NULL);
 882}
 883
 884static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
 885{
 886        struct cbq_class *cl;
 887        unsigned int h;
 888
 889        if (q->quanta[prio] == 0)
 890                return;
 891
 892        for (h = 0; h < q->clhash.hashsize; h++) {
 893                hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
  894                        /* BUGGGG... Beware! This expression suffers from
 895                         * arithmetic overflows!
 896                         */
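                            /* For illustration (values assumed): two classes
                             * at this prio with allot 1514 and weights 1 and 3
                             * give quanta[prio] = 4 and nclasses[prio] = 2, so
                             * their quanta become 1*1514*2/4 = 757 and
                             * 3*1514*2/4 = 2271 bytes per WRR round.
                             */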
 897                        if (cl->priority == prio) {
 898                                cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
 899                                        q->quanta[prio];
 900                        }
 901                        if (cl->quantum <= 0 ||
 902                            cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
 903                                pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
 904                                        cl->common.classid, cl->quantum);
 905                                cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
 906                        }
 907                }
 908        }
 909}
 910
 911static void cbq_sync_defmap(struct cbq_class *cl)
 912{
 913        struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 914        struct cbq_class *split = cl->split;
 915        unsigned int h;
 916        int i;
 917
 918        if (split == NULL)
 919                return;
 920
 921        for (i = 0; i <= TC_PRIO_MAX; i++) {
 922                if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
 923                        split->defaults[i] = NULL;
 924        }
 925
 926        for (i = 0; i <= TC_PRIO_MAX; i++) {
 927                int level = split->level;
 928
 929                if (split->defaults[i])
 930                        continue;
 931
 932                for (h = 0; h < q->clhash.hashsize; h++) {
 933                        struct cbq_class *c;
 934
 935                        hlist_for_each_entry(c, &q->clhash.hash[h],
 936                                             common.hnode) {
 937                                if (c->split == split && c->level < level &&
 938                                    c->defmap & (1<<i)) {
 939                                        split->defaults[i] = c;
 940                                        level = c->level;
 941                                }
 942                        }
 943                }
 944        }
 945}
 946
 947static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
 948{
 949        struct cbq_class *split = NULL;
 950
 951        if (splitid == 0) {
 952                split = cl->split;
 953                if (!split)
 954                        return;
 955                splitid = split->common.classid;
 956        }
 957
 958        if (split == NULL || split->common.classid != splitid) {
 959                for (split = cl->tparent; split; split = split->tparent)
 960                        if (split->common.classid == splitid)
 961                                break;
 962        }
 963
 964        if (split == NULL)
 965                return;
 966
 967        if (cl->split != split) {
 968                cl->defmap = 0;
 969                cbq_sync_defmap(cl);
 970                cl->split = split;
 971                cl->defmap = def & mask;
 972        } else
 973                cl->defmap = (cl->defmap & ~mask) | (def & mask);
 974
 975        cbq_sync_defmap(cl);
 976}
 977
 978static void cbq_unlink_class(struct cbq_class *this)
 979{
 980        struct cbq_class *cl, **clp;
 981        struct cbq_sched_data *q = qdisc_priv(this->qdisc);
 982
 983        qdisc_class_hash_remove(&q->clhash, &this->common);
 984
 985        if (this->tparent) {
 986                clp = &this->sibling;
 987                cl = *clp;
 988                do {
 989                        if (cl == this) {
 990                                *clp = cl->sibling;
 991                                break;
 992                        }
 993                        clp = &cl->sibling;
 994                } while ((cl = *clp) != this->sibling);
 995
 996                if (this->tparent->children == this) {
 997                        this->tparent->children = this->sibling;
 998                        if (this->sibling == this)
 999                                this->tparent->children = NULL;
1000                }
1001        } else {
1002                WARN_ON(this->sibling != this);
1003        }
1004}
1005
1006static void cbq_link_class(struct cbq_class *this)
1007{
1008        struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1009        struct cbq_class *parent = this->tparent;
1010
1011        this->sibling = this;
1012        qdisc_class_hash_insert(&q->clhash, &this->common);
1013
1014        if (parent == NULL)
1015                return;
1016
1017        if (parent->children == NULL) {
1018                parent->children = this;
1019        } else {
1020                this->sibling = parent->children->sibling;
1021                parent->children->sibling = this;
1022        }
1023}
1024
1025static void
1026cbq_reset(struct Qdisc *sch)
1027{
1028        struct cbq_sched_data *q = qdisc_priv(sch);
1029        struct cbq_class *cl;
1030        int prio;
1031        unsigned int h;
1032
1033        q->activemask = 0;
1034        q->pmask = 0;
1035        q->tx_class = NULL;
1036        q->tx_borrowed = NULL;
1037        qdisc_watchdog_cancel(&q->watchdog);
1038        hrtimer_cancel(&q->delay_timer);
1039        q->toplevel = TC_CBQ_MAXLEVEL;
1040        q->now = psched_get_time();
1041
1042        for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1043                q->active[prio] = NULL;
1044
1045        for (h = 0; h < q->clhash.hashsize; h++) {
1046                hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1047                        qdisc_reset(cl->q);
1048
1049                        cl->next_alive = NULL;
1050                        cl->undertime = PSCHED_PASTPERFECT;
1051                        cl->avgidle = cl->maxidle;
1052                        cl->deficit = cl->quantum;
1053                        cl->cpriority = cl->priority;
1054                }
1055        }
1056        sch->q.qlen = 0;
1057}
1058
1059
1060static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1061{
1062        if (lss->change & TCF_CBQ_LSS_FLAGS) {
1063                cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1064                cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1065        }
1066        if (lss->change & TCF_CBQ_LSS_EWMA)
1067                cl->ewma_log = lss->ewma_log;
1068        if (lss->change & TCF_CBQ_LSS_AVPKT)
1069                cl->avpkt = lss->avpkt;
1070        if (lss->change & TCF_CBQ_LSS_MINIDLE)
1071                cl->minidle = -(long)lss->minidle;
1072        if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
1073                cl->maxidle = lss->maxidle;
1074                cl->avgidle = lss->maxidle;
1075        }
1076        if (lss->change & TCF_CBQ_LSS_OFFTIME)
1077                cl->offtime = lss->offtime;
1078        return 0;
1079}
1080
1081static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1082{
1083        q->nclasses[cl->priority]--;
1084        q->quanta[cl->priority] -= cl->weight;
1085        cbq_normalize_quanta(q, cl->priority);
1086}
1087
1088static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1089{
1090        q->nclasses[cl->priority]++;
1091        q->quanta[cl->priority] += cl->weight;
1092        cbq_normalize_quanta(q, cl->priority);
1093}
1094
1095static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1096{
1097        struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1098
1099        if (wrr->allot)
1100                cl->allot = wrr->allot;
1101        if (wrr->weight)
1102                cl->weight = wrr->weight;
1103        if (wrr->priority) {
1104                cl->priority = wrr->priority - 1;
1105                cl->cpriority = cl->priority;
1106                if (cl->priority >= cl->priority2)
1107                        cl->priority2 = TC_CBQ_MAXPRIO - 1;
1108        }
1109
1110        cbq_addprio(q, cl);
1111        return 0;
1112}
1113
1114static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1115{
1116        cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1117        return 0;
1118}
1119
1120static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
1121        [TCA_CBQ_LSSOPT]        = { .len = sizeof(struct tc_cbq_lssopt) },
1122        [TCA_CBQ_WRROPT]        = { .len = sizeof(struct tc_cbq_wrropt) },
1123        [TCA_CBQ_FOPT]          = { .len = sizeof(struct tc_cbq_fopt) },
1124        [TCA_CBQ_OVL_STRATEGY]  = { .len = sizeof(struct tc_cbq_ovl) },
1125        [TCA_CBQ_RATE]          = { .len = sizeof(struct tc_ratespec) },
1126        [TCA_CBQ_RTAB]          = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1127        [TCA_CBQ_POLICE]        = { .len = sizeof(struct tc_cbq_police) },
1128};
1129
1130static int cbq_init(struct Qdisc *sch, struct nlattr *opt,
1131                    struct netlink_ext_ack *extack)
1132{
1133        struct cbq_sched_data *q = qdisc_priv(sch);
1134        struct nlattr *tb[TCA_CBQ_MAX + 1];
1135        struct tc_ratespec *r;
1136        int err;
1137
1138        qdisc_watchdog_init(&q->watchdog, sch);
1139        hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1140        q->delay_timer.function = cbq_undelay;
1141
1142        if (!opt) {
1143                NL_SET_ERR_MSG(extack, "CBQ options are required for this operation");
1144                return -EINVAL;
1145        }
1146
1147        err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt, cbq_policy,
1148                                          extack);
1149        if (err < 0)
1150                return err;
1151
1152        if (!tb[TCA_CBQ_RTAB] || !tb[TCA_CBQ_RATE]) {
1153                NL_SET_ERR_MSG(extack, "Rate specification missing or incomplete");
1154                return -EINVAL;
1155        }
1156
1157        r = nla_data(tb[TCA_CBQ_RATE]);
1158
1159        q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB], extack);
1160        if (!q->link.R_tab)
1161                return -EINVAL;
1162
1163        err = tcf_block_get(&q->link.block, &q->link.filter_list, sch, extack);
1164        if (err)
1165                goto put_rtab;
1166
1167        err = qdisc_class_hash_init(&q->clhash);
1168        if (err < 0)
1169                goto put_block;
1170
1171        q->link.sibling = &q->link;
1172        q->link.common.classid = sch->handle;
1173        q->link.qdisc = sch;
1174        q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1175                                      sch->handle, NULL);
1176        if (!q->link.q)
1177                q->link.q = &noop_qdisc;
1178        else
1179                qdisc_hash_add(q->link.q, true);
1180
1181        q->link.priority = TC_CBQ_MAXPRIO - 1;
1182        q->link.priority2 = TC_CBQ_MAXPRIO - 1;
1183        q->link.cpriority = TC_CBQ_MAXPRIO - 1;
1184        q->link.allot = psched_mtu(qdisc_dev(sch));
1185        q->link.quantum = q->link.allot;
1186        q->link.weight = q->link.R_tab->rate.rate;
1187
1188        q->link.ewma_log = TC_CBQ_DEF_EWMA;
1189        q->link.avpkt = q->link.allot/2;
1190        q->link.minidle = -0x7FFFFFFF;
1191
1192        q->toplevel = TC_CBQ_MAXLEVEL;
1193        q->now = psched_get_time();
1194
1195        cbq_link_class(&q->link);
1196
1197        if (tb[TCA_CBQ_LSSOPT])
1198                cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
1199
1200        cbq_addprio(q, &q->link);
1201        return 0;
1202
1203put_block:
1204        tcf_block_put(q->link.block);
1205
1206put_rtab:
1207        qdisc_put_rtab(q->link.R_tab);
1208        return err;
1209}
1210
1211static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1212{
1213        unsigned char *b = skb_tail_pointer(skb);
1214
1215        if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
1216                goto nla_put_failure;
1217        return skb->len;
1218
1219nla_put_failure:
1220        nlmsg_trim(skb, b);
1221        return -1;
1222}
1223
1224static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1225{
1226        unsigned char *b = skb_tail_pointer(skb);
1227        struct tc_cbq_lssopt opt;
1228
1229        opt.flags = 0;
1230        if (cl->borrow == NULL)
1231                opt.flags |= TCF_CBQ_LSS_BOUNDED;
1232        if (cl->share == NULL)
1233                opt.flags |= TCF_CBQ_LSS_ISOLATED;
1234        opt.ewma_log = cl->ewma_log;
1235        opt.level = cl->level;
1236        opt.avpkt = cl->avpkt;
1237        opt.maxidle = cl->maxidle;
1238        opt.minidle = (u32)(-cl->minidle);
1239        opt.offtime = cl->offtime;
1240        opt.change = ~0;
1241        if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
1242                goto nla_put_failure;
1243        return skb->len;
1244
1245nla_put_failure:
1246        nlmsg_trim(skb, b);
1247        return -1;
1248}
1249
1250static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1251{
1252        unsigned char *b = skb_tail_pointer(skb);
1253        struct tc_cbq_wrropt opt;
1254
1255        memset(&opt, 0, sizeof(opt));
1256        opt.flags = 0;
1257        opt.allot = cl->allot;
1258        opt.priority = cl->priority + 1;
1259        opt.cpriority = cl->cpriority + 1;
1260        opt.weight = cl->weight;
1261        if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
1262                goto nla_put_failure;
1263        return skb->len;
1264
1265nla_put_failure:
1266        nlmsg_trim(skb, b);
1267        return -1;
1268}
1269
1270static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1271{
1272        unsigned char *b = skb_tail_pointer(skb);
1273        struct tc_cbq_fopt opt;
1274
1275        if (cl->split || cl->defmap) {
1276                opt.split = cl->split ? cl->split->common.classid : 0;
1277                opt.defmap = cl->defmap;
1278                opt.defchange = ~0;
1279                if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
1280                        goto nla_put_failure;
1281        }
1282        return skb->len;
1283
1284nla_put_failure:
1285        nlmsg_trim(skb, b);
1286        return -1;
1287}
1288
1289static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1290{
1291        if (cbq_dump_lss(skb, cl) < 0 ||
1292            cbq_dump_rate(skb, cl) < 0 ||
1293            cbq_dump_wrr(skb, cl) < 0 ||
1294            cbq_dump_fopt(skb, cl) < 0)
1295                return -1;
1296        return 0;
1297}
1298
1299static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1300{
1301        struct cbq_sched_data *q = qdisc_priv(sch);
1302        struct nlattr *nest;
1303
1304        nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1305        if (nest == NULL)
1306                goto nla_put_failure;
1307        if (cbq_dump_attr(skb, &q->link) < 0)
1308                goto nla_put_failure;
1309        return nla_nest_end(skb, nest);
1310
1311nla_put_failure:
1312        nla_nest_cancel(skb, nest);
1313        return -1;
1314}
1315
1316static int
1317cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1318{
1319        struct cbq_sched_data *q = qdisc_priv(sch);
1320
1321        q->link.xstats.avgidle = q->link.avgidle;
1322        return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1323}
1324
1325static int
1326cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1327               struct sk_buff *skb, struct tcmsg *tcm)
1328{
1329        struct cbq_class *cl = (struct cbq_class *)arg;
1330        struct nlattr *nest;
1331
1332        if (cl->tparent)
1333                tcm->tcm_parent = cl->tparent->common.classid;
1334        else
1335                tcm->tcm_parent = TC_H_ROOT;
1336        tcm->tcm_handle = cl->common.classid;
1337        tcm->tcm_info = cl->q->handle;
1338
1339        nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1340        if (nest == NULL)
1341                goto nla_put_failure;
1342        if (cbq_dump_attr(skb, cl) < 0)
1343                goto nla_put_failure;
1344        return nla_nest_end(skb, nest);
1345
1346nla_put_failure:
1347        nla_nest_cancel(skb, nest);
1348        return -1;
1349}
1350
1351static int
1352cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1353        struct gnet_dump *d)
1354{
1355        struct cbq_sched_data *q = qdisc_priv(sch);
1356        struct cbq_class *cl = (struct cbq_class *)arg;
1357        __u32 qlen;
1358
1359        cl->xstats.avgidle = cl->avgidle;
1360        cl->xstats.undertime = 0;
1361        qdisc_qstats_qlen_backlog(cl->q, &qlen, &cl->qstats.backlog);
1362
1363        if (cl->undertime != PSCHED_PASTPERFECT)
1364                cl->xstats.undertime = cl->undertime - q->now;
1365
1366        if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
1367                                  d, NULL, &cl->bstats) < 0 ||
1368            gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1369            gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0)
1370                return -1;
1371
1372        return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1373}
1374
1375static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1376                     struct Qdisc **old, struct netlink_ext_ack *extack)
1377{
1378        struct cbq_class *cl = (struct cbq_class *)arg;
1379
1380        if (new == NULL) {
1381                new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1382                                        cl->common.classid, extack);
1383                if (new == NULL)
1384                        return -ENOBUFS;
1385        }
1386
1387        *old = qdisc_replace(sch, new, &cl->q);
1388        return 0;
1389}
1390
1391static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
1392{
1393        struct cbq_class *cl = (struct cbq_class *)arg;
1394
1395        return cl->q;
1396}
1397
1398static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1399{
1400        struct cbq_class *cl = (struct cbq_class *)arg;
1401
1402        cbq_deactivate_class(cl);
1403}
1404
1405static unsigned long cbq_find(struct Qdisc *sch, u32 classid)
1406{
1407        struct cbq_sched_data *q = qdisc_priv(sch);
1408
1409        return (unsigned long)cbq_class_lookup(q, classid);
1410}
1411
1412static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1413{
1414        struct cbq_sched_data *q = qdisc_priv(sch);
1415
1416        WARN_ON(cl->filters);
1417
1418        tcf_block_put(cl->block);
1419        qdisc_put(cl->q);
1420        qdisc_put_rtab(cl->R_tab);
1421        gen_kill_estimator(&cl->rate_est);
1422        if (cl != &q->link)
1423                kfree(cl);
1424}
1425
1426static void cbq_destroy(struct Qdisc *sch)
1427{
1428        struct cbq_sched_data *q = qdisc_priv(sch);
1429        struct hlist_node *next;
1430        struct cbq_class *cl;
1431        unsigned int h;
1432
1433#ifdef CONFIG_NET_CLS_ACT
1434        q->rx_class = NULL;
1435#endif
1436        /*
1437         * Filters must be destroyed first because we don't destroy the
 1438         * classes from root to leaves, which means that filters can still
1439         * be bound to classes which have been destroyed already. --TGR '04
1440         */
1441        for (h = 0; h < q->clhash.hashsize; h++) {
1442                hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1443                        tcf_block_put(cl->block);
1444                        cl->block = NULL;
1445                }
1446        }
1447        for (h = 0; h < q->clhash.hashsize; h++) {
1448                hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
1449                                          common.hnode)
1450                        cbq_destroy_class(sch, cl);
1451        }
1452        qdisc_class_hash_destroy(&q->clhash);
1453}
1454
1455static int
1456cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
1457                 unsigned long *arg, struct netlink_ext_ack *extack)
1458{
1459        int err;
1460        struct cbq_sched_data *q = qdisc_priv(sch);
1461        struct cbq_class *cl = (struct cbq_class *)*arg;
1462        struct nlattr *opt = tca[TCA_OPTIONS];
1463        struct nlattr *tb[TCA_CBQ_MAX + 1];
1464        struct cbq_class *parent;
1465        struct qdisc_rate_table *rtab = NULL;
1466
1467        if (!opt) {
1468                NL_SET_ERR_MSG(extack, "Mandatory qdisc options missing");
1469                return -EINVAL;
1470        }
1471
1472        err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt, cbq_policy,
1473                                          extack);
1474        if (err < 0)
1475                return err;
1476
1477        if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE]) {
1478                NL_SET_ERR_MSG(extack, "Neither overlimit strategy nor policing attributes can be used for changing class params");
1479                return -EOPNOTSUPP;
1480        }
1481
1482        if (cl) {
1483                /* Check parent */
1484                if (parentid) {
1485                        if (cl->tparent &&
1486                            cl->tparent->common.classid != parentid) {
1487                                NL_SET_ERR_MSG(extack, "Invalid parent id");
1488                                return -EINVAL;
1489                        }
1490                        if (!cl->tparent && parentid != TC_H_ROOT) {
1491                                NL_SET_ERR_MSG(extack, "Parent must be root");
1492                                return -EINVAL;
1493                        }
1494                }
1495
1496                if (tb[TCA_CBQ_RATE]) {
1497                        rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
1498                                              tb[TCA_CBQ_RTAB], extack);
1499                        if (rtab == NULL)
1500                                return -EINVAL;
1501                }
1502
1503                if (tca[TCA_RATE]) {
1504                        err = gen_replace_estimator(&cl->bstats, NULL,
1505                                                    &cl->rate_est,
1506                                                    NULL,
1507                                                    qdisc_root_sleeping_running(sch),
1508                                                    tca[TCA_RATE]);
1509                        if (err) {
1510                                NL_SET_ERR_MSG(extack, "Failed to replace specified rate estimator");
1511                                qdisc_put_rtab(rtab);
1512                                return err;
1513                        }
1514                }
1515
1516                /* Change class parameters */
1517                sch_tree_lock(sch);
1518
1519                if (cl->next_alive != NULL)
1520                        cbq_deactivate_class(cl);
1521
1522                if (rtab) {
1523                        qdisc_put_rtab(cl->R_tab);
1524                        cl->R_tab = rtab;
1525                }
1526
1527                if (tb[TCA_CBQ_LSSOPT])
1528                        cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1529
1530                if (tb[TCA_CBQ_WRROPT]) {
1531                        cbq_rmprio(q, cl);
1532                        cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1533                }
1534
1535                if (tb[TCA_CBQ_FOPT])
1536                        cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1537
1538                if (cl->q->q.qlen)
1539                        cbq_activate_class(cl);
1540
1541                sch_tree_unlock(sch);
1542
1543                return 0;
1544        }
1545
1546        if (parentid == TC_H_ROOT)
1547                return -EINVAL;
1548
1549        if (!tb[TCA_CBQ_WRROPT] || !tb[TCA_CBQ_RATE] || !tb[TCA_CBQ_LSSOPT]) {
1550                NL_SET_ERR_MSG(extack, "The following attributes MUST all be specified: WRR, rate and link sharing");
1551                return -EINVAL;
1552        }
1553
1554        rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB],
1555                              extack);
1556        if (rtab == NULL)
1557                return -EINVAL;
1558
1559        if (classid) {
1560                err = -EINVAL;
1561                if (TC_H_MAJ(classid ^ sch->handle) ||
1562                    cbq_class_lookup(q, classid)) {
1563                        NL_SET_ERR_MSG(extack, "Specified classid does not match qdisc handle or is already in use");
1564                        goto failure;
1565                }
1566        } else {
1567                int i;
1568                classid = TC_H_MAKE(sch->handle, 0x8000);
1569
1570                for (i = 0; i < 0x8000; i++) {
1571                        if (++q->hgenerator >= 0x8000)
1572                                q->hgenerator = 1;
1573                        if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1574                                break;
1575                }
1576                err = -ENOSR;
1577                if (i >= 0x8000) {
1578                        NL_SET_ERR_MSG(extack, "Unable to generate classid");
1579                        goto failure;
1580                }
1581                classid = classid|q->hgenerator;
1582        }
1583
1584        parent = &q->link;
1585        if (parentid) {
1586                parent = cbq_class_lookup(q, parentid);
1587                err = -EINVAL;
1588                if (!parent) {
1589                        NL_SET_ERR_MSG(extack, "Failed to find parentid");
1590                        goto failure;
1591                }
1592        }
1593
1594        err = -ENOBUFS;
1595        cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1596        if (cl == NULL)
1597                goto failure;
1598
1599        err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
1600        if (err) {
1601                kfree(cl);
1602                goto failure;
1603        }
1604
1605        if (tca[TCA_RATE]) {
1606                err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
1607                                        NULL,
1608                                        qdisc_root_sleeping_running(sch),
1609                                        tca[TCA_RATE]);
1610                if (err) {
1611                        NL_SET_ERR_MSG(extack, "Couldn't create new estimator");
1612                        tcf_block_put(cl->block);
1613                        kfree(cl);
1614                        goto failure;
1615                }
1616        }
1617
1618        cl->R_tab = rtab;
1619        rtab = NULL;
1620        cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid,
1621                                  NULL);
1622        if (!cl->q)
1623                cl->q = &noop_qdisc;
1624        else
1625                qdisc_hash_add(cl->q, true);
1626
1627        cl->common.classid = classid;
1628        cl->tparent = parent;
1629        cl->qdisc = sch;
1630        cl->allot = parent->allot;
1631        cl->quantum = cl->allot;
1632        cl->weight = cl->R_tab->rate.rate;
1633
1634        sch_tree_lock(sch);
1635        cbq_link_class(cl);
1636        cl->borrow = cl->tparent;
1637        if (cl->tparent != &q->link)
1638                cl->share = cl->tparent;
1639        cbq_adjust_levels(parent);
1640        cl->minidle = -0x7FFFFFFF;
1641        cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1642        cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1643        if (cl->ewma_log == 0)
1644                cl->ewma_log = q->link.ewma_log;
1645        if (cl->maxidle == 0)
1646                cl->maxidle = q->link.maxidle;
1647        if (cl->avpkt == 0)
1648                cl->avpkt = q->link.avpkt;
1649        if (tb[TCA_CBQ_FOPT])
1650                cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1651        sch_tree_unlock(sch);
1652
1653        qdisc_class_hash_grow(sch, &q->clhash);
1654
1655        *arg = (unsigned long)cl;
1656        return 0;
1657
1658failure:
1659        qdisc_put_rtab(rtab);
1660        return err;
1661}
1662
1663static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1664{
1665        struct cbq_sched_data *q = qdisc_priv(sch);
1666        struct cbq_class *cl = (struct cbq_class *)arg;
1667
1668        if (cl->filters || cl->children || cl == &q->link)
1669                return -EBUSY;
1670
1671        sch_tree_lock(sch);
1672
1673        qdisc_purge_queue(cl->q);
1674
1675        if (cl->next_alive)
1676                cbq_deactivate_class(cl);
1677
1678        if (q->tx_borrowed == cl)
1679                q->tx_borrowed = q->tx_class;
1680        if (q->tx_class == cl) {
1681                q->tx_class = NULL;
1682                q->tx_borrowed = NULL;
1683        }
1684#ifdef CONFIG_NET_CLS_ACT
1685        if (q->rx_class == cl)
1686                q->rx_class = NULL;
1687#endif
1688
1689        cbq_unlink_class(cl);
1690        cbq_adjust_levels(cl->tparent);
1691        cl->defmap = 0;
1692        cbq_sync_defmap(cl);
1693
1694        cbq_rmprio(q, cl);
1695        sch_tree_unlock(sch);
1696
1697        cbq_destroy_class(sch, cl);
1698        return 0;
1699}
1700
1701static struct tcf_block *cbq_tcf_block(struct Qdisc *sch, unsigned long arg,
1702                                       struct netlink_ext_ack *extack)
1703{
1704        struct cbq_sched_data *q = qdisc_priv(sch);
1705        struct cbq_class *cl = (struct cbq_class *)arg;
1706
1707        if (cl == NULL)
1708                cl = &q->link;
1709
1710        return cl->block;
1711}
1712
1713static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1714                                     u32 classid)
1715{
1716        struct cbq_sched_data *q = qdisc_priv(sch);
1717        struct cbq_class *p = (struct cbq_class *)parent;
1718        struct cbq_class *cl = cbq_class_lookup(q, classid);
1719
1720        if (cl) {
1721                if (p && p->level <= cl->level)
1722                        return 0;
1723                cl->filters++;
1724                return (unsigned long)cl;
1725        }
1726        return 0;
1727}
1728
1729static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
1730{
1731        struct cbq_class *cl = (struct cbq_class *)arg;
1732
1733        cl->filters--;
1734}
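
/*
 * cbq_bind_filter() is called when a tc filter's classid starts pointing
 * at a class: it rejects bindings to classes that are not strictly below
 * the filter's parent in the hierarchy (p->level <= cl->level) and
 * otherwise takes a reference by bumping cl->filters.  cbq_delete() above
 * returns -EBUSY while cl->filters is non-zero, so a class cannot be
 * removed until every filter referencing it has been deleted or
 * re-pointed, at which point cbq_unbind_filter() has dropped the count.
 */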
1735
1736static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1737{
1738        struct cbq_sched_data *q = qdisc_priv(sch);
1739        struct cbq_class *cl;
1740        unsigned int h;
1741
1742        if (arg->stop)
1743                return;
1744
1745        for (h = 0; h < q->clhash.hashsize; h++) {
1746                hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1747                        if (arg->count < arg->skip) {
1748                                arg->count++;
1749                                continue;
1750                        }
1751                        if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1752                                arg->stop = 1;
1753                                return;
1754                        }
1755                        arg->count++;
1756                }
1757        }
1758}
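
/*
 * Illustrative sketch only (not part of the original file, never
 * compiled): cbq_walk() hands every class to a caller-supplied
 * qdisc_walker callback, honouring w->skip, w->count and w->stop.
 * The structure, callback and helper names below are hypothetical;
 * they just show the shape of a walker that counts classes.
 */
#if 0	/* example only */
struct cbq_count_walker {
	struct qdisc_walker	w;		/* embedded walker: fn/skip/count/stop */
	unsigned int		nr_classes;
};

static int cbq_count_one(struct Qdisc *sch, unsigned long cl,
			 struct qdisc_walker *w)
{
	struct cbq_count_walker *cw = container_of(w, struct cbq_count_walker, w);

	cw->nr_classes++;
	return 0;				/* >= 0 keeps the walk going */
}

static unsigned int cbq_count_classes(struct Qdisc *sch)
{
	struct cbq_count_walker cw = { .w.fn = cbq_count_one };

	sch->ops->cl_ops->walk(sch, &cw.w);
	return cw.nr_classes;
}
#endif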
1759
1760static const struct Qdisc_class_ops cbq_class_ops = {
1761        .graft          =       cbq_graft,
1762        .leaf           =       cbq_leaf,
1763        .qlen_notify    =       cbq_qlen_notify,
1764        .find           =       cbq_find,
1765        .change         =       cbq_change_class,
1766        .delete         =       cbq_delete,
1767        .walk           =       cbq_walk,
1768        .tcf_block      =       cbq_tcf_block,
1769        .bind_tcf       =       cbq_bind_filter,
1770        .unbind_tcf     =       cbq_unbind_filter,
1771        .dump           =       cbq_dump_class,
1772        .dump_stats     =       cbq_dump_class_stats,
1773};
1774
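/*
 * Note that .change below is left NULL: the qdisc-level parameters
 * accepted by cbq_init() cannot be changed afterwards, and all runtime
 * reconfiguration goes through the class operations above
 * (cbq_change_class and friends).
 */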
1775static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
1776        .next           =       NULL,
1777        .cl_ops         =       &cbq_class_ops,
1778        .id             =       "cbq",
1779        .priv_size      =       sizeof(struct cbq_sched_data),
1780        .enqueue        =       cbq_enqueue,
1781        .dequeue        =       cbq_dequeue,
1782        .peek           =       qdisc_peek_dequeued,
1783        .init           =       cbq_init,
1784        .reset          =       cbq_reset,
1785        .destroy        =       cbq_destroy,
1786        .change         =       NULL,
1787        .dump           =       cbq_dump,
1788        .dump_stats     =       cbq_dump_stats,
1789        .owner          =       THIS_MODULE,
1790};
1791
1792static int __init cbq_module_init(void)
1793{
1794        return register_qdisc(&cbq_qdisc_ops);
1795}
1796static void __exit cbq_module_exit(void)
1797{
1798        unregister_qdisc(&cbq_qdisc_ops);
1799}
1800module_init(cbq_module_init)
1801module_exit(cbq_module_exit)
1802MODULE_LICENSE("GPL");
1803