linux/net/sched/sch_cbq.c
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * net/sched/sch_cbq.c  Class-Based Queueing discipline.
   4 *
   5 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
   6 */
   7
   8#include <linux/module.h>
   9#include <linux/slab.h>
  10#include <linux/types.h>
  11#include <linux/kernel.h>
  12#include <linux/string.h>
  13#include <linux/errno.h>
  14#include <linux/skbuff.h>
  15#include <net/netlink.h>
  16#include <net/pkt_sched.h>
  17#include <net/pkt_cls.h>
  18
  19
  20/*      Class-Based Queueing (CBQ) algorithm.
  21        =======================================
  22
  23        Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
  24                 Management Models for Packet Networks",
  25                 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
  26
  27                 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
  28
  29                 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
  30                 Parameters", 1996
  31
  32                 [4] Sally Floyd and Michael Speer, "Experimental Results
  33                 for Class-Based Queueing", 1998, not published.
  34
  35        -----------------------------------------------------------------------
  36
  37        Algorithm skeleton was taken from NS simulator cbq.cc.
   38        Anyone who wants to check this code against the LBL version
   39        should take into account that ONLY the skeleton was borrowed;
   40        the implementation is different. In particular:
  41
   42        --- The WRR algorithm is different. Our version looks more
   43        reasonable (I hope) and works when quanta are allowed to be
   44        less than the MTU, which is always the case when real-time
   45        classes have small rates. Note that the statement in [3] is
   46        incomplete: the delay can actually be estimated even if the
   47        class's per-round allotment is less than the MTU. Namely, if the
   48        per-round allotment is W*r_i, and r_1+...+r_k = r < 1, then
  49
  50        delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
  51
  52        In the worst case we have IntServ estimate with D = W*r+k*MTU
  53        and C = MTU*r. The proof (if correct at all) is trivial.
  54
  55
  56        --- It seems that cbq-2.0 is not very accurate. At least, I cannot
  57        interpret some places, which look like wrong translations
  58        from NS. Anyone is advised to find these differences
   59        and explain to me why I am wrong 8).
  60
   61        --- Linux has no EOI event, so we cannot measure the true class
   62        idle time. The workaround is to treat the next dequeue event
   63        as a sign that the previous packet has finished. This is wrong
   64        because of internal device queueing, but on a permanently loaded
   65        link it holds. Moreover, combined with the clock integrator,
   66        this scheme looks very close to an ideal solution.  */
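     /* A rough numeric illustration of the bound above (illustrative
        numbers, not from the papers): take MTU = 1500 bytes, W = 3000 bytes,
        and k = 3 classes with r_1 = r_2 = r_3 = 0.1, so r = 0.3 and each
        per-round allotment W*r_i = 300 bytes < MTU. Then

        delay_i <= ([1500/300]*900 + 900 + 3*1500)/B = 9900/B,

        i.e. about 0.8 ms on a 100 Mbit/s (12.5 Mbyte/s) link.  */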
  67
  68struct cbq_sched_data;
  69
  70
  71struct cbq_class {
  72        struct Qdisc_class_common common;
  73        struct cbq_class        *next_alive;    /* next class with backlog in this priority band */
  74
  75/* Parameters */
  76        unsigned char           priority;       /* class priority */
  77        unsigned char           priority2;      /* priority to be used after overlimit */
  78        unsigned char           ewma_log;       /* time constant for idle time calculation */
  79
  80        u32                     defmap;
  81
  82        /* Link-sharing scheduler parameters */
  83        long                    maxidle;        /* Class parameters: see below. */
  84        long                    offtime;
  85        long                    minidle;
  86        u32                     avpkt;
  87        struct qdisc_rate_table *R_tab;
  88
  89        /* General scheduler (WRR) parameters */
  90        long                    allot;
  91        long                    quantum;        /* Allotment per WRR round */
  92        long                    weight;         /* Relative allotment: see below */
  93
  94        struct Qdisc            *qdisc;         /* Ptr to CBQ discipline */
  95        struct cbq_class        *split;         /* Ptr to split node */
  96        struct cbq_class        *share;         /* Ptr to LS parent in the class tree */
  97        struct cbq_class        *tparent;       /* Ptr to tree parent in the class tree */
  98        struct cbq_class        *borrow;        /* NULL if class is bandwidth limited;
  99                                                   parent otherwise */
 100        struct cbq_class        *sibling;       /* Sibling chain */
 101        struct cbq_class        *children;      /* Pointer to children chain */
 102
 103        struct Qdisc            *q;             /* Elementary queueing discipline */
 104
 105
 106/* Variables */
 107        unsigned char           cpriority;      /* Effective priority */
 108        unsigned char           delayed;
 109        unsigned char           level;          /* level of the class in hierarchy:
 110                                                   0 for leaf classes, and maximal
 111                                                   level of children + 1 for nodes.
 112                                                 */
 113
 114        psched_time_t           last;           /* Last end of service */
 115        psched_time_t           undertime;
 116        long                    avgidle;
 117        long                    deficit;        /* Saved deficit for WRR */
 118        psched_time_t           penalized;
 119        struct gnet_stats_basic_packed bstats;
 120        struct gnet_stats_queue qstats;
 121        struct net_rate_estimator __rcu *rate_est;
 122        struct tc_cbq_xstats    xstats;
 123
 124        struct tcf_proto __rcu  *filter_list;
 125        struct tcf_block        *block;
 126
 127        int                     filters;
 128
 129        struct cbq_class        *defaults[TC_PRIO_MAX + 1];
 130};
 131
 132struct cbq_sched_data {
 133        struct Qdisc_class_hash clhash;                 /* Hash table of all classes */
 134        int                     nclasses[TC_CBQ_MAXPRIO + 1];
 135        unsigned int            quanta[TC_CBQ_MAXPRIO + 1];
 136
 137        struct cbq_class        link;
 138
 139        unsigned int            activemask;
 140        struct cbq_class        *active[TC_CBQ_MAXPRIO + 1];    /* List of all classes
 141                                                                   with backlog */
 142
 143#ifdef CONFIG_NET_CLS_ACT
 144        struct cbq_class        *rx_class;
 145#endif
 146        struct cbq_class        *tx_class;
 147        struct cbq_class        *tx_borrowed;
 148        int                     tx_len;
 149        psched_time_t           now;            /* Cached timestamp */
 150        unsigned int            pmask;
 151
 152        struct hrtimer          delay_timer;
 153        struct qdisc_watchdog   watchdog;       /* Watchdog timer,
 154                                                   started when CBQ has
 155                                                   backlog, but cannot
 156                                                   transmit just now */
 157        psched_tdiff_t          wd_expires;
 158        int                     toplevel;
 159        u32                     hgenerator;
 160};
 161
 162
 163#define L2T(cl, len)    qdisc_l2t((cl)->R_tab, len)
 164
 165static inline struct cbq_class *
 166cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
 167{
 168        struct Qdisc_class_common *clc;
 169
 170        clc = qdisc_class_find(&q->clhash, classid);
 171        if (clc == NULL)
 172                return NULL;
 173        return container_of(clc, struct cbq_class, common);
 174}
 175
 176#ifdef CONFIG_NET_CLS_ACT
 177
 178static struct cbq_class *
 179cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
 180{
 181        struct cbq_class *cl;
 182
 183        for (cl = this->tparent; cl; cl = cl->tparent) {
 184                struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
 185
 186                if (new != NULL && new != this)
 187                        return new;
 188        }
 189        return NULL;
 190}
 191
 192#endif
 193
 194/* Classify packet. The procedure is pretty complicated, but
 195 * it allows us to combine link sharing and priority scheduling
 196 * transparently.
 197 *
  198 * Namely, you can put link sharing rules (e.g. route based) at the root of
  199 * CBQ, so that classification resolves to split nodes. Then packets are classified
 200 * by logical priority, or a more specific classifier may be attached
 201 * to the split node.
 202 */
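     /* Worked reading of the code below (illustrative): if the filter at a
      * split node returns no class, the low bits of res.classid (or, in the
      * final fallback, of skb->priority) index the split node's defaults[]
      * table, with defaults[TC_PRIO_BESTEFFORT] as the last resort.
      */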
 203
 204static struct cbq_class *
 205cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 206{
 207        struct cbq_sched_data *q = qdisc_priv(sch);
 208        struct cbq_class *head = &q->link;
 209        struct cbq_class **defmap;
 210        struct cbq_class *cl = NULL;
 211        u32 prio = skb->priority;
 212        struct tcf_proto *fl;
 213        struct tcf_result res;
 214
 215        /*
 216         *  Step 1. If skb->priority points to one of our classes, use it.
 217         */
 218        if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
 219            (cl = cbq_class_lookup(q, prio)) != NULL)
 220                return cl;
 221
 222        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 223        for (;;) {
 224                int result = 0;
 225                defmap = head->defaults;
 226
 227                fl = rcu_dereference_bh(head->filter_list);
 228                /*
 229                 * Step 2+n. Apply classifier.
 230                 */
 231                result = tcf_classify(skb, fl, &res, true);
 232                if (!fl || result < 0)
 233                        goto fallback;
 234
 235                cl = (void *)res.class;
 236                if (!cl) {
 237                        if (TC_H_MAJ(res.classid))
 238                                cl = cbq_class_lookup(q, res.classid);
 239                        else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
 240                                cl = defmap[TC_PRIO_BESTEFFORT];
 241
 242                        if (cl == NULL)
 243                                goto fallback;
 244                }
 245                if (cl->level >= head->level)
 246                        goto fallback;
 247#ifdef CONFIG_NET_CLS_ACT
 248                switch (result) {
 249                case TC_ACT_QUEUED:
 250                case TC_ACT_STOLEN:
 251                case TC_ACT_TRAP:
 252                        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
 253                        fallthrough;
 254                case TC_ACT_SHOT:
 255                        return NULL;
 256                case TC_ACT_RECLASSIFY:
 257                        return cbq_reclassify(skb, cl);
 258                }
 259#endif
 260                if (cl->level == 0)
 261                        return cl;
 262
 263                /*
 264                 * Step 3+n. If classifier selected a link sharing class,
 265                 *         apply agency specific classifier.
 266                 *         Repeat this procedure until we hit a leaf node.
 267                 */
 268                head = cl;
 269        }
 270
 271fallback:
 272        cl = head;
 273
 274        /*
 275         * Step 4. No success...
 276         */
 277        if (TC_H_MAJ(prio) == 0 &&
 278            !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
 279            !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
 280                return head;
 281
 282        return cl;
 283}
 284
 285/*
  286 * A packet has just been enqueued to an empty class.
  287 * cbq_activate_class adds the class to the tail of the active class
  288 * list of its priority band.
 289 */
 290
 291static inline void cbq_activate_class(struct cbq_class *cl)
 292{
 293        struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 294        int prio = cl->cpriority;
 295        struct cbq_class *cl_tail;
 296
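             /* q->active[prio] is the tail of a circular list of backlogged
              * classes in this band (or NULL).  The new class is linked in
              * after the old tail and becomes the new tail.
              */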
 297        cl_tail = q->active[prio];
 298        q->active[prio] = cl;
 299
 300        if (cl_tail != NULL) {
 301                cl->next_alive = cl_tail->next_alive;
 302                cl_tail->next_alive = cl;
 303        } else {
 304                cl->next_alive = cl;
 305                q->activemask |= (1<<prio);
 306        }
 307}
 308
 309/*
 310 * Unlink class from active chain.
 311 * Note that this same procedure is done directly in cbq_dequeue*
 312 * during round-robin procedure.
 313 */
 314
 315static void cbq_deactivate_class(struct cbq_class *this)
 316{
 317        struct cbq_sched_data *q = qdisc_priv(this->qdisc);
 318        int prio = this->cpriority;
 319        struct cbq_class *cl;
 320        struct cbq_class *cl_prev = q->active[prio];
 321
 322        do {
 323                cl = cl_prev->next_alive;
 324                if (cl == this) {
 325                        cl_prev->next_alive = cl->next_alive;
 326                        cl->next_alive = NULL;
 327
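                             /* If we unlinked the tail, the previous element
                              * becomes the new tail; if that is still 'cl',
                              * the band held only this one class, so the
                              * band dies.
                              */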
 328                        if (cl == q->active[prio]) {
 329                                q->active[prio] = cl_prev;
 330                                if (cl == q->active[prio]) {
 331                                        q->active[prio] = NULL;
 332                                        q->activemask &= ~(1<<prio);
 333                                        return;
 334                                }
 335                        }
 336                        return;
 337                }
 338        } while ((cl_prev = cl) != q->active[prio]);
 339}
 340
 341static void
 342cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
 343{
 344        int toplevel = q->toplevel;
 345
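             /* A packet was queued to cl: if cl, or the first borrow
              * ancestor below the current cutoff, is underlimit (its
              * undertime lies in the past), record that level as the new
              * top-level cutoff; cbq_under_limit() will then refuse to
              * borrow from ancestors above q->toplevel.
              */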
 346        if (toplevel > cl->level) {
 347                psched_time_t now = psched_get_time();
 348
 349                do {
 350                        if (cl->undertime < now) {
 351                                q->toplevel = cl->level;
 352                                return;
 353                        }
 354                } while ((cl = cl->borrow) != NULL && toplevel > cl->level);
 355        }
 356}
 357
 358static int
 359cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 360            struct sk_buff **to_free)
 361{
 362        struct cbq_sched_data *q = qdisc_priv(sch);
 363        int ret;
 364        struct cbq_class *cl = cbq_classify(skb, sch, &ret);
 365
 366#ifdef CONFIG_NET_CLS_ACT
 367        q->rx_class = cl;
 368#endif
 369        if (cl == NULL) {
 370                if (ret & __NET_XMIT_BYPASS)
 371                        qdisc_qstats_drop(sch);
 372                __qdisc_drop(skb, to_free);
 373                return ret;
 374        }
 375
 376        ret = qdisc_enqueue(skb, cl->q, to_free);
 377        if (ret == NET_XMIT_SUCCESS) {
 378                sch->q.qlen++;
 379                cbq_mark_toplevel(q, cl);
 380                if (!cl->next_alive)
 381                        cbq_activate_class(cl);
 382                return ret;
 383        }
 384
 385        if (net_xmit_drop_count(ret)) {
 386                qdisc_qstats_drop(sch);
 387                cbq_mark_toplevel(q, cl);
 388                cl->qstats.drops++;
 389        }
 390        return ret;
 391}
 392
 393/* Overlimit action: penalize leaf class by adding offtime */
 394static void cbq_overlimit(struct cbq_class *cl)
 395{
 396        struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 397        psched_tdiff_t delay = cl->undertime - q->now;
 398
 399        if (!cl->delayed) {
 400                delay += cl->offtime;
 401
  402                /*
  403                 * The class goes to sleep, so it will have no chance to
  404                 * work off its negative avgidle; let's forgive it 8)  (the
  405                 * term subtracted below is (1-W)*(-avgidle); see cbq_update()).
  406                 * BTW cbq-2.0 has a bug in this place; apparently they
  407                 * forgot to shift by cl->ewma_log.
  408                 */
 409                if (cl->avgidle < 0)
 410                        delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
 411                if (cl->avgidle < cl->minidle)
 412                        cl->avgidle = cl->minidle;
 413                if (delay <= 0)
 414                        delay = 1;
 415                cl->undertime = q->now + delay;
 416
 417                cl->xstats.overactions++;
 418                cl->delayed = 1;
 419        }
 420        if (q->wd_expires == 0 || q->wd_expires > delay)
 421                q->wd_expires = delay;
 422
 423        /* Dirty work! We must schedule wakeups based on
 424         * real available rate, rather than leaf rate,
 425         * which may be tiny (even zero).
 426         */
 427        if (q->toplevel == TC_CBQ_MAXLEVEL) {
 428                struct cbq_class *b;
 429                psched_tdiff_t base_delay = q->wd_expires;
 430
 431                for (b = cl->borrow; b; b = b->borrow) {
 432                        delay = b->undertime - q->now;
 433                        if (delay < base_delay) {
 434                                if (delay <= 0)
 435                                        delay = 1;
 436                                base_delay = delay;
 437                        }
 438                }
 439
 440                q->wd_expires = base_delay;
 441        }
 442}
 443
 444static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
 445                                       psched_time_t now)
 446{
 447        struct cbq_class *cl;
 448        struct cbq_class *cl_prev = q->active[prio];
 449        psched_time_t sched = now;
 450
 451        if (cl_prev == NULL)
 452                return 0;
 453
 454        do {
 455                cl = cl_prev->next_alive;
 456                if (now - cl->penalized > 0) {
 457                        cl_prev->next_alive = cl->next_alive;
 458                        cl->next_alive = NULL;
 459                        cl->cpriority = cl->priority;
 460                        cl->delayed = 0;
 461                        cbq_activate_class(cl);
 462
 463                        if (cl == q->active[prio]) {
 464                                q->active[prio] = cl_prev;
 465                                if (cl == q->active[prio]) {
 466                                        q->active[prio] = NULL;
 467                                        return 0;
 468                                }
 469                        }
 470
 471                        cl = cl_prev->next_alive;
 472                } else if (sched - cl->penalized > 0)
 473                        sched = cl->penalized;
 474        } while ((cl_prev = cl) != q->active[prio]);
 475
 476        return sched - now;
 477}
 478
 479static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
 480{
 481        struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
 482                                                delay_timer);
 483        struct Qdisc *sch = q->watchdog.qdisc;
 484        psched_time_t now;
 485        psched_tdiff_t delay = 0;
 486        unsigned int pmask;
 487
 488        now = psched_get_time();
 489
 490        pmask = q->pmask;
 491        q->pmask = 0;
 492
 493        while (pmask) {
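                     /* ffz(~pmask) is the lowest set bit of pmask, i.e. the
                      * best (numerically smallest) priority that still has
                      * penalized classes.
                      */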
 494                int prio = ffz(~pmask);
 495                psched_tdiff_t tmp;
 496
 497                pmask &= ~(1<<prio);
 498
 499                tmp = cbq_undelay_prio(q, prio, now);
 500                if (tmp > 0) {
 501                        q->pmask |= 1<<prio;
 502                        if (tmp < delay || delay == 0)
 503                                delay = tmp;
 504                }
 505        }
 506
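             /* Some priorities still have penalized classes (they were put
              * back into q->pmask above): re-arm the one-shot timer for the
              * earliest remaining expiry.
              */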
 507        if (delay) {
 508                ktime_t time;
 509
 510                time = 0;
 511                time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
 512                hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
 513        }
 514
 515        __netif_schedule(qdisc_root(sch));
 516        return HRTIMER_NORESTART;
 517}
 518
  519/*
  520 * This is a mission-critical procedure.
  521 *
  522 * We "regenerate" the toplevel cutoff if the transmitting class
  523 * has backlog and is not regulated. This is not part of the
  524 * original CBQ description, but looks more reasonable.
  525 * Probably it is wrong. This question needs further investigation.
  526 */
 527
 528static inline void
 529cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
 530                    struct cbq_class *borrowed)
 531{
 532        if (cl && q->toplevel >= borrowed->level) {
 533                if (cl->q->q.qlen > 1) {
 534                        do {
 535                                if (borrowed->undertime == PSCHED_PASTPERFECT) {
 536                                        q->toplevel = borrowed->level;
 537                                        return;
 538                                }
 539                        } while ((borrowed = borrowed->borrow) != NULL);
 540                }
 541#if 0
 542        /* It is not necessary now. Uncommenting it
 543           will save CPU cycles, but decrease fairness.
 544         */
 545                q->toplevel = TC_CBQ_MAXLEVEL;
 546#endif
 547        }
 548}
 549
 550static void
 551cbq_update(struct cbq_sched_data *q)
 552{
 553        struct cbq_class *this = q->tx_class;
 554        struct cbq_class *cl = this;
 555        int len = q->tx_len;
 556        psched_time_t now;
 557
 558        q->tx_class = NULL;
  559        /* Time integrator. We calculate the EOS (end of service) time
  560         * by adding the expected packet transmission time.
  561         */
 562        now = q->now + L2T(&q->link, len);
 563
 564        for ( ; cl; cl = cl->share) {
 565                long avgidle = cl->avgidle;
 566                long idle;
 567
 568                cl->bstats.packets++;
 569                cl->bstats.bytes += len;
 570
 571                /*
 572                 * (now - last) is total time between packet right edges.
 573                 * (last_pktlen/rate) is "virtual" busy time, so that
 574                 *
 575                 *      idle = (now - last) - last_pktlen/rate
 576                 */
 577
 578                idle = now - cl->last;
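                     /* The class has been idle for a very long time; treat
                      * it as fully rested: avgidle is pinned at maxidle.
                      */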
 579                if ((unsigned long)idle > 128*1024*1024) {
 580                        avgidle = cl->maxidle;
 581                } else {
 582                        idle -= L2T(cl, len);
 583
 584                /* true_avgidle := (1-W)*true_avgidle + W*idle,
 585                 * where W=2^{-ewma_log}. But cl->avgidle is scaled:
 586                 * cl->avgidle == true_avgidle/W,
 587                 * hence:
 588                 */
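                     /* Expanding:  avgidle' = true_avgidle'/W
                      *                      = ((1-W)*true_avgidle + W*idle)/W
                      *                      = avgidle + idle - W*avgidle,
                      * and W*avgidle is avgidle >> ewma_log:
                      */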
 589                        avgidle += idle - (avgidle>>cl->ewma_log);
 590                }
 591
 592                if (avgidle <= 0) {
 593                        /* Overlimit or at-limit */
 594
 595                        if (avgidle < cl->minidle)
 596                                avgidle = cl->minidle;
 597
 598                        cl->avgidle = avgidle;
 599
  600                        /* Calculate the expected time when this class
  601                         * will be allowed to send again.
  602                         * That happens when:
  603                         * (1-W)*true_avgidle + W*idle = 0, i.e.
  604                         * idle = (1/W - 1)*(-true_avgidle)
  605                         * or, since cl->avgidle == true_avgidle/W,
  606                         * idle = (1 - W)*(-cl->avgidle);
  607                         */
 608                        idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
 609
  610                        /*
  611                         * That is not all.
  612                         * To maintain the rate allocated to the class,
  613                         * we add to undertime the virtual clock time
  614                         * needed to complete the transmitted packet
  615                         * (len/phys_bandwidth has already elapsed by
  616                         * the moment cbq_update is called).
  617                         */
 618
 619                        idle -= L2T(&q->link, len);
 620                        idle += L2T(cl, len);
 621
 622                        cl->undertime = now + idle;
 623                } else {
 624                        /* Underlimit */
 625
 626                        cl->undertime = PSCHED_PASTPERFECT;
 627                        if (avgidle > cl->maxidle)
 628                                cl->avgidle = cl->maxidle;
 629                        else
 630                                cl->avgidle = avgidle;
 631                }
 632                if ((s64)(now - cl->last) > 0)
 633                        cl->last = now;
 634        }
 635
 636        cbq_update_toplevel(q, this, q->tx_borrowed);
 637}
 638
 639static inline struct cbq_class *
 640cbq_under_limit(struct cbq_class *cl)
 641{
 642        struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 643        struct cbq_class *this_cl = cl;
 644
 645        if (cl->tparent == NULL)
 646                return cl;
 647
 648        if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
 649                cl->delayed = 0;
 650                return cl;
 651        }
 652
 653        do {
  654                /* This is a very suspicious place. Currently the overlimit
  655                 * action is generated for unbounded classes
  656                 * only if the link is completely congested.
  657                 * Though this agrees with the ancestor-only paradigm,
  658                 * it looks very stupid. In particular,
  659                 * it means that this chunk of code will either
  660                 * never be called or will strongly amplify
  661                 * burstiness. Dangerous, silly, and yet
  662                 * no other solution exists.
  663                 */
 664                cl = cl->borrow;
 665                if (!cl) {
 666                        this_cl->qstats.overlimits++;
 667                        cbq_overlimit(this_cl);
 668                        return NULL;
 669                }
 670                if (cl->level > q->toplevel)
 671                        return NULL;
 672        } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
 673
 674        cl->delayed = 0;
 675        return cl;
 676}
 677
 678static inline struct sk_buff *
 679cbq_dequeue_prio(struct Qdisc *sch, int prio)
 680{
 681        struct cbq_sched_data *q = qdisc_priv(sch);
 682        struct cbq_class *cl_tail, *cl_prev, *cl;
 683        struct sk_buff *skb;
 684        int deficit;
 685
 686        cl_tail = cl_prev = q->active[prio];
 687        cl = cl_prev->next_alive;
 688
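             /* Outer loop: while some class exhausted its allotment and was
              * refilled during the round ('deficit' set), run another WRR
              * round over this band's circular list.
              */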
 689        do {
 690                deficit = 0;
 691
 692                /* Start round */
 693                do {
 694                        struct cbq_class *borrow = cl;
 695
 696                        if (cl->q->q.qlen &&
 697                            (borrow = cbq_under_limit(cl)) == NULL)
 698                                goto skip_class;
 699
 700                        if (cl->deficit <= 0) {
  701                                /* Class exhausted its allotment for
 702                                 * this round. Switch to the next one.
 703                                 */
 704                                deficit = 1;
 705                                cl->deficit += cl->quantum;
 706                                goto next_class;
 707                        }
 708
 709                        skb = cl->q->dequeue(cl->q);
 710
  711                        /* The class did not give us any skb :-(
  712                         * This can happen even if cl->q->q.qlen != 0,
  713                         * e.g. if cl->q == "tbf".
  714                         */
 715                        if (skb == NULL)
 716                                goto skip_class;
 717
 718                        cl->deficit -= qdisc_pkt_len(skb);
 719                        q->tx_class = cl;
 720                        q->tx_borrowed = borrow;
 721                        if (borrow != cl) {
 722#ifndef CBQ_XSTATS_BORROWS_BYTES
 723                                borrow->xstats.borrows++;
 724                                cl->xstats.borrows++;
 725#else
 726                                borrow->xstats.borrows += qdisc_pkt_len(skb);
 727                                cl->xstats.borrows += qdisc_pkt_len(skb);
 728#endif
 729                        }
 730                        q->tx_len = qdisc_pkt_len(skb);
 731
 732                        if (cl->deficit <= 0) {
 733                                q->active[prio] = cl;
 734                                cl = cl->next_alive;
 735                                cl->deficit += cl->quantum;
 736                        }
 737                        return skb;
 738
 739skip_class:
 740                        if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
 741                                /* Class is empty or penalized.
 742                                 * Unlink it from active chain.
 743                                 */
 744                                cl_prev->next_alive = cl->next_alive;
 745                                cl->next_alive = NULL;
 746
 747                                /* Did cl_tail point to it? */
 748                                if (cl == cl_tail) {
 749                                        /* Repair it! */
 750                                        cl_tail = cl_prev;
 751
 752                                        /* Was it the last class in this band? */
 753                                        if (cl == cl_tail) {
 754                                                /* Kill the band! */
 755                                                q->active[prio] = NULL;
 756                                                q->activemask &= ~(1<<prio);
 757                                                if (cl->q->q.qlen)
 758                                                        cbq_activate_class(cl);
 759                                                return NULL;
 760                                        }
 761
 762                                        q->active[prio] = cl_tail;
 763                                }
 764                                if (cl->q->q.qlen)
 765                                        cbq_activate_class(cl);
 766
 767                                cl = cl_prev;
 768                        }
 769
 770next_class:
 771                        cl_prev = cl;
 772                        cl = cl->next_alive;
 773                } while (cl_prev != cl_tail);
 774        } while (deficit);
 775
 776        q->active[prio] = cl_prev;
 777
 778        return NULL;
 779}
 780
 781static inline struct sk_buff *
 782cbq_dequeue_1(struct Qdisc *sch)
 783{
 784        struct cbq_sched_data *q = qdisc_priv(sch);
 785        struct sk_buff *skb;
 786        unsigned int activemask;
 787
 788        activemask = q->activemask & 0xFF;
 789        while (activemask) {
 790                int prio = ffz(~activemask);
 791                activemask &= ~(1<<prio);
 792                skb = cbq_dequeue_prio(sch, prio);
 793                if (skb)
 794                        return skb;
 795        }
 796        return NULL;
 797}
 798
 799static struct sk_buff *
 800cbq_dequeue(struct Qdisc *sch)
 801{
 802        struct sk_buff *skb;
 803        struct cbq_sched_data *q = qdisc_priv(sch);
 804        psched_time_t now;
 805
 806        now = psched_get_time();
 807
 808        if (q->tx_class)
 809                cbq_update(q);
 810
 811        q->now = now;
 812
 813        for (;;) {
 814                q->wd_expires = 0;
 815
 816                skb = cbq_dequeue_1(sch);
 817                if (skb) {
 818                        qdisc_bstats_update(sch, skb);
 819                        sch->q.qlen--;
 820                        return skb;
 821                }
 822
  823                /* All the classes are overlimit.
  824                 *
  825                 * This is possible if:
  826                 *
  827                 * 1. The scheduler is empty.
  828                 * 2. The toplevel cutoff inhibited borrowing.
  829                 * 3. The root class is overlimit.
  830                 *
  831                 * Reset the 2nd and 3rd conditions and retry.
  832                 *
  833                 * Note that NS and cbq-2.0 are buggy here: peeking
  834                 * an arbitrary class is appropriate for ancestor-only
  835                 * sharing, but not for the toplevel algorithm.
  836                 *
  837                 * Our version is better but slower, because it requires
  838                 * two passes; this is unavoidable with top-level sharing.
  839                 */
 840
 841                if (q->toplevel == TC_CBQ_MAXLEVEL &&
 842                    q->link.undertime == PSCHED_PASTPERFECT)
 843                        break;
 844
 845                q->toplevel = TC_CBQ_MAXLEVEL;
 846                q->link.undertime = PSCHED_PASTPERFECT;
 847        }
 848
  849        /* No packets in the scheduler, or nobody wants to give them to us :-(
  850         * Sigh... start the watchdog timer in the latter case.
  851         */
 852
 853        if (sch->q.qlen) {
 854                qdisc_qstats_overlimit(sch);
 855                if (q->wd_expires)
 856                        qdisc_watchdog_schedule(&q->watchdog,
 857                                                now + q->wd_expires);
 858        }
 859        return NULL;
 860}
 861
 862/* CBQ class maintenance routines */
 863
 864static void cbq_adjust_levels(struct cbq_class *this)
 865{
 866        if (this == NULL)
 867                return;
 868
 869        do {
 870                int level = 0;
 871                struct cbq_class *cl;
 872
 873                cl = this->children;
 874                if (cl) {
 875                        do {
 876                                if (cl->level > level)
 877                                        level = cl->level;
 878                        } while ((cl = cl->sibling) != this->children);
 879                }
 880                this->level = level + 1;
 881        } while ((this = this->tparent) != NULL);
 882}
 883
 884static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
 885{
 886        struct cbq_class *cl;
 887        unsigned int h;
 888
 889        if (q->quanta[prio] == 0)
 890                return;
 891
 892        for (h = 0; h < q->clhash.hashsize; h++) {
 893                hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
  894                        /* BUGGGG... Beware! This expression suffers from
  895                         * arithmetic overflows!
  896                         */
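                             /* quantum = allot * weight / (average weight in
                              * the band): q->quanta[prio] is the sum of the
                              * weights, so weight*nclasses/quanta is the
                              * weight relative to the band average.
                              * E.g. (illustrative): two classes with weights
                              * 1 and 3 and allot 1514 get quanta 757 and 2271.
                              */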
 897                        if (cl->priority == prio) {
 898                                cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
 899                                        q->quanta[prio];
 900                        }
 901                        if (cl->quantum <= 0 ||
 902                            cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
 903                                pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
 904                                        cl->common.classid, cl->quantum);
 905                                cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
 906                        }
 907                }
 908        }
 909}
 910
 911static void cbq_sync_defmap(struct cbq_class *cl)
 912{
 913        struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 914        struct cbq_class *split = cl->split;
 915        unsigned int h;
 916        int i;
 917
 918        if (split == NULL)
 919                return;
 920
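             /* Rebuild split->defaults[]: drop slots that point at cl for
              * priorities cl no longer claims in its defmap, then for each
              * empty slot pick the lowest-level class attached to this
              * split node whose defmap has that bit set.
              */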
 921        for (i = 0; i <= TC_PRIO_MAX; i++) {
 922                if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
 923                        split->defaults[i] = NULL;
 924        }
 925
 926        for (i = 0; i <= TC_PRIO_MAX; i++) {
 927                int level = split->level;
 928
 929                if (split->defaults[i])
 930                        continue;
 931
 932                for (h = 0; h < q->clhash.hashsize; h++) {
 933                        struct cbq_class *c;
 934
 935                        hlist_for_each_entry(c, &q->clhash.hash[h],
 936                                             common.hnode) {
 937                                if (c->split == split && c->level < level &&
 938                                    c->defmap & (1<<i)) {
 939                                        split->defaults[i] = c;
 940                                        level = c->level;
 941                                }
 942                        }
 943                }
 944        }
 945}
 946
 947static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
 948{
 949        struct cbq_class *split = NULL;
 950
 951        if (splitid == 0) {
 952                split = cl->split;
 953                if (!split)
 954                        return;
 955                splitid = split->common.classid;
 956        }
 957
 958        if (split == NULL || split->common.classid != splitid) {
 959                for (split = cl->tparent; split; split = split->tparent)
 960                        if (split->common.classid == splitid)
 961                                break;
 962        }
 963
 964        if (split == NULL)
 965                return;
 966
 967        if (cl->split != split) {
 968                cl->defmap = 0;
 969                cbq_sync_defmap(cl);
 970                cl->split = split;
 971                cl->defmap = def & mask;
 972        } else
 973                cl->defmap = (cl->defmap & ~mask) | (def & mask);
 974
 975        cbq_sync_defmap(cl);
 976}
 977
 978static void cbq_unlink_class(struct cbq_class *this)
 979{
 980        struct cbq_class *cl, **clp;
 981        struct cbq_sched_data *q = qdisc_priv(this->qdisc);
 982
 983        qdisc_class_hash_remove(&q->clhash, &this->common);
 984
 985        if (this->tparent) {
 986                clp = &this->sibling;
 987                cl = *clp;
 988                do {
 989                        if (cl == this) {
 990                                *clp = cl->sibling;
 991                                break;
 992                        }
 993                        clp = &cl->sibling;
 994                } while ((cl = *clp) != this->sibling);
 995
 996                if (this->tparent->children == this) {
 997                        this->tparent->children = this->sibling;
 998                        if (this->sibling == this)
 999                                this->tparent->children = NULL;
1000                }
1001        } else {
1002                WARN_ON(this->sibling != this);
1003        }
1004}
1005
1006static void cbq_link_class(struct cbq_class *this)
1007{
1008        struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1009        struct cbq_class *parent = this->tparent;
1010
1011        this->sibling = this;
1012        qdisc_class_hash_insert(&q->clhash, &this->common);
1013
1014        if (parent == NULL)
1015                return;
1016
1017        if (parent->children == NULL) {
1018                parent->children = this;
1019        } else {
1020                this->sibling = parent->children->sibling;
1021                parent->children->sibling = this;
1022        }
1023}
1024
1025static void
1026cbq_reset(struct Qdisc *sch)
1027{
1028        struct cbq_sched_data *q = qdisc_priv(sch);
1029        struct cbq_class *cl;
1030        int prio;
1031        unsigned int h;
1032
1033        q->activemask = 0;
1034        q->pmask = 0;
1035        q->tx_class = NULL;
1036        q->tx_borrowed = NULL;
1037        qdisc_watchdog_cancel(&q->watchdog);
1038        hrtimer_cancel(&q->delay_timer);
1039        q->toplevel = TC_CBQ_MAXLEVEL;
1040        q->now = psched_get_time();
1041
1042        for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1043                q->active[prio] = NULL;
1044
1045        for (h = 0; h < q->clhash.hashsize; h++) {
1046                hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1047                        qdisc_reset(cl->q);
1048
1049                        cl->next_alive = NULL;
1050                        cl->undertime = PSCHED_PASTPERFECT;
1051                        cl->avgidle = cl->maxidle;
1052                        cl->deficit = cl->quantum;
1053                        cl->cpriority = cl->priority;
1054                }
1055        }
1056        sch->q.qlen = 0;
1057}
1058
1059
1060static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1061{
1062        if (lss->change & TCF_CBQ_LSS_FLAGS) {
1063                cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1064                cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1065        }
1066        if (lss->change & TCF_CBQ_LSS_EWMA)
1067                cl->ewma_log = lss->ewma_log;
1068        if (lss->change & TCF_CBQ_LSS_AVPKT)
1069                cl->avpkt = lss->avpkt;
1070        if (lss->change & TCF_CBQ_LSS_MINIDLE)
1071                cl->minidle = -(long)lss->minidle;
1072        if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
1073                cl->maxidle = lss->maxidle;
1074                cl->avgidle = lss->maxidle;
1075        }
1076        if (lss->change & TCF_CBQ_LSS_OFFTIME)
1077                cl->offtime = lss->offtime;
1078        return 0;
1079}
1080
1081static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1082{
1083        q->nclasses[cl->priority]--;
1084        q->quanta[cl->priority] -= cl->weight;
1085        cbq_normalize_quanta(q, cl->priority);
1086}
1087
1088static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1089{
1090        q->nclasses[cl->priority]++;
1091        q->quanta[cl->priority] += cl->weight;
1092        cbq_normalize_quanta(q, cl->priority);
1093}
1094
1095static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1096{
1097        struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1098
1099        if (wrr->allot)
1100                cl->allot = wrr->allot;
1101        if (wrr->weight)
1102                cl->weight = wrr->weight;
1103        if (wrr->priority) {
1104                cl->priority = wrr->priority - 1;
1105                cl->cpriority = cl->priority;
1106                if (cl->priority >= cl->priority2)
1107                        cl->priority2 = TC_CBQ_MAXPRIO - 1;
1108        }
1109
1110        cbq_addprio(q, cl);
1111        return 0;
1112}
1113
1114static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1115{
1116        cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1117        return 0;
1118}
1119
1120static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
1121        [TCA_CBQ_LSSOPT]        = { .len = sizeof(struct tc_cbq_lssopt) },
1122        [TCA_CBQ_WRROPT]        = { .len = sizeof(struct tc_cbq_wrropt) },
1123        [TCA_CBQ_FOPT]          = { .len = sizeof(struct tc_cbq_fopt) },
1124        [TCA_CBQ_OVL_STRATEGY]  = { .len = sizeof(struct tc_cbq_ovl) },
1125        [TCA_CBQ_RATE]          = { .len = sizeof(struct tc_ratespec) },
1126        [TCA_CBQ_RTAB]          = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1127        [TCA_CBQ_POLICE]        = { .len = sizeof(struct tc_cbq_police) },
1128};
1129
1130static int cbq_opt_parse(struct nlattr *tb[TCA_CBQ_MAX + 1],
1131                         struct nlattr *opt,
1132                         struct netlink_ext_ack *extack)
1133{
1134        int err;
1135
1136        if (!opt) {
1137                NL_SET_ERR_MSG(extack, "CBQ options are required for this operation");
1138                return -EINVAL;
1139        }
1140
1141        err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt,
1142                                          cbq_policy, extack);
1143        if (err < 0)
1144                return err;
1145
1146        if (tb[TCA_CBQ_WRROPT]) {
1147                const struct tc_cbq_wrropt *wrr = nla_data(tb[TCA_CBQ_WRROPT]);
1148
1149                if (wrr->priority > TC_CBQ_MAXPRIO) {
1150                        NL_SET_ERR_MSG(extack, "priority is bigger than TC_CBQ_MAXPRIO");
1151                        err = -EINVAL;
1152                }
1153        }
1154        return err;
1155}
1156
1157static int cbq_init(struct Qdisc *sch, struct nlattr *opt,
1158                    struct netlink_ext_ack *extack)
1159{
1160        struct cbq_sched_data *q = qdisc_priv(sch);
1161        struct nlattr *tb[TCA_CBQ_MAX + 1];
1162        struct tc_ratespec *r;
1163        int err;
1164
1165        qdisc_watchdog_init(&q->watchdog, sch);
1166        hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1167        q->delay_timer.function = cbq_undelay;
1168
1169        err = cbq_opt_parse(tb, opt, extack);
1170        if (err < 0)
1171                return err;
1172
1173        if (!tb[TCA_CBQ_RTAB] || !tb[TCA_CBQ_RATE]) {
1174                NL_SET_ERR_MSG(extack, "Rate specification missing or incomplete");
1175                return -EINVAL;
1176        }
1177
1178        r = nla_data(tb[TCA_CBQ_RATE]);
1179
1180        q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB], extack);
1181        if (!q->link.R_tab)
1182                return -EINVAL;
1183
1184        err = tcf_block_get(&q->link.block, &q->link.filter_list, sch, extack);
1185        if (err)
1186                goto put_rtab;
1187
1188        err = qdisc_class_hash_init(&q->clhash);
1189        if (err < 0)
1190                goto put_block;
1191
1192        q->link.sibling = &q->link;
1193        q->link.common.classid = sch->handle;
1194        q->link.qdisc = sch;
1195        q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1196                                      sch->handle, NULL);
1197        if (!q->link.q)
1198                q->link.q = &noop_qdisc;
1199        else
1200                qdisc_hash_add(q->link.q, true);
1201
1202        q->link.priority = TC_CBQ_MAXPRIO - 1;
1203        q->link.priority2 = TC_CBQ_MAXPRIO - 1;
1204        q->link.cpriority = TC_CBQ_MAXPRIO - 1;
1205        q->link.allot = psched_mtu(qdisc_dev(sch));
1206        q->link.quantum = q->link.allot;
1207        q->link.weight = q->link.R_tab->rate.rate;
1208
1209        q->link.ewma_log = TC_CBQ_DEF_EWMA;
1210        q->link.avpkt = q->link.allot/2;
1211        q->link.minidle = -0x7FFFFFFF;
1212
1213        q->toplevel = TC_CBQ_MAXLEVEL;
1214        q->now = psched_get_time();
1215
1216        cbq_link_class(&q->link);
1217
1218        if (tb[TCA_CBQ_LSSOPT])
1219                cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
1220
1221        cbq_addprio(q, &q->link);
1222        return 0;
1223
1224put_block:
1225        tcf_block_put(q->link.block);
1226
1227put_rtab:
1228        qdisc_put_rtab(q->link.R_tab);
1229        return err;
1230}
1231
1232static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1233{
1234        unsigned char *b = skb_tail_pointer(skb);
1235
1236        if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
1237                goto nla_put_failure;
1238        return skb->len;
1239
1240nla_put_failure:
1241        nlmsg_trim(skb, b);
1242        return -1;
1243}
1244
1245static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1246{
1247        unsigned char *b = skb_tail_pointer(skb);
1248        struct tc_cbq_lssopt opt;
1249
1250        opt.flags = 0;
1251        if (cl->borrow == NULL)
1252                opt.flags |= TCF_CBQ_LSS_BOUNDED;
1253        if (cl->share == NULL)
1254                opt.flags |= TCF_CBQ_LSS_ISOLATED;
1255        opt.ewma_log = cl->ewma_log;
1256        opt.level = cl->level;
1257        opt.avpkt = cl->avpkt;
1258        opt.maxidle = cl->maxidle;
1259        opt.minidle = (u32)(-cl->minidle);
1260        opt.offtime = cl->offtime;
1261        opt.change = ~0;
1262        if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
1263                goto nla_put_failure;
1264        return skb->len;
1265
1266nla_put_failure:
1267        nlmsg_trim(skb, b);
1268        return -1;
1269}
1270
1271static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1272{
1273        unsigned char *b = skb_tail_pointer(skb);
1274        struct tc_cbq_wrropt opt;
1275
1276        memset(&opt, 0, sizeof(opt));
1277        opt.flags = 0;
1278        opt.allot = cl->allot;
1279        opt.priority = cl->priority + 1;
1280        opt.cpriority = cl->cpriority + 1;
1281        opt.weight = cl->weight;
1282        if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
1283                goto nla_put_failure;
1284        return skb->len;
1285
1286nla_put_failure:
1287        nlmsg_trim(skb, b);
1288        return -1;
1289}
1290
1291static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1292{
1293        unsigned char *b = skb_tail_pointer(skb);
1294        struct tc_cbq_fopt opt;
1295
1296        if (cl->split || cl->defmap) {
1297                opt.split = cl->split ? cl->split->common.classid : 0;
1298                opt.defmap = cl->defmap;
1299                opt.defchange = ~0;
1300                if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
1301                        goto nla_put_failure;
1302        }
1303        return skb->len;
1304
1305nla_put_failure:
1306        nlmsg_trim(skb, b);
1307        return -1;
1308}
1309
1310static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1311{
1312        if (cbq_dump_lss(skb, cl) < 0 ||
1313            cbq_dump_rate(skb, cl) < 0 ||
1314            cbq_dump_wrr(skb, cl) < 0 ||
1315            cbq_dump_fopt(skb, cl) < 0)
1316                return -1;
1317        return 0;
1318}
1319
1320static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1321{
1322        struct cbq_sched_data *q = qdisc_priv(sch);
1323        struct nlattr *nest;
1324
1325        nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1326        if (nest == NULL)
1327                goto nla_put_failure;
1328        if (cbq_dump_attr(skb, &q->link) < 0)
1329                goto nla_put_failure;
1330        return nla_nest_end(skb, nest);
1331
1332nla_put_failure:
1333        nla_nest_cancel(skb, nest);
1334        return -1;
1335}
1336
1337static int
1338cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1339{
1340        struct cbq_sched_data *q = qdisc_priv(sch);
1341
1342        q->link.xstats.avgidle = q->link.avgidle;
1343        return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1344}
1345
1346static int
1347cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1348               struct sk_buff *skb, struct tcmsg *tcm)
1349{
1350        struct cbq_class *cl = (struct cbq_class *)arg;
1351        struct nlattr *nest;
1352
1353        if (cl->tparent)
1354                tcm->tcm_parent = cl->tparent->common.classid;
1355        else
1356                tcm->tcm_parent = TC_H_ROOT;
1357        tcm->tcm_handle = cl->common.classid;
1358        tcm->tcm_info = cl->q->handle;
1359
1360        nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1361        if (nest == NULL)
1362                goto nla_put_failure;
1363        if (cbq_dump_attr(skb, cl) < 0)
1364                goto nla_put_failure;
1365        return nla_nest_end(skb, nest);
1366
1367nla_put_failure:
1368        nla_nest_cancel(skb, nest);
1369        return -1;
1370}
1371
1372static int
1373cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1374        struct gnet_dump *d)
1375{
1376        struct cbq_sched_data *q = qdisc_priv(sch);
1377        struct cbq_class *cl = (struct cbq_class *)arg;
1378        __u32 qlen;
1379
1380        cl->xstats.avgidle = cl->avgidle;
1381        cl->xstats.undertime = 0;
1382        qdisc_qstats_qlen_backlog(cl->q, &qlen, &cl->qstats.backlog);
1383
1384        if (cl->undertime != PSCHED_PASTPERFECT)
1385                cl->xstats.undertime = cl->undertime - q->now;
1386
1387        if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
1388                                  d, NULL, &cl->bstats) < 0 ||
1389            gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1390            gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0)
1391                return -1;
1392
1393        return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1394}
1395
1396static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1397                     struct Qdisc **old, struct netlink_ext_ack *extack)
1398{
1399        struct cbq_class *cl = (struct cbq_class *)arg;
1400
1401        if (new == NULL) {
1402                new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1403                                        cl->common.classid, extack);
1404                if (new == NULL)
1405                        return -ENOBUFS;
1406        }
1407
1408        *old = qdisc_replace(sch, new, &cl->q);
1409        return 0;
1410}
1411
1412static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
1413{
1414        struct cbq_class *cl = (struct cbq_class *)arg;
1415
1416        return cl->q;
1417}
1418
1419static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1420{
1421        struct cbq_class *cl = (struct cbq_class *)arg;
1422
1423        cbq_deactivate_class(cl);
1424}
1425
1426static unsigned long cbq_find(struct Qdisc *sch, u32 classid)
1427{
1428        struct cbq_sched_data *q = qdisc_priv(sch);
1429
1430        return (unsigned long)cbq_class_lookup(q, classid);
1431}
1432
1433static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1434{
1435        struct cbq_sched_data *q = qdisc_priv(sch);
1436
1437        WARN_ON(cl->filters);
1438
1439        tcf_block_put(cl->block);
1440        qdisc_put(cl->q);
1441        qdisc_put_rtab(cl->R_tab);
1442        gen_kill_estimator(&cl->rate_est);
1443        if (cl != &q->link)
1444                kfree(cl);
1445}
1446
1447static void cbq_destroy(struct Qdisc *sch)
1448{
1449        struct cbq_sched_data *q = qdisc_priv(sch);
1450        struct hlist_node *next;
1451        struct cbq_class *cl;
1452        unsigned int h;
1453
1454#ifdef CONFIG_NET_CLS_ACT
1455        q->rx_class = NULL;
1456#endif
1457        /*
1458         * Filters must be destroyed first because we don't destroy the
 1459         * classes from root to leaves, which means that filters can still
1460         * be bound to classes which have been destroyed already. --TGR '04
1461         */
1462        for (h = 0; h < q->clhash.hashsize; h++) {
1463                hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1464                        tcf_block_put(cl->block);
1465                        cl->block = NULL;
1466                }
1467        }
1468        for (h = 0; h < q->clhash.hashsize; h++) {
1469                hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
1470                                          common.hnode)
1471                        cbq_destroy_class(sch, cl);
1472        }
1473        qdisc_class_hash_destroy(&q->clhash);
1474}
1475
1476static int
1477cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
1478                 unsigned long *arg, struct netlink_ext_ack *extack)
1479{
1480        int err;
1481        struct cbq_sched_data *q = qdisc_priv(sch);
1482        struct cbq_class *cl = (struct cbq_class *)*arg;
1483        struct nlattr *opt = tca[TCA_OPTIONS];
1484        struct nlattr *tb[TCA_CBQ_MAX + 1];
1485        struct cbq_class *parent;
1486        struct qdisc_rate_table *rtab = NULL;
1487
1488        err = cbq_opt_parse(tb, opt, extack);
1489        if (err < 0)
1490                return err;
1491
1492        if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE]) {
1493                NL_SET_ERR_MSG(extack, "Neither overlimit strategy nor policing attributes can be used for changing class params");
1494                return -EOPNOTSUPP;
1495        }
1496
1497        if (cl) {
1498                /* Check parent */
1499                if (parentid) {
1500                        if (cl->tparent &&
1501                            cl->tparent->common.classid != parentid) {
1502                                NL_SET_ERR_MSG(extack, "Invalid parent id");
1503                                return -EINVAL;
1504                        }
1505                        if (!cl->tparent && parentid != TC_H_ROOT) {
1506                                NL_SET_ERR_MSG(extack, "Parent must be root");
1507                                return -EINVAL;
1508                        }
1509                }
1510
1511                if (tb[TCA_CBQ_RATE]) {
1512                        rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
1513                                              tb[TCA_CBQ_RTAB], extack);
1514                        if (rtab == NULL)
1515                                return -EINVAL;
1516                }
1517
1518                if (tca[TCA_RATE]) {
1519                        err = gen_replace_estimator(&cl->bstats, NULL,
1520                                                    &cl->rate_est,
1521                                                    NULL,
1522                                                    qdisc_root_sleeping_running(sch),
1523                                                    tca[TCA_RATE]);
1524                        if (err) {
1525                                NL_SET_ERR_MSG(extack, "Failed to replace specified rate estimator");
1526                                qdisc_put_rtab(rtab);
1527                                return err;
1528                        }
1529                }
1530
1531                /* Change class parameters */
1532                sch_tree_lock(sch);
1533
1534                if (cl->next_alive != NULL)
1535                        cbq_deactivate_class(cl);
1536
1537                if (rtab) {
1538                        qdisc_put_rtab(cl->R_tab);
1539                        cl->R_tab = rtab;
1540                }
1541
1542                if (tb[TCA_CBQ_LSSOPT])
1543                        cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1544
1545                if (tb[TCA_CBQ_WRROPT]) {
1546                        cbq_rmprio(q, cl);
1547                        cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1548                }
1549
1550                if (tb[TCA_CBQ_FOPT])
1551                        cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1552
1553                if (cl->q->q.qlen)
1554                        cbq_activate_class(cl);
1555
1556                sch_tree_unlock(sch);
1557
1558                return 0;
1559        }
1560
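        /* From here on we are creating a new class; the root cannot be
         * replaced, and RATE, WRROPT and LSSOPT are all mandatory.
         */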
1561        if (parentid == TC_H_ROOT)
1562                return -EINVAL;
1563
1564        if (!tb[TCA_CBQ_WRROPT] || !tb[TCA_CBQ_RATE] || !tb[TCA_CBQ_LSSOPT]) {
1565                NL_SET_ERR_MSG(extack, "The following attributes MUST all be specified: WRR, rate and link sharing");
1566                return -EINVAL;
1567        }
1568
1569        rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB],
1570                              extack);
1571        if (rtab == NULL)
1572                return -EINVAL;
1573
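        /* A caller-supplied classid must carry this qdisc's major handle and
         * must not already exist; otherwise auto-generate a free minor in the
         * 0x8000..0xFFFF range.
         */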
1574        if (classid) {
1575                err = -EINVAL;
1576                if (TC_H_MAJ(classid ^ sch->handle) ||
1577                    cbq_class_lookup(q, classid)) {
1578                        NL_SET_ERR_MSG(extack, "Specified classid is invalid or already in use");
1579                        goto failure;
1580                }
1581        } else {
1582                int i;
1583                classid = TC_H_MAKE(sch->handle, 0x8000);
1584
1585                for (i = 0; i < 0x8000; i++) {
1586                        if (++q->hgenerator >= 0x8000)
1587                                q->hgenerator = 1;
1588                        if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1589                                break;
1590                }
1591                err = -ENOSR;
1592                if (i >= 0x8000) {
1593                        NL_SET_ERR_MSG(extack, "Unable to generate classid");
1594                        goto failure;
1595                }
1596                classid = classid|q->hgenerator;
1597        }
1598
1599        parent = &q->link;
1600        if (parentid) {
1601                parent = cbq_class_lookup(q, parentid);
1602                err = -EINVAL;
1603                if (!parent) {
1604                        NL_SET_ERR_MSG(extack, "Failed to find parentid");
1605                        goto failure;
1606                }
1607        }
1608
1609        err = -ENOBUFS;
1610        cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1611        if (cl == NULL)
1612                goto failure;
1613
1614        err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
1615        if (err) {
1616                kfree(cl);
1617                return err;
1618        }
1619
1620        if (tca[TCA_RATE]) {
1621                err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
1622                                        NULL,
1623                                        qdisc_root_sleeping_running(sch),
1624                                        tca[TCA_RATE]);
1625                if (err) {
1626                        NL_SET_ERR_MSG(extack, "Couldn't create new estimator");
1627                        tcf_block_put(cl->block);
1628                        kfree(cl);
1629                        goto failure;
1630                }
1631        }
1632
1633        cl->R_tab = rtab;
1634        rtab = NULL;
1635        cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid,
1636                                  NULL);
1637        if (!cl->q)
1638                cl->q = &noop_qdisc;
1639        else
1640                qdisc_hash_add(cl->q, true);
1641
1642        cl->common.classid = classid;
1643        cl->tparent = parent;
1644        cl->qdisc = sch;
1645        cl->allot = parent->allot;
1646        cl->quantum = cl->allot;
1647        cl->weight = cl->R_tab->rate.rate;
1648
1649        sch_tree_lock(sch);
1650        cbq_link_class(cl);
1651        cl->borrow = cl->tparent;
1652        if (cl->tparent != &q->link)
1653                cl->share = cl->tparent;
1654        cbq_adjust_levels(parent);
1655        cl->minidle = -0x7FFFFFFF;
1656        cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1657        cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1658        if (cl->ewma_log == 0)
1659                cl->ewma_log = q->link.ewma_log;
1660        if (cl->maxidle == 0)
1661                cl->maxidle = q->link.maxidle;
1662        if (cl->avpkt == 0)
1663                cl->avpkt = q->link.avpkt;
1664        if (tb[TCA_CBQ_FOPT])
1665                cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1666        sch_tree_unlock(sch);
1667
1668        qdisc_class_hash_grow(sch, &q->clhash);
1669
1670        *arg = (unsigned long)cl;
1671        return 0;
1672
1673failure:
1674        qdisc_put_rtab(rtab);
1675        return err;
1676}
1677
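/* ->delete(): a class can only be removed when no filters are bound to it,
 * it has no children and it is not the root link class.  It is purged and
 * unlinked under the tree lock, then freed.
 */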
1678static int cbq_delete(struct Qdisc *sch, unsigned long arg,
1679                      struct netlink_ext_ack *extack)
1680{
1681        struct cbq_sched_data *q = qdisc_priv(sch);
1682        struct cbq_class *cl = (struct cbq_class *)arg;
1683
1684        if (cl->filters || cl->children || cl == &q->link)
1685                return -EBUSY;
1686
1687        sch_tree_lock(sch);
1688
1689        qdisc_purge_queue(cl->q);
1690
1691        if (cl->next_alive)
1692                cbq_deactivate_class(cl);
1693
1694        if (q->tx_borrowed == cl)
1695                q->tx_borrowed = q->tx_class;
1696        if (q->tx_class == cl) {
1697                q->tx_class = NULL;
1698                q->tx_borrowed = NULL;
1699        }
1700#ifdef CONFIG_NET_CLS_ACT
1701        if (q->rx_class == cl)
1702                q->rx_class = NULL;
1703#endif
1704
1705        cbq_unlink_class(cl);
1706        cbq_adjust_levels(cl->tparent);
1707        cl->defmap = 0;
1708        cbq_sync_defmap(cl);
1709
1710        cbq_rmprio(q, cl);
1711        sch_tree_unlock(sch);
1712
1713        cbq_destroy_class(sch, cl);
1714        return 0;
1715}
1716
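/* Return the filter block of the class; a zero arg refers to the root. */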
1717static struct tcf_block *cbq_tcf_block(struct Qdisc *sch, unsigned long arg,
1718                                       struct netlink_ext_ack *extack)
1719{
1720        struct cbq_sched_data *q = qdisc_priv(sch);
1721        struct cbq_class *cl = (struct cbq_class *)arg;
1722
1723        if (cl == NULL)
1724                cl = &q->link;
1725
1726        return cl->block;
1727}
1728
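/* Filters may only be bound to classes strictly below the requesting parent
 * in the link-sharing tree (smaller level); bindings are counted in
 * cl->filters so the class cannot be deleted while still in use.
 */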
1729static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1730                                     u32 classid)
1731{
1732        struct cbq_sched_data *q = qdisc_priv(sch);
1733        struct cbq_class *p = (struct cbq_class *)parent;
1734        struct cbq_class *cl = cbq_class_lookup(q, classid);
1735
1736        if (cl) {
1737                if (p && p->level <= cl->level)
1738                        return 0;
1739                cl->filters++;
1740                return (unsigned long)cl;
1741        }
1742        return 0;
1743}
1744
1745static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
1746{
1747        struct cbq_class *cl = (struct cbq_class *)arg;
1748
1749        cl->filters--;
1750}
1751
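/* ->walk(): iterate over all classes, honouring the walker's
 * skip/count/stop protocol.
 */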
1752static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1753{
1754        struct cbq_sched_data *q = qdisc_priv(sch);
1755        struct cbq_class *cl;
1756        unsigned int h;
1757
1758        if (arg->stop)
1759                return;
1760
1761        for (h = 0; h < q->clhash.hashsize; h++) {
1762                hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1763                        if (arg->count < arg->skip) {
1764                                arg->count++;
1765                                continue;
1766                        }
1767                        if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1768                                arg->stop = 1;
1769                                return;
1770                        }
1771                        arg->count++;
1772                }
1773        }
1774}
1775
1776static const struct Qdisc_class_ops cbq_class_ops = {
1777        .graft          =       cbq_graft,
1778        .leaf           =       cbq_leaf,
1779        .qlen_notify    =       cbq_qlen_notify,
1780        .find           =       cbq_find,
1781        .change         =       cbq_change_class,
1782        .delete         =       cbq_delete,
1783        .walk           =       cbq_walk,
1784        .tcf_block      =       cbq_tcf_block,
1785        .bind_tcf       =       cbq_bind_filter,
1786        .unbind_tcf     =       cbq_unbind_filter,
1787        .dump           =       cbq_dump_class,
1788        .dump_stats     =       cbq_dump_class_stats,
1789};
1790
1791static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
1792        .next           =       NULL,
1793        .cl_ops         =       &cbq_class_ops,
1794        .id             =       "cbq",
1795        .priv_size      =       sizeof(struct cbq_sched_data),
1796        .enqueue        =       cbq_enqueue,
1797        .dequeue        =       cbq_dequeue,
1798        .peek           =       qdisc_peek_dequeued,
1799        .init           =       cbq_init,
1800        .reset          =       cbq_reset,
1801        .destroy        =       cbq_destroy,
1802        .change         =       NULL,
1803        .dump           =       cbq_dump,
1804        .dump_stats     =       cbq_dump_stats,
1805        .owner          =       THIS_MODULE,
1806};
1807
1808static int __init cbq_module_init(void)
1809{
1810        return register_qdisc(&cbq_qdisc_ops);
1811}
1812static void __exit cbq_module_exit(void)
1813{
1814        unregister_qdisc(&cbq_qdisc_ops);
1815}
1816module_init(cbq_module_init)
1817module_exit(cbq_module_exit)
1818MODULE_LICENSE("GPL");
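/* Illustrative userspace setup (not part of this file), roughly following
 * tc-cbq(8): a CBQ root qdisc with one bounded class.  Device name, handles
 * and rates are placeholders.
 *
 *   tc qdisc add dev eth0 root handle 1: cbq bandwidth 10Mbit avpkt 1000
 *   tc class add dev eth0 parent 1: classid 1:1 cbq bandwidth 10Mbit \
 *           rate 1Mbit allot 1514 prio 5 avpkt 1000 bounded
 */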
1819