linux/net/sched/sch_htb.c
   1/*
   2 * net/sched/sch_htb.c  Hierarchical token bucket, feed tree version
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 * Authors:     Martin Devera, <devik@cdi.cz>
  10 *
  11 * Credits (in time order) for older HTB versions:
  12 *              Stef Coene <stef.coene@docum.org>
  13 *                      HTB support at LARTC mailing list
  14 *              Ondrej Kraus, <krauso@barr.cz>
  15 *                      found missing INIT_QDISC(htb)
  16 *              Vladimir Smelhaus, Aamer Akhter, Bert Hubert
  17 *                      helped a lot to locate nasty class stall bug
  18 *              Andi Kleen, Jamal Hadi, Bert Hubert
  19 *                      code review and helpful comments on shaping
  20 *              Tomasz Wrona, <tw@eter.tym.pl>
  21 *                      created test case so that I was able to fix nasty bug
  22 *              Wilfried Weissmann
  23 *                      spotted bug in dequeue code and helped with fix
  24 *              Jiri Fojtasek
  25 *                      fixed requeue routine
  26 *              and many others. thanks.
  27 */
  28#include <linux/module.h>
  29#include <linux/moduleparam.h>
  30#include <linux/types.h>
  31#include <linux/kernel.h>
  32#include <linux/string.h>
  33#include <linux/errno.h>
  34#include <linux/skbuff.h>
  35#include <linux/list.h>
  36#include <linux/compiler.h>
  37#include <linux/rbtree.h>
  38#include <linux/workqueue.h>
  39#include <linux/slab.h>
  40#include <net/netlink.h>
  41#include <net/sch_generic.h>
  42#include <net/pkt_sched.h>
  43#include <net/pkt_cls.h>
  44
  45/* HTB algorithm.
  46    Author: devik@cdi.cz
  47    ========================================================================
  48    HTB is like TBF with multiple classes. It is also similar to CBQ because
   49    it allows assigning a priority to each class in the hierarchy.
  50    In fact it is another implementation of Floyd's formal sharing.
  51
  52    Levels:
   53    Each class is assigned a level. A leaf ALWAYS has level 0 and root
   54    classes have level TC_HTB_MAXDEPTH-1. Interior nodes have a level
   55    one less than their parent.
  56*/
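/* Illustrative sketch (not part of the algorithm): a minimal two-level
 * hierarchy as it might be configured from user space with tc. The device
 * name and the rates below are assumptions chosen only for the example.
 *
 *   tc qdisc add dev eth0 root handle 1: htb default 20
 *   tc class add dev eth0 parent 1:  classid 1:1  htb rate 100mbit ceil 100mbit
 *   tc class add dev eth0 parent 1:1 classid 1:10 htb rate 60mbit  ceil 100mbit
 *   tc class add dev eth0 parent 1:1 classid 1:20 htb rate 40mbit  ceil 100mbit
 *
 * Here 1:10 and 1:20 are leaves (level 0), 1:1 is their interior parent and
 * unclassified traffic falls back to the default class 1:20.
 */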
  57
  58static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */
  59#define HTB_VER 0x30011         /* major must match the number supplied by TC as the version */
  60
  61#if HTB_VER >> 16 != TC_HTB_PROTOVER
  62#error "Mismatched sch_htb.c and pkt_sch.h"
  63#endif
  64
  65/* Module parameter and sysfs export */
  66module_param    (htb_hysteresis, int, 0640);
  67MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate");
  68
  69static int htb_rate_est = 0; /* whether htb classes get a default rate estimator */
  70module_param(htb_rate_est, int, 0640);
  71MODULE_PARM_DESC(htb_rate_est, "set up a default rate estimator (4sec 16sec) for htb classes");
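/* Usage sketch for the two module parameters above, assuming the scheduler is
 * built as the sch_htb module (paths and values are illustrative only):
 *
 *   modprobe sch_htb htb_hysteresis=1
 *   echo 1 > /sys/module/sch_htb/parameters/htb_rate_est
 */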
  72
  73/* used internally to keep the status of a single class */
  74enum htb_cmode {
  75        HTB_CANT_SEND,          /* class can't send and can't borrow */
  76        HTB_MAY_BORROW,         /* class can't send but may borrow */
  77        HTB_CAN_SEND            /* class can send */
  78};
  79
  80struct htb_prio {
  81        union {
  82                struct rb_root  row;
  83                struct rb_root  feed;
  84        };
  85        struct rb_node  *ptr;
  86        /* When the class changes from state 1->2 and disconnects from
  87         * the parent's feed, we lose the ptr value and start from the
  88         * first child again. Here we store the classid of the
  89         * last valid ptr (used when ptr is NULL).
  90         */
  91        u32             last_ptr_id;
  92};
  93
  94/* interior & leaf nodes; props specific to leaves are marked L:
  95 * To reduce false sharing, place mostly read fields at beginning,
  96 * and mostly written ones at the end.
  97 */
  98struct htb_class {
  99        struct Qdisc_class_common common;
 100        struct psched_ratecfg   rate;
 101        struct psched_ratecfg   ceil;
 102        s64                     buffer, cbuffer;/* token bucket depth/rate */
 103        s64                     mbuffer;        /* max wait time */
 104        u32                     prio;           /* these two are used only by leaves... */
 105        int                     quantum;        /* but stored for parent-to-leaf return */
 106
 107        struct tcf_proto __rcu  *filter_list;   /* class attached filters */
 108        struct tcf_block        *block;
 109        int                     filter_cnt;
 110        int                     refcnt;         /* usage count of this class */
 111
 112        int                     level;          /* our level (see above) */
 113        unsigned int            children;
 114        struct htb_class        *parent;        /* parent class */
 115
 116        struct net_rate_estimator __rcu *rate_est;
 117
 118        /*
 119         * Written often fields
 120         */
 121        struct gnet_stats_basic_packed bstats;
 122        struct tc_htb_xstats    xstats; /* our special stats */
 123
 124        /* token bucket parameters */
 125        s64                     tokens, ctokens;/* current number of tokens */
 126        s64                     t_c;            /* checkpoint time */
 127
 128        union {
 129                struct htb_class_leaf {
 130                        struct list_head drop_list;
 131                        int             deficit[TC_HTB_MAXDEPTH];
 132                        struct Qdisc    *q;
 133                } leaf;
 134                struct htb_class_inner {
 135                        struct htb_prio clprio[TC_HTB_NUMPRIO];
 136                } inner;
 137        } un;
 138        s64                     pq_key;
 139
 140        int                     prio_activity;  /* for which prios are we active */
 141        enum htb_cmode          cmode;          /* current mode of the class */
 142        struct rb_node          pq_node;        /* node for event queue */
 143        struct rb_node          node[TC_HTB_NUMPRIO];   /* node for self or feed tree */
 144
 145        unsigned int drops ____cacheline_aligned_in_smp;
 146};
 147
 148struct htb_level {
 149        struct rb_root  wait_pq;
 150        struct htb_prio hprio[TC_HTB_NUMPRIO];
 151};
 152
 153struct htb_sched {
 154        struct Qdisc_class_hash clhash;
 155        int                     defcls;         /* class where unclassified flows go to */
 156        int                     rate2quantum;   /* quant = rate / rate2quantum */
 157
 158        /* filters for qdisc itself */
 159        struct tcf_proto __rcu  *filter_list;
 160        struct tcf_block        *block;
 161
 162#define HTB_WARN_TOOMANYEVENTS  0x1
 163        unsigned int            warned; /* only one warning */
 164        int                     direct_qlen;
 165        struct work_struct      work;
 166
 167        /* non-shaped skbs; let them go directly through */
 168        struct qdisc_skb_head   direct_queue;
 169        long                    direct_pkts;
 170
 171        struct qdisc_watchdog   watchdog;
 172
 173        s64                     now;    /* cached dequeue time */
 174        struct list_head        drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */
 175
 176        /* time of nearest event per level (row) */
 177        s64                     near_ev_cache[TC_HTB_MAXDEPTH];
 178
 179        int                     row_mask[TC_HTB_MAXDEPTH];
 180
 181        struct htb_level        hlevel[TC_HTB_MAXDEPTH];
 182};
 183
 184/* find class in global hash table using given handle */
 185static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
 186{
 187        struct htb_sched *q = qdisc_priv(sch);
 188        struct Qdisc_class_common *clc;
 189
 190        clc = qdisc_class_find(&q->clhash, handle);
 191        if (clc == NULL)
 192                return NULL;
 193        return container_of(clc, struct htb_class, common);
 194}
 195
 196/**
 197 * htb_classify - classify a packet into class
 198 *
  199 * It returns NULL if the packet should be dropped or -1 if the packet
  200 * should be passed directly through. In all other cases a leaf class is
  201 * returned. We allow direct class selection by a classid in skb->priority.
  202 * Then we examine filters in the qdisc and in inner nodes (if a higher-level
  203 * filter points to an inner node). If we end up with classid MAJOR:0 we
  204 * enqueue the skb into the special internal fifo (direct); these packets then
  205 * go directly through. If we still have no valid leaf we try to use the
  206 * MAJOR:default leaf. If that is still unsuccessful we return the direct queue.
 207 */
 208#define HTB_DIRECT ((struct htb_class *)-1L)
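/* Illustrative outcomes of htb_classify() for a qdisc installed as handle 1:
 * (the classids are examples only): skb->priority == 1:0 maps to HTB_DIRECT,
 * skb->priority == 1:10 selects leaf class 1:10 without consulting filters,
 * and a packet matching no filter ends up in the 1:<defcls> leaf or, failing
 * that, in the direct queue as well.
 */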
 209
 210static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
 211                                      int *qerr)
 212{
 213        struct htb_sched *q = qdisc_priv(sch);
 214        struct htb_class *cl;
 215        struct tcf_result res;
 216        struct tcf_proto *tcf;
 217        int result;
 218
  219        /* allow selecting a class by setting skb->priority to a valid classid;
  220         * note that nfmark can be used too, by attaching an fw filter with no
  221         * rules in it
 222         */
 223        if (skb->priority == sch->handle)
 224                return HTB_DIRECT;      /* X:0 (direct flow) selected */
 225        cl = htb_find(skb->priority, sch);
 226        if (cl) {
 227                if (cl->level == 0)
 228                        return cl;
 229                /* Start with inner filter chain if a non-leaf class is selected */
 230                tcf = rcu_dereference_bh(cl->filter_list);
 231        } else {
 232                tcf = rcu_dereference_bh(q->filter_list);
 233        }
 234
 235        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 236        while (tcf && (result = tcf_classify(skb, tcf, &res, false)) >= 0) {
 237#ifdef CONFIG_NET_CLS_ACT
 238                switch (result) {
 239                case TC_ACT_QUEUED:
 240                case TC_ACT_STOLEN:
 241                case TC_ACT_TRAP:
 242                        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
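                        /* fall through */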
 243                case TC_ACT_SHOT:
 244                        return NULL;
 245                }
 246#endif
 247                cl = (void *)res.class;
 248                if (!cl) {
 249                        if (res.classid == sch->handle)
 250                                return HTB_DIRECT;      /* X:0 (direct flow) */
 251                        cl = htb_find(res.classid, sch);
 252                        if (!cl)
 253                                break;  /* filter selected invalid classid */
 254                }
 255                if (!cl->level)
 256                        return cl;      /* we hit leaf; return it */
 257
 258                /* we have got inner class; apply inner filter chain */
 259                tcf = rcu_dereference_bh(cl->filter_list);
 260        }
 261        /* classification failed; try to use default class */
 262        cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
 263        if (!cl || cl->level)
 264                return HTB_DIRECT;      /* bad default .. this is safe bet */
 265        return cl;
 266}
 267
 268/**
 269 * htb_add_to_id_tree - adds class to the round robin list
 270 *
  271 * The routine adds the class to the list (actually a tree) sorted by classid.
  272 * Make sure that the class is not already on such a list for the given prio.
 273 */
 274static void htb_add_to_id_tree(struct rb_root *root,
 275                               struct htb_class *cl, int prio)
 276{
 277        struct rb_node **p = &root->rb_node, *parent = NULL;
 278
 279        while (*p) {
 280                struct htb_class *c;
 281                parent = *p;
 282                c = rb_entry(parent, struct htb_class, node[prio]);
 283
 284                if (cl->common.classid > c->common.classid)
 285                        p = &parent->rb_right;
 286                else
 287                        p = &parent->rb_left;
 288        }
 289        rb_link_node(&cl->node[prio], parent, p);
 290        rb_insert_color(&cl->node[prio], root);
 291}
 292
 293/**
 294 * htb_add_to_wait_tree - adds class to the event queue with delay
 295 *
  296 * The class is added to the priority event queue to indicate that the class
  297 * will change its mode at time cl->pq_key (in nanoseconds). Make sure that
  298 * the class is not already in the queue.
 299 */
 300static void htb_add_to_wait_tree(struct htb_sched *q,
 301                                 struct htb_class *cl, s64 delay)
 302{
 303        struct rb_node **p = &q->hlevel[cl->level].wait_pq.rb_node, *parent = NULL;
 304
 305        cl->pq_key = q->now + delay;
 306        if (cl->pq_key == q->now)
 307                cl->pq_key++;
 308
 309        /* update the nearest event cache */
 310        if (q->near_ev_cache[cl->level] > cl->pq_key)
 311                q->near_ev_cache[cl->level] = cl->pq_key;
 312
 313        while (*p) {
 314                struct htb_class *c;
 315                parent = *p;
 316                c = rb_entry(parent, struct htb_class, pq_node);
 317                if (cl->pq_key >= c->pq_key)
 318                        p = &parent->rb_right;
 319                else
 320                        p = &parent->rb_left;
 321        }
 322        rb_link_node(&cl->pq_node, parent, p);
 323        rb_insert_color(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
 324}
 325
 326/**
 327 * htb_next_rb_node - finds next node in binary tree
 328 *
  329 * When we are past the last key, *n is set to NULL.
 330 * Average complexity is 2 steps per call.
 331 */
 332static inline void htb_next_rb_node(struct rb_node **n)
 333{
 334        *n = rb_next(*n);
 335}
 336
 337/**
 338 * htb_add_class_to_row - add class to its row
 339 *
  340 * The class is added to the row at the priorities marked in mask.
 341 * It does nothing if mask == 0.
 342 */
 343static inline void htb_add_class_to_row(struct htb_sched *q,
 344                                        struct htb_class *cl, int mask)
 345{
 346        q->row_mask[cl->level] |= mask;
 347        while (mask) {
 348                int prio = ffz(~mask);
 349                mask &= ~(1 << prio);
 350                htb_add_to_id_tree(&q->hlevel[cl->level].hprio[prio].row, cl, prio);
 351        }
 352}
 353
 354/* If this triggers, it is a bug in this code, but it need not be fatal */
 355static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root)
 356{
 357        if (RB_EMPTY_NODE(rb)) {
 358                WARN_ON(1);
 359        } else {
 360                rb_erase(rb, root);
 361                RB_CLEAR_NODE(rb);
 362        }
 363}
 364
 365
 366/**
 367 * htb_remove_class_from_row - removes class from its row
 368 *
  369 * The class is removed from the row at the priorities marked in mask.
 370 * It does nothing if mask == 0.
 371 */
 372static inline void htb_remove_class_from_row(struct htb_sched *q,
 373                                                 struct htb_class *cl, int mask)
 374{
 375        int m = 0;
 376        struct htb_level *hlevel = &q->hlevel[cl->level];
 377
 378        while (mask) {
 379                int prio = ffz(~mask);
 380                struct htb_prio *hprio = &hlevel->hprio[prio];
 381
 382                mask &= ~(1 << prio);
 383                if (hprio->ptr == cl->node + prio)
 384                        htb_next_rb_node(&hprio->ptr);
 385
 386                htb_safe_rb_erase(cl->node + prio, &hprio->row);
 387                if (!hprio->row.rb_node)
 388                        m |= 1 << prio;
 389        }
 390        q->row_mask[cl->level] &= ~m;
 391}
 392
 393/**
  394 * htb_activate_prios - creates an active class's feed chain
  395 *
  396 * The class is connected to its ancestors and/or the appropriate rows
  397 * for the priorities it is participating in. cl->cmode must be the new
  398 * (activated) mode. It does nothing if cl->prio_activity == 0.
 399 */
 400static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
 401{
 402        struct htb_class *p = cl->parent;
 403        long m, mask = cl->prio_activity;
 404
 405        while (cl->cmode == HTB_MAY_BORROW && p && mask) {
 406                m = mask;
 407                while (m) {
 408                        int prio = ffz(~m);
 409                        m &= ~(1 << prio);
 410
 411                        if (p->un.inner.clprio[prio].feed.rb_node)
  412                                /* parent already has its feed in use, so
  413                                 * reset the bit in mask as parent is already ok
 414                                 */
 415                                mask &= ~(1 << prio);
 416
 417                        htb_add_to_id_tree(&p->un.inner.clprio[prio].feed, cl, prio);
 418                }
 419                p->prio_activity |= mask;
 420                cl = p;
 421                p = cl->parent;
 422
 423        }
 424        if (cl->cmode == HTB_CAN_SEND && mask)
 425                htb_add_class_to_row(q, cl, mask);
 426}
 427
 428/**
 429 * htb_deactivate_prios - remove class from feed chain
 430 *
  431 * cl->cmode must represent the old mode (before deactivation). It does
  432 * nothing if cl->prio_activity == 0. The class is removed from all feed
 433 * chains and rows.
 434 */
 435static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
 436{
 437        struct htb_class *p = cl->parent;
 438        long m, mask = cl->prio_activity;
 439
 440        while (cl->cmode == HTB_MAY_BORROW && p && mask) {
 441                m = mask;
 442                mask = 0;
 443                while (m) {
 444                        int prio = ffz(~m);
 445                        m &= ~(1 << prio);
 446
 447                        if (p->un.inner.clprio[prio].ptr == cl->node + prio) {
  448                                /* we are removing a child which is pointed to from
  449                                 * the parent feed - forget the pointer but remember
  450                                 * the classid
 451                                 */
 452                                p->un.inner.clprio[prio].last_ptr_id = cl->common.classid;
 453                                p->un.inner.clprio[prio].ptr = NULL;
 454                        }
 455
 456                        htb_safe_rb_erase(cl->node + prio,
 457                                          &p->un.inner.clprio[prio].feed);
 458
 459                        if (!p->un.inner.clprio[prio].feed.rb_node)
 460                                mask |= 1 << prio;
 461                }
 462
 463                p->prio_activity &= ~mask;
 464                cl = p;
 465                p = cl->parent;
 466
 467        }
 468        if (cl->cmode == HTB_CAN_SEND && mask)
 469                htb_remove_class_from_row(q, cl, mask);
 470}
 471
 472static inline s64 htb_lowater(const struct htb_class *cl)
 473{
 474        if (htb_hysteresis)
 475                return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
 476        else
 477                return 0;
 478}
 479static inline s64 htb_hiwater(const struct htb_class *cl)
 480{
 481        if (htb_hysteresis)
 482                return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
 483        else
 484                return 0;
 485}
 486
 487
 488/**
 489 * htb_class_mode - computes and returns current class mode
 490 *
  491 * It computes cl's mode at time cl->t_c+diff and returns it. If the mode
  492 * is not HTB_CAN_SEND then *diff is updated to the time difference
  493 * from now to the time when cl will change its state.
  494 * Also it is worth noting that the class mode doesn't change simply
  495 * at cl->{c,}tokens == 0; rather, there can be a hysteresis in the
  496 * 0 .. -cl->{c,}buffer range. It is meant to limit the number of
  497 * mode transitions per time unit. The speed gain is about 1/6.
 498 */
 499static inline enum htb_cmode
 500htb_class_mode(struct htb_class *cl, s64 *diff)
 501{
 502        s64 toks;
 503
 504        if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
 505                *diff = -toks;
 506                return HTB_CANT_SEND;
 507        }
 508
 509        if ((toks = (cl->tokens + *diff)) >= htb_hiwater(cl))
 510                return HTB_CAN_SEND;
 511
 512        *diff = -toks;
 513        return HTB_MAY_BORROW;
 514}
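/* Worked example of the hysteresis above (values are made up): with
 * htb_hysteresis enabled and the class currently in HTB_CAN_SEND,
 * htb_hiwater() returns -cl->buffer, so tokens + diff == -cl->buffer / 2
 * still yields HTB_CAN_SEND; only when tokens + diff drops below -cl->buffer
 * does the class fall back to HTB_MAY_BORROW, and it becomes HTB_CANT_SEND
 * once ctokens + diff drops below htb_lowater() (-cl->cbuffer here).
 */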
 515
 516/**
  517 * htb_change_class_mode - changes a class's mode
  518 *
  519 * This should be the only way to change a class's mode under normal
  520 * circumstances. The routine will update the feed list linkage, change the
  521 * mode and add the class to the wait event queue if appropriate. The new mode
  522 * should be different from the old one and cl->pq_key has to be valid if
  523 * changing to a mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
 524 */
 525static void
 526htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff)
 527{
 528        enum htb_cmode new_mode = htb_class_mode(cl, diff);
 529
 530        if (new_mode == cl->cmode)
 531                return;
 532
 533        if (cl->prio_activity) {        /* not necessary: speed optimization */
 534                if (cl->cmode != HTB_CANT_SEND)
 535                        htb_deactivate_prios(q, cl);
 536                cl->cmode = new_mode;
 537                if (new_mode != HTB_CANT_SEND)
 538                        htb_activate_prios(q, cl);
 539        } else
 540                cl->cmode = new_mode;
 541}
 542
 543/**
 544 * htb_activate - inserts leaf cl into appropriate active feeds
 545 *
  546 * The routine learns the (new) priority of the leaf and activates the feed
  547 * chain for that prio. It can safely be called on an already active leaf.
  548 * It also adds the leaf to the drop list.
 549 */
 550static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
 551{
 552        WARN_ON(cl->level || !cl->un.leaf.q || !cl->un.leaf.q->q.qlen);
 553
 554        if (!cl->prio_activity) {
 555                cl->prio_activity = 1 << cl->prio;
 556                htb_activate_prios(q, cl);
 557                list_add_tail(&cl->un.leaf.drop_list,
 558                              q->drops + cl->prio);
 559        }
 560}
 561
 562/**
 563 * htb_deactivate - remove leaf cl from active feeds
 564 *
  565 * Make sure that the leaf is active. In other words, it can't be called
  566 * with a non-active leaf. It also removes the class from the drop list.
 567 */
 568static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
 569{
 570        WARN_ON(!cl->prio_activity);
 571
 572        htb_deactivate_prios(q, cl);
 573        cl->prio_activity = 0;
 574        list_del_init(&cl->un.leaf.drop_list);
 575}
 576
 577static void htb_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch,
 578                             struct qdisc_skb_head *qh)
 579{
 580        struct sk_buff *last = qh->tail;
 581
 582        if (last) {
 583                skb->next = NULL;
 584                last->next = skb;
 585                qh->tail = skb;
 586        } else {
 587                qh->tail = skb;
 588                qh->head = skb;
 589        }
 590        qh->qlen++;
 591}
 592
 593static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 594                       struct sk_buff **to_free)
 595{
 596        int uninitialized_var(ret);
 597        struct htb_sched *q = qdisc_priv(sch);
 598        struct htb_class *cl = htb_classify(skb, sch, &ret);
 599
 600        if (cl == HTB_DIRECT) {
 601                /* enqueue to helper queue */
 602                if (q->direct_queue.qlen < q->direct_qlen) {
 603                        htb_enqueue_tail(skb, sch, &q->direct_queue);
 604                        q->direct_pkts++;
 605                } else {
 606                        return qdisc_drop(skb, sch, to_free);
 607                }
 608#ifdef CONFIG_NET_CLS_ACT
 609        } else if (!cl) {
 610                if (ret & __NET_XMIT_BYPASS)
 611                        qdisc_qstats_drop(sch);
 612                __qdisc_drop(skb, to_free);
 613                return ret;
 614#endif
 615        } else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q,
 616                                        to_free)) != NET_XMIT_SUCCESS) {
 617                if (net_xmit_drop_count(ret)) {
 618                        qdisc_qstats_drop(sch);
 619                        cl->drops++;
 620                }
 621                return ret;
 622        } else {
 623                htb_activate(q, cl);
 624        }
 625
 626        qdisc_qstats_backlog_inc(sch, skb);
 627        sch->q.qlen++;
 628        return NET_XMIT_SUCCESS;
 629}
 630
 631static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, s64 diff)
 632{
 633        s64 toks = diff + cl->tokens;
 634
 635        if (toks > cl->buffer)
 636                toks = cl->buffer;
 637        toks -= (s64) psched_l2t_ns(&cl->rate, bytes);
 638        if (toks <= -cl->mbuffer)
 639                toks = 1 - cl->mbuffer;
 640
 641        cl->tokens = toks;
 642}
 643
 644static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff)
 645{
 646        s64 toks = diff + cl->ctokens;
 647
 648        if (toks > cl->cbuffer)
 649                toks = cl->cbuffer;
 650        toks -= (s64) psched_l2t_ns(&cl->ceil, bytes);
 651        if (toks <= -cl->mbuffer)
 652                toks = 1 - cl->mbuffer;
 653
 654        cl->ctokens = toks;
 655}
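/* Rough arithmetic behind the helpers above (illustration only):
 * psched_l2t_ns() converts a packet length into transmit time at the
 * configured rate, so a 1500 byte packet charged against a 10 Mbit/s class
 * costs about 1500 * 8 / 10e6 s = 1.2 ms = 1,200,000 ns worth of tokens;
 * the result is then clamped to the class buffer at the top and to roughly
 * -mbuffer at the bottom, as done in htb_accnt_tokens()/htb_accnt_ctokens().
 */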
 656
 657/**
 658 * htb_charge_class - charges amount "bytes" to leaf and ancestors
 659 *
  660 * The routine assumes that a packet "bytes" long was dequeued from leaf cl,
  661 * borrowing from "level". It accounts the bytes to the ceil leaky bucket for
  662 * the leaf and all ancestors, and to the rate bucket for ancestors at levels
  663 * "level" and higher. It also handles a possible change of mode resulting
  664 * from the update. Note that the mode can also increase here (MAY_BORROW to
  665 * CAN_SEND) because we can use a more precise clock than the event queue here.
  666 * In such a case we remove the class from the event queue first.
 667 */
 668static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
 669                             int level, struct sk_buff *skb)
 670{
 671        int bytes = qdisc_pkt_len(skb);
 672        enum htb_cmode old_mode;
 673        s64 diff;
 674
 675        while (cl) {
 676                diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
 677                if (cl->level >= level) {
 678                        if (cl->level == level)
 679                                cl->xstats.lends++;
 680                        htb_accnt_tokens(cl, bytes, diff);
 681                } else {
 682                        cl->xstats.borrows++;
 683                        cl->tokens += diff;     /* we moved t_c; update tokens */
 684                }
 685                htb_accnt_ctokens(cl, bytes, diff);
 686                cl->t_c = q->now;
 687
 688                old_mode = cl->cmode;
 689                diff = 0;
 690                htb_change_class_mode(q, cl, &diff);
 691                if (old_mode != cl->cmode) {
 692                        if (old_mode != HTB_CAN_SEND)
 693                                htb_safe_rb_erase(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
 694                        if (cl->cmode != HTB_CAN_SEND)
 695                                htb_add_to_wait_tree(q, cl, diff);
 696                }
 697
 698                /* update basic stats except for leaves which are already updated */
 699                if (cl->level)
 700                        bstats_update(&cl->bstats, skb);
 701
 702                cl = cl->parent;
 703        }
 704}
 705
 706/**
 707 * htb_do_events - make mode changes to classes at the level
 708 *
  709 * Scans the event queue for pending events and applies them. Returns the time
  710 * of the next pending event (0 if there is no event in the pq, q->now if there
  711 * are too many events). Note: Applied are events that have cl->pq_key <= q->now.
 712 */
 713static s64 htb_do_events(struct htb_sched *q, const int level,
 714                         unsigned long start)
 715{
 716        /* don't run for longer than 2 jiffies; 2 is used instead of
 717         * 1 to simplify things when jiffy is going to be incremented
 718         * too soon
 719         */
 720        unsigned long stop_at = start + 2;
 721        struct rb_root *wait_pq = &q->hlevel[level].wait_pq;
 722
 723        while (time_before(jiffies, stop_at)) {
 724                struct htb_class *cl;
 725                s64 diff;
 726                struct rb_node *p = rb_first(wait_pq);
 727
 728                if (!p)
 729                        return 0;
 730
 731                cl = rb_entry(p, struct htb_class, pq_node);
 732                if (cl->pq_key > q->now)
 733                        return cl->pq_key;
 734
 735                htb_safe_rb_erase(p, wait_pq);
 736                diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
 737                htb_change_class_mode(q, cl, &diff);
 738                if (cl->cmode != HTB_CAN_SEND)
 739                        htb_add_to_wait_tree(q, cl, diff);
 740        }
 741
 742        /* too much load - let's continue after a break for scheduling */
 743        if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) {
 744                pr_warn("htb: too many events!\n");
 745                q->warned |= HTB_WARN_TOOMANYEVENTS;
 746        }
 747
 748        return q->now;
 749}
 750
  751/* Returns class->node+prio from the id-tree where the class's id is >= id.
  752 * NULL if no such one exists.
 753 */
 754static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
 755                                              u32 id)
 756{
 757        struct rb_node *r = NULL;
 758        while (n) {
 759                struct htb_class *cl =
 760                    rb_entry(n, struct htb_class, node[prio]);
 761
 762                if (id > cl->common.classid) {
 763                        n = n->rb_right;
 764                } else if (id < cl->common.classid) {
 765                        r = n;
 766                        n = n->rb_left;
 767                } else {
 768                        return n;
 769                }
 770        }
 771        return r;
 772}
 773
 774/**
 775 * htb_lookup_leaf - returns next leaf class in DRR order
 776 *
  777 * Find the leaf that the current feed pointer points to.
 778 */
 779static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio)
 780{
 781        int i;
 782        struct {
 783                struct rb_node *root;
 784                struct rb_node **pptr;
 785                u32 *pid;
 786        } stk[TC_HTB_MAXDEPTH], *sp = stk;
 787
 788        BUG_ON(!hprio->row.rb_node);
 789        sp->root = hprio->row.rb_node;
 790        sp->pptr = &hprio->ptr;
 791        sp->pid = &hprio->last_ptr_id;
 792
 793        for (i = 0; i < 65535; i++) {
 794                if (!*sp->pptr && *sp->pid) {
 795                        /* ptr was invalidated but id is valid - try to recover
 796                         * the original or next ptr
 797                         */
 798                        *sp->pptr =
 799                            htb_id_find_next_upper(prio, sp->root, *sp->pid);
 800                }
  801                *sp->pid = 0;   /* ptr is valid now, so remove this hint as it
 802                                 * can become out of date quickly
 803                                 */
 804                if (!*sp->pptr) {       /* we are at right end; rewind & go up */
 805                        *sp->pptr = sp->root;
 806                        while ((*sp->pptr)->rb_left)
 807                                *sp->pptr = (*sp->pptr)->rb_left;
 808                        if (sp > stk) {
 809                                sp--;
 810                                if (!*sp->pptr) {
 811                                        WARN_ON(1);
 812                                        return NULL;
 813                                }
 814                                htb_next_rb_node(sp->pptr);
 815                        }
 816                } else {
 817                        struct htb_class *cl;
 818                        struct htb_prio *clp;
 819
 820                        cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
 821                        if (!cl->level)
 822                                return cl;
 823                        clp = &cl->un.inner.clprio[prio];
 824                        (++sp)->root = clp->feed.rb_node;
 825                        sp->pptr = &clp->ptr;
 826                        sp->pid = &clp->last_ptr_id;
 827                }
 828        }
 829        WARN_ON(1);
 830        return NULL;
 831}
 832
 833/* dequeues packet at given priority and level; call only if
 834 * you are sure that there is active class at prio/level
 835 */
 836static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, const int prio,
 837                                        const int level)
 838{
 839        struct sk_buff *skb = NULL;
 840        struct htb_class *cl, *start;
 841        struct htb_level *hlevel = &q->hlevel[level];
 842        struct htb_prio *hprio = &hlevel->hprio[prio];
 843
 844        /* look initial class up in the row */
 845        start = cl = htb_lookup_leaf(hprio, prio);
 846
 847        do {
 848next:
 849                if (unlikely(!cl))
 850                        return NULL;
 851
  852                /* the class can be empty - it is unlikely but can be true if the
  853                 * leaf qdisc drops packets in the enqueue routine or if someone
  854                 * used the graft operation on the leaf since the last dequeue;
  855                 * simply deactivate and skip such a class
 856                 */
 857                if (unlikely(cl->un.leaf.q->q.qlen == 0)) {
 858                        struct htb_class *next;
 859                        htb_deactivate(q, cl);
 860
 861                        /* row/level might become empty */
 862                        if ((q->row_mask[level] & (1 << prio)) == 0)
 863                                return NULL;
 864
 865                        next = htb_lookup_leaf(hprio, prio);
 866
 867                        if (cl == start)        /* fix start if we just deleted it */
 868                                start = next;
 869                        cl = next;
 870                        goto next;
 871                }
 872
 873                skb = cl->un.leaf.q->dequeue(cl->un.leaf.q);
 874                if (likely(skb != NULL))
 875                        break;
 876
 877                qdisc_warn_nonwc("htb", cl->un.leaf.q);
 878                htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr:
 879                                         &q->hlevel[0].hprio[prio].ptr);
 880                cl = htb_lookup_leaf(hprio, prio);
 881
 882        } while (cl != start);
 883
 884        if (likely(skb != NULL)) {
 885                bstats_update(&cl->bstats, skb);
 886                cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb);
 887                if (cl->un.leaf.deficit[level] < 0) {
 888                        cl->un.leaf.deficit[level] += cl->quantum;
 889                        htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr :
 890                                                 &q->hlevel[0].hprio[prio].ptr);
 891                }
  892                /* this used to be after charge_class but this ordering
 893                 * gives us slightly better performance
 894                 */
 895                if (!cl->un.leaf.q->q.qlen)
 896                        htb_deactivate(q, cl);
 897                htb_charge_class(q, cl, level, skb);
 898        }
 899        return skb;
 900}
 901
 902static struct sk_buff *htb_dequeue(struct Qdisc *sch)
 903{
 904        struct sk_buff *skb;
 905        struct htb_sched *q = qdisc_priv(sch);
 906        int level;
 907        s64 next_event;
 908        unsigned long start_at;
 909
 910        /* try to dequeue direct packets as high prio (!) to minimize cpu work */
 911        skb = __qdisc_dequeue_head(&q->direct_queue);
 912        if (skb != NULL) {
 913ok:
 914                qdisc_bstats_update(sch, skb);
 915                qdisc_qstats_backlog_dec(sch, skb);
 916                sch->q.qlen--;
 917                return skb;
 918        }
 919
 920        if (!sch->q.qlen)
 921                goto fin;
 922        q->now = ktime_get_ns();
 923        start_at = jiffies;
 924
 925        next_event = q->now + 5LLU * NSEC_PER_SEC;
 926
 927        for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
 928                /* common case optimization - skip event handler quickly */
 929                int m;
 930                s64 event = q->near_ev_cache[level];
 931
 932                if (q->now >= event) {
 933                        event = htb_do_events(q, level, start_at);
 934                        if (!event)
 935                                event = q->now + NSEC_PER_SEC;
 936                        q->near_ev_cache[level] = event;
 937                }
 938
 939                if (next_event > event)
 940                        next_event = event;
 941
 942                m = ~q->row_mask[level];
 943                while (m != (int)(-1)) {
 944                        int prio = ffz(m);
 945
 946                        m |= 1 << prio;
 947                        skb = htb_dequeue_tree(q, prio, level);
 948                        if (likely(skb != NULL))
 949                                goto ok;
 950                }
 951        }
 952        qdisc_qstats_overlimit(sch);
 953        if (likely(next_event > q->now))
 954                qdisc_watchdog_schedule_ns(&q->watchdog, next_event);
 955        else
 956                schedule_work(&q->work);
 957fin:
 958        return skb;
 959}
 960
 961/* reset all classes */
  962/* always called under BH & queue lock */
 963static void htb_reset(struct Qdisc *sch)
 964{
 965        struct htb_sched *q = qdisc_priv(sch);
 966        struct htb_class *cl;
 967        unsigned int i;
 968
 969        for (i = 0; i < q->clhash.hashsize; i++) {
 970                hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
 971                        if (cl->level)
 972                                memset(&cl->un.inner, 0, sizeof(cl->un.inner));
 973                        else {
 974                                if (cl->un.leaf.q)
 975                                        qdisc_reset(cl->un.leaf.q);
 976                                INIT_LIST_HEAD(&cl->un.leaf.drop_list);
 977                        }
 978                        cl->prio_activity = 0;
 979                        cl->cmode = HTB_CAN_SEND;
 980                }
 981        }
 982        qdisc_watchdog_cancel(&q->watchdog);
 983        __qdisc_reset_queue(&q->direct_queue);
 984        sch->q.qlen = 0;
 985        sch->qstats.backlog = 0;
 986        memset(q->hlevel, 0, sizeof(q->hlevel));
 987        memset(q->row_mask, 0, sizeof(q->row_mask));
 988        for (i = 0; i < TC_HTB_NUMPRIO; i++)
 989                INIT_LIST_HEAD(q->drops + i);
 990}
 991
 992static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
 993        [TCA_HTB_PARMS] = { .len = sizeof(struct tc_htb_opt) },
 994        [TCA_HTB_INIT]  = { .len = sizeof(struct tc_htb_glob) },
 995        [TCA_HTB_CTAB]  = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
 996        [TCA_HTB_RTAB]  = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
 997        [TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 },
 998        [TCA_HTB_RATE64] = { .type = NLA_U64 },
 999        [TCA_HTB_CEIL64] = { .type = NLA_U64 },
1000};
1001
1002static void htb_work_func(struct work_struct *work)
1003{
1004        struct htb_sched *q = container_of(work, struct htb_sched, work);
1005        struct Qdisc *sch = q->watchdog.qdisc;
1006
1007        rcu_read_lock();
1008        __netif_schedule(qdisc_root(sch));
1009        rcu_read_unlock();
1010}
1011
1012static int htb_init(struct Qdisc *sch, struct nlattr *opt)
1013{
1014        struct htb_sched *q = qdisc_priv(sch);
1015        struct nlattr *tb[TCA_HTB_MAX + 1];
1016        struct tc_htb_glob *gopt;
1017        int err;
1018        int i;
1019
1020        qdisc_watchdog_init(&q->watchdog, sch);
1021        INIT_WORK(&q->work, htb_work_func);
1022
1023        if (!opt)
1024                return -EINVAL;
1025
1026        err = tcf_block_get(&q->block, &q->filter_list);
1027        if (err)
1028                return err;
1029
1030        err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy, NULL);
1031        if (err < 0)
1032                return err;
1033
1034        if (!tb[TCA_HTB_INIT])
1035                return -EINVAL;
1036
1037        gopt = nla_data(tb[TCA_HTB_INIT]);
1038        if (gopt->version != HTB_VER >> 16)
1039                return -EINVAL;
1040
1041        err = qdisc_class_hash_init(&q->clhash);
1042        if (err < 0)
1043                return err;
1044        for (i = 0; i < TC_HTB_NUMPRIO; i++)
1045                INIT_LIST_HEAD(q->drops + i);
1046
1047        qdisc_skb_head_init(&q->direct_queue);
1048
1049        if (tb[TCA_HTB_DIRECT_QLEN])
1050                q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]);
1051        else
1052                q->direct_qlen = qdisc_dev(sch)->tx_queue_len;
1053
1054        if ((q->rate2quantum = gopt->rate2quantum) < 1)
1055                q->rate2quantum = 1;
1056        q->defcls = gopt->defcls;
1057
1058        return 0;
1059}
1060
1061static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
1062{
1063        struct htb_sched *q = qdisc_priv(sch);
1064        struct nlattr *nest;
1065        struct tc_htb_glob gopt;
1066
1067        /* It's safe not to acquire the qdisc lock. As we hold RTNL,
1068         * no change can happen to the qdisc parameters.
1069         */
1070
1071        gopt.direct_pkts = q->direct_pkts;
1072        gopt.version = HTB_VER;
1073        gopt.rate2quantum = q->rate2quantum;
1074        gopt.defcls = q->defcls;
1075        gopt.debug = 0;
1076
1077        nest = nla_nest_start(skb, TCA_OPTIONS);
1078        if (nest == NULL)
1079                goto nla_put_failure;
1080        if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt) ||
1081            nla_put_u32(skb, TCA_HTB_DIRECT_QLEN, q->direct_qlen))
1082                goto nla_put_failure;
1083
1084        return nla_nest_end(skb, nest);
1085
1086nla_put_failure:
1087        nla_nest_cancel(skb, nest);
1088        return -1;
1089}
1090
1091static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
1092                          struct sk_buff *skb, struct tcmsg *tcm)
1093{
1094        struct htb_class *cl = (struct htb_class *)arg;
1095        struct nlattr *nest;
1096        struct tc_htb_opt opt;
1097
1098        /* It's safe not to acquire the qdisc lock. As we hold RTNL,
1099         * no change can happen to the class parameters.
1100         */
1101        tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;
1102        tcm->tcm_handle = cl->common.classid;
1103        if (!cl->level && cl->un.leaf.q)
1104                tcm->tcm_info = cl->un.leaf.q->handle;
1105
1106        nest = nla_nest_start(skb, TCA_OPTIONS);
1107        if (nest == NULL)
1108                goto nla_put_failure;
1109
1110        memset(&opt, 0, sizeof(opt));
1111
1112        psched_ratecfg_getrate(&opt.rate, &cl->rate);
1113        opt.buffer = PSCHED_NS2TICKS(cl->buffer);
1114        psched_ratecfg_getrate(&opt.ceil, &cl->ceil);
1115        opt.cbuffer = PSCHED_NS2TICKS(cl->cbuffer);
1116        opt.quantum = cl->quantum;
1117        opt.prio = cl->prio;
1118        opt.level = cl->level;
1119        if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt))
1120                goto nla_put_failure;
1121        if ((cl->rate.rate_bytes_ps >= (1ULL << 32)) &&
1122            nla_put_u64_64bit(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps,
1123                              TCA_HTB_PAD))
1124                goto nla_put_failure;
1125        if ((cl->ceil.rate_bytes_ps >= (1ULL << 32)) &&
1126            nla_put_u64_64bit(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps,
1127                              TCA_HTB_PAD))
1128                goto nla_put_failure;
1129
1130        return nla_nest_end(skb, nest);
1131
1132nla_put_failure:
1133        nla_nest_cancel(skb, nest);
1134        return -1;
1135}
1136
1137static int
1138htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
1139{
1140        struct htb_class *cl = (struct htb_class *)arg;
1141        struct gnet_stats_queue qs = {
1142                .drops = cl->drops,
1143        };
1144        __u32 qlen = 0;
1145
1146        if (!cl->level && cl->un.leaf.q) {
1147                qlen = cl->un.leaf.q->q.qlen;
1148                qs.backlog = cl->un.leaf.q->qstats.backlog;
1149        }
1150        cl->xstats.tokens = clamp_t(s64, PSCHED_NS2TICKS(cl->tokens),
1151                                    INT_MIN, INT_MAX);
1152        cl->xstats.ctokens = clamp_t(s64, PSCHED_NS2TICKS(cl->ctokens),
1153                                     INT_MIN, INT_MAX);
1154
1155        if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
1156                                  d, NULL, &cl->bstats) < 0 ||
1157            gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1158            gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0)
1159                return -1;
1160
1161        return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1162}
1163
1164static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1165                     struct Qdisc **old)
1166{
1167        struct htb_class *cl = (struct htb_class *)arg;
1168
1169        if (cl->level)
1170                return -EINVAL;
1171        if (new == NULL &&
1172            (new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1173                                     cl->common.classid)) == NULL)
1174                return -ENOBUFS;
1175
1176        *old = qdisc_replace(sch, new, &cl->un.leaf.q);
1177        return 0;
1178}
1179
1180static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
1181{
1182        struct htb_class *cl = (struct htb_class *)arg;
1183        return !cl->level ? cl->un.leaf.q : NULL;
1184}
1185
1186static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
1187{
1188        struct htb_class *cl = (struct htb_class *)arg;
1189
1190        if (cl->un.leaf.q->q.qlen == 0)
1191                htb_deactivate(qdisc_priv(sch), cl);
1192}
1193
1194static unsigned long htb_get(struct Qdisc *sch, u32 classid)
1195{
1196        struct htb_class *cl = htb_find(classid, sch);
1197        if (cl)
1198                cl->refcnt++;
1199        return (unsigned long)cl;
1200}
1201
1202static inline int htb_parent_last_child(struct htb_class *cl)
1203{
1204        if (!cl->parent)
1205                /* the root class */
1206                return 0;
1207        if (cl->parent->children > 1)
1208                /* not the last child */
1209                return 0;
1210        return 1;
1211}
1212
1213static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
1214                               struct Qdisc *new_q)
1215{
1216        struct htb_class *parent = cl->parent;
1217
1218        WARN_ON(cl->level || !cl->un.leaf.q || cl->prio_activity);
1219
1220        if (parent->cmode != HTB_CAN_SEND)
1221                htb_safe_rb_erase(&parent->pq_node,
1222                                  &q->hlevel[parent->level].wait_pq);
1223
1224        parent->level = 0;
1225        memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1226        INIT_LIST_HEAD(&parent->un.leaf.drop_list);
1227        parent->un.leaf.q = new_q ? new_q : &noop_qdisc;
1228        parent->tokens = parent->buffer;
1229        parent->ctokens = parent->cbuffer;
1230        parent->t_c = ktime_get_ns();
1231        parent->cmode = HTB_CAN_SEND;
1232}
1233
1234static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
1235{
1236        if (!cl->level) {
1237                WARN_ON(!cl->un.leaf.q);
1238                qdisc_destroy(cl->un.leaf.q);
1239        }
1240        gen_kill_estimator(&cl->rate_est);
1241        tcf_block_put(cl->block);
1242        kfree(cl);
1243}
1244
1245static void htb_destroy(struct Qdisc *sch)
1246{
1247        struct htb_sched *q = qdisc_priv(sch);
1248        struct hlist_node *next;
1249        struct htb_class *cl;
1250        unsigned int i;
1251
1252        cancel_work_sync(&q->work);
1253        qdisc_watchdog_cancel(&q->watchdog);
1254        /* This line used to be after htb_destroy_class call below
1255         * and surprisingly it worked in 2.4. But it must precede it
1256         * because filters need their target class alive to be able to call
1257         * unbind_filter on it (without an Oops).
1258         */
1259        tcf_block_put(q->block);
1260
1261        for (i = 0; i < q->clhash.hashsize; i++) {
1262                hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
1263                        tcf_block_put(cl->block);
1264                        cl->block = NULL;
1265                }
1266        }
1267        for (i = 0; i < q->clhash.hashsize; i++) {
1268                hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
1269                                          common.hnode)
1270                        htb_destroy_class(sch, cl);
1271        }
1272        qdisc_class_hash_destroy(&q->clhash);
1273        __qdisc_reset_queue(&q->direct_queue);
1274}
1275
1276static int htb_delete(struct Qdisc *sch, unsigned long arg)
1277{
1278        struct htb_sched *q = qdisc_priv(sch);
1279        struct htb_class *cl = (struct htb_class *)arg;
1280        struct Qdisc *new_q = NULL;
1281        int last_child = 0;
1282
1283        /* TODO: why not allow deleting a subtree? references? does the
1284         * tc subsystem guarantee us that in htb_destroy it holds no class
1285         * refs so that we can remove children safely there?
1286         */
1287        if (cl->children || cl->filter_cnt)
1288                return -EBUSY;
1289
1290        if (!cl->level && htb_parent_last_child(cl)) {
1291                new_q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1292                                          cl->parent->common.classid);
1293                last_child = 1;
1294        }
1295
1296        sch_tree_lock(sch);
1297
1298        if (!cl->level) {
1299                unsigned int qlen = cl->un.leaf.q->q.qlen;
1300                unsigned int backlog = cl->un.leaf.q->qstats.backlog;
1301
1302                qdisc_reset(cl->un.leaf.q);
1303                qdisc_tree_reduce_backlog(cl->un.leaf.q, qlen, backlog);
1304        }
1305
1306        /* delete from hash and active; remainder in destroy_class */
1307        qdisc_class_hash_remove(&q->clhash, &cl->common);
1308        if (cl->parent)
1309                cl->parent->children--;
1310
1311        if (cl->prio_activity)
1312                htb_deactivate(q, cl);
1313
1314        if (cl->cmode != HTB_CAN_SEND)
1315                htb_safe_rb_erase(&cl->pq_node,
1316                                  &q->hlevel[cl->level].wait_pq);
1317
1318        if (last_child)
1319                htb_parent_to_leaf(q, cl, new_q);
1320
1321        BUG_ON(--cl->refcnt == 0);
1322        /*
1323         * This shouldn't happen: we "hold" one cops->get() when called
1324         * from tc_ctl_tclass; the destroy method is done from cops->put().
1325         */
1326
1327        sch_tree_unlock(sch);
1328        return 0;
1329}
1330
1331static void htb_put(struct Qdisc *sch, unsigned long arg)
1332{
1333        struct htb_class *cl = (struct htb_class *)arg;
1334
1335        if (--cl->refcnt == 0)
1336                htb_destroy_class(sch, cl);
1337}
1338
1339static int htb_change_class(struct Qdisc *sch, u32 classid,
1340                            u32 parentid, struct nlattr **tca,
1341                            unsigned long *arg)
1342{
1343        int err = -EINVAL;
1344        struct htb_sched *q = qdisc_priv(sch);
1345        struct htb_class *cl = (struct htb_class *)*arg, *parent;
1346        struct nlattr *opt = tca[TCA_OPTIONS];
1347        struct nlattr *tb[TCA_HTB_MAX + 1];
1348        struct tc_htb_opt *hopt;
1349        u64 rate64, ceil64;
1350
1351        /* extract all subattrs from opt attr */
1352        if (!opt)
1353                goto failure;
1354
1355        err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy, NULL);
1356        if (err < 0)
1357                goto failure;
1358
1359        err = -EINVAL;
1360        if (tb[TCA_HTB_PARMS] == NULL)
1361                goto failure;
1362
1363        parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);
1364
1365        hopt = nla_data(tb[TCA_HTB_PARMS]);
1366        if (!hopt->rate.rate || !hopt->ceil.rate)
1367                goto failure;
1368
1369        /* Keep backward compatibility with rate_table-based iproute2 tc */
1370        if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
1371                qdisc_put_rtab(qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]));
1372
1373        if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE)
1374                qdisc_put_rtab(qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]));
1375
1376        if (!cl) {              /* new class */
1377                struct Qdisc *new_q;
1378                int prio;
1379                struct {
1380                        struct nlattr           nla;
1381                        struct gnet_estimator   opt;
1382                } est = {
1383                        .nla = {
1384                                .nla_len        = nla_attr_size(sizeof(est.opt)),
1385                                .nla_type       = TCA_RATE,
1386                        },
1387                        .opt = {
1388                                /* 4s interval, 16s averaging constant */
1389                                .interval       = 2,
1390                                .ewma_log       = 2,
1391                        },
1392                };
1393
1394                /* check for valid classid */
1395                if (!classid || TC_H_MAJ(classid ^ sch->handle) ||
1396                    htb_find(classid, sch))
1397                        goto failure;
1398
1399                /* check maximal depth */
1400                if (parent && parent->parent && parent->parent->level < 2) {
1401                        pr_err("htb: tree is too deep\n");
1402                        goto failure;
1403                }
1404                err = -ENOBUFS;
1405                cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1406                if (!cl)
1407                        goto failure;
1408
1409                err = tcf_block_get(&cl->block, &cl->filter_list);
1410                if (err) {
1411                        kfree(cl);
1412                        goto failure;
1413                }
1414                if (htb_rate_est || tca[TCA_RATE]) {
1415                        err = gen_new_estimator(&cl->bstats, NULL,
1416                                                &cl->rate_est,
1417                                                NULL,
1418                                                qdisc_root_sleeping_running(sch),
1419                                                tca[TCA_RATE] ? : &est.nla);
1420                        if (err) {
1421                                tcf_block_put(cl->block);
1422                                kfree(cl);
1423                                goto failure;
1424                        }
1425                }
1426
1427                cl->refcnt = 1;
1428                cl->children = 0;
1429                INIT_LIST_HEAD(&cl->un.leaf.drop_list);
1430                RB_CLEAR_NODE(&cl->pq_node);
1431
1432                for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
1433                        RB_CLEAR_NODE(&cl->node[prio]);
1434
1435                /* create leaf qdisc early because it uses kmalloc(GFP_KERNEL)
1436                 * so it can't be used inside of sch_tree_lock
1437                 * -- thanks to Karlis Peisenieks
1438                 */
1439                new_q = qdisc_create_dflt(sch->dev_queue,
1440                                          &pfifo_qdisc_ops, classid);
1441                sch_tree_lock(sch);
1442                if (parent && !parent->level) {
1443                        unsigned int qlen = parent->un.leaf.q->q.qlen;
1444                        unsigned int backlog = parent->un.leaf.q->qstats.backlog;
1445
1446                        /* turn parent into inner node */
1447                        qdisc_reset(parent->un.leaf.q);
1448                        qdisc_tree_reduce_backlog(parent->un.leaf.q, qlen, backlog);
1449                        qdisc_destroy(parent->un.leaf.q);
1450                        if (parent->prio_activity)
1451                                htb_deactivate(q, parent);
1452
1453                        /* remove from evt list because of level change */
1454                        if (parent->cmode != HTB_CAN_SEND) {
1455                                htb_safe_rb_erase(&parent->pq_node, &q->hlevel[0].wait_pq);
1456                                parent->cmode = HTB_CAN_SEND;
1457                        }
1458                        parent->level = (parent->parent ? parent->parent->level
1459                                         : TC_HTB_MAXDEPTH) - 1;
1460                        memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1461                }
1462                /* the new leaf (this class) needs an elementary qdisc */
1463                cl->un.leaf.q = new_q ? new_q : &noop_qdisc;
1464
1465                cl->common.classid = classid;
1466                cl->parent = parent;
1467
1468                /* set class to be in HTB_CAN_SEND state */
1469                cl->tokens = PSCHED_TICKS2NS(hopt->buffer);
1470                cl->ctokens = PSCHED_TICKS2NS(hopt->cbuffer);
1471                cl->mbuffer = 60ULL * NSEC_PER_SEC;     /* 1min */
1472                cl->t_c = ktime_get_ns();
1473                cl->cmode = HTB_CAN_SEND;
1474
1475                /* attach to the hash list and parent's family */
1476                qdisc_class_hash_insert(&q->clhash, &cl->common);
1477                if (parent)
1478                        parent->children++;
1479                if (cl->un.leaf.q != &noop_qdisc)
1480                        qdisc_hash_add(cl->un.leaf.q, true);
1481        } else {
1482                if (tca[TCA_RATE]) {
1483                        err = gen_replace_estimator(&cl->bstats, NULL,
1484                                                    &cl->rate_est,
1485                                                    NULL,
1486                                                    qdisc_root_sleeping_running(sch),
1487                                                    tca[TCA_RATE]);
1488                        if (err)
1489                                return err;
1490                }
1491                sch_tree_lock(sch);
1492        }
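            /* TCA_HTB_RATE64/TCA_HTB_CEIL64 carry rates that do not fit in the
             * 32-bit bytes-per-second field of tc_ratespec, i.e. roughly
             * anything above 2^32 B/s ~= 34 Gbit/s.
             */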
1493
1494        rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0;
1495
1496        ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0;
1497
1498        psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64);
1499        psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64);
1500
1501        /* there used to be a nasty bug here: we have to check that the node
1502         * is really a leaf before changing cl->un.leaf!
1503         */
1504        if (!cl->level) {
1505                u64 quantum = cl->rate.rate_bytes_ps;
1506
1507                do_div(quantum, q->rate2quantum);
1508                cl->quantum = min_t(u64, quantum, INT_MAX);
1509
1510                if (!hopt->quantum && cl->quantum < 1000) {
1511                        pr_warn("HTB: quantum of class %X is small. Consider r2q change.\n",
1512                                cl->common.classid);
1513                        cl->quantum = 1000;
1514                }
1515                if (!hopt->quantum && cl->quantum > 200000) {
1516                        pr_warn("HTB: quantum of class %X is big. Consider r2q change.\n",
1517                                cl->common.classid);
1518                        cl->quantum = 200000;
1519                }
1520                if (hopt->quantum)
1521                        cl->quantum = hopt->quantum;
1522                if ((cl->prio = hopt->prio) >= TC_HTB_NUMPRIO)
1523                        cl->prio = TC_HTB_NUMPRIO - 1;
1524        }
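            /* A feel for the quantum selection above: a class rate of 100 Mbit/s
             * is 12,500,000 bytes/s; with tc's usual default r2q of 10 that gives
             * quantum = 1,250,000, which is clamped to 200000 (with a warning)
             * unless an explicit quantum was supplied.
             */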
1525
1526        cl->buffer = PSCHED_TICKS2NS(hopt->buffer);
1527        cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer);
1528
1529        sch_tree_unlock(sch);
1530
1531        qdisc_class_hash_grow(sch, &q->clhash);
1532
1533        *arg = (unsigned long)cl;
1534        return 0;
1535
1536failure:
1537        return err;
1538}
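    /* Sketch of how this path is typically reached from userspace (device name
     * and option values are illustrative only):
     *
     *   tc qdisc add dev eth0 root handle 1: htb default 20
     *   tc class add dev eth0 parent 1: classid 1:10 htb rate 5mbit ceil 10mbit
     *
     * The second command ends up in htb_change_class() with cl == NULL and
     * takes the "new class" branch above.
     */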
1539
1540static struct tcf_block *htb_tcf_block(struct Qdisc *sch, unsigned long arg)
1541{
1542        struct htb_sched *q = qdisc_priv(sch);
1543        struct htb_class *cl = (struct htb_class *)arg;
1544
1545        return cl ? cl->block : q->block;
1546}
1547
1548static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
1549                                     u32 classid)
1550{
1551        struct htb_class *cl = htb_find(classid, sch);
1552
1553        /*if (cl && !cl->level) return 0;
1554         * The line above used to be here to prevent attaching filters to
1555         * leaves. But at least the tc_index filter uses this just to look up
1556         * the class for other reasons, so we have to allow it.
1557         * ----
1558         * 19.6.2002 As Werner explained, this is OK - binding a filter is just
1559         * another way to "lock" the class - unlike "get", this lock can
1560         * be broken by the class during destroy, IIUC.
1561         */
1562        if (cl)
1563                cl->filter_cnt++;
1564        return (unsigned long)cl;
1565}
1566
1567static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
1568{
1569        struct htb_class *cl = (struct htb_class *)arg;
1570
1571        if (cl)
1572                cl->filter_cnt--;
1573}
1574
1575static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1576{
1577        struct htb_sched *q = qdisc_priv(sch);
1578        struct htb_class *cl;
1579        unsigned int i;
1580
1581        if (arg->stop)
1582                return;
1583
1584        for (i = 0; i < q->clhash.hashsize; i++) {
1585                hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
1586                        if (arg->count < arg->skip) {
1587                                arg->count++;
1588                                continue;
1589                        }
1590                        if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1591                                arg->stop = 1;
1592                                return;
1593                        }
1594                        arg->count++;
1595                }
1596        }
1597}
1598
1599static const struct Qdisc_class_ops htb_class_ops = {
1600        .graft          =       htb_graft,
1601        .leaf           =       htb_leaf,
1602        .qlen_notify    =       htb_qlen_notify,
1603        .get            =       htb_get,
1604        .put            =       htb_put,
1605        .change         =       htb_change_class,
1606        .delete         =       htb_delete,
1607        .walk           =       htb_walk,
1608        .tcf_block      =       htb_tcf_block,
1609        .bind_tcf       =       htb_bind_filter,
1610        .unbind_tcf     =       htb_unbind_filter,
1611        .dump           =       htb_dump_class,
1612        .dump_stats     =       htb_dump_class_stats,
1613};
1614
1615static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
1616        .cl_ops         =       &htb_class_ops,
1617        .id             =       "htb",
1618        .priv_size      =       sizeof(struct htb_sched),
1619        .enqueue        =       htb_enqueue,
1620        .dequeue        =       htb_dequeue,
1621        .peek           =       qdisc_peek_dequeued,
1622        .init           =       htb_init,
1623        .reset          =       htb_reset,
1624        .destroy        =       htb_destroy,
1625        .dump           =       htb_dump,
1626        .owner          =       THIS_MODULE,
1627};
1628
1629static int __init htb_module_init(void)
1630{
1631        return register_qdisc(&htb_qdisc_ops);
1632}
1633static void __exit htb_module_exit(void)
1634{
1635        unregister_qdisc(&htb_qdisc_ops);
1636}
1637
1638module_init(htb_module_init)
1639module_exit(htb_module_exit)
1640MODULE_LICENSE("GPL");
1641