linux/net/sched/sch_htb.c
   1/*
   2 * net/sched/sch_htb.c  Hierarchical token bucket, feed tree version
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 * Authors:     Martin Devera, <devik@cdi.cz>
  10 *
  11 * Credits (in time order) for older HTB versions:
  12 *              Stef Coene <stef.coene@docum.org>
  13 *                      HTB support at LARTC mailing list
  14 *              Ondrej Kraus, <krauso@barr.cz>
  15 *                      found missing INIT_QDISC(htb)
  16 *              Vladimir Smelhaus, Aamer Akhter, Bert Hubert
  17 *                      helped a lot to locate nasty class stall bug
  18 *              Andi Kleen, Jamal Hadi, Bert Hubert
  19 *                      code review and helpful comments on shaping
  20 *              Tomasz Wrona, <tw@eter.tym.pl>
  21 *                      created test case so that I was able to fix nasty bug
  22 *              Wilfried Weissmann
  23 *                      spotted bug in dequeue code and helped with fix
  24 *              Jiri Fojtasek
  25 *                      fixed requeue routine
  26 *              and many others. thanks.
  27 */
  28#include <linux/module.h>
  29#include <linux/moduleparam.h>
  30#include <linux/types.h>
  31#include <linux/kernel.h>
  32#include <linux/string.h>
  33#include <linux/errno.h>
  34#include <linux/skbuff.h>
  35#include <linux/list.h>
  36#include <linux/compiler.h>
  37#include <linux/rbtree.h>
  38#include <linux/workqueue.h>
  39#include <linux/slab.h>
  40#include <net/netlink.h>
  41#include <net/sch_generic.h>
  42#include <net/pkt_sched.h>
  43
  44/* HTB algorithm.
  45    Author: devik@cdi.cz
  46    ========================================================================
  47    HTB is like TBF with multiple classes. It is also similar to CBQ because
  48    it allows assigning a priority to each class in the hierarchy.
  49    In fact it is another implementation of Floyd's formal sharing.
  50
  51    Levels:
  52    Each class is assigned a level. A leaf ALWAYS has level 0 and root
  53    classes have level TC_HTB_MAXDEPTH-1. Interior nodes have a level
  54    one less than their parent.
  55*/
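/* Illustration (hypothetical classids, assuming TC_HTB_MAXDEPTH == 8 as in
 * current pkt_sched.h): a top-level class 1:1 that has children sits at
 * level 7, an inner child 1:10 below it at level 6, while every leaf is at
 * level 0 regardless of how deep it hangs in the tree.
 */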
  56
  57static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */
  58#define HTB_VER 0x30011         /* major must match the version number supplied by TC */
  59
  60#if HTB_VER >> 16 != TC_HTB_PROTOVER
  61#error "Mismatched sch_htb.c and pkt_sch.h"
  62#endif
  63
  64/* Module parameter and sysfs export */
  65module_param    (htb_hysteresis, int, 0640);
  66MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate");
  67
  68static int htb_rate_est = 0; /* whether htb classes get a default rate estimator */
  69module_param(htb_rate_est, int, 0640);
  70MODULE_PARM_DESC(htb_rate_est, "set up a default rate estimator (4sec 16sec) for htb classes");
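/* Both parameters can be given at module load time or, because of the 0640
 * permissions above, changed at runtime via sysfs. Illustrative commands
 * (paths follow the usual module-parameter convention):
 *
 *   modprobe sch_htb htb_hysteresis=1
 *   echo 1 > /sys/module/sch_htb/parameters/htb_hysteresis
 */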
  71
  72/* used internally to keep the status of a single class */
  73enum htb_cmode {
  74        HTB_CANT_SEND,          /* class can't send and can't borrow */
  75        HTB_MAY_BORROW,         /* class can't send but may borrow */
  76        HTB_CAN_SEND            /* class can send */
  77};
  78
  79struct htb_prio {
  80        union {
  81                struct rb_root  row;
  82                struct rb_root  feed;
  83        };
  84        struct rb_node  *ptr;
  85        /* When a class changes from state 1->2 and disconnects from its
  86         * parent's feed, we lose the ptr value and have to start from the
  87         * first child again. Here we store the classid of the
  88         * last valid ptr (used when ptr is NULL).
  89         */
  90        u32             last_ptr_id;
  91};
  92
  93/* interior & leaf nodes; props specific to leaves are marked L:
  94 * To reduce false sharing, place mostly-read fields at the beginning
  95 * and mostly-written ones at the end.
  96 */
  97struct htb_class {
  98        struct Qdisc_class_common common;
  99        struct psched_ratecfg   rate;
 100        struct psched_ratecfg   ceil;
 101        s64                     buffer, cbuffer;/* token bucket depth/rate */
 102        s64                     mbuffer;        /* max wait time */
 103        u32                     prio;           /* these two are used only by leaves... */
 104        int                     quantum;        /* but stored for parent-to-leaf return */
 105
 106        struct tcf_proto __rcu  *filter_list;   /* class attached filters */
 107        int                     filter_cnt;
 108        int                     refcnt;         /* usage count of this class */
 109
 110        int                     level;          /* our level (see above) */
 111        unsigned int            children;
 112        struct htb_class        *parent;        /* parent class */
 113
 114        struct gnet_stats_rate_est64 rate_est;
 115
 116        /*
 117         * Often-written fields
 118         */
 119        struct gnet_stats_basic_packed bstats;
 120        struct gnet_stats_queue qstats;
 121        struct tc_htb_xstats    xstats; /* our special stats */
 122
 123        /* token bucket parameters */
 124        s64                     tokens, ctokens;/* current number of tokens */
 125        s64                     t_c;            /* checkpoint time */
 126
 127        union {
 128                struct htb_class_leaf {
 129                        struct list_head drop_list;
 130                        int             deficit[TC_HTB_MAXDEPTH];
 131                        struct Qdisc    *q;
 132                } leaf;
 133                struct htb_class_inner {
 134                        struct htb_prio clprio[TC_HTB_NUMPRIO];
 135                } inner;
 136        } un;
 137        s64                     pq_key;
 138
 139        int                     prio_activity;  /* for which prios are we active */
 140        enum htb_cmode          cmode;          /* current mode of the class */
 141        struct rb_node          pq_node;        /* node for event queue */
 142        struct rb_node          node[TC_HTB_NUMPRIO];   /* node for self or feed tree */
 143};
 144
 145struct htb_level {
 146        struct rb_root  wait_pq;
 147        struct htb_prio hprio[TC_HTB_NUMPRIO];
 148};
 149
 150struct htb_sched {
 151        struct Qdisc_class_hash clhash;
 152        int                     defcls;         /* class where unclassified flows go to */
 153        int                     rate2quantum;   /* quant = rate / rate2quantum */
 154
 155        /* filters for qdisc itself */
 156        struct tcf_proto __rcu  *filter_list;
 157
 158#define HTB_WARN_TOOMANYEVENTS  0x1
 159        unsigned int            warned; /* only one warning */
 160        int                     direct_qlen;
 161        struct work_struct      work;
 162
 163        /* non-shaped skbs; let them go directly through */
 164        struct sk_buff_head     direct_queue;
 165        long                    direct_pkts;
 166
 167        struct qdisc_watchdog   watchdog;
 168
 169        s64                     now;    /* cached dequeue time */
 170        struct list_head        drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */
 171
 172        /* time of nearest event per level (row) */
 173        s64                     near_ev_cache[TC_HTB_MAXDEPTH];
 174
 175        int                     row_mask[TC_HTB_MAXDEPTH];
 176
 177        struct htb_level        hlevel[TC_HTB_MAXDEPTH];
 178};
 179
 180/* find class in global hash table using given handle */
 181static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
 182{
 183        struct htb_sched *q = qdisc_priv(sch);
 184        struct Qdisc_class_common *clc;
 185
 186        clc = qdisc_class_find(&q->clhash, handle);
 187        if (clc == NULL)
 188                return NULL;
 189        return container_of(clc, struct htb_class, common);
 190}
 191
 192/**
 193 * htb_classify - classify a packet into class
 194 *
 195 * It returns NULL if the packet should be dropped or -1 (HTB_DIRECT) if the
 196 * packet should be passed directly through. In all other cases a leaf class
 197 * is returned. We allow direct class selection by a classid in skb->priority.
 198 * Then we examine filters in the qdisc and in inner nodes (if a higher filter
 199 * points to an inner node). If we end up with classid MAJOR:0 we enqueue the
 200 * skb into the special internal fifo (direct). These packets then go directly
 201 * through. If we still have no valid leaf we try to use the MAJOR:default
 202 * leaf. If that is still unsuccessful, we finish and return the direct queue.
 203 */
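/* Illustrative setup (assumed device and classids): after
 *   tc qdisc add dev eth0 root handle 1: htb default 20
 * a packet that matches no filter and carries no valid classid in
 * skb->priority is looked up as class 1:20; if that class does not exist or
 * is an inner class, the packet takes the direct queue and bypasses shaping.
 */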
 204#define HTB_DIRECT ((struct htb_class *)-1L)
 205
 206static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
 207                                      int *qerr)
 208{
 209        struct htb_sched *q = qdisc_priv(sch);
 210        struct htb_class *cl;
 211        struct tcf_result res;
 212        struct tcf_proto *tcf;
 213        int result;
 214
 215        /* allow selecting a class by setting skb->priority to a valid classid;
 216         * note that nfmark can be used too by attaching an fw filter with no
 217         * rules in it
 218         */
 219        if (skb->priority == sch->handle)
 220                return HTB_DIRECT;      /* X:0 (direct flow) selected */
 221        cl = htb_find(skb->priority, sch);
 222        if (cl) {
 223                if (cl->level == 0)
 224                        return cl;
 225                /* Start with inner filter chain if a non-leaf class is selected */
 226                tcf = rcu_dereference_bh(cl->filter_list);
 227        } else {
 228                tcf = rcu_dereference_bh(q->filter_list);
 229        }
 230
 231        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 232        while (tcf && (result = tc_classify(skb, tcf, &res, false)) >= 0) {
 233#ifdef CONFIG_NET_CLS_ACT
 234                switch (result) {
 235                case TC_ACT_QUEUED:
 236                case TC_ACT_STOLEN:
 237                        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
 238                case TC_ACT_SHOT:
 239                        return NULL;
 240                }
 241#endif
 242                cl = (void *)res.class;
 243                if (!cl) {
 244                        if (res.classid == sch->handle)
 245                                return HTB_DIRECT;      /* X:0 (direct flow) */
 246                        cl = htb_find(res.classid, sch);
 247                        if (!cl)
 248                                break;  /* filter selected invalid classid */
 249                }
 250                if (!cl->level)
 251                        return cl;      /* we hit leaf; return it */
 252
 253                /* we have got inner class; apply inner filter chain */
 254                tcf = rcu_dereference_bh(cl->filter_list);
 255        }
 256        /* classification failed; try to use default class */
 257        cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
 258        if (!cl || cl->level)
 259                return HTB_DIRECT;      /* bad default .. this is safe bet */
 260        return cl;
 261}
 262
 263/**
 264 * htb_add_to_id_tree - adds class to the round robin list
 265 *
 266 * The routine adds the class to the list (actually a tree) sorted by classid.
 267 * Make sure that the class is not already on such a list for the given prio.
 268 */
 269static void htb_add_to_id_tree(struct rb_root *root,
 270                               struct htb_class *cl, int prio)
 271{
 272        struct rb_node **p = &root->rb_node, *parent = NULL;
 273
 274        while (*p) {
 275                struct htb_class *c;
 276                parent = *p;
 277                c = rb_entry(parent, struct htb_class, node[prio]);
 278
 279                if (cl->common.classid > c->common.classid)
 280                        p = &parent->rb_right;
 281                else
 282                        p = &parent->rb_left;
 283        }
 284        rb_link_node(&cl->node[prio], parent, p);
 285        rb_insert_color(&cl->node[prio], root);
 286}
 287
 288/**
 289 * htb_add_to_wait_tree - adds class to the event queue with delay
 290 *
 291 * The class is added to the priority event queue to indicate that the class
 292 * will change its mode at time cl->pq_key (nanoseconds). Make sure that the
 293 * class is not already in the queue.
 294 */
 295static void htb_add_to_wait_tree(struct htb_sched *q,
 296                                 struct htb_class *cl, s64 delay)
 297{
 298        struct rb_node **p = &q->hlevel[cl->level].wait_pq.rb_node, *parent = NULL;
 299
 300        cl->pq_key = q->now + delay;
 301        if (cl->pq_key == q->now)
 302                cl->pq_key++;
 303
 304        /* update the nearest event cache */
 305        if (q->near_ev_cache[cl->level] > cl->pq_key)
 306                q->near_ev_cache[cl->level] = cl->pq_key;
 307
 308        while (*p) {
 309                struct htb_class *c;
 310                parent = *p;
 311                c = rb_entry(parent, struct htb_class, pq_node);
 312                if (cl->pq_key >= c->pq_key)
 313                        p = &parent->rb_right;
 314                else
 315                        p = &parent->rb_left;
 316        }
 317        rb_link_node(&cl->pq_node, parent, p);
 318        rb_insert_color(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
 319}
 320
 321/**
 322 * htb_next_rb_node - advances *n to the next node in the rb-tree
 323 *
 324 * When we are past the last key, *n becomes NULL.
 325 * Average complexity is 2 steps per call.
 326 */
 327static inline void htb_next_rb_node(struct rb_node **n)
 328{
 329        *n = rb_next(*n);
 330}
 331
 332/**
 333 * htb_add_class_to_row - add class to its row
 334 *
 335 * The class is added to row at priorities marked in mask.
 336 * It does nothing if mask == 0.
 337 */
 338static inline void htb_add_class_to_row(struct htb_sched *q,
 339                                        struct htb_class *cl, int mask)
 340{
 341        q->row_mask[cl->level] |= mask;
 342        while (mask) {
 343                int prio = ffz(~mask);
 344                mask &= ~(1 << prio);
 345                htb_add_to_id_tree(&q->hlevel[cl->level].hprio[prio].row, cl, prio);
 346        }
 347}
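/* The ffz(~mask) idiom above extracts the index of the lowest set bit of
 * mask: e.g. for mask 0x5 it yields prio 0 and, once that bit is cleared,
 * prio 2. The same walk is used by the other mask loops below.
 */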
 348
 349/* If this triggers, it is a bug in this code, but it need not be fatal */
 350static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root)
 351{
 352        if (RB_EMPTY_NODE(rb)) {
 353                WARN_ON(1);
 354        } else {
 355                rb_erase(rb, root);
 356                RB_CLEAR_NODE(rb);
 357        }
 358}
 359
 360
 361/**
 362 * htb_remove_class_from_row - removes class from its row
 363 *
 364 * The class is removed from row at priorities marked in mask.
 365 * It does nothing if mask == 0.
 366 */
 367static inline void htb_remove_class_from_row(struct htb_sched *q,
 368                                                 struct htb_class *cl, int mask)
 369{
 370        int m = 0;
 371        struct htb_level *hlevel = &q->hlevel[cl->level];
 372
 373        while (mask) {
 374                int prio = ffz(~mask);
 375                struct htb_prio *hprio = &hlevel->hprio[prio];
 376
 377                mask &= ~(1 << prio);
 378                if (hprio->ptr == cl->node + prio)
 379                        htb_next_rb_node(&hprio->ptr);
 380
 381                htb_safe_rb_erase(cl->node + prio, &hprio->row);
 382                if (!hprio->row.rb_node)
 383                        m |= 1 << prio;
 384        }
 385        q->row_mask[cl->level] &= ~m;
 386}
 387
 388/**
 389 * htb_activate_prios - creates the active class's feed chain
 390 *
 391 * The class is connected to its ancestors and/or the appropriate rows
 392 * for the priorities it participates in. cl->cmode must be the new
 393 * (activated) mode. It does nothing if cl->prio_activity == 0.
 394 */
 395static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
 396{
 397        struct htb_class *p = cl->parent;
 398        long m, mask = cl->prio_activity;
 399
 400        while (cl->cmode == HTB_MAY_BORROW && p && mask) {
 401                m = mask;
 402                while (m) {
 403                        int prio = ffz(~m);
 404                        m &= ~(1 << prio);
 405
 406                        if (p->un.inner.clprio[prio].feed.rb_node)
 407                                /* parent already has its feed in use, so
 408                                 * clear the bit in mask; the parent is already ok
 409                                 */
 410                                mask &= ~(1 << prio);
 411
 412                        htb_add_to_id_tree(&p->un.inner.clprio[prio].feed, cl, prio);
 413                }
 414                p->prio_activity |= mask;
 415                cl = p;
 416                p = cl->parent;
 417
 418        }
 419        if (cl->cmode == HTB_CAN_SEND && mask)
 420                htb_add_class_to_row(q, cl, mask);
 421}
 422
 423/**
 424 * htb_deactivate_prios - remove class from feed chain
 425 *
 426 * cl->cmode must represent old mode (before deactivation). It does
 427 * nothing if cl->prio_activity == 0. Class is removed from all feed
 428 * chains and rows.
 429 */
 430static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
 431{
 432        struct htb_class *p = cl->parent;
 433        long m, mask = cl->prio_activity;
 434
 435        while (cl->cmode == HTB_MAY_BORROW && p && mask) {
 436                m = mask;
 437                mask = 0;
 438                while (m) {
 439                        int prio = ffz(~m);
 440                        m &= ~(1 << prio);
 441
 442                        if (p->un.inner.clprio[prio].ptr == cl->node + prio) {
 443                                /* we are removing a child that the parent's
 444                                 * feed points to - forget the pointer but
 445                                 * remember the classid
 446                                 */
 447                                p->un.inner.clprio[prio].last_ptr_id = cl->common.classid;
 448                                p->un.inner.clprio[prio].ptr = NULL;
 449                        }
 450
 451                        htb_safe_rb_erase(cl->node + prio,
 452                                          &p->un.inner.clprio[prio].feed);
 453
 454                        if (!p->un.inner.clprio[prio].feed.rb_node)
 455                                mask |= 1 << prio;
 456                }
 457
 458                p->prio_activity &= ~mask;
 459                cl = p;
 460                p = cl->parent;
 461
 462        }
 463        if (cl->cmode == HTB_CAN_SEND && mask)
 464                htb_remove_class_from_row(q, cl, mask);
 465}
 466
 467static inline s64 htb_lowater(const struct htb_class *cl)
 468{
 469        if (htb_hysteresis)
 470                return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
 471        else
 472                return 0;
 473}
 474static inline s64 htb_hiwater(const struct htb_class *cl)
 475{
 476        if (htb_hysteresis)
 477                return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
 478        else
 479                return 0;
 480}
 481
 482
 483/**
 484 * htb_class_mode - computes and returns current class mode
 485 *
 486 * It computes cl's mode at time cl->t_c+diff and returns it. If the mode
 487 * is not HTB_CAN_SEND then cl->pq_key is updated to the time difference
 488 * from now to the time when cl will change its state.
 489 * It is also worth noting that the class mode doesn't change simply
 490 * at cl->{c,}tokens == 0; rather, there can be hysteresis over the
 491 * 0 .. -cl->{c,}buffer range. It is meant to limit the number of
 492 * mode transitions per time unit. The speed gain is about 1/6.
 493 */
 494static inline enum htb_cmode
 495htb_class_mode(struct htb_class *cl, s64 *diff)
 496{
 497        s64 toks;
 498
 499        if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
 500                *diff = -toks;
 501                return HTB_CANT_SEND;
 502        }
 503
 504        if ((toks = (cl->tokens + *diff)) >= htb_hiwater(cl))
 505                return HTB_CAN_SEND;
 506
 507        *diff = -toks;
 508        return HTB_MAY_BORROW;
 509}
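/* Worked example (illustrative numbers): with htb_hysteresis enabled and a
 * cbuffer worth 10000 ns of tokens, a class that is not yet in HTB_CANT_SEND
 * only enters that mode once ctokens + diff drops below -10000 rather than
 * below 0, so a token level oscillating around zero does not flip the mode
 * on every packet.
 */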
 510
 511/**
 512 * htb_change_class_mode - changes a class's mode
 513 *
 514 * This should be the only way to change a class's mode under normal
 515 * circumstances. The routine will update the feed list linkage, change the
 516 * mode and add the class to the wait event queue if appropriate. The new mode
 517 * should be different from the old one and cl->pq_key has to be valid if
 518 * changing to a mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
 519 */
 520static void
 521htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff)
 522{
 523        enum htb_cmode new_mode = htb_class_mode(cl, diff);
 524
 525        if (new_mode == cl->cmode)
 526                return;
 527
 528        if (cl->prio_activity) {        /* not necessary: speed optimization */
 529                if (cl->cmode != HTB_CANT_SEND)
 530                        htb_deactivate_prios(q, cl);
 531                cl->cmode = new_mode;
 532                if (new_mode != HTB_CANT_SEND)
 533                        htb_activate_prios(q, cl);
 534        } else
 535                cl->cmode = new_mode;
 536}
 537
 538/**
 539 * htb_activate - inserts leaf cl into appropriate active feeds
 540 *
 541 * The routine learns the (new) priority of the leaf and activates the feed
 542 * chain for that prio. It can safely be called on an already active leaf.
 543 * It also adds the leaf to the drop list.
 544 */
 545static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
 546{
 547        WARN_ON(cl->level || !cl->un.leaf.q || !cl->un.leaf.q->q.qlen);
 548
 549        if (!cl->prio_activity) {
 550                cl->prio_activity = 1 << cl->prio;
 551                htb_activate_prios(q, cl);
 552                list_add_tail(&cl->un.leaf.drop_list,
 553                              q->drops + cl->prio);
 554        }
 555}
 556
 557/**
 558 * htb_deactivate - remove leaf cl from active feeds
 559 *
 560 * Make sure that the leaf is active. In other words, it can't be called
 561 * with a non-active leaf. It also removes the class from the drop list.
 562 */
 563static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
 564{
 565        WARN_ON(!cl->prio_activity);
 566
 567        htb_deactivate_prios(q, cl);
 568        cl->prio_activity = 0;
 569        list_del_init(&cl->un.leaf.drop_list);
 570}
 571
 572static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 573{
 574        int uninitialized_var(ret);
 575        struct htb_sched *q = qdisc_priv(sch);
 576        struct htb_class *cl = htb_classify(skb, sch, &ret);
 577
 578        if (cl == HTB_DIRECT) {
 579                /* enqueue to helper queue */
 580                if (q->direct_queue.qlen < q->direct_qlen) {
 581                        __skb_queue_tail(&q->direct_queue, skb);
 582                        q->direct_pkts++;
 583                } else {
 584                        return qdisc_drop(skb, sch);
 585                }
 586#ifdef CONFIG_NET_CLS_ACT
 587        } else if (!cl) {
 588                if (ret & __NET_XMIT_BYPASS)
 589                        qdisc_qstats_drop(sch);
 590                kfree_skb(skb);
 591                return ret;
 592#endif
 593        } else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q)) != NET_XMIT_SUCCESS) {
 594                if (net_xmit_drop_count(ret)) {
 595                        qdisc_qstats_drop(sch);
 596                        cl->qstats.drops++;
 597                }
 598                return ret;
 599        } else {
 600                htb_activate(q, cl);
 601        }
 602
 603        qdisc_qstats_backlog_inc(sch, skb);
 604        sch->q.qlen++;
 605        return NET_XMIT_SUCCESS;
 606}
 607
 608static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, s64 diff)
 609{
 610        s64 toks = diff + cl->tokens;
 611
 612        if (toks > cl->buffer)
 613                toks = cl->buffer;
 614        toks -= (s64) psched_l2t_ns(&cl->rate, bytes);
 615        if (toks <= -cl->mbuffer)
 616                toks = 1 - cl->mbuffer;
 617
 618        cl->tokens = toks;
 619}
 620
 621static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff)
 622{
 623        s64 toks = diff + cl->ctokens;
 624
 625        if (toks > cl->cbuffer)
 626                toks = cl->cbuffer;
 627        toks -= (s64) psched_l2t_ns(&cl->ceil, bytes);
 628        if (toks <= -cl->mbuffer)
 629                toks = 1 - cl->mbuffer;
 630
 631        cl->ctokens = toks;
 632}
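/* Illustration (assumed rate): both helpers first top the bucket up with the
 * elapsed time "diff", clamp it to the configured burst, then subtract the
 * transmission time of the packet; at 1 Mbyte/s, psched_l2t_ns() prices a
 * 1500 byte packet at 1,500,000 ns worth of tokens.
 */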
 633
 634/**
 635 * htb_charge_class - charges amount "bytes" to leaf and ancestors
 636 *
 637 * The routine assumes that a packet "bytes" long was dequeued from leaf cl,
 638 * borrowing from "level". It accounts bytes to the ceil leaky bucket for
 639 * the leaf and all ancestors and to the rate bucket for ancestors at levels
 640 * "level" and higher. It also handles a possible change of mode resulting
 641 * from the update. Note that the mode can also increase here (MAY_BORROW to
 642 * CAN_SEND) because we can use a more precise clock than the event queue
 643 * here. In such a case we remove the class from the event queue first.
 644 */
 645static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
 646                             int level, struct sk_buff *skb)
 647{
 648        int bytes = qdisc_pkt_len(skb);
 649        enum htb_cmode old_mode;
 650        s64 diff;
 651
 652        while (cl) {
 653                diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
 654                if (cl->level >= level) {
 655                        if (cl->level == level)
 656                                cl->xstats.lends++;
 657                        htb_accnt_tokens(cl, bytes, diff);
 658                } else {
 659                        cl->xstats.borrows++;
 660                        cl->tokens += diff;     /* we moved t_c; update tokens */
 661                }
 662                htb_accnt_ctokens(cl, bytes, diff);
 663                cl->t_c = q->now;
 664
 665                old_mode = cl->cmode;
 666                diff = 0;
 667                htb_change_class_mode(q, cl, &diff);
 668                if (old_mode != cl->cmode) {
 669                        if (old_mode != HTB_CAN_SEND)
 670                                htb_safe_rb_erase(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
 671                        if (cl->cmode != HTB_CAN_SEND)
 672                                htb_add_to_wait_tree(q, cl, diff);
 673                }
 674
 675                /* update basic stats except for leaves which are already updated */
 676                if (cl->level)
 677                        bstats_update(&cl->bstats, skb);
 678
 679                cl = cl->parent;
 680        }
 681}
 682
 683/**
 684 * htb_do_events - make mode changes to classes at the level
 685 *
 686 * Scans the event queue for pending events and applies them. Returns the time
 687 * of the next pending event (0 for no event in pq, q->now for too many events).
 688 * Note: only events with cl->pq_key <= q->now are applied.
 689 */
 690static s64 htb_do_events(struct htb_sched *q, const int level,
 691                         unsigned long start)
 692{
 693        /* don't run for longer than 2 jiffies; 2 is used instead of
 694         * 1 to simplify things when the jiffies counter is about to
 695         * be incremented too soon
 696         */
 697        unsigned long stop_at = start + 2;
 698        struct rb_root *wait_pq = &q->hlevel[level].wait_pq;
 699
 700        while (time_before(jiffies, stop_at)) {
 701                struct htb_class *cl;
 702                s64 diff;
 703                struct rb_node *p = rb_first(wait_pq);
 704
 705                if (!p)
 706                        return 0;
 707
 708                cl = rb_entry(p, struct htb_class, pq_node);
 709                if (cl->pq_key > q->now)
 710                        return cl->pq_key;
 711
 712                htb_safe_rb_erase(p, wait_pq);
 713                diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
 714                htb_change_class_mode(q, cl, &diff);
 715                if (cl->cmode != HTB_CAN_SEND)
 716                        htb_add_to_wait_tree(q, cl, diff);
 717        }
 718
 719        /* too much load - let's continue after a break for scheduling */
 720        if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) {
 721                pr_warn("htb: too many events!\n");
 722                q->warned |= HTB_WARN_TOOMANYEVENTS;
 723        }
 724
 725        return q->now;
 726}
 727
 728/* Returns class->node+prio from the id-tree where the class's id is >= id,
 729 * or NULL if no such node exists.
 730 */
 731static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
 732                                              u32 id)
 733{
 734        struct rb_node *r = NULL;
 735        while (n) {
 736                struct htb_class *cl =
 737                    rb_entry(n, struct htb_class, node[prio]);
 738
 739                if (id > cl->common.classid) {
 740                        n = n->rb_right;
 741                } else if (id < cl->common.classid) {
 742                        r = n;
 743                        n = n->rb_left;
 744                } else {
 745                        return n;
 746                }
 747        }
 748        return r;
 749}
 750
 751/**
 752 * htb_lookup_leaf - returns next leaf class in DRR order
 753 *
 754 * Find the leaf that the current feed pointers point to.
 755 */
 756static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio)
 757{
 758        int i;
 759        struct {
 760                struct rb_node *root;
 761                struct rb_node **pptr;
 762                u32 *pid;
 763        } stk[TC_HTB_MAXDEPTH], *sp = stk;
 764
 765        BUG_ON(!hprio->row.rb_node);
 766        sp->root = hprio->row.rb_node;
 767        sp->pptr = &hprio->ptr;
 768        sp->pid = &hprio->last_ptr_id;
 769
 770        for (i = 0; i < 65535; i++) {
 771                if (!*sp->pptr && *sp->pid) {
 772                        /* ptr was invalidated but id is valid - try to recover
 773                         * the original or next ptr
 774                         */
 775                        *sp->pptr =
 776                            htb_id_find_next_upper(prio, sp->root, *sp->pid);
 777                }
 778                *sp->pid = 0;   /* ptr is valid now, so remove this hint
 779                                 * as it can become out of date quickly
 780                                 */
 781                if (!*sp->pptr) {       /* we are at right end; rewind & go up */
 782                        *sp->pptr = sp->root;
 783                        while ((*sp->pptr)->rb_left)
 784                                *sp->pptr = (*sp->pptr)->rb_left;
 785                        if (sp > stk) {
 786                                sp--;
 787                                if (!*sp->pptr) {
 788                                        WARN_ON(1);
 789                                        return NULL;
 790                                }
 791                                htb_next_rb_node(sp->pptr);
 792                        }
 793                } else {
 794                        struct htb_class *cl;
 795                        struct htb_prio *clp;
 796
 797                        cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
 798                        if (!cl->level)
 799                                return cl;
 800                        clp = &cl->un.inner.clprio[prio];
 801                        (++sp)->root = clp->feed.rb_node;
 802                        sp->pptr = &clp->ptr;
 803                        sp->pid = &clp->last_ptr_id;
 804                }
 805        }
 806        WARN_ON(1);
 807        return NULL;
 808}
 809
 810/* dequeues a packet at the given priority and level; call only if
 811 * you are sure that there is an active class at prio/level
 812 */
 813static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, const int prio,
 814                                        const int level)
 815{
 816        struct sk_buff *skb = NULL;
 817        struct htb_class *cl, *start;
 818        struct htb_level *hlevel = &q->hlevel[level];
 819        struct htb_prio *hprio = &hlevel->hprio[prio];
 820
 821        /* look initial class up in the row */
 822        start = cl = htb_lookup_leaf(hprio, prio);
 823
 824        do {
 825next:
 826                if (unlikely(!cl))
 827                        return NULL;
 828
 829                /* the class can be empty - it is unlikely but can be true
 830                 * if the leaf qdisc drops packets in its enqueue routine or
 831                 * if someone used a graft operation on the leaf since the
 832                 * last dequeue; simply deactivate and skip such a class
 833                 */
 834                if (unlikely(cl->un.leaf.q->q.qlen == 0)) {
 835                        struct htb_class *next;
 836                        htb_deactivate(q, cl);
 837
 838                        /* row/level might become empty */
 839                        if ((q->row_mask[level] & (1 << prio)) == 0)
 840                                return NULL;
 841
 842                        next = htb_lookup_leaf(hprio, prio);
 843
 844                        if (cl == start)        /* fix start if we just deleted it */
 845                                start = next;
 846                        cl = next;
 847                        goto next;
 848                }
 849
 850                skb = cl->un.leaf.q->dequeue(cl->un.leaf.q);
 851                if (likely(skb != NULL))
 852                        break;
 853
 854                qdisc_warn_nonwc("htb", cl->un.leaf.q);
 855                htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr:
 856                                         &q->hlevel[0].hprio[prio].ptr);
 857                cl = htb_lookup_leaf(hprio, prio);
 858
 859        } while (cl != start);
 860
 861        if (likely(skb != NULL)) {
 862                bstats_update(&cl->bstats, skb);
 863                cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb);
 864                if (cl->un.leaf.deficit[level] < 0) {
 865                        cl->un.leaf.deficit[level] += cl->quantum;
 866                        htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr :
 867                                                 &q->hlevel[0].hprio[prio].ptr);
 868                }
 869                /* this used to be after charge_class but this ordering
 870                 * gives us slightly better performance
 871                 */
 872                if (!cl->un.leaf.q->q.qlen)
 873                        htb_deactivate(q, cl);
 874                htb_charge_class(q, cl, level, skb);
 875        }
 876        return skb;
 877}
 878
 879static struct sk_buff *htb_dequeue(struct Qdisc *sch)
 880{
 881        struct sk_buff *skb;
 882        struct htb_sched *q = qdisc_priv(sch);
 883        int level;
 884        s64 next_event;
 885        unsigned long start_at;
 886
 887        /* try to dequeue direct packets as high prio (!) to minimize cpu work */
 888        skb = __skb_dequeue(&q->direct_queue);
 889        if (skb != NULL) {
 890ok:
 891                qdisc_bstats_update(sch, skb);
 892                qdisc_unthrottled(sch);
 893                qdisc_qstats_backlog_dec(sch, skb);
 894                sch->q.qlen--;
 895                return skb;
 896        }
 897
 898        if (!sch->q.qlen)
 899                goto fin;
 900        q->now = ktime_get_ns();
 901        start_at = jiffies;
 902
 903        next_event = q->now + 5LLU * NSEC_PER_SEC;
 904
 905        for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
 906                /* common case optimization - skip event handler quickly */
 907                int m;
 908                s64 event = q->near_ev_cache[level];
 909
 910                if (q->now >= event) {
 911                        event = htb_do_events(q, level, start_at);
 912                        if (!event)
 913                                event = q->now + NSEC_PER_SEC;
 914                        q->near_ev_cache[level] = event;
 915                }
 916
 917                if (next_event > event)
 918                        next_event = event;
 919
 920                m = ~q->row_mask[level];
 921                while (m != (int)(-1)) {
 922                        int prio = ffz(m);
 923
 924                        m |= 1 << prio;
 925                        skb = htb_dequeue_tree(q, prio, level);
 926                        if (likely(skb != NULL))
 927                                goto ok;
 928                }
 929        }
 930        qdisc_qstats_overlimit(sch);
 931        if (likely(next_event > q->now)) {
 932                if (!test_bit(__QDISC_STATE_DEACTIVATED,
 933                              &qdisc_root_sleeping(q->watchdog.qdisc)->state)) {
 934                        ktime_t time = ns_to_ktime(next_event);
 935                        qdisc_throttled(q->watchdog.qdisc);
 936                        hrtimer_start(&q->watchdog.timer, time,
 937                                      HRTIMER_MODE_ABS_PINNED);
 938                }
 939        } else {
 940                schedule_work(&q->work);
 941        }
 942fin:
 943        return skb;
 944}
 945
 946/* try to drop from each class (by prio) until one succeeds */
 947static unsigned int htb_drop(struct Qdisc *sch)
 948{
 949        struct htb_sched *q = qdisc_priv(sch);
 950        int prio;
 951
 952        for (prio = TC_HTB_NUMPRIO - 1; prio >= 0; prio--) {
 953                struct list_head *p;
 954                list_for_each(p, q->drops + prio) {
 955                        struct htb_class *cl = list_entry(p, struct htb_class,
 956                                                          un.leaf.drop_list);
 957                        unsigned int len;
 958                        if (cl->un.leaf.q->ops->drop &&
 959                            (len = cl->un.leaf.q->ops->drop(cl->un.leaf.q))) {
 960                                sch->qstats.backlog -= len;
 961                                sch->q.qlen--;
 962                                if (!cl->un.leaf.q->q.qlen)
 963                                        htb_deactivate(q, cl);
 964                                return len;
 965                        }
 966                }
 967        }
 968        return 0;
 969}
 970
 971/* reset all classes */
 972/* always called under BH & queue lock */
 973static void htb_reset(struct Qdisc *sch)
 974{
 975        struct htb_sched *q = qdisc_priv(sch);
 976        struct htb_class *cl;
 977        unsigned int i;
 978
 979        for (i = 0; i < q->clhash.hashsize; i++) {
 980                hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
 981                        if (cl->level)
 982                                memset(&cl->un.inner, 0, sizeof(cl->un.inner));
 983                        else {
 984                                if (cl->un.leaf.q)
 985                                        qdisc_reset(cl->un.leaf.q);
 986                                INIT_LIST_HEAD(&cl->un.leaf.drop_list);
 987                        }
 988                        cl->prio_activity = 0;
 989                        cl->cmode = HTB_CAN_SEND;
 990                }
 991        }
 992        qdisc_watchdog_cancel(&q->watchdog);
 993        __skb_queue_purge(&q->direct_queue);
 994        sch->q.qlen = 0;
 995        sch->qstats.backlog = 0;
 996        memset(q->hlevel, 0, sizeof(q->hlevel));
 997        memset(q->row_mask, 0, sizeof(q->row_mask));
 998        for (i = 0; i < TC_HTB_NUMPRIO; i++)
 999                INIT_LIST_HEAD(q->drops + i);
1000}
1001
1002static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
1003        [TCA_HTB_PARMS] = { .len = sizeof(struct tc_htb_opt) },
1004        [TCA_HTB_INIT]  = { .len = sizeof(struct tc_htb_glob) },
1005        [TCA_HTB_CTAB]  = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1006        [TCA_HTB_RTAB]  = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1007        [TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 },
1008        [TCA_HTB_RATE64] = { .type = NLA_U64 },
1009        [TCA_HTB_CEIL64] = { .type = NLA_U64 },
1010};
1011
1012static void htb_work_func(struct work_struct *work)
1013{
1014        struct htb_sched *q = container_of(work, struct htb_sched, work);
1015        struct Qdisc *sch = q->watchdog.qdisc;
1016
1017        __netif_schedule(qdisc_root(sch));
1018}
1019
1020static int htb_init(struct Qdisc *sch, struct nlattr *opt)
1021{
1022        struct htb_sched *q = qdisc_priv(sch);
1023        struct nlattr *tb[TCA_HTB_MAX + 1];
1024        struct tc_htb_glob *gopt;
1025        int err;
1026        int i;
1027
1028        if (!opt)
1029                return -EINVAL;
1030
1031        err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy);
1032        if (err < 0)
1033                return err;
1034
1035        if (!tb[TCA_HTB_INIT])
1036                return -EINVAL;
1037
1038        gopt = nla_data(tb[TCA_HTB_INIT]);
1039        if (gopt->version != HTB_VER >> 16)
1040                return -EINVAL;
1041
1042        err = qdisc_class_hash_init(&q->clhash);
1043        if (err < 0)
1044                return err;
1045        for (i = 0; i < TC_HTB_NUMPRIO; i++)
1046                INIT_LIST_HEAD(q->drops + i);
1047
1048        qdisc_watchdog_init(&q->watchdog, sch);
1049        INIT_WORK(&q->work, htb_work_func);
1050        __skb_queue_head_init(&q->direct_queue);
1051
1052        if (tb[TCA_HTB_DIRECT_QLEN])
1053                q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]);
1054        else
1055                q->direct_qlen = qdisc_dev(sch)->tx_queue_len;
1056
1057        if ((q->rate2quantum = gopt->rate2quantum) < 1)
1058                q->rate2quantum = 1;
1059        q->defcls = gopt->defcls;
1060
1061        return 0;
1062}
1063
1064static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
1065{
1066        struct htb_sched *q = qdisc_priv(sch);
1067        struct nlattr *nest;
1068        struct tc_htb_glob gopt;
1069
1070        /* It's safe not to acquire the qdisc lock: as we hold RTNL,
1071         * no change can happen to the qdisc parameters.
1072         */
1073
1074        gopt.direct_pkts = q->direct_pkts;
1075        gopt.version = HTB_VER;
1076        gopt.rate2quantum = q->rate2quantum;
1077        gopt.defcls = q->defcls;
1078        gopt.debug = 0;
1079
1080        nest = nla_nest_start(skb, TCA_OPTIONS);
1081        if (nest == NULL)
1082                goto nla_put_failure;
1083        if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt) ||
1084            nla_put_u32(skb, TCA_HTB_DIRECT_QLEN, q->direct_qlen))
1085                goto nla_put_failure;
1086
1087        return nla_nest_end(skb, nest);
1088
1089nla_put_failure:
1090        nla_nest_cancel(skb, nest);
1091        return -1;
1092}
1093
1094static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
1095                          struct sk_buff *skb, struct tcmsg *tcm)
1096{
1097        struct htb_class *cl = (struct htb_class *)arg;
1098        struct nlattr *nest;
1099        struct tc_htb_opt opt;
1100
1101        /* It's safe not to acquire the qdisc lock: as we hold RTNL,
1102         * no change can happen to the class parameters.
1103         */
1104        tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;
1105        tcm->tcm_handle = cl->common.classid;
1106        if (!cl->level && cl->un.leaf.q)
1107                tcm->tcm_info = cl->un.leaf.q->handle;
1108
1109        nest = nla_nest_start(skb, TCA_OPTIONS);
1110        if (nest == NULL)
1111                goto nla_put_failure;
1112
1113        memset(&opt, 0, sizeof(opt));
1114
1115        psched_ratecfg_getrate(&opt.rate, &cl->rate);
1116        opt.buffer = PSCHED_NS2TICKS(cl->buffer);
1117        psched_ratecfg_getrate(&opt.ceil, &cl->ceil);
1118        opt.cbuffer = PSCHED_NS2TICKS(cl->cbuffer);
1119        opt.quantum = cl->quantum;
1120        opt.prio = cl->prio;
1121        opt.level = cl->level;
1122        if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt))
1123                goto nla_put_failure;
1124        if ((cl->rate.rate_bytes_ps >= (1ULL << 32)) &&
1125            nla_put_u64(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps))
1126                goto nla_put_failure;
1127        if ((cl->ceil.rate_bytes_ps >= (1ULL << 32)) &&
1128            nla_put_u64(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps))
1129                goto nla_put_failure;
1130
1131        return nla_nest_end(skb, nest);
1132
1133nla_put_failure:
1134        nla_nest_cancel(skb, nest);
1135        return -1;
1136}
1137
1138static int
1139htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
1140{
1141        struct htb_class *cl = (struct htb_class *)arg;
1142        __u32 qlen = 0;
1143
1144        if (!cl->level && cl->un.leaf.q)
1145                qlen = cl->un.leaf.q->q.qlen;
1146        cl->xstats.tokens = PSCHED_NS2TICKS(cl->tokens);
1147        cl->xstats.ctokens = PSCHED_NS2TICKS(cl->ctokens);
1148
1149        if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 ||
1150            gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 ||
1151            gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0)
1152                return -1;
1153
1154        return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1155}
1156
1157static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1158                     struct Qdisc **old)
1159{
1160        struct htb_class *cl = (struct htb_class *)arg;
1161
1162        if (cl->level)
1163                return -EINVAL;
1164        if (new == NULL &&
1165            (new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1166                                     cl->common.classid)) == NULL)
1167                return -ENOBUFS;
1168
1169        *old = qdisc_replace(sch, new, &cl->un.leaf.q);
1170        return 0;
1171}
1172
1173static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
1174{
1175        struct htb_class *cl = (struct htb_class *)arg;
1176        return !cl->level ? cl->un.leaf.q : NULL;
1177}
1178
1179static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
1180{
1181        struct htb_class *cl = (struct htb_class *)arg;
1182
1183        if (cl->un.leaf.q->q.qlen == 0)
1184                htb_deactivate(qdisc_priv(sch), cl);
1185}
1186
1187static unsigned long htb_get(struct Qdisc *sch, u32 classid)
1188{
1189        struct htb_class *cl = htb_find(classid, sch);
1190        if (cl)
1191                cl->refcnt++;
1192        return (unsigned long)cl;
1193}
1194
1195static inline int htb_parent_last_child(struct htb_class *cl)
1196{
1197        if (!cl->parent)
1198                /* the root class */
1199                return 0;
1200        if (cl->parent->children > 1)
1201                /* not the last child */
1202                return 0;
1203        return 1;
1204}
1205
1206static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
1207                               struct Qdisc *new_q)
1208{
1209        struct htb_class *parent = cl->parent;
1210
1211        WARN_ON(cl->level || !cl->un.leaf.q || cl->prio_activity);
1212
1213        if (parent->cmode != HTB_CAN_SEND)
1214                htb_safe_rb_erase(&parent->pq_node,
1215                                  &q->hlevel[parent->level].wait_pq);
1216
1217        parent->level = 0;
1218        memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1219        INIT_LIST_HEAD(&parent->un.leaf.drop_list);
1220        parent->un.leaf.q = new_q ? new_q : &noop_qdisc;
1221        parent->tokens = parent->buffer;
1222        parent->ctokens = parent->cbuffer;
1223        parent->t_c = ktime_get_ns();
1224        parent->cmode = HTB_CAN_SEND;
1225}
1226
1227static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
1228{
1229        if (!cl->level) {
1230                WARN_ON(!cl->un.leaf.q);
1231                qdisc_destroy(cl->un.leaf.q);
1232        }
1233        gen_kill_estimator(&cl->bstats, &cl->rate_est);
1234        tcf_destroy_chain(&cl->filter_list);
1235        kfree(cl);
1236}
1237
1238static void htb_destroy(struct Qdisc *sch)
1239{
1240        struct htb_sched *q = qdisc_priv(sch);
1241        struct hlist_node *next;
1242        struct htb_class *cl;
1243        unsigned int i;
1244
1245        cancel_work_sync(&q->work);
1246        qdisc_watchdog_cancel(&q->watchdog);
1247        /* This line used to be after the htb_destroy_class call below,
1248         * and surprisingly it worked in 2.4. But it must precede it
1249         * because a filter needs its target class alive to be able to call
1250         * unbind_filter on it (without an Oops).
1251         */
1252        tcf_destroy_chain(&q->filter_list);
1253
1254        for (i = 0; i < q->clhash.hashsize; i++) {
1255                hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode)
1256                        tcf_destroy_chain(&cl->filter_list);
1257        }
1258        for (i = 0; i < q->clhash.hashsize; i++) {
1259                hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
1260                                          common.hnode)
1261                        htb_destroy_class(sch, cl);
1262        }
1263        qdisc_class_hash_destroy(&q->clhash);
1264        __skb_queue_purge(&q->direct_queue);
1265}
1266
1267static int htb_delete(struct Qdisc *sch, unsigned long arg)
1268{
1269        struct htb_sched *q = qdisc_priv(sch);
1270        struct htb_class *cl = (struct htb_class *)arg;
1271        struct Qdisc *new_q = NULL;
1272        int last_child = 0;
1273
1274        /* TODO: why don't we allow deleting a subtree? references? does
1275         * the tc subsystem guarantee us that in htb_destroy it holds no class
1276         * refs, so that we can remove children safely there?
1277         */
1278        if (cl->children || cl->filter_cnt)
1279                return -EBUSY;
1280
1281        if (!cl->level && htb_parent_last_child(cl)) {
1282                new_q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1283                                          cl->parent->common.classid);
1284                last_child = 1;
1285        }
1286
1287        sch_tree_lock(sch);
1288
1289        if (!cl->level) {
1290                unsigned int qlen = cl->un.leaf.q->q.qlen;
1291                unsigned int backlog = cl->un.leaf.q->qstats.backlog;
1292
1293                qdisc_reset(cl->un.leaf.q);
1294                qdisc_tree_reduce_backlog(cl->un.leaf.q, qlen, backlog);
1295        }
1296
1297        /* delete from hash and active; remainder in destroy_class */
1298        qdisc_class_hash_remove(&q->clhash, &cl->common);
1299        if (cl->parent)
1300                cl->parent->children--;
1301
1302        if (cl->prio_activity)
1303                htb_deactivate(q, cl);
1304
1305        if (cl->cmode != HTB_CAN_SEND)
1306                htb_safe_rb_erase(&cl->pq_node,
1307                                  &q->hlevel[cl->level].wait_pq);
1308
1309        if (last_child)
1310                htb_parent_to_leaf(q, cl, new_q);
1311
1312        BUG_ON(--cl->refcnt == 0);
1313        /*
1314         * This shouldn't happen: we "hold" one cops->get() when called
1315         * from tc_ctl_tclass; the destroy method is done from cops->put().
1316         */
1317
1318        sch_tree_unlock(sch);
1319        return 0;
1320}
1321
1322static void htb_put(struct Qdisc *sch, unsigned long arg)
1323{
1324        struct htb_class *cl = (struct htb_class *)arg;
1325
1326        if (--cl->refcnt == 0)
1327                htb_destroy_class(sch, cl);
1328}
1329
1330static int htb_change_class(struct Qdisc *sch, u32 classid,
1331                            u32 parentid, struct nlattr **tca,
1332                            unsigned long *arg)
1333{
1334        int err = -EINVAL;
1335        struct htb_sched *q = qdisc_priv(sch);
1336        struct htb_class *cl = (struct htb_class *)*arg, *parent;
1337        struct nlattr *opt = tca[TCA_OPTIONS];
1338        struct nlattr *tb[TCA_HTB_MAX + 1];
1339        struct tc_htb_opt *hopt;
1340        u64 rate64, ceil64;
1341
1342        /* extract all subattrs from opt attr */
1343        if (!opt)
1344                goto failure;
1345
1346        err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy);
1347        if (err < 0)
1348                goto failure;
1349
1350        err = -EINVAL;
1351        if (tb[TCA_HTB_PARMS] == NULL)
1352                goto failure;
1353
1354        parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);
1355
1356        hopt = nla_data(tb[TCA_HTB_PARMS]);
1357        if (!hopt->rate.rate || !hopt->ceil.rate)
1358                goto failure;
1359
1360        /* Keep backward compatibility with rate_table-based iproute2 tc */
1361        if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
1362                qdisc_put_rtab(qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]));
1363
1364        if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE)
1365                qdisc_put_rtab(qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]));
1366
1367        if (!cl) {              /* new class */
1368                struct Qdisc *new_q;
1369                int prio;
1370                struct {
1371                        struct nlattr           nla;
1372                        struct gnet_estimator   opt;
1373                } est = {
1374                        .nla = {
1375                                .nla_len        = nla_attr_size(sizeof(est.opt)),
1376                                .nla_type       = TCA_RATE,
1377                        },
1378                        .opt = {
1379                                /* 4s interval, 16s averaging constant */
1380                                .interval       = 2,
1381                                .ewma_log       = 2,
1382                        },
1383                };
1384
1385                /* check for valid classid */
1386                if (!classid || TC_H_MAJ(classid ^ sch->handle) ||
1387                    htb_find(classid, sch))
1388                        goto failure;
1389
1390                /* check maximal depth */
1391                if (parent && parent->parent && parent->parent->level < 2) {
1392                        pr_err("htb: tree is too deep\n");
1393                        goto failure;
1394                }
1395                err = -ENOBUFS;
1396                cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1397                if (!cl)
1398                        goto failure;
1399
1400                if (htb_rate_est || tca[TCA_RATE]) {
1401                        err = gen_new_estimator(&cl->bstats, NULL,
1402                                                &cl->rate_est,
1403                                                qdisc_root_sleeping_lock(sch),
1404                                                tca[TCA_RATE] ? : &est.nla);
1405                        if (err) {
1406                                kfree(cl);
1407                                goto failure;
1408                        }
1409                }
1410
1411                cl->refcnt = 1;
1412                cl->children = 0;
1413                INIT_LIST_HEAD(&cl->un.leaf.drop_list);
1414                RB_CLEAR_NODE(&cl->pq_node);
1415
1416                for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
1417                        RB_CLEAR_NODE(&cl->node[prio]);
1418
1419                /* create the leaf qdisc early because it uses kmalloc(GFP_KERNEL),
1420                 * so it can't be created inside of sch_tree_lock
1421                 * -- thanks to Karlis Peisenieks
1422                 */
1423                new_q = qdisc_create_dflt(sch->dev_queue,
1424                                          &pfifo_qdisc_ops, classid);
1425                sch_tree_lock(sch);
1426                if (parent && !parent->level) {
1427                        unsigned int qlen = parent->un.leaf.q->q.qlen;
1428                        unsigned int backlog = parent->un.leaf.q->qstats.backlog;
1429
1430                        /* turn parent into inner node */
1431                        qdisc_reset(parent->un.leaf.q);
1432                        qdisc_tree_reduce_backlog(parent->un.leaf.q, qlen, backlog);
1433                        qdisc_destroy(parent->un.leaf.q);
1434                        if (parent->prio_activity)
1435                                htb_deactivate(q, parent);
1436
1437                        /* remove from the event queue because of the level change */
1438                        if (parent->cmode != HTB_CAN_SEND) {
1439                                htb_safe_rb_erase(&parent->pq_node, &q->hlevel[0].wait_pq);
1440                                parent->cmode = HTB_CAN_SEND;
1441                        }
1442                        parent->level = (parent->parent ? parent->parent->level
1443                                         : TC_HTB_MAXDEPTH) - 1;
1444                        memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1445                }
1446                /* the new leaf needs an elementary qdisc */
1447                cl->un.leaf.q = new_q ? new_q : &noop_qdisc;
1448
1449                cl->common.classid = classid;
1450                cl->parent = parent;
1451
1452                /* start the class in HTB_CAN_SEND state with full token buckets */
1453                cl->tokens = PSCHED_TICKS2NS(hopt->buffer);
1454                cl->ctokens = PSCHED_TICKS2NS(hopt->cbuffer);
1455                cl->mbuffer = 60ULL * NSEC_PER_SEC;     /* 1min */
1456                cl->t_c = ktime_get_ns();
1457                cl->cmode = HTB_CAN_SEND;
1458
1459                /* attach to the hash list and parent's family */
1460                qdisc_class_hash_insert(&q->clhash, &cl->common);
1461                if (parent)
1462                        parent->children++;
1463        } else {
1464                if (tca[TCA_RATE]) {
1465                        spinlock_t *lock = qdisc_root_sleeping_lock(sch);
1466
1467                        err = gen_replace_estimator(&cl->bstats, NULL,
1468                                                    &cl->rate_est,
1469                                                    lock,
1470                                                    tca[TCA_RATE]);
1471                        if (err)
1472                                return err;
1473                }
1474                sch_tree_lock(sch);
1475        }
1476
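            /* The optional 64-bit attributes let userspace configure rates
             * that do not fit in the 32-bit tc_ratespec fields.
             */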
1477        rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0;
1478
1479        ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0;
1480
1481        psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64);
1482        psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64);
1483
1484        /* There used to be a nasty bug here: we must check that the node
1485         * really is a leaf before changing cl->un.leaf!
1486         */
1487        if (!cl->level) {
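                    /* The quantum (bytes a leaf may send per DRR round)
                     * defaults to rate / r2q; warn and clamp it to a sane
                     * range unless the user configured it explicitly.
                     */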
1488                u64 quantum = cl->rate.rate_bytes_ps;
1489
1490                do_div(quantum, q->rate2quantum);
1491                cl->quantum = min_t(u64, quantum, INT_MAX);
1492
1493                if (!hopt->quantum && cl->quantum < 1000) {
1494                        pr_warn("HTB: quantum of class %X is small. Consider r2q change.\n",
1495                                cl->common.classid);
1496                        cl->quantum = 1000;
1497                }
1498                if (!hopt->quantum && cl->quantum > 200000) {
1499                        pr_warn("HTB: quantum of class %X is big. Consider r2q change.\n",
1500                                cl->common.classid);
1501                        cl->quantum = 200000;
1502                }
1503                if (hopt->quantum)
1504                        cl->quantum = hopt->quantum;
1505                if ((cl->prio = hopt->prio) >= TC_HTB_NUMPRIO)
1506                        cl->prio = TC_HTB_NUMPRIO - 1;
1507        }
1508
1509        cl->buffer = PSCHED_TICKS2NS(hopt->buffer);
1510        cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer);
1511
1512        sch_tree_unlock(sch);
1513
1514        qdisc_class_hash_grow(sch, &q->clhash);
1515
1516        *arg = (unsigned long)cl;
1517        return 0;
1518
1519failure:
1520        return err;
1521}
1522
1523static struct tcf_proto __rcu **htb_find_tcf(struct Qdisc *sch,
1524                                             unsigned long arg)
1525{
1526        struct htb_sched *q = qdisc_priv(sch);
1527        struct htb_class *cl = (struct htb_class *)arg;
1528        struct tcf_proto __rcu **fl = cl ? &cl->filter_list : &q->filter_list;
1529
1530        return fl;
1531}
1532
1533static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
1534                                     u32 classid)
1535{
1536        struct htb_class *cl = htb_find(classid, sch);
1537
1538        /*if (cl && !cl->level) return 0;
1539         * The line above used to be there to prevent attaching filters to
1540         * leaves. But at least the tc_index filter uses this just to look up
1541         * the class for other reasons, so we have to allow it.
1542         * ----
1543         * 19.6.2002 As Werner explained, it is OK - binding a filter is just
1544         * another way to "lock" the class - unlike "get", this lock can
1545         * be broken by the class during destroy, IIUC.
1546         */
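            /* A non-zero filter_cnt pins the class: htb_delete() returns
             * -EBUSY while filters are still bound to it.
             */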
1547        if (cl)
1548                cl->filter_cnt++;
1549        return (unsigned long)cl;
1550}
1551
1552static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
1553{
1554        struct htb_class *cl = (struct htb_class *)arg;
1555
1556        if (cl)
1557                cl->filter_cnt--;
1558}
1559
1560static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1561{
1562        struct htb_sched *q = qdisc_priv(sch);
1563        struct htb_class *cl;
1564        unsigned int i;
1565
1566        if (arg->stop)
1567                return;
1568
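            /* Scan every hash bucket; the walker's skip/count cursor lets the
             * caller resume a partial dump, and a negative return from the
             * callback aborts the walk.
             */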
1569        for (i = 0; i < q->clhash.hashsize; i++) {
1570                hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
1571                        if (arg->count < arg->skip) {
1572                                arg->count++;
1573                                continue;
1574                        }
1575                        if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1576                                arg->stop = 1;
1577                                return;
1578                        }
1579                        arg->count++;
1580                }
1581        }
1582}
1583
1584static const struct Qdisc_class_ops htb_class_ops = {
1585        .graft          =       htb_graft,
1586        .leaf           =       htb_leaf,
1587        .qlen_notify    =       htb_qlen_notify,
1588        .get            =       htb_get,
1589        .put            =       htb_put,
1590        .change         =       htb_change_class,
1591        .delete         =       htb_delete,
1592        .walk           =       htb_walk,
1593        .tcf_chain      =       htb_find_tcf,
1594        .bind_tcf       =       htb_bind_filter,
1595        .unbind_tcf     =       htb_unbind_filter,
1596        .dump           =       htb_dump_class,
1597        .dump_stats     =       htb_dump_class_stats,
1598};
1599
1600static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
1601        .cl_ops         =       &htb_class_ops,
1602        .id             =       "htb",
1603        .priv_size      =       sizeof(struct htb_sched),
1604        .enqueue        =       htb_enqueue,
1605        .dequeue        =       htb_dequeue,
1606        .peek           =       qdisc_peek_dequeued,
1607        .drop           =       htb_drop,
1608        .init           =       htb_init,
1609        .reset          =       htb_reset,
1610        .destroy        =       htb_destroy,
1611        .dump           =       htb_dump,
1612        .owner          =       THIS_MODULE,
1613};
1614
1615static int __init htb_module_init(void)
1616{
1617        return register_qdisc(&htb_qdisc_ops);
1618}
1619static void __exit htb_module_exit(void)
1620{
1621        unregister_qdisc(&htb_qdisc_ops);
1622}
1623
1624module_init(htb_module_init)
1625module_exit(htb_module_exit)
1626MODULE_LICENSE("GPL");
1627