linux/net/sched/sch_htb.c
   1/*
   2 * net/sched/sch_htb.c  Hierarchical token bucket, feed tree version
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 * Authors:     Martin Devera, <devik@cdi.cz>
  10 *
  11 * Credits (in time order) for older HTB versions:
  12 *              Stef Coene <stef.coene@docum.org>
  13 *                      HTB support at LARTC mailing list
  14 *              Ondrej Kraus, <krauso@barr.cz>
  15 *                      found missing INIT_QDISC(htb)
  16 *              Vladimir Smelhaus, Aamer Akhter, Bert Hubert
  17 *                      helped a lot to locate nasty class stall bug
  18 *              Andi Kleen, Jamal Hadi, Bert Hubert
  19 *                      code review and helpful comments on shaping
  20 *              Tomasz Wrona, <tw@eter.tym.pl>
  21 *                      created test case so that I was able to fix nasty bug
  22 *              Wilfried Weissmann
  23 *                      spotted bug in dequeue code and helped with fix
  24 *              Jiri Fojtasek
  25 *                      fixed requeue routine
  26 *              and many others. thanks.
  27 */
  28#include <linux/module.h>
  29#include <linux/moduleparam.h>
  30#include <linux/types.h>
  31#include <linux/kernel.h>
  32#include <linux/string.h>
  33#include <linux/errno.h>
  34#include <linux/skbuff.h>
  35#include <linux/list.h>
  36#include <linux/compiler.h>
  37#include <linux/rbtree.h>
  38#include <linux/workqueue.h>
  39#include <linux/slab.h>
  40#include <net/netlink.h>
  41#include <net/sch_generic.h>
  42#include <net/pkt_sched.h>
  43#include <net/pkt_cls.h>
  44
  45/* HTB algorithm.
  46    Author: devik@cdi.cz
  47    ========================================================================
   48    HTB is like TBF with multiple classes. It is also similar to CBQ because
   49    it allows assigning a priority to each class in the hierarchy.
  50    In fact it is another implementation of Floyd's formal sharing.
  51
  52    Levels:
   53    Each class is assigned a level. Leaves ALWAYS have level 0 and root
   54    classes have level TC_HTB_MAXDEPTH-1. Interior nodes have a level
   55    one less than their parent.
  56*/
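
/* Illustrative example (an assumption for documentation, not part of this
 * file): a small two-level hierarchy built with iproute2's tc that this
 * qdisc would then schedule:
 *
 *   tc qdisc add dev eth0 root handle 1: htb default 20
 *   tc class add dev eth0 parent 1:  classid 1:1  htb rate 100mbit ceil 100mbit
 *   tc class add dev eth0 parent 1:1 classid 1:10 htb rate 60mbit  ceil 100mbit
 *   tc class add dev eth0 parent 1:1 classid 1:20 htb rate 40mbit  ceil 100mbit
 *
 * 1:10 and 1:20 are leaves (level 0) that may borrow from 1:1 up to the ceil;
 * unclassified traffic falls back to the default class 1:20.
 */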
  57
  58static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */
   59#define HTB_VER 0x30011         /* major must match the version number supplied by TC */
  60
  61#if HTB_VER >> 16 != TC_HTB_PROTOVER
  62#error "Mismatched sch_htb.c and pkt_sch.h"
  63#endif
  64
  65/* Module parameter and sysfs export */
  66module_param    (htb_hysteresis, int, 0640);
  67MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate");
  68
   69static int htb_rate_est = 0; /* whether htb classes get a default rate estimator */
  70module_param(htb_rate_est, int, 0640);
  71MODULE_PARM_DESC(htb_rate_est, "setup a default rate estimator (4sec 16sec) for htb classes");
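
/* Both knobs are also tunable at runtime through sysfs, e.g. (illustrative,
 * standard module-parameter path):
 *   echo 1 > /sys/module/sch_htb/parameters/htb_hysteresis
 */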
  72
   73/* used internally to keep the status of a single class */
  74enum htb_cmode {
  75        HTB_CANT_SEND,          /* class can't send and can't borrow */
  76        HTB_MAY_BORROW,         /* class can't send but may borrow */
  77        HTB_CAN_SEND            /* class can send */
  78};
  79
  80struct htb_prio {
  81        union {
  82                struct rb_root  row;
  83                struct rb_root  feed;
  84        };
  85        struct rb_node  *ptr;
   86        /* When a class changes from state 1->2 and disconnects from its
   87         * parent's feed, we lose the ptr value and start from the
   88         * first child again. Here we store the classid of the
  89         * last valid ptr (used when ptr is NULL).
  90         */
  91        u32             last_ptr_id;
  92};
  93
  94/* interior & leaf nodes; props specific to leaves are marked L:
  95 * To reduce false sharing, place mostly read fields at beginning,
  96 * and mostly written ones at the end.
  97 */
  98struct htb_class {
  99        struct Qdisc_class_common common;
 100        struct psched_ratecfg   rate;
 101        struct psched_ratecfg   ceil;
 102        s64                     buffer, cbuffer;/* token bucket depth/rate */
 103        s64                     mbuffer;        /* max wait time */
 104        u32                     prio;           /* these two are used only by leaves... */
 105        int                     quantum;        /* but stored for parent-to-leaf return */
 106
 107        struct tcf_proto __rcu  *filter_list;   /* class attached filters */
 108        struct tcf_block        *block;
 109        int                     filter_cnt;
 110
 111        int                     level;          /* our level (see above) */
 112        unsigned int            children;
 113        struct htb_class        *parent;        /* parent class */
 114
 115        struct net_rate_estimator __rcu *rate_est;
 116
 117        /*
 118         * Written often fields
 119         */
 120        struct gnet_stats_basic_packed bstats;
 121        struct tc_htb_xstats    xstats; /* our special stats */
 122
 123        /* token bucket parameters */
 124        s64                     tokens, ctokens;/* current number of tokens */
 125        s64                     t_c;            /* checkpoint time */
 126
 127        union {
 128                struct htb_class_leaf {
 129                        struct list_head drop_list;
 130                        int             deficit[TC_HTB_MAXDEPTH];
 131                        struct Qdisc    *q;
 132                } leaf;
 133                struct htb_class_inner {
 134                        struct htb_prio clprio[TC_HTB_NUMPRIO];
 135                } inner;
 136        } un;
 137        s64                     pq_key;
 138
 139        int                     prio_activity;  /* for which prios are we active */
 140        enum htb_cmode          cmode;          /* current mode of the class */
 141        struct rb_node          pq_node;        /* node for event queue */
 142        struct rb_node          node[TC_HTB_NUMPRIO];   /* node for self or feed tree */
 143
 144        unsigned int drops ____cacheline_aligned_in_smp;
 145};
 146
 147struct htb_level {
 148        struct rb_root  wait_pq;
 149        struct htb_prio hprio[TC_HTB_NUMPRIO];
 150};
 151
 152struct htb_sched {
 153        struct Qdisc_class_hash clhash;
 154        int                     defcls;         /* class where unclassified flows go to */
 155        int                     rate2quantum;   /* quant = rate / rate2quantum */
 156
 157        /* filters for qdisc itself */
 158        struct tcf_proto __rcu  *filter_list;
 159        struct tcf_block        *block;
 160
 161#define HTB_WARN_TOOMANYEVENTS  0x1
 162        unsigned int            warned; /* only one warning */
 163        int                     direct_qlen;
 164        struct work_struct      work;
 165
 166        /* non shaped skbs; let them go directly thru */
 167        struct qdisc_skb_head   direct_queue;
 168        long                    direct_pkts;
 169
 170        struct qdisc_watchdog   watchdog;
 171
 172        s64                     now;    /* cached dequeue time */
 173        struct list_head        drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */
 174
 175        /* time of nearest event per level (row) */
 176        s64                     near_ev_cache[TC_HTB_MAXDEPTH];
 177
 178        int                     row_mask[TC_HTB_MAXDEPTH];
 179
 180        struct htb_level        hlevel[TC_HTB_MAXDEPTH];
 181};
 182
 183/* find class in global hash table using given handle */
 184static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
 185{
 186        struct htb_sched *q = qdisc_priv(sch);
 187        struct Qdisc_class_common *clc;
 188
 189        clc = qdisc_class_find(&q->clhash, handle);
 190        if (clc == NULL)
 191                return NULL;
 192        return container_of(clc, struct htb_class, common);
 193}
 194
 195static unsigned long htb_search(struct Qdisc *sch, u32 handle)
 196{
 197        return (unsigned long)htb_find(handle, sch);
 198}
 199/**
  200 * htb_classify - classify a packet into a class
  201 *
  202 * It returns NULL if the packet should be dropped or -1 if the packet
  203 * should be passed directly thru. In all other cases a leaf class is returned.
  204 * We allow direct class selection by classid in skb->priority. Then we examine
  205 * filters in the qdisc and in inner nodes (if a higher filter points to an
  206 * inner node). If we end up with classid MAJOR:0 we enqueue the skb into the
  207 * special internal fifo (direct). These packets then go directly thru. If we
  208 * still have no valid leaf we try to use the MAJOR:default leaf. If still
  209 * unsuccessful, we finish and return the direct queue.
 210 */
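
/* Illustrative example (assumed setup, not from this file): with the qdisc
 * installed as "handle 1:", an application can select leaf 1:10 directly by
 * setting SO_PRIORITY (and hence skb->priority) to the raw classid 0x00010010,
 * while 0x00010000 (1:0, i.e. sch->handle) selects the direct queue and
 * bypasses shaping.
 */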
 211#define HTB_DIRECT ((struct htb_class *)-1L)
 212
 213static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
 214                                      int *qerr)
 215{
 216        struct htb_sched *q = qdisc_priv(sch);
 217        struct htb_class *cl;
 218        struct tcf_result res;
 219        struct tcf_proto *tcf;
 220        int result;
 221
  222        /* allow class selection by setting skb->priority to a valid classid;
  223         * note that the nfmark can be used too, by attaching an fw filter
  224         * with no rules in it
  225         */
 226        if (skb->priority == sch->handle)
 227                return HTB_DIRECT;      /* X:0 (direct flow) selected */
 228        cl = htb_find(skb->priority, sch);
 229        if (cl) {
 230                if (cl->level == 0)
 231                        return cl;
 232                /* Start with inner filter chain if a non-leaf class is selected */
 233                tcf = rcu_dereference_bh(cl->filter_list);
 234        } else {
 235                tcf = rcu_dereference_bh(q->filter_list);
 236        }
 237
 238        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 239        while (tcf && (result = tcf_classify(skb, tcf, &res, false)) >= 0) {
 240#ifdef CONFIG_NET_CLS_ACT
 241                switch (result) {
 242                case TC_ACT_QUEUED:
 243                case TC_ACT_STOLEN:
 244                case TC_ACT_TRAP:
 245                        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
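                        /* fall through */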
 246                case TC_ACT_SHOT:
 247                        return NULL;
 248                }
 249#endif
 250                cl = (void *)res.class;
 251                if (!cl) {
 252                        if (res.classid == sch->handle)
 253                                return HTB_DIRECT;      /* X:0 (direct flow) */
 254                        cl = htb_find(res.classid, sch);
 255                        if (!cl)
 256                                break;  /* filter selected invalid classid */
 257                }
 258                if (!cl->level)
 259                        return cl;      /* we hit leaf; return it */
 260
 261                /* we have got inner class; apply inner filter chain */
 262                tcf = rcu_dereference_bh(cl->filter_list);
 263        }
 264        /* classification failed; try to use default class */
 265        cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
 266        if (!cl || cl->level)
 267                return HTB_DIRECT;      /* bad default .. this is safe bet */
 268        return cl;
 269}
 270
 271/**
 272 * htb_add_to_id_tree - adds class to the round robin list
 273 *
  274 * The routine adds the class to the list (actually a tree) sorted by classid.
  275 * Make sure that the class is not already on such a list for the given prio.
 276 */
 277static void htb_add_to_id_tree(struct rb_root *root,
 278                               struct htb_class *cl, int prio)
 279{
 280        struct rb_node **p = &root->rb_node, *parent = NULL;
 281
 282        while (*p) {
 283                struct htb_class *c;
 284                parent = *p;
 285                c = rb_entry(parent, struct htb_class, node[prio]);
 286
 287                if (cl->common.classid > c->common.classid)
 288                        p = &parent->rb_right;
 289                else
 290                        p = &parent->rb_left;
 291        }
 292        rb_link_node(&cl->node[prio], parent, p);
 293        rb_insert_color(&cl->node[prio], root);
 294}
 295
 296/**
 297 * htb_add_to_wait_tree - adds class to the event queue with delay
 298 *
  299 * The class is added to the priority event queue to indicate that the class
  300 * will change its mode at time cl->pq_key (in nanoseconds). Make sure that
  301 * the class is not already in the queue.
 302 */
 303static void htb_add_to_wait_tree(struct htb_sched *q,
 304                                 struct htb_class *cl, s64 delay)
 305{
 306        struct rb_node **p = &q->hlevel[cl->level].wait_pq.rb_node, *parent = NULL;
 307
 308        cl->pq_key = q->now + delay;
 309        if (cl->pq_key == q->now)
 310                cl->pq_key++;
 311
 312        /* update the nearest event cache */
 313        if (q->near_ev_cache[cl->level] > cl->pq_key)
 314                q->near_ev_cache[cl->level] = cl->pq_key;
 315
 316        while (*p) {
 317                struct htb_class *c;
 318                parent = *p;
 319                c = rb_entry(parent, struct htb_class, pq_node);
 320                if (cl->pq_key >= c->pq_key)
 321                        p = &parent->rb_right;
 322                else
 323                        p = &parent->rb_left;
 324        }
 325        rb_link_node(&cl->pq_node, parent, p);
 326        rb_insert_color(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
 327}
 328
 329/**
 330 * htb_next_rb_node - finds next node in binary tree
 331 *
  332 * When we are past the last key, *n is set to NULL.
 333 * Average complexity is 2 steps per call.
 334 */
 335static inline void htb_next_rb_node(struct rb_node **n)
 336{
 337        *n = rb_next(*n);
 338}
 339
 340/**
 341 * htb_add_class_to_row - add class to its row
 342 *
  343 * The class is added to the row at the priorities marked in mask.
 344 * It does nothing if mask == 0.
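 * For example, a mask of 0x5 visits prio 0 and then prio 2: ffz(~mask)
 * yields the index of the lowest set bit of mask, which is then cleared.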
 345 */
 346static inline void htb_add_class_to_row(struct htb_sched *q,
 347                                        struct htb_class *cl, int mask)
 348{
 349        q->row_mask[cl->level] |= mask;
 350        while (mask) {
 351                int prio = ffz(~mask);
 352                mask &= ~(1 << prio);
 353                htb_add_to_id_tree(&q->hlevel[cl->level].hprio[prio].row, cl, prio);
 354        }
 355}
 356
 357/* If this triggers, it is a bug in this code, but it need not be fatal */
 358static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root)
 359{
 360        if (RB_EMPTY_NODE(rb)) {
 361                WARN_ON(1);
 362        } else {
 363                rb_erase(rb, root);
 364                RB_CLEAR_NODE(rb);
 365        }
 366}
 367
 368
 369/**
 370 * htb_remove_class_from_row - removes class from its row
 371 *
  372 * The class is removed from the row at the priorities marked in mask.
 373 * It does nothing if mask == 0.
 374 */
 375static inline void htb_remove_class_from_row(struct htb_sched *q,
 376                                                 struct htb_class *cl, int mask)
 377{
 378        int m = 0;
 379        struct htb_level *hlevel = &q->hlevel[cl->level];
 380
 381        while (mask) {
 382                int prio = ffz(~mask);
 383                struct htb_prio *hprio = &hlevel->hprio[prio];
 384
 385                mask &= ~(1 << prio);
 386                if (hprio->ptr == cl->node + prio)
 387                        htb_next_rb_node(&hprio->ptr);
 388
 389                htb_safe_rb_erase(cl->node + prio, &hprio->row);
 390                if (!hprio->row.rb_node)
 391                        m |= 1 << prio;
 392        }
 393        q->row_mask[cl->level] &= ~m;
 394}
 395
 396/**
  397 * htb_activate_prios - creates the active class's feed chain
  398 *
  399 * The class is connected to its ancestors and/or the appropriate rows
  400 * for the priorities it participates in. cl->cmode must be the new
  401 * (activated) mode. It does nothing if cl->prio_activity == 0.
 402 */
 403static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
 404{
 405        struct htb_class *p = cl->parent;
 406        long m, mask = cl->prio_activity;
 407
 408        while (cl->cmode == HTB_MAY_BORROW && p && mask) {
 409                m = mask;
 410                while (m) {
 411                        int prio = ffz(~m);
 412                        m &= ~(1 << prio);
 413
 414                        if (p->un.inner.clprio[prio].feed.rb_node)
  415                                /* parent already has its feed in use, so
  416                                 * reset the bit in mask as parent is already ok
 417                                 */
 418                                mask &= ~(1 << prio);
 419
 420                        htb_add_to_id_tree(&p->un.inner.clprio[prio].feed, cl, prio);
 421                }
 422                p->prio_activity |= mask;
 423                cl = p;
 424                p = cl->parent;
 425
 426        }
 427        if (cl->cmode == HTB_CAN_SEND && mask)
 428                htb_add_class_to_row(q, cl, mask);
 429}
 430
 431/**
 432 * htb_deactivate_prios - remove class from feed chain
 433 *
  434 * cl->cmode must represent the old mode (before deactivation). It does
  435 * nothing if cl->prio_activity == 0. The class is removed from all feed
 436 * chains and rows.
 437 */
 438static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
 439{
 440        struct htb_class *p = cl->parent;
 441        long m, mask = cl->prio_activity;
 442
 443        while (cl->cmode == HTB_MAY_BORROW && p && mask) {
 444                m = mask;
 445                mask = 0;
 446                while (m) {
 447                        int prio = ffz(~m);
 448                        m &= ~(1 << prio);
 449
 450                        if (p->un.inner.clprio[prio].ptr == cl->node + prio) {
  451                                /* we are removing a child which is pointed
  452                                 * to from the parent feed - forget the pointer
  453                                 * but remember the classid
 454                                 */
 455                                p->un.inner.clprio[prio].last_ptr_id = cl->common.classid;
 456                                p->un.inner.clprio[prio].ptr = NULL;
 457                        }
 458
 459                        htb_safe_rb_erase(cl->node + prio,
 460                                          &p->un.inner.clprio[prio].feed);
 461
 462                        if (!p->un.inner.clprio[prio].feed.rb_node)
 463                                mask |= 1 << prio;
 464                }
 465
 466                p->prio_activity &= ~mask;
 467                cl = p;
 468                p = cl->parent;
 469
 470        }
 471        if (cl->cmode == HTB_CAN_SEND && mask)
 472                htb_remove_class_from_row(q, cl, mask);
 473}
 474
 475static inline s64 htb_lowater(const struct htb_class *cl)
 476{
 477        if (htb_hysteresis)
 478                return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
 479        else
 480                return 0;
 481}
 482static inline s64 htb_hiwater(const struct htb_class *cl)
 483{
 484        if (htb_hysteresis)
 485                return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
 486        else
 487                return 0;
 488}
 489
 490
 491/**
 492 * htb_class_mode - computes and returns current class mode
 493 *
  494 * It computes cl's mode at time cl->t_c+diff and returns it. If the mode
  495 * is not HTB_CAN_SEND then *diff is updated to the time difference
  496 * from now to the time when cl will change its state.
  497 * It is also worth noting that the class mode doesn't change simply
  498 * at cl->{c,}tokens == 0 but rather there is hysteresis in the
  499 * 0 .. -cl->{c,}buffer range. It is meant to limit the number of
  500 * mode transitions per time unit. The speed gain is about 1/6.
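 *
 * A small worked example (illustrative values): with hysteresis off, if
 * cl->ctokens + *diff comes out at -3000 the class becomes HTB_CANT_SEND and
 * *diff is set to 3000, i.e. roughly the number of nanoseconds until the ceil
 * bucket refills to zero; if the ceil bucket is fine but cl->tokens + *diff
 * is negative, the class may only borrow (HTB_MAY_BORROW).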
 501 */
 502static inline enum htb_cmode
 503htb_class_mode(struct htb_class *cl, s64 *diff)
 504{
 505        s64 toks;
 506
 507        if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
 508                *diff = -toks;
 509                return HTB_CANT_SEND;
 510        }
 511
 512        if ((toks = (cl->tokens + *diff)) >= htb_hiwater(cl))
 513                return HTB_CAN_SEND;
 514
 515        *diff = -toks;
 516        return HTB_MAY_BORROW;
 517}
 518
 519/**
  520 * htb_change_class_mode - changes the class's mode
  521 *
  522 * This should be the only way to change a class's mode under normal
  523 * circumstances. The routine will update the feed list linkage, change the
  524 * mode and add the class to the wait event queue if appropriate. The new
  525 * mode should be different from the old one and cl->pq_key has to be valid
  526 * if changing to a mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
 527 */
 528static void
 529htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff)
 530{
 531        enum htb_cmode new_mode = htb_class_mode(cl, diff);
 532
 533        if (new_mode == cl->cmode)
 534                return;
 535
 536        if (cl->prio_activity) {        /* not necessary: speed optimization */
 537                if (cl->cmode != HTB_CANT_SEND)
 538                        htb_deactivate_prios(q, cl);
 539                cl->cmode = new_mode;
 540                if (new_mode != HTB_CANT_SEND)
 541                        htb_activate_prios(q, cl);
 542        } else
 543                cl->cmode = new_mode;
 544}
 545
 546/**
 547 * htb_activate - inserts leaf cl into appropriate active feeds
 548 *
  549 * The routine learns the (new) priority of the leaf and activates the feed
  550 * chain for that prio. It can safely be called on an already active leaf.
  551 * It also adds the leaf to the drop list.
 552 */
 553static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
 554{
 555        WARN_ON(cl->level || !cl->un.leaf.q || !cl->un.leaf.q->q.qlen);
 556
 557        if (!cl->prio_activity) {
 558                cl->prio_activity = 1 << cl->prio;
 559                htb_activate_prios(q, cl);
 560                list_add_tail(&cl->un.leaf.drop_list,
 561                              q->drops + cl->prio);
 562        }
 563}
 564
 565/**
 566 * htb_deactivate - remove leaf cl from active feeds
 567 *
  568 * Make sure that the leaf is active. In other words, it can't be called
  569 * with a non-active leaf. It also removes the class from the drop list.
 570 */
 571static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
 572{
 573        WARN_ON(!cl->prio_activity);
 574
 575        htb_deactivate_prios(q, cl);
 576        cl->prio_activity = 0;
 577        list_del_init(&cl->un.leaf.drop_list);
 578}
 579
 580static void htb_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch,
 581                             struct qdisc_skb_head *qh)
 582{
 583        struct sk_buff *last = qh->tail;
 584
 585        if (last) {
 586                skb->next = NULL;
 587                last->next = skb;
 588                qh->tail = skb;
 589        } else {
 590                qh->tail = skb;
 591                qh->head = skb;
 592        }
 593        qh->qlen++;
 594}
 595
 596static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 597                       struct sk_buff **to_free)
 598{
 599        int uninitialized_var(ret);
 600        struct htb_sched *q = qdisc_priv(sch);
 601        struct htb_class *cl = htb_classify(skb, sch, &ret);
 602
 603        if (cl == HTB_DIRECT) {
 604                /* enqueue to helper queue */
 605                if (q->direct_queue.qlen < q->direct_qlen) {
 606                        htb_enqueue_tail(skb, sch, &q->direct_queue);
 607                        q->direct_pkts++;
 608                } else {
 609                        return qdisc_drop(skb, sch, to_free);
 610                }
 611#ifdef CONFIG_NET_CLS_ACT
 612        } else if (!cl) {
 613                if (ret & __NET_XMIT_BYPASS)
 614                        qdisc_qstats_drop(sch);
 615                __qdisc_drop(skb, to_free);
 616                return ret;
 617#endif
 618        } else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q,
 619                                        to_free)) != NET_XMIT_SUCCESS) {
 620                if (net_xmit_drop_count(ret)) {
 621                        qdisc_qstats_drop(sch);
 622                        cl->drops++;
 623                }
 624                return ret;
 625        } else {
 626                htb_activate(q, cl);
 627        }
 628
 629        qdisc_qstats_backlog_inc(sch, skb);
 630        sch->q.qlen++;
 631        return NET_XMIT_SUCCESS;
 632}
 633
 634static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, s64 diff)
 635{
 636        s64 toks = diff + cl->tokens;
 637
 638        if (toks > cl->buffer)
 639                toks = cl->buffer;
 640        toks -= (s64) psched_l2t_ns(&cl->rate, bytes);
 641        if (toks <= -cl->mbuffer)
 642                toks = 1 - cl->mbuffer;
 643
 644        cl->tokens = toks;
 645}
 646
 647static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff)
 648{
 649        s64 toks = diff + cl->ctokens;
 650
 651        if (toks > cl->cbuffer)
 652                toks = cl->cbuffer;
 653        toks -= (s64) psched_l2t_ns(&cl->ceil, bytes);
 654        if (toks <= -cl->mbuffer)
 655                toks = 1 - cl->mbuffer;
 656
 657        cl->ctokens = toks;
 658}
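
/* Token accounting example (illustrative numbers): psched_l2t_ns() converts a
 * packet length into the nanoseconds it costs at the configured rate, so at
 * 1 Mbit/s a 1500-byte packet drains 1500 * 8 / 1e6 s = 12 ms worth of tokens,
 * while elapsed wall-clock time refills the bucket up to cl->buffer/cbuffer.
 */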
 659
 660/**
 661 * htb_charge_class - charges amount "bytes" to leaf and ancestors
 662 *
  663 * The routine assumes that a packet "bytes" long was dequeued from leaf cl,
  664 * borrowing from "level". It accounts bytes to the ceil leaky bucket for the
  665 * leaf and all ancestors and to the rate bucket for ancestors at levels
  666 * "level" and higher. It also handles a possible change of mode resulting
  667 * from the update. Note that the mode can also increase here (MAY_BORROW to
  668 * CAN_SEND) because we can use a more precise clock than the event queue
  669 * here. In such a case we remove the class from the event queue first.
 670 */
 671static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
 672                             int level, struct sk_buff *skb)
 673{
 674        int bytes = qdisc_pkt_len(skb);
 675        enum htb_cmode old_mode;
 676        s64 diff;
 677
 678        while (cl) {
 679                diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
 680                if (cl->level >= level) {
 681                        if (cl->level == level)
 682                                cl->xstats.lends++;
 683                        htb_accnt_tokens(cl, bytes, diff);
 684                } else {
 685                        cl->xstats.borrows++;
 686                        cl->tokens += diff;     /* we moved t_c; update tokens */
 687                }
 688                htb_accnt_ctokens(cl, bytes, diff);
 689                cl->t_c = q->now;
 690
 691                old_mode = cl->cmode;
 692                diff = 0;
 693                htb_change_class_mode(q, cl, &diff);
 694                if (old_mode != cl->cmode) {
 695                        if (old_mode != HTB_CAN_SEND)
 696                                htb_safe_rb_erase(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
 697                        if (cl->cmode != HTB_CAN_SEND)
 698                                htb_add_to_wait_tree(q, cl, diff);
 699                }
 700
 701                /* update basic stats except for leaves which are already updated */
 702                if (cl->level)
 703                        bstats_update(&cl->bstats, skb);
 704
 705                cl = cl->parent;
 706        }
 707}
 708
 709/**
 710 * htb_do_events - make mode changes to classes at the level
 711 *
  712 * Scans the event queue for pending events and applies them. Returns the
  713 * time of the next pending event (0 for no event in pq, q->now for too many
  714 * events). Note: only events with cl->pq_key <= q->now are applied.
 715 */
 716static s64 htb_do_events(struct htb_sched *q, const int level,
 717                         unsigned long start)
 718{
  719        /* don't run for longer than 2 jiffies; 2 is used instead of
  720         * 1 to simplify things when the jiffy counter is about to be
  721         * incremented too soon
 722         */
 723        unsigned long stop_at = start + 2;
 724        struct rb_root *wait_pq = &q->hlevel[level].wait_pq;
 725
 726        while (time_before(jiffies, stop_at)) {
 727                struct htb_class *cl;
 728                s64 diff;
 729                struct rb_node *p = rb_first(wait_pq);
 730
 731                if (!p)
 732                        return 0;
 733
 734                cl = rb_entry(p, struct htb_class, pq_node);
 735                if (cl->pq_key > q->now)
 736                        return cl->pq_key;
 737
 738                htb_safe_rb_erase(p, wait_pq);
 739                diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
 740                htb_change_class_mode(q, cl, &diff);
 741                if (cl->cmode != HTB_CAN_SEND)
 742                        htb_add_to_wait_tree(q, cl, diff);
 743        }
 744
 745        /* too much load - let's continue after a break for scheduling */
 746        if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) {
 747                pr_warn("htb: too many events!\n");
 748                q->warned |= HTB_WARN_TOOMANYEVENTS;
 749        }
 750
 751        return q->now;
 752}
 753
  754/* Returns class->node+prio from the id-tree where the class's id is >= id.
  755 * NULL if no such node exists.
 756 */
 757static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
 758                                              u32 id)
 759{
 760        struct rb_node *r = NULL;
 761        while (n) {
 762                struct htb_class *cl =
 763                    rb_entry(n, struct htb_class, node[prio]);
 764
 765                if (id > cl->common.classid) {
 766                        n = n->rb_right;
 767                } else if (id < cl->common.classid) {
 768                        r = n;
 769                        n = n->rb_left;
 770                } else {
 771                        return n;
 772                }
 773        }
 774        return r;
 775}
 776
 777/**
 778 * htb_lookup_leaf - returns next leaf class in DRR order
 779 *
  780 * Find the leaf the current feed pointer points to.
 781 */
 782static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio)
 783{
 784        int i;
 785        struct {
 786                struct rb_node *root;
 787                struct rb_node **pptr;
 788                u32 *pid;
 789        } stk[TC_HTB_MAXDEPTH], *sp = stk;
 790
 791        BUG_ON(!hprio->row.rb_node);
 792        sp->root = hprio->row.rb_node;
 793        sp->pptr = &hprio->ptr;
 794        sp->pid = &hprio->last_ptr_id;
 795
 796        for (i = 0; i < 65535; i++) {
 797                if (!*sp->pptr && *sp->pid) {
 798                        /* ptr was invalidated but id is valid - try to recover
 799                         * the original or next ptr
 800                         */
 801                        *sp->pptr =
 802                            htb_id_find_next_upper(prio, sp->root, *sp->pid);
 803                }
  804                *sp->pid = 0;   /* ptr is valid now, so remove this hint as it
 805                                 * can become out of date quickly
 806                                 */
 807                if (!*sp->pptr) {       /* we are at right end; rewind & go up */
 808                        *sp->pptr = sp->root;
 809                        while ((*sp->pptr)->rb_left)
 810                                *sp->pptr = (*sp->pptr)->rb_left;
 811                        if (sp > stk) {
 812                                sp--;
 813                                if (!*sp->pptr) {
 814                                        WARN_ON(1);
 815                                        return NULL;
 816                                }
 817                                htb_next_rb_node(sp->pptr);
 818                        }
 819                } else {
 820                        struct htb_class *cl;
 821                        struct htb_prio *clp;
 822
 823                        cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
 824                        if (!cl->level)
 825                                return cl;
 826                        clp = &cl->un.inner.clprio[prio];
 827                        (++sp)->root = clp->feed.rb_node;
 828                        sp->pptr = &clp->ptr;
 829                        sp->pid = &clp->last_ptr_id;
 830                }
 831        }
 832        WARN_ON(1);
 833        return NULL;
 834}
 835
  836/* dequeues a packet at the given priority and level; call only if
  837 * you are sure that there is an active class at prio/level
 838 */
 839static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, const int prio,
 840                                        const int level)
 841{
 842        struct sk_buff *skb = NULL;
 843        struct htb_class *cl, *start;
 844        struct htb_level *hlevel = &q->hlevel[level];
 845        struct htb_prio *hprio = &hlevel->hprio[prio];
 846
  847        /* look up the initial class in the row */
 848        start = cl = htb_lookup_leaf(hprio, prio);
 849
 850        do {
 851next:
 852                if (unlikely(!cl))
 853                        return NULL;
 854
  855                /* the class can be empty - it is unlikely but can be true if
  856                 * the leaf qdisc drops packets in its enqueue routine or if
  857                 * someone used a graft operation on the leaf since the last
  858                 * dequeue; simply deactivate and skip such a class
 859                 */
 860                if (unlikely(cl->un.leaf.q->q.qlen == 0)) {
 861                        struct htb_class *next;
 862                        htb_deactivate(q, cl);
 863
 864                        /* row/level might become empty */
 865                        if ((q->row_mask[level] & (1 << prio)) == 0)
 866                                return NULL;
 867
 868                        next = htb_lookup_leaf(hprio, prio);
 869
 870                        if (cl == start)        /* fix start if we just deleted it */
 871                                start = next;
 872                        cl = next;
 873                        goto next;
 874                }
 875
 876                skb = cl->un.leaf.q->dequeue(cl->un.leaf.q);
 877                if (likely(skb != NULL))
 878                        break;
 879
 880                qdisc_warn_nonwc("htb", cl->un.leaf.q);
 881                htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr:
 882                                         &q->hlevel[0].hprio[prio].ptr);
 883                cl = htb_lookup_leaf(hprio, prio);
 884
 885        } while (cl != start);
 886
 887        if (likely(skb != NULL)) {
 888                bstats_update(&cl->bstats, skb);
 889                cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb);
 890                if (cl->un.leaf.deficit[level] < 0) {
 891                        cl->un.leaf.deficit[level] += cl->quantum;
 892                        htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr :
 893                                                 &q->hlevel[0].hprio[prio].ptr);
 894                }
  895                /* this used to be after charge_class but this ordering
 896                 * gives us slightly better performance
 897                 */
 898                if (!cl->un.leaf.q->q.qlen)
 899                        htb_deactivate(q, cl);
 900                htb_charge_class(q, cl, level, skb);
 901        }
 902        return skb;
 903}
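
/* DRR note (summary of the logic above, not new behaviour): a leaf keeps being
 * picked at its prio/level until its deficit goes negative; the deficit is then
 * topped up by cl->quantum and the round-robin pointer advances, so dequeue
 * opportunities are spread roughly in proportion to the classes' quanta.
 */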
 904
 905static struct sk_buff *htb_dequeue(struct Qdisc *sch)
 906{
 907        struct sk_buff *skb;
 908        struct htb_sched *q = qdisc_priv(sch);
 909        int level;
 910        s64 next_event;
 911        unsigned long start_at;
 912
 913        /* try to dequeue direct packets as high prio (!) to minimize cpu work */
 914        skb = __qdisc_dequeue_head(&q->direct_queue);
 915        if (skb != NULL) {
 916ok:
 917                qdisc_bstats_update(sch, skb);
 918                qdisc_qstats_backlog_dec(sch, skb);
 919                sch->q.qlen--;
 920                return skb;
 921        }
 922
 923        if (!sch->q.qlen)
 924                goto fin;
 925        q->now = ktime_get_ns();
 926        start_at = jiffies;
 927
 928        next_event = q->now + 5LLU * NSEC_PER_SEC;
 929
 930        for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
 931                /* common case optimization - skip event handler quickly */
 932                int m;
 933                s64 event = q->near_ev_cache[level];
 934
 935                if (q->now >= event) {
 936                        event = htb_do_events(q, level, start_at);
 937                        if (!event)
 938                                event = q->now + NSEC_PER_SEC;
 939                        q->near_ev_cache[level] = event;
 940                }
 941
 942                if (next_event > event)
 943                        next_event = event;
 944
 945                m = ~q->row_mask[level];
 946                while (m != (int)(-1)) {
 947                        int prio = ffz(m);
 948
 949                        m |= 1 << prio;
 950                        skb = htb_dequeue_tree(q, prio, level);
 951                        if (likely(skb != NULL))
 952                                goto ok;
 953                }
 954        }
 955        qdisc_qstats_overlimit(sch);
 956        if (likely(next_event > q->now))
 957                qdisc_watchdog_schedule_ns(&q->watchdog, next_event);
 958        else
 959                schedule_work(&q->work);
 960fin:
 961        return skb;
 962}
 963
 964/* reset all classes */
  965/* always called under BH & queue lock */
 966static void htb_reset(struct Qdisc *sch)
 967{
 968        struct htb_sched *q = qdisc_priv(sch);
 969        struct htb_class *cl;
 970        unsigned int i;
 971
 972        for (i = 0; i < q->clhash.hashsize; i++) {
 973                hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
 974                        if (cl->level)
 975                                memset(&cl->un.inner, 0, sizeof(cl->un.inner));
 976                        else {
 977                                if (cl->un.leaf.q)
 978                                        qdisc_reset(cl->un.leaf.q);
 979                                INIT_LIST_HEAD(&cl->un.leaf.drop_list);
 980                        }
 981                        cl->prio_activity = 0;
 982                        cl->cmode = HTB_CAN_SEND;
 983                }
 984        }
 985        qdisc_watchdog_cancel(&q->watchdog);
 986        __qdisc_reset_queue(&q->direct_queue);
 987        sch->q.qlen = 0;
 988        sch->qstats.backlog = 0;
 989        memset(q->hlevel, 0, sizeof(q->hlevel));
 990        memset(q->row_mask, 0, sizeof(q->row_mask));
 991        for (i = 0; i < TC_HTB_NUMPRIO; i++)
 992                INIT_LIST_HEAD(q->drops + i);
 993}
 994
 995static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
 996        [TCA_HTB_PARMS] = { .len = sizeof(struct tc_htb_opt) },
 997        [TCA_HTB_INIT]  = { .len = sizeof(struct tc_htb_glob) },
 998        [TCA_HTB_CTAB]  = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
 999        [TCA_HTB_RTAB]  = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1000        [TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 },
1001        [TCA_HTB_RATE64] = { .type = NLA_U64 },
1002        [TCA_HTB_CEIL64] = { .type = NLA_U64 },
1003};
1004
1005static void htb_work_func(struct work_struct *work)
1006{
1007        struct htb_sched *q = container_of(work, struct htb_sched, work);
1008        struct Qdisc *sch = q->watchdog.qdisc;
1009
1010        rcu_read_lock();
1011        __netif_schedule(qdisc_root(sch));
1012        rcu_read_unlock();
1013}
1014
1015static int htb_init(struct Qdisc *sch, struct nlattr *opt)
1016{
1017        struct htb_sched *q = qdisc_priv(sch);
1018        struct nlattr *tb[TCA_HTB_MAX + 1];
1019        struct tc_htb_glob *gopt;
1020        int err;
1021        int i;
1022
1023        qdisc_watchdog_init(&q->watchdog, sch);
1024        INIT_WORK(&q->work, htb_work_func);
1025
1026        if (!opt)
1027                return -EINVAL;
1028
1029        err = tcf_block_get(&q->block, &q->filter_list);
1030        if (err)
1031                return err;
1032
1033        err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy, NULL);
1034        if (err < 0)
1035                return err;
1036
1037        if (!tb[TCA_HTB_INIT])
1038                return -EINVAL;
1039
1040        gopt = nla_data(tb[TCA_HTB_INIT]);
1041        if (gopt->version != HTB_VER >> 16)
1042                return -EINVAL;
1043
1044        err = qdisc_class_hash_init(&q->clhash);
1045        if (err < 0)
1046                return err;
1047        for (i = 0; i < TC_HTB_NUMPRIO; i++)
1048                INIT_LIST_HEAD(q->drops + i);
1049
1050        qdisc_skb_head_init(&q->direct_queue);
1051
1052        if (tb[TCA_HTB_DIRECT_QLEN])
1053                q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]);
1054        else
1055                q->direct_qlen = qdisc_dev(sch)->tx_queue_len;
1056
1057        if ((q->rate2quantum = gopt->rate2quantum) < 1)
1058                q->rate2quantum = 1;
1059        q->defcls = gopt->defcls;
1060
1061        return 0;
1062}
1063
1064static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
1065{
1066        struct htb_sched *q = qdisc_priv(sch);
1067        struct nlattr *nest;
1068        struct tc_htb_glob gopt;
1069
 1070        /* It's safe not to acquire the qdisc lock. As we hold RTNL,
1071         * no change can happen on the qdisc parameters.
1072         */
1073
1074        gopt.direct_pkts = q->direct_pkts;
1075        gopt.version = HTB_VER;
1076        gopt.rate2quantum = q->rate2quantum;
1077        gopt.defcls = q->defcls;
1078        gopt.debug = 0;
1079
1080        nest = nla_nest_start(skb, TCA_OPTIONS);
1081        if (nest == NULL)
1082                goto nla_put_failure;
1083        if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt) ||
1084            nla_put_u32(skb, TCA_HTB_DIRECT_QLEN, q->direct_qlen))
1085                goto nla_put_failure;
1086
1087        return nla_nest_end(skb, nest);
1088
1089nla_put_failure:
1090        nla_nest_cancel(skb, nest);
1091        return -1;
1092}
1093
1094static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
1095                          struct sk_buff *skb, struct tcmsg *tcm)
1096{
1097        struct htb_class *cl = (struct htb_class *)arg;
1098        struct nlattr *nest;
1099        struct tc_htb_opt opt;
1100
 1101        /* It's safe not to acquire the qdisc lock. As we hold RTNL,
1102         * no change can happen on the class parameters.
1103         */
1104        tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;
1105        tcm->tcm_handle = cl->common.classid;
1106        if (!cl->level && cl->un.leaf.q)
1107                tcm->tcm_info = cl->un.leaf.q->handle;
1108
1109        nest = nla_nest_start(skb, TCA_OPTIONS);
1110        if (nest == NULL)
1111                goto nla_put_failure;
1112
1113        memset(&opt, 0, sizeof(opt));
1114
1115        psched_ratecfg_getrate(&opt.rate, &cl->rate);
1116        opt.buffer = PSCHED_NS2TICKS(cl->buffer);
1117        psched_ratecfg_getrate(&opt.ceil, &cl->ceil);
1118        opt.cbuffer = PSCHED_NS2TICKS(cl->cbuffer);
1119        opt.quantum = cl->quantum;
1120        opt.prio = cl->prio;
1121        opt.level = cl->level;
1122        if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt))
1123                goto nla_put_failure;
1124        if ((cl->rate.rate_bytes_ps >= (1ULL << 32)) &&
1125            nla_put_u64_64bit(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps,
1126                              TCA_HTB_PAD))
1127                goto nla_put_failure;
1128        if ((cl->ceil.rate_bytes_ps >= (1ULL << 32)) &&
1129            nla_put_u64_64bit(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps,
1130                              TCA_HTB_PAD))
1131                goto nla_put_failure;
1132
1133        return nla_nest_end(skb, nest);
1134
1135nla_put_failure:
1136        nla_nest_cancel(skb, nest);
1137        return -1;
1138}
1139
1140static int
1141htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
1142{
1143        struct htb_class *cl = (struct htb_class *)arg;
1144        struct gnet_stats_queue qs = {
1145                .drops = cl->drops,
1146        };
1147        __u32 qlen = 0;
1148
1149        if (!cl->level && cl->un.leaf.q) {
1150                qlen = cl->un.leaf.q->q.qlen;
1151                qs.backlog = cl->un.leaf.q->qstats.backlog;
1152        }
1153        cl->xstats.tokens = clamp_t(s64, PSCHED_NS2TICKS(cl->tokens),
1154                                    INT_MIN, INT_MAX);
1155        cl->xstats.ctokens = clamp_t(s64, PSCHED_NS2TICKS(cl->ctokens),
1156                                     INT_MIN, INT_MAX);
1157
1158        if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
1159                                  d, NULL, &cl->bstats) < 0 ||
1160            gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1161            gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0)
1162                return -1;
1163
1164        return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1165}
1166
1167static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1168                     struct Qdisc **old)
1169{
1170        struct htb_class *cl = (struct htb_class *)arg;
1171
1172        if (cl->level)
1173                return -EINVAL;
1174        if (new == NULL &&
1175            (new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1176                                     cl->common.classid)) == NULL)
1177                return -ENOBUFS;
1178
1179        *old = qdisc_replace(sch, new, &cl->un.leaf.q);
1180        return 0;
1181}
1182
1183static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
1184{
1185        struct htb_class *cl = (struct htb_class *)arg;
1186        return !cl->level ? cl->un.leaf.q : NULL;
1187}
1188
1189static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
1190{
1191        struct htb_class *cl = (struct htb_class *)arg;
1192
1193        htb_deactivate(qdisc_priv(sch), cl);
1194}
1195
1196static inline int htb_parent_last_child(struct htb_class *cl)
1197{
1198        if (!cl->parent)
1199                /* the root class */
1200                return 0;
1201        if (cl->parent->children > 1)
1202                /* not the last child */
1203                return 0;
1204        return 1;
1205}
1206
1207static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
1208                               struct Qdisc *new_q)
1209{
1210        struct htb_class *parent = cl->parent;
1211
1212        WARN_ON(cl->level || !cl->un.leaf.q || cl->prio_activity);
1213
1214        if (parent->cmode != HTB_CAN_SEND)
1215                htb_safe_rb_erase(&parent->pq_node,
1216                                  &q->hlevel[parent->level].wait_pq);
1217
1218        parent->level = 0;
1219        memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1220        INIT_LIST_HEAD(&parent->un.leaf.drop_list);
1221        parent->un.leaf.q = new_q ? new_q : &noop_qdisc;
1222        parent->tokens = parent->buffer;
1223        parent->ctokens = parent->cbuffer;
1224        parent->t_c = ktime_get_ns();
1225        parent->cmode = HTB_CAN_SEND;
1226}
1227
1228static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
1229{
1230        if (!cl->level) {
1231                WARN_ON(!cl->un.leaf.q);
1232                qdisc_destroy(cl->un.leaf.q);
1233        }
1234        gen_kill_estimator(&cl->rate_est);
1235        tcf_block_put(cl->block);
1236        kfree(cl);
1237}
1238
1239static void htb_destroy(struct Qdisc *sch)
1240{
1241        struct htb_sched *q = qdisc_priv(sch);
1242        struct hlist_node *next;
1243        struct htb_class *cl;
1244        unsigned int i;
1245
1246        cancel_work_sync(&q->work);
1247        qdisc_watchdog_cancel(&q->watchdog);
 1248        /* This line used to be after the htb_destroy_class call below
 1249         * and surprisingly it worked in 2.4. But it must precede it
 1250         * because filters need their target class alive to be able to call
 1251         * unbind_filter on it (without an Oops).
1252         */
1253        tcf_block_put(q->block);
1254
1255        for (i = 0; i < q->clhash.hashsize; i++) {
1256                hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
1257                        tcf_block_put(cl->block);
1258                        cl->block = NULL;
1259                }
1260        }
1261        for (i = 0; i < q->clhash.hashsize; i++) {
1262                hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
1263                                          common.hnode)
1264                        htb_destroy_class(sch, cl);
1265        }
1266        qdisc_class_hash_destroy(&q->clhash);
1267        __qdisc_reset_queue(&q->direct_queue);
1268}
1269
1270static int htb_delete(struct Qdisc *sch, unsigned long arg)
1271{
1272        struct htb_sched *q = qdisc_priv(sch);
1273        struct htb_class *cl = (struct htb_class *)arg;
1274        struct Qdisc *new_q = NULL;
1275        int last_child = 0;
1276
 1277        /* TODO: why not allow deleting a subtree? references? does the
 1278         * tc subsys guarantee us that in htb_destroy it holds no class
 1279         * refs so that we can remove children safely there?
1280         */
1281        if (cl->children || cl->filter_cnt)
1282                return -EBUSY;
1283
1284        if (!cl->level && htb_parent_last_child(cl)) {
1285                new_q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1286                                          cl->parent->common.classid);
1287                last_child = 1;
1288        }
1289
1290        sch_tree_lock(sch);
1291
1292        if (!cl->level) {
1293                unsigned int qlen = cl->un.leaf.q->q.qlen;
1294                unsigned int backlog = cl->un.leaf.q->qstats.backlog;
1295
1296                qdisc_reset(cl->un.leaf.q);
1297                qdisc_tree_reduce_backlog(cl->un.leaf.q, qlen, backlog);
1298        }
1299
1300        /* delete from hash and active; remainder in destroy_class */
1301        qdisc_class_hash_remove(&q->clhash, &cl->common);
1302        if (cl->parent)
1303                cl->parent->children--;
1304
1305        if (cl->prio_activity)
1306                htb_deactivate(q, cl);
1307
1308        if (cl->cmode != HTB_CAN_SEND)
1309                htb_safe_rb_erase(&cl->pq_node,
1310                                  &q->hlevel[cl->level].wait_pq);
1311
1312        if (last_child)
1313                htb_parent_to_leaf(q, cl, new_q);
1314
1315        sch_tree_unlock(sch);
1316
1317        htb_destroy_class(sch, cl);
1318        return 0;
1319}
1320
1321static int htb_change_class(struct Qdisc *sch, u32 classid,
1322                            u32 parentid, struct nlattr **tca,
1323                            unsigned long *arg)
1324{
1325        int err = -EINVAL;
1326        struct htb_sched *q = qdisc_priv(sch);
1327        struct htb_class *cl = (struct htb_class *)*arg, *parent;
1328        struct nlattr *opt = tca[TCA_OPTIONS];
1329        struct nlattr *tb[TCA_HTB_MAX + 1];
1330        struct tc_htb_opt *hopt;
1331        u64 rate64, ceil64;
1332
1333        /* extract all subattrs from opt attr */
1334        if (!opt)
1335                goto failure;
1336
1337        err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy, NULL);
1338        if (err < 0)
1339                goto failure;
1340
1341        err = -EINVAL;
1342        if (tb[TCA_HTB_PARMS] == NULL)
1343                goto failure;
1344
1345        parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);
1346
1347        hopt = nla_data(tb[TCA_HTB_PARMS]);
1348        if (!hopt->rate.rate || !hopt->ceil.rate)
1349                goto failure;
1350
 1351        /* Keep backward compatibility with rate_table based iproute2 tc */
1352        if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
1353                qdisc_put_rtab(qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]));
1354
1355        if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE)
1356                qdisc_put_rtab(qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]));
1357
1358        if (!cl) {              /* new class */
1359                struct Qdisc *new_q;
1360                int prio;
1361                struct {
1362                        struct nlattr           nla;
1363                        struct gnet_estimator   opt;
1364                } est = {
1365                        .nla = {
1366                                .nla_len        = nla_attr_size(sizeof(est.opt)),
1367                                .nla_type       = TCA_RATE,
1368                        },
1369                        .opt = {
1370                                /* 4s interval, 16s averaging constant */
1371                                .interval       = 2,
1372                                .ewma_log       = 2,
1373                        },
1374                };
1375
1376                /* check for valid classid */
1377                if (!classid || TC_H_MAJ(classid ^ sch->handle) ||
1378                    htb_find(classid, sch))
1379                        goto failure;
1380
1381                /* check maximal depth */
1382                if (parent && parent->parent && parent->parent->level < 2) {
1383                        pr_err("htb: tree is too deep\n");
1384                        goto failure;
1385                }
1386                err = -ENOBUFS;
1387                cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1388                if (!cl)
1389                        goto failure;
1390
1391                err = tcf_block_get(&cl->block, &cl->filter_list);
1392                if (err) {
1393                        kfree(cl);
1394                        goto failure;
1395                }
1396                if (htb_rate_est || tca[TCA_RATE]) {
1397                        err = gen_new_estimator(&cl->bstats, NULL,
1398                                                &cl->rate_est,
1399                                                NULL,
1400                                                qdisc_root_sleeping_running(sch),
1401                                                tca[TCA_RATE] ? : &est.nla);
1402                        if (err) {
1403                                tcf_block_put(cl->block);
1404                                kfree(cl);
1405                                goto failure;
1406                        }
1407                }
1408
1409                cl->children = 0;
1410                INIT_LIST_HEAD(&cl->un.leaf.drop_list);
1411                RB_CLEAR_NODE(&cl->pq_node);
1412
1413                for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
1414                        RB_CLEAR_NODE(&cl->node[prio]);
1415
 1416                /* create the leaf qdisc early because it uses kmalloc(GFP_KERNEL),
 1417                 * so it can't be used inside of sch_tree_lock
1418                 * -- thanks to Karlis Peisenieks
1419                 */
1420                new_q = qdisc_create_dflt(sch->dev_queue,
1421                                          &pfifo_qdisc_ops, classid);
1422                sch_tree_lock(sch);
1423                if (parent && !parent->level) {
1424                        unsigned int qlen = parent->un.leaf.q->q.qlen;
1425                        unsigned int backlog = parent->un.leaf.q->qstats.backlog;
1426
1427                        /* turn parent into inner node */
1428                        qdisc_reset(parent->un.leaf.q);
1429                        qdisc_tree_reduce_backlog(parent->un.leaf.q, qlen, backlog);
1430                        qdisc_destroy(parent->un.leaf.q);
1431                        if (parent->prio_activity)
1432                                htb_deactivate(q, parent);
1433
1434                        /* remove from evt list because of level change */
1435                        if (parent->cmode != HTB_CAN_SEND) {
1436                                htb_safe_rb_erase(&parent->pq_node, &q->hlevel[0].wait_pq);
1437                                parent->cmode = HTB_CAN_SEND;
1438                        }
1439                        parent->level = (parent->parent ? parent->parent->level
1440                                         : TC_HTB_MAXDEPTH) - 1;
1441                        memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1442                }
1443                /* the leaf (i.e. this new class) needs an elementary qdisc */
1444                cl->un.leaf.q = new_q ? new_q : &noop_qdisc;
1445
1446                cl->common.classid = classid;
1447                cl->parent = parent;
1448
1449                /* set class to be in HTB_CAN_SEND state */
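                    /* (tokens/ctokens are initialized to the full buffer and
                     * cbuffer, i.e. both token buckets start out full, so a
                     * freshly created class may immediately send one burst.)
                     */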
1450                cl->tokens = PSCHED_TICKS2NS(hopt->buffer);
1451                cl->ctokens = PSCHED_TICKS2NS(hopt->cbuffer);
1452                cl->mbuffer = 60ULL * NSEC_PER_SEC;     /* 1min */
1453                cl->t_c = ktime_get_ns();
1454                cl->cmode = HTB_CAN_SEND;
1455
1456                /* attach to the hash list and parent's family */
1457                qdisc_class_hash_insert(&q->clhash, &cl->common);
1458                if (parent)
1459                        parent->children++;
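                    /* new_q may have come back NULL from qdisc_create_dflt(),
                     * in which case the leaf falls back to the shared global
                     * noop_qdisc; that one must never be added to the per-device
                     * qdisc hash, hence the check below.
                     */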
1460                if (cl->un.leaf.q != &noop_qdisc)
1461                        qdisc_hash_add(cl->un.leaf.q, true);
1462        } else {
1463                if (tca[TCA_RATE]) {
1464                        err = gen_replace_estimator(&cl->bstats, NULL,
1465                                                    &cl->rate_est,
1466                                                    NULL,
1467                                                    qdisc_root_sleeping_running(sch),
1468                                                    tca[TCA_RATE]);
1469                        if (err)
1470                                return err;
1471                }
1472                sch_tree_lock(sch);
1473        }
1474
1475        rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0;
1476
1477        ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0;
1478
1479        psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64);
1480        psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64);
1481
1482        /* there used to be a nasty bug here: we have to check that the node
1483         * really is a leaf before changing cl->un.leaf!
1484         */
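            /* For leaves, the per-round byte quantum used by the deficit
             * round-robin scheduler is derived from the class rate:
             * quantum = rate_bytes_ps / r2q.  Assuming the usual r2q default
             * of 10, an 8 Mbit/s class (1,000,000 bytes/s) gets a 100,000 byte
             * quantum, while a 64 kbit/s class (8,000 bytes/s) would get
             * 800 bytes and is clamped up to the 1000 byte floor below.  An
             * explicit quantum from userspace, e.g.
             *
             *   tc class add dev eth0 parent 1:1 classid 1:10 htb \
             *           rate 64kbit quantum 1500
             *
             * always overrides the computed value.
             */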
1485        if (!cl->level) {
1486                u64 quantum = cl->rate.rate_bytes_ps;
1487
1488                do_div(quantum, q->rate2quantum);
1489                cl->quantum = min_t(u64, quantum, INT_MAX);
1490
1491                if (!hopt->quantum && cl->quantum < 1000) {
1492                        pr_warn("HTB: quantum of class %X is small. Consider r2q change.\n",
1493                                cl->common.classid);
1494                        cl->quantum = 1000;
1495                }
1496                if (!hopt->quantum && cl->quantum > 200000) {
1497                        pr_warn("HTB: quantum of class %X is big. Consider r2q change.\n",
1498                                cl->common.classid);
1499                        cl->quantum = 200000;
1500                }
1501                if (hopt->quantum)
1502                        cl->quantum = hopt->quantum;
1503                if ((cl->prio = hopt->prio) >= TC_HTB_NUMPRIO)
1504                        cl->prio = TC_HTB_NUMPRIO - 1;
1505        }
1506
1507        cl->buffer = PSCHED_TICKS2NS(hopt->buffer);
1508        cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer);
1509
1510        sch_tree_unlock(sch);
1511
1512        qdisc_class_hash_grow(sch, &q->clhash);
1513
1514        *arg = (unsigned long)cl;
1515        return 0;
1516
1517failure:
1518        return err;
1519}
1520
1521static struct tcf_block *htb_tcf_block(struct Qdisc *sch, unsigned long arg)
1522{
1523        struct htb_sched *q = qdisc_priv(sch);
1524        struct htb_class *cl = (struct htb_class *)arg;
1525
1526        return cl ? cl->block : q->block;
1527}
1528
1529static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
1530                                     u32 classid)
1531{
1532        struct htb_class *cl = htb_find(classid, sch);
1533
1534        /*if (cl && !cl->level) return 0;
1535         * The line above used to be there to prevent attaching filters to
1536         * leaves. But at least the tc_index filter uses this just to get the
1537         * class for other reasons, so we have to allow it.
1538         * ----
1539         * 19.6.2002: as Werner explained, this is OK - bind_filter is just
1540         * another way to "lock" the class - unlike "get", this lock can
1541         * be broken by the class during destroy, IIUC.
1542         */
1543        if (cl)
1544                cl->filter_cnt++;
1545        return (unsigned long)cl;
1546}
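/* filter_cnt is effectively a reference count held by bound filters:
 * htb_unbind_filter() below drops it again, and htb_delete() (earlier in
 * this file) refuses to remove a class while it is non-zero, so a class
 * cannot vanish underneath a classifier that still maps packets to it.
 */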
1547
1548static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
1549{
1550        struct htb_class *cl = (struct htb_class *)arg;
1551
1552        if (cl)
1553                cl->filter_cnt--;
1554}
1555
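/* Standard class walker used for dumps such as "tc class show": arg->skip
 * and arg->count let an interrupted dump resume where it left off, and a
 * negative return from arg->fn aborts the walk via arg->stop.
 */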
1556static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1557{
1558        struct htb_sched *q = qdisc_priv(sch);
1559        struct htb_class *cl;
1560        unsigned int i;
1561
1562        if (arg->stop)
1563                return;
1564
1565        for (i = 0; i < q->clhash.hashsize; i++) {
1566                hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
1567                        if (arg->count < arg->skip) {
1568                                arg->count++;
1569                                continue;
1570                        }
1571                        if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1572                                arg->stop = 1;
1573                                return;
1574                        }
1575                        arg->count++;
1576                }
1577        }
1578}
1579
1580static const struct Qdisc_class_ops htb_class_ops = {
1581        .graft          =       htb_graft,
1582        .leaf           =       htb_leaf,
1583        .qlen_notify    =       htb_qlen_notify,
1584        .find           =       htb_search,
1585        .change         =       htb_change_class,
1586        .delete         =       htb_delete,
1587        .walk           =       htb_walk,
1588        .tcf_block      =       htb_tcf_block,
1589        .bind_tcf       =       htb_bind_filter,
1590        .unbind_tcf     =       htb_unbind_filter,
1591        .dump           =       htb_dump_class,
1592        .dump_stats     =       htb_dump_class_stats,
1593};
1594
1595static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
1596        .cl_ops         =       &htb_class_ops,
1597        .id             =       "htb",
1598        .priv_size      =       sizeof(struct htb_sched),
1599        .enqueue        =       htb_enqueue,
1600        .dequeue        =       htb_dequeue,
1601        .peek           =       qdisc_peek_dequeued,
1602        .init           =       htb_init,
1603        .reset          =       htb_reset,
1604        .destroy        =       htb_destroy,
1605        .dump           =       htb_dump,
1606        .owner          =       THIS_MODULE,
1607};
1608
1609static int __init htb_module_init(void)
1610{
1611        return register_qdisc(&htb_qdisc_ops);
1612}
1613static void __exit htb_module_exit(void)
1614{
1615        unregister_qdisc(&htb_qdisc_ops);
1616}
1617
1618module_init(htb_module_init)
1619module_exit(htb_module_exit)
1620MODULE_LICENSE("GPL");
1621