linux/net/sched/sch_htb.c
   1/*
   2 * net/sched/sch_htb.c  Hierarchical token bucket, feed tree version
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 * Authors:     Martin Devera, <devik@cdi.cz>
  10 *
  11 * Credits (in time order) for older HTB versions:
  12 *              Stef Coene <stef.coene@docum.org>
  13 *                      HTB support at LARTC mailing list
  14 *              Ondrej Kraus, <krauso@barr.cz>
  15 *                      found missing INIT_QDISC(htb)
  16 *              Vladimir Smelhaus, Aamer Akhter, Bert Hubert
  17 *                      helped a lot to locate nasty class stall bug
  18 *              Andi Kleen, Jamal Hadi, Bert Hubert
  19 *                      code review and helpful comments on shaping
  20 *              Tomasz Wrona, <tw@eter.tym.pl>
  21 *                      created test case so that I was able to fix nasty bug
  22 *              Wilfried Weissmann
  23 *                      spotted bug in dequeue code and helped with fix
  24 *              Jiri Fojtasek
  25 *                      fixed requeue routine
  26 *              and many others. thanks.
  27 */
  28#include <linux/module.h>
  29#include <linux/moduleparam.h>
  30#include <linux/types.h>
  31#include <linux/kernel.h>
  32#include <linux/string.h>
  33#include <linux/errno.h>
  34#include <linux/skbuff.h>
  35#include <linux/list.h>
  36#include <linux/compiler.h>
  37#include <linux/rbtree.h>
  38#include <linux/workqueue.h>
  39#include <linux/slab.h>
  40#include <net/netlink.h>
  41#include <net/sch_generic.h>
  42#include <net/pkt_sched.h>
  43#include <net/pkt_cls.h>
  44
  45/* HTB algorithm.
  46    Author: devik@cdi.cz
  47    ========================================================================
  48    HTB is like TBF with multiple classes. It is also similar to CBQ because
   49    it allows assigning a priority to each class in the hierarchy.
  50    In fact it is another implementation of Floyd's formal sharing.
  51
  52    Levels:
   53    Each class is assigned a level. A leaf ALWAYS has level 0 and root
   54    classes have level TC_HTB_MAXDEPTH-1. Interior nodes have a level
   55    one less than their parent.
  56*/
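/* Editor's worked example (assuming TC_HTB_MAXDEPTH == 8, as defined in
 * include/uapi/linux/pkt_sched.h): for a hierarchy 1:1 -> 1:10 -> 1:100 where
 * only 1:100 has a leaf qdisc attached, 1:1 gets level 7 (root), 1:10 gets
 * level 6 (one less than its parent), and 1:100, being a leaf, gets level 0.
 */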
  57
  58static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */
   59#define HTB_VER 0x30011         /* major must be matched with number supplied by TC as version */
  60
  61#if HTB_VER >> 16 != TC_HTB_PROTOVER
  62#error "Mismatched sch_htb.c and pkt_sch.h"
  63#endif
  64
  65/* Module parameter and sysfs export */
  66module_param    (htb_hysteresis, int, 0640);
  67MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate");
  68
   69static int htb_rate_est = 0; /* whether htb classes get a default rate estimator */
  70module_param(htb_rate_est, int, 0640);
  71MODULE_PARM_DESC(htb_rate_est, "setup a default rate estimator (4sec 16sec) for htb classes");
  72
   73/* used internally to keep the status of a single class */
  74enum htb_cmode {
  75        HTB_CANT_SEND,          /* class can't send and can't borrow */
  76        HTB_MAY_BORROW,         /* class can't send but may borrow */
  77        HTB_CAN_SEND            /* class can send */
  78};
  79
  80struct htb_prio {
  81        union {
  82                struct rb_root  row;
  83                struct rb_root  feed;
  84        };
  85        struct rb_node  *ptr;
   86        /* When a class changes from state 1->2 and disconnects from its
   87         * parent's feed, we lose the ptr value and start from the
   88         * first child again. Here we store the classid of the
  89         * last valid ptr (used when ptr is NULL).
  90         */
  91        u32             last_ptr_id;
  92};
  93
  94/* interior & leaf nodes; props specific to leaves are marked L:
  95 * To reduce false sharing, place mostly read fields at beginning,
  96 * and mostly written ones at the end.
  97 */
  98struct htb_class {
  99        struct Qdisc_class_common common;
 100        struct psched_ratecfg   rate;
 101        struct psched_ratecfg   ceil;
 102        s64                     buffer, cbuffer;/* token bucket depth/rate */
 103        s64                     mbuffer;        /* max wait time */
 104        u32                     prio;           /* these two are used only by leaves... */
 105        int                     quantum;        /* but stored for parent-to-leaf return */
 106
 107        struct tcf_proto __rcu  *filter_list;   /* class attached filters */
 108        struct tcf_block        *block;
 109        int                     filter_cnt;
 110
 111        int                     level;          /* our level (see above) */
 112        unsigned int            children;
 113        struct htb_class        *parent;        /* parent class */
 114
 115        struct net_rate_estimator __rcu *rate_est;
 116
 117        /*
 118         * Written often fields
 119         */
 120        struct gnet_stats_basic_packed bstats;
 121        struct tc_htb_xstats    xstats; /* our special stats */
 122
 123        /* token bucket parameters */
 124        s64                     tokens, ctokens;/* current number of tokens */
 125        s64                     t_c;            /* checkpoint time */
 126
 127        union {
 128                struct htb_class_leaf {
 129                        struct list_head drop_list;
 130                        int             deficit[TC_HTB_MAXDEPTH];
 131                        struct Qdisc    *q;
 132                } leaf;
 133                struct htb_class_inner {
 134                        struct htb_prio clprio[TC_HTB_NUMPRIO];
 135                } inner;
 136        } un;
 137        s64                     pq_key;
 138
 139        int                     prio_activity;  /* for which prios are we active */
 140        enum htb_cmode          cmode;          /* current mode of the class */
 141        struct rb_node          pq_node;        /* node for event queue */
 142        struct rb_node          node[TC_HTB_NUMPRIO];   /* node for self or feed tree */
 143
 144        unsigned int drops ____cacheline_aligned_in_smp;
 145        unsigned int            overlimits;
 146};
 147
 148struct htb_level {
 149        struct rb_root  wait_pq;
 150        struct htb_prio hprio[TC_HTB_NUMPRIO];
 151};
 152
 153struct htb_sched {
 154        struct Qdisc_class_hash clhash;
 155        int                     defcls;         /* class where unclassified flows go to */
 156        int                     rate2quantum;   /* quant = rate / rate2quantum */
 157
 158        /* filters for qdisc itself */
 159        struct tcf_proto __rcu  *filter_list;
 160        struct tcf_block        *block;
 161
 162#define HTB_WARN_TOOMANYEVENTS  0x1
 163        unsigned int            warned; /* only one warning */
 164        int                     direct_qlen;
 165        struct work_struct      work;
 166
 167        /* non shaped skbs; let them go directly thru */
 168        struct qdisc_skb_head   direct_queue;
 169        long                    direct_pkts;
 170
 171        struct qdisc_watchdog   watchdog;
 172
 173        s64                     now;    /* cached dequeue time */
 174        struct list_head        drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */
 175
 176        /* time of nearest event per level (row) */
 177        s64                     near_ev_cache[TC_HTB_MAXDEPTH];
 178
 179        int                     row_mask[TC_HTB_MAXDEPTH];
 180
 181        struct htb_level        hlevel[TC_HTB_MAXDEPTH];
 182};
 183
 184/* find class in global hash table using given handle */
 185static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
 186{
 187        struct htb_sched *q = qdisc_priv(sch);
 188        struct Qdisc_class_common *clc;
 189
 190        clc = qdisc_class_find(&q->clhash, handle);
 191        if (clc == NULL)
 192                return NULL;
 193        return container_of(clc, struct htb_class, common);
 194}
 195
 196static unsigned long htb_search(struct Qdisc *sch, u32 handle)
 197{
 198        return (unsigned long)htb_find(handle, sch);
 199}
 200/**
 201 * htb_classify - classify a packet into class
 202 *
 203 * It returns NULL if the packet should be dropped or -1 if the packet
  204 * should be passed directly thru. In all other cases a leaf class is returned.
  205 * We allow direct class selection by classid in skb->priority. Then we examine
  206 * filters in the qdisc and in inner nodes (if a higher filter points to an inner
  207 * node). If we end up with classid MAJOR:0 we enqueue the skb into the special
  208 * internal fifo (direct). These packets then go directly thru. If we still
  209 * have no valid leaf we try to use the MAJOR:default leaf. If that is still
  210 * unsuccessful, we finish and return the direct queue.
 211 */
 212#define HTB_DIRECT ((struct htb_class *)-1L)
 213
 214static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
 215                                      int *qerr)
 216{
 217        struct htb_sched *q = qdisc_priv(sch);
 218        struct htb_class *cl;
 219        struct tcf_result res;
 220        struct tcf_proto *tcf;
 221        int result;
 222
  223        /* allow selecting the class by setting skb->priority to a valid classid;
 224         * note that nfmark can be used too by attaching filter fw with no
 225         * rules in it
 226         */
 227        if (skb->priority == sch->handle)
 228                return HTB_DIRECT;      /* X:0 (direct flow) selected */
 229        cl = htb_find(skb->priority, sch);
 230        if (cl) {
 231                if (cl->level == 0)
 232                        return cl;
 233                /* Start with inner filter chain if a non-leaf class is selected */
 234                tcf = rcu_dereference_bh(cl->filter_list);
 235        } else {
 236                tcf = rcu_dereference_bh(q->filter_list);
 237        }
 238
 239        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 240        while (tcf && (result = tcf_classify(skb, tcf, &res, false)) >= 0) {
 241#ifdef CONFIG_NET_CLS_ACT
 242                switch (result) {
 243                case TC_ACT_QUEUED:
 244                case TC_ACT_STOLEN:
 245                case TC_ACT_TRAP:
 246                        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
 247                        /* fall through */
 248                case TC_ACT_SHOT:
 249                        return NULL;
 250                }
 251#endif
 252                cl = (void *)res.class;
 253                if (!cl) {
 254                        if (res.classid == sch->handle)
 255                                return HTB_DIRECT;      /* X:0 (direct flow) */
 256                        cl = htb_find(res.classid, sch);
 257                        if (!cl)
 258                                break;  /* filter selected invalid classid */
 259                }
 260                if (!cl->level)
 261                        return cl;      /* we hit leaf; return it */
 262
 263                /* we have got inner class; apply inner filter chain */
 264                tcf = rcu_dereference_bh(cl->filter_list);
 265        }
 266        /* classification failed; try to use default class */
 267        cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
 268        if (!cl || cl->level)
 269                return HTB_DIRECT;      /* bad default .. this is safe bet */
 270        return cl;
 271}
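/* Classification walk-through (editor's illustration; the handles below are
 * hypothetical): with sch->handle == 1:0 and defcls == 12,
 *   - skb->priority == 1:0           -> HTB_DIRECT (direct queue)
 *   - skb->priority == 1:10 (a leaf) -> class 1:10 is returned immediately
 *   - skb->priority == 1:1 (inner)   -> 1:1's own filter chain is consulted
 *   - no filter matches at all       -> fall back to class 1:12 (MAJOR:defcls)
 */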
 272
 273/**
 274 * htb_add_to_id_tree - adds class to the round robin list
 275 *
 276 * Routine adds class to the list (actually tree) sorted by classid.
 277 * Make sure that class is not already on such list for given prio.
 278 */
 279static void htb_add_to_id_tree(struct rb_root *root,
 280                               struct htb_class *cl, int prio)
 281{
 282        struct rb_node **p = &root->rb_node, *parent = NULL;
 283
 284        while (*p) {
 285                struct htb_class *c;
 286                parent = *p;
 287                c = rb_entry(parent, struct htb_class, node[prio]);
 288
 289                if (cl->common.classid > c->common.classid)
 290                        p = &parent->rb_right;
 291                else
 292                        p = &parent->rb_left;
 293        }
 294        rb_link_node(&cl->node[prio], parent, p);
 295        rb_insert_color(&cl->node[prio], root);
 296}
 297
 298/**
 299 * htb_add_to_wait_tree - adds class to the event queue with delay
 300 *
  301 * The class is added to the priority event queue to indicate that the class
  302 * will change its mode at time cl->pq_key = q->now + delay (in nanoseconds).
  303 * Make sure that the class is not already in the queue.
 304 */
 305static void htb_add_to_wait_tree(struct htb_sched *q,
 306                                 struct htb_class *cl, s64 delay)
 307{
 308        struct rb_node **p = &q->hlevel[cl->level].wait_pq.rb_node, *parent = NULL;
 309
 310        cl->pq_key = q->now + delay;
 311        if (cl->pq_key == q->now)
 312                cl->pq_key++;
 313
 314        /* update the nearest event cache */
 315        if (q->near_ev_cache[cl->level] > cl->pq_key)
 316                q->near_ev_cache[cl->level] = cl->pq_key;
 317
 318        while (*p) {
 319                struct htb_class *c;
 320                parent = *p;
 321                c = rb_entry(parent, struct htb_class, pq_node);
 322                if (cl->pq_key >= c->pq_key)
 323                        p = &parent->rb_right;
 324                else
 325                        p = &parent->rb_left;
 326        }
 327        rb_link_node(&cl->pq_node, parent, p);
 328        rb_insert_color(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
 329}
 330
 331/**
 332 * htb_next_rb_node - finds next node in binary tree
 333 *
  334 * When we are past the last key, *n is set to NULL.
 335 * Average complexity is 2 steps per call.
 336 */
 337static inline void htb_next_rb_node(struct rb_node **n)
 338{
 339        *n = rb_next(*n);
 340}
 341
 342/**
 343 * htb_add_class_to_row - add class to its row
 344 *
 345 * The class is added to row at priorities marked in mask.
 346 * It does nothing if mask == 0.
 347 */
 348static inline void htb_add_class_to_row(struct htb_sched *q,
 349                                        struct htb_class *cl, int mask)
 350{
 351        q->row_mask[cl->level] |= mask;
 352        while (mask) {
 353                int prio = ffz(~mask);
 354                mask &= ~(1 << prio);
 355                htb_add_to_id_tree(&q->hlevel[cl->level].hprio[prio].row, cl, prio);
 356        }
 357}
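/* Editor's note on the mask walk used here and below: ffz(~mask) yields the
 * index of the lowest set bit of mask. For example, with mask == 0x5 (prios 0
 * and 2 active) the loop visits prio 0, clears bit 0, then visits prio 2 and
 * stops once mask reaches 0.
 */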
 358
 359/* If this triggers, it is a bug in this code, but it need not be fatal */
 360static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root)
 361{
 362        if (RB_EMPTY_NODE(rb)) {
 363                WARN_ON(1);
 364        } else {
 365                rb_erase(rb, root);
 366                RB_CLEAR_NODE(rb);
 367        }
 368}
 369
 370
 371/**
 372 * htb_remove_class_from_row - removes class from its row
 373 *
 374 * The class is removed from row at priorities marked in mask.
 375 * It does nothing if mask == 0.
 376 */
 377static inline void htb_remove_class_from_row(struct htb_sched *q,
 378                                                 struct htb_class *cl, int mask)
 379{
 380        int m = 0;
 381        struct htb_level *hlevel = &q->hlevel[cl->level];
 382
 383        while (mask) {
 384                int prio = ffz(~mask);
 385                struct htb_prio *hprio = &hlevel->hprio[prio];
 386
 387                mask &= ~(1 << prio);
 388                if (hprio->ptr == cl->node + prio)
 389                        htb_next_rb_node(&hprio->ptr);
 390
 391                htb_safe_rb_erase(cl->node + prio, &hprio->row);
 392                if (!hprio->row.rb_node)
 393                        m |= 1 << prio;
 394        }
 395        q->row_mask[cl->level] &= ~m;
 396}
 397
 398/**
  399 * htb_activate_prios - creates the active class's feed chain
  400 *
  401 * The class is connected to its ancestors and/or the appropriate rows
  402 * for the priorities it participates in. cl->cmode must be the new
  403 * (activated) mode. It does nothing if cl->prio_activity == 0.
 404 */
 405static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
 406{
 407        struct htb_class *p = cl->parent;
 408        long m, mask = cl->prio_activity;
 409
 410        while (cl->cmode == HTB_MAY_BORROW && p && mask) {
 411                m = mask;
 412                while (m) {
 413                        int prio = ffz(~m);
 414                        m &= ~(1 << prio);
 415
 416                        if (p->un.inner.clprio[prio].feed.rb_node)
  417                                /* parent already has its feed in use, so
  418                                 * clear the bit in mask as parent is already ok
 419                                 */
 420                                mask &= ~(1 << prio);
 421
 422                        htb_add_to_id_tree(&p->un.inner.clprio[prio].feed, cl, prio);
 423                }
 424                p->prio_activity |= mask;
 425                cl = p;
 426                p = cl->parent;
 427
 428        }
 429        if (cl->cmode == HTB_CAN_SEND && mask)
 430                htb_add_class_to_row(q, cl, mask);
 431}
 432
 433/**
 434 * htb_deactivate_prios - remove class from feed chain
 435 *
 436 * cl->cmode must represent old mode (before deactivation). It does
 437 * nothing if cl->prio_activity == 0. Class is removed from all feed
 438 * chains and rows.
 439 */
 440static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
 441{
 442        struct htb_class *p = cl->parent;
 443        long m, mask = cl->prio_activity;
 444
 445        while (cl->cmode == HTB_MAY_BORROW && p && mask) {
 446                m = mask;
 447                mask = 0;
 448                while (m) {
 449                        int prio = ffz(~m);
 450                        m &= ~(1 << prio);
 451
 452                        if (p->un.inner.clprio[prio].ptr == cl->node + prio) {
  453                                /* we are removing a child which the parent's
  454                                 * feed points to; forget the pointer but
  455                                 * remember the classid
 456                                 */
 457                                p->un.inner.clprio[prio].last_ptr_id = cl->common.classid;
 458                                p->un.inner.clprio[prio].ptr = NULL;
 459                        }
 460
 461                        htb_safe_rb_erase(cl->node + prio,
 462                                          &p->un.inner.clprio[prio].feed);
 463
 464                        if (!p->un.inner.clprio[prio].feed.rb_node)
 465                                mask |= 1 << prio;
 466                }
 467
 468                p->prio_activity &= ~mask;
 469                cl = p;
 470                p = cl->parent;
 471
 472        }
 473        if (cl->cmode == HTB_CAN_SEND && mask)
 474                htb_remove_class_from_row(q, cl, mask);
 475}
 476
 477static inline s64 htb_lowater(const struct htb_class *cl)
 478{
 479        if (htb_hysteresis)
 480                return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
 481        else
 482                return 0;
 483}
 484static inline s64 htb_hiwater(const struct htb_class *cl)
 485{
 486        if (htb_hysteresis)
 487                return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
 488        else
 489                return 0;
 490}
 491
 492
 493/**
 494 * htb_class_mode - computes and returns current class mode
 495 *
  496 * It computes cl's mode at time cl->t_c+diff and returns it. If the mode
  497 * is not HTB_CAN_SEND then *diff is updated to the time remaining until
  498 * cl will change its state (it is later used to compute cl->pq_key).
  499 * Also it is worth noting that the class mode doesn't change simply
  500 * at cl->{c,}tokens == 0; rather there can be hysteresis over the
  501 * 0 .. -cl->{c,}buffer range. It is meant to limit the number of
  502 * mode transitions per time unit. The speed gain is about 1/6.
 503 */
 504static inline enum htb_cmode
 505htb_class_mode(struct htb_class *cl, s64 *diff)
 506{
 507        s64 toks;
 508
 509        if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
 510                *diff = -toks;
 511                return HTB_CANT_SEND;
 512        }
 513
 514        if ((toks = (cl->tokens + *diff)) >= htb_hiwater(cl))
 515                return HTB_CAN_SEND;
 516
 517        *diff = -toks;
 518        return HTB_MAY_BORROW;
 519}
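/* Editor's worked example of the hysteresis (assuming htb_hysteresis != 0):
 * a class currently in HTB_CAN_SEND keeps that mode until tokens fall below
 * -cl->buffer (hiwater == -buffer) rather than 0, and is only pushed to
 * HTB_CANT_SEND once ctokens fall below -cl->cbuffer (lowater == -cbuffer).
 * Conversely, a class in HTB_CANT_SEND needs ctokens >= 0 to leave that mode,
 * so small token updates around zero do not cause it to flip back and forth.
 */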
 520
 521/**
  522 * htb_change_class_mode - changes the class's mode
  523 *
  524 * This should be the only way to change a class's mode under normal
  525 * circumstances. The routine will update feed list linkage, change mode
 526 * and add class to the wait event queue if appropriate. New mode should
 527 * be different from old one and cl->pq_key has to be valid if changing
 528 * to mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
 529 */
 530static void
 531htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff)
 532{
 533        enum htb_cmode new_mode = htb_class_mode(cl, diff);
 534
 535        if (new_mode == cl->cmode)
 536                return;
 537
 538        if (new_mode == HTB_CANT_SEND)
 539                cl->overlimits++;
 540
 541        if (cl->prio_activity) {        /* not necessary: speed optimization */
 542                if (cl->cmode != HTB_CANT_SEND)
 543                        htb_deactivate_prios(q, cl);
 544                cl->cmode = new_mode;
 545                if (new_mode != HTB_CANT_SEND)
 546                        htb_activate_prios(q, cl);
 547        } else
 548                cl->cmode = new_mode;
 549}
 550
 551/**
 552 * htb_activate - inserts leaf cl into appropriate active feeds
 553 *
  554 * Routine learns the (new) priority of the leaf and activates the feed chain
  555 * for that prio. It can safely be called on an already active leaf.
  556 * It also adds the leaf to the droplist.
 557 */
 558static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
 559{
 560        WARN_ON(cl->level || !cl->un.leaf.q || !cl->un.leaf.q->q.qlen);
 561
 562        if (!cl->prio_activity) {
 563                cl->prio_activity = 1 << cl->prio;
 564                htb_activate_prios(q, cl);
 565                list_add_tail(&cl->un.leaf.drop_list,
 566                              q->drops + cl->prio);
 567        }
 568}
 569
 570/**
 571 * htb_deactivate - remove leaf cl from active feeds
 572 *
  573 * Make sure that the leaf is active. In other words, it can't be called
  574 * with a non-active leaf. It also removes the class from the drop list.
 575 */
 576static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
 577{
 578        WARN_ON(!cl->prio_activity);
 579
 580        htb_deactivate_prios(q, cl);
 581        cl->prio_activity = 0;
 582        list_del_init(&cl->un.leaf.drop_list);
 583}
 584
 585static void htb_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch,
 586                             struct qdisc_skb_head *qh)
 587{
 588        struct sk_buff *last = qh->tail;
 589
 590        if (last) {
 591                skb->next = NULL;
 592                last->next = skb;
 593                qh->tail = skb;
 594        } else {
 595                qh->tail = skb;
 596                qh->head = skb;
 597        }
 598        qh->qlen++;
 599}
 600
 601static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 602                       struct sk_buff **to_free)
 603{
 604        int uninitialized_var(ret);
 605        struct htb_sched *q = qdisc_priv(sch);
 606        struct htb_class *cl = htb_classify(skb, sch, &ret);
 607
 608        if (cl == HTB_DIRECT) {
 609                /* enqueue to helper queue */
 610                if (q->direct_queue.qlen < q->direct_qlen) {
 611                        htb_enqueue_tail(skb, sch, &q->direct_queue);
 612                        q->direct_pkts++;
 613                } else {
 614                        return qdisc_drop(skb, sch, to_free);
 615                }
 616#ifdef CONFIG_NET_CLS_ACT
 617        } else if (!cl) {
 618                if (ret & __NET_XMIT_BYPASS)
 619                        qdisc_qstats_drop(sch);
 620                __qdisc_drop(skb, to_free);
 621                return ret;
 622#endif
 623        } else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q,
 624                                        to_free)) != NET_XMIT_SUCCESS) {
 625                if (net_xmit_drop_count(ret)) {
 626                        qdisc_qstats_drop(sch);
 627                        cl->drops++;
 628                }
 629                return ret;
 630        } else {
 631                htb_activate(q, cl);
 632        }
 633
 634        qdisc_qstats_backlog_inc(sch, skb);
 635        sch->q.qlen++;
 636        return NET_XMIT_SUCCESS;
 637}
 638
 639static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, s64 diff)
 640{
 641        s64 toks = diff + cl->tokens;
 642
 643        if (toks > cl->buffer)
 644                toks = cl->buffer;
 645        toks -= (s64) psched_l2t_ns(&cl->rate, bytes);
 646        if (toks <= -cl->mbuffer)
 647                toks = 1 - cl->mbuffer;
 648
 649        cl->tokens = toks;
 650}
 651
 652static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff)
 653{
 654        s64 toks = diff + cl->ctokens;
 655
 656        if (toks > cl->cbuffer)
 657                toks = cl->cbuffer;
 658        toks -= (s64) psched_l2t_ns(&cl->ceil, bytes);
 659        if (toks <= -cl->mbuffer)
 660                toks = 1 - cl->mbuffer;
 661
 662        cl->ctokens = toks;
 663}
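/* Editor's worked example of the accounting above (illustrative numbers):
 * with diff equal to 1 ms worth of ns, cl->tokens == 0 and cl->buffer equal
 * to 0.5 ms, the refill is clamped to 0.5 ms; dequeuing a 1500 byte packet
 * then subtracts psched_l2t_ns(&cl->rate, 1500) ns of transmission time, and
 * the result is floored at 1 - cl->mbuffer so an idle or starved class can
 * never accumulate an unbounded token debt.
 */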
 664
 665/**
 666 * htb_charge_class - charges amount "bytes" to leaf and ancestors
 667 *
 668 * Routine assumes that packet "bytes" long was dequeued from leaf cl
 669 * borrowing from "level". It accounts bytes to ceil leaky bucket for
 670 * leaf and all ancestors and to rate bucket for ancestors at levels
 671 * "level" and higher. It also handles possible change of mode resulting
 672 * from the update. Note that mode can also increase here (MAY_BORROW to
  673 * CAN_SEND) because we can use a more precise clock than the event queue here.
  674 * In such a case we remove the class from the event queue first.
 675 */
 676static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
 677                             int level, struct sk_buff *skb)
 678{
 679        int bytes = qdisc_pkt_len(skb);
 680        enum htb_cmode old_mode;
 681        s64 diff;
 682
 683        while (cl) {
 684                diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
 685                if (cl->level >= level) {
 686                        if (cl->level == level)
 687                                cl->xstats.lends++;
 688                        htb_accnt_tokens(cl, bytes, diff);
 689                } else {
 690                        cl->xstats.borrows++;
 691                        cl->tokens += diff;     /* we moved t_c; update tokens */
 692                }
 693                htb_accnt_ctokens(cl, bytes, diff);
 694                cl->t_c = q->now;
 695
 696                old_mode = cl->cmode;
 697                diff = 0;
 698                htb_change_class_mode(q, cl, &diff);
 699                if (old_mode != cl->cmode) {
 700                        if (old_mode != HTB_CAN_SEND)
 701                                htb_safe_rb_erase(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
 702                        if (cl->cmode != HTB_CAN_SEND)
 703                                htb_add_to_wait_tree(q, cl, diff);
 704                }
 705
 706                /* update basic stats except for leaves which are already updated */
 707                if (cl->level)
 708                        bstats_update(&cl->bstats, skb);
 709
 710                cl = cl->parent;
 711        }
 712}
 713
 714/**
 715 * htb_do_events - make mode changes to classes at the level
 716 *
 717 * Scans event queue for pending events and applies them. Returns time of
 718 * next pending event (0 for no event in pq, q->now for too many events).
  719 * Note: only events with cl->pq_key <= q->now are applied.
 720 */
 721static s64 htb_do_events(struct htb_sched *q, const int level,
 722                         unsigned long start)
 723{
 724        /* don't run for longer than 2 jiffies; 2 is used instead of
 725         * 1 to simplify things when jiffy is going to be incremented
 726         * too soon
 727         */
 728        unsigned long stop_at = start + 2;
 729        struct rb_root *wait_pq = &q->hlevel[level].wait_pq;
 730
 731        while (time_before(jiffies, stop_at)) {
 732                struct htb_class *cl;
 733                s64 diff;
 734                struct rb_node *p = rb_first(wait_pq);
 735
 736                if (!p)
 737                        return 0;
 738
 739                cl = rb_entry(p, struct htb_class, pq_node);
 740                if (cl->pq_key > q->now)
 741                        return cl->pq_key;
 742
 743                htb_safe_rb_erase(p, wait_pq);
 744                diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
 745                htb_change_class_mode(q, cl, &diff);
 746                if (cl->cmode != HTB_CAN_SEND)
 747                        htb_add_to_wait_tree(q, cl, diff);
 748        }
 749
 750        /* too much load - let's continue after a break for scheduling */
 751        if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) {
 752                pr_warn("htb: too many events!\n");
 753                q->warned |= HTB_WARN_TOOMANYEVENTS;
 754        }
 755
 756        return q->now;
 757}
 758
  759/* Returns class->node+prio from the id-tree where the class's id is >= id.
  760 * Returns NULL if no such class exists.
 761 */
 762static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
 763                                              u32 id)
 764{
 765        struct rb_node *r = NULL;
 766        while (n) {
 767                struct htb_class *cl =
 768                    rb_entry(n, struct htb_class, node[prio]);
 769
 770                if (id > cl->common.classid) {
 771                        n = n->rb_right;
 772                } else if (id < cl->common.classid) {
 773                        r = n;
 774                        n = n->rb_left;
 775                } else {
 776                        return n;
 777                }
 778        }
 779        return r;
 780}
 781
 782/**
 783 * htb_lookup_leaf - returns next leaf class in DRR order
 784 *
  785 * Find the leaf that the current feed pointer points to.
 786 */
 787static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio)
 788{
 789        int i;
 790        struct {
 791                struct rb_node *root;
 792                struct rb_node **pptr;
 793                u32 *pid;
 794        } stk[TC_HTB_MAXDEPTH], *sp = stk;
 795
 796        BUG_ON(!hprio->row.rb_node);
 797        sp->root = hprio->row.rb_node;
 798        sp->pptr = &hprio->ptr;
 799        sp->pid = &hprio->last_ptr_id;
 800
 801        for (i = 0; i < 65535; i++) {
 802                if (!*sp->pptr && *sp->pid) {
 803                        /* ptr was invalidated but id is valid - try to recover
 804                         * the original or next ptr
 805                         */
 806                        *sp->pptr =
 807                            htb_id_find_next_upper(prio, sp->root, *sp->pid);
 808                }
  809                *sp->pid = 0;   /* ptr is valid now, so remove this hint as it
 810                                 * can become out of date quickly
 811                                 */
 812                if (!*sp->pptr) {       /* we are at right end; rewind & go up */
 813                        *sp->pptr = sp->root;
 814                        while ((*sp->pptr)->rb_left)
 815                                *sp->pptr = (*sp->pptr)->rb_left;
 816                        if (sp > stk) {
 817                                sp--;
 818                                if (!*sp->pptr) {
 819                                        WARN_ON(1);
 820                                        return NULL;
 821                                }
 822                                htb_next_rb_node(sp->pptr);
 823                        }
 824                } else {
 825                        struct htb_class *cl;
 826                        struct htb_prio *clp;
 827
 828                        cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
 829                        if (!cl->level)
 830                                return cl;
 831                        clp = &cl->un.inner.clprio[prio];
 832                        (++sp)->root = clp->feed.rb_node;
 833                        sp->pptr = &clp->ptr;
 834                        sp->pid = &clp->last_ptr_id;
 835                }
 836        }
 837        WARN_ON(1);
 838        return NULL;
 839}
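/* Editor's note: the stk[]/sp walk above is an explicit stack-based descent of
 * the per-prio feed trees. At each inner class it pushes the child feed tree
 * and resumes from the saved ptr (or recovers the position via last_ptr_id
 * when the ptr was invalidated), so the DRR position inside every inner node
 * is preserved across dequeues.
 */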
 840
 841/* dequeues packet at given priority and level; call only if
 842 * you are sure that there is active class at prio/level
 843 */
 844static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, const int prio,
 845                                        const int level)
 846{
 847        struct sk_buff *skb = NULL;
 848        struct htb_class *cl, *start;
 849        struct htb_level *hlevel = &q->hlevel[level];
 850        struct htb_prio *hprio = &hlevel->hprio[prio];
 851
 852        /* look initial class up in the row */
 853        start = cl = htb_lookup_leaf(hprio, prio);
 854
 855        do {
 856next:
 857                if (unlikely(!cl))
 858                        return NULL;
 859
 860                /* class can be empty - it is unlikely but can be true if leaf
 861                 * qdisc drops packets in enqueue routine or if someone used
 862                 * graft operation on the leaf since last dequeue;
 863                 * simply deactivate and skip such class
 864                 */
 865                if (unlikely(cl->un.leaf.q->q.qlen == 0)) {
 866                        struct htb_class *next;
 867                        htb_deactivate(q, cl);
 868
 869                        /* row/level might become empty */
 870                        if ((q->row_mask[level] & (1 << prio)) == 0)
 871                                return NULL;
 872
 873                        next = htb_lookup_leaf(hprio, prio);
 874
 875                        if (cl == start)        /* fix start if we just deleted it */
 876                                start = next;
 877                        cl = next;
 878                        goto next;
 879                }
 880
 881                skb = cl->un.leaf.q->dequeue(cl->un.leaf.q);
 882                if (likely(skb != NULL))
 883                        break;
 884
 885                qdisc_warn_nonwc("htb", cl->un.leaf.q);
 886                htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr:
 887                                         &q->hlevel[0].hprio[prio].ptr);
 888                cl = htb_lookup_leaf(hprio, prio);
 889
 890        } while (cl != start);
 891
 892        if (likely(skb != NULL)) {
 893                bstats_update(&cl->bstats, skb);
 894                cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb);
 895                if (cl->un.leaf.deficit[level] < 0) {
 896                        cl->un.leaf.deficit[level] += cl->quantum;
 897                        htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr :
 898                                                 &q->hlevel[0].hprio[prio].ptr);
 899                }
  900                /* this used to be after charge_class but this constellation
 901                 * gives us slightly better performance
 902                 */
 903                if (!cl->un.leaf.q->q.qlen)
 904                        htb_deactivate(q, cl);
 905                htb_charge_class(q, cl, level, skb);
 906        }
 907        return skb;
 908}
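/* Editor's worked DRR example for the deficit handling above (illustrative
 * numbers): with cl->quantum == 1514 and deficit[level] == 100, dequeuing a
 * 1500 byte packet drops the deficit to -1400, so the quantum is added back
 * (deficit becomes 114) and the round-robin pointer advances to the next
 * class; smaller packets leave the pointer on the same class until its
 * deficit is used up.
 */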
 909
 910static struct sk_buff *htb_dequeue(struct Qdisc *sch)
 911{
 912        struct sk_buff *skb;
 913        struct htb_sched *q = qdisc_priv(sch);
 914        int level;
 915        s64 next_event;
 916        unsigned long start_at;
 917
 918        /* try to dequeue direct packets as high prio (!) to minimize cpu work */
 919        skb = __qdisc_dequeue_head(&q->direct_queue);
 920        if (skb != NULL) {
 921ok:
 922                qdisc_bstats_update(sch, skb);
 923                qdisc_qstats_backlog_dec(sch, skb);
 924                sch->q.qlen--;
 925                return skb;
 926        }
 927
 928        if (!sch->q.qlen)
 929                goto fin;
 930        q->now = ktime_get_ns();
 931        start_at = jiffies;
 932
 933        next_event = q->now + 5LLU * NSEC_PER_SEC;
 934
 935        for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
 936                /* common case optimization - skip event handler quickly */
 937                int m;
 938                s64 event = q->near_ev_cache[level];
 939
 940                if (q->now >= event) {
 941                        event = htb_do_events(q, level, start_at);
 942                        if (!event)
 943                                event = q->now + NSEC_PER_SEC;
 944                        q->near_ev_cache[level] = event;
 945                }
 946
 947                if (next_event > event)
 948                        next_event = event;
 949
 950                m = ~q->row_mask[level];
 951                while (m != (int)(-1)) {
 952                        int prio = ffz(m);
 953
 954                        m |= 1 << prio;
 955                        skb = htb_dequeue_tree(q, prio, level);
 956                        if (likely(skb != NULL))
 957                                goto ok;
 958                }
 959        }
 960        qdisc_qstats_overlimit(sch);
 961        if (likely(next_event > q->now))
 962                qdisc_watchdog_schedule_ns(&q->watchdog, next_event);
 963        else
 964                schedule_work(&q->work);
 965fin:
 966        return skb;
 967}
 968
 969/* reset all classes */
  970/* always called under BH & queue lock */
 971static void htb_reset(struct Qdisc *sch)
 972{
 973        struct htb_sched *q = qdisc_priv(sch);
 974        struct htb_class *cl;
 975        unsigned int i;
 976
 977        for (i = 0; i < q->clhash.hashsize; i++) {
 978                hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
 979                        if (cl->level)
 980                                memset(&cl->un.inner, 0, sizeof(cl->un.inner));
 981                        else {
 982                                if (cl->un.leaf.q)
 983                                        qdisc_reset(cl->un.leaf.q);
 984                                INIT_LIST_HEAD(&cl->un.leaf.drop_list);
 985                        }
 986                        cl->prio_activity = 0;
 987                        cl->cmode = HTB_CAN_SEND;
 988                }
 989        }
 990        qdisc_watchdog_cancel(&q->watchdog);
 991        __qdisc_reset_queue(&q->direct_queue);
 992        sch->q.qlen = 0;
 993        sch->qstats.backlog = 0;
 994        memset(q->hlevel, 0, sizeof(q->hlevel));
 995        memset(q->row_mask, 0, sizeof(q->row_mask));
 996        for (i = 0; i < TC_HTB_NUMPRIO; i++)
 997                INIT_LIST_HEAD(q->drops + i);
 998}
 999
1000static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
1001        [TCA_HTB_PARMS] = { .len = sizeof(struct tc_htb_opt) },
1002        [TCA_HTB_INIT]  = { .len = sizeof(struct tc_htb_glob) },
1003        [TCA_HTB_CTAB]  = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1004        [TCA_HTB_RTAB]  = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1005        [TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 },
1006        [TCA_HTB_RATE64] = { .type = NLA_U64 },
1007        [TCA_HTB_CEIL64] = { .type = NLA_U64 },
1008};
1009
1010static void htb_work_func(struct work_struct *work)
1011{
1012        struct htb_sched *q = container_of(work, struct htb_sched, work);
1013        struct Qdisc *sch = q->watchdog.qdisc;
1014
1015        rcu_read_lock();
1016        __netif_schedule(qdisc_root(sch));
1017        rcu_read_unlock();
1018}
1019
1020static int htb_init(struct Qdisc *sch, struct nlattr *opt)
1021{
1022        struct htb_sched *q = qdisc_priv(sch);
1023        struct nlattr *tb[TCA_HTB_MAX + 1];
1024        struct tc_htb_glob *gopt;
1025        int err;
1026        int i;
1027
1028        qdisc_watchdog_init(&q->watchdog, sch);
1029        INIT_WORK(&q->work, htb_work_func);
1030
1031        if (!opt)
1032                return -EINVAL;
1033
1034        err = tcf_block_get(&q->block, &q->filter_list, sch);
1035        if (err)
1036                return err;
1037
1038        err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy, NULL);
1039        if (err < 0)
1040                return err;
1041
1042        if (!tb[TCA_HTB_INIT])
1043                return -EINVAL;
1044
1045        gopt = nla_data(tb[TCA_HTB_INIT]);
1046        if (gopt->version != HTB_VER >> 16)
1047                return -EINVAL;
1048
1049        err = qdisc_class_hash_init(&q->clhash);
1050        if (err < 0)
1051                return err;
1052        for (i = 0; i < TC_HTB_NUMPRIO; i++)
1053                INIT_LIST_HEAD(q->drops + i);
1054
1055        qdisc_skb_head_init(&q->direct_queue);
1056
1057        if (tb[TCA_HTB_DIRECT_QLEN])
1058                q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]);
1059        else
1060                q->direct_qlen = qdisc_dev(sch)->tx_queue_len;
1061
1062        if ((q->rate2quantum = gopt->rate2quantum) < 1)
1063                q->rate2quantum = 1;
1064        q->defcls = gopt->defcls;
1065
1066        return 0;
1067}
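/* Editor's usage sketch (device name and rates are arbitrary): the init path
 * above is exercised by e.g.
 *
 *   tc qdisc add dev eth0 root handle 1: htb default 12 r2q 10
 *   tc class add dev eth0 parent 1:  classid 1:1  htb rate 100mbit ceil 100mbit
 *   tc class add dev eth0 parent 1:1 classid 1:12 htb rate 30mbit  ceil 100mbit
 *
 * where "default 12" ends up in q->defcls and "r2q 10" in q->rate2quantum.
 */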
1068
1069static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
1070{
1071        struct htb_sched *q = qdisc_priv(sch);
1072        struct nlattr *nest;
1073        struct tc_htb_glob gopt;
1074
 1075        /* It's safe not to acquire the qdisc lock. As we hold RTNL,
 1076         * no change can happen on the qdisc parameters.
1077         */
1078
1079        gopt.direct_pkts = q->direct_pkts;
1080        gopt.version = HTB_VER;
1081        gopt.rate2quantum = q->rate2quantum;
1082        gopt.defcls = q->defcls;
1083        gopt.debug = 0;
1084
1085        nest = nla_nest_start(skb, TCA_OPTIONS);
1086        if (nest == NULL)
1087                goto nla_put_failure;
1088        if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt) ||
1089            nla_put_u32(skb, TCA_HTB_DIRECT_QLEN, q->direct_qlen))
1090                goto nla_put_failure;
1091
1092        return nla_nest_end(skb, nest);
1093
1094nla_put_failure:
1095        nla_nest_cancel(skb, nest);
1096        return -1;
1097}
1098
1099static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
1100                          struct sk_buff *skb, struct tcmsg *tcm)
1101{
1102        struct htb_class *cl = (struct htb_class *)arg;
1103        struct nlattr *nest;
1104        struct tc_htb_opt opt;
1105
 1106        /* It's safe not to acquire the qdisc lock. As we hold RTNL,
 1107         * no change can happen on the class parameters.
1108         */
1109        tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;
1110        tcm->tcm_handle = cl->common.classid;
1111        if (!cl->level && cl->un.leaf.q)
1112                tcm->tcm_info = cl->un.leaf.q->handle;
1113
1114        nest = nla_nest_start(skb, TCA_OPTIONS);
1115        if (nest == NULL)
1116                goto nla_put_failure;
1117
1118        memset(&opt, 0, sizeof(opt));
1119
1120        psched_ratecfg_getrate(&opt.rate, &cl->rate);
1121        opt.buffer = PSCHED_NS2TICKS(cl->buffer);
1122        psched_ratecfg_getrate(&opt.ceil, &cl->ceil);
1123        opt.cbuffer = PSCHED_NS2TICKS(cl->cbuffer);
1124        opt.quantum = cl->quantum;
1125        opt.prio = cl->prio;
1126        opt.level = cl->level;
1127        if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt))
1128                goto nla_put_failure;
1129        if ((cl->rate.rate_bytes_ps >= (1ULL << 32)) &&
1130            nla_put_u64_64bit(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps,
1131                              TCA_HTB_PAD))
1132                goto nla_put_failure;
1133        if ((cl->ceil.rate_bytes_ps >= (1ULL << 32)) &&
1134            nla_put_u64_64bit(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps,
1135                              TCA_HTB_PAD))
1136                goto nla_put_failure;
1137
1138        return nla_nest_end(skb, nest);
1139
1140nla_put_failure:
1141        nla_nest_cancel(skb, nest);
1142        return -1;
1143}
1144
1145static int
1146htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
1147{
1148        struct htb_class *cl = (struct htb_class *)arg;
1149        struct gnet_stats_queue qs = {
1150                .drops = cl->drops,
1151                .overlimits = cl->overlimits,
1152        };
1153        __u32 qlen = 0;
1154
1155        if (!cl->level && cl->un.leaf.q) {
1156                qlen = cl->un.leaf.q->q.qlen;
1157                qs.backlog = cl->un.leaf.q->qstats.backlog;
1158        }
1159        cl->xstats.tokens = clamp_t(s64, PSCHED_NS2TICKS(cl->tokens),
1160                                    INT_MIN, INT_MAX);
1161        cl->xstats.ctokens = clamp_t(s64, PSCHED_NS2TICKS(cl->ctokens),
1162                                     INT_MIN, INT_MAX);
1163
1164        if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
1165                                  d, NULL, &cl->bstats) < 0 ||
1166            gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1167            gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0)
1168                return -1;
1169
1170        return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1171}
1172
1173static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1174                     struct Qdisc **old)
1175{
1176        struct htb_class *cl = (struct htb_class *)arg;
1177
1178        if (cl->level)
1179                return -EINVAL;
1180        if (new == NULL &&
1181            (new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1182                                     cl->common.classid)) == NULL)
1183                return -ENOBUFS;
1184
1185        *old = qdisc_replace(sch, new, &cl->un.leaf.q);
1186        return 0;
1187}
1188
1189static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
1190{
1191        struct htb_class *cl = (struct htb_class *)arg;
1192        return !cl->level ? cl->un.leaf.q : NULL;
1193}
1194
1195static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
1196{
1197        struct htb_class *cl = (struct htb_class *)arg;
1198
1199        htb_deactivate(qdisc_priv(sch), cl);
1200}
1201
1202static inline int htb_parent_last_child(struct htb_class *cl)
1203{
1204        if (!cl->parent)
1205                /* the root class */
1206                return 0;
1207        if (cl->parent->children > 1)
1208                /* not the last child */
1209                return 0;
1210        return 1;
1211}
1212
1213static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
1214                               struct Qdisc *new_q)
1215{
1216        struct htb_class *parent = cl->parent;
1217
1218        WARN_ON(cl->level || !cl->un.leaf.q || cl->prio_activity);
1219
1220        if (parent->cmode != HTB_CAN_SEND)
1221                htb_safe_rb_erase(&parent->pq_node,
1222                                  &q->hlevel[parent->level].wait_pq);
1223
1224        parent->level = 0;
1225        memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1226        INIT_LIST_HEAD(&parent->un.leaf.drop_list);
1227        parent->un.leaf.q = new_q ? new_q : &noop_qdisc;
1228        parent->tokens = parent->buffer;
1229        parent->ctokens = parent->cbuffer;
1230        parent->t_c = ktime_get_ns();
1231        parent->cmode = HTB_CAN_SEND;
1232}
1233
1234static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
1235{
1236        if (!cl->level) {
1237                WARN_ON(!cl->un.leaf.q);
1238                qdisc_destroy(cl->un.leaf.q);
1239        }
1240        gen_kill_estimator(&cl->rate_est);
1241        tcf_block_put(cl->block);
1242        kfree(cl);
1243}
1244
1245static void htb_destroy(struct Qdisc *sch)
1246{
1247        struct htb_sched *q = qdisc_priv(sch);
1248        struct hlist_node *next;
1249        struct htb_class *cl;
1250        unsigned int i;
1251
1252        cancel_work_sync(&q->work);
1253        qdisc_watchdog_cancel(&q->watchdog);
1254        /* This line used to be after htb_destroy_class call below
1255         * and surprisingly it worked in 2.4. But it must precede it
1256         * because filter need its target class alive to be able to call
 1257         * because filters need their target class alive to be able to call
 1258         * unbind_filter on them (without an Oops).
1259        tcf_block_put(q->block);
1260
1261        for (i = 0; i < q->clhash.hashsize; i++) {
1262                hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
1263                        tcf_block_put(cl->block);
1264                        cl->block = NULL;
1265                }
1266        }
1267        for (i = 0; i < q->clhash.hashsize; i++) {
1268                hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
1269                                          common.hnode)
1270                        htb_destroy_class(sch, cl);
1271        }
1272        qdisc_class_hash_destroy(&q->clhash);
1273        __qdisc_reset_queue(&q->direct_queue);
1274}
1275
1276static int htb_delete(struct Qdisc *sch, unsigned long arg)
1277{
1278        struct htb_sched *q = qdisc_priv(sch);
1279        struct htb_class *cl = (struct htb_class *)arg;
1280        struct Qdisc *new_q = NULL;
1281        int last_child = 0;
1282
 1283        /* TODO: why don't we allow deleting a subtree? references? does the
 1284         * tc subsystem guarantee us that in htb_destroy it holds no class
 1285         * refs, so that we can remove children safely there?
 1286         */
1287        if (cl->children || cl->filter_cnt)
1288                return -EBUSY;
1289
1290        if (!cl->level && htb_parent_last_child(cl)) {
1291                new_q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1292                                          cl->parent->common.classid);
1293                last_child = 1;
1294        }
1295
1296        sch_tree_lock(sch);
1297
1298        if (!cl->level) {
1299                unsigned int qlen = cl->un.leaf.q->q.qlen;
1300                unsigned int backlog = cl->un.leaf.q->qstats.backlog;
1301
1302                qdisc_reset(cl->un.leaf.q);
1303                qdisc_tree_reduce_backlog(cl->un.leaf.q, qlen, backlog);
1304        }
1305
1306        /* delete from hash and active; remainder in destroy_class */
1307        qdisc_class_hash_remove(&q->clhash, &cl->common);
1308        if (cl->parent)
1309                cl->parent->children--;
1310
1311        if (cl->prio_activity)
1312                htb_deactivate(q, cl);
1313
1314        if (cl->cmode != HTB_CAN_SEND)
1315                htb_safe_rb_erase(&cl->pq_node,
1316                                  &q->hlevel[cl->level].wait_pq);
1317
1318        if (last_child)
1319                htb_parent_to_leaf(q, cl, new_q);
1320
1321        sch_tree_unlock(sch);
1322
1323        htb_destroy_class(sch, cl);
1324        return 0;
1325}
1326
1327static int htb_change_class(struct Qdisc *sch, u32 classid,
1328                            u32 parentid, struct nlattr **tca,
1329                            unsigned long *arg)
1330{
1331        int err = -EINVAL;
1332        struct htb_sched *q = qdisc_priv(sch);
1333        struct htb_class *cl = (struct htb_class *)*arg, *parent;
1334        struct nlattr *opt = tca[TCA_OPTIONS];
1335        struct nlattr *tb[TCA_HTB_MAX + 1];
1336        struct tc_htb_opt *hopt;
1337        u64 rate64, ceil64;
1338
1339        /* extract all subattrs from opt attr */
1340        if (!opt)
1341                goto failure;
1342
1343        err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy, NULL);
1344        if (err < 0)
1345                goto failure;
1346
1347        err = -EINVAL;
1348        if (tb[TCA_HTB_PARMS] == NULL)
1349                goto failure;
1350
1351        parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);
1352
1353        hopt = nla_data(tb[TCA_HTB_PARMS]);
1354        if (!hopt->rate.rate || !hopt->ceil.rate)
1355                goto failure;
1356
 1357        /* Keep backward compatibility with rate_table based iproute2 tc */
1358        if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
1359                qdisc_put_rtab(qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]));
1360
1361        if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE)
1362                qdisc_put_rtab(qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]));
1363
1364        if (!cl) {              /* new class */
1365                struct Qdisc *new_q;
1366                int prio;
1367                struct {
1368                        struct nlattr           nla;
1369                        struct gnet_estimator   opt;
1370                } est = {
1371                        .nla = {
1372                                .nla_len        = nla_attr_size(sizeof(est.opt)),
1373                                .nla_type       = TCA_RATE,
1374                        },
1375                        .opt = {
1376                                /* 4s interval, 16s averaging constant */
1377                                .interval       = 2,
1378                                .ewma_log       = 2,
1379                        },
1380                };
1381
1382                /* check for valid classid */
1383                if (!classid || TC_H_MAJ(classid ^ sch->handle) ||
1384                    htb_find(classid, sch))
1385                        goto failure;
1386
1387                /* check maximal depth */
1388                if (parent && parent->parent && parent->parent->level < 2) {
1389                        pr_err("htb: tree is too deep\n");
1390                        goto failure;
1391                }
1392                err = -ENOBUFS;
1393                cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1394                if (!cl)
1395                        goto failure;
1396
1397                err = tcf_block_get(&cl->block, &cl->filter_list, sch);
1398                if (err) {
1399                        kfree(cl);
1400                        goto failure;
1401                }
1402                if (htb_rate_est || tca[TCA_RATE]) {
1403                        err = gen_new_estimator(&cl->bstats, NULL,
1404                                                &cl->rate_est,
1405                                                NULL,
1406                                                qdisc_root_sleeping_running(sch),
1407                                                tca[TCA_RATE] ? : &est.nla);
1408                        if (err) {
1409                                tcf_block_put(cl->block);
1410                                kfree(cl);
1411                                goto failure;
1412                        }
1413                }
1414
1415                cl->children = 0;
1416                INIT_LIST_HEAD(&cl->un.leaf.drop_list);
1417                RB_CLEAR_NODE(&cl->pq_node);
1418
1419                for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
1420                        RB_CLEAR_NODE(&cl->node[prio]);
1421
1422                /* create leaf qdisc early because it uses kmalloc(GFP_KERNEL)
 1423                 * so it can't be used inside sch_tree_lock
1424                 * -- thanks to Karlis Peisenieks
1425                 */
1426                new_q = qdisc_create_dflt(sch->dev_queue,
1427                                          &pfifo_qdisc_ops, classid);
1428                sch_tree_lock(sch);
1429                if (parent && !parent->level) {
1430                        unsigned int qlen = parent->un.leaf.q->q.qlen;
1431                        unsigned int backlog = parent->un.leaf.q->qstats.backlog;
1432
1433                        /* turn parent into inner node */
1434                        qdisc_reset(parent->un.leaf.q);
1435                        qdisc_tree_reduce_backlog(parent->un.leaf.q, qlen, backlog);
1436                        qdisc_destroy(parent->un.leaf.q);
1437                        if (parent->prio_activity)
1438                                htb_deactivate(q, parent);
1439
1440                        /* remove from the event list because of the level change */
1441                        if (parent->cmode != HTB_CAN_SEND) {
1442                                htb_safe_rb_erase(&parent->pq_node, &q->hlevel[0].wait_pq);
1443                                parent->cmode = HTB_CAN_SEND;
1444                        }
1445                        parent->level = (parent->parent ? parent->parent->level
1446                                         : TC_HTB_MAXDEPTH) - 1;
1447                        memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1448                }
1449                /* the new class is a leaf and needs an elementary qdisc */
1450                cl->un.leaf.q = new_q ? new_q : &noop_qdisc;
1451
1452                cl->common.classid = classid;
1453                cl->parent = parent;
1454
1455                /* set class to be in HTB_CAN_SEND state */
1456                cl->tokens = PSCHED_TICKS2NS(hopt->buffer);
1457                cl->ctokens = PSCHED_TICKS2NS(hopt->cbuffer);
1458                cl->mbuffer = 60ULL * NSEC_PER_SEC;     /* 1min */
1459                cl->t_c = ktime_get_ns();
1460                cl->cmode = HTB_CAN_SEND;
1461
1462                /* attach to the hash list and parent's family */
1463                qdisc_class_hash_insert(&q->clhash, &cl->common);
1464                if (parent)
1465                        parent->children++;
1466                if (cl->un.leaf.q != &noop_qdisc)
1467                        qdisc_hash_add(cl->un.leaf.q, true);
1468        } else {
1469                if (tca[TCA_RATE]) {
1470                        err = gen_replace_estimator(&cl->bstats, NULL,
1471                                                    &cl->rate_est,
1472                                                    NULL,
1473                                                    qdisc_root_sleeping_running(sch),
1474                                                    tca[TCA_RATE]);
1475                        if (err)
1476                                return err;
1477                }
1478                sch_tree_lock(sch);
1479        }
1480
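            /* TCA_HTB_RATE64/TCA_HTB_CEIL64 carry rates too large for the 32-bit
             * fields in struct tc_htb_opt; psched_ratecfg_precompute() takes the
             * larger of the two values, so 0 simply means "use the 32-bit rate".
             */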
1481        rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0;
1482
1483        ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0;
1484
1485        psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64);
1486        psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64);
1487
1488        /* there used to be a nasty bug here: we have to check that the node
1489         * really is a leaf before changing cl->un.leaf!
1490         */
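            /* Worked example (illustrative, assuming tc's usual default r2q of 10):
             * a leaf with rate 80Mbit has rate_bytes_ps = 10,000,000, giving
             * quantum = 10,000,000 / 10 = 1,000,000; without an explicit quantum
             * from userspace this exceeds 200000 and is clamped below, with a
             * warning.
             */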
1491        if (!cl->level) {
1492                u64 quantum = cl->rate.rate_bytes_ps;
1493
1494                do_div(quantum, q->rate2quantum);
1495                cl->quantum = min_t(u64, quantum, INT_MAX);
1496
1497                if (!hopt->quantum && cl->quantum < 1000) {
1498                        pr_warn("HTB: quantum of class %X is small. Consider r2q change.\n",
1499                                cl->common.classid);
1500                        cl->quantum = 1000;
1501                }
1502                if (!hopt->quantum && cl->quantum > 200000) {
1503                        pr_warn("HTB: quantum of class %X is big. Consider r2q change.\n",
1504                                cl->common.classid);
1505                        cl->quantum = 200000;
1506                }
1507                if (hopt->quantum)
1508                        cl->quantum = hopt->quantum;
1509                if ((cl->prio = hopt->prio) >= TC_HTB_NUMPRIO)
1510                        cl->prio = TC_HTB_NUMPRIO - 1;
1511        }
1512
1513        cl->buffer = PSCHED_TICKS2NS(hopt->buffer);
1514        cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer);
1515
1516        sch_tree_unlock(sch);
1517
1518        qdisc_class_hash_grow(sch, &q->clhash);
1519
1520        *arg = (unsigned long)cl;
1521        return 0;
1522
1523failure:
1524        return err;
1525}
1526
1527static struct tcf_block *htb_tcf_block(struct Qdisc *sch, unsigned long arg)
1528{
1529        struct htb_sched *q = qdisc_priv(sch);
1530        struct htb_class *cl = (struct htb_class *)arg;
1531
1532        return cl ? cl->block : q->block;
1533}
1534
1535static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
1536                                     u32 classid)
1537{
1538        struct htb_class *cl = htb_find(classid, sch);
1539
1540        /*if (cl && !cl->level) return 0;
1541         * The line above used to be there to prevent attaching filters to
1542         * leaves. But at least the tc_index filter uses this just to look up
1543         * the class for other reasons, so we have to allow it.
1544         * ----
1545         * 19.6.2002 As Werner explained, this is OK - bind_filter is just
1546         * another way to "lock" the class - unlike "get", this lock can be
1547         * broken by the class during destroy, IIUC.
1548         */
1549        if (cl)
1550                cl->filter_cnt++;
1551        return (unsigned long)cl;
1552}
1553
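    /* The reference taken in htb_bind_filter() pins the class: htb_delete()
     * returns -EBUSY while filter_cnt is still non-zero.
     */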
1554static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
1555{
1556        struct htb_class *cl = (struct htb_class *)arg;
1557
1558        if (cl)
1559                cl->filter_cnt--;
1560}
1561
1562static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1563{
1564        struct htb_sched *q = qdisc_priv(sch);
1565        struct htb_class *cl;
1566        unsigned int i;
1567
1568        if (arg->stop)
1569                return;
1570
1571        for (i = 0; i < q->clhash.hashsize; i++) {
1572                hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
1573                        if (arg->count < arg->skip) {
1574                                arg->count++;
1575                                continue;
1576                        }
1577                        if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1578                                arg->stop = 1;
1579                                return;
1580                        }
1581                        arg->count++;
1582                }
1583        }
1584}
1585
1586static const struct Qdisc_class_ops htb_class_ops = {
1587        .graft          =       htb_graft,
1588        .leaf           =       htb_leaf,
1589        .qlen_notify    =       htb_qlen_notify,
1590        .find           =       htb_search,
1591        .change         =       htb_change_class,
1592        .delete         =       htb_delete,
1593        .walk           =       htb_walk,
1594        .tcf_block      =       htb_tcf_block,
1595        .bind_tcf       =       htb_bind_filter,
1596        .unbind_tcf     =       htb_unbind_filter,
1597        .dump           =       htb_dump_class,
1598        .dump_stats     =       htb_dump_class_stats,
1599};
1600
1601static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
1602        .cl_ops         =       &htb_class_ops,
1603        .id             =       "htb",
1604        .priv_size      =       sizeof(struct htb_sched),
1605        .enqueue        =       htb_enqueue,
1606        .dequeue        =       htb_dequeue,
1607        .peek           =       qdisc_peek_dequeued,
1608        .init           =       htb_init,
1609        .reset          =       htb_reset,
1610        .destroy        =       htb_destroy,
1611        .dump           =       htb_dump,
1612        .owner          =       THIS_MODULE,
1613};
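    /* Usage sketch (illustrative userspace commands, not part of this file):
     *
     *   tc qdisc add dev eth0 root handle 1: htb default 20
     *   tc class add dev eth0 parent 1: classid 1:20 htb rate 10mbit ceil 100mbit
     *
     * The first command reaches htb_init() via .init above; the second is
     * handled by htb_change_class() via .change in htb_class_ops above.
     */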
1614
1615static int __init htb_module_init(void)
1616{
1617        return register_qdisc(&htb_qdisc_ops);
1618}
1619static void __exit htb_module_exit(void)
1620{
1621        unregister_qdisc(&htb_qdisc_ops);
1622}
1623
1624module_init(htb_module_init)
1625module_exit(htb_module_exit)
1626MODULE_LICENSE("GPL");
1627