/*
 * net/sched/sch_tbf.c  Token Bucket Filter queue.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
 *                                               original idea by Martin Devera
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>

/*      Simple Token Bucket Filter.
        =======================================

        SOURCE.
        -------

        None.

        Description.
        ------------

        A data flow obeys TBF with rate R and depth B, if for any
        time interval t_i...t_f the number of transmitted bits
        does not exceed B + R*(t_f-t_i).

        Packetized version of this definition:
        The sequence of packets of sizes s_i served at moments t_i
        obeys TBF, if for any i<=k:

        s_i+....+s_k <= B + R*(t_k - t_i)

        Algorithm.
        ----------

        Let N(t_i) be B/R initially and N(t) grow continuously with time as:

        N(t+delta) = min{B/R, N(t) + delta}

        If the first packet in the queue has length S, it may be
        transmitted only at the time t_* when S/R <= N(t_*),
        and in this case N(t) jumps:

        N(t_* + 0) = N(t_* - 0) - S/R.



        Actually, QoS requires two TBFs to be applied to a data stream.
        One of them controls the steady-state burst size, the other one,
        with rate P (peak rate) and depth M (equal to the link MTU),
        limits bursts on a smaller time scale.

        It is easy to see that P>R, and B>M. If P is infinity, this double
        TBF is equivalent to a single one.

        When TBF works in reshaping mode, latency is estimated as:

        lat = max ((L-B)/R, (L-M)/P)


        NOTES.
        ------

        If TBF throttles, it starts a watchdog timer, which will wake it up
        when it is ready to transmit.
        Note that the minimal timer resolution is 1/HZ.
        If no new packets arrive during this period,
        or if the device is not awakened by an EOI for some previous packet,
        TBF can stop its activity for 1/HZ.


        This means that, with depth B, the maximal rate is

        R_crit = B*HZ

        E.g. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.

        Note that the peak-rate TBF is much tougher: with MTU 1500
        P_crit = 150Kbytes/sec. So, if you need greater peak
        rates, use an Alpha with HZ=1000 :-)

        With classful TBF, the limit is kept only for backwards compatibility.
        It is passed to the default bfifo qdisc - if the inner qdisc is
        changed, the limit is no longer effective.
*/
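
/*      Illustrative usage (not part of the original source): one plausible
        way to instantiate this qdisc from userspace with tc(8), shaping to
        1 Mbit/s with a 10 KB bucket and a latency-derived backlog limit.
        The device name and the numbers are assumptions for the example.

                tc qdisc add dev eth0 root tbf rate 1mbit burst 10kb latency 70ms

        Here "burst" corresponds to the bucket depth B above, and tc derives
        the byte limit handed down as qopt->limit from the requested latency.
*/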

struct tbf_sched_data {
/* Parameters */
        u32             limit;          /* Maximal length of backlog: bytes */
        s64             buffer;         /* Token bucket depth/rate: MUST BE >= MTU/B */
        s64             mtu;
        u32             max_size;
        struct psched_ratecfg rate;
        struct psched_ratecfg peak;
        bool peak_present;

/* Variables */
        s64     tokens;                 /* Current number of B tokens */
        s64     ptokens;                /* Current number of P tokens */
        s64     t_c;                    /* Time check-point */
        struct Qdisc    *qdisc;         /* Inner qdisc, default - bfifo queue */
        struct qdisc_watchdog watchdog; /* Watchdog timer */
};


/* GSO packet is too big, segment it so that tbf can transmit
 * each segment in time
 */
static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch)
{
        struct tbf_sched_data *q = qdisc_priv(sch);
        struct sk_buff *segs, *nskb;
        netdev_features_t features = netif_skb_features(skb);
        int ret, nb;

        segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);

        if (IS_ERR_OR_NULL(segs))
                return qdisc_reshape_fail(skb, sch);

        nb = 0;
        while (segs) {
                nskb = segs->next;
                segs->next = NULL;
                if (likely(segs->len <= q->max_size)) {
                        qdisc_skb_cb(segs)->pkt_len = segs->len;
                        ret = qdisc_enqueue(segs, q->qdisc);
                } else {
                        ret = qdisc_reshape_fail(skb, sch);
                }
                if (ret != NET_XMIT_SUCCESS) {
                        if (net_xmit_drop_count(ret))
                                sch->qstats.drops++;
                } else {
                        nb++;
                }
                segs = nskb;
        }
        sch->q.qlen += nb;
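        /* The one GSO skb has been replaced by nb segments on the inner
         * qdisc, so the ancestry must account for the nb - 1 additional
         * packets (hence the negative "decrease" below).
         */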
        if (nb > 1)
                qdisc_tree_decrease_qlen(sch, 1 - nb);
        consume_skb(skb);
        return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
}

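/* Enqueue: a packet longer than max_size could never accumulate enough
 * tokens to leave the bucket, so GSO super-packets are segmented first and
 * any other oversized packet is handled as a reshape failure; conforming
 * packets are queued on the inner qdisc.
 */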
static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
        struct tbf_sched_data *q = qdisc_priv(sch);
        int ret;

        if (qdisc_pkt_len(skb) > q->max_size) {
                if (skb_is_gso(skb))
                        return tbf_segment(skb, sch);
                return qdisc_reshape_fail(skb, sch);
        }
        ret = qdisc_enqueue(skb, q->qdisc);
        if (ret != NET_XMIT_SUCCESS) {
                if (net_xmit_drop_count(ret))
                        sch->qstats.drops++;
                return ret;
        }

        sch->q.qlen++;
        return NET_XMIT_SUCCESS;
}

static unsigned int tbf_drop(struct Qdisc *sch)
{
        struct tbf_sched_data *q = qdisc_priv(sch);
        unsigned int len = 0;

        if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
                sch->q.qlen--;
                sch->qstats.drops++;
        }
        return len;
}

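/* Dequeue: token state is kept as nanoseconds of transmission time.
 * The time elapsed since the last checkpoint (t_c) is added to each bucket
 * and clamped to its depth (q->buffer for the rate bucket, q->mtu for the
 * peak bucket); the head packet costs psched_l2t_ns() nanoseconds per
 * bucket.  It may be sent only if both buckets remain non-negative - the
 * (toks | ptoks) >= 0 test ORs the sign bits - otherwise a watchdog is
 * armed for the moment when enough tokens will have accumulated.
 */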
static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
{
        struct tbf_sched_data *q = qdisc_priv(sch);
        struct sk_buff *skb;

        skb = q->qdisc->ops->peek(q->qdisc);

        if (skb) {
                s64 now;
                s64 toks;
                s64 ptoks = 0;
                unsigned int len = qdisc_pkt_len(skb);

                now = ktime_to_ns(ktime_get());
                toks = min_t(s64, now - q->t_c, q->buffer);

                if (q->peak_present) {
                        ptoks = toks + q->ptokens;
                        if (ptoks > q->mtu)
                                ptoks = q->mtu;
                        ptoks -= (s64) psched_l2t_ns(&q->peak, len);
                }
                toks += q->tokens;
                if (toks > q->buffer)
                        toks = q->buffer;
                toks -= (s64) psched_l2t_ns(&q->rate, len);

                if ((toks|ptoks) >= 0) {
                        skb = qdisc_dequeue_peeked(q->qdisc);
                        if (unlikely(!skb))
                                return NULL;

                        q->t_c = now;
                        q->tokens = toks;
                        q->ptokens = ptoks;
                        sch->q.qlen--;
                        qdisc_unthrottled(sch);
                        qdisc_bstats_update(sch, skb);
                        return skb;
                }

                qdisc_watchdog_schedule_ns(&q->watchdog,
                                           now + max_t(long, -toks, -ptoks));

                /* Maybe we have a shorter packet in the queue,
                   which can be sent now. It sounds cool,
                   but it is wrong in principle:
                   we MUST NOT reorder packets under these circumstances.

                   Really, if we split the flow into independent
                   subflows, it would be a very good solution.
                   This is the main idea of all FQ algorithms
                   (cf. CSZ, HPFQ, HFSC)
                 */

                sch->qstats.overlimits++;
        }
        return NULL;
}


static void tbf_reset(struct Qdisc *sch)
{
        struct tbf_sched_data *q = qdisc_priv(sch);

        qdisc_reset(q->qdisc);
        sch->q.qlen = 0;
        q->t_c = ktime_to_ns(ktime_get());
        q->tokens = q->buffer;
        q->ptokens = q->mtu;
        qdisc_watchdog_cancel(&q->watchdog);
}

static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
        [TCA_TBF_PARMS] = { .len = sizeof(struct tc_tbf_qopt) },
        [TCA_TBF_RTAB]  = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
        [TCA_TBF_PTAB]  = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
};

static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
{
        int err;
        struct tbf_sched_data *q = qdisc_priv(sch);
        struct nlattr *tb[TCA_TBF_PTAB + 1];
        struct tc_tbf_qopt *qopt;
        struct qdisc_rate_table *rtab = NULL;
        struct qdisc_rate_table *ptab = NULL;
        struct Qdisc *child = NULL;
        int max_size, n;

        err = nla_parse_nested(tb, TCA_TBF_PTAB, opt, tbf_policy);
        if (err < 0)
                return err;

        err = -EINVAL;
        if (tb[TCA_TBF_PARMS] == NULL)
                goto done;

        qopt = nla_data(tb[TCA_TBF_PARMS]);
        rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB]);
        if (rtab == NULL)
                goto done;

        if (qopt->peakrate.rate) {
                if (qopt->peakrate.rate > qopt->rate.rate)
                        ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB]);
                if (ptab == NULL)
                        goto done;
        }

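        /* Descriptive note: each slot of the legacy rate tables holds the
         * time needed to transmit a packet of that size class.  Walking the
         * table until that time exceeds the bucket depth yields the largest
         * packet that can ever conform (max_size), taking the smaller of the
         * rate/buffer and peakrate/mtu bounds when a peak rate is set.
         */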
        for (n = 0; n < 256; n++)
                if (rtab->data[n] > qopt->buffer)
                        break;
        max_size = (n << qopt->rate.cell_log) - 1;
        if (ptab) {
                int size;

                for (n = 0; n < 256; n++)
                        if (ptab->data[n] > qopt->mtu)
                                break;
                size = (n << qopt->peakrate.cell_log) - 1;
                if (size < max_size)
                        max_size = size;
        }
        if (max_size < 0)
                goto done;

        if (q->qdisc != &noop_qdisc) {
                err = fifo_set_limit(q->qdisc, qopt->limit);
                if (err)
                        goto done;
        } else if (qopt->limit > 0) {
                child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit);
                if (IS_ERR(child)) {
                        err = PTR_ERR(child);
                        goto done;
                }
        }

        sch_tree_lock(sch);
        if (child) {
                qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
                qdisc_destroy(q->qdisc);
                q->qdisc = child;
        }
        q->limit = qopt->limit;
        q->mtu = PSCHED_TICKS2NS(qopt->mtu);
        q->max_size = max_size;
        q->buffer = PSCHED_TICKS2NS(qopt->buffer);
        q->tokens = q->buffer;
        q->ptokens = q->mtu;

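        /* Beyond the max_size computation above, rtab/ptab are not used at
         * runtime; token costs come from the precomputed psched_ratecfg
         * below, via psched_l2t_ns() on the dequeue path.
         */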
        psched_ratecfg_precompute(&q->rate, &rtab->rate);
        if (ptab) {
                psched_ratecfg_precompute(&q->peak, &ptab->rate);
                q->peak_present = true;
        } else {
                q->peak_present = false;
        }

        sch_tree_unlock(sch);
        err = 0;
done:
        if (rtab)
                qdisc_put_rtab(rtab);
        if (ptab)
                qdisc_put_rtab(ptab);
        return err;
}

static int tbf_init(struct Qdisc *sch, struct nlattr *opt)
{
        struct tbf_sched_data *q = qdisc_priv(sch);

        if (opt == NULL)
                return -EINVAL;

        q->t_c = ktime_to_ns(ktime_get());
        qdisc_watchdog_init(&q->watchdog, sch);
        q->qdisc = &noop_qdisc;

        return tbf_change(sch, opt);
}

static void tbf_destroy(struct Qdisc *sch)
{
        struct tbf_sched_data *q = qdisc_priv(sch);

        qdisc_watchdog_cancel(&q->watchdog);
        qdisc_destroy(q->qdisc);
}

static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
        struct tbf_sched_data *q = qdisc_priv(sch);
        struct nlattr *nest;
        struct tc_tbf_qopt opt;

        sch->qstats.backlog = q->qdisc->qstats.backlog;
        nest = nla_nest_start(skb, TCA_OPTIONS);
        if (nest == NULL)
                goto nla_put_failure;

        opt.limit = q->limit;
        psched_ratecfg_getrate(&opt.rate, &q->rate);
        if (q->peak_present)
                psched_ratecfg_getrate(&opt.peakrate, &q->peak);
        else
                memset(&opt.peakrate, 0, sizeof(opt.peakrate));
        opt.mtu = PSCHED_NS2TICKS(q->mtu);
        opt.buffer = PSCHED_NS2TICKS(q->buffer);
        if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
                goto nla_put_failure;

        nla_nest_end(skb, nest);
        return skb->len;

nla_put_failure:
        nla_nest_cancel(skb, nest);
        return -1;
}

static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
                          struct sk_buff *skb, struct tcmsg *tcm)
{
        struct tbf_sched_data *q = qdisc_priv(sch);

        tcm->tcm_handle |= TC_H_MIN(1);
        tcm->tcm_info = q->qdisc->handle;

        return 0;
}

static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
                     struct Qdisc **old)
{
        struct tbf_sched_data *q = qdisc_priv(sch);

        if (new == NULL)
                new = &noop_qdisc;

        sch_tree_lock(sch);
        *old = q->qdisc;
        q->qdisc = new;
        qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
        qdisc_reset(*old);
        sch_tree_unlock(sch);

        return 0;
}

static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
{
        struct tbf_sched_data *q = qdisc_priv(sch);
        return q->qdisc;
}

static unsigned long tbf_get(struct Qdisc *sch, u32 classid)
{
        return 1;
}

static void tbf_put(struct Qdisc *sch, unsigned long arg)
{
}

static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
        if (!walker->stop) {
                if (walker->count >= walker->skip)
                        if (walker->fn(sch, 1, walker) < 0) {
                                walker->stop = 1;
                                return;
                        }
                walker->count++;
        }
}

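/* TBF is classful only so that an inner qdisc can be attached: it exposes a
 * single pseudo-class (minor 1) that wraps q->qdisc, and the class ops below
 * mostly just forward to it.
 */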
static const struct Qdisc_class_ops tbf_class_ops = {
        .graft          =       tbf_graft,
        .leaf           =       tbf_leaf,
        .get            =       tbf_get,
        .put            =       tbf_put,
        .walk           =       tbf_walk,
        .dump           =       tbf_dump_class,
};

static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
        .next           =       NULL,
        .cl_ops         =       &tbf_class_ops,
        .id             =       "tbf",
        .priv_size      =       sizeof(struct tbf_sched_data),
        .enqueue        =       tbf_enqueue,
        .dequeue        =       tbf_dequeue,
        .peek           =       qdisc_peek_dequeued,
        .drop           =       tbf_drop,
        .init           =       tbf_init,
        .reset          =       tbf_reset,
        .destroy        =       tbf_destroy,
        .change         =       tbf_change,
        .dump           =       tbf_dump,
        .owner          =       THIS_MODULE,
};

static int __init tbf_module_init(void)
{
        return register_qdisc(&tbf_qdisc_ops);
}

static void __exit tbf_module_exit(void)
{
        unregister_qdisc(&tbf_qdisc_ops);
}
module_init(tbf_module_init)
module_exit(tbf_module_exit)
MODULE_LICENSE("GPL");