// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_tbf.c  Token Bucket Filter queue.
 *
 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
 *                                               original idea by Martin Devera
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/sch_generic.h>
#include <net/pkt_cls.h>
#include <net/pkt_sched.h>


/*      Simple Token Bucket Filter.
        =======================================

        SOURCE.
        -------

        None.

        Description.
        ------------

        A data flow obeys TBF with rate R and depth B, if for any
        time interval t_i...t_f the number of transmitted bits
        does not exceed B + R*(t_f-t_i).

        Packetized version of this definition:
        The sequence of packets of sizes s_i served at moments t_i
        obeys TBF, if for any i<=k:

        s_i+....+s_k <= B + R*(t_k - t_i)

        Algorithm.
        ----------

        Let N(t_i) be B/R initially and N(t) grow continuously with time as:

        N(t+delta) = min{B/R, N(t) + delta}

        If the first packet in queue has length S, it may be
        transmitted only at the time t_* when S/R <= N(t_*),
        and in this case N(t) jumps:

        N(t_* + 0) = N(t_* - 0) - S/R.
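
        F.e. (illustrative numbers): with R = 125 Kbyte/sec and
        B = 10 Kbyte, N(t) saturates at B/R = 80 msec. A queued
        1500 byte packet needs S/R = 12 msec of credit, so after
        enough idle time it is sent at once and N drops by 12 msec.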

        Actually, QoS requires two TBFs to be applied to a data stream.
        One of them controls steady state burst size, another
        one with rate P (peak rate) and depth M (equal to link MTU)
        limits bursts at a smaller time scale.

        It is easy to see that P>R, and B>M. If P is infinity, this double
        TBF is equivalent to a single one.

        When TBF works in reshaping mode, latency is estimated as:

        lat = max ((L-B)/R, (L-M)/P)

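        F.e. (illustrative numbers) with backlog L = 100 Kbyte,
        B = 10 Kbyte, R = 1 Mbyte/sec, M = 1.5 Kbyte, P = 10 Mbyte/sec:

        lat = max (90 Kbyte / 1 Mbyte/sec, 98.5 Kbyte / 10 Mbyte/sec)
            = max (90 msec, ~10 msec) = 90 msec.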

        NOTES.
        ------

        If TBF throttles, it starts a watchdog timer, which will wake it up
        when it is ready to transmit.
        Note that the minimal timer resolution is 1/HZ.
        If no new packets arrive during this period,
        or if the device is not awakened by an EOI for some previous packet,
        TBF can stop its activity for 1/HZ.


        This means that, with depth B, the maximal rate is

        R_crit = B*HZ

        F.e. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.
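        (10 Mbit/sec is 1.25 Mbyte/sec; divided by HZ=100 that requires
        B >= 12.5 Kbyte, i.e. on the order of 10 Kbytes.)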

        Note that the peak rate TBF is much tougher: with MTU 1500
        P_crit = 150Kbytes/sec. So, if you need greater peak
        rates, use an Alpha with HZ=1000 :-)

        With classful TBF, limit is just kept for backwards compatibility.
        It is passed to the default bfifo qdisc - if the inner qdisc is
        changed, the limit is no longer effective.
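
        Typical usage (illustrative; cf. tc-tbf(8)):

        tc qdisc add dev eth0 handle 10: root tbf rate 0.5mbit \
                burst 5kb latency 70ms peakrate 1mbit minburst 1540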
*/

struct tbf_sched_data {
/* Parameters */
        u32             limit;          /* Maximal length of backlog: bytes */
        u32             max_size;
        s64             buffer;         /* Token bucket depth/rate: MUST BE >= MTU/B */
        s64             mtu;
        struct psched_ratecfg rate;
        struct psched_ratecfg peak;

/* Variables */
        s64     tokens;                 /* Current number of B tokens, in ns */
        s64     ptokens;                /* Current number of P tokens, in ns */
        s64     t_c;                    /* Time check-point */
        struct Qdisc    *qdisc;         /* Inner qdisc, default - bfifo queue */
        struct qdisc_watchdog watchdog; /* Watchdog timer */
};


/* Time to Length, convert time in ns to length in bytes
 * to determine how many bytes can be sent in the given time.
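 * F.e. (illustrative) 1 msec at 125 Mbyte/sec gives 125000 bytes;
 * an ATM linklayer scales this by 48/53 (payload bytes per cell) to
 * 113184 bytes, before the configured overhead is subtracted.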
 */
static u64 psched_ns_t2l(const struct psched_ratecfg *r,
                         u64 time_in_ns)
{
        /* The formula is :
         * len = (time_in_ns * r->rate_bytes_ps) / NSEC_PER_SEC
         */
        u64 len = time_in_ns * r->rate_bytes_ps;

        do_div(len, NSEC_PER_SEC);

        if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) {
                /* ATM carries 48 bytes of payload per 53 byte cell */
                do_div(len, 53);
                len = len * 48;
        }

        if (len > r->overhead)
                len -= r->overhead;
        else
                len = 0;

        return len;
}

static void tbf_offload_change(struct Qdisc *sch)
{
        struct tbf_sched_data *q = qdisc_priv(sch);
        struct net_device *dev = qdisc_dev(sch);
        struct tc_tbf_qopt_offload qopt;

        if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
                return;

        qopt.command = TC_TBF_REPLACE;
        qopt.handle = sch->handle;
        qopt.parent = sch->parent;
        qopt.replace_params.rate = q->rate;
        qopt.replace_params.max_size = q->max_size;
        qopt.replace_params.qstats = &sch->qstats;

        dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt);
}

static void tbf_offload_destroy(struct Qdisc *sch)
{
        struct net_device *dev = qdisc_dev(sch);
        struct tc_tbf_qopt_offload qopt;

        if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
                return;

        qopt.command = TC_TBF_DESTROY;
        qopt.handle = sch->handle;
        qopt.parent = sch->parent;
        dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt);
}

static int tbf_offload_dump(struct Qdisc *sch)
{
        struct tc_tbf_qopt_offload qopt;

        qopt.command = TC_TBF_STATS;
        qopt.handle = sch->handle;
        qopt.parent = sch->parent;
        qopt.stats.bstats = &sch->bstats;
        qopt.stats.qstats = &sch->qstats;

        return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_TBF, &qopt);
}

/* The GSO packet is too big; segment it so that tbf can transmit
 * each segment in time
 */
static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch,
                       struct sk_buff **to_free)
{
        struct tbf_sched_data *q = qdisc_priv(sch);
        struct sk_buff *segs, *nskb;
        netdev_features_t features = netif_skb_features(skb);
        unsigned int len = 0, prev_len = qdisc_pkt_len(skb);
        int ret, nb;

        /* Segment in software; masking the GSO feature bits forces
         * full segmentation into MTU-sized skbs.
         */
        segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);

        if (IS_ERR_OR_NULL(segs))
                return qdisc_drop(skb, sch, to_free);

        nb = 0;
        skb_list_walk_safe(segs, segs, nskb) {
                skb_mark_not_on_list(segs);
                qdisc_skb_cb(segs)->pkt_len = segs->len;
                len += segs->len;
                ret = qdisc_enqueue(segs, q->qdisc, to_free);
                if (ret != NET_XMIT_SUCCESS) {
                        if (net_xmit_drop_count(ret))
                                qdisc_qstats_drop(sch);
                } else {
                        nb++;
                }
        }
        sch->q.qlen += nb;
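        /* The parent qdiscs accounted one skb of prev_len bytes; we
         * enqueued nb segments totalling len bytes instead, so correct
         * their qlen and backlog by the difference.
         */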
        if (nb > 1)
                qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
        consume_skb(skb);
        return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
}

static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
                       struct sk_buff **to_free)
{
        struct tbf_sched_data *q = qdisc_priv(sch);
        unsigned int len = qdisc_pkt_len(skb);
        int ret;

        if (qdisc_pkt_len(skb) > q->max_size) {
                /* An oversized GSO skb can still be handled if every
                 * MAC-level segment fits within max_size: segment it
                 * instead of dropping.
                 */
                if (skb_is_gso(skb) &&
                    skb_gso_validate_mac_len(skb, q->max_size))
                        return tbf_segment(skb, sch, to_free);
                return qdisc_drop(skb, sch, to_free);
        }
        ret = qdisc_enqueue(skb, q->qdisc, to_free);
        if (ret != NET_XMIT_SUCCESS) {
                if (net_xmit_drop_count(ret))
                        qdisc_qstats_drop(sch);
                return ret;
        }

        sch->qstats.backlog += len;
        sch->q.qlen++;
        return NET_XMIT_SUCCESS;
}

static bool tbf_peak_present(const struct tbf_sched_data *q)
{
        return q->peak.rate_bytes_ps;
}

static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
{
        struct tbf_sched_data *q = qdisc_priv(sch);
        struct sk_buff *skb;

        skb = q->qdisc->ops->peek(q->qdisc);

        if (skb) {
                s64 now;
                s64 toks;
                s64 ptoks = 0;
                unsigned int len = qdisc_pkt_len(skb);

                now = ktime_get_ns();
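                /* Tokens are measured in nanoseconds of transmission
                 * time: wall-clock time elapsed since the last
                 * checkpoint earns tokens, capped at the bucket depth.
                 */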
                toks = min_t(s64, now - q->t_c, q->buffer);

                if (tbf_peak_present(q)) {
                        ptoks = toks + q->ptokens;
                        if (ptoks > q->mtu)
                                ptoks = q->mtu;
                        ptoks -= (s64) psched_l2t_ns(&q->peak, len);
                }
                toks += q->tokens;
                if (toks > q->buffer)
                        toks = q->buffer;
                toks -= (s64) psched_l2t_ns(&q->rate, len);

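                /* The OR is >= 0 only when neither value has the sign
                 * bit set, i.e. both buckets can afford this packet.
                 */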
                if ((toks|ptoks) >= 0) {
                        skb = qdisc_dequeue_peeked(q->qdisc);
                        if (unlikely(!skb))
                                return NULL;

                        q->t_c = now;
                        q->tokens = toks;
                        q->ptokens = ptoks;
                        qdisc_qstats_backlog_dec(sch, skb);
                        sch->q.qlen--;
                        qdisc_bstats_update(sch, skb);
                        return skb;
                }

                qdisc_watchdog_schedule_ns(&q->watchdog,
                                           now + max_t(long, -toks, -ptoks));

                /* Maybe we have a shorter packet in the queue,
                   which can be sent now. It sounds cool,
                   but it is wrong in principle.
                   We MUST NOT reorder packets under these circumstances.

                   Really, if we split the flow into independent
                   subflows, it would be a very good solution.
                   This is the main idea of all FQ algorithms
                   (cf. CSZ, HPFQ, HFSC)
                 */

                qdisc_qstats_overlimit(sch);
        }
        return NULL;
}

static void tbf_reset(struct Qdisc *sch)
{
        struct tbf_sched_data *q = qdisc_priv(sch);

        qdisc_reset(q->qdisc);
        sch->qstats.backlog = 0;
        sch->q.qlen = 0;
        q->t_c = ktime_get_ns();
        q->tokens = q->buffer;
        q->ptokens = q->mtu;
        qdisc_watchdog_cancel(&q->watchdog);
}

static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
        [TCA_TBF_PARMS] = { .len = sizeof(struct tc_tbf_qopt) },
        [TCA_TBF_RTAB]  = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
        [TCA_TBF_PTAB]  = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
        [TCA_TBF_RATE64]        = { .type = NLA_U64 },
        [TCA_TBF_PRATE64]       = { .type = NLA_U64 },
        [TCA_TBF_BURST] = { .type = NLA_U32 },
        [TCA_TBF_PBURST] = { .type = NLA_U32 },
};

static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
                      struct netlink_ext_ack *extack)
{
        int err;
        struct tbf_sched_data *q = qdisc_priv(sch);
        struct nlattr *tb[TCA_TBF_MAX + 1];
        struct tc_tbf_qopt *qopt;
        struct Qdisc *child = NULL;
        struct psched_ratecfg rate;
        struct psched_ratecfg peak;
        u64 max_size;
        s64 buffer, mtu;
        u64 rate64 = 0, prate64 = 0;

        err = nla_parse_nested_deprecated(tb, TCA_TBF_MAX, opt, tbf_policy,
                                          NULL);
        if (err < 0)
                return err;

        err = -EINVAL;
        if (tb[TCA_TBF_PARMS] == NULL)
                goto done;

        qopt = nla_data(tb[TCA_TBF_PARMS]);
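        /* Legacy userspace leaves the linklayer unset; fetching and
         * immediately releasing the rate table lets qdisc_get_rtab()
         * detect the link layer from the table and write it back into
         * qopt. The table itself is not used.
         */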
        if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
                qdisc_put_rtab(qdisc_get_rtab(&qopt->rate,
                                              tb[TCA_TBF_RTAB],
                                              NULL));

        if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE)
                qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate,
                                              tb[TCA_TBF_PTAB],
                                              NULL));

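        /* The legacy ABI carries buffer/mtu in psched ticks; convert
         * to nanoseconds, clamped at U32_MAX ns.
         */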
        buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U);
        mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U);

        if (tb[TCA_TBF_RATE64])
                rate64 = nla_get_u64(tb[TCA_TBF_RATE64]);
        psched_ratecfg_precompute(&rate, &qopt->rate, rate64);

        if (tb[TCA_TBF_BURST]) {
                max_size = nla_get_u32(tb[TCA_TBF_BURST]);
                buffer = psched_l2t_ns(&rate, max_size);
        } else {
                max_size = min_t(u64, psched_ns_t2l(&rate, buffer), ~0U);
        }

        if (qopt->peakrate.rate) {
                if (tb[TCA_TBF_PRATE64])
                        prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]);
                psched_ratecfg_precompute(&peak, &qopt->peakrate, prate64);
                if (peak.rate_bytes_ps <= rate.rate_bytes_ps) {
                        pr_warn_ratelimited("sch_tbf: peakrate %llu is lower than or equal to rate %llu!\n",
                                            peak.rate_bytes_ps, rate.rate_bytes_ps);
                        err = -EINVAL;
                        goto done;
                }

                if (tb[TCA_TBF_PBURST]) {
                        u32 pburst = nla_get_u32(tb[TCA_TBF_PBURST]);

                        max_size = min_t(u32, max_size, pburst);
                        mtu = psched_l2t_ns(&peak, pburst);
                } else {
                        max_size = min_t(u64, max_size, psched_ns_t2l(&peak, mtu));
                }
        } else {
                memset(&peak, 0, sizeof(peak));
        }

        if (max_size < psched_mtu(qdisc_dev(sch)))
                pr_warn_ratelimited("sch_tbf: burst %llu is lower than device %s mtu (%u)!\n",
                                    max_size, qdisc_dev(sch)->name,
                                    psched_mtu(qdisc_dev(sch)));

        if (!max_size) {
                err = -EINVAL;
                goto done;
        }

        if (q->qdisc != &noop_qdisc) {
                err = fifo_set_limit(q->qdisc, qopt->limit);
                if (err)
                        goto done;
        } else if (qopt->limit > 0) {
                child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit,
                                         extack);
                if (IS_ERR(child)) {
                        err = PTR_ERR(child);
                        goto done;
                }

                /* child is fifo, no need to check for noop_qdisc */
                qdisc_hash_add(child, true);
        }

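        /* Commit the new parameters under the tree lock so that the
         * concurrent datapath never sees a half-updated config.
         */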
        sch_tree_lock(sch);
        if (child) {
                qdisc_tree_flush_backlog(q->qdisc);
                qdisc_put(q->qdisc);
                q->qdisc = child;
        }
        q->limit = qopt->limit;
        if (tb[TCA_TBF_PBURST])
                q->mtu = mtu;
        else
                q->mtu = PSCHED_TICKS2NS(qopt->mtu);
        q->max_size = max_size;
        if (tb[TCA_TBF_BURST])
                q->buffer = buffer;
        else
                q->buffer = PSCHED_TICKS2NS(qopt->buffer);
        /* Start with full buckets */
        q->tokens = q->buffer;
        q->ptokens = q->mtu;

        memcpy(&q->rate, &rate, sizeof(struct psched_ratecfg));
        memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg));

        sch_tree_unlock(sch);
        err = 0;

        tbf_offload_change(sch);
done:
        return err;
}

static int tbf_init(struct Qdisc *sch, struct nlattr *opt,
                    struct netlink_ext_ack *extack)
{
        struct tbf_sched_data *q = qdisc_priv(sch);

        qdisc_watchdog_init(&q->watchdog, sch);
        q->qdisc = &noop_qdisc;

        if (!opt)
                return -EINVAL;

        q->t_c = ktime_get_ns();

        return tbf_change(sch, opt, extack);
}

static void tbf_destroy(struct Qdisc *sch)
{
        struct tbf_sched_data *q = qdisc_priv(sch);

        qdisc_watchdog_cancel(&q->watchdog);
        tbf_offload_destroy(sch);
        qdisc_put(q->qdisc);
}

static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
        struct tbf_sched_data *q = qdisc_priv(sch);
        struct nlattr *nest;
        struct tc_tbf_qopt opt;
        int err;

        err = tbf_offload_dump(sch);
        if (err)
                return err;

        nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
        if (nest == NULL)
                goto nla_put_failure;

        opt.limit = q->limit;
        psched_ratecfg_getrate(&opt.rate, &q->rate);
        if (tbf_peak_present(q))
                psched_ratecfg_getrate(&opt.peakrate, &q->peak);
        else
                memset(&opt.peakrate, 0, sizeof(opt.peakrate));
        opt.mtu = PSCHED_NS2TICKS(q->mtu);
        opt.buffer = PSCHED_NS2TICKS(q->buffer);
        if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
                goto nla_put_failure;
        if (q->rate.rate_bytes_ps >= (1ULL << 32) &&
            nla_put_u64_64bit(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps,
                              TCA_TBF_PAD))
                goto nla_put_failure;
        if (tbf_peak_present(q) &&
            q->peak.rate_bytes_ps >= (1ULL << 32) &&
            nla_put_u64_64bit(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps,
                              TCA_TBF_PAD))
                goto nla_put_failure;

        return nla_nest_end(skb, nest);

nla_put_failure:
        nla_nest_cancel(skb, nest);
        return -1;
}

static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
                          struct sk_buff *skb, struct tcmsg *tcm)
{
        struct tbf_sched_data *q = qdisc_priv(sch);

        tcm->tcm_handle |= TC_H_MIN(1);
        tcm->tcm_info = q->qdisc->handle;

        return 0;
}

static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
                     struct Qdisc **old, struct netlink_ext_ack *extack)
{
        struct tbf_sched_data *q = qdisc_priv(sch);

        if (new == NULL)
                new = &noop_qdisc;

        *old = qdisc_replace(sch, new, &q->qdisc);
        return 0;
}

static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
{
        struct tbf_sched_data *q = qdisc_priv(sch);

        return q->qdisc;
}

static unsigned long tbf_find(struct Qdisc *sch, u32 classid)
{
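        /* TBF exposes a single class (the inner qdisc); every classid
         * resolves to it.
         */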
        return 1;
}

static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
        if (!walker->stop) {
                if (walker->count >= walker->skip)
                        if (walker->fn(sch, 1, walker) < 0) {
                                walker->stop = 1;
                                return;
                        }
                walker->count++;
        }
}

static const struct Qdisc_class_ops tbf_class_ops = {
        .graft          =       tbf_graft,
        .leaf           =       tbf_leaf,
        .find           =       tbf_find,
        .walk           =       tbf_walk,
        .dump           =       tbf_dump_class,
};

static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
        .next           =       NULL,
        .cl_ops         =       &tbf_class_ops,
        .id             =       "tbf",
        .priv_size      =       sizeof(struct tbf_sched_data),
        .enqueue        =       tbf_enqueue,
        .dequeue        =       tbf_dequeue,
        .peek           =       qdisc_peek_dequeued,
        .init           =       tbf_init,
        .reset          =       tbf_reset,
        .destroy        =       tbf_destroy,
        .change         =       tbf_change,
        .dump           =       tbf_dump,
        .owner          =       THIS_MODULE,
};

static int __init tbf_module_init(void)
{
        return register_qdisc(&tbf_qdisc_ops);
}

static void __exit tbf_module_exit(void)
{
        unregister_qdisc(&tbf_qdisc_ops);
}
module_init(tbf_module_init)
module_exit(tbf_module_exit)
MODULE_LICENSE("GPL");