linux/net/sched/sch_tbf.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_tbf.c  Token Bucket Filter queue.
 *
 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
 *                                               original idea by Martin Devera
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/sch_generic.h>
#include <net/pkt_cls.h>
#include <net/pkt_sched.h>


/*      Simple Token Bucket Filter.
        =======================================

        SOURCE.
        -------

        None.

        Description.
        ------------

        A data flow obeys TBF with rate R and depth B, if for any
        time interval t_i...t_f the number of transmitted bits
        does not exceed B + R*(t_f-t_i).

        Packetized version of this definition:
        The sequence of packets of sizes s_i served at moments t_i
        obeys TBF, if for any i<=k:

        s_i + ... + s_k <= B + R*(t_k - t_i)

        Algorithm.
        ----------

        Let N(t_i) be B/R initially, and let N(t) grow continuously with time as:

        N(t+delta) = min{B/R, N(t) + delta}

        If the first packet in the queue has length S, it may be
        transmitted only at the time t_* when S/R <= N(t_*),
        and in this case N(t) jumps:

        N(t_* + 0) = N(t_* - 0) - S/R.



        In practice, QoS requires two TBFs to be applied to a data stream.
        One of them controls the steady-state burst size, while the other,
        with rate P (peak rate) and depth M (equal to the link MTU),
        limits bursts on a smaller time scale.

        It is easy to see that P > R and B > M. If P is infinite, this double
        TBF is equivalent to a single one.

        When TBF works in reshaping mode, latency is estimated as:

        lat = max((L-B)/R, (L-M)/P)

        where L is the configured backlog limit in bytes.

        NOTES.
        ------

        If TBF throttles, it starts a watchdog timer, which will wake it up
        when it is ready to transmit.
        Note that the minimal timer resolution is 1/HZ.
        If no new packets arrive during this period,
        or if the device is not awakened by EOI for some previous packet,
        TBF can stop its activity for 1/HZ.
        (With hrtimer-based qdisc watchdogs this 1/HZ granularity is largely
        historical.)

        This means that with depth B, the maximal rate is

        R_crit = B*HZ

        E.g. for 10 Mbit Ethernet and HZ=100 the minimal allowed B is ~10 Kbytes.

        Note that the peak rate TBF is much stricter: with MTU 1500,
        P_crit = 150 Kbytes/sec. So, if you need greater peak
        rates, use Alpha with HZ=1000 :-)

        With classful TBF, limit is kept only for backwards compatibility.
        It is passed to the default bfifo qdisc; if the inner qdisc is
        changed, the limit is no longer effective.
*/
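
/* Illustrative mapping from a typical tc command line to the parameters
 * above (the figures are assumptions for the sake of example, not taken
 * from this file):
 *
 *      tc qdisc add dev eth0 root tbf rate 1mbit burst 10kb latency 70ms
 *
 * gives R = 1 Mbit/s (125 kB/s) and B = 10 kB; the tc utility derives the
 * byte limit from the requested latency, roughly L = B + R*70ms ~ 18750 B.
 * With no peak rate configured (P effectively infinite) the reshaping
 * latency estimate above reduces to
 *
 *      lat = (L - B)/R = 8750/125000 s = 70 ms
 *
 * i.e. the configured latency is recovered, as expected.
 */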

struct tbf_sched_data {
/* Parameters */
        u32             limit;          /* Maximal length of backlog: bytes */
        u32             max_size;
        s64             buffer;         /* Token bucket depth/rate: MUST BE >= MTU/B */
        s64             mtu;
        struct psched_ratecfg rate;
        struct psched_ratecfg peak;

/* Variables */
        s64     tokens;                 /* Current number of B tokens */
        s64     ptokens;                /* Current number of P tokens */
        s64     t_c;                    /* Time check-point */
        struct Qdisc    *qdisc;         /* Inner qdisc, default - bfifo queue */
        struct qdisc_watchdog watchdog; /* Watchdog timer */
};


/* Time to Length: convert a time in ns to a length in bytes,
 * i.e. determine how many bytes can be sent in the given time.
 */
static u64 psched_ns_t2l(const struct psched_ratecfg *r,
                         u64 time_in_ns)
{
        /* The formula is:
         * len = (time_in_ns * r->rate_bytes_ps) / NSEC_PER_SEC
         */
        u64 len = time_in_ns * r->rate_bytes_ps;

        do_div(len, NSEC_PER_SEC);

        if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) {
                do_div(len, 53);
                len = len * 48;
        }

        if (len > r->overhead)
                len -= r->overhead;
        else
                len = 0;

        return len;
}
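
/* A worked example of the conversion above, with illustrative figures:
 * for rate_bytes_ps = 125000 (1 Mbit/s) and time_in_ns = 10000000 (10 ms),
 * len = 125000 * 10^7 / 10^9 = 1250 bytes.  On an ATM link layer the same
 * interval carries only 48/53 of that, because every 53-byte cell holds
 * 48 bytes of payload; the integer math above yields (1250/53)*48 =
 * 23*48 = 1104 bytes, from which the configured per-packet overhead is
 * then subtracted.
 */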

static void tbf_offload_change(struct Qdisc *sch)
{
        struct tbf_sched_data *q = qdisc_priv(sch);
        struct net_device *dev = qdisc_dev(sch);
        struct tc_tbf_qopt_offload qopt;

        if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
                return;

        qopt.command = TC_TBF_REPLACE;
        qopt.handle = sch->handle;
        qopt.parent = sch->parent;
        qopt.replace_params.rate = q->rate;
        qopt.replace_params.max_size = q->max_size;
        qopt.replace_params.qstats = &sch->qstats;

        dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt);
}

static void tbf_offload_destroy(struct Qdisc *sch)
{
        struct net_device *dev = qdisc_dev(sch);
        struct tc_tbf_qopt_offload qopt;

        if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
                return;

        qopt.command = TC_TBF_DESTROY;
        qopt.handle = sch->handle;
        qopt.parent = sch->parent;
        dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt);
}

static int tbf_offload_dump(struct Qdisc *sch)
{
        struct tc_tbf_qopt_offload qopt;

        qopt.command = TC_TBF_STATS;
        qopt.handle = sch->handle;
        qopt.parent = sch->parent;
        qopt.stats.bstats = &sch->bstats;
        qopt.stats.qstats = &sch->qstats;

        return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_TBF, &qopt);
}

static void tbf_offload_graft(struct Qdisc *sch, struct Qdisc *new,
                              struct Qdisc *old, struct netlink_ext_ack *extack)
{
        struct tc_tbf_qopt_offload graft_offload = {
                .handle         = sch->handle,
                .parent         = sch->parent,
                .child_handle   = new->handle,
                .command        = TC_TBF_GRAFT,
        };

        qdisc_offload_graft_helper(qdisc_dev(sch), sch, new, old,
                                   TC_SETUP_QDISC_TBF, &graft_offload, extack);
}

/* GSO packet is too big, segment it so that tbf can transmit
 * each segment in time
 */
static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch,
                       struct sk_buff **to_free)
{
        struct tbf_sched_data *q = qdisc_priv(sch);
        struct sk_buff *segs, *nskb;
        netdev_features_t features = netif_skb_features(skb);
        unsigned int len = 0, prev_len = qdisc_pkt_len(skb);
        int ret, nb;

        segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);

        if (IS_ERR_OR_NULL(segs))
                return qdisc_drop(skb, sch, to_free);

        nb = 0;
        skb_list_walk_safe(segs, segs, nskb) {
                skb_mark_not_on_list(segs);
                qdisc_skb_cb(segs)->pkt_len = segs->len;
                len += segs->len;
                ret = qdisc_enqueue(segs, q->qdisc, to_free);
                if (ret != NET_XMIT_SUCCESS) {
                        if (net_xmit_drop_count(ret))
                                qdisc_qstats_drop(sch);
                } else {
                        nb++;
                }
        }
        sch->q.qlen += nb;
        if (nb > 1)
                qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
        consume_skb(skb);
        return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
}
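
/* Accounting note for the segmentation path above: the parent qdiscs saw
 * the original GSO skb as one packet of prev_len bytes, but nb segments
 * totalling len bytes were actually enqueued.  Passing (1 - nb) packets
 * and (prev_len - len) bytes to qdisc_tree_reduce_backlog() therefore
 * *increases* the ancestors' packet and byte counters by the difference,
 * keeping their backlog statistics consistent.
 */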

static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
                       struct sk_buff **to_free)
{
        struct tbf_sched_data *q = qdisc_priv(sch);
        unsigned int len = qdisc_pkt_len(skb);
        int ret;

        if (qdisc_pkt_len(skb) > q->max_size) {
                if (skb_is_gso(skb) &&
                    skb_gso_validate_mac_len(skb, q->max_size))
                        return tbf_segment(skb, sch, to_free);
                return qdisc_drop(skb, sch, to_free);
        }
        ret = qdisc_enqueue(skb, q->qdisc, to_free);
        if (ret != NET_XMIT_SUCCESS) {
                if (net_xmit_drop_count(ret))
                        qdisc_qstats_drop(sch);
                return ret;
        }

        sch->qstats.backlog += len;
        sch->q.qlen++;
        return NET_XMIT_SUCCESS;
}

static bool tbf_peak_present(const struct tbf_sched_data *q)
{
        return q->peak.rate_bytes_ps;
}

static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
{
        struct tbf_sched_data *q = qdisc_priv(sch);
        struct sk_buff *skb;

        skb = q->qdisc->ops->peek(q->qdisc);

        if (skb) {
                s64 now;
                s64 toks;
                s64 ptoks = 0;
                unsigned int len = qdisc_pkt_len(skb);

                now = ktime_get_ns();
                toks = min_t(s64, now - q->t_c, q->buffer);

                if (tbf_peak_present(q)) {
                        ptoks = toks + q->ptokens;
                        if (ptoks > q->mtu)
                                ptoks = q->mtu;
                        ptoks -= (s64) psched_l2t_ns(&q->peak, len);
                }
                toks += q->tokens;
                if (toks > q->buffer)
                        toks = q->buffer;
                toks -= (s64) psched_l2t_ns(&q->rate, len);

                if ((toks|ptoks) >= 0) {
                        skb = qdisc_dequeue_peeked(q->qdisc);
                        if (unlikely(!skb))
                                return NULL;

                        q->t_c = now;
                        q->tokens = toks;
                        q->ptokens = ptoks;
                        qdisc_qstats_backlog_dec(sch, skb);
                        sch->q.qlen--;
                        qdisc_bstats_update(sch, skb);
                        return skb;
                }

                qdisc_watchdog_schedule_ns(&q->watchdog,
                                           now + max_t(long, -toks, -ptoks));

                /* Maybe there is a shorter packet in the queue that could
                   be sent now.  Tempting as that sounds, it is wrong in
                   principle: we MUST NOT reorder packets under these
                   circumstances.

                   If we split the flow into independent subflows, sending
                   the shorter packet first would be a fine solution; that
                   is the main idea behind all FQ algorithms
                   (cf. CSZ, HPFQ, HFSC).
                 */

                qdisc_qstats_overlimit(sch);
        }
        return NULL;
}
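
/* The token variables in tbf_dequeue() above are kept in nanoseconds of
 * accumulated transmission time rather than bytes: psched_l2t_ns() converts
 * a packet length into the time it occupies the link at the configured
 * rate.  As an illustrative example (figures assumed, not taken from this
 * file), at 125 kB/s a 1500-byte packet costs 12 ms worth of tokens; if
 * only 5 ms of tokens have accumulated, toks ends up at -7 ms and the
 * watchdog is programmed to fire 7 ms from now, when dequeueing becomes
 * possible again.
 */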

static void tbf_reset(struct Qdisc *sch)
{
        struct tbf_sched_data *q = qdisc_priv(sch);

        qdisc_reset(q->qdisc);
        sch->qstats.backlog = 0;
        sch->q.qlen = 0;
        q->t_c = ktime_get_ns();
        q->tokens = q->buffer;
        q->ptokens = q->mtu;
        qdisc_watchdog_cancel(&q->watchdog);
}

static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
        [TCA_TBF_PARMS] = { .len = sizeof(struct tc_tbf_qopt) },
        [TCA_TBF_RTAB]  = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
        [TCA_TBF_PTAB]  = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
        [TCA_TBF_RATE64]        = { .type = NLA_U64 },
        [TCA_TBF_PRATE64]       = { .type = NLA_U64 },
        [TCA_TBF_BURST] = { .type = NLA_U32 },
        [TCA_TBF_PBURST] = { .type = NLA_U32 },
};

static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
                      struct netlink_ext_ack *extack)
{
        int err;
        struct tbf_sched_data *q = qdisc_priv(sch);
        struct nlattr *tb[TCA_TBF_MAX + 1];
        struct tc_tbf_qopt *qopt;
        struct Qdisc *child = NULL;
        struct psched_ratecfg rate;
        struct psched_ratecfg peak;
        u64 max_size;
        s64 buffer, mtu;
        u64 rate64 = 0, prate64 = 0;

        err = nla_parse_nested_deprecated(tb, TCA_TBF_MAX, opt, tbf_policy,
                                          NULL);
        if (err < 0)
                return err;

        err = -EINVAL;
        if (tb[TCA_TBF_PARMS] == NULL)
                goto done;

        qopt = nla_data(tb[TCA_TBF_PARMS]);
        if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
                qdisc_put_rtab(qdisc_get_rtab(&qopt->rate,
                                              tb[TCA_TBF_RTAB],
                                              NULL));

        if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE)
                qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate,
                                              tb[TCA_TBF_PTAB],
                                              NULL));

        buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U);
        mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U);

        if (tb[TCA_TBF_RATE64])
                rate64 = nla_get_u64(tb[TCA_TBF_RATE64]);
        psched_ratecfg_precompute(&rate, &qopt->rate, rate64);

        if (tb[TCA_TBF_BURST]) {
                max_size = nla_get_u32(tb[TCA_TBF_BURST]);
                buffer = psched_l2t_ns(&rate, max_size);
        } else {
                max_size = min_t(u64, psched_ns_t2l(&rate, buffer), ~0U);
        }

        if (qopt->peakrate.rate) {
                if (tb[TCA_TBF_PRATE64])
                        prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]);
                psched_ratecfg_precompute(&peak, &qopt->peakrate, prate64);
                if (peak.rate_bytes_ps <= rate.rate_bytes_ps) {
                        pr_warn_ratelimited("sch_tbf: peakrate %llu is lower than or equal to rate %llu!\n",
                                            peak.rate_bytes_ps, rate.rate_bytes_ps);
                        err = -EINVAL;
                        goto done;
                }

                if (tb[TCA_TBF_PBURST]) {
                        u32 pburst = nla_get_u32(tb[TCA_TBF_PBURST]);
                        max_size = min_t(u32, max_size, pburst);
                        mtu = psched_l2t_ns(&peak, pburst);
                } else {
                        max_size = min_t(u64, max_size, psched_ns_t2l(&peak, mtu));
                }
        } else {
                memset(&peak, 0, sizeof(peak));
        }

        if (max_size < psched_mtu(qdisc_dev(sch)))
                pr_warn_ratelimited("sch_tbf: burst %llu is lower than device %s mtu (%u) !\n",
                                    max_size, qdisc_dev(sch)->name,
                                    psched_mtu(qdisc_dev(sch)));

        if (!max_size) {
                err = -EINVAL;
                goto done;
        }

        if (q->qdisc != &noop_qdisc) {
                err = fifo_set_limit(q->qdisc, qopt->limit);
                if (err)
                        goto done;
        } else if (qopt->limit > 0) {
                child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit,
                                         extack);
                if (IS_ERR(child)) {
                        err = PTR_ERR(child);
                        goto done;
                }

                /* child is fifo, no need to check for noop_qdisc */
                qdisc_hash_add(child, true);
        }

        sch_tree_lock(sch);
        if (child) {
                qdisc_tree_flush_backlog(q->qdisc);
                qdisc_put(q->qdisc);
                q->qdisc = child;
        }
        q->limit = qopt->limit;
        if (tb[TCA_TBF_PBURST])
                q->mtu = mtu;
        else
                q->mtu = PSCHED_TICKS2NS(qopt->mtu);
        q->max_size = max_size;
        if (tb[TCA_TBF_BURST])
                q->buffer = buffer;
        else
                q->buffer = PSCHED_TICKS2NS(qopt->buffer);
        q->tokens = q->buffer;
        q->ptokens = q->mtu;

        memcpy(&q->rate, &rate, sizeof(struct psched_ratecfg));
        memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg));

        sch_tree_unlock(sch);
        err = 0;

        tbf_offload_change(sch);
done:
        return err;
}
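
/* Unit handling in tbf_change() above, summarised with an assumed example:
 * the legacy qopt->buffer and qopt->mtu fields arrive in psched ticks and
 * are converted to nanoseconds, while the optional TCA_TBF_BURST and
 * TCA_TBF_PBURST attributes carry byte sizes that are turned into token
 * times with psched_l2t_ns().  For instance, a 10000-byte burst at
 * 1 Mbit/s (125000 B/s) corresponds to a buffer of 0.08 s, i.e.
 * 80000000 ns of tokens; max_size then bounds the largest packet that can
 * be sent without GSO segmentation.
 */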

static int tbf_init(struct Qdisc *sch, struct nlattr *opt,
                    struct netlink_ext_ack *extack)
{
        struct tbf_sched_data *q = qdisc_priv(sch);

        qdisc_watchdog_init(&q->watchdog, sch);
        q->qdisc = &noop_qdisc;

        if (!opt)
                return -EINVAL;

        q->t_c = ktime_get_ns();

        return tbf_change(sch, opt, extack);
}

static void tbf_destroy(struct Qdisc *sch)
{
        struct tbf_sched_data *q = qdisc_priv(sch);

        qdisc_watchdog_cancel(&q->watchdog);
        tbf_offload_destroy(sch);
        qdisc_put(q->qdisc);
}

static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
        struct tbf_sched_data *q = qdisc_priv(sch);
        struct nlattr *nest;
        struct tc_tbf_qopt opt;
        int err;

        err = tbf_offload_dump(sch);
        if (err)
                return err;

        nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
        if (nest == NULL)
                goto nla_put_failure;

        opt.limit = q->limit;
        psched_ratecfg_getrate(&opt.rate, &q->rate);
        if (tbf_peak_present(q))
                psched_ratecfg_getrate(&opt.peakrate, &q->peak);
        else
                memset(&opt.peakrate, 0, sizeof(opt.peakrate));
        opt.mtu = PSCHED_NS2TICKS(q->mtu);
        opt.buffer = PSCHED_NS2TICKS(q->buffer);
        if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
                goto nla_put_failure;
        if (q->rate.rate_bytes_ps >= (1ULL << 32) &&
            nla_put_u64_64bit(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps,
                              TCA_TBF_PAD))
                goto nla_put_failure;
        if (tbf_peak_present(q) &&
            q->peak.rate_bytes_ps >= (1ULL << 32) &&
            nla_put_u64_64bit(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps,
                              TCA_TBF_PAD))
                goto nla_put_failure;

        return nla_nest_end(skb, nest);

nla_put_failure:
        nla_nest_cancel(skb, nest);
        return -1;
}

static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
                          struct sk_buff *skb, struct tcmsg *tcm)
{
        struct tbf_sched_data *q = qdisc_priv(sch);

        tcm->tcm_handle |= TC_H_MIN(1);
        tcm->tcm_info = q->qdisc->handle;

        return 0;
}

static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
                     struct Qdisc **old, struct netlink_ext_ack *extack)
{
        struct tbf_sched_data *q = qdisc_priv(sch);

        if (new == NULL)
                new = &noop_qdisc;

        *old = qdisc_replace(sch, new, &q->qdisc);

        tbf_offload_graft(sch, new, *old, extack);
        return 0;
}

static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
{
        struct tbf_sched_data *q = qdisc_priv(sch);

        return q->qdisc;
}

static unsigned long tbf_find(struct Qdisc *sch, u32 classid)
{
        return 1;
}

static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
        if (!walker->stop) {
                if (walker->count >= walker->skip)
                        if (walker->fn(sch, 1, walker) < 0) {
                                walker->stop = 1;
                                return;
                        }
                walker->count++;
        }
}
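
/* TBF is classful only in a minimal sense: it exposes a single pseudo-class
 * (minor 1) whose leaf is the inner qdisc, tbf_find() always resolves to
 * that class, and tbf_walk() reports just this one entry.  This is what
 * allows grafting a different inner qdisc with something like
 * "tc qdisc add dev eth0 parent 1:1 ..." (handle numbers here are purely
 * illustrative).
 */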

static const struct Qdisc_class_ops tbf_class_ops = {
        .graft          =       tbf_graft,
        .leaf           =       tbf_leaf,
        .find           =       tbf_find,
        .walk           =       tbf_walk,
        .dump           =       tbf_dump_class,
};

static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
        .next           =       NULL,
        .cl_ops         =       &tbf_class_ops,
        .id             =       "tbf",
        .priv_size      =       sizeof(struct tbf_sched_data),
        .enqueue        =       tbf_enqueue,
        .dequeue        =       tbf_dequeue,
        .peek           =       qdisc_peek_dequeued,
        .init           =       tbf_init,
        .reset          =       tbf_reset,
        .destroy        =       tbf_destroy,
        .change         =       tbf_change,
        .dump           =       tbf_dump,
        .owner          =       THIS_MODULE,
};

static int __init tbf_module_init(void)
{
        return register_qdisc(&tbf_qdisc_ops);
}

static void __exit tbf_module_exit(void)
{
        unregister_qdisc(&tbf_qdisc_ops);
}
module_init(tbf_module_init)
module_exit(tbf_module_exit)
MODULE_LICENSE("GPL");