LXR linux/net/sched/sch

   1// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
   2
   3/* COMMON Applications Kept Enhanced (CAKE) discipline
   4 *
   5 * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
   6 * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
   7 * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
   8 * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
   9 * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
  10 * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
  11 *
  12 * The CAKE Principles:
  13 *                 (or, how to have your cake and eat it too)
  14 *
  15 * This is a combination of several shaping, AQM and FQ techniques into one
  16 * easy-to-use package:
  17 *
  18 * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
  19 *   equipment and bloated MACs.  This operates in deficit mode (as in sch_fq),
  20 *   eliminating the need for any sort of burst parameter (eg. token bucket
  21 *   depth).  Burst support is limited to that necessary to overcome scheduling
  22 *   latency.
  23 *
  24 * - A Diffserv-aware priority queue, giving more priority to certain classes,
  25 *   up to a specified fraction of bandwidth.  Above that bandwidth threshold,
  26 *   the priority is reduced to avoid starving other tins.
  27 *
  28 * - Each priority tin has a separate Flow Queue system, to isolate traffic
  29 *   flows from each other.  This prevents a burst on one flow from increasing
  30 *   the delay to another.  Flows are distributed to queues using a
  31 *   set-associative hash function.
  32 *
  33 * - Each queue is actively managed by Cobalt, which is a combination of the
  34 *   Codel and Blue AQM algorithms.  This serves flows fairly, and signals
  35 *   congestion early via ECN (if available) and/or packet drops, to keep
  36 *   latency low.  The codel parameters are auto-tuned based on the bandwidth
  37 *   setting, as is necessary at low bandwidths.
  38 *
  39 * The configuration parameters are kept deliberately simple for ease of use.
  40 * Everything has sane defaults.  Complete generality of configuration is *not*
  41 * a goal.
  42 *
  43 * The priority queue operates according to a weighted DRR scheme, combined with
  44 * a bandwidth tracker which reuses the shaper logic to detect which side of the
  45 * bandwidth sharing threshold the tin is operating.  This determines whether a
  46 * priority-based weight (high) or a bandwidth-based weight (low) is used for
  47 * that tin in the current pass.
  48 *
  49 * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
  50 * granted us permission to leverage.
  51 */
  52
  53#include <linux/module.h>
  54#include <linux/types.h>
  55#include <linux/kernel.h>
  56#include <linux/jiffies.h>
  57#include <linux/string.h>
  58#include <linux/in.h>
  59#include <linux/errno.h>
  60#include <linux/init.h>
  61#include <linux/skbuff.h>
  62#include <linux/jhash.h>
  63#include <linux/slab.h>
  64#include <linux/vmalloc.h>
  65#include <linux/reciprocal_div.h>
  66#include <net/netlink.h>
  67#include <linux/if_vlan.h>
  68#include <net/pkt_sched.h>
  69#include <net/pkt_cls.h>
  70#include <net/tcp.h>
  71#include <net/flow_dissector.h>
  72
  73#if IS_ENABLED(CONFIG_NF_CONNTRACK)
  74#include <net/netfilter/nf_conntrack_core.h>
  75#endif
  76
  77#define CAKE_SET_WAYS (8)
  78#define CAKE_MAX_TINS (8)
  79#define CAKE_QUEUES (1024)
  80#define CAKE_FLOW_MASK 63
  81#define CAKE_FLOW_NAT_FLAG 64
  82
  83/* struct cobalt_params - contains codel and blue parameters
  84 * @interval:   codel initial drop rate
  85 * @target:     maximum persistent sojourn time & blue update rate
  86 * @mtu_time:   serialisation delay of maximum-size packet
  87 * @p_inc:      increment of blue drop probability (0.32 fxp)
  88 * @p_dec:      decrement of blue drop probability (0.32 fxp)
  89 */
  90struct cobalt_params {
  91        u64     interval;
  92        u64     target;
  93        u64     mtu_time;
  94        u32     p_inc;
  95        u32     p_dec;
  96};
  97
  98/* struct cobalt_vars - contains codel and blue variables
  99 * @count:              codel dropping frequency
 100 * @rec_inv_sqrt:       reciprocal value of sqrt(count) >> 1
 101 * @drop_next:          time to drop next packet, or when we dropped last
 102 * @blue_timer:         Blue time to next drop
 103 * @p_drop:             BLUE drop probability (0.32 fxp)
 104 * @dropping:           set if in dropping state
 105 * @ecn_marked:         set if marked
 106 */
 107struct cobalt_vars {
 108        u32     count;
 109        u32     rec_inv_sqrt;
 110        ktime_t drop_next;
 111        ktime_t blue_timer;
 112        u32     p_drop;
 113        bool    dropping;
 114        bool    ecn_marked;
 115};
 116
 117enum {
 118        CAKE_SET_NONE = 0,
 119        CAKE_SET_SPARSE,
 120        CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
 121        CAKE_SET_BULK,
 122        CAKE_SET_DECAYING
 123};
 124
 125struct cake_flow {
 126        /* this stuff is all needed per-flow at dequeue time */
 127        struct sk_buff    *head;
 128        struct sk_buff    *tail;
 129        struct list_head  flowchain;
 130        s32               deficit;
 131        u32               dropped;
 132        struct cobalt_vars cvars;
 133        u16               srchost; /* index into cake_host table */
 134        u16               dsthost;
 135        u8                set;
 136}; /* please try to keep this structure <= 64 bytes */
 137
 138struct cake_host {
 139        u32 srchost_tag;
 140        u32 dsthost_tag;
 141        u16 srchost_bulk_flow_count;
 142        u16 dsthost_bulk_flow_count;
 143};
 144
 145struct cake_heap_entry {
 146        u16 t:3, b:10;
 147};
 148
 149struct cake_tin_data {
 150        struct cake_flow flows[CAKE_QUEUES];
 151        u32     backlogs[CAKE_QUEUES];
 152        u32     tags[CAKE_QUEUES]; /* for set association */
 153        u16     overflow_idx[CAKE_QUEUES];
 154        struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
 155        u16     flow_quantum;
 156
 157        struct cobalt_params cparams;
 158        u32     drop_overlimit;
 159        u16     bulk_flow_count;
 160        u16     sparse_flow_count;
 161        u16     decaying_flow_count;
 162        u16     unresponsive_flow_count;
 163
 164        u32     max_skblen;
 165
 166        struct list_head new_flows;
 167        struct list_head old_flows;
 168        struct list_head decaying_flows;
 169
 170        /* time_next = time_this + ((len * rate_ns) >> rate_shft) */
 171        ktime_t time_next_packet;
 172        u64     tin_rate_ns;
 173        u64     tin_rate_bps;
 174        u16     tin_rate_shft;
 175
 176        u16     tin_quantum;
 177        s32     tin_deficit;
 178        u32     tin_backlog;
 179        u32     tin_dropped;
 180        u32     tin_ecn_mark;
 181
 182        u32     packets;
 183        u64     bytes;
 184
 185        u32     ack_drops;
 186
 187        /* moving averages */
 188        u64 avge_delay;
 189        u64 peak_delay;
 190        u64 base_delay;
 191
 192        /* hash function stats */
 193        u32     way_directs;
 194        u32     way_hits;
 195        u32     way_misses;
 196        u32     way_collisions;
 197}; /* number of tins is small, so size of this struct doesn't matter much */
 198
 199struct cake_sched_data {
 200        struct tcf_proto __rcu *filter_list; /* optional external classifier */
 201        struct tcf_block *block;
 202        struct cake_tin_data *tins;
 203
 204        struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
 205        u16             overflow_timeout;
 206
 207        u16             tin_cnt;
 208        u8              tin_mode;
 209        u8              flow_mode;
 210        u8              ack_filter;
 211        u8              atm_mode;
 212
 213        u32             fwmark_mask;
 214        u16             fwmark_shft;
 215
 216        /* time_next = time_this + ((len * rate_ns) >> rate_shft) */
 217        u16             rate_shft;
 218        ktime_t         time_next_packet;
 219        ktime_t         failsafe_next_packet;
 220        u64             rate_ns;
 221        u64             rate_bps;
 222        u16             rate_flags;
 223        s16             rate_overhead;
 224        u16             rate_mpu;
 225        u64             interval;
 226        u64             target;
 227
 228        /* resource tracking */
 229        u32             buffer_used;
 230        u32             buffer_max_used;
 231        u32             buffer_limit;
 232        u32             buffer_config_limit;
 233
 234        /* indices for dequeue */
 235        u16             cur_tin;
 236        u16             cur_flow;
 237
 238        struct qdisc_watchdog watchdog;
 239        const u8        *tin_index;
 240        const u8        *tin_order;
 241
 242        /* bandwidth capacity estimate */
 243        ktime_t         last_packet_time;
 244        ktime_t         avg_window_begin;
 245        u64             avg_packet_interval;
 246        u64             avg_window_bytes;
 247        u64             avg_peak_bandwidth;
 248        ktime_t         last_reconfig_time;
 249
 250        /* packet length stats */
 251        u32             avg_netoff;
 252        u16             max_netlen;
 253        u16             max_adjlen;
 254        u16             min_netlen;
 255        u16             min_adjlen;
 256};
 257
 258enum {
 259        CAKE_FLAG_OVERHEAD         = BIT(0),
 260        CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
 261        CAKE_FLAG_INGRESS          = BIT(2),
 262        CAKE_FLAG_WASH             = BIT(3),
 263        CAKE_FLAG_SPLIT_GSO        = BIT(4)
 264};
 265
 266/* COBALT operates the Codel and BLUE algorithms in parallel, in order to
 267 * obtain the best features of each.  Codel is excellent on flows which
 268 * respond to congestion signals in a TCP-like way.  BLUE is more effective on
 269 * unresponsive flows.
 270 */
 271
 272struct cobalt_skb_cb {
 273        ktime_t enqueue_time;
 274        u32     adjusted_len;
 275};
 276
 277static u64 us_to_ns(u64 us)
 278{
 279        return us * NSEC_PER_USEC;
 280}
 281
 282static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
 283{
 284        qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
 285        return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
 286}
 287
 288static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
 289{
 290        return get_cobalt_cb(skb)->enqueue_time;
 291}
 292
 293static void cobalt_set_enqueue_time(struct sk_buff *skb,
 294                                    ktime_t now)
 295{
 296        get_cobalt_cb(skb)->enqueue_time = now;
 297}
 298
 299static u16 quantum_div[CAKE_QUEUES + 1] = {0};
 300
 301/* Diffserv lookup tables */
 302
 303static const u8 precedence[] = {
 304        0, 0, 0, 0, 0, 0, 0, 0,
 305        1, 1, 1, 1, 1, 1, 1, 1,
 306        2, 2, 2, 2, 2, 2, 2, 2,
 307        3, 3, 3, 3, 3, 3, 3, 3,
 308        4, 4, 4, 4, 4, 4, 4, 4,
 309        5, 5, 5, 5, 5, 5, 5, 5,
 310        6, 6, 6, 6, 6, 6, 6, 6,
 311        7, 7, 7, 7, 7, 7, 7, 7,
 312};
 313
 314static const u8 diffserv8[] = {
 315        2, 0, 1, 2, 4, 2, 2, 2,
 316        1, 2, 1, 2, 1, 2, 1, 2,
 317        5, 2, 4, 2, 4, 2, 4, 2,
 318        3, 2, 3, 2, 3, 2, 3, 2,
 319        6, 2, 3, 2, 3, 2, 3, 2,
 320        6, 2, 2, 2, 6, 2, 6, 2,
 321        7, 2, 2, 2, 2, 2, 2, 2,
 322        7, 2, 2, 2, 2, 2, 2, 2,
 323};
 324
 325static const u8 diffserv4[] = {
 326        0, 1, 0, 0, 2, 0, 0, 0,
 327        1, 0, 0, 0, 0, 0, 0, 0,
 328        2, 0, 2, 0, 2, 0, 2, 0,
 329        2, 0, 2, 0, 2, 0, 2, 0,
 330        3, 0, 2, 0, 2, 0, 2, 0,
 331        3, 0, 0, 0, 3, 0, 3, 0,
 332        3, 0, 0, 0, 0, 0, 0, 0,
 333        3, 0, 0, 0, 0, 0, 0, 0,
 334};
 335
 336static const u8 diffserv3[] = {
 337        0, 1, 0, 0, 2, 0, 0, 0,
 338        1, 0, 0, 0, 0, 0, 0, 0,
 339        0, 0, 0, 0, 0, 0, 0, 0,
 340        0, 0, 0, 0, 0, 0, 0, 0,
 341        0, 0, 0, 0, 0, 0, 0, 0,
 342        0, 0, 0, 0, 2, 0, 2, 0,
 343        2, 0, 0, 0, 0, 0, 0, 0,
 344        2, 0, 0, 0, 0, 0, 0, 0,
 345};
 346
 347static const u8 besteffort[] = {
 348        0, 0, 0, 0, 0, 0, 0, 0,
 349        0, 0, 0, 0, 0, 0, 0, 0,
 350        0, 0, 0, 0, 0, 0, 0, 0,
 351        0, 0, 0, 0, 0, 0, 0, 0,
 352        0, 0, 0, 0, 0, 0, 0, 0,
 353        0, 0, 0, 0, 0, 0, 0, 0,
 354        0, 0, 0, 0, 0, 0, 0, 0,
 355        0, 0, 0, 0, 0, 0, 0, 0,
 356};
 357
 358/* tin priority order for stats dumping */
 359
 360static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
 361static const u8 bulk_order[] = {1, 0, 2, 3};
 362
 363#define REC_INV_SQRT_CACHE (16)
 364static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0};
 365
 366/* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
 367 * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
 368 *
 369 * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
 370 */
 371
 372static void cobalt_newton_step(struct cobalt_vars *vars)
 373{
 374        u32 invsqrt, invsqrt2;
 375        u64 val;
 376
 377        invsqrt = vars->rec_inv_sqrt;
 378        invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
 379        val = (3LL << 32) - ((u64)vars->count * invsqrt2);
 380
 381        val >>= 2; /* avoid overflow in following multiply */
 382        val = (val * invsqrt) >> (32 - 2 + 1);
 383
 384        vars->rec_inv_sqrt = val;
 385}
 386
 387static void cobalt_invsqrt(struct cobalt_vars *vars)
 388{
 389        if (vars->count < REC_INV_SQRT_CACHE)
 390                vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
 391        else
 392                cobalt_newton_step(vars);
 393}
 394
 395/* There is a big difference in timing between the accurate values placed in
 396 * the cache and the approximations given by a single Newton step for small
 397 * count values, particularly when stepping from count 1 to 2 or vice versa.
 398 * Above 16, a single Newton step gives sufficient accuracy in either
 399 * direction, given the precision stored.
 400 *
 401 * The magnitude of the error when stepping up to count 2 is such as to give
 402 * the value that *should* have been produced at count 4.
 403 */
 404
 405static void cobalt_cache_init(void)
 406{
 407        struct cobalt_vars v;
 408
 409        memset(&v, 0, sizeof(v));
 410        v.rec_inv_sqrt = ~0U;
 411        cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt;
 412
 413        for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) {
 414                cobalt_newton_step(&v);
 415                cobalt_newton_step(&v);
 416                cobalt_newton_step(&v);
 417                cobalt_newton_step(&v);
 418
 419                cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt;
 420        }
 421}
 422
 423static void cobalt_vars_init(struct cobalt_vars *vars)
 424{
 425        memset(vars, 0, sizeof(*vars));
 426
 427        if (!cobalt_rec_inv_sqrt_cache[0]) {
 428                cobalt_cache_init();
 429                cobalt_rec_inv_sqrt_cache[0] = ~0;
 430        }
 431}
 432
 433/* CoDel control_law is t + interval/sqrt(count)
 434 * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
 435 * both sqrt() and divide operation.
 436 */
 437static ktime_t cobalt_control(ktime_t t,
 438                              u64 interval,
 439                              u32 rec_inv_sqrt)
 440{
 441        return ktime_add_ns(t, reciprocal_scale(interval,
 442                                                rec_inv_sqrt));
 443}
 444
 445/* Call this when a packet had to be dropped due to queue overflow.  Returns
 446 * true if the BLUE state was quiescent before but active after this call.
 447 */
 448static bool cobalt_queue_full(struct cobalt_vars *vars,
 449                              struct cobalt_params *p,
 450                              ktime_t now)
 451{
 452        bool up = false;
 453
 454        if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
 455                up = !vars->p_drop;
 456                vars->p_drop += p->p_inc;
 457                if (vars->p_drop < p->p_inc)
 458                        vars->p_drop = ~0;
 459                vars->blue_timer = now;
 460        }
 461        vars->dropping = true;
 462        vars->drop_next = now;
 463        if (!vars->count)
 464                vars->count = 1;
 465
 466        return up;
 467}
 468
 469/* Call this when the queue was serviced but turned out to be empty.  Returns
 470 * true if the BLUE state was active before but quiescent after this call.
 471 */
 472static bool cobalt_queue_empty(struct cobalt_vars *vars,
 473                               struct cobalt_params *p,
 474                               ktime_t now)
 475{
 476        bool down = false;
 477
 478        if (vars->p_drop &&
 479            ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
 480                if (vars->p_drop < p->p_dec)
 481                        vars->p_drop = 0;
 482                else
 483                        vars->p_drop -= p->p_dec;
 484                vars->blue_timer = now;
 485                down = !vars->p_drop;
 486        }
 487        vars->dropping = false;
 488
 489        if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
 490                vars->count--;
 491                cobalt_invsqrt(vars);
 492                vars->drop_next = cobalt_control(vars->drop_next,
 493                                                 p->interval,
 494                                                 vars->rec_inv_sqrt);
 495        }
 496
 497        return down;
 498}
 499
 500/* Call this with a freshly dequeued packet for possible congestion marking.
 501 * Returns true as an instruction to drop the packet, false for delivery.
 502 */
 503static bool cobalt_should_drop(struct cobalt_vars *vars,
 504                               struct cobalt_params *p,
 505                               ktime_t now,
 506                               struct sk_buff *skb,
 507                               u32 bulk_flows)
 508{
 509        bool next_due, over_target, drop = false;
 510        ktime_t schedule;
 511        u64 sojourn;
 512
 513/* The 'schedule' variable records, in its sign, whether 'now' is before or
 514 * after 'drop_next'.  This allows 'drop_next' to be updated before the next
 515 * scheduling decision is actually branched, without destroying that
 516 * information.  Similarly, the first 'schedule' value calculated is preserved
 517 * in the boolean 'next_due'.
 518 *
 519 * As for 'drop_next', we take advantage of the fact that 'interval' is both
 520 * the delay between first exceeding 'target' and the first signalling event,
 521 * *and* the scaling factor for the signalling frequency.  It's therefore very
 522 * natural to use a single mechanism for both purposes, and eliminates a
 523 * significant amount of reference Codel's spaghetti code.  To help with this,
 524 * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
 525 * as possible to 1.0 in fixed-point.
 526 */
 527
 528        sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
 529        schedule = ktime_sub(now, vars->drop_next);
 530        over_target = sojourn > p->target &&
 531                      sojourn > p->mtu_time * bulk_flows * 2 &&
 532                      sojourn > p->mtu_time * 4;
 533        next_due = vars->count && ktime_to_ns(schedule) >= 0;
 534
 535        vars->ecn_marked = false;
 536
 537        if (over_target) {
 538                if (!vars->dropping) {
 539                        vars->dropping = true;
 540                        vars->drop_next = cobalt_control(now,
 541                                                         p->interval,
 542                                                         vars->rec_inv_sqrt);
 543                }
 544                if (!vars->count)
 545                        vars->count = 1;
 546        } else if (vars->dropping) {
 547                vars->dropping = false;
 548        }
 549
 550        if (next_due && vars->dropping) {
 551                /* Use ECN mark if possible, otherwise drop */
 552                drop = !(vars->ecn_marked = INET_ECN_set_ce(skb));
 553
 554                vars->count++;
 555                if (!vars->count)
 556                        vars->count--;
 557                cobalt_invsqrt(vars);
 558                vars->drop_next = cobalt_control(vars->drop_next,
 559                                                 p->interval,
 560                                                 vars->rec_inv_sqrt);
 561                schedule = ktime_sub(now, vars->drop_next);
 562        } else {
 563                while (next_due) {
 564                        vars->count--;
 565                        cobalt_invsqrt(vars);
 566                        vars->drop_next = cobalt_control(vars->drop_next,
 567                                                         p->interval,
 568                                                         vars->rec_inv_sqrt);
 569                        schedule = ktime_sub(now, vars->drop_next);
 570                        next_due = vars->count && ktime_to_ns(schedule) >= 0;
 571                }
 572        }
 573
 574        /* Simple BLUE implementation.  Lack of ECN is deliberate. */
 575        if (vars->p_drop)
 576                drop |= (prandom_u32() < vars->p_drop);
 577
 578        /* Overload the drop_next field as an activity timeout */
 579        if (!vars->count)
 580                vars->drop_next = ktime_add_ns(now, p->interval);
 581        else if (ktime_to_ns(schedule) > 0 && !drop)
 582                vars->drop_next = now;
 583
 584        return drop;
 585}
 586
 587static bool cake_update_flowkeys(struct flow_keys *keys,
 588                                 const struct sk_buff *skb)
 589{
 590#if IS_ENABLED(CONFIG_NF_CONNTRACK)
 591        struct nf_conntrack_tuple tuple = {};
 592        bool rev = !skb->_nfct, upd = false;
 593        __be32 ip;
 594
 595        if (skb_protocol(skb, true) != htons(ETH_P_IP))
 596                return false;
 597
 598        if (!nf_ct_get_tuple_skb(&tuple, skb))
 599                return false;
 600
 601        ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
 602        if (ip != keys->addrs.v4addrs.src) {
 603                keys->addrs.v4addrs.src = ip;
 604                upd = true;
 605        }
 606        ip = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
 607        if (ip != keys->addrs.v4addrs.dst) {
 608                keys->addrs.v4addrs.dst = ip;
 609                upd = true;
 610        }
 611
 612        if (keys->ports.ports) {
 613                __be16 port;
 614
 615                port = rev ? tuple.dst.u.all : tuple.src.u.all;
 616                if (port != keys->ports.src) {
 617                        keys->ports.src = port;
 618                        upd = true;
 619                }
 620                port = rev ? tuple.src.u.all : tuple.dst.u.all;
 621                if (port != keys->ports.dst) {
 622                        port = keys->ports.dst;
 623                        upd = true;
 624                }
 625        }
 626        return upd;
 627#else
 628        return false;
 629#endif
 630}
 631
 632/* Cake has several subtle multiple bit settings. In these cases you
 633 *  would be matching triple isolate mode as well.
 634 */
 635
 636static bool cake_dsrc(int flow_mode)
 637{
 638        return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
 639}
 640
 641static bool cake_ddst(int flow_mode)
 642{
 643        return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
 644}
 645
 646static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
 647                     int flow_mode, u16 flow_override, u16 host_override)
 648{
 649        bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS));
 650        bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS));
 651        bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG);
 652        u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0;
 653        u16 reduced_hash, srchost_idx, dsthost_idx;
 654        struct flow_keys keys, host_keys;
 655        bool use_skbhash = skb->l4_hash;
 656
 657        if (unlikely(flow_mode == CAKE_FLOW_NONE))
 658                return 0;
 659
 660        /* If both overrides are set, or we can use the SKB hash and nat mode is
 661         * disabled, we can skip packet dissection entirely. If nat mode is
 662         * enabled there's another check below after doing the conntrack lookup.
 663         */
 664        if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts)
 665                goto skip_hash;
 666
 667        skb_flow_dissect_flow_keys(skb, &keys,
 668                                   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
 669
 670        /* Don't use the SKB hash if we change the lookup keys from conntrack */
 671        if (nat_enabled && cake_update_flowkeys(&keys, skb))
 672                use_skbhash = false;
 673
 674        /* If we can still use the SKB hash and don't need the host hash, we can
 675         * skip the rest of the hashing procedure
 676         */
 677        if (use_skbhash && !hash_hosts)
 678                goto skip_hash;
 679
 680        /* flow_hash_from_keys() sorts the addresses by value, so we have
 681         * to preserve their order in a separate data structure to treat
 682         * src and dst host addresses as independently selectable.
 683         */
 684        host_keys = keys;
 685        host_keys.ports.ports     = 0;
 686        host_keys.basic.ip_proto  = 0;
 687        host_keys.keyid.keyid     = 0;
 688        host_keys.tags.flow_label = 0;
 689
 690        switch (host_keys.control.addr_type) {
 691        case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
 692                host_keys.addrs.v4addrs.src = 0;
 693                dsthost_hash = flow_hash_from_keys(&host_keys);
 694                host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
 695                host_keys.addrs.v4addrs.dst = 0;
 696                srchost_hash = flow_hash_from_keys(&host_keys);
 697                break;
 698
 699        case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
 700                memset(&host_keys.addrs.v6addrs.src, 0,
 701                       sizeof(host_keys.addrs.v6addrs.src));
 702                dsthost_hash = flow_hash_from_keys(&host_keys);
 703                host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
 704                memset(&host_keys.addrs.v6addrs.dst, 0,
 705                       sizeof(host_keys.addrs.v6addrs.dst));
 706                srchost_hash = flow_hash_from_keys(&host_keys);
 707                break;
 708
 709        default:
 710                dsthost_hash = 0;
 711                srchost_hash = 0;
 712        }
 713
 714        /* This *must* be after the above switch, since as a
 715         * side-effect it sorts the src and dst addresses.
 716         */
 717        if (hash_flows && !use_skbhash)
 718                flow_hash = flow_hash_from_keys(&keys);
 719
 720skip_hash:
 721        if (flow_override)
 722                flow_hash = flow_override - 1;
 723        else if (use_skbhash && (flow_mode & CAKE_FLOW_FLOWS))
 724                flow_hash = skb->hash;
 725        if (host_override) {
 726                dsthost_hash = host_override - 1;
 727                srchost_hash = host_override - 1;
 728        }
 729
 730        if (!(flow_mode & CAKE_FLOW_FLOWS)) {
 731                if (flow_mode & CAKE_FLOW_SRC_IP)
 732                        flow_hash ^= srchost_hash;
 733
 734                if (flow_mode & CAKE_FLOW_DST_IP)
 735                        flow_hash ^= dsthost_hash;
 736        }
 737
 738        reduced_hash = flow_hash % CAKE_QUEUES;
 739
 740        /* set-associative hashing */
 741        /* fast path if no hash collision (direct lookup succeeds) */
 742        if (likely(q->tags[reduced_hash] == flow_hash &&
 743                   q->flows[reduced_hash].set)) {
 744                q->way_directs++;
 745        } else {
 746                u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
 747                u32 outer_hash = reduced_hash - inner_hash;
 748                bool allocate_src = false;
 749                bool allocate_dst = false;
 750                u32 i, k;
 751
 752                /* check if any active queue in the set is reserved for
 753                 * this flow.
 754                 */
 755                for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
 756                     i++, k = (k + 1) % CAKE_SET_WAYS) {
 757                        if (q->tags[outer_hash + k] == flow_hash) {
 758                                if (i)
 759                                        q->way_hits++;
 760
 761                                if (!q->flows[outer_hash + k].set) {
 762                                        /* need to increment host refcnts */
 763                                        allocate_src = cake_dsrc(flow_mode);
 764                                        allocate_dst = cake_ddst(flow_mode);
 765                                }
 766
 767                                goto found;
 768                        }
 769                }
 770
 771                /* no queue is reserved for this flow, look for an
 772                 * empty one.
 773                 */
 774                for (i = 0; i < CAKE_SET_WAYS;
 775                         i++, k = (k + 1) % CAKE_SET_WAYS) {
 776                        if (!q->flows[outer_hash + k].set) {
 777                                q->way_misses++;
 778                                allocate_src = cake_dsrc(flow_mode);
 779                                allocate_dst = cake_ddst(flow_mode);
 780                                goto found;
 781                        }
 782                }
 783
 784                /* With no empty queues, default to the original
 785                 * queue, accept the collision, update the host tags.
 786                 */
 787                q->way_collisions++;
 788                if (q->flows[outer_hash + k].set == CAKE_SET_BULK) {
 789                        q->hosts[q->flows[reduced_hash].srchost].srchost_bulk_flow_count--;
 790                        q->hosts[q->flows[reduced_hash].dsthost].dsthost_bulk_flow_count--;
 791                }
 792                allocate_src = cake_dsrc(flow_mode);
 793                allocate_dst = cake_ddst(flow_mode);
 794found:
 795                /* reserve queue for future packets in same flow */
 796                reduced_hash = outer_hash + k;
 797                q->tags[reduced_hash] = flow_hash;
 798
 799                if (allocate_src) {
 800                        srchost_idx = srchost_hash % CAKE_QUEUES;
 801                        inner_hash = srchost_idx % CAKE_SET_WAYS;
 802                        outer_hash = srchost_idx - inner_hash;
 803                        for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
 804                                i++, k = (k + 1) % CAKE_SET_WAYS) {
 805                                if (q->hosts[outer_hash + k].srchost_tag ==
 806                                    srchost_hash)
 807                                        goto found_src;
 808                        }
 809                        for (i = 0; i < CAKE_SET_WAYS;
 810                                i++, k = (k + 1) % CAKE_SET_WAYS) {
 811                                if (!q->hosts[outer_hash + k].srchost_bulk_flow_count)
 812                                        break;
 813                        }
 814                        q->hosts[outer_hash + k].srchost_tag = srchost_hash;
 815found_src:
 816                        srchost_idx = outer_hash + k;
 817                        if (q->flows[reduced_hash].set == CAKE_SET_BULK)
 818                                q->hosts[srchost_idx].srchost_bulk_flow_count++;
 819                        q->flows[reduced_hash].srchost = srchost_idx;
 820                }
 821
 822                if (allocate_dst) {
 823                        dsthost_idx = dsthost_hash % CAKE_QUEUES;
 824                        inner_hash = dsthost_idx % CAKE_SET_WAYS;
 825                        outer_hash = dsthost_idx - inner_hash;
 826                        for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
 827                             i++, k = (k + 1) % CAKE_SET_WAYS) {
 828                                if (q->hosts[outer_hash + k].dsthost_tag ==
 829                                    dsthost_hash)
 830                                        goto found_dst;
 831                        }
 832                        for (i = 0; i < CAKE_SET_WAYS;
 833                             i++, k = (k + 1) % CAKE_SET_WAYS) {
 834                                if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count)
 835                                        break;
 836                        }
 837                        q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
 838found_dst:
 839                        dsthost_idx = outer_hash + k;
 840                        if (q->flows[reduced_hash].set == CAKE_SET_BULK)
 841                                q->hosts[dsthost_idx].dsthost_bulk_flow_count++;
 842                        q->flows[reduced_hash].dsthost = dsthost_idx;
 843                }
 844        }
 845
 846        return reduced_hash;
 847}
 848
 849/* helper functions : might be changed when/if skb use a standard list_head */
 850/* remove one skb from head of slot queue */
 851
 852static struct sk_buff *dequeue_head(struct cake_flow *flow)
 853{
 854        struct sk_buff *skb = flow->head;
 855
 856        if (skb) {
 857                flow->head = skb->next;
 858                skb_mark_not_on_list(skb);
 859        }
 860
 861        return skb;
 862}
 863
 864/* add skb to flow queue (tail add) */
 865
 866static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
 867{
 868        if (!flow->head)
 869                flow->head = skb;
 870        else
 871                flow->tail->next = skb;
 872        flow->tail = skb;
 873        skb->next = NULL;
 874}
 875
 876static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
 877                                    struct ipv6hdr *buf)
 878{
 879        unsigned int offset = skb_network_offset(skb);
 880        struct iphdr *iph;
 881
 882        iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);
 883
 884        if (!iph)
 885                return NULL;
 886
 887        if (iph->version == 4 && iph->protocol == IPPROTO_IPV6)
 888                return skb_header_pointer(skb, offset + iph->ihl * 4,
 889                                          sizeof(struct ipv6hdr), buf);
 890
 891        else if (iph->version == 4)
 892                return iph;
 893
 894        else if (iph->version == 6)
 895                return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
 896                                          buf);
 897
 898        return NULL;
 899}
 900
 901static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
 902                                      void *buf, unsigned int bufsize)
 903{
 904        unsigned int offset = skb_network_offset(skb);
 905        const struct ipv6hdr *ipv6h;
 906        const struct tcphdr *tcph;
 907        const struct iphdr *iph;
 908        struct ipv6hdr _ipv6h;
 909        struct tcphdr _tcph;
 910
 911        ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
 912
 913        if (!ipv6h)
 914                return NULL;
 915
 916        if (ipv6h->version == 4) {
 917                iph = (struct iphdr *)ipv6h;
 918                offset += iph->ihl * 4;
 919
 920                /* special-case 6in4 tunnelling, as that is a common way to get
 921                 * v6 connectivity in the home
 922                 */
 923                if (iph->protocol == IPPROTO_IPV6) {
 924                        ipv6h = skb_header_pointer(skb, offset,
 925                                                   sizeof(_ipv6h), &_ipv6h);
 926
 927                        if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
 928                                return NULL;
 929
 930                        offset += sizeof(struct ipv6hdr);
 931
 932                } else if (iph->protocol != IPPROTO_TCP) {
 933                        return NULL;
 934                }
 935
 936        } else if (ipv6h->version == 6) {
 937                if (ipv6h->nexthdr != IPPROTO_TCP)
 938                        return NULL;
 939
 940                offset += sizeof(struct ipv6hdr);
 941        } else {
 942                return NULL;
 943        }
 944
 945        tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
 946        if (!tcph || tcph->doff < 5)
 947                return NULL;
 948
 949        return skb_header_pointer(skb, offset,
 950                                  min(__tcp_hdrlen(tcph), bufsize), buf);
 951}
 952
 953static const void *cake_get_tcpopt(const struct tcphdr *tcph,
 954                                   int code, int *oplen)
 955{
 956        /* inspired by tcp_parse_options in tcp_input.c */
 957        int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
 958        const u8 *ptr = (const u8 *)(tcph + 1);
 959
 960        while (length > 0) {
 961                int opcode = *ptr++;
 962                int opsize;
 963
 964                if (opcode == TCPOPT_EOL)
 965                        break;
 966                if (opcode == TCPOPT_NOP) {
 967                        length--;
 968                        continue;
 969                }
 970                if (length < 2)
 971                        break;
 972                opsize = *ptr++;
 973                if (opsize < 2 || opsize > length)
 974                        break;
 975
 976                if (opcode == code) {
 977                        *oplen = opsize;
 978                        return ptr;
 979                }
 980
 981                ptr += opsize - 2;
 982                length -= opsize;
 983        }
 984
 985        return NULL;
 986}
 987
 988/* Compare two SACK sequences. A sequence is considered greater if it SACKs more
 989 * bytes than the other. In the case where both sequences ACKs bytes that the
 990 * other doesn't, A is considered greater. DSACKs in A also makes A be
 991 * considered greater.
 992 *
 993 * @return -1, 0 or 1 as normal compare functions
 994 */
 995static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
 996                                  const struct tcphdr *tcph_b)
 997{
 998        const struct tcp_sack_block_wire *sack_a, *sack_b;
 999        u32 ack_seq_a = ntohl(tcph_a->ack_seq);
1000        u32 bytes_a = 0, bytes_b = 0;

1001        int oplen_a, oplen_b;
1002        bool first = true;
1003
1004        sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
1005        sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);
1006
1007        /* pointers point to option contents */
1008        oplen_a -= TCPOLEN_SACK_BASE;
1009        oplen_b -= TCPOLEN_SACK_BASE;
1010
1011        if (sack_a && oplen_a >= sizeof(*sack_a) &&
1012            (!sack_b || oplen_b < sizeof(*sack_b)))
1013                return -1;
1014        else if (sack_b && oplen_b >= sizeof(*sack_b) &&
1015                 (!sack_a || oplen_a < sizeof(*sack_a)))
1016                return 1;
1017        else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
1018                 (!sack_b || oplen_b < sizeof(*sack_b)))
1019                return 0;
1020
1021        while (oplen_a >= sizeof(*sack_a)) {
1022                const struct tcp_sack_block_wire *sack_tmp = sack_b;
1023                u32 start_a = get_unaligned_be32(&sack_a->start_seq);
1024                u32 end_a = get_unaligned_be32(&sack_a->end_seq);
1025                int oplen_tmp = oplen_b;
1026                bool found = false;
1027
1028                /* DSACK; always considered greater to prevent dropping */
1029                if (before(start_a, ack_seq_a))
1030                        return -1;
1031
1032                bytes_a += end_a - start_a;
1033
1034                while (oplen_tmp >= sizeof(*sack_tmp)) {
1035                        u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
1036                        u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);
1037
1038                        /* first time through we count the total size */
1039                        if (first)
1040                                bytes_b += end_b - start_b;
1041
1042                        if (!after(start_b, start_a) && !before(end_b, end_a)) {
1043                                found = true;
1044                                if (!first)
1045                                        break;
1046                        }
1047                        oplen_tmp -= sizeof(*sack_tmp);
1048                        sack_tmp++;
1049                }
1050
1051                if (!found)
1052                        return -1;
1053
1054                oplen_a -= sizeof(*sack_a);
1055                sack_a++;
1056                first = false;
1057        }
1058
1059        /* If we made it this far, all ranges SACKed by A are covered by B, so
1060         * either the SACKs are equal, or B SACKs more bytes.
1061         */
1062        return bytes_b > bytes_a ? 1 : 0;
1063}
1064
1065static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
1066                                 u32 *tsval, u32 *tsecr)
1067{
1068        const u8 *ptr;
1069        int opsize;
1070
1071        ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);
1072
1073        if (ptr && opsize == TCPOLEN_TIMESTAMP) {
1074                *tsval = get_unaligned_be32(ptr);
1075                *tsecr = get_unaligned_be32(ptr + 4);
1076        }
1077}
1078
1079static bool cake_tcph_may_drop(const struct tcphdr *tcph,
1080                               u32 tstamp_new, u32 tsecr_new)
1081{
1082        /* inspired by tcp_parse_options in tcp_input.c */
1083        int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1084        const u8 *ptr = (const u8 *)(tcph + 1);
1085        u32 tstamp, tsecr;
1086
1087        /* 3 reserved flags must be unset to avoid future breakage
1088         * ACK must be set
1089         * ECE/CWR are handled separately
1090         * All other flags URG/PSH/RST/SYN/FIN must be unset
1091         * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
1092         * 0x00C00000 = CWR/ECE (handled separately)
1093         * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
1094         */
1095        if (((tcp_flag_word(tcph) &
1096              cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
1097                return false;
1098
1099        while (length > 0) {
1100                int opcode = *ptr++;
1101                int opsize;
1102
1103                if (opcode == TCPOPT_EOL)
1104                        break;
1105                if (opcode == TCPOPT_NOP) {
1106                        length--;
1107                        continue;
1108                }
1109                if (length < 2)
1110                        break;
1111                opsize = *ptr++;
1112                if (opsize < 2 || opsize > length)
1113                        break;
1114
1115                switch (opcode) {
1116                case TCPOPT_MD5SIG: /* doesn't influence state */
1117                        break;
1118
1119                case TCPOPT_SACK: /* stricter checking performed later */
1120                        if (opsize % 8 != 2)
1121                                return false;
1122                        break;
1123
1124                case TCPOPT_TIMESTAMP:
1125                        /* only drop timestamps lower than new */
1126                        if (opsize != TCPOLEN_TIMESTAMP)
1127                                return false;
1128                        tstamp = get_unaligned_be32(ptr);
1129                        tsecr = get_unaligned_be32(ptr + 4);
1130                        if (after(tstamp, tstamp_new) ||
1131                            after(tsecr, tsecr_new))
1132                                return false;
1133                        break;
1134
1135                case TCPOPT_MSS:  /* these should only be set on SYN */
1136                case TCPOPT_WINDOW:
1137                case TCPOPT_SACK_PERM:
1138                case TCPOPT_FASTOPEN:
1139                case TCPOPT_EXP:
1140                default: /* don't drop if any unknown options are present */
1141                        return false;
1142                }
1143
1144                ptr += opsize - 2;
1145                length -= opsize;
1146        }
1147
1148        return true;
1149}
1150
1151static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
1152                                       struct cake_flow *flow)
1153{
1154        bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE;
1155        struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
1156        struct sk_buff *skb_check, *skb_prev = NULL;
1157        const struct ipv6hdr *ipv6h, *ipv6h_check;
1158        unsigned char _tcph[64], _tcph_check[64];
1159        const struct tcphdr *tcph, *tcph_check;
1160        const struct iphdr *iph, *iph_check;
1161        struct ipv6hdr _iph, _iph_check;
1162        const struct sk_buff *skb;
1163        int seglen, num_found = 0;
1164        u32 tstamp = 0, tsecr = 0;
1165        __be32 elig_flags = 0;
1166        int sack_comp;
1167
1168        /* no other possible ACKs to filter */
1169        if (flow->head == flow->tail)
1170                return NULL;
1171
1172        skb = flow->tail;
1173        tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
1174        iph = cake_get_iphdr(skb, &_iph);
1175        if (!tcph)
1176                return NULL;
1177
1178        cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);
1179
1180        /* the 'triggering' packet need only have the ACK flag set.
1181         * also check that SYN is not set, as there won't be any previous ACKs.
1182         */
1183        if ((tcp_flag_word(tcph) &
1184             (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
1185                return NULL;
1186
1187        /* the 'triggering' ACK is at the tail of the queue, we have already
1188         * returned if it is the only packet in the flow. loop through the rest
1189         * of the queue looking for pure ACKs with the same 5-tuple as the
1190         * triggering one.
1191         */
1192        for (skb_check = flow->head;
1193             skb_check && skb_check != skb;
1194             skb_prev = skb_check, skb_check = skb_check->next) {
1195                iph_check = cake_get_iphdr(skb_check, &_iph_check);
1196                tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
1197                                             sizeof(_tcph_check));
1198
1199                /* only TCP packets with matching 5-tuple are eligible, and only
1200                 * drop safe headers
1201                 */
1202                if (!tcph_check || iph->version != iph_check->version ||
1203                    tcph_check->source != tcph->source ||
1204                    tcph_check->dest != tcph->dest)
1205                        continue;
1206
1207                if (iph_check->version == 4) {
1208                        if (iph_check->saddr != iph->saddr ||
1209                            iph_check->daddr != iph->daddr)
1210                                continue;
1211
1212                        seglen = ntohs(iph_check->tot_len) -
1213                                       (4 * iph_check->ihl);
1214                } else if (iph_check->version == 6) {
1215                        ipv6h = (struct ipv6hdr *)iph;
1216                        ipv6h_check = (struct ipv6hdr *)iph_check;
1217
1218                        if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
1219                            ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
1220                                continue;
1221
1222                        seglen = ntohs(ipv6h_check->payload_len);
1223                } else {
1224                        WARN_ON(1);  /* shouldn't happen */
1225                        continue;
1226                }
1227
1228                /* If the ECE/CWR flags changed from the previous eligible
1229                 * packet in the same flow, we should no longer be dropping that
1230                 * previous packet as this would lose information.
1231                 */
1232                if (elig_ack && (tcp_flag_word(tcph_check) &
1233                                 (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
1234                        elig_ack = NULL;
1235                        elig_ack_prev = NULL;
1236                        num_found--;
1237                }
1238
1239                /* Check TCP options and flags, don't drop ACKs with segment
1240                 * data, and don't drop ACKs with a higher cumulative ACK
1241                 * counter than the triggering packet. Check ACK seqno here to
1242                 * avoid parsing SACK options of packets we are going to exclude
1243                 * anyway.
1244                 */
1245                if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
1246                    (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
1247                    after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
1248                        continue;
1249
1250                /* Check SACK options. The triggering packet must SACK more data
1251                 * than the ACK under consideration, or SACK the same range but
1252                 * have a larger cumulative ACK counter. The latter is a
1253                 * pathological case, but is contained in the following check
1254                 * anyway, just to be safe.
1255                 */
1256                sack_comp = cake_tcph_sack_compare(tcph_check, tcph);
1257
1258                if (sack_comp < 0 ||
1259                    (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
1260                     sack_comp == 0))
1261                        continue;
1262
1263                /* At this point we have found an eligible pure ACK to drop; if
1264                 * we are in aggressive mode, we are done. Otherwise, keep
1265                 * searching unless this is the second eligible ACK we
1266                 * found.
1267                 *
1268                 * Since we want to drop ACK closest to the head of the queue,
1269                 * save the first eligible ACK we find, even if we need to loop
1270                 * again.
1271                 */
1272                if (!elig_ack) {
1273                        elig_ack = skb_check;
1274                        elig_ack_prev = skb_prev;
1275                        elig_flags = (tcp_flag_word(tcph_check)
1276                                      & (TCP_FLAG_ECE | TCP_FLAG_CWR));
1277                }
1278
1279                if (num_found++ > 0)
1280                        goto found;
1281        }
1282
1283        /* We made it through the queue without finding two eligible ACKs . If
1284         * we found a single eligible ACK we can drop it in aggressive mode if
1285         * we can guarantee that this does not interfere with ECN flag
1286         * information. We ensure this by dropping it only if the enqueued
1287         * packet is consecutive with the eligible ACK, and their flags match.
1288         */
1289        if (elig_ack && aggressive && elig_ack->next == skb &&
1290            (elig_flags == (tcp_flag_word(tcph) &
1291                            (TCP_FLAG_ECE | TCP_FLAG_CWR))))
1292                goto found;
1293
1294        return NULL;
1295
1296found:
1297        if (elig_ack_prev)
1298                elig_ack_prev->next = elig_ack->next;
1299        else
1300                flow->head = elig_ack->next;
1301
1302        skb_mark_not_on_list(elig_ack);
1303
1304        return elig_ack;
1305}
1306
1307static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
1308{
1309        avg -= avg >> shift;
1310        avg += sample >> shift;
1311        return avg;
1312}
1313
1314static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off)
1315{
1316        if (q->rate_flags & CAKE_FLAG_OVERHEAD)
1317                len -= off;
1318
1319        if (q->max_netlen < len)
1320                q->max_netlen = len;
1321        if (q->min_netlen > len)
1322                q->min_netlen = len;
1323
1324        len += q->rate_overhead;
1325
1326        if (len < q->rate_mpu)
1327                len = q->rate_mpu;
1328
1329        if (q->atm_mode == CAKE_ATM_ATM) {
1330                len += 47;
1331                len /= 48;
1332                len *= 53;
1333        } else if (q->atm_mode == CAKE_ATM_PTM) {
1334                /* Add one byte per 64 bytes or part thereof.
1335                 * This is conservative and easier to calculate than the
1336                 * precise value.
1337                 */
1338                len += (len + 63) / 64;
1339        }
1340
1341        if (q->max_adjlen < len)
1342                q->max_adjlen = len;
1343        if (q->min_adjlen > len)
1344                q->min_adjlen = len;
1345
1346        return len;
1347}
1348
1349static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
1350{
1351        const struct skb_shared_info *shinfo = skb_shinfo(skb);
1352        unsigned int hdr_len, last_len = 0;
1353        u32 off = skb_network_offset(skb);
1354        u32 len = qdisc_pkt_len(skb);
1355        u16 segs = 1;
1356
1357        q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
1358
1359        if (!shinfo->gso_size)
1360                return cake_calc_overhead(q, len, off);
1361
1362        /* borrowed from qdisc_pkt_len_init() */
1363        hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
1364
1365        /* + transport layer */
1366        if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
1367                                                SKB_GSO_TCPV6))) {
1368                const struct tcphdr *th;
1369                struct tcphdr _tcphdr;
1370
1371                th = skb_header_pointer(skb, skb_transport_offset(skb),
1372                                        sizeof(_tcphdr), &_tcphdr);
1373                if (likely(th))
1374                        hdr_len += __tcp_hdrlen(th);
1375        } else {
1376                struct udphdr _udphdr;
1377
1378                if (skb_header_pointer(skb, skb_transport_offset(skb),
1379                                       sizeof(_udphdr), &_udphdr))
1380                        hdr_len += sizeof(struct udphdr);
1381        }
1382
1383        if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
1384                segs = DIV_ROUND_UP(skb->len - hdr_len,
1385                                    shinfo->gso_size);
1386        else
1387                segs = shinfo->gso_segs;
1388
1389        len = shinfo->gso_size + hdr_len;
1390        last_len = skb->len - shinfo->gso_size * (segs - 1);
1391
1392        return (cake_calc_overhead(q, len, off) * (segs - 1) +
1393                cake_calc_overhead(q, last_len, off));
1394}
1395
1396static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
1397{
1398        struct cake_heap_entry ii = q->overflow_heap[i];
1399        struct cake_heap_entry jj = q->overflow_heap[j];
1400
1401        q->overflow_heap[i] = jj;
1402        q->overflow_heap[j] = ii;
1403
1404        q->tins[ii.t].overflow_idx[ii.b] = j;
1405        q->tins[jj.t].overflow_idx[jj.b] = i;
1406}
1407
1408static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
1409{
1410        struct cake_heap_entry ii = q->overflow_heap[i];
1411
1412        return q->tins[ii.t].backlogs[ii.b];
1413}
1414
1415static void cake_heapify(struct cake_sched_data *q, u16 i)
1416{
1417        static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
1418        u32 mb = cake_heap_get_backlog(q, i);
1419        u32 m = i;
1420
1421        while (m < a) {
1422                u32 l = m + m + 1;
1423                u32 r = l + 1;
1424
1425                if (l < a) {
1426                        u32 lb = cake_heap_get_backlog(q, l);
1427
1428                        if (lb > mb) {
1429                                m  = l;
1430                                mb = lb;
1431                        }
1432                }
1433
1434                if (r < a) {
1435                        u32 rb = cake_heap_get_backlog(q, r);
1436
1437                        if (rb > mb) {
1438                                m  = r;
1439                                mb = rb;
1440                        }
1441                }
1442
1443                if (m != i) {
1444                        cake_heap_swap(q, i, m);
1445                        i = m;
1446                } else {
1447                        break;
1448                }
1449        }
1450}
1451
1452static void cake_heapify_up(struct cake_sched_data *q, u16 i)
1453{
1454        while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
1455                u16 p = (i - 1) >> 1;
1456                u32 ib = cake_heap_get_backlog(q, i);
1457                u32 pb = cake_heap_get_backlog(q, p);
1458
1459                if (ib > pb) {
1460                        cake_heap_swap(q, i, p);
1461                        i = p;
1462                } else {
1463                        break;
1464                }
1465        }
1466}
1467
1468static int cake_advance_shaper(struct cake_sched_data *q,
1469                               struct cake_tin_data *b,
1470                               struct sk_buff *skb,
1471                               ktime_t now, bool drop)
1472{
1473        u32 len = get_cobalt_cb(skb)->adjusted_len;
1474
1475        /* charge packet bandwidth to this tin
1476         * and to the global shaper.
1477         */
1478        if (q->rate_ns) {
1479                u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
1480                u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
1481                u64 failsafe_dur = global_dur + (global_dur >> 1);
1482
1483                if (ktime_before(b->time_next_packet, now))
1484                        b->time_next_packet = ktime_add_ns(b->time_next_packet,
1485                                                           tin_dur);
1486
1487                else if (ktime_before(b->time_next_packet,
1488                                      ktime_add_ns(now, tin_dur)))
1489                        b->time_next_packet = ktime_add_ns(now, tin_dur);
1490
1491                q->time_next_packet = ktime_add_ns(q->time_next_packet,
1492                                                   global_dur);
1493                if (!drop)
1494                        q->failsafe_next_packet = \
1495                                ktime_add_ns(q->failsafe_next_packet,
1496                                             failsafe_dur);
1497        }
1498        return len;
1499}
1500
1501static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
1502{
1503        struct cake_sched_data *q = qdisc_priv(sch);
1504        ktime_t now = ktime_get();
1505        u32 idx = 0, tin = 0, len;
1506        struct cake_heap_entry qq;
1507        struct cake_tin_data *b;
1508        struct cake_flow *flow;
1509        struct sk_buff *skb;
1510
1511        if (!q->overflow_timeout) {
1512                int i;
1513                /* Build fresh max-heap */
1514                for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--)
1515                        cake_heapify(q, i);
1516        }
1517        q->overflow_timeout = 65535;
1518
1519        /* select longest queue for pruning */
1520        qq  = q->overflow_heap[0];
1521        tin = qq.t;
1522        idx = qq.b;
1523
1524        b = &q->tins[tin];
1525        flow = &b->flows[idx];
1526        skb = dequeue_head(flow);
1527        if (unlikely(!skb)) {
1528                /* heap has gone wrong, rebuild it next time */
1529                q->overflow_timeout = 0;
1530                return idx + (tin << 16);
1531        }
1532
1533        if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
1534                b->unresponsive_flow_count++;
1535
1536        len = qdisc_pkt_len(skb);
1537        q->buffer_used      -= skb->truesize;
1538        b->backlogs[idx]    -= len;
1539        b->tin_backlog      -= len;
1540        sch->qstats.backlog -= len;
1541        qdisc_tree_reduce_backlog(sch, 1, len);
1542
1543        flow->dropped++;
1544        b->tin_dropped++;
1545        sch->qstats.drops++;
1546
1547        if (q->rate_flags & CAKE_FLAG_INGRESS)
1548                cake_advance_shaper(q, b, skb, now, true);
1549
1550        __qdisc_drop(skb, to_free);
1551        sch->q.qlen--;
1552
1553        cake_heapify(q, 0);
1554
1555        return idx + (tin << 16);
1556}
1557
1558static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash)
1559{
1560        const int offset = skb_network_offset(skb);
1561        u16 *buf, buf_;
1562        u8 dscp;
1563
1564        switch (skb_protocol(skb, true)) {
1565        case htons(ETH_P_IP):
1566                buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1567                if (unlikely(!buf))
1568                        return 0;
1569
1570                /* ToS is in the second byte of iphdr */
1571                dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2;
1572
1573                if (wash && dscp) {
1574                        const int wlen = offset + sizeof(struct iphdr);
1575
1576                        if (!pskb_may_pull(skb, wlen) ||
1577                            skb_try_make_writable(skb, wlen))
1578                                return 0;
1579
1580                        ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1581                }
1582
1583                return dscp;
1584
1585        case htons(ETH_P_IPV6):
1586                buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1587                if (unlikely(!buf))
1588                        return 0;
1589
1590                /* Traffic class is in the first and second bytes of ipv6hdr */
1591                dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2;
1592
1593                if (wash && dscp) {
1594                        const int wlen = offset + sizeof(struct ipv6hdr);
1595
1596                        if (!pskb_may_pull(skb, wlen) ||
1597                            skb_try_make_writable(skb, wlen))
1598                                return 0;
1599
1600                        ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1601                }
1602
1603                return dscp;
1604
1605        case htons(ETH_P_ARP):
1606                return 0x38;  /* CS7 - Net Control */
1607
1608        default:
1609                /* If there is no Diffserv field, treat as best-effort */
1610                return 0;
1611        }
1612}
1613
1614static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
1615                                             struct sk_buff *skb)
1616{
1617        struct cake_sched_data *q = qdisc_priv(sch);
1618        u32 tin, mark;
1619        bool wash;
1620        u8 dscp;
1621
1622        /* Tin selection: Default to diffserv-based selection, allow overriding
1623         * using firewall marks or skb->priority. Call DSCP parsing early if
1624         * wash is enabled, otherwise defer to below to skip unneeded parsing.
1625         */
1626        mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft;
1627        wash = !!(q->rate_flags & CAKE_FLAG_WASH);
1628        if (wash)
1629                dscp = cake_handle_diffserv(skb, wash);
1630
1631        if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT)
1632                tin = 0;
1633
1634        else if (mark && mark <= q->tin_cnt)
1635                tin = q->tin_order[mark - 1];
1636
1637        else if (TC_H_MAJ(skb->priority) == sch->handle &&
1638                 TC_H_MIN(skb->priority) > 0 &&
1639                 TC_H_MIN(skb->priority) <= q->tin_cnt)
1640                tin = q->tin_order[TC_H_MIN(skb->priority) - 1];
1641
1642        else {
1643                if (!wash)
1644                        dscp = cake_handle_diffserv(skb, wash);
1645                tin = q->tin_index[dscp];
1646
1647                if (unlikely(tin >= q->tin_cnt))
1648                        tin = 0;
1649        }
1650
1651        return &q->tins[tin];
1652}
1653
1654static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
1655                         struct sk_buff *skb, int flow_mode, int *qerr)
1656{
1657        struct cake_sched_data *q = qdisc_priv(sch);
1658        struct tcf_proto *filter;
1659        struct tcf_result res;
1660        u16 flow = 0, host = 0;
1661        int result;
1662
1663        filter = rcu_dereference_bh(q->filter_list);
1664        if (!filter)
1665                goto hash;
1666
1667        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
1668        result = tcf_classify(skb, NULL, filter, &res, false);
1669
1670        if (result >= 0) {
1671#ifdef CONFIG_NET_CLS_ACT
1672                switch (result) {
1673                case TC_ACT_STOLEN:
1674                case TC_ACT_QUEUED:
1675                case TC_ACT_TRAP:
1676                        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
1677                        fallthrough;
1678                case TC_ACT_SHOT:
1679                        return 0;
1680                }
1681#endif
1682                if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
1683                        flow = TC_H_MIN(res.classid);
1684                if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16))
1685                        host = TC_H_MAJ(res.classid) >> 16;
1686        }
1687hash:
1688        *t = cake_select_tin(sch, skb);
1689        return cake_hash(*t, skb, flow_mode, flow, host) + 1;
1690}
1691
1692static void cake_reconfigure(struct Qdisc *sch);
1693
1694static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1695                        struct sk_buff **to_free)
1696{
1697        struct cake_sched_data *q = qdisc_priv(sch);
1698        int len = qdisc_pkt_len(skb);
1699        int ret;
1700        struct sk_buff *ack = NULL;
1701        ktime_t now = ktime_get();
1702        struct cake_tin_data *b;
1703        struct cake_flow *flow;
1704        u32 idx;
1705
1706        /* choose flow to insert into */
1707        idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
1708        if (idx == 0) {
1709                if (ret & __NET_XMIT_BYPASS)
1710                        qdisc_qstats_drop(sch);
1711                __qdisc_drop(skb, to_free);
1712                return ret;
1713        }
1714        idx--;
1715        flow = &b->flows[idx];
1716
1717        /* ensure shaper state isn't stale */
1718        if (!b->tin_backlog) {
1719                if (ktime_before(b->time_next_packet, now))
1720                        b->time_next_packet = now;
1721
1722                if (!sch->q.qlen) {
1723                        if (ktime_before(q->time_next_packet, now)) {
1724                                q->failsafe_next_packet = now;
1725                                q->time_next_packet = now;
1726                        } else if (ktime_after(q->time_next_packet, now) &&
1727                                   ktime_after(q->failsafe_next_packet, now)) {
1728                                u64 next = \
1729                                        min(ktime_to_ns(q->time_next_packet),
1730                                            ktime_to_ns(
1731                                                   q->failsafe_next_packet));
1732                                sch->qstats.overlimits++;
1733                                qdisc_watchdog_schedule_ns(&q->watchdog, next);
1734                        }
1735                }
1736        }
1737
1738        if (unlikely(len > b->max_skblen))
1739                b->max_skblen = len;
1740
1741        if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
1742                struct sk_buff *segs, *nskb;
1743                netdev_features_t features = netif_skb_features(skb);
1744                unsigned int slen = 0, numsegs = 0;
1745
1746                segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
1747                if (IS_ERR_OR_NULL(segs))
1748                        return qdisc_drop(skb, sch, to_free);
1749
1750                skb_list_walk_safe(segs, segs, nskb) {
1751                        skb_mark_not_on_list(segs);
1752                        qdisc_skb_cb(segs)->pkt_len = segs->len;
1753                        cobalt_set_enqueue_time(segs, now);
1754                        get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
1755                                                                          segs);
1756                        flow_queue_add(flow, segs);
1757
1758                        sch->q.qlen++;
1759                        numsegs++;
1760                        slen += segs->len;
1761                        q->buffer_used += segs->truesize;
1762                        b->packets++;
1763                }
1764
1765                /* stats */
1766                b->bytes            += slen;
1767                b->backlogs[idx]    += slen;
1768                b->tin_backlog      += slen;
1769                sch->qstats.backlog += slen;
1770                q->avg_window_bytes += slen;
1771
1772                qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen);
1773                consume_skb(skb);
1774        } else {
1775                /* not splitting */
1776                cobalt_set_enqueue_time(skb, now);
1777                get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
1778                flow_queue_add(flow, skb);
1779
1780                if (q->ack_filter)
1781                        ack = cake_ack_filter(q, flow);
1782
1783                if (ack) {
1784                        b->ack_drops++;
1785                        sch->qstats.drops++;
1786                        b->bytes += qdisc_pkt_len(ack);
1787                        len -= qdisc_pkt_len(ack);
1788                        q->buffer_used += skb->truesize - ack->truesize;
1789                        if (q->rate_flags & CAKE_FLAG_INGRESS)
1790                                cake_advance_shaper(q, b, ack, now, true);
1791
1792                        qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack));
1793                        consume_skb(ack);
1794                } else {
1795                        sch->q.qlen++;
1796                        q->buffer_used      += skb->truesize;
1797                }
1798
1799                /* stats */
1800                b->packets++;
1801                b->bytes            += len;
1802                b->backlogs[idx]    += len;
1803                b->tin_backlog      += len;
1804                sch->qstats.backlog += len;
1805                q->avg_window_bytes += len;
1806        }
1807
1808        if (q->overflow_timeout)
1809                cake_heapify_up(q, b->overflow_idx[idx]);
1810
1811        /* incoming bandwidth capacity estimate */
1812        if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
1813                u64 packet_interval = \
1814                        ktime_to_ns(ktime_sub(now, q->last_packet_time));
1815
1816                if (packet_interval > NSEC_PER_SEC)
1817                        packet_interval = NSEC_PER_SEC;
1818
1819                /* filter out short-term bursts, eg. wifi aggregation */
1820                q->avg_packet_interval = \
1821                        cake_ewma(q->avg_packet_interval,
1822                                  packet_interval,
1823                                  (packet_interval > q->avg_packet_interval ?
1824                                          2 : 8));
1825
1826                q->last_packet_time = now;
1827
1828                if (packet_interval > q->avg_packet_interval) {
1829                        u64 window_interval = \
1830                                ktime_to_ns(ktime_sub(now,
1831                                                      q->avg_window_begin));
1832                        u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
1833
1834                        b = div64_u64(b, window_interval);
1835                        q->avg_peak_bandwidth =
1836                                cake_ewma(q->avg_peak_bandwidth, b,
1837                                          b > q->avg_peak_bandwidth ? 2 : 8);
1838                        q->avg_window_bytes = 0;
1839                        q->avg_window_begin = now;
1840
1841                        if (ktime_after(now,
1842                                        ktime_add_ms(q->last_reconfig_time,
1843                                                     250))) {
1844                                q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
1845                                cake_reconfigure(sch);
1846                        }
1847                }
1848        } else {
1849                q->avg_window_bytes = 0;
1850                q->last_packet_time = now;
1851        }
1852
1853        /* flowchain */
1854        if (!flow->set || flow->set == CAKE_SET_DECAYING) {
1855                struct cake_host *srchost = &b->hosts[flow->srchost];
1856                struct cake_host *dsthost = &b->hosts[flow->dsthost];
1857                u16 host_load = 1;
1858
1859                if (!flow->set) {
1860                        list_add_tail(&flow->flowchain, &b->new_flows);
1861                } else {
1862                        b->decaying_flow_count--;
1863                        list_move_tail(&flow->flowchain, &b->new_flows);
1864                }
1865                flow->set = CAKE_SET_SPARSE;
1866                b->sparse_flow_count++;
1867
1868                if (cake_dsrc(q->flow_mode))
1869                        host_load = max(host_load, srchost->srchost_bulk_flow_count);
1870
1871                if (cake_ddst(q->flow_mode))
1872                        host_load = max(host_load, dsthost->dsthost_bulk_flow_count);
1873
1874                flow->deficit = (b->flow_quantum *
1875                                 quantum_div[host_load]) >> 16;
1876        } else if (flow->set == CAKE_SET_SPARSE_WAIT) {
1877                struct cake_host *srchost = &b->hosts[flow->srchost];
1878                struct cake_host *dsthost = &b->hosts[flow->dsthost];
1879
1880                /* this flow was empty, accounted as a sparse flow, but actually
1881                 * in the bulk rotation.
1882                 */
1883                flow->set = CAKE_SET_BULK;
1884                b->sparse_flow_count--;
1885                b->bulk_flow_count++;
1886
1887                if (cake_dsrc(q->flow_mode))
1888                        srchost->srchost_bulk_flow_count++;
1889
1890                if (cake_ddst(q->flow_mode))
1891                        dsthost->dsthost_bulk_flow_count++;
1892
1893        }
1894
1895        if (q->buffer_used > q->buffer_max_used)
1896                q->buffer_max_used = q->buffer_used;
1897
1898        if (q->buffer_used > q->buffer_limit) {
1899                u32 dropped = 0;
1900
1901                while (q->buffer_used > q->buffer_limit) {
1902                        dropped++;
1903                        cake_drop(sch, to_free);
1904                }
1905                b->drop_overlimit += dropped;
1906        }
1907        return NET_XMIT_SUCCESS;
1908}
1909
1910static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
1911{
1912        struct cake_sched_data *q = qdisc_priv(sch);
1913        struct cake_tin_data *b = &q->tins[q->cur_tin];
1914        struct cake_flow *flow = &b->flows[q->cur_flow];
1915        struct sk_buff *skb = NULL;
1916        u32 len;
1917
1918        if (flow->head) {
1919                skb = dequeue_head(flow);
1920                len = qdisc_pkt_len(skb);
1921                b->backlogs[q->cur_flow] -= len;
1922                b->tin_backlog           -= len;
1923                sch->qstats.backlog      -= len;
1924                q->buffer_used           -= skb->truesize;
1925                sch->q.qlen--;
1926
1927                if (q->overflow_timeout)
1928                        cake_heapify(q, b->overflow_idx[q->cur_flow]);
1929        }
1930        return skb;
1931}
1932
1933/* Discard leftover packets from a tin no longer in use. */
1934static void cake_clear_tin(struct Qdisc *sch, u16 tin)
1935{
1936        struct cake_sched_data *q = qdisc_priv(sch);
1937        struct sk_buff *skb;
1938
1939        q->cur_tin = tin;
1940        for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
1941                while (!!(skb = cake_dequeue_one(sch)))
1942                        kfree_skb(skb);
1943}
1944
1945static struct sk_buff *cake_dequeue(struct Qdisc *sch)
1946{
1947        struct cake_sched_data *q = qdisc_priv(sch);
1948        struct cake_tin_data *b = &q->tins[q->cur_tin];
1949        struct cake_host *srchost, *dsthost;
1950        ktime_t now = ktime_get();
1951        struct cake_flow *flow;
1952        struct list_head *head;
1953        bool first_flow = true;
1954        struct sk_buff *skb;
1955        u16 host_load;
1956        u64 delay;
1957        u32 len;
1958
1959begin:
1960        if (!sch->q.qlen)
1961                return NULL;
1962
1963        /* global hard shaper */
1964        if (ktime_after(q->time_next_packet, now) &&
1965            ktime_after(q->failsafe_next_packet, now)) {
1966                u64 next = min(ktime_to_ns(q->time_next_packet),
1967                               ktime_to_ns(q->failsafe_next_packet));
1968
1969                sch->qstats.overlimits++;
1970                qdisc_watchdog_schedule_ns(&q->watchdog, next);
1971                return NULL;
1972        }
1973
1974        /* Choose a class to work on. */
1975        if (!q->rate_ns) {
1976                /* In unlimited mode, can't rely on shaper timings, just balance
1977                 * with DRR
1978                 */
1979                bool wrapped = false, empty = true;
1980
1981                while (b->tin_deficit < 0 ||
1982                       !(b->sparse_flow_count + b->bulk_flow_count)) {
1983                        if (b->tin_deficit <= 0)
1984                                b->tin_deficit += b->tin_quantum;
1985                        if (b->sparse_flow_count + b->bulk_flow_count)
1986                                empty = false;
1987
1988                        q->cur_tin++;
1989                        b++;
1990                        if (q->cur_tin >= q->tin_cnt) {
1991                                q->cur_tin = 0;
1992                                b = q->tins;
1993
1994                                if (wrapped) {
1995                                        /* It's possible for q->qlen to be
1996                                         * nonzero when we actually have no
1997                                         * packets anywhere.
1998                                         */
1999                                        if (empty)
2000                                                return NULL;

2001                                } else {
2002                                        wrapped = true;
2003                                }
2004                        }
2005                }
2006        } else {
2007                /* In shaped mode, choose:
2008                 * - Highest-priority tin with queue and meeting schedule, or
2009                 * - The earliest-scheduled tin with queue.
2010                 */
2011                ktime_t best_time = KTIME_MAX;
2012                int tin, best_tin = 0;
2013
2014                for (tin = 0; tin < q->tin_cnt; tin++) {
2015                        b = q->tins + tin;
2016                        if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
2017                                ktime_t time_to_pkt = \
2018                                        ktime_sub(b->time_next_packet, now);
2019
2020                                if (ktime_to_ns(time_to_pkt) <= 0 ||
2021                                    ktime_compare(time_to_pkt,
2022                                                  best_time) <= 0) {
2023                                        best_time = time_to_pkt;
2024                                        best_tin = tin;
2025                                }
2026                        }
2027                }
2028
2029                q->cur_tin = best_tin;
2030                b = q->tins + best_tin;
2031
2032                /* No point in going further if no packets to deliver. */
2033                if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
2034                        return NULL;
2035        }
2036
2037retry:
2038        /* service this class */
2039        head = &b->decaying_flows;
2040        if (!first_flow || list_empty(head)) {
2041                head = &b->new_flows;
2042                if (list_empty(head)) {
2043                        head = &b->old_flows;
2044                        if (unlikely(list_empty(head))) {
2045                                head = &b->decaying_flows;
2046                                if (unlikely(list_empty(head)))
2047                                        goto begin;
2048                        }
2049                }
2050        }
2051        flow = list_first_entry(head, struct cake_flow, flowchain);
2052        q->cur_flow = flow - b->flows;
2053        first_flow = false;
2054
2055        /* triple isolation (modified DRR++) */
2056        srchost = &b->hosts[flow->srchost];
2057        dsthost = &b->hosts[flow->dsthost];
2058        host_load = 1;
2059
2060        /* flow isolation (DRR++) */
2061        if (flow->deficit <= 0) {
2062                /* Keep all flows with deficits out of the sparse and decaying
2063                 * rotations.  No non-empty flow can go into the decaying
2064                 * rotation, so they can't get deficits
2065                 */
2066                if (flow->set == CAKE_SET_SPARSE) {
2067                        if (flow->head) {
2068                                b->sparse_flow_count--;
2069                                b->bulk_flow_count++;
2070
2071                                if (cake_dsrc(q->flow_mode))
2072                                        srchost->srchost_bulk_flow_count++;
2073
2074                                if (cake_ddst(q->flow_mode))
2075                                        dsthost->dsthost_bulk_flow_count++;
2076
2077                                flow->set = CAKE_SET_BULK;
2078                        } else {
2079                                /* we've moved it to the bulk rotation for
2080                                 * correct deficit accounting but we still want
2081                                 * to count it as a sparse flow, not a bulk one.
2082                                 */
2083                                flow->set = CAKE_SET_SPARSE_WAIT;
2084                        }
2085                }
2086
2087                if (cake_dsrc(q->flow_mode))
2088                        host_load = max(host_load, srchost->srchost_bulk_flow_count);
2089
2090                if (cake_ddst(q->flow_mode))
2091                        host_load = max(host_load, dsthost->dsthost_bulk_flow_count);
2092
2093                WARN_ON(host_load > CAKE_QUEUES);
2094
2095                /* The shifted prandom_u32() is a way to apply dithering to
2096                 * avoid accumulating roundoff errors
2097                 */
2098                flow->deficit += (b->flow_quantum * quantum_div[host_load] +
2099                                  (prandom_u32() >> 16)) >> 16;
2100                list_move_tail(&flow->flowchain, &b->old_flows);
2101
2102                goto retry;
2103        }
2104
2105        /* Retrieve a packet via the AQM */
2106        while (1) {
2107                skb = cake_dequeue_one(sch);
2108                if (!skb) {
2109                        /* this queue was actually empty */
2110                        if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
2111                                b->unresponsive_flow_count--;
2112
2113                        if (flow->cvars.p_drop || flow->cvars.count ||
2114                            ktime_before(now, flow->cvars.drop_next)) {
2115                                /* keep in the flowchain until the state has
2116                                 * decayed to rest
2117                                 */
2118                                list_move_tail(&flow->flowchain,
2119                                               &b->decaying_flows);
2120                                if (flow->set == CAKE_SET_BULK) {
2121                                        b->bulk_flow_count--;
2122
2123                                        if (cake_dsrc(q->flow_mode))
2124                                                srchost->srchost_bulk_flow_count--;
2125
2126                                        if (cake_ddst(q->flow_mode))
2127                                                dsthost->dsthost_bulk_flow_count--;
2128
2129                                        b->decaying_flow_count++;
2130                                } else if (flow->set == CAKE_SET_SPARSE ||
2131                                           flow->set == CAKE_SET_SPARSE_WAIT) {
2132                                        b->sparse_flow_count--;
2133                                        b->decaying_flow_count++;
2134                                }
2135                                flow->set = CAKE_SET_DECAYING;
2136                        } else {
2137                                /* remove empty queue from the flowchain */
2138                                list_del_init(&flow->flowchain);
2139                                if (flow->set == CAKE_SET_SPARSE ||
2140                                    flow->set == CAKE_SET_SPARSE_WAIT)
2141                                        b->sparse_flow_count--;
2142                                else if (flow->set == CAKE_SET_BULK) {
2143                                        b->bulk_flow_count--;
2144
2145                                        if (cake_dsrc(q->flow_mode))
2146                                                srchost->srchost_bulk_flow_count--;
2147
2148                                        if (cake_ddst(q->flow_mode))
2149                                                dsthost->dsthost_bulk_flow_count--;
2150
2151                                } else
2152                                        b->decaying_flow_count--;
2153
2154                                flow->set = CAKE_SET_NONE;
2155                        }
2156                        goto begin;
2157                }
2158
2159                /* Last packet in queue may be marked, shouldn't be dropped */
2160                if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
2161                                        (b->bulk_flow_count *
2162                                         !!(q->rate_flags &
2163                                            CAKE_FLAG_INGRESS))) ||
2164                    !flow->head)
2165                        break;
2166
2167                /* drop this packet, get another one */
2168                if (q->rate_flags & CAKE_FLAG_INGRESS) {
2169                        len = cake_advance_shaper(q, b, skb,
2170                                                  now, true);
2171                        flow->deficit -= len;
2172                        b->tin_deficit -= len;
2173                }
2174                flow->dropped++;
2175                b->tin_dropped++;
2176                qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
2177                qdisc_qstats_drop(sch);
2178                kfree_skb(skb);
2179                if (q->rate_flags & CAKE_FLAG_INGRESS)
2180                        goto retry;
2181        }
2182
2183        b->tin_ecn_mark += !!flow->cvars.ecn_marked;
2184        qdisc_bstats_update(sch, skb);
2185
2186        /* collect delay stats */
2187        delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
2188        b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
2189        b->peak_delay = cake_ewma(b->peak_delay, delay,
2190                                  delay > b->peak_delay ? 2 : 8);
2191        b->base_delay = cake_ewma(b->base_delay, delay,
2192                                  delay < b->base_delay ? 2 : 8);
2193
2194        len = cake_advance_shaper(q, b, skb, now, false);
2195        flow->deficit -= len;
2196        b->tin_deficit -= len;
2197
2198        if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
2199                u64 next = min(ktime_to_ns(q->time_next_packet),
2200                               ktime_to_ns(q->failsafe_next_packet));
2201
2202                qdisc_watchdog_schedule_ns(&q->watchdog, next);
2203        } else if (!sch->q.qlen) {
2204                int i;
2205
2206                for (i = 0; i < q->tin_cnt; i++) {
2207                        if (q->tins[i].decaying_flow_count) {
2208                                ktime_t next = \
2209                                        ktime_add_ns(now,
2210                                                     q->tins[i].cparams.target);
2211
2212                                qdisc_watchdog_schedule_ns(&q->watchdog,
2213                                                           ktime_to_ns(next));
2214                                break;
2215                        }
2216                }
2217        }
2218
2219        if (q->overflow_timeout)
2220                q->overflow_timeout--;
2221
2222        return skb;
2223}
2224
2225static void cake_reset(struct Qdisc *sch)
2226{
2227        u32 c;
2228
2229        for (c = 0; c < CAKE_MAX_TINS; c++)
2230                cake_clear_tin(sch, c);
2231}
2232
2233static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
2234        [TCA_CAKE_BASE_RATE64]   = { .type = NLA_U64 },
2235        [TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
2236        [TCA_CAKE_ATM]           = { .type = NLA_U32 },
2237        [TCA_CAKE_FLOW_MODE]     = { .type = NLA_U32 },
2238        [TCA_CAKE_OVERHEAD]      = { .type = NLA_S32 },
2239        [TCA_CAKE_RTT]           = { .type = NLA_U32 },
2240        [TCA_CAKE_TARGET]        = { .type = NLA_U32 },
2241        [TCA_CAKE_AUTORATE]      = { .type = NLA_U32 },
2242        [TCA_CAKE_MEMORY]        = { .type = NLA_U32 },
2243        [TCA_CAKE_NAT]           = { .type = NLA_U32 },
2244        [TCA_CAKE_RAW]           = { .type = NLA_U32 },
2245        [TCA_CAKE_WASH]          = { .type = NLA_U32 },
2246        [TCA_CAKE_MPU]           = { .type = NLA_U32 },
2247        [TCA_CAKE_INGRESS]       = { .type = NLA_U32 },
2248        [TCA_CAKE_ACK_FILTER]    = { .type = NLA_U32 },
2249        [TCA_CAKE_SPLIT_GSO]     = { .type = NLA_U32 },
2250        [TCA_CAKE_FWMARK]        = { .type = NLA_U32 },
2251};
2252
2253static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
2254                          u64 target_ns, u64 rtt_est_ns)
2255{
2256        /* convert byte-rate into time-per-byte
2257         * so it will always unwedge in reasonable time.
2258         */
2259        static const u64 MIN_RATE = 64;
2260        u32 byte_target = mtu;
2261        u64 byte_target_ns;
2262        u8  rate_shft = 0;
2263        u64 rate_ns = 0;
2264
2265        b->flow_quantum = 1514;
2266        if (rate) {
2267                b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
2268                rate_shft = 34;
2269                rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
2270                rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
2271                while (!!(rate_ns >> 34)) {
2272                        rate_ns >>= 1;
2273                        rate_shft--;
2274                }
2275        } /* else unlimited, ie. zero delay */
2276
2277        b->tin_rate_bps  = rate;
2278        b->tin_rate_ns   = rate_ns;
2279        b->tin_rate_shft = rate_shft;
2280
2281        byte_target_ns = (byte_target * rate_ns) >> rate_shft;
2282
2283        b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
2284        b->cparams.interval = max(rtt_est_ns +
2285                                     b->cparams.target - target_ns,
2286                                     b->cparams.target * 2);
2287        b->cparams.mtu_time = byte_target_ns;
2288        b->cparams.p_inc = 1 << 24; /* 1/256 */
2289        b->cparams.p_dec = 1 << 20; /* 1/4096 */
2290}
2291
2292static int cake_config_besteffort(struct Qdisc *sch)
2293{
2294        struct cake_sched_data *q = qdisc_priv(sch);
2295        struct cake_tin_data *b = &q->tins[0];
2296        u32 mtu = psched_mtu(qdisc_dev(sch));
2297        u64 rate = q->rate_bps;
2298
2299        q->tin_cnt = 1;
2300
2301        q->tin_index = besteffort;
2302        q->tin_order = normal_order;
2303
2304        cake_set_rate(b, rate, mtu,
2305                      us_to_ns(q->target), us_to_ns(q->interval));
2306        b->tin_quantum = 65535;
2307
2308        return 0;
2309}
2310
2311static int cake_config_precedence(struct Qdisc *sch)
2312{
2313        /* convert high-level (user visible) parameters into internal format */
2314        struct cake_sched_data *q = qdisc_priv(sch);
2315        u32 mtu = psched_mtu(qdisc_dev(sch));
2316        u64 rate = q->rate_bps;
2317        u32 quantum = 256;
2318        u32 i;
2319
2320        q->tin_cnt = 8;
2321        q->tin_index = precedence;
2322        q->tin_order = normal_order;
2323
2324        for (i = 0; i < q->tin_cnt; i++) {
2325                struct cake_tin_data *b = &q->tins[i];
2326
2327                cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2328                              us_to_ns(q->interval));
2329
2330                b->tin_quantum = max_t(u16, 1U, quantum);
2331
2332                /* calculate next class's parameters */
2333                rate  *= 7;
2334                rate >>= 3;
2335
2336                quantum  *= 7;
2337                quantum >>= 3;
2338        }
2339
2340        return 0;
2341}
2342
2343/*      List of known Diffserv codepoints:
2344 *
2345 *      Least Effort (CS1, LE)
2346 *      Best Effort (CS0)
2347 *      Max Reliability & LLT "Lo" (TOS1)
2348 *      Max Throughput (TOS2)
2349 *      Min Delay (TOS4)
2350 *      LLT "La" (TOS5)
2351 *      Assured Forwarding 1 (AF1x) - x3
2352 *      Assured Forwarding 2 (AF2x) - x3
2353 *      Assured Forwarding 3 (AF3x) - x3
2354 *      Assured Forwarding 4 (AF4x) - x3
2355 *      Precedence Class 2 (CS2)
2356 *      Precedence Class 3 (CS3)
2357 *      Precedence Class 4 (CS4)
2358 *      Precedence Class 5 (CS5)
2359 *      Precedence Class 6 (CS6)
2360 *      Precedence Class 7 (CS7)
2361 *      Voice Admit (VA)
2362 *      Expedited Forwarding (EF)
2363
2364 *      Total 25 codepoints.
2365 */
2366
2367/*      List of traffic classes in RFC 4594, updated by RFC 8622:
2368 *              (roughly descending order of contended priority)
2369 *              (roughly ascending order of uncontended throughput)
2370 *
2371 *      Network Control (CS6,CS7)      - routing traffic
2372 *      Telephony (EF,VA)         - aka. VoIP streams
2373 *      Signalling (CS5)               - VoIP setup
2374 *      Multimedia Conferencing (AF4x) - aka. video calls
2375 *      Realtime Interactive (CS4)     - eg. games
2376 *      Multimedia Streaming (AF3x)    - eg. YouTube, NetFlix, Twitch
2377 *      Broadcast Video (CS3)
2378 *      Low Latency Data (AF2x,TOS4)      - eg. database
2379 *      Ops, Admin, Management (CS2,TOS1) - eg. ssh
2380 *      Standard Service (CS0 & unrecognised codepoints)
2381 *      High Throughput Data (AF1x,TOS2)  - eg. web traffic
2382 *      Low Priority Data (CS1,LE)        - eg. BitTorrent
2383
2384 *      Total 12 traffic classes.
2385 */
2386
2387static int cake_config_diffserv8(struct Qdisc *sch)
2388{
2389/*      Pruned list of traffic classes for typical applications:
2390 *
2391 *              Network Control          (CS6, CS7)
2392 *              Minimum Latency          (EF, VA, CS5, CS4)
2393 *              Interactive Shell        (CS2, TOS1)
2394 *              Low Latency Transactions (AF2x, TOS4)
2395 *              Video Streaming          (AF4x, AF3x, CS3)
2396 *              Bog Standard             (CS0 etc.)
2397 *              High Throughput          (AF1x, TOS2)
2398 *              Background Traffic       (CS1, LE)
2399 *
2400 *              Total 8 traffic classes.
2401 */
2402
2403        struct cake_sched_data *q = qdisc_priv(sch);
2404        u32 mtu = psched_mtu(qdisc_dev(sch));
2405        u64 rate = q->rate_bps;
2406        u32 quantum = 256;
2407        u32 i;
2408
2409        q->tin_cnt = 8;
2410
2411        /* codepoint to class mapping */
2412        q->tin_index = diffserv8;
2413        q->tin_order = normal_order;
2414
2415        /* class characteristics */
2416        for (i = 0; i < q->tin_cnt; i++) {
2417                struct cake_tin_data *b = &q->tins[i];
2418
2419                cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2420                              us_to_ns(q->interval));
2421
2422                b->tin_quantum = max_t(u16, 1U, quantum);
2423
2424                /* calculate next class's parameters */
2425                rate  *= 7;
2426                rate >>= 3;
2427
2428                quantum  *= 7;
2429                quantum >>= 3;
2430        }
2431
2432        return 0;
2433}
2434
2435static int cake_config_diffserv4(struct Qdisc *sch)
2436{
2437/*  Further pruned list of traffic classes for four-class system:
2438 *
2439 *          Latency Sensitive  (CS7, CS6, EF, VA, CS5, CS4)
2440 *          Streaming Media    (AF4x, AF3x, CS3, AF2x, TOS4, CS2, TOS1)
2441 *          Best Effort        (CS0, AF1x, TOS2, and those not specified)
2442 *          Background Traffic (CS1, LE)
2443 *
2444 *              Total 4 traffic classes.
2445 */
2446
2447        struct cake_sched_data *q = qdisc_priv(sch);
2448        u32 mtu = psched_mtu(qdisc_dev(sch));
2449        u64 rate = q->rate_bps;
2450        u32 quantum = 1024;
2451
2452        q->tin_cnt = 4;
2453
2454        /* codepoint to class mapping */
2455        q->tin_index = diffserv4;
2456        q->tin_order = bulk_order;
2457
2458        /* class characteristics */
2459        cake_set_rate(&q->tins[0], rate, mtu,
2460                      us_to_ns(q->target), us_to_ns(q->interval));
2461        cake_set_rate(&q->tins[1], rate >> 4, mtu,
2462                      us_to_ns(q->target), us_to_ns(q->interval));
2463        cake_set_rate(&q->tins[2], rate >> 1, mtu,
2464                      us_to_ns(q->target), us_to_ns(q->interval));
2465        cake_set_rate(&q->tins[3], rate >> 2, mtu,
2466                      us_to_ns(q->target), us_to_ns(q->interval));
2467
2468        /* bandwidth-sharing weights */
2469        q->tins[0].tin_quantum = quantum;
2470        q->tins[1].tin_quantum = quantum >> 4;
2471        q->tins[2].tin_quantum = quantum >> 1;
2472        q->tins[3].tin_quantum = quantum >> 2;
2473
2474        return 0;
2475}
2476
2477static int cake_config_diffserv3(struct Qdisc *sch)
2478{
2479/*  Simplified Diffserv structure with 3 tins.
2480 *              Low Priority            (CS1, LE)
2481 *              Best Effort
2482 *              Latency Sensitive       (TOS4, VA, EF, CS6, CS7)
2483 */
2484        struct cake_sched_data *q = qdisc_priv(sch);
2485        u32 mtu = psched_mtu(qdisc_dev(sch));
2486        u64 rate = q->rate_bps;
2487        u32 quantum = 1024;
2488
2489        q->tin_cnt = 3;
2490
2491        /* codepoint to class mapping */
2492        q->tin_index = diffserv3;
2493        q->tin_order = bulk_order;
2494
2495        /* class characteristics */
2496        cake_set_rate(&q->tins[0], rate, mtu,
2497                      us_to_ns(q->target), us_to_ns(q->interval));
2498        cake_set_rate(&q->tins[1], rate >> 4, mtu,
2499                      us_to_ns(q->target), us_to_ns(q->interval));
2500        cake_set_rate(&q->tins[2], rate >> 2, mtu,
2501                      us_to_ns(q->target), us_to_ns(q->interval));
2502
2503        /* bandwidth-sharing weights */
2504        q->tins[0].tin_quantum = quantum;
2505        q->tins[1].tin_quantum = quantum >> 4;
2506        q->tins[2].tin_quantum = quantum >> 2;
2507
2508        return 0;
2509}
2510
2511static void cake_reconfigure(struct Qdisc *sch)
2512{
2513        struct cake_sched_data *q = qdisc_priv(sch);
2514        int c, ft;
2515
2516        switch (q->tin_mode) {
2517        case CAKE_DIFFSERV_BESTEFFORT:
2518                ft = cake_config_besteffort(sch);
2519                break;
2520
2521        case CAKE_DIFFSERV_PRECEDENCE:
2522                ft = cake_config_precedence(sch);
2523                break;
2524
2525        case CAKE_DIFFSERV_DIFFSERV8:
2526                ft = cake_config_diffserv8(sch);
2527                break;
2528
2529        case CAKE_DIFFSERV_DIFFSERV4:
2530                ft = cake_config_diffserv4(sch);
2531                break;
2532
2533        case CAKE_DIFFSERV_DIFFSERV3:
2534        default:
2535                ft = cake_config_diffserv3(sch);
2536                break;
2537        }
2538
2539        for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) {
2540                cake_clear_tin(sch, c);
2541                q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time;
2542        }
2543
2544        q->rate_ns   = q->tins[ft].tin_rate_ns;
2545        q->rate_shft = q->tins[ft].tin_rate_shft;
2546
2547        if (q->buffer_config_limit) {
2548                q->buffer_limit = q->buffer_config_limit;
2549        } else if (q->rate_bps) {
2550                u64 t = q->rate_bps * q->interval;
2551
2552                do_div(t, USEC_PER_SEC / 4);
2553                q->buffer_limit = max_t(u32, t, 4U << 20);
2554        } else {
2555                q->buffer_limit = ~0;
2556        }
2557
2558        sch->flags &= ~TCQ_F_CAN_BYPASS;
2559
2560        q->buffer_limit = min(q->buffer_limit,
2561                              max(sch->limit * psched_mtu(qdisc_dev(sch)),
2562                                  q->buffer_config_limit));
2563}
2564
2565static int cake_change(struct Qdisc *sch, struct nlattr *opt,
2566                       struct netlink_ext_ack *extack)
2567{
2568        struct cake_sched_data *q = qdisc_priv(sch);
2569        struct nlattr *tb[TCA_CAKE_MAX + 1];
2570        int err;
2571
2572        if (!opt)
2573                return -EINVAL;
2574
2575        err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy,
2576                                          extack);
2577        if (err < 0)
2578                return err;
2579
2580        if (tb[TCA_CAKE_NAT]) {
2581#if IS_ENABLED(CONFIG_NF_CONNTRACK)
2582                q->flow_mode &= ~CAKE_FLOW_NAT_FLAG;
2583                q->flow_mode |= CAKE_FLOW_NAT_FLAG *
2584                        !!nla_get_u32(tb[TCA_CAKE_NAT]);
2585#else
2586                NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
2587                                    "No conntrack support in kernel");
2588                return -EOPNOTSUPP;
2589#endif
2590        }
2591
2592        if (tb[TCA_CAKE_BASE_RATE64])
2593                q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]);
2594
2595        if (tb[TCA_CAKE_DIFFSERV_MODE])
2596                q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]);
2597
2598        if (tb[TCA_CAKE_WASH]) {
2599                if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
2600                        q->rate_flags |= CAKE_FLAG_WASH;
2601                else
2602                        q->rate_flags &= ~CAKE_FLAG_WASH;
2603        }
2604
2605        if (tb[TCA_CAKE_FLOW_MODE])
2606                q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) |
2607                                (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
2608                                        CAKE_FLOW_MASK));
2609
2610        if (tb[TCA_CAKE_ATM])
2611                q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]);
2612
2613        if (tb[TCA_CAKE_OVERHEAD]) {
2614                q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]);
2615                q->rate_flags |= CAKE_FLAG_OVERHEAD;
2616
2617                q->max_netlen = 0;
2618                q->max_adjlen = 0;
2619                q->min_netlen = ~0;
2620                q->min_adjlen = ~0;
2621        }
2622
2623        if (tb[TCA_CAKE_RAW]) {
2624                q->rate_flags &= ~CAKE_FLAG_OVERHEAD;
2625
2626                q->max_netlen = 0;
2627                q->max_adjlen = 0;
2628                q->min_netlen = ~0;
2629                q->min_adjlen = ~0;
2630        }
2631
2632        if (tb[TCA_CAKE_MPU])
2633                q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]);
2634
2635        if (tb[TCA_CAKE_RTT]) {
2636                q->interval = nla_get_u32(tb[TCA_CAKE_RTT]);
2637
2638                if (!q->interval)
2639                        q->interval = 1;
2640        }
2641
2642        if (tb[TCA_CAKE_TARGET]) {
2643                q->target = nla_get_u32(tb[TCA_CAKE_TARGET]);
2644
2645                if (!q->target)
2646                        q->target = 1;
2647        }
2648
2649        if (tb[TCA_CAKE_AUTORATE]) {
2650                if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
2651                        q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
2652                else
2653                        q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
2654        }
2655
2656        if (tb[TCA_CAKE_INGRESS]) {
2657                if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
2658                        q->rate_flags |= CAKE_FLAG_INGRESS;
2659                else
2660                        q->rate_flags &= ~CAKE_FLAG_INGRESS;
2661        }
2662
2663        if (tb[TCA_CAKE_ACK_FILTER])
2664                q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]);
2665
2666        if (tb[TCA_CAKE_MEMORY])
2667                q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]);
2668
2669        if (tb[TCA_CAKE_SPLIT_GSO]) {
2670                if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
2671                        q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2672                else
2673                        q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
2674        }
2675
2676        if (tb[TCA_CAKE_FWMARK]) {
2677                q->fwmark_mask = nla_get_u32(tb[TCA_CAKE_FWMARK]);
2678                q->fwmark_shft = q->fwmark_mask ? __ffs(q->fwmark_mask) : 0;
2679        }
2680
2681        if (q->tins) {
2682                sch_tree_lock(sch);
2683                cake_reconfigure(sch);
2684                sch_tree_unlock(sch);
2685        }
2686
2687        return 0;
2688}
2689
2690static void cake_destroy(struct Qdisc *sch)
2691{
2692        struct cake_sched_data *q = qdisc_priv(sch);
2693
2694        qdisc_watchdog_cancel(&q->watchdog);
2695        tcf_block_put(q->block);
2696        kvfree(q->tins);
2697}
2698
2699static int cake_init(struct Qdisc *sch, struct nlattr *opt,
2700                     struct netlink_ext_ack *extack)
2701{
2702        struct cake_sched_data *q = qdisc_priv(sch);
2703        int i, j, err;
2704
2705        sch->limit = 10240;
2706        q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
2707        q->flow_mode  = CAKE_FLOW_TRIPLE;
2708
2709        q->rate_bps = 0; /* unlimited by default */
2710
2711        q->interval = 100000; /* 100ms default */
2712        q->target   =   5000; /* 5ms: codel RFC argues
2713                               * for 5 to 10% of interval
2714                               */
2715        q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2716        q->cur_tin = 0;
2717        q->cur_flow  = 0;
2718
2719        qdisc_watchdog_init(&q->watchdog, sch);
2720
2721        if (opt) {
2722                err = cake_change(sch, opt, extack);
2723
2724                if (err)
2725                        return err;
2726        }
2727
2728        err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
2729        if (err)
2730                return err;
2731
2732        quantum_div[0] = ~0;
2733        for (i = 1; i <= CAKE_QUEUES; i++)
2734                quantum_div[i] = 65535 / i;
2735
2736        q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data),
2737                           GFP_KERNEL);
2738        if (!q->tins)
2739                goto nomem;
2740
2741        for (i = 0; i < CAKE_MAX_TINS; i++) {
2742                struct cake_tin_data *b = q->tins + i;
2743
2744                INIT_LIST_HEAD(&b->new_flows);
2745                INIT_LIST_HEAD(&b->old_flows);
2746                INIT_LIST_HEAD(&b->decaying_flows);
2747                b->sparse_flow_count = 0;
2748                b->bulk_flow_count = 0;
2749                b->decaying_flow_count = 0;
2750
2751                for (j = 0; j < CAKE_QUEUES; j++) {
2752                        struct cake_flow *flow = b->flows + j;
2753                        u32 k = j * CAKE_MAX_TINS + i;
2754
2755                        INIT_LIST_HEAD(&flow->flowchain);
2756                        cobalt_vars_init(&flow->cvars);
2757
2758                        q->overflow_heap[k].t = i;
2759                        q->overflow_heap[k].b = j;
2760                        b->overflow_idx[j] = k;
2761                }
2762        }
2763
2764        cake_reconfigure(sch);
2765        q->avg_peak_bandwidth = q->rate_bps;
2766        q->min_netlen = ~0;
2767        q->min_adjlen = ~0;
2768        return 0;
2769
2770nomem:
2771        cake_destroy(sch);
2772        return -ENOMEM;
2773}
2774
2775static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
2776{
2777        struct cake_sched_data *q = qdisc_priv(sch);
2778        struct nlattr *opts;
2779
2780        opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
2781        if (!opts)
2782                goto nla_put_failure;
2783
2784        if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps,
2785                              TCA_CAKE_PAD))
2786                goto nla_put_failure;
2787
2788        if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE,
2789                        q->flow_mode & CAKE_FLOW_MASK))
2790                goto nla_put_failure;
2791
2792        if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval))
2793                goto nla_put_failure;
2794
2795        if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target))
2796                goto nla_put_failure;
2797
2798        if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit))
2799                goto nla_put_failure;
2800
2801        if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
2802                        !!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
2803                goto nla_put_failure;
2804
2805        if (nla_put_u32(skb, TCA_CAKE_INGRESS,
2806                        !!(q->rate_flags & CAKE_FLAG_INGRESS)))
2807                goto nla_put_failure;
2808
2809        if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter))
2810                goto nla_put_failure;
2811
2812        if (nla_put_u32(skb, TCA_CAKE_NAT,
2813                        !!(q->flow_mode & CAKE_FLOW_NAT_FLAG)))
2814                goto nla_put_failure;
2815
2816        if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode))
2817                goto nla_put_failure;
2818
2819        if (nla_put_u32(skb, TCA_CAKE_WASH,
2820                        !!(q->rate_flags & CAKE_FLAG_WASH)))
2821                goto nla_put_failure;
2822
2823        if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead))
2824                goto nla_put_failure;
2825
2826        if (!(q->rate_flags & CAKE_FLAG_OVERHEAD))
2827                if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
2828                        goto nla_put_failure;
2829
2830        if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode))
2831                goto nla_put_failure;
2832
2833        if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu))
2834                goto nla_put_failure;
2835
2836        if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
2837                        !!(q->rate_flags & CAKE_FLAG_SPLIT_GSO)))
2838                goto nla_put_failure;
2839
2840        if (nla_put_u32(skb, TCA_CAKE_FWMARK, q->fwmark_mask))
2841                goto nla_put_failure;
2842
2843        return nla_nest_end(skb, opts);
2844
2845nla_put_failure:
2846        return -1;
2847}
2848
2849static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
2850{
2851        struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
2852        struct cake_sched_data *q = qdisc_priv(sch);
2853        struct nlattr *tstats, *ts;
2854        int i;
2855
2856        if (!stats)
2857                return -1;
2858
2859#define PUT_STAT_U32(attr, data) do {                                  \
2860                if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2861                        goto nla_put_failure;                          \
2862        } while (0)
2863#define PUT_STAT_U64(attr, data) do {                                  \
2864                if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \
2865                                        data, TCA_CAKE_STATS_PAD)) \
2866                        goto nla_put_failure;                          \
2867        } while (0)
2868
2869        PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth);
2870        PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit);
2871        PUT_STAT_U32(MEMORY_USED, q->buffer_max_used);
2872        PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16));
2873        PUT_STAT_U32(MAX_NETLEN, q->max_netlen);
2874        PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
2875        PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
2876        PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);
2877
2878#undef PUT_STAT_U32
2879#undef PUT_STAT_U64
2880
2881        tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS);
2882        if (!tstats)
2883                goto nla_put_failure;
2884
2885#define PUT_TSTAT_U32(attr, data) do {                                  \
2886                if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
2887                        goto nla_put_failure;                           \
2888        } while (0)
2889#define PUT_TSTAT_U64(attr, data) do {                                  \
2890                if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \
2891                                        data, TCA_CAKE_TIN_STATS_PAD))  \
2892                        goto nla_put_failure;                           \
2893        } while (0)
2894
2895        for (i = 0; i < q->tin_cnt; i++) {
2896                struct cake_tin_data *b = &q->tins[q->tin_order[i]];
2897
2898                ts = nla_nest_start_noflag(d->skb, i + 1);
2899                if (!ts)
2900                        goto nla_put_failure;
2901
2902                PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps);
2903                PUT_TSTAT_U64(SENT_BYTES64, b->bytes);
2904                PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog);
2905
2906                PUT_TSTAT_U32(TARGET_US,
2907                              ktime_to_us(ns_to_ktime(b->cparams.target)));
2908                PUT_TSTAT_U32(INTERVAL_US,
2909                              ktime_to_us(ns_to_ktime(b->cparams.interval)));
2910
2911                PUT_TSTAT_U32(SENT_PACKETS, b->packets);
2912                PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped);
2913                PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark);
2914                PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops);
2915
2916                PUT_TSTAT_U32(PEAK_DELAY_US,
2917                              ktime_to_us(ns_to_ktime(b->peak_delay)));
2918                PUT_TSTAT_U32(AVG_DELAY_US,
2919                              ktime_to_us(ns_to_ktime(b->avge_delay)));
2920                PUT_TSTAT_U32(BASE_DELAY_US,
2921                              ktime_to_us(ns_to_ktime(b->base_delay)));
2922
2923                PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits);
2924                PUT_TSTAT_U32(WAY_MISSES, b->way_misses);
2925                PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions);
2926
2927                PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count +
2928                                            b->decaying_flow_count);
2929                PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count);
2930                PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count);
2931                PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen);
2932
2933                PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum);
2934                nla_nest_end(d->skb, ts);
2935        }
2936
2937#undef PUT_TSTAT_U32
2938#undef PUT_TSTAT_U64
2939
2940        nla_nest_end(d->skb, tstats);
2941        return nla_nest_end(d->skb, stats);
2942
2943nla_put_failure:
2944        nla_nest_cancel(d->skb, stats);
2945        return -1;
2946}
2947
2948static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
2949{
2950        return NULL;
2951}
2952
2953static unsigned long cake_find(struct Qdisc *sch, u32 classid)
2954{
2955        return 0;
2956}
2957
2958static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
2959                               u32 classid)
2960{
2961        return 0;
2962}
2963
2964static void cake_unbind(struct Qdisc *q, unsigned long cl)
2965{
2966}
2967
2968static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
2969                                        struct netlink_ext_ack *extack)
2970{
2971        struct cake_sched_data *q = qdisc_priv(sch);
2972
2973        if (cl)
2974                return NULL;
2975        return q->block;
2976}
2977
2978static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
2979                           struct sk_buff *skb, struct tcmsg *tcm)
2980{
2981        tcm->tcm_handle |= TC_H_MIN(cl);
2982        return 0;
2983}
2984
2985static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
2986                                 struct gnet_dump *d)
2987{
2988        struct cake_sched_data *q = qdisc_priv(sch);
2989        const struct cake_flow *flow = NULL;
2990        struct gnet_stats_queue qs = { 0 };
2991        struct nlattr *stats;
2992        u32 idx = cl - 1;
2993
2994        if (idx < CAKE_QUEUES * q->tin_cnt) {
2995                const struct cake_tin_data *b = \
2996                        &q->tins[q->tin_order[idx / CAKE_QUEUES]];
2997                const struct sk_buff *skb;
2998
2999                flow = &b->flows[idx % CAKE_QUEUES];
3000

3001                if (flow->head) {
3002                        sch_tree_lock(sch);
3003                        skb = flow->head;
3004                        while (skb) {
3005                                qs.qlen++;
3006                                skb = skb->next;
3007                        }
3008                        sch_tree_unlock(sch);
3009                }
3010                qs.backlog = b->backlogs[idx % CAKE_QUEUES];
3011                qs.drops = flow->dropped;
3012        }
3013        if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
3014                return -1;
3015        if (flow) {
3016                ktime_t now = ktime_get();
3017
3018                stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
3019                if (!stats)
3020                        return -1;
3021
3022#define PUT_STAT_U32(attr, data) do {                                  \
3023                if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3024                        goto nla_put_failure;                          \
3025        } while (0)
3026#define PUT_STAT_S32(attr, data) do {                                  \
3027                if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3028                        goto nla_put_failure;                          \
3029        } while (0)
3030
3031                PUT_STAT_S32(DEFICIT, flow->deficit);
3032                PUT_STAT_U32(DROPPING, flow->cvars.dropping);
3033                PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
3034                PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
3035                if (flow->cvars.p_drop) {
3036                        PUT_STAT_S32(BLUE_TIMER_US,
3037                                     ktime_to_us(
3038                                             ktime_sub(now,
3039                                                       flow->cvars.blue_timer)));
3040                }
3041                if (flow->cvars.dropping) {
3042                        PUT_STAT_S32(DROP_NEXT_US,
3043                                     ktime_to_us(
3044                                             ktime_sub(now,
3045                                                       flow->cvars.drop_next)));
3046                }
3047
3048                if (nla_nest_end(d->skb, stats) < 0)
3049                        return -1;
3050        }
3051
3052        return 0;
3053
3054nla_put_failure:
3055        nla_nest_cancel(d->skb, stats);
3056        return -1;
3057}
3058
3059static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
3060{
3061        struct cake_sched_data *q = qdisc_priv(sch);
3062        unsigned int i, j;
3063
3064        if (arg->stop)
3065                return;
3066
3067        for (i = 0; i < q->tin_cnt; i++) {
3068                struct cake_tin_data *b = &q->tins[q->tin_order[i]];
3069
3070                for (j = 0; j < CAKE_QUEUES; j++) {
3071                        if (list_empty(&b->flows[j].flowchain) ||
3072                            arg->count < arg->skip) {
3073                                arg->count++;
3074                                continue;
3075                        }
3076                        if (arg->fn(sch, i * CAKE_QUEUES + j + 1, arg) < 0) {
3077                                arg->stop = 1;
3078                                break;
3079                        }
3080                        arg->count++;
3081                }
3082        }
3083}
3084
3085static const struct Qdisc_class_ops cake_class_ops = {
3086        .leaf           =       cake_leaf,
3087        .find           =       cake_find,
3088        .tcf_block      =       cake_tcf_block,
3089        .bind_tcf       =       cake_bind,
3090        .unbind_tcf     =       cake_unbind,
3091        .dump           =       cake_dump_class,
3092        .dump_stats     =       cake_dump_class_stats,
3093        .walk           =       cake_walk,
3094};
3095
3096static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
3097        .cl_ops         =       &cake_class_ops,
3098        .id             =       "cake",
3099        .priv_size      =       sizeof(struct cake_sched_data),
3100        .enqueue        =       cake_enqueue,
3101        .dequeue        =       cake_dequeue,
3102        .peek           =       qdisc_peek_dequeued,
3103        .init           =       cake_init,
3104        .reset          =       cake_reset,
3105        .destroy        =       cake_destroy,
3106        .change         =       cake_change,
3107        .dump           =       cake_dump,
3108        .dump_stats     =       cake_dump_stats,
3109        .owner          =       THIS_MODULE,
3110};
3111
3112static int __init cake_module_init(void)
3113{
3114        return register_qdisc(&cake_qdisc_ops);
3115}
3116
3117static void __exit cake_module_exit(void)
3118{
3119        unregister_qdisc(&cake_qdisc_ops);
3120}
3121
3122module_init(cake_module_init)
3123module_exit(cake_module_exit)
3124MODULE_AUTHOR("Jonathan Morton");
3125MODULE_LICENSE("Dual BSD/GPL");
3126MODULE_DESCRIPTION("The CAKE shaper.");
3127