linux/net/sched/sch_cake.c
   1// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
   2
   3/* COMMON Applications Kept Enhanced (CAKE) discipline
   4 *
   5 * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
   6 * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
   7 * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
   8 * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
   9 * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
  10 * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
  11 *
  12 * The CAKE Principles:
  13 *                 (or, how to have your cake and eat it too)
  14 *
  15 * This is a combination of several shaping, AQM and FQ techniques into one
  16 * easy-to-use package:
  17 *
  18 * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
  19 *   equipment and bloated MACs.  This operates in deficit mode (as in sch_fq),
  20 *   eliminating the need for any sort of burst parameter (eg. token bucket
  21 *   depth).  Burst support is limited to that necessary to overcome scheduling
  22 *   latency.
  23 *
  24 * - A Diffserv-aware priority queue, giving more priority to certain classes,
  25 *   up to a specified fraction of bandwidth.  Above that bandwidth threshold,
  26 *   the priority is reduced to avoid starving other tins.
  27 *
  28 * - Each priority tin has a separate Flow Queue system, to isolate traffic
  29 *   flows from each other.  This prevents a burst on one flow from increasing
  30 *   the delay to another.  Flows are distributed to queues using a
  31 *   set-associative hash function.
  32 *
  33 * - Each queue is actively managed by Cobalt, which is a combination of the
  34 *   Codel and Blue AQM algorithms.  This serves flows fairly, and signals
  35 *   congestion early via ECN (if available) and/or packet drops, to keep
  36 *   latency low.  The codel parameters are auto-tuned based on the bandwidth
  37 *   setting, as is necessary at low bandwidths.
  38 *
  39 * The configuration parameters are kept deliberately simple for ease of use.
  40 * Everything has sane defaults.  Complete generality of configuration is *not*
  41 * a goal.
  42 *
  43 * The priority queue operates according to a weighted DRR scheme, combined with
  44 * a bandwidth tracker which reuses the shaper logic to detect which side of the
  45 * bandwidth sharing threshold the tin is operating.  This determines whether a
  46 * priority-based weight (high) or a bandwidth-based weight (low) is used for
  47 * that tin in the current pass.
  48 *
  49 * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
  50 * granted us permission to leverage.
  51 */
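/* Illustrative usage (not part of the kernel sources): a typical deployment
 * shapes to just below the physical link rate and enables the features
 * described above, along the lines of
 *
 *	tc qdisc replace dev eth0 root cake bandwidth 19500kbit diffserv3 nat
 *	tc -s qdisc show dev eth0
 *
 * Option names and defaults are documented in tc-cake(8); the commands above
 * are a sketch of common settings, not a recommendation.
 */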
  52
  53#include <linux/module.h>
  54#include <linux/types.h>
  55#include <linux/kernel.h>
  56#include <linux/jiffies.h>
  57#include <linux/string.h>
  58#include <linux/in.h>
  59#include <linux/errno.h>
  60#include <linux/init.h>
  61#include <linux/skbuff.h>
  62#include <linux/jhash.h>
  63#include <linux/slab.h>
  64#include <linux/vmalloc.h>
  65#include <linux/reciprocal_div.h>
  66#include <net/netlink.h>
  67#include <linux/if_vlan.h>
  68#include <net/pkt_sched.h>
  69#include <net/pkt_cls.h>
  70#include <net/tcp.h>
  71#include <net/flow_dissector.h>
  72
  73#if IS_ENABLED(CONFIG_NF_CONNTRACK)
  74#include <net/netfilter/nf_conntrack_core.h>
  75#endif
  76
  77#define CAKE_SET_WAYS (8)
  78#define CAKE_MAX_TINS (8)
  79#define CAKE_QUEUES (1024)
  80#define CAKE_FLOW_MASK 63
  81#define CAKE_FLOW_NAT_FLAG 64
  82
  83/* struct cobalt_params - contains codel and blue parameters
   84 * @interval:   codel initial drop interval (spacing between congestion signals)
   85 * @target:     maximum persistent sojourn time & minimum spacing of blue updates
  86 * @mtu_time:   serialisation delay of maximum-size packet
  87 * @p_inc:      increment of blue drop probability (0.32 fxp)
  88 * @p_dec:      decrement of blue drop probability (0.32 fxp)
  89 */
  90struct cobalt_params {
  91        u64     interval;
  92        u64     target;
  93        u64     mtu_time;
  94        u32     p_inc;
  95        u32     p_dec;
  96};
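/* The 0.32 fixed-point probabilities above map the full u32 range onto
 * [0, 1): p_fxp = p * 2^32, so e.g. 1 << 24 corresponds to 1/256 and ~0U is
 * just short of 1.0.  The BLUE drop decision then reduces to the single
 * comparison "prandom_u32() < p_drop" in cobalt_should_drop() below.
 */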
  97
  98/* struct cobalt_vars - contains codel and blue variables
  99 * @count:              codel dropping frequency
 100 * @rec_inv_sqrt:       reciprocal value of sqrt(count) >> 1
 101 * @drop_next:          time to drop next packet, or when we dropped last
 102 * @blue_timer:         Blue time to next drop
 103 * @p_drop:             BLUE drop probability (0.32 fxp)
 104 * @dropping:           set if in dropping state
 105 * @ecn_marked:         set if marked
 106 */
 107struct cobalt_vars {
 108        u32     count;
 109        u32     rec_inv_sqrt;
 110        ktime_t drop_next;
 111        ktime_t blue_timer;
 112        u32     p_drop;
 113        bool    dropping;
 114        bool    ecn_marked;
 115};
 116
 117enum {
 118        CAKE_SET_NONE = 0,
 119        CAKE_SET_SPARSE,
 120        CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
 121        CAKE_SET_BULK,
 122        CAKE_SET_DECAYING
 123};
 124
 125struct cake_flow {
 126        /* this stuff is all needed per-flow at dequeue time */
 127        struct sk_buff    *head;
 128        struct sk_buff    *tail;
 129        struct list_head  flowchain;
 130        s32               deficit;
 131        u32               dropped;
 132        struct cobalt_vars cvars;
 133        u16               srchost; /* index into cake_host table */
 134        u16               dsthost;
 135        u8                set;
 136}; /* please try to keep this structure <= 64 bytes */
 137
 138struct cake_host {
 139        u32 srchost_tag;
 140        u32 dsthost_tag;
 141        u16 srchost_refcnt;
 142        u16 dsthost_refcnt;
 143};
 144
 145struct cake_heap_entry {
 146        u16 t:3, b:10;
 147};
 148
 149struct cake_tin_data {
 150        struct cake_flow flows[CAKE_QUEUES];
 151        u32     backlogs[CAKE_QUEUES];
 152        u32     tags[CAKE_QUEUES]; /* for set association */
 153        u16     overflow_idx[CAKE_QUEUES];
 154        struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
 155        u16     flow_quantum;
 156
 157        struct cobalt_params cparams;
 158        u32     drop_overlimit;
 159        u16     bulk_flow_count;
 160        u16     sparse_flow_count;
 161        u16     decaying_flow_count;
 162        u16     unresponsive_flow_count;
 163
 164        u32     max_skblen;
 165
 166        struct list_head new_flows;
 167        struct list_head old_flows;
 168        struct list_head decaying_flows;
 169
 170        /* time_next = time_this + ((len * rate_ns) >> rate_shft) */
 171        ktime_t time_next_packet;
 172        u64     tin_rate_ns;
 173        u64     tin_rate_bps;
 174        u16     tin_rate_shft;
 175
 176        u16     tin_quantum_prio;
 177        u16     tin_quantum_band;
 178        s32     tin_deficit;
 179        u32     tin_backlog;
 180        u32     tin_dropped;
 181        u32     tin_ecn_mark;
 182
 183        u32     packets;
 184        u64     bytes;
 185
 186        u32     ack_drops;
 187
 188        /* moving averages */
 189        u64 avge_delay;
 190        u64 peak_delay;
 191        u64 base_delay;
 192
 193        /* hash function stats */
 194        u32     way_directs;
 195        u32     way_hits;
 196        u32     way_misses;
 197        u32     way_collisions;
 198}; /* number of tins is small, so size of this struct doesn't matter much */
 199
 200struct cake_sched_data {
 201        struct tcf_proto __rcu *filter_list; /* optional external classifier */
 202        struct tcf_block *block;
 203        struct cake_tin_data *tins;
 204
 205        struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
 206        u16             overflow_timeout;
 207
 208        u16             tin_cnt;
 209        u8              tin_mode;
 210        u8              flow_mode;
 211        u8              ack_filter;
 212        u8              atm_mode;
 213
 214        /* time_next = time_this + ((len * rate_ns) >> rate_shft) */
 215        u16             rate_shft;
 216        ktime_t         time_next_packet;
 217        ktime_t         failsafe_next_packet;
 218        u64             rate_ns;
 219        u64             rate_bps;
 220        u16             rate_flags;
 221        s16             rate_overhead;
 222        u16             rate_mpu;
 223        u64             interval;
 224        u64             target;
 225
 226        /* resource tracking */
 227        u32             buffer_used;
 228        u32             buffer_max_used;
 229        u32             buffer_limit;
 230        u32             buffer_config_limit;
 231
 232        /* indices for dequeue */
 233        u16             cur_tin;
 234        u16             cur_flow;
 235
 236        struct qdisc_watchdog watchdog;
 237        const u8        *tin_index;
 238        const u8        *tin_order;
 239
 240        /* bandwidth capacity estimate */
 241        ktime_t         last_packet_time;
 242        ktime_t         avg_window_begin;
 243        u64             avg_packet_interval;
 244        u64             avg_window_bytes;
 245        u64             avg_peak_bandwidth;
 246        ktime_t         last_reconfig_time;
 247
 248        /* packet length stats */
 249        u32             avg_netoff;
 250        u16             max_netlen;
 251        u16             max_adjlen;
 252        u16             min_netlen;
 253        u16             min_adjlen;
 254};
 255
 256enum {
 257        CAKE_FLAG_OVERHEAD         = BIT(0),
 258        CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
 259        CAKE_FLAG_INGRESS          = BIT(2),
 260        CAKE_FLAG_WASH             = BIT(3),
 261        CAKE_FLAG_SPLIT_GSO        = BIT(4)
 262};
 263
 264/* COBALT operates the Codel and BLUE algorithms in parallel, in order to
 265 * obtain the best features of each.  Codel is excellent on flows which
 266 * respond to congestion signals in a TCP-like way.  BLUE is more effective on
 267 * unresponsive flows.
 268 */
 269
 270struct cobalt_skb_cb {
 271        ktime_t enqueue_time;
 272        u32     adjusted_len;
 273};
 274
 275static u64 us_to_ns(u64 us)
 276{
 277        return us * NSEC_PER_USEC;
 278}
 279
 280static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
 281{
 282        qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
 283        return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
 284}
 285
 286static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
 287{
 288        return get_cobalt_cb(skb)->enqueue_time;
 289}
 290
 291static void cobalt_set_enqueue_time(struct sk_buff *skb,
 292                                    ktime_t now)
 293{
 294        get_cobalt_cb(skb)->enqueue_time = now;
 295}
 296
 297static u16 quantum_div[CAKE_QUEUES + 1] = {0};
 298
  299/* Diffserv lookup tables: index is the DSCP codepoint, value is the target tin */
 300
 301static const u8 precedence[] = {
 302        0, 0, 0, 0, 0, 0, 0, 0,
 303        1, 1, 1, 1, 1, 1, 1, 1,
 304        2, 2, 2, 2, 2, 2, 2, 2,
 305        3, 3, 3, 3, 3, 3, 3, 3,
 306        4, 4, 4, 4, 4, 4, 4, 4,
 307        5, 5, 5, 5, 5, 5, 5, 5,
 308        6, 6, 6, 6, 6, 6, 6, 6,
 309        7, 7, 7, 7, 7, 7, 7, 7,
 310};
 311
 312static const u8 diffserv8[] = {
 313        2, 5, 1, 2, 4, 2, 2, 2,
 314        0, 2, 1, 2, 1, 2, 1, 2,
 315        5, 2, 4, 2, 4, 2, 4, 2,
 316        3, 2, 3, 2, 3, 2, 3, 2,
 317        6, 2, 3, 2, 3, 2, 3, 2,
 318        6, 2, 2, 2, 6, 2, 6, 2,
 319        7, 2, 2, 2, 2, 2, 2, 2,
 320        7, 2, 2, 2, 2, 2, 2, 2,
 321};
 322
 323static const u8 diffserv4[] = {
 324        0, 2, 0, 0, 2, 0, 0, 0,
 325        1, 0, 0, 0, 0, 0, 0, 0,
 326        2, 0, 2, 0, 2, 0, 2, 0,
 327        2, 0, 2, 0, 2, 0, 2, 0,
 328        3, 0, 2, 0, 2, 0, 2, 0,
 329        3, 0, 0, 0, 3, 0, 3, 0,
 330        3, 0, 0, 0, 0, 0, 0, 0,
 331        3, 0, 0, 0, 0, 0, 0, 0,
 332};
 333
 334static const u8 diffserv3[] = {
 335        0, 0, 0, 0, 2, 0, 0, 0,
 336        1, 0, 0, 0, 0, 0, 0, 0,
 337        0, 0, 0, 0, 0, 0, 0, 0,
 338        0, 0, 0, 0, 0, 0, 0, 0,
 339        0, 0, 0, 0, 0, 0, 0, 0,
 340        0, 0, 0, 0, 2, 0, 2, 0,
 341        2, 0, 0, 0, 0, 0, 0, 0,
 342        2, 0, 0, 0, 0, 0, 0, 0,
 343};
 344
 345static const u8 besteffort[] = {
 346        0, 0, 0, 0, 0, 0, 0, 0,
 347        0, 0, 0, 0, 0, 0, 0, 0,
 348        0, 0, 0, 0, 0, 0, 0, 0,
 349        0, 0, 0, 0, 0, 0, 0, 0,
 350        0, 0, 0, 0, 0, 0, 0, 0,
 351        0, 0, 0, 0, 0, 0, 0, 0,
 352        0, 0, 0, 0, 0, 0, 0, 0,
 353        0, 0, 0, 0, 0, 0, 0, 0,
 354};
 355
 356/* tin priority order for stats dumping */
 357
 358static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
 359static const u8 bulk_order[] = {1, 0, 2, 3};
 360
 361#define REC_INV_SQRT_CACHE (16)
 362static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0};
 363
 364/* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
 365 * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
 366 *
 367 * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
 368 */
 369
 370static void cobalt_newton_step(struct cobalt_vars *vars)
 371{
 372        u32 invsqrt, invsqrt2;
 373        u64 val;
 374
 375        invsqrt = vars->rec_inv_sqrt;
 376        invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
 377        val = (3LL << 32) - ((u64)vars->count * invsqrt2);
 378
 379        val >>= 2; /* avoid overflow in following multiply */
 380        val = (val * invsqrt) >> (32 - 2 + 1);
 381
 382        vars->rec_inv_sqrt = val;
 383}
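/* Worked example of the Newton-Raphson step above: for count == 4 the true
 * reciprocal square root is 0.5, i.e. 0x80000000 in Q0.32.  Feeding that
 * value in gives invsqrt2 = 0x40000000 (0.25), val = 0x200000000
 * (3 - 4 * 0.25 = 2.0 in Q0.32), and after the two shifts the result is
 * again 0x80000000: 0.5 is a fixed point of the iteration, and nearby
 * estimates converge towards it quadratically.
 */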
 384
 385static void cobalt_invsqrt(struct cobalt_vars *vars)
 386{
 387        if (vars->count < REC_INV_SQRT_CACHE)
 388                vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
 389        else
 390                cobalt_newton_step(vars);
 391}
 392
 393/* There is a big difference in timing between the accurate values placed in
 394 * the cache and the approximations given by a single Newton step for small
 395 * count values, particularly when stepping from count 1 to 2 or vice versa.
 396 * Above 16, a single Newton step gives sufficient accuracy in either
 397 * direction, given the precision stored.
 398 *
 399 * The magnitude of the error when stepping up to count 2 is such as to give
 400 * the value that *should* have been produced at count 4.
 401 */
 402
 403static void cobalt_cache_init(void)
 404{
 405        struct cobalt_vars v;
 406
 407        memset(&v, 0, sizeof(v));
 408        v.rec_inv_sqrt = ~0U;
 409        cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt;
 410
 411        for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) {
 412                cobalt_newton_step(&v);
 413                cobalt_newton_step(&v);
 414                cobalt_newton_step(&v);
 415                cobalt_newton_step(&v);
 416
 417                cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt;
 418        }
 419}
 420
 421static void cobalt_vars_init(struct cobalt_vars *vars)
 422{
 423        memset(vars, 0, sizeof(*vars));
 424
 425        if (!cobalt_rec_inv_sqrt_cache[0]) {
 426                cobalt_cache_init();
 427                cobalt_rec_inv_sqrt_cache[0] = ~0;
 428        }
 429}
 430
 431/* CoDel control_law is t + interval/sqrt(count)
 432 * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
 433 * both sqrt() and divide operation.
 434 */
 435static ktime_t cobalt_control(ktime_t t,
 436                              u64 interval,
 437                              u32 rec_inv_sqrt)
 438{
 439        return ktime_add_ns(t, reciprocal_scale(interval,
 440                                                rec_inv_sqrt));
 441}
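/* Worked example: with interval = 100 ms (100,000,000 ns) and count == 4,
 * rec_inv_sqrt is ~0x80000000 (0.5 in Q0.32), so reciprocal_scale() returns
 * ~50,000,000 ns and the next signal is scheduled 50 ms after t, i.e. at
 * t + interval / sqrt(count).
 */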
 442
 443/* Call this when a packet had to be dropped due to queue overflow.  Returns
 444 * true if the BLUE state was quiescent before but active after this call.
 445 */
 446static bool cobalt_queue_full(struct cobalt_vars *vars,
 447                              struct cobalt_params *p,
 448                              ktime_t now)
 449{
 450        bool up = false;
 451
 452        if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
 453                up = !vars->p_drop;
 454                vars->p_drop += p->p_inc;
 455                if (vars->p_drop < p->p_inc)
 456                        vars->p_drop = ~0;
 457                vars->blue_timer = now;
 458        }
 459        vars->dropping = true;
 460        vars->drop_next = now;
 461        if (!vars->count)
 462                vars->count = 1;
 463
 464        return up;
 465}
 466
 467/* Call this when the queue was serviced but turned out to be empty.  Returns
 468 * true if the BLUE state was active before but quiescent after this call.
 469 */
 470static bool cobalt_queue_empty(struct cobalt_vars *vars,
 471                               struct cobalt_params *p,
 472                               ktime_t now)
 473{
 474        bool down = false;
 475
 476        if (vars->p_drop &&
 477            ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
 478                if (vars->p_drop < p->p_dec)
 479                        vars->p_drop = 0;
 480                else
 481                        vars->p_drop -= p->p_dec;
 482                vars->blue_timer = now;
 483                down = !vars->p_drop;
 484        }
 485        vars->dropping = false;
 486
 487        if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
 488                vars->count--;
 489                cobalt_invsqrt(vars);
 490                vars->drop_next = cobalt_control(vars->drop_next,
 491                                                 p->interval,
 492                                                 vars->rec_inv_sqrt);
 493        }
 494
 495        return down;
 496}
 497
 498/* Call this with a freshly dequeued packet for possible congestion marking.
 499 * Returns true as an instruction to drop the packet, false for delivery.
 500 */
 501static bool cobalt_should_drop(struct cobalt_vars *vars,
 502                               struct cobalt_params *p,
 503                               ktime_t now,
 504                               struct sk_buff *skb,
 505                               u32 bulk_flows)
 506{
 507        bool next_due, over_target, drop = false;
 508        ktime_t schedule;
 509        u64 sojourn;
 510
 511/* The 'schedule' variable records, in its sign, whether 'now' is before or
 512 * after 'drop_next'.  This allows 'drop_next' to be updated before the next
 513 * scheduling decision is actually branched, without destroying that
 514 * information.  Similarly, the first 'schedule' value calculated is preserved
 515 * in the boolean 'next_due'.
 516 *
 517 * As for 'drop_next', we take advantage of the fact that 'interval' is both
 518 * the delay between first exceeding 'target' and the first signalling event,
 519 * *and* the scaling factor for the signalling frequency.  It's therefore very
 520 * natural to use a single mechanism for both purposes, and eliminates a
 521 * significant amount of reference Codel's spaghetti code.  To help with this,
 522 * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
 523 * as possible to 1.0 in fixed-point.
 524 */
 525
 526        sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
 527        schedule = ktime_sub(now, vars->drop_next);
 528        over_target = sojourn > p->target &&
 529                      sojourn > p->mtu_time * bulk_flows * 2 &&
 530                      sojourn > p->mtu_time * 4;
 531        next_due = vars->count && ktime_to_ns(schedule) >= 0;
 532
 533        vars->ecn_marked = false;
 534
 535        if (over_target) {
 536                if (!vars->dropping) {
 537                        vars->dropping = true;
 538                        vars->drop_next = cobalt_control(now,
 539                                                         p->interval,
 540                                                         vars->rec_inv_sqrt);
 541                }
 542                if (!vars->count)
 543                        vars->count = 1;
 544        } else if (vars->dropping) {
 545                vars->dropping = false;
 546        }
 547
 548        if (next_due && vars->dropping) {
 549                /* Use ECN mark if possible, otherwise drop */
 550                drop = !(vars->ecn_marked = INET_ECN_set_ce(skb));
 551
 552                vars->count++;
 553                if (!vars->count)
 554                        vars->count--;
 555                cobalt_invsqrt(vars);
 556                vars->drop_next = cobalt_control(vars->drop_next,
 557                                                 p->interval,
 558                                                 vars->rec_inv_sqrt);
 559                schedule = ktime_sub(now, vars->drop_next);
 560        } else {
 561                while (next_due) {
 562                        vars->count--;
 563                        cobalt_invsqrt(vars);
 564                        vars->drop_next = cobalt_control(vars->drop_next,
 565                                                         p->interval,
 566                                                         vars->rec_inv_sqrt);
 567                        schedule = ktime_sub(now, vars->drop_next);
 568                        next_due = vars->count && ktime_to_ns(schedule) >= 0;
 569                }
 570        }
 571
 572        /* Simple BLUE implementation.  Lack of ECN is deliberate. */
 573        if (vars->p_drop)
 574                drop |= (prandom_u32() < vars->p_drop);
 575
 576        /* Overload the drop_next field as an activity timeout */
 577        if (!vars->count)
 578                vars->drop_next = ktime_add_ns(now, p->interval);
 579        else if (ktime_to_ns(schedule) > 0 && !drop)
 580                vars->drop_next = now;
 581
 582        return drop;
 583}
 584
 585static void cake_update_flowkeys(struct flow_keys *keys,
 586                                 const struct sk_buff *skb)
 587{
 588#if IS_ENABLED(CONFIG_NF_CONNTRACK)
 589        struct nf_conntrack_tuple tuple = {};
 590        bool rev = !skb->_nfct;
 591
 592        if (tc_skb_protocol(skb) != htons(ETH_P_IP))
 593                return;
 594
 595        if (!nf_ct_get_tuple_skb(&tuple, skb))
 596                return;
 597
 598        keys->addrs.v4addrs.src = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
 599        keys->addrs.v4addrs.dst = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
 600
 601        if (keys->ports.ports) {
 602                keys->ports.src = rev ? tuple.dst.u.all : tuple.src.u.all;
 603                keys->ports.dst = rev ? tuple.src.u.all : tuple.dst.u.all;
 604        }
 605#endif
 606}
 607
  608/* Cake's flow modes use overlapping bit patterns: triple-isolate sets both the
  609 * dual-srchost and dual-dsthost bits, so the dual-mode tests below also match it.
 610 */
 611
 612static bool cake_dsrc(int flow_mode)
 613{
 614        return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
 615}
 616
 617static bool cake_ddst(int flow_mode)
 618{
 619        return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
 620}
 621
 622static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
 623                     int flow_mode, u16 flow_override, u16 host_override)
 624{
 625        u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0;
 626        u16 reduced_hash, srchost_idx, dsthost_idx;
 627        struct flow_keys keys, host_keys;
 628
 629        if (unlikely(flow_mode == CAKE_FLOW_NONE))
 630                return 0;
 631
 632        /* If both overrides are set we can skip packet dissection entirely */
 633        if ((flow_override || !(flow_mode & CAKE_FLOW_FLOWS)) &&
 634            (host_override || !(flow_mode & CAKE_FLOW_HOSTS)))
 635                goto skip_hash;
 636
 637        skb_flow_dissect_flow_keys(skb, &keys,
 638                                   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
 639
 640        if (flow_mode & CAKE_FLOW_NAT_FLAG)
 641                cake_update_flowkeys(&keys, skb);
 642
 643        /* flow_hash_from_keys() sorts the addresses by value, so we have
 644         * to preserve their order in a separate data structure to treat
 645         * src and dst host addresses as independently selectable.
 646         */
 647        host_keys = keys;
 648        host_keys.ports.ports     = 0;
 649        host_keys.basic.ip_proto  = 0;
 650        host_keys.keyid.keyid     = 0;
 651        host_keys.tags.flow_label = 0;
 652
 653        switch (host_keys.control.addr_type) {
 654        case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
 655                host_keys.addrs.v4addrs.src = 0;
 656                dsthost_hash = flow_hash_from_keys(&host_keys);
 657                host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
 658                host_keys.addrs.v4addrs.dst = 0;
 659                srchost_hash = flow_hash_from_keys(&host_keys);
 660                break;
 661
 662        case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
 663                memset(&host_keys.addrs.v6addrs.src, 0,
 664                       sizeof(host_keys.addrs.v6addrs.src));
 665                dsthost_hash = flow_hash_from_keys(&host_keys);
 666                host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
 667                memset(&host_keys.addrs.v6addrs.dst, 0,
 668                       sizeof(host_keys.addrs.v6addrs.dst));
 669                srchost_hash = flow_hash_from_keys(&host_keys);
 670                break;
 671
 672        default:
 673                dsthost_hash = 0;
 674                srchost_hash = 0;
 675        }
 676
 677        /* This *must* be after the above switch, since as a
 678         * side-effect it sorts the src and dst addresses.
 679         */
 680        if (flow_mode & CAKE_FLOW_FLOWS)
 681                flow_hash = flow_hash_from_keys(&keys);
 682
 683skip_hash:
 684        if (flow_override)
 685                flow_hash = flow_override - 1;
 686        if (host_override) {
 687                dsthost_hash = host_override - 1;
 688                srchost_hash = host_override - 1;
 689        }
 690
 691        if (!(flow_mode & CAKE_FLOW_FLOWS)) {
 692                if (flow_mode & CAKE_FLOW_SRC_IP)
 693                        flow_hash ^= srchost_hash;
 694
 695                if (flow_mode & CAKE_FLOW_DST_IP)
 696                        flow_hash ^= dsthost_hash;
 697        }
 698
 699        reduced_hash = flow_hash % CAKE_QUEUES;
 700
 701        /* set-associative hashing */
 702        /* fast path if no hash collision (direct lookup succeeds) */
 703        if (likely(q->tags[reduced_hash] == flow_hash &&
 704                   q->flows[reduced_hash].set)) {
 705                q->way_directs++;
 706        } else {
 707                u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
 708                u32 outer_hash = reduced_hash - inner_hash;
 709                bool allocate_src = false;
 710                bool allocate_dst = false;
 711                u32 i, k;
 712
 713                /* check if any active queue in the set is reserved for
 714                 * this flow.
 715                 */
 716                for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
 717                     i++, k = (k + 1) % CAKE_SET_WAYS) {
 718                        if (q->tags[outer_hash + k] == flow_hash) {
 719                                if (i)
 720                                        q->way_hits++;
 721
 722                                if (!q->flows[outer_hash + k].set) {
 723                                        /* need to increment host refcnts */
 724                                        allocate_src = cake_dsrc(flow_mode);
 725                                        allocate_dst = cake_ddst(flow_mode);
 726                                }
 727
 728                                goto found;
 729                        }
 730                }
 731
 732                /* no queue is reserved for this flow, look for an
 733                 * empty one.
 734                 */
 735                for (i = 0; i < CAKE_SET_WAYS;
 736                         i++, k = (k + 1) % CAKE_SET_WAYS) {
 737                        if (!q->flows[outer_hash + k].set) {
 738                                q->way_misses++;
 739                                allocate_src = cake_dsrc(flow_mode);
 740                                allocate_dst = cake_ddst(flow_mode);
 741                                goto found;
 742                        }
 743                }
 744
 745                /* With no empty queues, default to the original
 746                 * queue, accept the collision, update the host tags.
 747                 */
 748                q->way_collisions++;
 749                q->hosts[q->flows[reduced_hash].srchost].srchost_refcnt--;
 750                q->hosts[q->flows[reduced_hash].dsthost].dsthost_refcnt--;
 751                allocate_src = cake_dsrc(flow_mode);
 752                allocate_dst = cake_ddst(flow_mode);
 753found:
 754                /* reserve queue for future packets in same flow */
 755                reduced_hash = outer_hash + k;
 756                q->tags[reduced_hash] = flow_hash;
 757
 758                if (allocate_src) {
 759                        srchost_idx = srchost_hash % CAKE_QUEUES;
 760                        inner_hash = srchost_idx % CAKE_SET_WAYS;
 761                        outer_hash = srchost_idx - inner_hash;
 762                        for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
 763                                i++, k = (k + 1) % CAKE_SET_WAYS) {
 764                                if (q->hosts[outer_hash + k].srchost_tag ==
 765                                    srchost_hash)
 766                                        goto found_src;
 767                        }
 768                        for (i = 0; i < CAKE_SET_WAYS;
 769                                i++, k = (k + 1) % CAKE_SET_WAYS) {
 770                                if (!q->hosts[outer_hash + k].srchost_refcnt)
 771                                        break;
 772                        }
 773                        q->hosts[outer_hash + k].srchost_tag = srchost_hash;
 774found_src:
 775                        srchost_idx = outer_hash + k;
 776                        q->hosts[srchost_idx].srchost_refcnt++;
 777                        q->flows[reduced_hash].srchost = srchost_idx;
 778                }
 779
 780                if (allocate_dst) {
 781                        dsthost_idx = dsthost_hash % CAKE_QUEUES;
 782                        inner_hash = dsthost_idx % CAKE_SET_WAYS;
 783                        outer_hash = dsthost_idx - inner_hash;
 784                        for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
 785                             i++, k = (k + 1) % CAKE_SET_WAYS) {
 786                                if (q->hosts[outer_hash + k].dsthost_tag ==
 787                                    dsthost_hash)
 788                                        goto found_dst;
 789                        }
 790                        for (i = 0; i < CAKE_SET_WAYS;
 791                             i++, k = (k + 1) % CAKE_SET_WAYS) {
 792                                if (!q->hosts[outer_hash + k].dsthost_refcnt)
 793                                        break;
 794                        }
 795                        q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
 796found_dst:
 797                        dsthost_idx = outer_hash + k;
 798                        q->hosts[dsthost_idx].dsthost_refcnt++;
 799                        q->flows[reduced_hash].dsthost = dsthost_idx;
 800                }
 801        }
 802
 803        return reduced_hash;
 804}
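/* Minimal sketch of the set/way decomposition performed above (the helper
 * name is illustrative and not part of the original code): with
 * CAKE_QUEUES == 1024 and CAKE_SET_WAYS == 8, the flow array behaves as 128
 * sets of 8 consecutive queues, and a collision search never probes more
 * than the 8 ways of one set.
 */
static inline u32 cake_example_set_base(u32 reduced_hash)
{
	/* round down to the first queue of the 8-way set */
	return reduced_hash - (reduced_hash % CAKE_SET_WAYS);
}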
 805
  806/* helper functions: might be changed when/if skb uses a standard list_head */
 807/* remove one skb from head of slot queue */
 808
 809static struct sk_buff *dequeue_head(struct cake_flow *flow)
 810{
 811        struct sk_buff *skb = flow->head;
 812
 813        if (skb) {
 814                flow->head = skb->next;
 815                skb->next = NULL;
 816        }
 817
 818        return skb;
 819}
 820
 821/* add skb to flow queue (tail add) */
 822
 823static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
 824{
 825        if (!flow->head)
 826                flow->head = skb;
 827        else
 828                flow->tail->next = skb;
 829        flow->tail = skb;
 830        skb->next = NULL;
 831}
 832
 833static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
 834                                    struct ipv6hdr *buf)
 835{
 836        unsigned int offset = skb_network_offset(skb);
 837        struct iphdr *iph;
 838
 839        iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);
 840
 841        if (!iph)
 842                return NULL;
 843
 844        if (iph->version == 4 && iph->protocol == IPPROTO_IPV6)
 845                return skb_header_pointer(skb, offset + iph->ihl * 4,
 846                                          sizeof(struct ipv6hdr), buf);
 847
 848        else if (iph->version == 4)
 849                return iph;
 850
 851        else if (iph->version == 6)
 852                return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
 853                                          buf);
 854
 855        return NULL;
 856}
 857
 858static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
 859                                      void *buf, unsigned int bufsize)
 860{
 861        unsigned int offset = skb_network_offset(skb);
 862        const struct ipv6hdr *ipv6h;
 863        const struct tcphdr *tcph;
 864        const struct iphdr *iph;
 865        struct ipv6hdr _ipv6h;
 866        struct tcphdr _tcph;
 867
 868        ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
 869
 870        if (!ipv6h)
 871                return NULL;
 872
 873        if (ipv6h->version == 4) {
 874                iph = (struct iphdr *)ipv6h;
 875                offset += iph->ihl * 4;
 876
 877                /* special-case 6in4 tunnelling, as that is a common way to get
 878                 * v6 connectivity in the home
 879                 */
 880                if (iph->protocol == IPPROTO_IPV6) {
 881                        ipv6h = skb_header_pointer(skb, offset,
 882                                                   sizeof(_ipv6h), &_ipv6h);
 883
 884                        if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
 885                                return NULL;
 886
 887                        offset += sizeof(struct ipv6hdr);
 888
 889                } else if (iph->protocol != IPPROTO_TCP) {
 890                        return NULL;
 891                }
 892
 893        } else if (ipv6h->version == 6) {
 894                if (ipv6h->nexthdr != IPPROTO_TCP)
 895                        return NULL;
 896
 897                offset += sizeof(struct ipv6hdr);
 898        } else {
 899                return NULL;
 900        }
 901
 902        tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
 903        if (!tcph)
 904                return NULL;
 905
 906        return skb_header_pointer(skb, offset,
 907                                  min(__tcp_hdrlen(tcph), bufsize), buf);
 908}
 909
 910static const void *cake_get_tcpopt(const struct tcphdr *tcph,
 911                                   int code, int *oplen)
 912{
 913        /* inspired by tcp_parse_options in tcp_input.c */
 914        int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
 915        const u8 *ptr = (const u8 *)(tcph + 1);
 916
 917        while (length > 0) {
 918                int opcode = *ptr++;
 919                int opsize;
 920
 921                if (opcode == TCPOPT_EOL)
 922                        break;
 923                if (opcode == TCPOPT_NOP) {
 924                        length--;
 925                        continue;
 926                }
 927                opsize = *ptr++;
 928                if (opsize < 2 || opsize > length)
 929                        break;
 930
 931                if (opcode == code) {
 932                        *oplen = opsize;
 933                        return ptr;
 934                }
 935
 936                ptr += opsize - 2;
 937                length -= opsize;
 938        }
 939
 940        return NULL;
 941}
 942
 943/* Compare two SACK sequences. A sequence is considered greater if it SACKs more
  944 * bytes than the other. In the case where both sequences ACK bytes that the
  945 * other doesn't, A is considered greater. DSACKs in A also make A
  946 * considered greater.
 947 *
 948 * @return -1, 0 or 1 as normal compare functions
 949 */
 950static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
 951                                  const struct tcphdr *tcph_b)
 952{
 953        const struct tcp_sack_block_wire *sack_a, *sack_b;
 954        u32 ack_seq_a = ntohl(tcph_a->ack_seq);
 955        u32 bytes_a = 0, bytes_b = 0;
 956        int oplen_a, oplen_b;
 957        bool first = true;
 958
 959        sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
 960        sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);
 961
 962        /* pointers point to option contents */
 963        oplen_a -= TCPOLEN_SACK_BASE;
 964        oplen_b -= TCPOLEN_SACK_BASE;
 965
 966        if (sack_a && oplen_a >= sizeof(*sack_a) &&
 967            (!sack_b || oplen_b < sizeof(*sack_b)))
 968                return -1;
 969        else if (sack_b && oplen_b >= sizeof(*sack_b) &&
 970                 (!sack_a || oplen_a < sizeof(*sack_a)))
 971                return 1;
 972        else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
 973                 (!sack_b || oplen_b < sizeof(*sack_b)))
 974                return 0;
 975
 976        while (oplen_a >= sizeof(*sack_a)) {
 977                const struct tcp_sack_block_wire *sack_tmp = sack_b;
 978                u32 start_a = get_unaligned_be32(&sack_a->start_seq);
 979                u32 end_a = get_unaligned_be32(&sack_a->end_seq);
 980                int oplen_tmp = oplen_b;
 981                bool found = false;
 982
 983                /* DSACK; always considered greater to prevent dropping */
 984                if (before(start_a, ack_seq_a))
 985                        return -1;
 986
 987                bytes_a += end_a - start_a;
 988
 989                while (oplen_tmp >= sizeof(*sack_tmp)) {
 990                        u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
 991                        u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);
 992
 993                        /* first time through we count the total size */
 994                        if (first)
 995                                bytes_b += end_b - start_b;
 996
 997                        if (!after(start_b, start_a) && !before(end_b, end_a)) {
 998                                found = true;
 999                                if (!first)
1000                                        break;
1001                        }
1002                        oplen_tmp -= sizeof(*sack_tmp);
1003                        sack_tmp++;
1004                }
1005
1006                if (!found)
1007                        return -1;
1008
1009                oplen_a -= sizeof(*sack_a);
1010                sack_a++;
1011                first = false;
1012        }
1013
1014        /* If we made it this far, all ranges SACKed by A are covered by B, so
1015         * either the SACKs are equal, or B SACKs more bytes.
1016         */
1017        return bytes_b > bytes_a ? 1 : 0;
1018}
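/* Worked example of the comparison above, assuming the cumulative ACK lies
 * below both ranges so the DSACK shortcut does not trigger: if A SACKs
 * [1000,2000) and B SACKs [1000,3000), every range in A is covered by B and
 * B covers more bytes, so the function returns 1.  If A additionally SACKs
 * [5000,6000), which B does not cover, the inner search fails and the
 * function returns -1.
 */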
1019
1020static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
1021                                 u32 *tsval, u32 *tsecr)
1022{
1023        const u8 *ptr;
1024        int opsize;
1025
1026        ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);
1027
1028        if (ptr && opsize == TCPOLEN_TIMESTAMP) {
1029                *tsval = get_unaligned_be32(ptr);
1030                *tsecr = get_unaligned_be32(ptr + 4);
1031        }
1032}
1033
1034static bool cake_tcph_may_drop(const struct tcphdr *tcph,
1035                               u32 tstamp_new, u32 tsecr_new)
1036{
1037        /* inspired by tcp_parse_options in tcp_input.c */
1038        int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1039        const u8 *ptr = (const u8 *)(tcph + 1);
1040        u32 tstamp, tsecr;
1041
1042        /* 3 reserved flags must be unset to avoid future breakage
1043         * ACK must be set
1044         * ECE/CWR are handled separately
1045         * All other flags URG/PSH/RST/SYN/FIN must be unset
1046         * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
1047         * 0x00C00000 = CWR/ECE (handled separately)
1048         * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
1049         */
1050        if (((tcp_flag_word(tcph) &
1051              cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
1052                return false;
1053
1054        while (length > 0) {
1055                int opcode = *ptr++;
1056                int opsize;
1057
1058                if (opcode == TCPOPT_EOL)
1059                        break;
1060                if (opcode == TCPOPT_NOP) {
1061                        length--;
1062                        continue;
1063                }
1064                opsize = *ptr++;
1065                if (opsize < 2 || opsize > length)
1066                        break;
1067
1068                switch (opcode) {
1069                case TCPOPT_MD5SIG: /* doesn't influence state */
1070                        break;
1071
1072                case TCPOPT_SACK: /* stricter checking performed later */
1073                        if (opsize % 8 != 2)
1074                                return false;
1075                        break;
1076
1077                case TCPOPT_TIMESTAMP:
1078                        /* only drop timestamps lower than new */
1079                        if (opsize != TCPOLEN_TIMESTAMP)
1080                                return false;
1081                        tstamp = get_unaligned_be32(ptr);
1082                        tsecr = get_unaligned_be32(ptr + 4);
1083                        if (after(tstamp, tstamp_new) ||
1084                            after(tsecr, tsecr_new))
1085                                return false;
1086                        break;
1087
1088                case TCPOPT_MSS:  /* these should only be set on SYN */
1089                case TCPOPT_WINDOW:
1090                case TCPOPT_SACK_PERM:
1091                case TCPOPT_FASTOPEN:
1092                case TCPOPT_EXP:
1093                default: /* don't drop if any unknown options are present */
1094                        return false;
1095                }
1096
1097                ptr += opsize - 2;
1098                length -= opsize;
1099        }
1100
1101        return true;
1102}
1103
1104static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
1105                                       struct cake_flow *flow)
1106{
1107        bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE;
1108        struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
1109        struct sk_buff *skb_check, *skb_prev = NULL;
1110        const struct ipv6hdr *ipv6h, *ipv6h_check;
1111        unsigned char _tcph[64], _tcph_check[64];
1112        const struct tcphdr *tcph, *tcph_check;
1113        const struct iphdr *iph, *iph_check;
1114        struct ipv6hdr _iph, _iph_check;
1115        const struct sk_buff *skb;
1116        int seglen, num_found = 0;
1117        u32 tstamp = 0, tsecr = 0;
1118        __be32 elig_flags = 0;
1119        int sack_comp;
1120
1121        /* no other possible ACKs to filter */
1122        if (flow->head == flow->tail)
1123                return NULL;
1124
1125        skb = flow->tail;
1126        tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
1127        iph = cake_get_iphdr(skb, &_iph);
1128        if (!tcph)
1129                return NULL;
1130
1131        cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);
1132
1133        /* the 'triggering' packet need only have the ACK flag set.
 1134         * Also check that SYN is not set, as there won't be any previous ACKs.
1135         */
1136        if ((tcp_flag_word(tcph) &
1137             (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
1138                return NULL;
1139
1140        /* the 'triggering' ACK is at the tail of the queue, we have already
1141         * returned if it is the only packet in the flow. loop through the rest
1142         * of the queue looking for pure ACKs with the same 5-tuple as the
1143         * triggering one.
1144         */
1145        for (skb_check = flow->head;
1146             skb_check && skb_check != skb;
1147             skb_prev = skb_check, skb_check = skb_check->next) {
1148                iph_check = cake_get_iphdr(skb_check, &_iph_check);
1149                tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
1150                                             sizeof(_tcph_check));
1151
 1152                /* only TCP packets with a matching 5-tuple are eligible, and
 1153                 * only those whose headers can safely be dropped
 1154                 */
1155                if (!tcph_check || iph->version != iph_check->version ||
1156                    tcph_check->source != tcph->source ||
1157                    tcph_check->dest != tcph->dest)
1158                        continue;
1159
1160                if (iph_check->version == 4) {
1161                        if (iph_check->saddr != iph->saddr ||
1162                            iph_check->daddr != iph->daddr)
1163                                continue;
1164
1165                        seglen = ntohs(iph_check->tot_len) -
1166                                       (4 * iph_check->ihl);
1167                } else if (iph_check->version == 6) {
1168                        ipv6h = (struct ipv6hdr *)iph;
1169                        ipv6h_check = (struct ipv6hdr *)iph_check;
1170
1171                        if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
1172                            ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
1173                                continue;
1174
1175                        seglen = ntohs(ipv6h_check->payload_len);
1176                } else {
1177                        WARN_ON(1);  /* shouldn't happen */
1178                        continue;
1179                }
1180
1181                /* If the ECE/CWR flags changed from the previous eligible
1182                 * packet in the same flow, we should no longer be dropping that
1183                 * previous packet as this would lose information.
1184                 */
1185                if (elig_ack && (tcp_flag_word(tcph_check) &
1186                                 (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
1187                        elig_ack = NULL;
1188                        elig_ack_prev = NULL;
1189                        num_found--;
1190                }
1191
1192                /* Check TCP options and flags, don't drop ACKs with segment
1193                 * data, and don't drop ACKs with a higher cumulative ACK
1194                 * counter than the triggering packet. Check ACK seqno here to
1195                 * avoid parsing SACK options of packets we are going to exclude
1196                 * anyway.
1197                 */
1198                if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
1199                    (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
1200                    after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
1201                        continue;
1202
1203                /* Check SACK options. The triggering packet must SACK more data
1204                 * than the ACK under consideration, or SACK the same range but
1205                 * have a larger cumulative ACK counter. The latter is a
1206                 * pathological case, but is contained in the following check
1207                 * anyway, just to be safe.
1208                 */
1209                sack_comp = cake_tcph_sack_compare(tcph_check, tcph);
1210
1211                if (sack_comp < 0 ||
1212                    (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
1213                     sack_comp == 0))
1214                        continue;
1215
1216                /* At this point we have found an eligible pure ACK to drop; if
1217                 * we are in aggressive mode, we are done. Otherwise, keep
1218                 * searching unless this is the second eligible ACK we
1219                 * found.
1220                 *
1221                 * Since we want to drop ACK closest to the head of the queue,
1222                 * save the first eligible ACK we find, even if we need to loop
1223                 * again.
1224                 */
1225                if (!elig_ack) {
1226                        elig_ack = skb_check;
1227                        elig_ack_prev = skb_prev;
1228                        elig_flags = (tcp_flag_word(tcph_check)
1229                                      & (TCP_FLAG_ECE | TCP_FLAG_CWR));
1230                }
1231
1232                if (num_found++ > 0)
1233                        goto found;
1234        }
1235
 1236        /* We made it through the queue without finding two eligible ACKs. If
1237         * we found a single eligible ACK we can drop it in aggressive mode if
1238         * we can guarantee that this does not interfere with ECN flag
1239         * information. We ensure this by dropping it only if the enqueued
1240         * packet is consecutive with the eligible ACK, and their flags match.
1241         */
1242        if (elig_ack && aggressive && elig_ack->next == skb &&
1243            (elig_flags == (tcp_flag_word(tcph) &
1244                            (TCP_FLAG_ECE | TCP_FLAG_CWR))))
1245                goto found;
1246
1247        return NULL;
1248
1249found:
1250        if (elig_ack_prev)
1251                elig_ack_prev->next = elig_ack->next;
1252        else
1253                flow->head = elig_ack->next;
1254
1255        elig_ack->next = NULL;
1256
1257        return elig_ack;
1258}
1259
1260static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
1261{
1262        avg -= avg >> shift;
1263        avg += sample >> shift;
1264        return avg;
1265}
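/* cake_ewma() computes avg * (1 - 2^-shift) + sample * 2^-shift, i.e. an
 * exponentially weighted moving average in which each new sample carries a
 * weight of 1/2^shift.  For example, with shift == 8 and avg == 25600, a
 * sample of 0 pulls the average down by 100 to 25500.
 */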
1266
1267static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off)
1268{
1269        if (q->rate_flags & CAKE_FLAG_OVERHEAD)
1270                len -= off;
1271
1272        if (q->max_netlen < len)
1273                q->max_netlen = len;
1274        if (q->min_netlen > len)
1275                q->min_netlen = len;
1276
1277        len += q->rate_overhead;
1278
1279        if (len < q->rate_mpu)
1280                len = q->rate_mpu;
1281
1282        if (q->atm_mode == CAKE_ATM_ATM) {
1283                len += 47;
1284                len /= 48;
1285                len *= 53;
1286        } else if (q->atm_mode == CAKE_ATM_PTM) {
1287                /* Add one byte per 64 bytes or part thereof.
1288                 * This is conservative and easier to calculate than the
1289                 * precise value.
1290                 */
1291                len += (len + 63) / 64;
1292        }
1293
1294        if (q->max_adjlen < len)
1295                q->max_adjlen = len;
1296        if (q->min_adjlen > len)
1297                q->min_adjlen = len;
1298
1299        return len;
1300}
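/* Minimal sketch of the ATM cell framing applied above (the helper name is
 * illustrative and not part of the original code): every 48 bytes of the
 * adjusted length, or part thereof, occupies one 53-byte cell, so an
 * adjusted length of 1500 bytes is charged as 32 cells = 1696 bytes.  Under
 * the PTM rule the same packet is charged 1500 + 24 = 1524 bytes instead.
 */
static inline u32 cake_example_atm_len(u32 adjusted_len)
{
	return DIV_ROUND_UP(adjusted_len, 48) * 53;
}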
1301
1302static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
1303{
1304        const struct skb_shared_info *shinfo = skb_shinfo(skb);
1305        unsigned int hdr_len, last_len = 0;
1306        u32 off = skb_network_offset(skb);
1307        u32 len = qdisc_pkt_len(skb);
1308        u16 segs = 1;
1309
1310        q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
1311
1312        if (!shinfo->gso_size)
1313                return cake_calc_overhead(q, len, off);
1314
1315        /* borrowed from qdisc_pkt_len_init() */
1316        hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
1317
1318        /* + transport layer */
1319        if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
1320                                                SKB_GSO_TCPV6))) {
1321                const struct tcphdr *th;
1322                struct tcphdr _tcphdr;
1323
1324                th = skb_header_pointer(skb, skb_transport_offset(skb),
1325                                        sizeof(_tcphdr), &_tcphdr);
1326                if (likely(th))
1327                        hdr_len += __tcp_hdrlen(th);
1328        } else {
1329                struct udphdr _udphdr;
1330
1331                if (skb_header_pointer(skb, skb_transport_offset(skb),
1332                                       sizeof(_udphdr), &_udphdr))
1333                        hdr_len += sizeof(struct udphdr);
1334        }
1335
1336        if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
1337                segs = DIV_ROUND_UP(skb->len - hdr_len,
1338                                    shinfo->gso_size);
1339        else
1340                segs = shinfo->gso_segs;
1341
1342        len = shinfo->gso_size + hdr_len;
1343        last_len = skb->len - shinfo->gso_size * (segs - 1);
1344
1345        return (cake_calc_overhead(q, len, off) * (segs - 1) +
1346                cake_calc_overhead(q, last_len, off));
1347}
1348
1349static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
1350{
1351        struct cake_heap_entry ii = q->overflow_heap[i];
1352        struct cake_heap_entry jj = q->overflow_heap[j];
1353
1354        q->overflow_heap[i] = jj;
1355        q->overflow_heap[j] = ii;
1356
1357        q->tins[ii.t].overflow_idx[ii.b] = j;
1358        q->tins[jj.t].overflow_idx[jj.b] = i;
1359}
1360
1361static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
1362{
1363        struct cake_heap_entry ii = q->overflow_heap[i];
1364
1365        return q->tins[ii.t].backlogs[ii.b];
1366}
1367
1368static void cake_heapify(struct cake_sched_data *q, u16 i)
1369{
1370        static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
1371        u32 mb = cake_heap_get_backlog(q, i);
1372        u32 m = i;
1373
1374        while (m < a) {
1375                u32 l = m + m + 1;
1376                u32 r = l + 1;
1377
1378                if (l < a) {
1379                        u32 lb = cake_heap_get_backlog(q, l);
1380
1381                        if (lb > mb) {
1382                                m  = l;
1383                                mb = lb;
1384                        }
1385                }
1386
1387                if (r < a) {
1388                        u32 rb = cake_heap_get_backlog(q, r);
1389
1390                        if (rb > mb) {
1391                                m  = r;
1392                                mb = rb;
1393                        }
1394                }
1395
1396                if (m != i) {
1397                        cake_heap_swap(q, i, m);
1398                        i = m;
1399                } else {
1400                        break;
1401                }
1402        }
1403}
1404
1405static void cake_heapify_up(struct cake_sched_data *q, u16 i)
1406{
1407        while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
1408                u16 p = (i - 1) >> 1;
1409                u32 ib = cake_heap_get_backlog(q, i);
1410                u32 pb = cake_heap_get_backlog(q, p);
1411
1412                if (ib > pb) {
1413                        cake_heap_swap(q, i, p);
1414                        i = p;
1415                } else {
1416                        break;
1417                }
1418        }
1419}
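/* Together, cake_heapify() and cake_heapify_up() maintain a binary max-heap
 * over all tin/flow pairs, keyed on per-queue backlog, using the usual index
 * arithmetic (parent = (i - 1) / 2, children 2i + 1 and 2i + 2).  Keeping
 * the heap current costs O(log n) per update, but lets cake_drop() below
 * locate the longest queue in O(1) when the buffer overflows.
 */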
1420
1421static int cake_advance_shaper(struct cake_sched_data *q,
1422                               struct cake_tin_data *b,
1423                               struct sk_buff *skb,
1424                               ktime_t now, bool drop)
1425{
1426        u32 len = get_cobalt_cb(skb)->adjusted_len;
1427
1428        /* charge packet bandwidth to this tin
1429         * and to the global shaper.
1430         */
1431        if (q->rate_ns) {
1432                u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
1433                u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
1434                u64 failsafe_dur = global_dur + (global_dur >> 1);
1435
1436                if (ktime_before(b->time_next_packet, now))
1437                        b->time_next_packet = ktime_add_ns(b->time_next_packet,
1438                                                           tin_dur);
1439
1440                else if (ktime_before(b->time_next_packet,
1441                                      ktime_add_ns(now, tin_dur)))
1442                        b->time_next_packet = ktime_add_ns(now, tin_dur);
1443
1444                q->time_next_packet = ktime_add_ns(q->time_next_packet,
1445                                                   global_dur);
1446                if (!drop)
1447                        q->failsafe_next_packet =
1448                                ktime_add_ns(q->failsafe_next_packet,
1449                                             failsafe_dur);
1450        }
1451        return len;
1452}
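
/* Worked example (illustrative numbers, not part of the original source):
 * tin_rate_ns/rate_ns hold fixed-point "nanoseconds per byte" scaled by
 * 2^rate_shft, as computed by cake_set_rate() below.  With the shaper set
 * to 12.5 Mbyte/s that comes out as rate_ns == 80 << 27 with
 * rate_shft == 27, so a 1514-byte packet charges
 *
 *      global_dur   = (1514 * rate_ns) >> 27    = 121120 ns  (~121 us)
 *      failsafe_dur = global_dur + global_dur/2 = 181680 ns  (~182 us)
 *
 * i.e. the virtual clock advances by exactly the packet's serialisation
 * time, while the failsafe clock runs 50% slower so the qdisc can never
 * wedge shut even if the main schedule drifts far into the future.
 */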
1453
1454static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
1455{
1456        struct cake_sched_data *q = qdisc_priv(sch);
1457        ktime_t now = ktime_get();
1458        u32 idx = 0, tin = 0, len;
1459        struct cake_heap_entry qq;
1460        struct cake_tin_data *b;
1461        struct cake_flow *flow;
1462        struct sk_buff *skb;
1463
1464        if (!q->overflow_timeout) {
1465                int i;
1466                /* Build fresh max-heap */
1467                for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--)
1468                        cake_heapify(q, i);
1469        }
1470        q->overflow_timeout = 65535;
1471
1472        /* select longest queue for pruning */
1473        qq  = q->overflow_heap[0];
1474        tin = qq.t;
1475        idx = qq.b;
1476
1477        b = &q->tins[tin];
1478        flow = &b->flows[idx];
1479        skb = dequeue_head(flow);
1480        if (unlikely(!skb)) {
1481                /* heap has gone wrong, rebuild it next time */
1482                q->overflow_timeout = 0;
1483                return idx + (tin << 16);
1484        }
1485
1486        if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
1487                b->unresponsive_flow_count++;
1488
1489        len = qdisc_pkt_len(skb);
1490        q->buffer_used      -= skb->truesize;
1491        b->backlogs[idx]    -= len;
1492        b->tin_backlog      -= len;
1493        sch->qstats.backlog -= len;
1494        qdisc_tree_reduce_backlog(sch, 1, len);
1495
1496        flow->dropped++;
1497        b->tin_dropped++;
1498        sch->qstats.drops++;
1499
1500        if (q->rate_flags & CAKE_FLAG_INGRESS)
1501                cake_advance_shaper(q, b, skb, now, true);
1502
1503        __qdisc_drop(skb, to_free);
1504        sch->q.qlen--;
1505
1506        cake_heapify(q, 0);
1507
1508        return idx + (tin << 16);
1509}
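
/* Note (added for clarity): q->overflow_timeout amortises heap maintenance.
 * While it is non-zero, enqueue and dequeue keep the heap consistent
 * incrementally via cake_heapify_up()/cake_heapify(); cake_dequeue()
 * decrements it once per delivered packet, and once it has decayed to zero
 * the heap is simply rebuilt from scratch, as above, the next time the
 * buffer overflows.  The return value packs the victim's location as
 * (tin << 16) | flow_index.
 */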
1510
1511static void cake_wash_diffserv(struct sk_buff *skb)
1512{
1513        switch (skb->protocol) {
1514        case htons(ETH_P_IP):
1515                ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1516                break;
1517        case htons(ETH_P_IPV6):
1518                ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1519                break;
1520        default:
1521                break;
1522        }
1523}
1524
1525static u8 cake_handle_diffserv(struct sk_buff *skb, u16 wash)
1526{
1527        u8 dscp;
1528
1529        switch (skb->protocol) {
1530        case htons(ETH_P_IP):
1531                dscp = ipv4_get_dsfield(ip_hdr(skb)) >> 2;
1532                if (wash && dscp)
1533                        ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1534                return dscp;
1535
1536        case htons(ETH_P_IPV6):
1537                dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2;
1538                if (wash && dscp)
1539                        ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1540                return dscp;
1541
1542        case htons(ETH_P_ARP):
1543                return 0x38;  /* CS7 - Net Control */
1544
1545        default:
1546                /* If there is no Diffserv field, treat as best-effort */
1547                return 0;
1548        }
1549}
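
/* Example (illustrative): the DS field is the upper six bits of the IPv4
 * TOS / IPv6 Traffic Class octet, so an EF-marked packet carries 0xb8 there
 * and 0xb8 >> 2 == 0x2e (46, EF) is returned above.  When washing, only
 * those six DSCP bits are cleared; the two ECN bits (INET_ECN_MASK) are
 * preserved so end-to-end congestion signalling keeps working.
 */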
1550
1551static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
1552                                             struct sk_buff *skb)
1553{
1554        struct cake_sched_data *q = qdisc_priv(sch);
1555        u32 tin;
1556
1557        if (TC_H_MAJ(skb->priority) == sch->handle &&
1558            TC_H_MIN(skb->priority) > 0 &&
1559            TC_H_MIN(skb->priority) <= q->tin_cnt) {
1560                tin = q->tin_order[TC_H_MIN(skb->priority) - 1];
1561
1562                if (q->rate_flags & CAKE_FLAG_WASH)
1563                        cake_wash_diffserv(skb);
1564        } else if (q->tin_mode != CAKE_DIFFSERV_BESTEFFORT) {
1565                /* extract the Diffserv Precedence field, if it exists */
1566                /* and clear DSCP bits if washing */
1567                tin = q->tin_index[cake_handle_diffserv(skb,
1568                                q->rate_flags & CAKE_FLAG_WASH)];
1569                if (unlikely(tin >= q->tin_cnt))
1570                        tin = 0;
1571        } else {
1572                tin = 0;
1573                if (q->rate_flags & CAKE_FLAG_WASH)
1574                        cake_wash_diffserv(skb);
1575        }
1576
1577        return &q->tins[tin];
1578}
1579
1580static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
1581                         struct sk_buff *skb, int flow_mode, int *qerr)
1582{
1583        struct cake_sched_data *q = qdisc_priv(sch);
1584        struct tcf_proto *filter;
1585        struct tcf_result res;
1586        u16 flow = 0, host = 0;
1587        int result;
1588
1589        filter = rcu_dereference_bh(q->filter_list);
1590        if (!filter)
1591                goto hash;
1592
1593        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
1594        result = tcf_classify(skb, filter, &res, false);
1595
1596        if (result >= 0) {
1597#ifdef CONFIG_NET_CLS_ACT
1598                switch (result) {
1599                case TC_ACT_STOLEN:
1600                case TC_ACT_QUEUED:
1601                case TC_ACT_TRAP:
1602                        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
1603                        /* fall through */
1604                case TC_ACT_SHOT:
1605                        return 0;
1606                }
1607#endif
1608                if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
1609                        flow = TC_H_MIN(res.classid);
1610                if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16))
1611                        host = TC_H_MAJ(res.classid) >> 16;
1612        }
1613hash:
1614        *t = cake_select_tin(sch, skb);
1615        return cake_hash(*t, skb, flow_mode, flow, host) + 1;
1616}
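
/* Note (added for clarity): the return value is a 1-based flow index within
 * the selected tin; 0 means an attached filter stole or dropped the packet
 * and *qerr already carries the verdict.  A filter can also pin the result:
 * the minor part of res.classid selects an explicit flow queue and the
 * major part (shifted down 16 bits) an explicit host bucket, both of which
 * are handed to cake_hash() so that tc filters can override the
 * set-associative hashing.
 */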
1617
1618static void cake_reconfigure(struct Qdisc *sch);
1619
1620static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1621                        struct sk_buff **to_free)
1622{
1623        struct cake_sched_data *q = qdisc_priv(sch);
1624        int len = qdisc_pkt_len(skb);
1625        int uninitialized_var(ret);
1626        struct sk_buff *ack = NULL;
1627        ktime_t now = ktime_get();
1628        struct cake_tin_data *b;
1629        struct cake_flow *flow;
1630        u32 idx;
1631
1632        /* choose flow to insert into */
1633        idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
1634        if (idx == 0) {
1635                if (ret & __NET_XMIT_BYPASS)
1636                        qdisc_qstats_drop(sch);
1637                __qdisc_drop(skb, to_free);
1638                return ret;
1639        }
1640        idx--;
1641        flow = &b->flows[idx];
1642
1643        /* ensure shaper state isn't stale */
1644        if (!b->tin_backlog) {
1645                if (ktime_before(b->time_next_packet, now))
1646                        b->time_next_packet = now;
1647
1648                if (!sch->q.qlen) {
1649                        if (ktime_before(q->time_next_packet, now)) {
1650                                q->failsafe_next_packet = now;
1651                                q->time_next_packet = now;
1652                        } else if (ktime_after(q->time_next_packet, now) &&
1653                                   ktime_after(q->failsafe_next_packet, now)) {
1654                                u64 next =
1655                                        min(ktime_to_ns(q->time_next_packet),
1656                                            ktime_to_ns(
1657                                                   q->failsafe_next_packet));
1658                                sch->qstats.overlimits++;
1659                                qdisc_watchdog_schedule_ns(&q->watchdog, next);
1660                        }
1661                }
1662        }
1663
1664        if (unlikely(len > b->max_skblen))
1665                b->max_skblen = len;
1666
1667        if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
1668                struct sk_buff *segs, *nskb;
1669                netdev_features_t features = netif_skb_features(skb);
1670                unsigned int slen = 0;
1671
1672                segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
1673                if (IS_ERR_OR_NULL(segs))
1674                        return qdisc_drop(skb, sch, to_free);
1675
1676                while (segs) {
1677                        nskb = segs->next;
1678                        segs->next = NULL;
1679                        qdisc_skb_cb(segs)->pkt_len = segs->len;
1680                        cobalt_set_enqueue_time(segs, now);
1681                        get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
1682                                                                          segs);
1683                        flow_queue_add(flow, segs);
1684
1685                        sch->q.qlen++;
1686                        slen += segs->len;
1687                        q->buffer_used += segs->truesize;
1688                        b->packets++;
1689                        segs = nskb;
1690                }
1691
1692                /* stats */
1693                b->bytes            += slen;
1694                b->backlogs[idx]    += slen;
1695                b->tin_backlog      += slen;
1696                sch->qstats.backlog += slen;
1697                q->avg_window_bytes += slen;
1698
1699                qdisc_tree_reduce_backlog(sch, 1, len);
1700                consume_skb(skb);
1701        } else {
1702                /* not splitting */
1703                cobalt_set_enqueue_time(skb, now);
1704                get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
1705                flow_queue_add(flow, skb);
1706
1707                if (q->ack_filter)
1708                        ack = cake_ack_filter(q, flow);
1709
1710                if (ack) {
1711                        b->ack_drops++;
1712                        sch->qstats.drops++;
1713                        b->bytes += qdisc_pkt_len(ack);
1714                        len -= qdisc_pkt_len(ack);
1715                        q->buffer_used += skb->truesize - ack->truesize;
1716                        if (q->rate_flags & CAKE_FLAG_INGRESS)
1717                                cake_advance_shaper(q, b, ack, now, true);
1718
1719                        qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack));
1720                        consume_skb(ack);
1721                } else {
1722                        sch->q.qlen++;
1723                        q->buffer_used      += skb->truesize;
1724                }
1725
1726                /* stats */
1727                b->packets++;
1728                b->bytes            += len;
1729                b->backlogs[idx]    += len;
1730                b->tin_backlog      += len;
1731                sch->qstats.backlog += len;
1732                q->avg_window_bytes += len;
1733        }
1734
1735        if (q->overflow_timeout)
1736                cake_heapify_up(q, b->overflow_idx[idx]);
1737
1738        /* incoming bandwidth capacity estimate */
1739        if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
1740                u64 packet_interval =
1741                        ktime_to_ns(ktime_sub(now, q->last_packet_time));
1742
1743                if (packet_interval > NSEC_PER_SEC)
1744                        packet_interval = NSEC_PER_SEC;
1745
1746                /* filter out short-term bursts, eg. wifi aggregation */
1747                q->avg_packet_interval =
1748                        cake_ewma(q->avg_packet_interval,
1749                                  packet_interval,
1750                                  (packet_interval > q->avg_packet_interval ?
1751                                          2 : 8));
1752
1753                q->last_packet_time = now;
1754
1755                if (packet_interval > q->avg_packet_interval) {
1756                        u64 window_interval =
1757                                ktime_to_ns(ktime_sub(now,
1758                                                      q->avg_window_begin));
1759                        u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
1760
1761                        do_div(b, window_interval);
1762                        q->avg_peak_bandwidth =
1763                                cake_ewma(q->avg_peak_bandwidth, b,
1764                                          b > q->avg_peak_bandwidth ? 2 : 8);
1765                        q->avg_window_bytes = 0;
1766                        q->avg_window_begin = now;
1767
1768                        if (ktime_after(now,
1769                                        ktime_add_ms(q->last_reconfig_time,
1770                                                     250))) {
1771                                q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
1772                                cake_reconfigure(sch);
1773                        }
1774                }
1775        } else {
1776                q->avg_window_bytes = 0;
1777                q->last_packet_time = now;
1778        }
1779
1780        /* flowchain */
1781        if (!flow->set || flow->set == CAKE_SET_DECAYING) {
1782                struct cake_host *srchost = &b->hosts[flow->srchost];
1783                struct cake_host *dsthost = &b->hosts[flow->dsthost];
1784                u16 host_load = 1;
1785
1786                if (!flow->set) {
1787                        list_add_tail(&flow->flowchain, &b->new_flows);
1788                } else {
1789                        b->decaying_flow_count--;
1790                        list_move_tail(&flow->flowchain, &b->new_flows);
1791                }
1792                flow->set = CAKE_SET_SPARSE;
1793                b->sparse_flow_count++;
1794
1795                if (cake_dsrc(q->flow_mode))
1796                        host_load = max(host_load, srchost->srchost_refcnt);
1797
1798                if (cake_ddst(q->flow_mode))
1799                        host_load = max(host_load, dsthost->dsthost_refcnt);
1800
1801                flow->deficit = (b->flow_quantum *
1802                                 quantum_div[host_load]) >> 16;
1803        } else if (flow->set == CAKE_SET_SPARSE_WAIT) {
1804                /* this flow was empty, accounted as a sparse flow, but actually
1805                 * in the bulk rotation.
1806                 */
1807                flow->set = CAKE_SET_BULK;
1808                b->sparse_flow_count--;
1809                b->bulk_flow_count++;
1810        }
1811
1812        if (q->buffer_used > q->buffer_max_used)
1813                q->buffer_max_used = q->buffer_used;
1814
1815        if (q->buffer_used > q->buffer_limit) {
1816                u32 dropped = 0;
1817
1818                while (q->buffer_used > q->buffer_limit) {
1819                        dropped++;
1820                        cake_drop(sch, to_free);
1821                }
1822                b->drop_overlimit += dropped;
1823        }
1824        return NET_XMIT_SUCCESS;
1825}
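
/* Worked example (illustrative; assumes cake_ewma(avg, sample, shift),
 * defined earlier in this file, blends each new sample in with weight
 * 1/2^shift): the autorate-ingress estimator above weights samples by 1/4
 * (shift 2) when they exceed the running average and by 1/256 (shift 8)
 * otherwise, so both the packet-interval and peak-bandwidth estimates react
 * quickly to increases and decay slowly.  The shaper is then set to 15/16
 * of the peak estimate (q->avg_peak_bandwidth * 15 >> 4), provided at least
 * 250 ms have passed since q->last_reconfig_time, and cake_reconfigure()
 * re-derives all per-tin parameters from the new rate.
 */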
1826
1827static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
1828{
1829        struct cake_sched_data *q = qdisc_priv(sch);
1830        struct cake_tin_data *b = &q->tins[q->cur_tin];
1831        struct cake_flow *flow = &b->flows[q->cur_flow];
1832        struct sk_buff *skb = NULL;
1833        u32 len;
1834
1835        if (flow->head) {
1836                skb = dequeue_head(flow);
1837                len = qdisc_pkt_len(skb);
1838                b->backlogs[q->cur_flow] -= len;
1839                b->tin_backlog           -= len;
1840                sch->qstats.backlog      -= len;
1841                q->buffer_used           -= skb->truesize;
1842                sch->q.qlen--;
1843
1844                if (q->overflow_timeout)
1845                        cake_heapify(q, b->overflow_idx[q->cur_flow]);
1846        }
1847        return skb;
1848}
1849
1850/* Discard leftover packets from a tin no longer in use. */
1851static void cake_clear_tin(struct Qdisc *sch, u16 tin)
1852{
1853        struct cake_sched_data *q = qdisc_priv(sch);
1854        struct sk_buff *skb;
1855
1856        q->cur_tin = tin;
1857        for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
1858                while (!!(skb = cake_dequeue_one(sch)))
1859                        kfree_skb(skb);
1860}
1861
1862static struct sk_buff *cake_dequeue(struct Qdisc *sch)
1863{
1864        struct cake_sched_data *q = qdisc_priv(sch);
1865        struct cake_tin_data *b = &q->tins[q->cur_tin];
1866        struct cake_host *srchost, *dsthost;
1867        ktime_t now = ktime_get();
1868        struct cake_flow *flow;
1869        struct list_head *head;
1870        bool first_flow = true;
1871        struct sk_buff *skb;
1872        u16 host_load;
1873        u64 delay;
1874        u32 len;
1875
1876begin:
1877        if (!sch->q.qlen)
1878                return NULL;
1879
1880        /* global hard shaper */
1881        if (ktime_after(q->time_next_packet, now) &&
1882            ktime_after(q->failsafe_next_packet, now)) {
1883                u64 next = min(ktime_to_ns(q->time_next_packet),
1884                               ktime_to_ns(q->failsafe_next_packet));
1885
1886                sch->qstats.overlimits++;
1887                qdisc_watchdog_schedule_ns(&q->watchdog, next);
1888                return NULL;
1889        }
1890
1891        /* Choose a class to work on. */
1892        if (!q->rate_ns) {
1893                /* In unlimited mode, can't rely on shaper timings, just balance
1894                 * with DRR
1895                 */
1896                bool wrapped = false, empty = true;
1897
1898                while (b->tin_deficit < 0 ||
1899                       !(b->sparse_flow_count + b->bulk_flow_count)) {
1900                        if (b->tin_deficit <= 0)
1901                                b->tin_deficit += b->tin_quantum_band;
1902                        if (b->sparse_flow_count + b->bulk_flow_count)
1903                                empty = false;
1904
1905                        q->cur_tin++;
1906                        b++;
1907                        if (q->cur_tin >= q->tin_cnt) {
1908                                q->cur_tin = 0;
1909                                b = q->tins;
1910
1911                                if (wrapped) {
1912                                        /* It's possible for q->qlen to be
1913                                         * nonzero when we actually have no
1914                                         * packets anywhere.
1915                                         */
1916                                        if (empty)
1917                                                return NULL;
1918                                } else {
1919                                        wrapped = true;
1920                                }
1921                        }
1922                }
1923        } else {
1924                /* In shaped mode, choose:
1925                 * - Highest-priority tin with queue and meeting schedule, or
1926                 * - The earliest-scheduled tin with queue.
1927                 */
1928                ktime_t best_time = KTIME_MAX;
1929                int tin, best_tin = 0;
1930
1931                for (tin = 0; tin < q->tin_cnt; tin++) {
1932                        b = q->tins + tin;
1933                        if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
1934                                ktime_t time_to_pkt =
1935                                        ktime_sub(b->time_next_packet, now);
1936
1937                                if (ktime_to_ns(time_to_pkt) <= 0 ||
1938                                    ktime_compare(time_to_pkt,
1939                                                  best_time) <= 0) {
1940                                        best_time = time_to_pkt;
1941                                        best_tin = tin;
1942                                }
1943                        }
1944                }
1945
1946                q->cur_tin = best_tin;
1947                b = q->tins + best_tin;
1948
1949                /* No point in going further if no packets to deliver. */
1950                if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
1951                        return NULL;
1952        }
1953
1954retry:
1955        /* service this class */
1956        head = &b->decaying_flows;
1957        if (!first_flow || list_empty(head)) {
1958                head = &b->new_flows;
1959                if (list_empty(head)) {
1960                        head = &b->old_flows;
1961                        if (unlikely(list_empty(head))) {
1962                                head = &b->decaying_flows;
1963                                if (unlikely(list_empty(head)))
1964                                        goto begin;
1965                        }
1966                }
1967        }
1968        flow = list_first_entry(head, struct cake_flow, flowchain);
1969        q->cur_flow = flow - b->flows;
1970        first_flow = false;
1971
1972        /* triple isolation (modified DRR++) */
1973        srchost = &b->hosts[flow->srchost];
1974        dsthost = &b->hosts[flow->dsthost];
1975        host_load = 1;
1976
1977        if (cake_dsrc(q->flow_mode))
1978                host_load = max(host_load, srchost->srchost_refcnt);
1979
1980        if (cake_ddst(q->flow_mode))
1981                host_load = max(host_load, dsthost->dsthost_refcnt);
1982
1983        WARN_ON(host_load > CAKE_QUEUES);
1984
1985        /* flow isolation (DRR++) */
1986        if (flow->deficit <= 0) {
1987                /* The shifted prandom_u32() is a way to apply dithering to
1988                 * avoid accumulating roundoff errors
1989                 */
1990                flow->deficit += (b->flow_quantum * quantum_div[host_load] +
1991                                  (prandom_u32() >> 16)) >> 16;
1992                list_move_tail(&flow->flowchain, &b->old_flows);
1993
1994                /* Keep all flows with deficits out of the sparse and decaying
1995                 * rotations.  No non-empty flow can go into the decaying
1996                 * rotation, so they can't get deficits
1997                 */
1998                if (flow->set == CAKE_SET_SPARSE) {
1999                        if (flow->head) {
2000                                b->sparse_flow_count--;
2001                                b->bulk_flow_count++;
2002                                flow->set = CAKE_SET_BULK;
2003                        } else {
2004                                /* we've moved it to the bulk rotation for
2005                                 * correct deficit accounting but we still want
2006                                 * to count it as a sparse flow, not a bulk one.
2007                                 */
2008                                flow->set = CAKE_SET_SPARSE_WAIT;
2009                        }
2010                }
2011                goto retry;
2012        }
2013
2014        /* Retrieve a packet via the AQM */
2015        while (1) {
2016                skb = cake_dequeue_one(sch);
2017                if (!skb) {
2018                        /* this queue was actually empty */
2019                        if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
2020                                b->unresponsive_flow_count--;
2021
2022                        if (flow->cvars.p_drop || flow->cvars.count ||
2023                            ktime_before(now, flow->cvars.drop_next)) {
2024                                /* keep in the flowchain until the state has
2025                                 * decayed to rest
2026                                 */
2027                                list_move_tail(&flow->flowchain,
2028                                               &b->decaying_flows);
2029                                if (flow->set == CAKE_SET_BULK) {
2030                                        b->bulk_flow_count--;
2031                                        b->decaying_flow_count++;
2032                                } else if (flow->set == CAKE_SET_SPARSE ||
2033                                           flow->set == CAKE_SET_SPARSE_WAIT) {
2034                                        b->sparse_flow_count--;
2035                                        b->decaying_flow_count++;
2036                                }
2037                                flow->set = CAKE_SET_DECAYING;
2038                        } else {
2039                                /* remove empty queue from the flowchain */
2040                                list_del_init(&flow->flowchain);
2041                                if (flow->set == CAKE_SET_SPARSE ||
2042                                    flow->set == CAKE_SET_SPARSE_WAIT)
2043                                        b->sparse_flow_count--;
2044                                else if (flow->set == CAKE_SET_BULK)
2045                                        b->bulk_flow_count--;
2046                                else
2047                                        b->decaying_flow_count--;
2048
2049                                flow->set = CAKE_SET_NONE;
2050                                srchost->srchost_refcnt--;
2051                                dsthost->dsthost_refcnt--;
2052                        }
2053                        goto begin;
2054                }
2055
2056                /* Last packet in queue may be marked, shouldn't be dropped */
2057                if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
2058                                        (b->bulk_flow_count *
2059                                         !!(q->rate_flags &
2060                                            CAKE_FLAG_INGRESS))) ||
2061                    !flow->head)
2062                        break;
2063
2064                /* drop this packet, get another one */
2065                if (q->rate_flags & CAKE_FLAG_INGRESS) {
2066                        len = cake_advance_shaper(q, b, skb,
2067                                                  now, true);
2068                        flow->deficit -= len;
2069                        b->tin_deficit -= len;
2070                }
2071                flow->dropped++;
2072                b->tin_dropped++;
2073                qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
2074                qdisc_qstats_drop(sch);
2075                kfree_skb(skb);
2076                if (q->rate_flags & CAKE_FLAG_INGRESS)
2077                        goto retry;
2078        }
2079
2080        b->tin_ecn_mark += !!flow->cvars.ecn_marked;
2081        qdisc_bstats_update(sch, skb);
2082
2083        /* collect delay stats */
2084        delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
2085        b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
2086        b->peak_delay = cake_ewma(b->peak_delay, delay,
2087                                  delay > b->peak_delay ? 2 : 8);
2088        b->base_delay = cake_ewma(b->base_delay, delay,
2089                                  delay < b->base_delay ? 2 : 8);
2090
2091        len = cake_advance_shaper(q, b, skb, now, false);
2092        flow->deficit -= len;
2093        b->tin_deficit -= len;
2094
2095        if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
2096                u64 next = min(ktime_to_ns(q->time_next_packet),
2097                               ktime_to_ns(q->failsafe_next_packet));
2098
2099                qdisc_watchdog_schedule_ns(&q->watchdog, next);
2100        } else if (!sch->q.qlen) {
2101                int i;
2102
2103                for (i = 0; i < q->tin_cnt; i++) {
2104                        if (q->tins[i].decaying_flow_count) {
2105                                ktime_t next =
2106                                        ktime_add_ns(now,
2107                                                     q->tins[i].cparams.target);
2108
2109                                qdisc_watchdog_schedule_ns(&q->watchdog,
2110                                                           ktime_to_ns(next));
2111                                break;
2112                        }
2113                }
2114        }
2115
2116        if (q->overflow_timeout)
2117                q->overflow_timeout--;
2118
2119        return skb;
2120}
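
/* Note (added for clarity): the dequeue path above implements the modified
 * DRR++ flow lifecycle.  A newly active flow enters the sparse (new_flows)
 * rotation with one quantum of deficit and is served ahead of bulk traffic;
 * once it exhausts that deficit it moves to the bulk (old_flows) rotation;
 * and when its queue drains while Cobalt still holds AQM state it parks on
 * decaying_flows until that state relaxes, so a flow cannot reset its
 * Codel/BLUE history simply by going briefly idle.  CAKE_SET_SPARSE_WAIT
 * covers the corner case of an empty flow that is still counted as sparse
 * but already sits in the bulk rotation for deficit accounting.
 */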
2121
2122static void cake_reset(struct Qdisc *sch)
2123{
2124        u32 c;
2125
2126        for (c = 0; c < CAKE_MAX_TINS; c++)
2127                cake_clear_tin(sch, c);
2128}
2129
2130static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
2131        [TCA_CAKE_BASE_RATE64]   = { .type = NLA_U64 },
2132        [TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
2133        [TCA_CAKE_ATM]           = { .type = NLA_U32 },
2134        [TCA_CAKE_FLOW_MODE]     = { .type = NLA_U32 },
2135        [TCA_CAKE_OVERHEAD]      = { .type = NLA_S32 },
2136        [TCA_CAKE_RTT]           = { .type = NLA_U32 },
2137        [TCA_CAKE_TARGET]        = { .type = NLA_U32 },
2138        [TCA_CAKE_AUTORATE]      = { .type = NLA_U32 },
2139        [TCA_CAKE_MEMORY]        = { .type = NLA_U32 },
2140        [TCA_CAKE_NAT]           = { .type = NLA_U32 },
2141        [TCA_CAKE_RAW]           = { .type = NLA_U32 },
2142        [TCA_CAKE_WASH]          = { .type = NLA_U32 },
2143        [TCA_CAKE_MPU]           = { .type = NLA_U32 },
2144        [TCA_CAKE_INGRESS]       = { .type = NLA_U32 },
2145        [TCA_CAKE_ACK_FILTER]    = { .type = NLA_U32 },
            [TCA_CAKE_SPLIT_GSO]     = { .type = NLA_U32 },
2146};
2147
2148static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
2149                          u64 target_ns, u64 rtt_est_ns)
2150{
2151        /* convert byte-rate into time-per-byte
2152         * so it will always unwedge in reasonable time.
2153         */
2154        static const u64 MIN_RATE = 64;
2155        u32 byte_target = mtu;
2156        u64 byte_target_ns;
2157        u8  rate_shft = 0;
2158        u64 rate_ns = 0;
2159
2160        b->flow_quantum = 1514;
2161        if (rate) {
2162                b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
2163                rate_shft = 34;
2164                rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
2165                rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
2166                while (!!(rate_ns >> 34)) {
2167                        rate_ns >>= 1;
2168                        rate_shft--;
2169                }
2170        } /* else unlimited, ie. zero delay */
2171
2172        b->tin_rate_bps  = rate;
2173        b->tin_rate_ns   = rate_ns;
2174        b->tin_rate_shft = rate_shft;
2175
2176        byte_target_ns = (byte_target * rate_ns) >> rate_shft;
2177
2178        b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
2179        b->cparams.interval = max(rtt_est_ns +
2180                                     b->cparams.target - target_ns,
2181                                     b->cparams.target * 2);
2182        b->cparams.mtu_time = byte_target_ns;
2183        b->cparams.p_inc = 1 << 24; /* 1/256 */
2184        b->cparams.p_dec = 1 << 20; /* 1/4096 */
2185}
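
/* Worked example (illustrative): for rate = 12,500,000 bytes/s (100 Mbit/s)
 * the normalisation loop above settles on rate_ns = 80 << 27 and
 * rate_shft = 27, i.e. fixed-point "80 ns per byte" kept below 2^34 so that
 * later len * rate_ns products stay comfortably inside 64 bits.  With
 * mtu = 1514 this gives
 *
 *      byte_target_ns    = (1514 * rate_ns) >> 27   ~= 121 us
 *      b->cparams.target = max(1.5 * 121 us, 5 ms)   =   5 ms (the default)
 *
 * whereas at very low rates, where serialising one MTU takes longer than
 * the configured target, the Codel target automatically stretches to about
 * 1.5 MTU-times and the interval to at least twice that.
 */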
2186
2187static int cake_config_besteffort(struct Qdisc *sch)
2188{
2189        struct cake_sched_data *q = qdisc_priv(sch);
2190        struct cake_tin_data *b = &q->tins[0];
2191        u32 mtu = psched_mtu(qdisc_dev(sch));
2192        u64 rate = q->rate_bps;
2193
2194        q->tin_cnt = 1;
2195
2196        q->tin_index = besteffort;
2197        q->tin_order = normal_order;
2198
2199        cake_set_rate(b, rate, mtu,
2200                      us_to_ns(q->target), us_to_ns(q->interval));
2201        b->tin_quantum_band = 65535;
2202        b->tin_quantum_prio = 65535;
2203
2204        return 0;
2205}
2206
2207static int cake_config_precedence(struct Qdisc *sch)
2208{
2209        /* convert high-level (user visible) parameters into internal format */
2210        struct cake_sched_data *q = qdisc_priv(sch);
2211        u32 mtu = psched_mtu(qdisc_dev(sch));
2212        u64 rate = q->rate_bps;
2213        u32 quantum1 = 256;
2214        u32 quantum2 = 256;
2215        u32 i;
2216
2217        q->tin_cnt = 8;
2218        q->tin_index = precedence;
2219        q->tin_order = normal_order;
2220
2221        for (i = 0; i < q->tin_cnt; i++) {
2222                struct cake_tin_data *b = &q->tins[i];
2223
2224                cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2225                              us_to_ns(q->interval));
2226
2227                b->tin_quantum_prio = max_t(u16, 1U, quantum1);
2228                b->tin_quantum_band = max_t(u16, 1U, quantum2);
2229
2230                /* calculate next class's parameters */
2231                rate  *= 7;
2232                rate >>= 3;
2233
2234                quantum1  *= 3;
2235                quantum1 >>= 1;
2236
2237                quantum2  *= 7;
2238                quantum2 >>= 3;
2239        }
2240
2241        return 0;
2242}
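
/* Worked example (illustrative): the loop above (and the identical one in
 * cake_config_diffserv8() below) derives each tin's parameters from the
 * previous one geometrically, so with base rate R:
 *
 *      tin:               0     1      2       3       ...
 *      rate threshold:    R     7R/8   49R/64  ~0.67R
 *      tin_quantum_prio:  256   384    576     864        (x 3/2 per tin)
 *      tin_quantum_band:  256   224    196     171        (x 7/8 per tin)
 *
 * Higher tins therefore win by a growing priority margin while they stay
 * under their (shrinking) bandwidth threshold, but fall back to a smaller
 * DRR share once they exceed it, so lower tins are never starved outright.
 */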
2243
2244/*      List of known Diffserv codepoints:
2245 *
2246 *      Least Effort (CS1)
2247 *      Best Effort (CS0)
2248 *      Max Reliability & LLT "Lo" (TOS1)
2249 *      Max Throughput (TOS2)
2250 *      Min Delay (TOS4)
2251 *      LLT "La" (TOS5)
2252 *      Assured Forwarding 1 (AF1x) - x3
2253 *      Assured Forwarding 2 (AF2x) - x3
2254 *      Assured Forwarding 3 (AF3x) - x3
2255 *      Assured Forwarding 4 (AF4x) - x3
2256 *      Precedence Class 2 (CS2)
2257 *      Precedence Class 3 (CS3)
2258 *      Precedence Class 4 (CS4)
2259 *      Precedence Class 5 (CS5)
2260 *      Precedence Class 6 (CS6)
2261 *      Precedence Class 7 (CS7)
2262 *      Voice Admit (VA)
2263 *      Expedited Forwarding (EF)
2264 *
2265 *      Total 25 codepoints.
2266 */
2267
2268/*      List of traffic classes in RFC 4594:
2269 *              (roughly descending order of contended priority)
2270 *              (roughly ascending order of uncontended throughput)
2271 *
2272 *      Network Control (CS6,CS7)      - routing traffic
2273 *      Telephony (EF,VA)         - aka. VoIP streams
2274 *      Signalling (CS5)               - VoIP setup
2275 *      Multimedia Conferencing (AF4x) - aka. video calls
2276 *      Realtime Interactive (CS4)     - eg. games
2277 *      Multimedia Streaming (AF3x)    - eg. YouTube, NetFlix, Twitch
2278 *      Broadcast Video (CS3)
2279 *      Low Latency Data (AF2x,TOS4)      - eg. database
2280 *      Ops, Admin, Management (CS2,TOS1) - eg. ssh
2281 *      Standard Service (CS0 & unrecognised codepoints)
2282 *      High Throughput Data (AF1x,TOS2)  - eg. web traffic
2283 *      Low Priority Data (CS1)           - eg. BitTorrent
2284 *
2285 *      Total 12 traffic classes.
2286 */
2287
2288static int cake_config_diffserv8(struct Qdisc *sch)
2289{
2290/*      Pruned list of traffic classes for typical applications:
2291 *
2292 *              Network Control          (CS6, CS7)
2293 *              Minimum Latency          (EF, VA, CS5, CS4)
2294 *              Interactive Shell        (CS2, TOS1)
2295 *              Low Latency Transactions (AF2x, TOS4)
2296 *              Video Streaming          (AF4x, AF3x, CS3)
2297 *              Bog Standard             (CS0 etc.)
2298 *              High Throughput          (AF1x, TOS2)
2299 *              Background Traffic       (CS1)
2300 *
2301 *              Total 8 traffic classes.
2302 */
2303
2304        struct cake_sched_data *q = qdisc_priv(sch);
2305        u32 mtu = psched_mtu(qdisc_dev(sch));
2306        u64 rate = q->rate_bps;
2307        u32 quantum1 = 256;
2308        u32 quantum2 = 256;
2309        u32 i;
2310
2311        q->tin_cnt = 8;
2312
2313        /* codepoint to class mapping */
2314        q->tin_index = diffserv8;
2315        q->tin_order = normal_order;
2316
2317        /* class characteristics */
2318        for (i = 0; i < q->tin_cnt; i++) {
2319                struct cake_tin_data *b = &q->tins[i];
2320
2321                cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2322                              us_to_ns(q->interval));
2323
2324                b->tin_quantum_prio = max_t(u16, 1U, quantum1);
2325                b->tin_quantum_band = max_t(u16, 1U, quantum2);
2326
2327                /* calculate next class's parameters */
2328                rate  *= 7;
2329                rate >>= 3;
2330
2331                quantum1  *= 3;
2332                quantum1 >>= 1;
2333
2334                quantum2  *= 7;
2335                quantum2 >>= 3;
2336        }
2337
2338        return 0;
2339}
2340
2341static int cake_config_diffserv4(struct Qdisc *sch)
2342{
2343/*  Further pruned list of traffic classes for four-class system:
2344 *
2345 *          Latency Sensitive  (CS7, CS6, EF, VA, CS5, CS4)
2346 *          Streaming Media    (AF4x, AF3x, CS3, AF2x, TOS4, CS2, TOS1)
2347 *          Best Effort        (CS0, AF1x, TOS2, and those not specified)
2348 *          Background Traffic (CS1)
2349 *
2350 *              Total 4 traffic classes.
2351 */
2352
2353        struct cake_sched_data *q = qdisc_priv(sch);
2354        u32 mtu = psched_mtu(qdisc_dev(sch));
2355        u64 rate = q->rate_bps;
2356        u32 quantum = 1024;
2357
2358        q->tin_cnt = 4;
2359
2360        /* codepoint to class mapping */
2361        q->tin_index = diffserv4;
2362        q->tin_order = bulk_order;
2363
2364        /* class characteristics */
2365        cake_set_rate(&q->tins[0], rate, mtu,
2366                      us_to_ns(q->target), us_to_ns(q->interval));
2367        cake_set_rate(&q->tins[1], rate >> 4, mtu,
2368                      us_to_ns(q->target), us_to_ns(q->interval));
2369        cake_set_rate(&q->tins[2], rate >> 1, mtu,
2370                      us_to_ns(q->target), us_to_ns(q->interval));
2371        cake_set_rate(&q->tins[3], rate >> 2, mtu,
2372                      us_to_ns(q->target), us_to_ns(q->interval));
2373
2374        /* priority weights */
2375        q->tins[0].tin_quantum_prio = quantum;
2376        q->tins[1].tin_quantum_prio = quantum >> 4;
2377        q->tins[2].tin_quantum_prio = quantum << 2;
2378        q->tins[3].tin_quantum_prio = quantum << 4;
2379
2380        /* bandwidth-sharing weights */
2381        q->tins[0].tin_quantum_band = quantum;
2382        q->tins[1].tin_quantum_band = quantum >> 4;
2383        q->tins[2].tin_quantum_band = quantum >> 1;
2384        q->tins[3].tin_quantum_band = quantum >> 2;
2385
2386        return 0;
2387}
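
/* Note (added for clarity; tin meanings follow the diffserv4 lookup table
 * defined earlier in this file): tins[0] is Best Effort at 100% of the base
 * rate, tins[1] is Bulk at 1/16 (6.25%), tins[2] is Video at 1/2 and
 * tins[3] is Voice at 1/4 of it.  These are sharing thresholds rather than
 * hard caps: a tin past its threshold is served by its bandwidth-based
 * tin_quantum_band weight instead of its priority-based tin_quantum_prio
 * weight.  bulk_order only remaps the user-visible tin numbering (stats and
 * skb->priority class indices) onto these internal indices.
 */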
2388
2389static int cake_config_diffserv3(struct Qdisc *sch)
2390{
2391/*  Simplified Diffserv structure with 3 tins.
2392 *              Low Priority            (CS1)
2393 *              Best Effort
2394 *              Latency Sensitive       (TOS4, VA, EF, CS6, CS7)
2395 */
2396        struct cake_sched_data *q = qdisc_priv(sch);
2397        u32 mtu = psched_mtu(qdisc_dev(sch));
2398        u64 rate = q->rate_bps;
2399        u32 quantum = 1024;
2400
2401        q->tin_cnt = 3;
2402
2403        /* codepoint to class mapping */
2404        q->tin_index = diffserv3;
2405        q->tin_order = bulk_order;
2406
2407        /* class characteristics */
2408        cake_set_rate(&q->tins[0], rate, mtu,
2409                      us_to_ns(q->target), us_to_ns(q->interval));
2410        cake_set_rate(&q->tins[1], rate >> 4, mtu,
2411                      us_to_ns(q->target), us_to_ns(q->interval));
2412        cake_set_rate(&q->tins[2], rate >> 2, mtu,
2413                      us_to_ns(q->target), us_to_ns(q->interval));
2414
2415        /* priority weights */
2416        q->tins[0].tin_quantum_prio = quantum;
2417        q->tins[1].tin_quantum_prio = quantum >> 4;
2418        q->tins[2].tin_quantum_prio = quantum << 4;
2419
2420        /* bandwidth-sharing weights */
2421        q->tins[0].tin_quantum_band = quantum;
2422        q->tins[1].tin_quantum_band = quantum >> 4;
2423        q->tins[2].tin_quantum_band = quantum >> 2;
2424
2425        return 0;
2426}
2427
2428static void cake_reconfigure(struct Qdisc *sch)
2429{
2430        struct cake_sched_data *q = qdisc_priv(sch);
2431        int c, ft;
2432
2433        switch (q->tin_mode) {
2434        case CAKE_DIFFSERV_BESTEFFORT:
2435                ft = cake_config_besteffort(sch);
2436                break;
2437
2438        case CAKE_DIFFSERV_PRECEDENCE:
2439                ft = cake_config_precedence(sch);
2440                break;
2441
2442        case CAKE_DIFFSERV_DIFFSERV8:
2443                ft = cake_config_diffserv8(sch);
2444                break;
2445
2446        case CAKE_DIFFSERV_DIFFSERV4:
2447                ft = cake_config_diffserv4(sch);
2448                break;
2449
2450        case CAKE_DIFFSERV_DIFFSERV3:
2451        default:
2452                ft = cake_config_diffserv3(sch);
2453                break;
2454        }
2455
2456        for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) {
2457                cake_clear_tin(sch, c);
2458                q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time;
2459        }
2460
2461        q->rate_ns   = q->tins[ft].tin_rate_ns;
2462        q->rate_shft = q->tins[ft].tin_rate_shft;
2463
2464        if (q->buffer_config_limit) {
2465                q->buffer_limit = q->buffer_config_limit;
2466        } else if (q->rate_bps) {
2467                u64 t = q->rate_bps * q->interval;
2468
2469                do_div(t, USEC_PER_SEC / 4);
2470                q->buffer_limit = max_t(u32, t, 4U << 20);
2471        } else {
2472                q->buffer_limit = ~0;
2473        }
2474
2475        sch->flags &= ~TCQ_F_CAN_BYPASS;
2476
2477        q->buffer_limit = min(q->buffer_limit,
2478                              max(sch->limit * psched_mtu(qdisc_dev(sch)),
2479                                  q->buffer_config_limit));
2480}
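
/* Worked example (illustrative): when no explicit memory limit is
 * configured, the buffer budget above is
 *
 *      t = rate_bps * interval_us / (USEC_PER_SEC / 4)
 *
 * i.e. four times the bandwidth-delay product, floored at 4 MiB and finally
 * clamped to sch->limit packets' worth of MTU.  At 12.5 Mbyte/s (100
 * Mbit/s) with the default 100 ms interval, t = 12500000 * 100000 / 250000
 * = 5000000 bytes, which sits above the 4 MiB floor and below the
 * 10240 * MTU clamp, so roughly 5 MB of skb truesize may accumulate before
 * cake_drop() starts pruning the longest queue.
 */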
2481
2482static int cake_change(struct Qdisc *sch, struct nlattr *opt,
2483                       struct netlink_ext_ack *extack)
2484{
2485        struct cake_sched_data *q = qdisc_priv(sch);
2486        struct nlattr *tb[TCA_CAKE_MAX + 1];
2487        int err;
2488
2489        if (!opt)
2490                return -EINVAL;
2491
2492        err = nla_parse_nested(tb, TCA_CAKE_MAX, opt, cake_policy, extack);
2493        if (err < 0)
2494                return err;
2495
2496        if (tb[TCA_CAKE_NAT]) {
2497#if IS_ENABLED(CONFIG_NF_CONNTRACK)
2498                q->flow_mode &= ~CAKE_FLOW_NAT_FLAG;
2499                q->flow_mode |= CAKE_FLOW_NAT_FLAG *
2500                        !!nla_get_u32(tb[TCA_CAKE_NAT]);
2501#else
2502                NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
2503                                    "No conntrack support in kernel");
2504                return -EOPNOTSUPP;
2505#endif
2506        }
2507
2508        if (tb[TCA_CAKE_BASE_RATE64])
2509                q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]);
2510
2511        if (tb[TCA_CAKE_DIFFSERV_MODE])
2512                q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]);
2513
2514        if (tb[TCA_CAKE_WASH]) {
2515                if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
2516                        q->rate_flags |= CAKE_FLAG_WASH;
2517                else
2518                        q->rate_flags &= ~CAKE_FLAG_WASH;
2519        }
2520
2521        if (tb[TCA_CAKE_FLOW_MODE])
2522                q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) |
2523                                (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
2524                                        CAKE_FLOW_MASK));
2525
2526        if (tb[TCA_CAKE_ATM])
2527                q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]);
2528
2529        if (tb[TCA_CAKE_OVERHEAD]) {
2530                q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]);
2531                q->rate_flags |= CAKE_FLAG_OVERHEAD;
2532
2533                q->max_netlen = 0;
2534                q->max_adjlen = 0;
2535                q->min_netlen = ~0;
2536                q->min_adjlen = ~0;
2537        }
2538
2539        if (tb[TCA_CAKE_RAW]) {
2540                q->rate_flags &= ~CAKE_FLAG_OVERHEAD;
2541
2542                q->max_netlen = 0;
2543                q->max_adjlen = 0;
2544                q->min_netlen = ~0;
2545                q->min_adjlen = ~0;
2546        }
2547
2548        if (tb[TCA_CAKE_MPU])
2549                q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]);
2550
2551        if (tb[TCA_CAKE_RTT]) {
2552                q->interval = nla_get_u32(tb[TCA_CAKE_RTT]);
2553
2554                if (!q->interval)
2555                        q->interval = 1;
2556        }
2557
2558        if (tb[TCA_CAKE_TARGET]) {
2559                q->target = nla_get_u32(tb[TCA_CAKE_TARGET]);
2560
2561                if (!q->target)
2562                        q->target = 1;
2563        }
2564
2565        if (tb[TCA_CAKE_AUTORATE]) {
2566                if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
2567                        q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
2568                else
2569                        q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
2570        }
2571
2572        if (tb[TCA_CAKE_INGRESS]) {
2573                if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
2574                        q->rate_flags |= CAKE_FLAG_INGRESS;
2575                else
2576                        q->rate_flags &= ~CAKE_FLAG_INGRESS;
2577        }
2578
2579        if (tb[TCA_CAKE_ACK_FILTER])
2580                q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]);
2581
2582        if (tb[TCA_CAKE_MEMORY])
2583                q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]);
2584
2585        if (tb[TCA_CAKE_SPLIT_GSO]) {
2586                if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
2587                        q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2588                else
2589                        q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
2590        }
2591
2592        if (q->tins) {
2593                sch_tree_lock(sch);
2594                cake_reconfigure(sch);
2595                sch_tree_unlock(sch);
2596        }
2597
2598        return 0;
2599}
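
/* Userspace sketch (assumes the iproute2 tc-cake front end; not part of
 * this file): each attribute handled above corresponds to a tc keyword,
 * e.g.
 *
 *      tc qdisc replace dev eth0 root cake bandwidth 100Mbit diffserv4 nat
 *
 * sends TCA_CAKE_BASE_RATE64, TCA_CAKE_DIFFSERV_MODE and TCA_CAKE_NAT.  On
 * a replace q->tins already exists, so the new parameters take effect via
 * the cake_reconfigure() call at the end of this function, made under the
 * qdisc tree lock.
 */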
2600
2601static void cake_destroy(struct Qdisc *sch)
2602{
2603        struct cake_sched_data *q = qdisc_priv(sch);
2604
2605        qdisc_watchdog_cancel(&q->watchdog);
2606        tcf_block_put(q->block);
2607        kvfree(q->tins);
2608}
2609
2610static int cake_init(struct Qdisc *sch, struct nlattr *opt,
2611                     struct netlink_ext_ack *extack)
2612{
2613        struct cake_sched_data *q = qdisc_priv(sch);
2614        int i, j, err;
2615
2616        sch->limit = 10240;
2617        q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
2618        q->flow_mode  = CAKE_FLOW_TRIPLE;
2619
2620        q->rate_bps = 0; /* unlimited by default */
2621
2622        q->interval = 100000; /* 100ms default */
2623        q->target   =   5000; /* 5ms: codel RFC argues
2624                               * for 5 to 10% of interval
2625                               */
2626        q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2627        q->cur_tin = 0;
2628        q->cur_flow  = 0;
2629
2630        qdisc_watchdog_init(&q->watchdog, sch);
2631
2632        if (opt) {
2633                err = cake_change(sch, opt, extack);
2634
2635                if (err)
2636                        return err;
2637        }
2638
2639        err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
2640        if (err)
2641                return err;
2642
2643        quantum_div[0] = ~0;
2644        for (i = 1; i <= CAKE_QUEUES; i++)
2645                quantum_div[i] = 65535 / i;
2646
2647        q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data),
2648                           GFP_KERNEL);
2649        if (!q->tins)
2650                goto nomem;
2651
2652        for (i = 0; i < CAKE_MAX_TINS; i++) {
2653                struct cake_tin_data *b = q->tins + i;
2654
2655                INIT_LIST_HEAD(&b->new_flows);
2656                INIT_LIST_HEAD(&b->old_flows);
2657                INIT_LIST_HEAD(&b->decaying_flows);
2658                b->sparse_flow_count = 0;
2659                b->bulk_flow_count = 0;
2660                b->decaying_flow_count = 0;
2661
2662                for (j = 0; j < CAKE_QUEUES; j++) {
2663                        struct cake_flow *flow = b->flows + j;
2664                        u32 k = j * CAKE_MAX_TINS + i;
2665
2666                        INIT_LIST_HEAD(&flow->flowchain);
2667                        cobalt_vars_init(&flow->cvars);
2668
2669                        q->overflow_heap[k].t = i;
2670                        q->overflow_heap[k].b = j;
2671                        b->overflow_idx[j] = k;
2672                }
2673        }
2674
2675        cake_reconfigure(sch);
2676        q->avg_peak_bandwidth = q->rate_bps;
2677        q->min_netlen = ~0;
2678        q->min_adjlen = ~0;
2679        return 0;
2680
2681nomem:
2682        cake_destroy(sch);
2683        return -ENOMEM;
2684}
2685
2686static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
2687{
2688        struct cake_sched_data *q = qdisc_priv(sch);
2689        struct nlattr *opts;
2690
2691        opts = nla_nest_start(skb, TCA_OPTIONS);
2692        if (!opts)
2693                goto nla_put_failure;
2694
2695        if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps,
2696                              TCA_CAKE_PAD))
2697                goto nla_put_failure;
2698
2699        if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE,
2700                        q->flow_mode & CAKE_FLOW_MASK))
2701                goto nla_put_failure;
2702
2703        if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval))
2704                goto nla_put_failure;
2705
2706        if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target))
2707                goto nla_put_failure;
2708
2709        if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit))
2710                goto nla_put_failure;
2711
2712        if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
2713                        !!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
2714                goto nla_put_failure;
2715
2716        if (nla_put_u32(skb, TCA_CAKE_INGRESS,
2717                        !!(q->rate_flags & CAKE_FLAG_INGRESS)))
2718                goto nla_put_failure;
2719
2720        if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter))
2721                goto nla_put_failure;
2722
2723        if (nla_put_u32(skb, TCA_CAKE_NAT,
2724                        !!(q->flow_mode & CAKE_FLOW_NAT_FLAG)))
2725                goto nla_put_failure;
2726
2727        if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode))
2728                goto nla_put_failure;
2729
2730        if (nla_put_u32(skb, TCA_CAKE_WASH,
2731                        !!(q->rate_flags & CAKE_FLAG_WASH)))
2732                goto nla_put_failure;
2733
2734        if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead))
2735                goto nla_put_failure;
2736
2737        if (!(q->rate_flags & CAKE_FLAG_OVERHEAD))
2738                if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
2739                        goto nla_put_failure;
2740
2741        if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode))
2742                goto nla_put_failure;
2743
2744        if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu))
2745                goto nla_put_failure;
2746
2747        if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
2748                        !!(q->rate_flags & CAKE_FLAG_SPLIT_GSO)))
2749                goto nla_put_failure;
2750
2751        return nla_nest_end(skb, opts);
2752
2753nla_put_failure:
2754        return -1;
2755}
2756
2757static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
2758{
2759        struct nlattr *stats = nla_nest_start(d->skb, TCA_STATS_APP);
2760        struct cake_sched_data *q = qdisc_priv(sch);
2761        struct nlattr *tstats, *ts;
2762        int i;
2763
2764        if (!stats)
2765                return -1;
2766
2767#define PUT_STAT_U32(attr, data) do {                                  \
2768                if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2769                        goto nla_put_failure;                          \
2770        } while (0)
2771#define PUT_STAT_U64(attr, data) do {                                  \
2772                if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \
2773                                        data, TCA_CAKE_STATS_PAD)) \
2774                        goto nla_put_failure;                          \
2775        } while (0)
2776
2777        PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth);
2778        PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit);
2779        PUT_STAT_U32(MEMORY_USED, q->buffer_max_used);
2780        PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16));
2781        PUT_STAT_U32(MAX_NETLEN, q->max_netlen);
2782        PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
2783        PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
2784        PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);
2785
2786#undef PUT_STAT_U32
2787#undef PUT_STAT_U64
2788
2789        tstats = nla_nest_start(d->skb, TCA_CAKE_STATS_TIN_STATS);
2790        if (!tstats)
2791                goto nla_put_failure;
2792
2793#define PUT_TSTAT_U32(attr, data) do {                                  \
2794                if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
2795                        goto nla_put_failure;                           \
2796        } while (0)
2797#define PUT_TSTAT_U64(attr, data) do {                                  \
2798                if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \
2799                                        data, TCA_CAKE_TIN_STATS_PAD))  \
2800                        goto nla_put_failure;                           \
2801        } while (0)
2802
2803        for (i = 0; i < q->tin_cnt; i++) {
2804                struct cake_tin_data *b = &q->tins[q->tin_order[i]];
2805
2806                ts = nla_nest_start(d->skb, i + 1);
2807                if (!ts)
2808                        goto nla_put_failure;
2809
2810                PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps);
2811                PUT_TSTAT_U64(SENT_BYTES64, b->bytes);
2812                PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog);
2813
2814                PUT_TSTAT_U32(TARGET_US,
2815                              ktime_to_us(ns_to_ktime(b->cparams.target)));
2816                PUT_TSTAT_U32(INTERVAL_US,
2817                              ktime_to_us(ns_to_ktime(b->cparams.interval)));
2818
2819                PUT_TSTAT_U32(SENT_PACKETS, b->packets);
2820                PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped);
2821                PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark);
2822                PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops);
2823
2824                PUT_TSTAT_U32(PEAK_DELAY_US,
2825                              ktime_to_us(ns_to_ktime(b->peak_delay)));
2826                PUT_TSTAT_U32(AVG_DELAY_US,
2827                              ktime_to_us(ns_to_ktime(b->avge_delay)));
2828                PUT_TSTAT_U32(BASE_DELAY_US,
2829                              ktime_to_us(ns_to_ktime(b->base_delay)));
2830
2831                PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits);
2832                PUT_TSTAT_U32(WAY_MISSES, b->way_misses);
2833                PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions);
2834
2835                PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count +
2836                                            b->decaying_flow_count);
2837                PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count);
2838                PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count);
2839                PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen);
2840
2841                PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum);
2842                nla_nest_end(d->skb, ts);
2843        }
2844
2845#undef PUT_TSTAT_U32
2846#undef PUT_TSTAT_U64
2847
2848        nla_nest_end(d->skb, tstats);
2849        return nla_nest_end(d->skb, stats);
2850
2851nla_put_failure:
2852        nla_nest_cancel(d->skb, stats);
2853        return -1;
2854}
2855
2856static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
2857{
2858        return NULL;
2859}
2860
2861static unsigned long cake_find(struct Qdisc *sch, u32 classid)
2862{
2863        return 0;
2864}
2865
2866static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
2867                               u32 classid)
2868{
2869        return 0;
2870}
2871
2872static void cake_unbind(struct Qdisc *q, unsigned long cl)
2873{
2874}
2875
2876static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
2877                                        struct netlink_ext_ack *extack)
2878{
2879        struct cake_sched_data *q = qdisc_priv(sch);
2880
2881        if (cl)
2882                return NULL;
2883        return q->block;
2884}
2885
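/* Nothing is stored per class; just reflect the minor number back so class
 * dumps show a handle derived from the flow slot index.
 */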
2886static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
2887                           struct sk_buff *skb, struct tcmsg *tcm)
2888{
2889        tcm->tcm_handle |= TC_H_MIN(cl);
2890        return 0;
2891}
2892
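/* Class ids map 1:1 onto (tin, flow) slots: cl ranges over
 * 1 .. tin_cnt * CAKE_QUEUES, and with idx = cl - 1 the tin is
 * q->tin_order[idx / CAKE_QUEUES] and the flow is idx % CAKE_QUEUES.
 * For example (illustrative numbers, CAKE_QUEUES == 1024): cl == 1025
 * gives idx == 1024, i.e. the second tin in tin_order, flow 0.  This is
 * what a "tc -s class show dev <dev>" typically ends up reading.
 */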
2893static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
2894                                 struct gnet_dump *d)
2895{
2896        struct cake_sched_data *q = qdisc_priv(sch);
2897        const struct cake_flow *flow = NULL;
2898        struct gnet_stats_queue qs = { 0 };
2899        struct nlattr *stats;
2900        u32 idx = cl - 1;
2901
2902        if (idx < CAKE_QUEUES * q->tin_cnt) {
2903                const struct cake_tin_data *b =
2904                        &q->tins[q->tin_order[idx / CAKE_QUEUES]];
2905                const struct sk_buff *skb;
2906
2907                flow = &b->flows[idx % CAKE_QUEUES];
2908
2909                if (flow->head) {
2910                        sch_tree_lock(sch);
2911                        skb = flow->head;
2912                        while (skb) {
2913                                qs.qlen++;
2914                                skb = skb->next;
2915                        }
2916                        sch_tree_unlock(sch);
2917                }
2918                qs.backlog = b->backlogs[idx % CAKE_QUEUES];
2919                qs.drops = flow->dropped;
2920        }
2921        if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
2922                return -1;
2923        if (flow) {
2924                ktime_t now = ktime_get();
2925
2926                stats = nla_nest_start(d->skb, TCA_STATS_APP);
2927                if (!stats)
2928                        return -1;
2929
2930#define PUT_STAT_U32(attr, data) do {                                  \
2931                if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2932                        goto nla_put_failure;                          \
2933        } while (0)
2934#define PUT_STAT_S32(attr, data) do {                                  \
2935                if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2936                        goto nla_put_failure;                          \
2937        } while (0)
2938
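                /* Per-flow DRR deficit and Cobalt state.  The two
                 * timestamps (BLUE_TIMER_US, DROP_NEXT_US) are reported
                 * relative to "now" as ktime_sub(now, <timestamp>), so a
                 * negative DROP_NEXT_US just means the next scheduled drop
                 * still lies in the future.
                 */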
2939                PUT_STAT_S32(DEFICIT, flow->deficit);
2940                PUT_STAT_U32(DROPPING, flow->cvars.dropping);
2941                PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
2942                PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
2943                if (flow->cvars.p_drop) {
2944                        PUT_STAT_S32(BLUE_TIMER_US,
2945                                     ktime_to_us(
2946                                             ktime_sub(now,
2947                                                     flow->cvars.blue_timer)));
2948                }
2949                if (flow->cvars.dropping) {
2950                        PUT_STAT_S32(DROP_NEXT_US,
2951                                     ktime_to_us(
2952                                             ktime_sub(now,
2953                                                       flow->cvars.drop_next)));
2954                }
2955
2956                if (nla_nest_end(d->skb, stats) < 0)
2957                        return -1;
2958        }
2959
2960        return 0;
2961
2962nla_put_failure:
2963        nla_nest_cancel(d->skb, stats);
2964        return -1;
2965}
2966
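/* Walk every (tin, flow) slot in tin_order.  Slots whose flowchain list is
 * empty (idle flows) are counted but not reported, and the standard
 * qdisc_walker skip/count/stop protocol is honoured so that partial dumps
 * can resume where they left off.
 */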
2967static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2968{
2969        struct cake_sched_data *q = qdisc_priv(sch);
2970        unsigned int i, j;
2971
2972        if (arg->stop)
2973                return;
2974
2975        for (i = 0; i < q->tin_cnt; i++) {
2976                struct cake_tin_data *b = &q->tins[q->tin_order[i]];
2977
2978                for (j = 0; j < CAKE_QUEUES; j++) {
2979                        if (list_empty(&b->flows[j].flowchain) ||
2980                            arg->count < arg->skip) {
2981                                arg->count++;
2982                                continue;
2983                        }
2984                        if (arg->fn(sch, i * CAKE_QUEUES + j + 1, arg) < 0) {
2985                                arg->stop = 1;
2986                                break;
2987                        }
2988                        arg->count++;
2989                }
2990        }
2991}
2992
2993static const struct Qdisc_class_ops cake_class_ops = {
2994        .leaf           =       cake_leaf,
2995        .find           =       cake_find,
2996        .tcf_block      =       cake_tcf_block,
2997        .bind_tcf       =       cake_bind,
2998        .unbind_tcf     =       cake_unbind,
2999        .dump           =       cake_dump_class,
3000        .dump_stats     =       cake_dump_class_stats,
3001        .walk           =       cake_walk,
3002};
3003
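/* Note that .peek is the generic qdisc_peek_dequeued() helper, which
 * actually dequeues a packet and caches it for the next dequeue call,
 * presumably because CAKE cannot cheaply predict its next head-of-line
 * packet.  A typical way to instantiate the qdisc from user space
 * (illustrative command line; device name and rate are placeholders):
 *
 *   tc qdisc replace dev eth0 root cake bandwidth 20Mbit
 */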
3004static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
3005        .cl_ops         =       &cake_class_ops,
3006        .id             =       "cake",
3007        .priv_size      =       sizeof(struct cake_sched_data),
3008        .enqueue        =       cake_enqueue,
3009        .dequeue        =       cake_dequeue,
3010        .peek           =       qdisc_peek_dequeued,
3011        .init           =       cake_init,
3012        .reset          =       cake_reset,
3013        .destroy        =       cake_destroy,
3014        .change         =       cake_change,
3015        .dump           =       cake_dump,
3016        .dump_stats     =       cake_dump_stats,
3017        .owner          =       THIS_MODULE,
3018};
3019
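/* Module plumbing: register_qdisc() publishes the "cake" id to the qdisc
 * core.  If user space asks for a cake qdisc before this module is loaded,
 * the core will normally try to auto-load it as "sch_cake" (the qdisc core
 * request_module()s "sch_<id>" when CONFIG_MODULES is enabled), so an
 * explicit modprobe is rarely needed.
 */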
3020static int __init cake_module_init(void)
3021{
3022        return register_qdisc(&cake_qdisc_ops);
3023}
3024
3025static void __exit cake_module_exit(void)
3026{
3027        unregister_qdisc(&cake_qdisc_ops);
3028}
3029
3030module_init(cake_module_init)
3031module_exit(cake_module_exit)
3032MODULE_AUTHOR("Jonathan Morton");
3033MODULE_LICENSE("Dual BSD/GPL");
3034MODULE_DESCRIPTION("The CAKE shaper.");
3035