// SPDX-License-Identifier: GPL-2.0-only
/* net/sched/sch_hhf.c          Heavy-Hitter Filter (HHF)
 *
 * Copyright (C) 2013 Terry Lam <vtlam@google.com>
 * Copyright (C) 2013 Nandita Dukkipati <nanditad@google.com>
 */

#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <linux/siphash.h>
#include <net/pkt_sched.h>
#include <net/sock.h>

/*      Heavy-Hitter Filter (HHF)
 *
 * Principles:
 * Flows are classified into two buckets: non-heavy-hitter and heavy-hitter
 * buckets. Initially, a new flow starts as non-heavy-hitter. Once classified
 * as heavy-hitter, it is immediately switched to the heavy-hitter bucket.
 * The buckets are dequeued by a Weighted Deficit Round Robin (WDRR) scheduler,
 * in which the heavy-hitter bucket is served with less weight.
 * In other words, non-heavy-hitters (e.g., short bursts of critical traffic)
 * are isolated from heavy-hitters (e.g., persistent bulk traffic) and also get
 * a higher share of the bandwidth.
 *
 * To capture heavy-hitters, we use the "multi-stage filter" algorithm in the
 * following paper:
 * [EV02] C. Estan and G. Varghese, "New Directions in Traffic Measurement and
 * Accounting", in ACM SIGCOMM, 2002.
 *
 * Conceptually, a multi-stage filter comprises k independent hash functions
 * and k counter arrays. Packets are indexed into k counter arrays by k hash
 * functions, respectively. The counters are then increased by the packet sizes.
 * Therefore,
 *    - For a heavy-hitter flow: *all* of its k array counters must be large.
 *    - For a non-heavy-hitter flow: some of its k array counters can be large
 *      due to hash collision with other small flows; however, with high
 *      probability, not *all* k counters are large.
 *
 * By the design of the multi-stage filter algorithm, the false negative rate
 * (heavy-hitters getting away uncaptured) is zero. However, the algorithm is
 * susceptible to false positives (non-heavy-hitters mistakenly classified as
 * heavy-hitters).
 * Therefore, we also implement the following optimizations to reduce false
 * positives by avoiding unnecessary increment of the counter values:
 *    - Optimization O1: once a heavy-hitter is identified, its bytes are not
 *        accounted in the array counters. This technique is called "shielding"
 *        in Section 3.3.1 of [EV02].
 *    - Optimization O2: conservative update of counters
 *                       (Section 3.3.2 of [EV02]),
 *        New counter value = max {old counter value,
 *                                 smallest counter value + packet bytes}
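 *
 *        Worked example of O2 (illustrative numbers, not from the paper):
 *        suppose k = 4 and a 60-byte packet indexes counters {100, 300, 50,
 *        200}. The candidate sums are {160, 360, 110, 260}, so the smallest
 *        counter value plus packet bytes is 110. Conservative update raises
 *        only the counters below 110, giving {110, 300, 110, 200}; the larger
 *        counters are left untouched, which slows false-positive growth.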
 *
 * Finally, we refresh the counters periodically since otherwise the counter
 * values will keep accumulating.
 *
 * Once a flow is classified as heavy-hitter, we also save its per-flow state
 * in an exact-matching flow table so that its subsequent packets can be
 * dispatched to the heavy-hitter bucket accordingly.
 *
 *
 * At a high level, this qdisc works as follows:
 * Given a packet p:
 *   - If the flow-id of p (e.g., TCP 5-tuple) is already in the exact-matching
 *     heavy-hitter flow table, denoted table T, then send p to the heavy-hitter
 *     bucket.
 *   - Otherwise, forward p to the multi-stage filter, denoted filter F
 *        + If F decides that p belongs to a non-heavy-hitter flow, then send p
 *          to the non-heavy-hitter bucket.
 *        + Otherwise, if F decides that p belongs to a new heavy-hitter flow,
 *          then set up a new flow entry for the flow-id of p in the table T and
 *          send p to the heavy-hitter bucket.
 *
 * In this implementation:
 *   - T is a fixed-size hash-table with 1024 entries. Hash collision is
 *     resolved by linked-list chaining.
 *   - F has four counter arrays, each array containing 1024 32-bit counters.
 *     That means 4 * 1024 * 32 bits = 16KB of memory.
 *   - Since each array in F contains 1024 counters, 10 bits are sufficient to
 *     index into each array.
 *     Hence, instead of having four hash functions, we chop the 32-bit
 *     skb-hash into three 10-bit chunks, and the remaining chunk is computed
 *     as the XOR sum of those three chunks and the leftover high-order bits
 *     (a worked example follows this comment block).
 *   - We need to clear the counter arrays periodically; however, directly
 *     memsetting 16KB of memory can lead to cache eviction and unwanted delay.
 *     So by representing each counter by a valid bit, we only need to reset
 *     4K valid bits (i.e., 512 bytes) instead of 16KB of memory.
 *   - The Deficit Round Robin engine is taken from the fq_codel implementation
 *     (net/sched/sch_fq_codel.c). Note that wdrr_bucket corresponds to
 *     fq_codel_flow in the fq_codel implementation.
 *
 */

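/* Worked example of the skb-hash chopping described above (hash value chosen
 * purely for illustration; HHF_BIT_MASK and HHF_BIT_MASK_LEN are defined
 * below):
 *
 *   hash          = 0xDEADBEEF
 *   filter_pos[0] =  hash        & 0x3FF = 0x2EF
 *   filter_pos[1] = (hash >> 10) & 0x3FF = 0x36F
 *   filter_pos[2] = (hash >> 20) & 0x3FF = 0x1EA
 *   leftover bits =  hash >> 30          = 0x3
 *   filter_pos[3] = 0x2EF ^ 0x36F ^ 0x1EA ^ 0x3 = 0x069
 *
 * Each filter_pos[i] then indexes one of the four counter arrays of F.
 */
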
/* Non-configurable parameters */
#define HH_FLOWS_CNT     1024  /* number of entries in exact-matching table T */
#define HHF_ARRAYS_CNT   4     /* number of arrays in multi-stage filter F */
#define HHF_ARRAYS_LEN   1024  /* number of counters in each array of F */
#define HHF_BIT_MASK_LEN 10    /* masking 10 bits */
#define HHF_BIT_MASK     0x3FF /* bitmask of 10 bits */

#define WDRR_BUCKET_CNT  2     /* two buckets for Weighted DRR */
enum wdrr_bucket_idx {
        WDRR_BUCKET_FOR_HH      = 0, /* bucket id for heavy-hitters */
        WDRR_BUCKET_FOR_NON_HH  = 1  /* bucket id for non-heavy-hitters */
};

#define hhf_time_before(a, b)   \
        (typecheck(u32, a) && typecheck(u32, b) && ((s32)((a) - (b)) < 0))
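
/* hhf_time_before() is wrap-safe because it compares the signed difference.
 * A sketch with illustrative timestamps near the u32 wrap-around:
 *
 *   a = 0xFFFFFFF0 (just before wrap), b = 0x10 (just after wrap)
 *   (s32)(a - b) = (s32)0xFFFFFFE0 = -32 < 0
 *
 * so hhf_time_before(a, b) is true, i.e. "a is earlier than b" still holds
 * across the wrap.
 */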

/* Heavy-hitter per-flow state */
struct hh_flow_state {
        u32              hash_id;       /* hash of flow-id (e.g. TCP 5-tuple) */
        u32              hit_timestamp; /* last time heavy-hitter was seen */
        struct list_head flowchain;     /* chaining under hash collision */
};

/* Weighted Deficit Round Robin (WDRR) scheduler */
struct wdrr_bucket {
        struct sk_buff    *head;
        struct sk_buff    *tail;
        struct list_head  bucketchain;
        int               deficit;
};

struct hhf_sched_data {
        struct wdrr_bucket buckets[WDRR_BUCKET_CNT];
        siphash_key_t      perturbation;   /* hash perturbation */
        u32                quantum;        /* psched_mtu(qdisc_dev(sch)); */
        u32                drop_overlimit; /* number of times max qdisc packet
                                            * limit was hit
                                            */
        struct list_head   *hh_flows;       /* table T (currently active HHs) */
        u32                hh_flows_limit;            /* max active HH allocs */
        u32                hh_flows_overlimit; /* num of disallowed HH allocs */
        u32                hh_flows_total_cnt;          /* total admitted HHs */
        u32                hh_flows_current_cnt;        /* total current HHs  */
        u32                *hhf_arrays[HHF_ARRAYS_CNT]; /* HH filter F */
        u32                hhf_arrays_reset_timestamp;  /* last time hhf_arrays
                                                         * was reset
                                                         */
        unsigned long      *hhf_valid_bits[HHF_ARRAYS_CNT]; /* shadow valid bits
                                                             * of hhf_arrays
                                                             */
        /* Similar to the "new_flows" vs. "old_flows" concept in fq_codel DRR */
        struct list_head   new_buckets; /* list of new buckets */
        struct list_head   old_buckets; /* list of old buckets */

        /* Configurable HHF parameters */
        u32                hhf_reset_timeout; /* interval to reset counter
                                               * arrays in filter F
                                               * (default 40ms)
                                               */
        u32                hhf_admit_bytes;   /* counter thresh to classify as
                                               * HH (default 128KB).
                                               * With these default values,
                                               * 128KB / 40ms = 25 Mbps
                                               * i.e., we expect to capture HHs
                                               * sending > 25 Mbps.
                                               */
        u32                hhf_evict_timeout; /* aging threshold to evict idle
                                               * HHs out of table T. This should
                                               * be large enough to avoid
                                               * reordering during HH eviction.
                                               * (default 1s)
                                               */
        u32                hhf_non_hh_weight; /* WDRR weight for non-HHs
                                               * (default 2,
                                               *  i.e., non-HH : HH = 2 : 1)
                                               */
};

static u32 hhf_time_stamp(void)
{
        return jiffies;
}

/* Looks up a heavy-hitter flow in a chaining list of table T. */
static struct hh_flow_state *seek_list(const u32 hash,
                                       struct list_head *head,
                                       struct hhf_sched_data *q)
{
        struct hh_flow_state *flow, *next;
        u32 now = hhf_time_stamp();

        if (list_empty(head))
                return NULL;

        list_for_each_entry_safe(flow, next, head, flowchain) {
                u32 prev = flow->hit_timestamp + q->hhf_evict_timeout;

                if (hhf_time_before(prev, now)) {
                        /* Delete expired heavy-hitters, but preserve one entry
                         * to avoid kzalloc() when next time this slot is hit.
                         */
                        if (list_is_last(&flow->flowchain, head))
                                return NULL;
                        list_del(&flow->flowchain);
                        kfree(flow);
                        q->hh_flows_current_cnt--;
                } else if (flow->hash_id == hash) {
                        return flow;
                }
        }
        return NULL;
}

/* Returns a flow state entry for a new heavy-hitter.  Either reuses an expired
 * entry or dynamically allocates a new one.
 */
static struct hh_flow_state *alloc_new_hh(struct list_head *head,
                                          struct hhf_sched_data *q)
{
        struct hh_flow_state *flow;
        u32 now = hhf_time_stamp();

        if (!list_empty(head)) {
                /* Find an expired heavy-hitter flow entry. */
                list_for_each_entry(flow, head, flowchain) {
                        u32 prev = flow->hit_timestamp + q->hhf_evict_timeout;

                        if (hhf_time_before(prev, now))
                                return flow;
                }
        }

        if (q->hh_flows_current_cnt >= q->hh_flows_limit) {
                q->hh_flows_overlimit++;
                return NULL;
        }
        /* Create a new entry. */
        flow = kzalloc(sizeof(struct hh_flow_state), GFP_ATOMIC);
        if (!flow)
                return NULL;

        q->hh_flows_current_cnt++;
        INIT_LIST_HEAD(&flow->flowchain);
        list_add_tail(&flow->flowchain, head);

        return flow;
}

/* Assigns packets to WDRR buckets.  Implements a multi-stage filter to
 * classify heavy-hitters.
 */
static enum wdrr_bucket_idx hhf_classify(struct sk_buff *skb, struct Qdisc *sch)
{
        struct hhf_sched_data *q = qdisc_priv(sch);
        u32 tmp_hash, hash;
        u32 xorsum, filter_pos[HHF_ARRAYS_CNT], flow_pos;
        struct hh_flow_state *flow;
        u32 pkt_len, min_hhf_val;
        int i;
        u32 prev;
        u32 now = hhf_time_stamp();

        /* Reset the HHF counter arrays if this is the right time. */
        prev = q->hhf_arrays_reset_timestamp + q->hhf_reset_timeout;
        if (hhf_time_before(prev, now)) {
                for (i = 0; i < HHF_ARRAYS_CNT; i++)
                        bitmap_zero(q->hhf_valid_bits[i], HHF_ARRAYS_LEN);
                q->hhf_arrays_reset_timestamp = now;
        }

        /* Get hashed flow-id of the skb. */
        hash = skb_get_hash_perturb(skb, &q->perturbation);

        /* Check if this packet belongs to an already established HH flow. */
        flow_pos = hash & HHF_BIT_MASK;
        flow = seek_list(hash, &q->hh_flows[flow_pos], q);
        if (flow) { /* found its HH flow */
                flow->hit_timestamp = now;
                return WDRR_BUCKET_FOR_HH;
        }

        /* Now pass the packet through the multi-stage filter. */
        tmp_hash = hash;
        xorsum = 0;
        for (i = 0; i < HHF_ARRAYS_CNT - 1; i++) {
                /* Split the skb_hash into three 10-bit chunks. */
                filter_pos[i] = tmp_hash & HHF_BIT_MASK;
                xorsum ^= filter_pos[i];
                tmp_hash >>= HHF_BIT_MASK_LEN;
        }
        /* The last chunk is computed as the XOR sum of the other chunks and
         * the remaining high-order bits of the hash.
         */
        filter_pos[HHF_ARRAYS_CNT - 1] = xorsum ^ tmp_hash;

        pkt_len = qdisc_pkt_len(skb);
        min_hhf_val = ~0U;
        for (i = 0; i < HHF_ARRAYS_CNT; i++) {
                u32 val;

                if (!test_bit(filter_pos[i], q->hhf_valid_bits[i])) {
                        q->hhf_arrays[i][filter_pos[i]] = 0;
                        __set_bit(filter_pos[i], q->hhf_valid_bits[i]);
                }

                val = q->hhf_arrays[i][filter_pos[i]] + pkt_len;
                if (min_hhf_val > val)
                        min_hhf_val = val;
        }

        /* Found a new HH iff all counter values > HH admit threshold. */
        if (min_hhf_val > q->hhf_admit_bytes) {
                /* Just captured a new heavy-hitter. */
                flow = alloc_new_hh(&q->hh_flows[flow_pos], q);
                if (!flow) /* memory alloc problem */
                        return WDRR_BUCKET_FOR_NON_HH;
                flow->hash_id = hash;
                flow->hit_timestamp = now;
                q->hh_flows_total_cnt++;

                /* By returning without updating counters in q->hhf_arrays,
                 * we implicitly implement "shielding" (see Optimization O1).
                 */
                return WDRR_BUCKET_FOR_HH;
        }

        /* Conservative update of HHF arrays (see Optimization O2). */
        for (i = 0; i < HHF_ARRAYS_CNT; i++) {
                if (q->hhf_arrays[i][filter_pos[i]] < min_hhf_val)
                        q->hhf_arrays[i][filter_pos[i]] = min_hhf_val;
        }
        return WDRR_BUCKET_FOR_NON_HH;
}

/* Removes one skb from head of bucket. */
static struct sk_buff *dequeue_head(struct wdrr_bucket *bucket)
{
        struct sk_buff *skb = bucket->head;

        bucket->head = skb->next;
        skb_mark_not_on_list(skb);
        return skb;
}

/* Tail-adds skb to bucket. */
static void bucket_add(struct wdrr_bucket *bucket, struct sk_buff *skb)
{
        if (bucket->head == NULL)
                bucket->head = skb;
        else
                bucket->tail->next = skb;
        bucket->tail = skb;
        skb->next = NULL;
}

static unsigned int hhf_drop(struct Qdisc *sch, struct sk_buff **to_free)
{
        struct hhf_sched_data *q = qdisc_priv(sch);
        struct wdrr_bucket *bucket;

        /* Always try to drop from heavy-hitters first. */
        bucket = &q->buckets[WDRR_BUCKET_FOR_HH];
        if (!bucket->head)
                bucket = &q->buckets[WDRR_BUCKET_FOR_NON_HH];

        if (bucket->head) {
                struct sk_buff *skb = dequeue_head(bucket);

                sch->q.qlen--;
                qdisc_qstats_backlog_dec(sch, skb);
                qdisc_drop(skb, sch, to_free);
        }

        /* Return id of the bucket from which the packet was dropped. */
        return bucket - q->buckets;
}

static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
                       struct sk_buff **to_free)
{
        struct hhf_sched_data *q = qdisc_priv(sch);
        enum wdrr_bucket_idx idx;
        struct wdrr_bucket *bucket;
        unsigned int prev_backlog;

        idx = hhf_classify(skb, sch);

        bucket = &q->buckets[idx];
        bucket_add(bucket, skb);
        qdisc_qstats_backlog_inc(sch, skb);

        if (list_empty(&bucket->bucketchain)) {
                unsigned int weight;

                /* The logic of new_buckets vs. old_buckets is the same as
                 * new_flows vs. old_flows in the implementation of fq_codel,
                 * i.e., short bursts of non-HHs should have strict priority.
                 */
                if (idx == WDRR_BUCKET_FOR_HH) {
                        /* Always move heavy-hitters to old bucket. */
                        weight = 1;
                        list_add_tail(&bucket->bucketchain, &q->old_buckets);
                } else {
                        weight = q->hhf_non_hh_weight;
                        list_add_tail(&bucket->bucketchain, &q->new_buckets);
                }
                bucket->deficit = weight * q->quantum;
        }
        if (++sch->q.qlen <= sch->limit)
                return NET_XMIT_SUCCESS;

        prev_backlog = sch->qstats.backlog;
        q->drop_overlimit++;
        /* Return Congestion Notification only if we dropped a packet from this
         * bucket.
         */
        if (hhf_drop(sch, to_free) == idx)
                return NET_XMIT_CN;

        /* As we dropped a packet, better let upper stack know this. */
        qdisc_tree_reduce_backlog(sch, 1, prev_backlog - sch->qstats.backlog);
        return NET_XMIT_SUCCESS;
}
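
/* Worked example of the WDRR weights (numbers chosen for illustration): with
 * quantum = psched_mtu (say 1514 bytes on Ethernet) and the default
 * hhf_non_hh_weight = 2, a newly active non-HH bucket starts with a deficit of
 * 2 * 1514 = 3028 bytes per round while the HH bucket gets 1514 bytes, so over
 * time non-HH traffic is served at roughly a 2:1 byte ratio relative to
 * heavy-hitters, as stated in the header comment.
 */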

static struct sk_buff *hhf_dequeue(struct Qdisc *sch)
{
        struct hhf_sched_data *q = qdisc_priv(sch);
        struct sk_buff *skb = NULL;
        struct wdrr_bucket *bucket;
        struct list_head *head;

begin:
        head = &q->new_buckets;
        if (list_empty(head)) {
                head = &q->old_buckets;
                if (list_empty(head))
                        return NULL;
        }
        bucket = list_first_entry(head, struct wdrr_bucket, bucketchain);

        if (bucket->deficit <= 0) {
                int weight = (bucket - q->buckets == WDRR_BUCKET_FOR_HH) ?
                              1 : q->hhf_non_hh_weight;

                bucket->deficit += weight * q->quantum;
                list_move_tail(&bucket->bucketchain, &q->old_buckets);
                goto begin;
        }

        if (bucket->head) {
                skb = dequeue_head(bucket);
                sch->q.qlen--;
                qdisc_qstats_backlog_dec(sch, skb);
        }

        if (!skb) {
                /* Force a pass through old_buckets to prevent starvation. */
                if ((head == &q->new_buckets) && !list_empty(&q->old_buckets))
                        list_move_tail(&bucket->bucketchain, &q->old_buckets);
                else
                        list_del_init(&bucket->bucketchain);
                goto begin;
        }
        qdisc_bstats_update(sch, skb);
        bucket->deficit -= qdisc_pkt_len(skb);

        return skb;
}

static void hhf_reset(struct Qdisc *sch)
{
        struct sk_buff *skb;

        while ((skb = hhf_dequeue(sch)) != NULL)
                rtnl_kfree_skbs(skb, skb);
}

static void hhf_destroy(struct Qdisc *sch)
{
        int i;
        struct hhf_sched_data *q = qdisc_priv(sch);

        for (i = 0; i < HHF_ARRAYS_CNT; i++) {
                kvfree(q->hhf_arrays[i]);
                kvfree(q->hhf_valid_bits[i]);
        }

        if (!q->hh_flows)
                return;

        for (i = 0; i < HH_FLOWS_CNT; i++) {
                struct hh_flow_state *flow, *next;
                struct list_head *head = &q->hh_flows[i];

                if (list_empty(head))
                        continue;
                list_for_each_entry_safe(flow, next, head, flowchain) {
                        list_del(&flow->flowchain);
                        kfree(flow);
                }
        }
        kvfree(q->hh_flows);
}

static const struct nla_policy hhf_policy[TCA_HHF_MAX + 1] = {
        [TCA_HHF_BACKLOG_LIMIT]  = { .type = NLA_U32 },
        [TCA_HHF_QUANTUM]        = { .type = NLA_U32 },
        [TCA_HHF_HH_FLOWS_LIMIT] = { .type = NLA_U32 },
        [TCA_HHF_RESET_TIMEOUT]  = { .type = NLA_U32 },
        [TCA_HHF_ADMIT_BYTES]    = { .type = NLA_U32 },
        [TCA_HHF_EVICT_TIMEOUT]  = { .type = NLA_U32 },
        [TCA_HHF_NON_HH_WEIGHT]  = { .type = NLA_U32 },
};
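
/* These attributes are normally filled in by the iproute2 tc front end. A
 * typical invocation, shown only as an illustration (parameter names as in
 * tc-hhf(8)), might look like:
 *
 *   tc qdisc add dev eth0 root hhf limit 1000 quantum 1514 \
 *      hh_limit 2048 reset_timeout 40ms admit_bytes 128kb \
 *      evict_timeout 1s non_hh_weight 2
 *
 * and "tc -s qdisc show dev eth0" reports the xstats exported below.
 */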

static int hhf_change(struct Qdisc *sch, struct nlattr *opt,
                      struct netlink_ext_ack *extack)
{
        struct hhf_sched_data *q = qdisc_priv(sch);
        struct nlattr *tb[TCA_HHF_MAX + 1];
        unsigned int qlen, prev_backlog;
        int err;
        u64 non_hh_quantum;
        u32 new_quantum = q->quantum;
        u32 new_hhf_non_hh_weight = q->hhf_non_hh_weight;

        if (!opt)
                return -EINVAL;

        err = nla_parse_nested_deprecated(tb, TCA_HHF_MAX, opt, hhf_policy,
                                          NULL);
        if (err < 0)
                return err;

        if (tb[TCA_HHF_QUANTUM])
                new_quantum = nla_get_u32(tb[TCA_HHF_QUANTUM]);

        if (tb[TCA_HHF_NON_HH_WEIGHT])
                new_hhf_non_hh_weight = nla_get_u32(tb[TCA_HHF_NON_HH_WEIGHT]);

        non_hh_quantum = (u64)new_quantum * new_hhf_non_hh_weight;
        if (non_hh_quantum == 0 || non_hh_quantum > INT_MAX)
                return -EINVAL;

        sch_tree_lock(sch);

        if (tb[TCA_HHF_BACKLOG_LIMIT])
                sch->limit = nla_get_u32(tb[TCA_HHF_BACKLOG_LIMIT]);

        q->quantum = new_quantum;
        q->hhf_non_hh_weight = new_hhf_non_hh_weight;

        if (tb[TCA_HHF_HH_FLOWS_LIMIT])
                q->hh_flows_limit = nla_get_u32(tb[TCA_HHF_HH_FLOWS_LIMIT]);

        if (tb[TCA_HHF_RESET_TIMEOUT]) {
                u32 us = nla_get_u32(tb[TCA_HHF_RESET_TIMEOUT]);

                q->hhf_reset_timeout = usecs_to_jiffies(us);
        }

        if (tb[TCA_HHF_ADMIT_BYTES])
                q->hhf_admit_bytes = nla_get_u32(tb[TCA_HHF_ADMIT_BYTES]);

        if (tb[TCA_HHF_EVICT_TIMEOUT]) {
                u32 us = nla_get_u32(tb[TCA_HHF_EVICT_TIMEOUT]);

                q->hhf_evict_timeout = usecs_to_jiffies(us);
        }

        qlen = sch->q.qlen;
        prev_backlog = sch->qstats.backlog;
        while (sch->q.qlen > sch->limit) {
                struct sk_buff *skb = hhf_dequeue(sch);

                rtnl_kfree_skbs(skb, skb);
        }
        qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen,
                                  prev_backlog - sch->qstats.backlog);

        sch_tree_unlock(sch);
        return 0;
}
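
/* Note that both timeouts arrive from userspace in microseconds and are stored
 * internally in jiffies. A worked example (HZ values chosen for illustration):
 * with HZ = 1000, a RESET_TIMEOUT of 40000 us becomes
 * usecs_to_jiffies(40000) = 40 jiffies, i.e. 40 ms; with HZ = 250 the same
 * request becomes 10 jiffies.
 */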

static int hhf_init(struct Qdisc *sch, struct nlattr *opt,
                    struct netlink_ext_ack *extack)
{
        struct hhf_sched_data *q = qdisc_priv(sch);
        int i;

        sch->limit = 1000;
        q->quantum = psched_mtu(qdisc_dev(sch));
        get_random_bytes(&q->perturbation, sizeof(q->perturbation));
        INIT_LIST_HEAD(&q->new_buckets);
        INIT_LIST_HEAD(&q->old_buckets);

        /* Configurable HHF parameters */
        q->hhf_reset_timeout = HZ / 25; /* 40  ms */
        q->hhf_admit_bytes = 131072;    /* 128 KB */
        q->hhf_evict_timeout = HZ;      /* 1  sec */
        q->hhf_non_hh_weight = 2;

        if (opt) {
                int err = hhf_change(sch, opt, extack);

                if (err)
                        return err;
        }

        if (!q->hh_flows) {
                /* Initialize heavy-hitter flow table. */
                q->hh_flows = kvcalloc(HH_FLOWS_CNT, sizeof(struct list_head),
                                       GFP_KERNEL);
                if (!q->hh_flows)
                        return -ENOMEM;
                for (i = 0; i < HH_FLOWS_CNT; i++)
                        INIT_LIST_HEAD(&q->hh_flows[i]);

                /* Cap max active HHs at twice len of hh_flows table. */
                q->hh_flows_limit = 2 * HH_FLOWS_CNT;
                q->hh_flows_overlimit = 0;
                q->hh_flows_total_cnt = 0;
                q->hh_flows_current_cnt = 0;

                /* Initialize heavy-hitter filter arrays. */
                for (i = 0; i < HHF_ARRAYS_CNT; i++) {
                        q->hhf_arrays[i] = kvcalloc(HHF_ARRAYS_LEN,
                                                    sizeof(u32),
                                                    GFP_KERNEL);
                        if (!q->hhf_arrays[i]) {
                                /* Note: hhf_destroy() will be called
                                 * by our caller.
                                 */
                                return -ENOMEM;
                        }
                }
                q->hhf_arrays_reset_timestamp = hhf_time_stamp();

                /* Initialize valid bits of heavy-hitter filter arrays. */
                for (i = 0; i < HHF_ARRAYS_CNT; i++) {
                        q->hhf_valid_bits[i] = kvzalloc(HHF_ARRAYS_LEN /
                                                        BITS_PER_BYTE,
                                                        GFP_KERNEL);
                        if (!q->hhf_valid_bits[i]) {
                                /* Note: hhf_destroy() will be called
                                 * by our caller.
                                 */
                                return -ENOMEM;
                        }
                }

                /* Initialize Weighted DRR buckets. */
                for (i = 0; i < WDRR_BUCKET_CNT; i++) {
                        struct wdrr_bucket *bucket = q->buckets + i;

                        INIT_LIST_HEAD(&bucket->bucketchain);
                }
        }

        return 0;
}
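
/* With the defaults above, a flow must push more than 131072 bytes through the
 * filter within one 40 ms reset interval to be admitted as a heavy-hitter,
 * i.e. roughly 131072 B / 0.04 s * 8 ~= 26 Mbit/s, which matches the
 * "> 25 Mbps" figure quoted in the header comment.
 */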

static int hhf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
        struct hhf_sched_data *q = qdisc_priv(sch);
        struct nlattr *opts;

        opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
        if (opts == NULL)
                goto nla_put_failure;

        if (nla_put_u32(skb, TCA_HHF_BACKLOG_LIMIT, sch->limit) ||
            nla_put_u32(skb, TCA_HHF_QUANTUM, q->quantum) ||
            nla_put_u32(skb, TCA_HHF_HH_FLOWS_LIMIT, q->hh_flows_limit) ||
            nla_put_u32(skb, TCA_HHF_RESET_TIMEOUT,
                        jiffies_to_usecs(q->hhf_reset_timeout)) ||
            nla_put_u32(skb, TCA_HHF_ADMIT_BYTES, q->hhf_admit_bytes) ||
            nla_put_u32(skb, TCA_HHF_EVICT_TIMEOUT,
                        jiffies_to_usecs(q->hhf_evict_timeout)) ||
            nla_put_u32(skb, TCA_HHF_NON_HH_WEIGHT, q->hhf_non_hh_weight))
                goto nla_put_failure;

        return nla_nest_end(skb, opts);

nla_put_failure:
        return -1;
}

static int hhf_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
        struct hhf_sched_data *q = qdisc_priv(sch);
        struct tc_hhf_xstats st = {
                .drop_overlimit = q->drop_overlimit,
                .hh_overlimit   = q->hh_flows_overlimit,
                .hh_tot_count   = q->hh_flows_total_cnt,
                .hh_cur_count   = q->hh_flows_current_cnt,
        };

        return gnet_stats_copy_app(d, &st, sizeof(st));
}

static struct Qdisc_ops hhf_qdisc_ops __read_mostly = {
        .id             =       "hhf",
        .priv_size      =       sizeof(struct hhf_sched_data),

        .enqueue        =       hhf_enqueue,
        .dequeue        =       hhf_dequeue,
        .peek           =       qdisc_peek_dequeued,
        .init           =       hhf_init,
        .reset          =       hhf_reset,
        .destroy        =       hhf_destroy,
        .change         =       hhf_change,
        .dump           =       hhf_dump,
        .dump_stats     =       hhf_dump_stats,
        .owner          =       THIS_MODULE,
};

static int __init hhf_module_init(void)
{
        return register_qdisc(&hhf_qdisc_ops);
}

static void __exit hhf_module_exit(void)
{
        unregister_qdisc(&hhf_qdisc_ops);
}

module_init(hhf_module_init)
module_exit(hhf_module_exit)
MODULE_AUTHOR("Terry Lam");
MODULE_AUTHOR("Nandita Dukkipati");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Heavy-Hitter Filter (HHF)");