/* net/sched/sch_hhf.c          Heavy-Hitter Filter (HHF)
 *
 * Copyright (C) 2013 Terry Lam <vtlam@google.com>
 * Copyright (C) 2013 Nandita Dukkipati <nanditad@google.com>
 */

#include <linux/jhash.h>
#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <net/flow_keys.h>
#include <net/pkt_sched.h>
#include <net/sock.h>

/*      Heavy-Hitter Filter (HHF)
 *
 * Principles :
 * Flows are classified into two buckets: non-heavy-hitter and heavy-hitter
 * buckets. Initially, a new flow starts as non-heavy-hitter. Once classified
 * as heavy-hitter, it is immediately switched to the heavy-hitter bucket.
 * The buckets are dequeued by a Weighted Deficit Round Robin (WDRR) scheduler,
 * in which the heavy-hitter bucket is served with less weight.
 * In other words, non-heavy-hitters (e.g., short bursts of critical traffic)
 * are isolated from heavy-hitters (e.g., persistent bulk traffic) and also
 * get a higher share of bandwidth.
 *
 * To capture heavy-hitters, we use the "multi-stage filter" algorithm in the
 * following paper:
 * [EV02] C. Estan and G. Varghese, "New Directions in Traffic Measurement and
 * Accounting", in ACM SIGCOMM, 2002.
 *
 * Conceptually, a multi-stage filter comprises k independent hash functions
 * and k counter arrays. Packets are indexed into k counter arrays by k hash
 * functions, respectively. The counters are then increased by the packet sizes.
 * Therefore,
 *    - For a heavy-hitter flow: *all* of its k array counters must be large.
 *    - For a non-heavy-hitter flow: some of its k array counters can be large
 *      due to hash collision with other small flows; however, with high
 *      probability, not *all* k counters are large.
 *
 * By the design of the multi-stage filter algorithm, the false negative rate
 * (heavy-hitters getting away uncaptured) is zero. However, the algorithm is
 * susceptible to false positives (non-heavy-hitters mistakenly classified as
 * heavy-hitters).
 * Therefore, we also implement the following optimizations to reduce false
 * positives by avoiding unnecessary increments of the counter values:
 *    - Optimization O1: once a heavy-hitter is identified, its bytes are not
 *        accounted in the array counters. This technique is called "shielding"
 *        in Section 3.3.1 of [EV02].
 *    - Optimization O2: conservative update of counters
 *                       (Section 3.3.2 of [EV02]),
 *        New counter value = max {old counter value,
 *                                 smallest counter value + packet bytes}
 *
 * Finally, we refresh the counters periodically since otherwise the counter
 * values will keep accumulating.
 *
 * Once a flow is classified as heavy-hitter, we also save its per-flow state
 * in an exact-matching flow table so that its subsequent packets can be
 * dispatched to the heavy-hitter bucket accordingly.
 *
 *
 * At a high level, this qdisc works as follows:
 * Given a packet p:
 *   - If the flow-id of p (e.g., TCP 5-tuple) is already in the exact-matching
 *     heavy-hitter flow table, denoted table T, then send p to the heavy-hitter
 *     bucket.
 *   - Otherwise, forward p to the multi-stage filter, denoted filter F
 *        + If F decides that p belongs to a non-heavy-hitter flow, then send p
 *          to the non-heavy-hitter bucket.
 *        + Otherwise, if F decides that p belongs to a new heavy-hitter flow,
 *          then set up a new flow entry for the flow-id of p in the table T and
 *          send p to the heavy-hitter bucket.
 *
 * In this implementation:
 *   - T is a fixed-size hash-table with 1024 entries. Hash collision is
 *     resolved by linked-list chaining.
 *   - F has four counter arrays, each array containing 1024 32-bit counters.
 *     That means 4 * 1024 * 32 bits = 16KB of memory.
 *   - Since each array in F contains 1024 counters, 10 bits are sufficient to
 *     index into each array.
 *     Hence, instead of using four hash functions, we chop the 32-bit
 *     skb-hash into three 10-bit chunks, and compute the fourth index as
 *     the XOR sum of those chunks and the two leftover high-order bits.
 *   - We need to clear the counter arrays periodically; however, directly
 *     memsetting 16KB of memory can lead to cache eviction and unwanted delay.
 *     So, by shadowing each counter with a valid bit, we only need to reset
 *     4K bits (i.e., 512 bytes) instead of 16KB of memory.
 *   - The Deficit Round Robin engine is taken from fq_codel implementation
 *     (net/sched/sch_fq_codel.c). Note that wdrr_bucket corresponds to
 *     fq_codel_flow in fq_codel implementation.
 *
 */
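
/* Worked example of the hash chopping above, with an illustrative (made-up)
 * skb-hash value of 0x12345678:
 *   filter_pos[0] =  0x12345678        & 0x3FF = 0x278
 *   filter_pos[1] = (0x12345678 >> 10) & 0x3FF = 0x115
 *   filter_pos[2] = (0x12345678 >> 20) & 0x3FF = 0x123
 * The leftover high-order bits are 0x12345678 >> 30 = 0x0, so
 *   filter_pos[3] = 0x278 ^ 0x115 ^ 0x123 ^ 0x0 = 0x24E
 */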

/* Non-configurable parameters */
#define HH_FLOWS_CNT     1024  /* number of entries in exact-matching table T */
#define HHF_ARRAYS_CNT   4     /* number of arrays in multi-stage filter F */
#define HHF_ARRAYS_LEN   1024  /* number of counters in each array of F */
#define HHF_BIT_MASK_LEN 10    /* masking 10 bits */
#define HHF_BIT_MASK     0x3FF /* bitmask of 10 bits */

#define WDRR_BUCKET_CNT  2     /* two buckets for Weighted DRR */
enum wdrr_bucket_idx {
        WDRR_BUCKET_FOR_HH      = 0, /* bucket id for heavy-hitters */
        WDRR_BUCKET_FOR_NON_HH  = 1  /* bucket id for non-heavy-hitters */
};

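/* Wraparound-safe "a happened before b" for u32 jiffies timestamps; like the
 * kernel's time_before(), the signed difference keeps the comparison correct
 * across u32 overflow.
 */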
#define hhf_time_before(a, b)   \
        (typecheck(u32, a) && typecheck(u32, b) && ((s32)((a) - (b)) < 0))

/* Heavy-hitter per-flow state */
struct hh_flow_state {
        u32              hash_id;       /* hash of flow-id (e.g. TCP 5-tuple) */
        u32              hit_timestamp; /* last time heavy-hitter was seen */
        struct list_head flowchain;     /* chaining under hash collision */
};

/* Weighted Deficit Round Robin (WDRR) scheduler */
struct wdrr_bucket {
        struct sk_buff    *head;
        struct sk_buff    *tail;
        struct list_head  bucketchain;
        int               deficit;
};

struct hhf_sched_data {
        struct wdrr_bucket buckets[WDRR_BUCKET_CNT];
        u32                perturbation;   /* hash perturbation */
        u32                quantum;        /* psched_mtu(qdisc_dev(sch)); */
        u32                drop_overlimit; /* number of times max qdisc packet
                                            * limit was hit
                                            */
        struct list_head   *hh_flows;       /* table T (currently active HHs) */
        u32                hh_flows_limit;            /* max active HH allocs */
        u32                hh_flows_overlimit; /* num of disallowed HH allocs */
        u32                hh_flows_total_cnt;          /* total admitted HHs */
        u32                hh_flows_current_cnt;        /* total current HHs  */
        u32                *hhf_arrays[HHF_ARRAYS_CNT]; /* HH filter F */
        u32                hhf_arrays_reset_timestamp;  /* last time hhf_arrays
                                                         * was reset
                                                         */
        unsigned long      *hhf_valid_bits[HHF_ARRAYS_CNT]; /* shadow valid bits
                                                             * of hhf_arrays
                                                             */
        /* Similar to the "new_flows" vs. "old_flows" concept in fq_codel DRR */
        struct list_head   new_buckets; /* list of new buckets */
        struct list_head   old_buckets; /* list of old buckets */

        /* Configurable HHF parameters */
        u32                hhf_reset_timeout; /* interval to reset counter
                                               * arrays in filter F
                                               * (default 40ms)
                                               */
        u32                hhf_admit_bytes;   /* counter thresh to classify as
                                               * HH (default 128KB).
                                               * With these default values,
                                               * 128KB / 40ms = 25 Mbps
                                               * i.e., we expect to capture HHs
                                               * sending > 25 Mbps.
                                               */
        u32                hhf_evict_timeout; /* aging threshold to evict idle
                                               * HHs out of table T. This should
                                               * be large enough to avoid
                                               * reordering during HH eviction.
                                               * (default 1s)
                                               */
        u32                hhf_non_hh_weight; /* WDRR weight for non-HHs
                                               * (default 2,
                                               *  i.e., non-HH : HH = 2 : 1)
                                               */
};

static u32 hhf_time_stamp(void)
{
        return jiffies;
}

static unsigned int skb_hash(const struct hhf_sched_data *q,
                             const struct sk_buff *skb)
{
        struct flow_keys keys;
        unsigned int hash;

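        /* Prefer the hash the socket has already computed, when available,
         * over dissecting the packet headers.
         */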
        if (skb->sk && skb->sk->sk_hash)
                return skb->sk->sk_hash;

        skb_flow_dissect(skb, &keys);
        hash = jhash_3words((__force u32)keys.dst,
                            (__force u32)keys.src ^ keys.ip_proto,
                            (__force u32)keys.ports, q->perturbation);
        return hash;
}

/* Looks up a heavy-hitter flow in a chaining list of table T. */
static struct hh_flow_state *seek_list(const u32 hash,
                                       struct list_head *head,
                                       struct hhf_sched_data *q)
{
        struct hh_flow_state *flow, *next;
        u32 now = hhf_time_stamp();

        if (list_empty(head))
                return NULL;

        list_for_each_entry_safe(flow, next, head, flowchain) {
                u32 prev = flow->hit_timestamp + q->hhf_evict_timeout;

                if (hhf_time_before(prev, now)) {
                        /* Delete expired heavy-hitters, but preserve one entry
                         * to avoid kzalloc() when next time this slot is hit.
                         */
                        if (list_is_last(&flow->flowchain, head))
                                return NULL;
                        list_del(&flow->flowchain);
                        kfree(flow);
                        q->hh_flows_current_cnt--;
                } else if (flow->hash_id == hash) {
                        return flow;
                }
        }
        return NULL;
}

/* Returns a flow state entry for a new heavy-hitter.  Either reuses an expired
 * entry or dynamically allocates a new one.
 */
static struct hh_flow_state *alloc_new_hh(struct list_head *head,
                                          struct hhf_sched_data *q)
{
        struct hh_flow_state *flow;
        u32 now = hhf_time_stamp();

        if (!list_empty(head)) {
                /* Find an expired heavy-hitter flow entry. */
                list_for_each_entry(flow, head, flowchain) {
                        u32 prev = flow->hit_timestamp + q->hhf_evict_timeout;

                        if (hhf_time_before(prev, now))
                                return flow;
                }
        }

        if (q->hh_flows_current_cnt >= q->hh_flows_limit) {
                q->hh_flows_overlimit++;
                return NULL;
        }
        /* Create new entry. */
        flow = kzalloc(sizeof(struct hh_flow_state), GFP_ATOMIC);
        if (!flow)
                return NULL;

        q->hh_flows_current_cnt++;
        INIT_LIST_HEAD(&flow->flowchain);
        list_add_tail(&flow->flowchain, head);

        return flow;
}

/* Assigns packets to WDRR buckets.  Implements a multi-stage filter to
 * classify heavy-hitters.
 */
static enum wdrr_bucket_idx hhf_classify(struct sk_buff *skb, struct Qdisc *sch)
{
        struct hhf_sched_data *q = qdisc_priv(sch);
        u32 tmp_hash, hash;
        u32 xorsum, filter_pos[HHF_ARRAYS_CNT], flow_pos;
        struct hh_flow_state *flow;
        u32 pkt_len, min_hhf_val;
        int i;
        u32 prev;
        u32 now = hhf_time_stamp();

        /* Reset the HHF counter arrays if this is the right time. */
        prev = q->hhf_arrays_reset_timestamp + q->hhf_reset_timeout;
        if (hhf_time_before(prev, now)) {
                for (i = 0; i < HHF_ARRAYS_CNT; i++)
                        bitmap_zero(q->hhf_valid_bits[i], HHF_ARRAYS_LEN);
                q->hhf_arrays_reset_timestamp = now;
        }

        /* Get hashed flow-id of the skb. */
        hash = skb_hash(q, skb);

        /* Check if this packet belongs to an already established HH flow. */
        flow_pos = hash & HHF_BIT_MASK;
        flow = seek_list(hash, &q->hh_flows[flow_pos], q);
        if (flow) { /* found its HH flow */
                flow->hit_timestamp = now;
                return WDRR_BUCKET_FOR_HH;
        }

        /* Now pass the packet through the multi-stage filter. */
        tmp_hash = hash;
        xorsum = 0;
        for (i = 0; i < HHF_ARRAYS_CNT - 1; i++) {
                /* Split the skb_hash into three 10-bit chunks. */
                filter_pos[i] = tmp_hash & HHF_BIT_MASK;
                xorsum ^= filter_pos[i];
                tmp_hash >>= HHF_BIT_MASK_LEN;
        }
        /* The last index is the XOR sum of the other chunks and the
         * remaining high-order bits of the hash.
         */
        filter_pos[HHF_ARRAYS_CNT - 1] = xorsum ^ tmp_hash;

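        /* Lazily reset stale counters (only the valid bits were cleared on
         * timeout), then compute what O2's conservative update would write:
         * min_hhf_val = smallest counter value + packet bytes.
         */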
        pkt_len = qdisc_pkt_len(skb);
        min_hhf_val = ~0U;
        for (i = 0; i < HHF_ARRAYS_CNT; i++) {
                u32 val;

                if (!test_bit(filter_pos[i], q->hhf_valid_bits[i])) {
                        q->hhf_arrays[i][filter_pos[i]] = 0;
                        __set_bit(filter_pos[i], q->hhf_valid_bits[i]);
                }

                val = q->hhf_arrays[i][filter_pos[i]] + pkt_len;
                if (min_hhf_val > val)
                        min_hhf_val = val;
        }

        /* Found a new HH iff all counter values > HH admit threshold. */
        if (min_hhf_val > q->hhf_admit_bytes) {
                /* Just captured a new heavy-hitter. */
                flow = alloc_new_hh(&q->hh_flows[flow_pos], q);
                if (!flow) /* memory alloc problem */
                        return WDRR_BUCKET_FOR_NON_HH;
                flow->hash_id = hash;
                flow->hit_timestamp = now;
                q->hh_flows_total_cnt++;

                /* By returning without updating counters in q->hhf_arrays,
                 * we implicitly implement "shielding" (see Optimization O1).
                 */
                return WDRR_BUCKET_FOR_HH;
        }

        /* Conservative update of HHF arrays (see Optimization O2). */
        for (i = 0; i < HHF_ARRAYS_CNT; i++) {
                if (q->hhf_arrays[i][filter_pos[i]] < min_hhf_val)
                        q->hhf_arrays[i][filter_pos[i]] = min_hhf_val;
        }
        return WDRR_BUCKET_FOR_NON_HH;
}

/* Removes one skb from head of bucket. */
static struct sk_buff *dequeue_head(struct wdrr_bucket *bucket)
{
        struct sk_buff *skb = bucket->head;

        bucket->head = skb->next;
        skb->next = NULL;
        return skb;
}

/* Tail-adds skb to bucket. */
static void bucket_add(struct wdrr_bucket *bucket, struct sk_buff *skb)
{
        if (bucket->head == NULL)
                bucket->head = skb;
        else
                bucket->tail->next = skb;
        bucket->tail = skb;
        skb->next = NULL;
}

static unsigned int hhf_drop(struct Qdisc *sch)
{
        struct hhf_sched_data *q = qdisc_priv(sch);
        struct wdrr_bucket *bucket;

        /* Always try to drop from heavy-hitters first. */
        bucket = &q->buckets[WDRR_BUCKET_FOR_HH];
        if (!bucket->head)
                bucket = &q->buckets[WDRR_BUCKET_FOR_NON_HH];

        if (bucket->head) {
                struct sk_buff *skb = dequeue_head(bucket);

                sch->q.qlen--;
                qdisc_qstats_drop(sch);
                qdisc_qstats_backlog_dec(sch, skb);
                kfree_skb(skb);
        }

        /* Return id of the bucket from which the packet was dropped. */
        return bucket - q->buckets;
}

static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
        struct hhf_sched_data *q = qdisc_priv(sch);
        enum wdrr_bucket_idx idx;
        struct wdrr_bucket *bucket;

        idx = hhf_classify(skb, sch);

        bucket = &q->buckets[idx];
        bucket_add(bucket, skb);
        qdisc_qstats_backlog_inc(sch, skb);

        if (list_empty(&bucket->bucketchain)) {
                unsigned int weight;

                /* The logic of new_buckets vs. old_buckets is the same as
                 * new_flows vs. old_flows in the implementation of fq_codel,
                 * i.e., short bursts of non-HHs should have strict priority.
                 */
                if (idx == WDRR_BUCKET_FOR_HH) {
                        /* Always move heavy-hitters to old bucket. */
                        weight = 1;
                        list_add_tail(&bucket->bucketchain, &q->old_buckets);
                } else {
                        weight = q->hhf_non_hh_weight;
                        list_add_tail(&bucket->bucketchain, &q->new_buckets);
                }
                bucket->deficit = weight * q->quantum;
        }
        if (++sch->q.qlen <= sch->limit)
                return NET_XMIT_SUCCESS;

        q->drop_overlimit++;
        /* Return Congestion Notification only if we dropped a packet from this
         * bucket.
         */
        if (hhf_drop(sch) == idx)
                return NET_XMIT_CN;

        /* As we dropped a packet, better let upper stack know this. */
        qdisc_tree_decrease_qlen(sch, 1);
        return NET_XMIT_SUCCESS;
}

static struct sk_buff *hhf_dequeue(struct Qdisc *sch)
{
        struct hhf_sched_data *q = qdisc_priv(sch);
        struct sk_buff *skb = NULL;
        struct wdrr_bucket *bucket;
        struct list_head *head;

begin:
        head = &q->new_buckets;
        if (list_empty(head)) {
                head = &q->old_buckets;
                if (list_empty(head))
                        return NULL;
        }
        bucket = list_first_entry(head, struct wdrr_bucket, bucketchain);

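        /* Standard DRR step: a bucket whose deficit is exhausted gets a fresh
         * quantum (scaled by its WDRR weight) and is rotated to the end of
         * old_buckets before being revisited.
         */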
        if (bucket->deficit <= 0) {
                int weight = (bucket - q->buckets == WDRR_BUCKET_FOR_HH) ?
                              1 : q->hhf_non_hh_weight;

                bucket->deficit += weight * q->quantum;
                list_move_tail(&bucket->bucketchain, &q->old_buckets);
                goto begin;
        }

        if (bucket->head) {
                skb = dequeue_head(bucket);
                sch->q.qlen--;
                qdisc_qstats_backlog_dec(sch, skb);
        }

        if (!skb) {
                /* Force a pass through old_buckets to prevent starvation. */
                if ((head == &q->new_buckets) && !list_empty(&q->old_buckets))
                        list_move_tail(&bucket->bucketchain, &q->old_buckets);
                else
                        list_del_init(&bucket->bucketchain);
                goto begin;
        }
        qdisc_bstats_update(sch, skb);
        bucket->deficit -= qdisc_pkt_len(skb);

        return skb;
}

static void hhf_reset(struct Qdisc *sch)
{
        struct sk_buff *skb;

        while ((skb = hhf_dequeue(sch)) != NULL)
                kfree_skb(skb);
}

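/* Allocate zeroed memory, falling back from kzalloc to vzalloc when physically
 * contiguous memory is unavailable.
 */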
static void *hhf_zalloc(size_t sz)
{
        void *ptr = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN);

        if (!ptr)
                ptr = vzalloc(sz);

        return ptr;
}

static void hhf_free(void *addr)
{
        kvfree(addr);
}

static void hhf_destroy(struct Qdisc *sch)
{
        int i;
        struct hhf_sched_data *q = qdisc_priv(sch);

        for (i = 0; i < HHF_ARRAYS_CNT; i++) {
                hhf_free(q->hhf_arrays[i]);
                hhf_free(q->hhf_valid_bits[i]);
        }

        for (i = 0; i < HH_FLOWS_CNT; i++) {
                struct hh_flow_state *flow, *next;
                struct list_head *head = &q->hh_flows[i];

                if (list_empty(head))
                        continue;
                list_for_each_entry_safe(flow, next, head, flowchain) {
                        list_del(&flow->flowchain);
                        kfree(flow);
                }
        }
        hhf_free(q->hh_flows);
}

static const struct nla_policy hhf_policy[TCA_HHF_MAX + 1] = {
        [TCA_HHF_BACKLOG_LIMIT]  = { .type = NLA_U32 },
        [TCA_HHF_QUANTUM]        = { .type = NLA_U32 },
        [TCA_HHF_HH_FLOWS_LIMIT] = { .type = NLA_U32 },
        [TCA_HHF_RESET_TIMEOUT]  = { .type = NLA_U32 },
        [TCA_HHF_ADMIT_BYTES]    = { .type = NLA_U32 },
        [TCA_HHF_EVICT_TIMEOUT]  = { .type = NLA_U32 },
        [TCA_HHF_NON_HH_WEIGHT]  = { .type = NLA_U32 },
};

static int hhf_change(struct Qdisc *sch, struct nlattr *opt)
{
        struct hhf_sched_data *q = qdisc_priv(sch);
        struct nlattr *tb[TCA_HHF_MAX + 1];
        unsigned int qlen;
        int err;
        u64 non_hh_quantum;
        u32 new_quantum = q->quantum;
        u32 new_hhf_non_hh_weight = q->hhf_non_hh_weight;

        if (!opt)
                return -EINVAL;

        err = nla_parse_nested(tb, TCA_HHF_MAX, opt, hhf_policy);
        if (err < 0)
                return err;

        if (tb[TCA_HHF_QUANTUM])
                new_quantum = nla_get_u32(tb[TCA_HHF_QUANTUM]);

        if (tb[TCA_HHF_NON_HH_WEIGHT])
                new_hhf_non_hh_weight = nla_get_u32(tb[TCA_HHF_NON_HH_WEIGHT]);

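        /* The weighted quantum lands in a signed 32-bit deficit counter, so
         * reject configurations that would overflow it.
         */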
        non_hh_quantum = (u64)new_quantum * new_hhf_non_hh_weight;
        if (non_hh_quantum > INT_MAX)
                return -EINVAL;

        sch_tree_lock(sch);

        if (tb[TCA_HHF_BACKLOG_LIMIT])
                sch->limit = nla_get_u32(tb[TCA_HHF_BACKLOG_LIMIT]);

        q->quantum = new_quantum;
        q->hhf_non_hh_weight = new_hhf_non_hh_weight;

        if (tb[TCA_HHF_HH_FLOWS_LIMIT])
                q->hh_flows_limit = nla_get_u32(tb[TCA_HHF_HH_FLOWS_LIMIT]);

        if (tb[TCA_HHF_RESET_TIMEOUT]) {
                u32 us = nla_get_u32(tb[TCA_HHF_RESET_TIMEOUT]);

                q->hhf_reset_timeout = usecs_to_jiffies(us);
        }

        if (tb[TCA_HHF_ADMIT_BYTES])
                q->hhf_admit_bytes = nla_get_u32(tb[TCA_HHF_ADMIT_BYTES]);

        if (tb[TCA_HHF_EVICT_TIMEOUT]) {
                u32 us = nla_get_u32(tb[TCA_HHF_EVICT_TIMEOUT]);

                q->hhf_evict_timeout = usecs_to_jiffies(us);
        }

        qlen = sch->q.qlen;
        while (sch->q.qlen > sch->limit) {
                struct sk_buff *skb = hhf_dequeue(sch);

                kfree_skb(skb);
        }
        qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);

        sch_tree_unlock(sch);
        return 0;
}

static int hhf_init(struct Qdisc *sch, struct nlattr *opt)
{
        struct hhf_sched_data *q = qdisc_priv(sch);
        int i;

        sch->limit = 1000;
        q->quantum = psched_mtu(qdisc_dev(sch));
        q->perturbation = prandom_u32();
        INIT_LIST_HEAD(&q->new_buckets);
        INIT_LIST_HEAD(&q->old_buckets);

        /* Configurable HHF parameters */
        q->hhf_reset_timeout = HZ / 25; /* 40  ms */
        q->hhf_admit_bytes = 131072;    /* 128 KB */
        q->hhf_evict_timeout = HZ;      /* 1  sec */
        q->hhf_non_hh_weight = 2;

        if (opt) {
                int err = hhf_change(sch, opt);

                if (err)
                        return err;
        }

        if (!q->hh_flows) {
                /* Initialize heavy-hitter flow table. */
                q->hh_flows = hhf_zalloc(HH_FLOWS_CNT *
                                         sizeof(struct list_head));
                if (!q->hh_flows)
                        return -ENOMEM;
                for (i = 0; i < HH_FLOWS_CNT; i++)
                        INIT_LIST_HEAD(&q->hh_flows[i]);

                /* Cap max active HHs at twice len of hh_flows table. */
                q->hh_flows_limit = 2 * HH_FLOWS_CNT;
                q->hh_flows_overlimit = 0;
                q->hh_flows_total_cnt = 0;
                q->hh_flows_current_cnt = 0;

                /* Initialize heavy-hitter filter arrays. */
                for (i = 0; i < HHF_ARRAYS_CNT; i++) {
                        q->hhf_arrays[i] = hhf_zalloc(HHF_ARRAYS_LEN *
                                                      sizeof(u32));
                        if (!q->hhf_arrays[i]) {
                                hhf_destroy(sch);
                                return -ENOMEM;
                        }
                }
                q->hhf_arrays_reset_timestamp = hhf_time_stamp();

                /* Initialize valid bits of heavy-hitter filter arrays. */
                for (i = 0; i < HHF_ARRAYS_CNT; i++) {
                        q->hhf_valid_bits[i] = hhf_zalloc(HHF_ARRAYS_LEN /
                                                          BITS_PER_BYTE);
                        if (!q->hhf_valid_bits[i]) {
                                hhf_destroy(sch);
                                return -ENOMEM;
                        }
                }

                /* Initialize Weighted DRR buckets. */
                for (i = 0; i < WDRR_BUCKET_CNT; i++) {
                        struct wdrr_bucket *bucket = q->buckets + i;

                        INIT_LIST_HEAD(&bucket->bucketchain);
                }
        }

        return 0;
}

static int hhf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
        struct hhf_sched_data *q = qdisc_priv(sch);
        struct nlattr *opts;

        opts = nla_nest_start(skb, TCA_OPTIONS);
        if (opts == NULL)
                goto nla_put_failure;

        if (nla_put_u32(skb, TCA_HHF_BACKLOG_LIMIT, sch->limit) ||
            nla_put_u32(skb, TCA_HHF_QUANTUM, q->quantum) ||
            nla_put_u32(skb, TCA_HHF_HH_FLOWS_LIMIT, q->hh_flows_limit) ||
            nla_put_u32(skb, TCA_HHF_RESET_TIMEOUT,
                        jiffies_to_usecs(q->hhf_reset_timeout)) ||
            nla_put_u32(skb, TCA_HHF_ADMIT_BYTES, q->hhf_admit_bytes) ||
            nla_put_u32(skb, TCA_HHF_EVICT_TIMEOUT,
                        jiffies_to_usecs(q->hhf_evict_timeout)) ||
            nla_put_u32(skb, TCA_HHF_NON_HH_WEIGHT, q->hhf_non_hh_weight))
                goto nla_put_failure;

        return nla_nest_end(skb, opts);

nla_put_failure:
        return -1;
}

static int hhf_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
        struct hhf_sched_data *q = qdisc_priv(sch);
        struct tc_hhf_xstats st = {
                .drop_overlimit = q->drop_overlimit,
                .hh_overlimit   = q->hh_flows_overlimit,
                .hh_tot_count   = q->hh_flows_total_cnt,
                .hh_cur_count   = q->hh_flows_current_cnt,
        };

        return gnet_stats_copy_app(d, &st, sizeof(st));
}

static struct Qdisc_ops hhf_qdisc_ops __read_mostly = {
        .id             =       "hhf",
        .priv_size      =       sizeof(struct hhf_sched_data),

        .enqueue        =       hhf_enqueue,
        .dequeue        =       hhf_dequeue,
        .peek           =       qdisc_peek_dequeued,
        .drop           =       hhf_drop,
        .init           =       hhf_init,
        .reset          =       hhf_reset,
        .destroy        =       hhf_destroy,
        .change         =       hhf_change,
        .dump           =       hhf_dump,
        .dump_stats     =       hhf_dump_stats,
        .owner          =       THIS_MODULE,
};

static int __init hhf_module_init(void)
{
        return register_qdisc(&hhf_qdisc_ops);
}

static void __exit hhf_module_exit(void)
{
        unregister_qdisc(&hhf_qdisc_ops);
}

module_init(hhf_module_init)
module_exit(hhf_module_exit)
MODULE_AUTHOR("Terry Lam");
MODULE_AUTHOR("Nandita Dukkipati");
MODULE_LICENSE("GPL");
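
/* Example usage (a sketch; parameter names assume an iproute2 build with
 * tc-hhf(8) support):
 *
 *   tc qdisc add dev eth0 root hhf limit 1000 quantum 1514 hh_limit 2048 \
 *      reset_timeout 40ms admit_bytes 128kb evict_timeout 1s non_hh_weight 2
 *
 *   tc -s qdisc show dev eth0   # hhf_dump_stats() counters (drop_overlimit,
 *                               # hh_tot_count, etc.) show up here
 */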