linux/net/sched/sch_choke.c
/*
 * net/sched/sch_choke.c        CHOKE scheduler
 *
 * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
 * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 2 as published by the Free Software Foundation.
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/reciprocal_div.h>
#include <linux/vmalloc.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>
#include <net/red.h>
#include <net/flow_keys.h>

/*
   CHOKe stateless AQM for fair bandwidth allocation
   =================================================

   CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
   unresponsive flows) is a variant of RED that penalizes misbehaving flows but
   maintains no flow state. The difference from RED is an additional step
   during the enqueuing process. If average queue size is over the
   low threshold (qmin), a packet is chosen at random from the queue.
   If both the new and chosen packet are from the same flow, both
   are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it
   needs to access packets in queue randomly. It has a minimal class
   interface to allow overriding the builtin flow classifier with
   filters.

   Source:
   R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
   Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
   IEEE INFOCOM, 2000.

   A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
   Characteristics", IEEE/ACM Transactions on Networking, 2004

 */

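/* For orientation: CHOKe is configured from user space with RED-style
 * parameters (limit, min/max thresholds, avpkt, burst, optional ECN).
 * A rough, illustrative invocation might look like the following; the
 * numbers and the interface name are only an example, and the exact
 * syntax and defaults are documented in tc-choke(8):
 *
 *   tc qdisc add dev eth0 parent 1:1 handle 10: choke limit 1000 \
 *           min 100 max 300 avpkt 1000 burst 200 bandwidth 10mbit ecn
 */
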
  49/* Upper bound on size of sk_buff table (packets) */
  50#define CHOKE_MAX_QUEUE (128*1024 - 1)
  51
  52struct choke_sched_data {
  53/* Parameters */
  54        u32              limit;
  55        unsigned char    flags;
  56
  57        struct red_parms parms;
  58
  59/* Variables */
  60        struct red_vars  vars;
  61        struct tcf_proto *filter_list;
  62        struct {
  63                u32     prob_drop;      /* Early probability drops */
  64                u32     prob_mark;      /* Early probability marks */
  65                u32     forced_drop;    /* Forced drops, qavg > max_thresh */
  66                u32     forced_mark;    /* Forced marks, qavg > max_thresh */
  67                u32     pdrop;          /* Drops due to queue limits */
  68                u32     other;          /* Drops due to drop() calls */
  69                u32     matched;        /* Drops to flow match */
  70        } stats;
  71
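        /* Queued packets live in the circular buffer 'tab'; head/tail wrap
         * via tab_mask.  Dropping a packet by index simply leaves a NULL
         * "hole", which the zap helpers below skip over, so no memmove is
         * needed.
         */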
        unsigned int     head;
        unsigned int     tail;

        unsigned int     tab_mask; /* size - 1 */

        struct sk_buff **tab;
};

/* deliver a random number between 0 and N - 1 */
static u32 random_N(unsigned int N)
{
        return reciprocal_divide(random32(), N);
}

/* number of elements in queue including holes */
static unsigned int choke_len(const struct choke_sched_data *q)
{
        return (q->tail - q->head) & q->tab_mask;
}

/* Is ECN parameter configured */
static int use_ecn(const struct choke_sched_data *q)
{
        return q->flags & TC_RED_ECN;
}

/* Should packets over max just be dropped (versus marked) */
static int use_harddrop(const struct choke_sched_data *q)
{
        return q->flags & TC_RED_HARDDROP;
}

/* Move head pointer forward to skip over holes */
static void choke_zap_head_holes(struct choke_sched_data *q)
{
        do {
                q->head = (q->head + 1) & q->tab_mask;
                if (q->head == q->tail)
                        break;
        } while (q->tab[q->head] == NULL);
}

/* Move tail pointer backwards to reuse holes */
static void choke_zap_tail_holes(struct choke_sched_data *q)
{
        do {
                q->tail = (q->tail - 1) & q->tab_mask;
                if (q->head == q->tail)
                        break;
        } while (q->tab[q->tail] == NULL);
}

/* Drop packet from queue array by creating a "hole" */
static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx)
{
        struct choke_sched_data *q = qdisc_priv(sch);
        struct sk_buff *skb = q->tab[idx];

        q->tab[idx] = NULL;

        if (idx == q->head)
                choke_zap_head_holes(q);
        if (idx == q->tail)
                choke_zap_tail_holes(q);

        sch->qstats.backlog -= qdisc_pkt_len(skb);
        qdisc_drop(skb, sch);
        qdisc_tree_decrease_qlen(sch, 1);
        --sch->q.qlen;
}

struct choke_skb_cb {
        u16                     classid;
        u8                      keys_valid;
        struct flow_keys        keys;
};

static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb)
{
        qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb));
        return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data;
}

static inline void choke_set_classid(struct sk_buff *skb, u16 classid)
{
        choke_skb_cb(skb)->classid = classid;
}

static u16 choke_get_classid(const struct sk_buff *skb)
{
        return choke_skb_cb(skb)->classid;
}

/*
 * Compare flow of two packets
 *  Returns true only if source and destination addresses and ports match;
 *          false for special cases.
 */
static bool choke_match_flow(struct sk_buff *skb1,
                             struct sk_buff *skb2)
{
        if (skb1->protocol != skb2->protocol)
                return false;

        if (!choke_skb_cb(skb1)->keys_valid) {
                choke_skb_cb(skb1)->keys_valid = 1;
                skb_flow_dissect(skb1, &choke_skb_cb(skb1)->keys);
        }

        if (!choke_skb_cb(skb2)->keys_valid) {
                choke_skb_cb(skb2)->keys_valid = 1;
                skb_flow_dissect(skb2, &choke_skb_cb(skb2)->keys);
        }

        return !memcmp(&choke_skb_cb(skb1)->keys,
                       &choke_skb_cb(skb2)->keys,
                       sizeof(struct flow_keys));
}

/*
 * Classify flow using either:
 *  1. pre-existing classification result in skb
 *  2. fast internal classification
 *  3. TC filter based classification
 */
static bool choke_classify(struct sk_buff *skb,
                           struct Qdisc *sch, int *qerr)
{
        struct choke_sched_data *q = qdisc_priv(sch);
        struct tcf_result res;
        int result;

        result = tc_classify(skb, q->filter_list, &res);
        if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
                switch (result) {
                case TC_ACT_STOLEN:
                case TC_ACT_QUEUED:
                        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
                        /* fall through */
                case TC_ACT_SHOT:
                        return false;
                }
#endif
                choke_set_classid(skb, TC_H_MIN(res.classid));
                return true;
        }

        return false;
}

/*
 * Select a packet at random from queue
 * HACK: since queue can have holes from previous deletions, retry several
 *   times to find a random skb but then just give up and return the head
 * Will return NULL if queue is empty (q->head == q->tail)
 */
static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
                                         unsigned int *pidx)
{
        struct sk_buff *skb;
        int retrys = 3;

        do {
                *pidx = (q->head + random_N(choke_len(q))) & q->tab_mask;
                skb = q->tab[*pidx];
                if (skb)
                        return skb;
        } while (--retrys > 0);

        return q->tab[*pidx = q->head];
}

/*
 * Compare new packet with random packet in queue
 * returns true if matched and sets *pidx
 */
static bool choke_match_random(const struct choke_sched_data *q,
                               struct sk_buff *nskb,
                               unsigned int *pidx)
{
        struct sk_buff *oskb;

        if (q->head == q->tail)
                return false;

        oskb = choke_peek_random(q, pidx);
        if (q->filter_list)
                return choke_get_classid(nskb) == choke_get_classid(oskb);

        return choke_match_flow(oskb, nskb);
}

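/* Enqueue path: classify (if filters are attached), update the RED average
 * queue length and, when it exceeds the minimum threshold, apply the CHOKe
 * test: compare the new packet against a randomly chosen queued packet and
 * drop both on a flow match.  On no match, normal RED marking/dropping
 * applies before the packet is tail-inserted.
 */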
static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
        struct choke_sched_data *q = qdisc_priv(sch);
        const struct red_parms *p = &q->parms;
        int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;

        if (q->filter_list) {
                /* If using external classifiers, get result and record it. */
                if (!choke_classify(skb, sch, &ret))
                        goto other_drop;        /* Packet was eaten by filter */
        }

        choke_skb_cb(skb)->keys_valid = 0;
        /* Compute average queue usage (see RED) */
        q->vars.qavg = red_calc_qavg(p, &q->vars, sch->q.qlen);
        if (red_is_idling(&q->vars))
                red_end_of_idle_period(&q->vars);

        /* Is queue small? */
        if (q->vars.qavg <= p->qth_min)
                q->vars.qcount = -1;
        else {
                unsigned int idx;

                /* Draw a packet at random from queue and compare flow */
                if (choke_match_random(q, skb, &idx)) {
                        q->stats.matched++;
                        choke_drop_by_idx(sch, idx);
                        goto congestion_drop;
                }

                /* Queue is large, always mark/drop */
                if (q->vars.qavg > p->qth_max) {
                        q->vars.qcount = -1;

                        sch->qstats.overlimits++;
                        if (use_harddrop(q) || !use_ecn(q) ||
                            !INET_ECN_set_ce(skb)) {
                                q->stats.forced_drop++;
                                goto congestion_drop;
                        }

                        q->stats.forced_mark++;
                } else if (++q->vars.qcount) {
                        if (red_mark_probability(p, &q->vars, q->vars.qavg)) {
                                q->vars.qcount = 0;
                                q->vars.qR = red_random(p);

                                sch->qstats.overlimits++;
                                if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
                                        q->stats.prob_drop++;
                                        goto congestion_drop;
                                }

                                q->stats.prob_mark++;
                        }
                } else
                        q->vars.qR = red_random(p);
        }

        /* Admit new packet */
        if (sch->q.qlen < q->limit) {
                q->tab[q->tail] = skb;
                q->tail = (q->tail + 1) & q->tab_mask;
                ++sch->q.qlen;
                sch->qstats.backlog += qdisc_pkt_len(skb);
                return NET_XMIT_SUCCESS;
        }

        q->stats.pdrop++;
        sch->qstats.drops++;
        kfree_skb(skb);
        return NET_XMIT_DROP;

 congestion_drop:
        qdisc_drop(skb, sch);
        return NET_XMIT_CN;

 other_drop:
        if (ret & __NET_XMIT_BYPASS)
                sch->qstats.drops++;
        kfree_skb(skb);
        return ret;
}

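/* Dequeue path: if the queue is empty, start the RED idle period and return
 * NULL; otherwise pull the packet at the head of the circular buffer and
 * skip over any holes left by earlier CHOKe drops.
 */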
static struct sk_buff *choke_dequeue(struct Qdisc *sch)
{
        struct choke_sched_data *q = qdisc_priv(sch);
        struct sk_buff *skb;

        if (q->head == q->tail) {
                if (!red_is_idling(&q->vars))
                        red_start_of_idle_period(&q->vars);
                return NULL;
        }

        skb = q->tab[q->head];
        q->tab[q->head] = NULL;
        choke_zap_head_holes(q);
        --sch->q.qlen;
        sch->qstats.backlog -= qdisc_pkt_len(skb);
        qdisc_bstats_update(sch, skb);

        return skb;
}

static unsigned int choke_drop(struct Qdisc *sch)
{
        struct choke_sched_data *q = qdisc_priv(sch);
        unsigned int len;

        len = qdisc_queue_drop(sch);
        if (len > 0)
                q->stats.other++;
        else {
                if (!red_is_idling(&q->vars))
                        red_start_of_idle_period(&q->vars);
        }

        return len;
}

static void choke_reset(struct Qdisc *sch)
{
        struct choke_sched_data *q = qdisc_priv(sch);

        red_restart(&q->vars);
}

static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
        [TCA_CHOKE_PARMS]       = { .len = sizeof(struct tc_red_qopt) },
        [TCA_CHOKE_STAB]        = { .len = RED_STAB_SIZE },
        [TCA_CHOKE_MAX_P]       = { .type = NLA_U32 },
};

static void choke_free(void *addr)
{
        if (addr) {
                if (is_vmalloc_addr(addr))
                        vfree(addr);
                else
                        kfree(addr);
        }
}

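/* (Re)configure the qdisc from netlink attributes: validate the RED-style
 * parameters, size the packet table to a power of two large enough for the
 * configured limit, migrate any queued packets into the new table (dropping
 * those that no longer fit), and install the new RED parameters under the
 * qdisc tree lock.
 */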
static int choke_change(struct Qdisc *sch, struct nlattr *opt)
{
        struct choke_sched_data *q = qdisc_priv(sch);
        struct nlattr *tb[TCA_CHOKE_MAX + 1];
        const struct tc_red_qopt *ctl;
        int err;
        struct sk_buff **old = NULL;
        unsigned int mask;
        u32 max_P;

        if (opt == NULL)
                return -EINVAL;

        err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy);
        if (err < 0)
                return err;

        if (tb[TCA_CHOKE_PARMS] == NULL ||
            tb[TCA_CHOKE_STAB] == NULL)
                return -EINVAL;

        max_P = tb[TCA_CHOKE_MAX_P] ? nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0;

        ctl = nla_data(tb[TCA_CHOKE_PARMS]);

        if (ctl->limit > CHOKE_MAX_QUEUE)
                return -EINVAL;

        mask = roundup_pow_of_two(ctl->limit + 1) - 1;
        if (mask != q->tab_mask) {
                struct sk_buff **ntab;

                ntab = kcalloc(mask + 1, sizeof(struct sk_buff *), GFP_KERNEL);
                if (!ntab)
                        ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *));
                if (!ntab)
                        return -ENOMEM;

                sch_tree_lock(sch);
                old = q->tab;
                if (old) {
                        unsigned int oqlen = sch->q.qlen, tail = 0;

                        while (q->head != q->tail) {
                                struct sk_buff *skb = q->tab[q->head];

                                q->head = (q->head + 1) & q->tab_mask;
                                if (!skb)
                                        continue;
                                if (tail < mask) {
                                        ntab[tail++] = skb;
                                        continue;
                                }
                                sch->qstats.backlog -= qdisc_pkt_len(skb);
                                --sch->q.qlen;
                                qdisc_drop(skb, sch);
                        }
                        qdisc_tree_decrease_qlen(sch, oqlen - sch->q.qlen);
                        q->head = 0;
                        q->tail = tail;
                }

                q->tab_mask = mask;
                q->tab = ntab;
        } else
                sch_tree_lock(sch);

        q->flags = ctl->flags;
        q->limit = ctl->limit;

        red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
                      ctl->Plog, ctl->Scell_log,
                      nla_data(tb[TCA_CHOKE_STAB]),
                      max_P);
        red_set_vars(&q->vars);

        if (q->head == q->tail)
                red_end_of_idle_period(&q->vars);

        sch_tree_unlock(sch);
        choke_free(old);
        return 0;
}

static int choke_init(struct Qdisc *sch, struct nlattr *opt)
{
        return choke_change(sch, opt);
}

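/* Report the current RED-style configuration to user space; the thresholds
 * are converted back from their scaled fixed-point representation.
 */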
static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
{
        struct choke_sched_data *q = qdisc_priv(sch);
        struct nlattr *opts = NULL;
        struct tc_red_qopt opt = {
                .limit          = q->limit,
                .flags          = q->flags,
                .qth_min        = q->parms.qth_min >> q->parms.Wlog,
                .qth_max        = q->parms.qth_max >> q->parms.Wlog,
                .Wlog           = q->parms.Wlog,
                .Plog           = q->parms.Plog,
                .Scell_log      = q->parms.Scell_log,
        };

        opts = nla_nest_start(skb, TCA_OPTIONS);
        if (opts == NULL)
                goto nla_put_failure;

        NLA_PUT(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt);
        NLA_PUT_U32(skb, TCA_CHOKE_MAX_P, q->parms.max_P);
        return nla_nest_end(skb, opts);

nla_put_failure:
        nla_nest_cancel(skb, opts);
        return -EMSGSIZE;
}

static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
        struct choke_sched_data *q = qdisc_priv(sch);
        struct tc_choke_xstats st = {
                .early  = q->stats.prob_drop + q->stats.forced_drop,
                .marked = q->stats.prob_mark + q->stats.forced_mark,
                .pdrop  = q->stats.pdrop,
                .other  = q->stats.other,
                .matched = q->stats.matched,
        };

        return gnet_stats_copy_app(d, &st, sizeof(st));
}

static void choke_destroy(struct Qdisc *sch)
{
        struct choke_sched_data *q = qdisc_priv(sch);

        tcf_destroy_chain(&q->filter_list);
        choke_free(q->tab);
}

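/* CHOKe is not a true classful qdisc; the stub class operations below exist
 * only so that tc filters can be attached to override the builtin flow
 * classifier (see the header comment above).
 */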
static struct Qdisc *choke_leaf(struct Qdisc *sch, unsigned long arg)
{
        return NULL;
}

static unsigned long choke_get(struct Qdisc *sch, u32 classid)
{
        return 0;
}

static void choke_put(struct Qdisc *q, unsigned long cl)
{
}

static unsigned long choke_bind(struct Qdisc *sch, unsigned long parent,
                                u32 classid)
{
        return 0;
}

static struct tcf_proto **choke_find_tcf(struct Qdisc *sch, unsigned long cl)
{
        struct choke_sched_data *q = qdisc_priv(sch);

        if (cl)
                return NULL;
        return &q->filter_list;
}

static int choke_dump_class(struct Qdisc *sch, unsigned long cl,
                            struct sk_buff *skb, struct tcmsg *tcm)
{
        tcm->tcm_handle |= TC_H_MIN(cl);
        return 0;
}

static void choke_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
        if (!arg->stop) {
                if (arg->fn(sch, 1, arg) < 0) {
                        arg->stop = 1;
                        return;
                }
                arg->count++;
        }
}

static const struct Qdisc_class_ops choke_class_ops = {
        .leaf           =       choke_leaf,
        .get            =       choke_get,
        .put            =       choke_put,
        .tcf_chain      =       choke_find_tcf,
        .bind_tcf       =       choke_bind,
        .unbind_tcf     =       choke_put,
        .dump           =       choke_dump_class,
        .walk           =       choke_walk,
};

static struct sk_buff *choke_peek_head(struct Qdisc *sch)
{
        struct choke_sched_data *q = qdisc_priv(sch);

        return (q->head != q->tail) ? q->tab[q->head] : NULL;
}

static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
        .id             =       "choke",
        .priv_size      =       sizeof(struct choke_sched_data),

        .enqueue        =       choke_enqueue,
        .dequeue        =       choke_dequeue,
        .peek           =       choke_peek_head,
        .drop           =       choke_drop,
        .init           =       choke_init,
        .destroy        =       choke_destroy,
        .reset          =       choke_reset,
        .change         =       choke_change,
        .dump           =       choke_dump,
        .dump_stats     =       choke_dump_stats,
        .owner          =       THIS_MODULE,
};

static int __init choke_module_init(void)
{
        return register_qdisc(&choke_qdisc_ops);
}

static void __exit choke_module_exit(void)
{
        unregister_qdisc(&choke_qdisc_ops);
}

module_init(choke_module_init)
module_exit(choke_module_exit)

MODULE_LICENSE("GPL");