/*
 * net/sched/sch_choke.c        CHOKE scheduler
 *
 * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
 * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 2 as published by the Free Software Foundation.
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>
#include <net/red.h>
#include <net/flow_dissector.h>

/*
   CHOKe stateless AQM for fair bandwidth allocation
   =================================================

   CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
   unresponsive flows) is a variant of RED that penalizes misbehaving flows but
   maintains no flow state. The difference from RED is an additional step
   during the enqueuing process. If average queue size is over the
   low threshold (qmin), a packet is chosen at random from the queue.
   If both the new and chosen packet are from the same flow, both
   are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it
   needs to access packets in queue randomly. It has a minimal class
   interface to allow overriding the builtin flow classifier with
   filters.

   Source:
   R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
   Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
   IEEE INFOCOM, 2000.

   A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
   Characteristics", IEEE/ACM Transactions on Networking, 2004.
 */

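/*
 * Illustrative configuration (not part of this file): CHOKe takes the same
 * parameters as RED, via iproute2's tc-choke(8). The command and values
 * below are placeholders only, sketching a typical setup where limit, min,
 * max and burst are in packets and avpkt is in bytes:
 *
 *   tc qdisc add dev eth0 root choke limit 1000 min 83 max 250 \
 *      avpkt 1000 burst 138 bandwidth 10mbit probability 0.02
 */
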
/* Upper bound on size of sk_buff table (packets) */
#define CHOKE_MAX_QUEUE (128*1024 - 1)

struct choke_sched_data {
/* Parameters */
        u32              limit;
        unsigned char    flags;

        struct red_parms parms;

/* Variables */
        struct red_vars  vars;
        struct tcf_proto __rcu *filter_list;
        struct {
                u32     prob_drop;      /* Early probability drops */
                u32     prob_mark;      /* Early probability marks */
                u32     forced_drop;    /* Forced drops, qavg > max_thresh */
                u32     forced_mark;    /* Forced marks, qavg > max_thresh */
                u32     pdrop;          /* Drops due to queue limits */
                u32     other;          /* Drops due to drop() calls */
                u32     matched;        /* Drops due to flow match */
        } stats;

        unsigned int     head;
        unsigned int     tail;

        unsigned int     tab_mask; /* size - 1 */

        struct sk_buff **tab;
};

/* number of elements in queue including holes */
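/*
 * Illustrative example: with tab_mask = 127, head = 120 and tail = 8 the
 * ring has wrapped, and (8 - 120) & 127 = 16 slots are counted as in use
 * (occupied entries plus any holes between head and tail).
 */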
static unsigned int choke_len(const struct choke_sched_data *q)
{
        return (q->tail - q->head) & q->tab_mask;
}

/* Is ECN parameter configured */
static int use_ecn(const struct choke_sched_data *q)
{
        return q->flags & TC_RED_ECN;
}

/* Should packets over max just be dropped (versus marked) */
static int use_harddrop(const struct choke_sched_data *q)
{
        return q->flags & TC_RED_HARDDROP;
}

/* Move head pointer forward to skip over holes */
static void choke_zap_head_holes(struct choke_sched_data *q)
{
        do {
                q->head = (q->head + 1) & q->tab_mask;
                if (q->head == q->tail)
                        break;
        } while (q->tab[q->head] == NULL);
}

/* Move tail pointer backwards to reuse holes */
static void choke_zap_tail_holes(struct choke_sched_data *q)
{
        do {
                q->tail = (q->tail - 1) & q->tab_mask;
                if (q->head == q->tail)
                        break;
        } while (q->tab[q->tail] == NULL);
}

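/*
 * Illustrative hole lifecycle: choke_drop_by_idx() below NULLs an arbitrary
 * slot. If that slot happens to sit at head or tail, the zap helpers above
 * immediately skip past the run of NULLs; interior holes persist until the
 * head reaches them during dequeue.
 */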
/* Drop packet from queue array by creating a "hole" */
static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx,
                              struct sk_buff **to_free)
{
        struct choke_sched_data *q = qdisc_priv(sch);
        struct sk_buff *skb = q->tab[idx];

        q->tab[idx] = NULL;

        if (idx == q->head)
                choke_zap_head_holes(q);
        if (idx == q->tail)
                choke_zap_tail_holes(q);

        qdisc_qstats_backlog_dec(sch, skb);
        qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
        qdisc_drop(skb, sch, to_free);
        --sch->q.qlen;
}

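/*
 * Per-packet state kept in skb->cb while a packet sits in this qdisc:
 * the minor classid recorded by an external classifier, and a lazily
 * computed flow-keys digest used by choke_match_flow(). keys_valid
 * avoids dissecting the same skb twice.
 */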
struct choke_skb_cb {
        u16                     classid;
        u8                      keys_valid;
        struct flow_keys_digest keys;
};

static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb)
{
        qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb));
        return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data;
}

static inline void choke_set_classid(struct sk_buff *skb, u16 classid)
{
        choke_skb_cb(skb)->classid = classid;
}

static u16 choke_get_classid(const struct sk_buff *skb)
{
        return choke_skb_cb(skb)->classid;
}

/*
 * Compare the flow of two packets.
 *  Returns true only if source and destination address and port match;
 *          false for special cases (e.g. differing protocols).
 */
static bool choke_match_flow(struct sk_buff *skb1,
                             struct sk_buff *skb2)
{
        struct flow_keys temp;

        if (skb1->protocol != skb2->protocol)
                return false;

        if (!choke_skb_cb(skb1)->keys_valid) {
                choke_skb_cb(skb1)->keys_valid = 1;
                skb_flow_dissect_flow_keys(skb1, &temp, 0);
                make_flow_keys_digest(&choke_skb_cb(skb1)->keys, &temp);
        }

        if (!choke_skb_cb(skb2)->keys_valid) {
                choke_skb_cb(skb2)->keys_valid = 1;
                skb_flow_dissect_flow_keys(skb2, &temp, 0);
                make_flow_keys_digest(&choke_skb_cb(skb2)->keys, &temp);
        }

        return !memcmp(&choke_skb_cb(skb1)->keys,
                       &choke_skb_cb(skb2)->keys,
                       sizeof(choke_skb_cb(skb1)->keys));
}

/*
 * Classify flow using either:
 *  1. pre-existing classification result in skb
 *  2. fast internal classification
 *  3. use TC filter based classification
 */
static bool choke_classify(struct sk_buff *skb,
                           struct Qdisc *sch, int *qerr)
{
        struct choke_sched_data *q = qdisc_priv(sch);
        struct tcf_result res;
        struct tcf_proto *fl;
        int result;

        fl = rcu_dereference_bh(q->filter_list);
        result = tc_classify(skb, fl, &res, false);
        if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
                switch (result) {
                case TC_ACT_STOLEN:
                case TC_ACT_QUEUED:
                        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
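                        /* fall through */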
                case TC_ACT_SHOT:
                        return false;
                }
#endif
                choke_set_classid(skb, TC_H_MIN(res.classid));
                return true;
        }

        return false;
}

/*
 * Select a packet at random from the queue
 * HACK: since the queue can have holes from previous deletions, retry
 *   several times to find a random skb, then just give up and return the head
 * Will return NULL if the queue is empty (q->head == q->tail)
 */
static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
                                         unsigned int *pidx)
{
        struct sk_buff *skb;
        int retries = 3;

        do {
                *pidx = (q->head + prandom_u32_max(choke_len(q))) & q->tab_mask;
                skb = q->tab[*pidx];
                if (skb)
                        return skb;
        } while (--retries > 0);

        return q->tab[*pidx = q->head];
}

/*
 * Compare new packet with random packet in queue
 * returns true if matched and sets *pidx
 */
static bool choke_match_random(const struct choke_sched_data *q,
                               struct sk_buff *nskb,
                               unsigned int *pidx)
{
        struct sk_buff *oskb;

        if (q->head == q->tail)
                return false;

        oskb = choke_peek_random(q, pidx);
        if (rcu_access_pointer(q->filter_list))
                return choke_get_classid(nskb) == choke_get_classid(oskb);

        return choke_match_flow(oskb, nskb);
}

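/*
 * Enqueue decision summary (documentation only): below qth_min the packet
 * is admitted as in plain RED; above it, CHOKe first compares the new
 * packet against a randomly drawn queued one and drops both on a flow
 * match, then falls back to the usual RED mark/drop probability checks.
 */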
static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch,
                         struct sk_buff **to_free)
{
        int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
        struct choke_sched_data *q = qdisc_priv(sch);
        const struct red_parms *p = &q->parms;

        if (rcu_access_pointer(q->filter_list)) {
                /* If using external classifiers, get result and record it. */
                if (!choke_classify(skb, sch, &ret))
                        goto other_drop;        /* Packet was eaten by filter */
        }

        choke_skb_cb(skb)->keys_valid = 0;
        /* Compute average queue usage (see RED) */
        q->vars.qavg = red_calc_qavg(p, &q->vars, sch->q.qlen);
        if (red_is_idling(&q->vars))
                red_end_of_idle_period(&q->vars);

        /* Is queue small? */
        if (q->vars.qavg <= p->qth_min)
                q->vars.qcount = -1;
        else {
                unsigned int idx;

                /* Draw a packet at random from queue and compare flow */
                if (choke_match_random(q, skb, &idx)) {
                        q->stats.matched++;
                        choke_drop_by_idx(sch, idx, to_free);
                        goto congestion_drop;
                }

                /* Queue is large, always mark/drop */
                if (q->vars.qavg > p->qth_max) {
                        q->vars.qcount = -1;

                        qdisc_qstats_overlimit(sch);
                        if (use_harddrop(q) || !use_ecn(q) ||
                            !INET_ECN_set_ce(skb)) {
                                q->stats.forced_drop++;
                                goto congestion_drop;
                        }

                        q->stats.forced_mark++;
                } else if (++q->vars.qcount) {
                        if (red_mark_probability(p, &q->vars, q->vars.qavg)) {
                                q->vars.qcount = 0;
                                q->vars.qR = red_random(p);

                                qdisc_qstats_overlimit(sch);
                                if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
                                        q->stats.prob_drop++;
                                        goto congestion_drop;
                                }

                                q->stats.prob_mark++;
                        }
                } else
                        q->vars.qR = red_random(p);
        }

        /* Admit new packet */
        if (sch->q.qlen < q->limit) {
                q->tab[q->tail] = skb;
                q->tail = (q->tail + 1) & q->tab_mask;
                ++sch->q.qlen;
                qdisc_qstats_backlog_inc(sch, skb);
                return NET_XMIT_SUCCESS;
        }

        q->stats.pdrop++;
        return qdisc_drop(skb, sch, to_free);

congestion_drop:
        qdisc_drop(skb, sch, to_free);
        return NET_XMIT_CN;

other_drop:
        if (ret & __NET_XMIT_BYPASS)
                qdisc_qstats_drop(sch);
        __qdisc_drop(skb, to_free);
        return ret;
}

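/* Dequeue from the head of the ring, skipping holes; start the RED idle
 * period when the queue drains empty.
 */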
static struct sk_buff *choke_dequeue(struct Qdisc *sch)
{
        struct choke_sched_data *q = qdisc_priv(sch);
        struct sk_buff *skb;

        if (q->head == q->tail) {
                if (!red_is_idling(&q->vars))
                        red_start_of_idle_period(&q->vars);
                return NULL;
        }

        skb = q->tab[q->head];
        q->tab[q->head] = NULL;
        choke_zap_head_holes(q);
        --sch->q.qlen;
        qdisc_qstats_backlog_dec(sch, skb);
        qdisc_bstats_update(sch, skb);

        return skb;
}

static void choke_reset(struct Qdisc *sch)
{
        struct choke_sched_data *q = qdisc_priv(sch);

        while (q->head != q->tail) {
                struct sk_buff *skb = q->tab[q->head];

                q->head = (q->head + 1) & q->tab_mask;
                if (!skb)
                        continue;
                rtnl_qdisc_drop(skb, sch);
        }

        sch->q.qlen = 0;
        sch->qstats.backlog = 0;
        if (q->tab)
                memset(q->tab, 0, (q->tab_mask + 1) * sizeof(struct sk_buff *));
        q->head = q->tail = 0;
        red_restart(&q->vars);
}

 389
 390static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
 391        [TCA_CHOKE_PARMS]       = { .len = sizeof(struct tc_red_qopt) },
 392        [TCA_CHOKE_STAB]        = { .len = RED_STAB_SIZE },
 393        [TCA_CHOKE_MAX_P]       = { .type = NLA_U32 },
 394};
 395
 396
 397static void choke_free(void *addr)
 398{
 399        kvfree(addr);
 400}
 401
 402static int choke_change(struct Qdisc *sch, struct nlattr *opt)
 403{
 404        struct choke_sched_data *q = qdisc_priv(sch);
 405        struct nlattr *tb[TCA_CHOKE_MAX + 1];
 406        const struct tc_red_qopt *ctl;
 407        int err;
 408        struct sk_buff **old = NULL;
 409        unsigned int mask;
 410        u32 max_P;
 411
 412        if (opt == NULL)
 413                return -EINVAL;
 414
 415        err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy);
 416        if (err < 0)
 417                return err;
 418
 419        if (tb[TCA_CHOKE_PARMS] == NULL ||
 420            tb[TCA_CHOKE_STAB] == NULL)
 421                return -EINVAL;
 422
 423        max_P = tb[TCA_CHOKE_MAX_P] ? nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0;
 424
 425        ctl = nla_data(tb[TCA_CHOKE_PARMS]);
 426
 427        if (ctl->limit > CHOKE_MAX_QUEUE)
 428                return -EINVAL;
 429
 430        mask = roundup_pow_of_two(ctl->limit + 1) - 1;
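        /* Illustrative: ctl->limit = 1000 gives roundup_pow_of_two(1001) =
         * 1024, i.e. mask = 1023, so ring indexing can use '& mask'.
         */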
        if (mask != q->tab_mask) {
                struct sk_buff **ntab;

                ntab = kcalloc(mask + 1, sizeof(struct sk_buff *),
                               GFP_KERNEL | __GFP_NOWARN);
                if (!ntab)
                        ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *));
                if (!ntab)
                        return -ENOMEM;

                sch_tree_lock(sch);
                old = q->tab;
                if (old) {
                        unsigned int oqlen = sch->q.qlen, tail = 0;
                        unsigned int dropped = 0;

                        while (q->head != q->tail) {
                                struct sk_buff *skb = q->tab[q->head];

                                q->head = (q->head + 1) & q->tab_mask;
                                if (!skb)
                                        continue;
                                if (tail < mask) {
                                        ntab[tail++] = skb;
                                        continue;
                                }
                                dropped += qdisc_pkt_len(skb);
                                qdisc_qstats_backlog_dec(sch, skb);
                                --sch->q.qlen;
                                rtnl_qdisc_drop(skb, sch);
                        }
                        qdisc_tree_reduce_backlog(sch, oqlen - sch->q.qlen, dropped);
                        q->head = 0;
                        q->tail = tail;
                }

                q->tab_mask = mask;
                q->tab = ntab;
        } else
                sch_tree_lock(sch);

        q->flags = ctl->flags;
        q->limit = ctl->limit;

        red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
                      ctl->Plog, ctl->Scell_log,
                      nla_data(tb[TCA_CHOKE_STAB]),
                      max_P);
        red_set_vars(&q->vars);

        if (q->head == q->tail)
                red_end_of_idle_period(&q->vars);

        sch_tree_unlock(sch);
        choke_free(old);
        return 0;
}

 488
 489static int choke_init(struct Qdisc *sch, struct nlattr *opt)
 490{
 491        return choke_change(sch, opt);
 492}
 493
 494static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
 495{
 496        struct choke_sched_data *q = qdisc_priv(sch);
 497        struct nlattr *opts = NULL;
 498        struct tc_red_qopt opt = {
 499                .limit          = q->limit,
 500                .flags          = q->flags,
 501                .qth_min        = q->parms.qth_min >> q->parms.Wlog,
 502                .qth_max        = q->parms.qth_max >> q->parms.Wlog,
 503                .Wlog           = q->parms.Wlog,
 504                .Plog           = q->parms.Plog,
 505                .Scell_log      = q->parms.Scell_log,
 506        };
 507
 508        opts = nla_nest_start(skb, TCA_OPTIONS);
 509        if (opts == NULL)
 510                goto nla_put_failure;
 511
 512        if (nla_put(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt) ||
 513            nla_put_u32(skb, TCA_CHOKE_MAX_P, q->parms.max_P))
 514                goto nla_put_failure;
 515        return nla_nest_end(skb, opts);
 516
 517nla_put_failure:
 518        nla_nest_cancel(skb, opts);
 519        return -EMSGSIZE;
 520}
 521
 522static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
 523{
 524        struct choke_sched_data *q = qdisc_priv(sch);
 525        struct tc_choke_xstats st = {
 526                .early  = q->stats.prob_drop + q->stats.forced_drop,
 527                .marked = q->stats.prob_mark + q->stats.forced_mark,
 528                .pdrop  = q->stats.pdrop,
 529                .other  = q->stats.other,
 530                .matched = q->stats.matched,
 531        };
 532
 533        return gnet_stats_copy_app(d, &st, sizeof(st));
 534}
 535
 536static void choke_destroy(struct Qdisc *sch)
 537{
 538        struct choke_sched_data *q = qdisc_priv(sch);
 539
 540        tcf_destroy_chain(&q->filter_list);
 541        choke_free(q->tab);
 542}
 543
 544static struct sk_buff *choke_peek_head(struct Qdisc *sch)
 545{
 546        struct choke_sched_data *q = qdisc_priv(sch);
 547
 548        return (q->head != q->tail) ? q->tab[q->head] : NULL;
 549}
 550
 551static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
 552        .id             =       "choke",
 553        .priv_size      =       sizeof(struct choke_sched_data),
 554
 555        .enqueue        =       choke_enqueue,
 556        .dequeue        =       choke_dequeue,
 557        .peek           =       choke_peek_head,
 558        .init           =       choke_init,
 559        .destroy        =       choke_destroy,
 560        .reset          =       choke_reset,
 561        .change         =       choke_change,
 562        .dump           =       choke_dump,
 563        .dump_stats     =       choke_dump_stats,
 564        .owner          =       THIS_MODULE,
 565};
 566
 567static int __init choke_module_init(void)
 568{
 569        return register_qdisc(&choke_qdisc_ops);
 570}
 571
 572static void __exit choke_module_exit(void)
 573{
 574        unregister_qdisc(&choke_qdisc_ops);
 575}
 576
 577module_init(choke_module_init)
 578module_exit(choke_module_exit)
 579
 580MODULE_LICENSE("GPL");
 581