linux/net/sched/sch_choke.c
/*
 * net/sched/sch_choke.c        CHOKE scheduler
 *
 * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
 * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 2 as published by the Free Software Foundation.
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>
#include <net/red.h>
#include <net/flow_keys.h>

/*
   CHOKe stateless AQM for fair bandwidth allocation
   =================================================

   CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
   unresponsive flows) is a variant of RED that penalizes misbehaving flows but
   maintains no flow state. The difference from RED is an additional step
   during the enqueuing process. If average queue size is over the
   low threshold (qmin), a packet is chosen at random from the queue.
   If both the new and chosen packet are from the same flow, both
   are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it
   needs to access packets in queue randomly. It has a minimal class
   interface to allow overriding the builtin flow classifier with
   filters.

   Source:
   R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
   Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
   IEEE INFOCOM, 2000.

   A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
   Characteristics", IEEE/ACM Transactions on Networking, 2004

 */

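/*
 * Illustrative userspace configuration (example values only, not taken from
 * this file; see tc-choke(8) for the full parameter list, which mirrors RED):
 *
 *   tc qdisc add dev eth0 root choke limit 1000 min 100 max 300 \
 *       avpkt 1000 burst 200 bandwidth 10mbit probability 0.02
 */
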
/* Upper bound on size of sk_buff table (packets) */
#define CHOKE_MAX_QUEUE (128*1024 - 1)

struct choke_sched_data {
/* Parameters */
        u32              limit;
        unsigned char    flags;

        struct red_parms parms;

/* Variables */
        struct red_vars  vars;
        struct tcf_proto *filter_list;
        struct {
                u32     prob_drop;      /* Early probability drops */
                u32     prob_mark;      /* Early probability marks */
                u32     forced_drop;    /* Forced drops, qavg > max_thresh */
                u32     forced_mark;    /* Forced marks, qavg > max_thresh */
                u32     pdrop;          /* Drops due to queue limits */
                u32     other;          /* Drops due to drop() calls */
                u32     matched;        /* Drops due to flow match */
        } stats;

        unsigned int     head;
        unsigned int     tail;

        unsigned int     tab_mask; /* size - 1 */

        struct sk_buff **tab;
};

/* number of elements in queue including holes */
static unsigned int choke_len(const struct choke_sched_data *q)
{
        return (q->tail - q->head) & q->tab_mask;
}

/* Is ECN parameter configured */
static int use_ecn(const struct choke_sched_data *q)
{
        return q->flags & TC_RED_ECN;
}

/* Should packets over max just be dropped (versus marked) */
static int use_harddrop(const struct choke_sched_data *q)
{
        return q->flags & TC_RED_HARDDROP;
}

/* Move head pointer forward to skip over holes */
static void choke_zap_head_holes(struct choke_sched_data *q)
{
        do {
                q->head = (q->head + 1) & q->tab_mask;
                if (q->head == q->tail)
                        break;
        } while (q->tab[q->head] == NULL);
}

/* Move tail pointer backwards to reuse holes */
static void choke_zap_tail_holes(struct choke_sched_data *q)
{
        do {
                q->tail = (q->tail - 1) & q->tab_mask;
                if (q->head == q->tail)
                        break;
        } while (q->tab[q->tail] == NULL);
}

/* Drop packet from queue array by creating a "hole" */
static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx)
{
        struct choke_sched_data *q = qdisc_priv(sch);
        struct sk_buff *skb = q->tab[idx];

        q->tab[idx] = NULL;

        if (idx == q->head)
                choke_zap_head_holes(q);
        if (idx == q->tail)
                choke_zap_tail_holes(q);

        sch->qstats.backlog -= qdisc_pkt_len(skb);
        qdisc_drop(skb, sch);
        qdisc_tree_decrease_qlen(sch, 1);
        --sch->q.qlen;
}

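/*
 * Per-packet state kept in skb->cb while a packet sits in the queue:
 * the class id assigned by an attached tc filter (if any) and a lazily
 * computed copy of the dissected flow keys, cached so repeated CHOKe
 * comparisons do not re-dissect the same packet.
 */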
struct choke_skb_cb {
        u16                     classid;
        u8                      keys_valid;
        struct flow_keys        keys;
};

static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb)
{
        qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb));
        return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data;
}

static inline void choke_set_classid(struct sk_buff *skb, u16 classid)
{
        choke_skb_cb(skb)->classid = classid;
}

static u16 choke_get_classid(const struct sk_buff *skb)
{
        return choke_skb_cb(skb)->classid;
}

/*
 * Compare flow of two packets
 *  Returns true only if source and destination address and port match.
 *          false for special cases
 */
static bool choke_match_flow(struct sk_buff *skb1,
                             struct sk_buff *skb2)
{
        if (skb1->protocol != skb2->protocol)
                return false;

        if (!choke_skb_cb(skb1)->keys_valid) {
                choke_skb_cb(skb1)->keys_valid = 1;
                skb_flow_dissect(skb1, &choke_skb_cb(skb1)->keys);
        }

        if (!choke_skb_cb(skb2)->keys_valid) {
                choke_skb_cb(skb2)->keys_valid = 1;
                skb_flow_dissect(skb2, &choke_skb_cb(skb2)->keys);
        }

        return !memcmp(&choke_skb_cb(skb1)->keys,
                       &choke_skb_cb(skb2)->keys,
                       sizeof(struct flow_keys));
}

/*
 * Classify flow using either:
 *  1. pre-existing classification result in skb
 *  2. fast internal classification
 *  3. use TC filter based classification
 */
static bool choke_classify(struct sk_buff *skb,
                           struct Qdisc *sch, int *qerr)
{
        struct choke_sched_data *q = qdisc_priv(sch);
        struct tcf_result res;
        int result;

        result = tc_classify(skb, q->filter_list, &res);
        if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
                switch (result) {
                case TC_ACT_STOLEN:
                case TC_ACT_QUEUED:
                        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
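                        /* fall through */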
                case TC_ACT_SHOT:
                        return false;
                }
#endif
                choke_set_classid(skb, TC_H_MIN(res.classid));
                return true;
        }

        return false;
}

/*
 * Select a packet at random from queue
 * HACK: the queue can have holes from previous deletions, so retry several
 *   times to find a random skb, then just give up and return the head
 * Will return NULL if queue is empty (q->head == q->tail)
 */
static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
                                         unsigned int *pidx)
{
        struct sk_buff *skb;
        int retries = 3;

        do {
                *pidx = (q->head + prandom_u32_max(choke_len(q))) & q->tab_mask;
                skb = q->tab[*pidx];
                if (skb)
                        return skb;
        } while (--retries > 0);

        return q->tab[*pidx = q->head];
}

/*
 * Compare new packet with random packet in queue
 * returns true if matched and sets *pidx
 */
static bool choke_match_random(const struct choke_sched_data *q,
                               struct sk_buff *nskb,
                               unsigned int *pidx)
{
        struct sk_buff *oskb;

        if (q->head == q->tail)
                return false;

        oskb = choke_peek_random(q, pidx);
        if (q->filter_list)
                return choke_get_classid(nskb) == choke_get_classid(oskb);

        return choke_match_flow(oskb, nskb);
}

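/*
 * Enqueue path: classify (if filters are attached), update the RED average
 * queue length, then apply CHOKe's extra step: when qavg exceeds qth_min,
 * compare the new packet against one drawn at random from the queue and drop
 * both on a flow match. Otherwise apply normal RED marking/dropping and
 * finally admit the packet at the tail if the configured limit allows.
 */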
static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
        struct choke_sched_data *q = qdisc_priv(sch);
        const struct red_parms *p = &q->parms;
        int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;

        if (q->filter_list) {
                /* If using external classifiers, get result and record it. */
                if (!choke_classify(skb, sch, &ret))
                        goto other_drop;        /* Packet was eaten by filter */
        }

        choke_skb_cb(skb)->keys_valid = 0;
        /* Compute average queue usage (see RED) */
        q->vars.qavg = red_calc_qavg(p, &q->vars, sch->q.qlen);
        if (red_is_idling(&q->vars))
                red_end_of_idle_period(&q->vars);

        /* Is queue small? */
        if (q->vars.qavg <= p->qth_min)
                q->vars.qcount = -1;
        else {
                unsigned int idx;

                /* Draw a packet at random from queue and compare flow */
                if (choke_match_random(q, skb, &idx)) {
                        q->stats.matched++;
                        choke_drop_by_idx(sch, idx);
                        goto congestion_drop;
                }

                /* Queue is large, always mark/drop */
                if (q->vars.qavg > p->qth_max) {
                        q->vars.qcount = -1;

                        sch->qstats.overlimits++;
                        if (use_harddrop(q) || !use_ecn(q) ||
                            !INET_ECN_set_ce(skb)) {
                                q->stats.forced_drop++;
                                goto congestion_drop;
                        }

                        q->stats.forced_mark++;
                } else if (++q->vars.qcount) {
                        if (red_mark_probability(p, &q->vars, q->vars.qavg)) {
                                q->vars.qcount = 0;
                                q->vars.qR = red_random(p);

                                sch->qstats.overlimits++;
                                if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
                                        q->stats.prob_drop++;
                                        goto congestion_drop;
                                }

                                q->stats.prob_mark++;
                        }
                } else
                        q->vars.qR = red_random(p);
        }

        /* Admit new packet */
        if (sch->q.qlen < q->limit) {
                q->tab[q->tail] = skb;
                q->tail = (q->tail + 1) & q->tab_mask;
                ++sch->q.qlen;
                sch->qstats.backlog += qdisc_pkt_len(skb);
                return NET_XMIT_SUCCESS;
        }

        q->stats.pdrop++;
        return qdisc_drop(skb, sch);

congestion_drop:
        qdisc_drop(skb, sch);
        return NET_XMIT_CN;

other_drop:
        if (ret & __NET_XMIT_BYPASS)
                sch->qstats.drops++;
        kfree_skb(skb);
        return ret;
}

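/*
 * Dequeue the packet at the head of the table, skipping any holes left by
 * earlier CHOKe drops. When the queue is empty, tell RED an idle period has
 * started so the average queue length decays correctly.
 */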
static struct sk_buff *choke_dequeue(struct Qdisc *sch)
{
        struct choke_sched_data *q = qdisc_priv(sch);
        struct sk_buff *skb;

        if (q->head == q->tail) {
                if (!red_is_idling(&q->vars))
                        red_start_of_idle_period(&q->vars);
                return NULL;
        }

        skb = q->tab[q->head];
        q->tab[q->head] = NULL;
        choke_zap_head_holes(q);
        --sch->q.qlen;
        sch->qstats.backlog -= qdisc_pkt_len(skb);
        qdisc_bstats_update(sch, skb);

        return skb;
}

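/*
 * ->drop() hook, invoked when an ancestor qdisc needs to free up space.
 * A successful drop is counted under "other"; otherwise the start of an
 * idle period is recorded for RED.
 */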
static unsigned int choke_drop(struct Qdisc *sch)
{
        struct choke_sched_data *q = qdisc_priv(sch);
        unsigned int len;

        len = qdisc_queue_drop(sch);
        if (len > 0)
                q->stats.other++;
        else {
                if (!red_is_idling(&q->vars))
                        red_start_of_idle_period(&q->vars);
        }

        return len;
}

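/* Reset just restarts the RED averaging state (qavg and idle timing). */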
static void choke_reset(struct Qdisc *sch)
{
        struct choke_sched_data *q = qdisc_priv(sch);

        red_restart(&q->vars);
}

static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
        [TCA_CHOKE_PARMS]       = { .len = sizeof(struct tc_red_qopt) },
        [TCA_CHOKE_STAB]        = { .len = RED_STAB_SIZE },
        [TCA_CHOKE_MAX_P]       = { .type = NLA_U32 },
};

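/* Free a table allocated with either kcalloc() or vzalloc(). */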
static void choke_free(void *addr)
{
        if (addr) {
                if (is_vmalloc_addr(addr))
                        vfree(addr);
                else
                        kfree(addr);
        }
}

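/*
 * Parse the RED-style netlink parameters, size the packet table to the next
 * power of two above the configured limit, migrate any queued packets into
 * the new table (dropping what no longer fits), and install the new RED
 * parameters.
 */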
static int choke_change(struct Qdisc *sch, struct nlattr *opt)
{
        struct choke_sched_data *q = qdisc_priv(sch);
        struct nlattr *tb[TCA_CHOKE_MAX + 1];
        const struct tc_red_qopt *ctl;
        int err;
        struct sk_buff **old = NULL;
        unsigned int mask;
        u32 max_P;

        if (opt == NULL)
                return -EINVAL;

        err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy);
        if (err < 0)
                return err;

        if (tb[TCA_CHOKE_PARMS] == NULL ||
            tb[TCA_CHOKE_STAB] == NULL)
                return -EINVAL;

        max_P = tb[TCA_CHOKE_MAX_P] ? nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0;

        ctl = nla_data(tb[TCA_CHOKE_PARMS]);

        if (ctl->limit > CHOKE_MAX_QUEUE)
                return -EINVAL;

        mask = roundup_pow_of_two(ctl->limit + 1) - 1;
        if (mask != q->tab_mask) {
                struct sk_buff **ntab;

                ntab = kcalloc(mask + 1, sizeof(struct sk_buff *),
                               GFP_KERNEL | __GFP_NOWARN);
                if (!ntab)
                        ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *));
                if (!ntab)
                        return -ENOMEM;

                sch_tree_lock(sch);
                old = q->tab;
                if (old) {
                        unsigned int oqlen = sch->q.qlen, tail = 0;

                        while (q->head != q->tail) {
                                struct sk_buff *skb = q->tab[q->head];

                                q->head = (q->head + 1) & q->tab_mask;
                                if (!skb)
                                        continue;
                                if (tail < mask) {
                                        ntab[tail++] = skb;
                                        continue;
                                }
                                sch->qstats.backlog -= qdisc_pkt_len(skb);
                                --sch->q.qlen;
                                qdisc_drop(skb, sch);
                        }
                        qdisc_tree_decrease_qlen(sch, oqlen - sch->q.qlen);
                        q->head = 0;
                        q->tail = tail;
                }

                q->tab_mask = mask;
                q->tab = ntab;
        } else
                sch_tree_lock(sch);

        q->flags = ctl->flags;
        q->limit = ctl->limit;

        red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
                      ctl->Plog, ctl->Scell_log,
                      nla_data(tb[TCA_CHOKE_STAB]),
                      max_P);
        red_set_vars(&q->vars);

        if (q->head == q->tail)
                red_end_of_idle_period(&q->vars);

        sch_tree_unlock(sch);
        choke_free(old);
        return 0;
}

static int choke_init(struct Qdisc *sch, struct nlattr *opt)
{
        return choke_change(sch, opt);
}

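/* Dump the current configuration in RED format plus the max_P attribute. */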
static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
{
        struct choke_sched_data *q = qdisc_priv(sch);
        struct nlattr *opts = NULL;
        struct tc_red_qopt opt = {
                .limit          = q->limit,
                .flags          = q->flags,
                .qth_min        = q->parms.qth_min >> q->parms.Wlog,
                .qth_max        = q->parms.qth_max >> q->parms.Wlog,
                .Wlog           = q->parms.Wlog,
                .Plog           = q->parms.Plog,
                .Scell_log      = q->parms.Scell_log,
        };

        opts = nla_nest_start(skb, TCA_OPTIONS);
        if (opts == NULL)
                goto nla_put_failure;

        if (nla_put(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt) ||
            nla_put_u32(skb, TCA_CHOKE_MAX_P, q->parms.max_P))
                goto nla_put_failure;
        return nla_nest_end(skb, opts);

nla_put_failure:
        nla_nest_cancel(skb, opts);
        return -EMSGSIZE;
}

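/*
 * Export CHOKe statistics to userspace: combined early/forced drops and
 * marks, queue-limit drops, drop()-call drops and flow-match drops.
 */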
static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
        struct choke_sched_data *q = qdisc_priv(sch);
        struct tc_choke_xstats st = {
                .early  = q->stats.prob_drop + q->stats.forced_drop,
                .marked = q->stats.prob_mark + q->stats.forced_mark,
                .pdrop  = q->stats.pdrop,
                .other  = q->stats.other,
                .matched = q->stats.matched,
        };

        return gnet_stats_copy_app(d, &st, sizeof(st));
}

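/* Release the attached filter chain and the packet table. */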
static void choke_destroy(struct Qdisc *sch)
{
        struct choke_sched_data *q = qdisc_priv(sch);

        tcf_destroy_chain(&q->filter_list);
        choke_free(q->tab);
}

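/*
 * Minimal class interface: CHOKe is not a classful qdisc, but these stubs
 * allow tc filters to be attached so the built-in flow comparison can be
 * overridden by filter-assigned class ids.
 */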
static struct Qdisc *choke_leaf(struct Qdisc *sch, unsigned long arg)
{
        return NULL;
}

static unsigned long choke_get(struct Qdisc *sch, u32 classid)
{
        return 0;
}

static void choke_put(struct Qdisc *q, unsigned long cl)
{
}

static unsigned long choke_bind(struct Qdisc *sch, unsigned long parent,
                                u32 classid)
{
        return 0;
}

static struct tcf_proto **choke_find_tcf(struct Qdisc *sch, unsigned long cl)
{
        struct choke_sched_data *q = qdisc_priv(sch);

        if (cl)
                return NULL;
        return &q->filter_list;
}

static int choke_dump_class(struct Qdisc *sch, unsigned long cl,
                          struct sk_buff *skb, struct tcmsg *tcm)
{
        tcm->tcm_handle |= TC_H_MIN(cl);
        return 0;
}

static void choke_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
        if (!arg->stop) {
                if (arg->fn(sch, 1, arg) < 0) {
                        arg->stop = 1;
                        return;
                }
                arg->count++;
        }
}

static const struct Qdisc_class_ops choke_class_ops = {
        .leaf           =       choke_leaf,
        .get            =       choke_get,
        .put            =       choke_put,
        .tcf_chain      =       choke_find_tcf,
        .bind_tcf       =       choke_bind,
        .unbind_tcf     =       choke_put,
        .dump           =       choke_dump_class,
        .walk           =       choke_walk,
};

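/* Return the packet at the head of the queue without removing it. */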
static struct sk_buff *choke_peek_head(struct Qdisc *sch)
{
        struct choke_sched_data *q = qdisc_priv(sch);

        return (q->head != q->tail) ? q->tab[q->head] : NULL;
}

static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
        .id             =       "choke",
        .priv_size      =       sizeof(struct choke_sched_data),

        .enqueue        =       choke_enqueue,
        .dequeue        =       choke_dequeue,
        .peek           =       choke_peek_head,
        .drop           =       choke_drop,
        .init           =       choke_init,
        .destroy        =       choke_destroy,
        .reset          =       choke_reset,
        .change         =       choke_change,
        .dump           =       choke_dump,
        .dump_stats     =       choke_dump_stats,
        .owner          =       THIS_MODULE,
};

static int __init choke_module_init(void)
{
        return register_qdisc(&choke_qdisc_ops);
}

static void __exit choke_module_exit(void)
{
        unregister_qdisc(&choke_qdisc_ops);
}

module_init(choke_module_init)
module_exit(choke_module_exit)

MODULE_LICENSE("GPL");