/*
 * net/sched/sch_choke.c        CHOKE scheduler
 *
 * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
 * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 2 as published by the Free Software Foundation.
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>
#include <net/red.h>
#include <net/flow_dissector.h>

/*
   CHOKe stateless AQM for fair bandwidth allocation
   =================================================

   CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
   unresponsive flows) is a variant of RED that penalizes misbehaving flows but
   maintains no flow state. The difference from RED is an additional step
   during the enqueuing process. If average queue size is over the
   low threshold (qmin), a packet is chosen at random from the queue.
   If both the new and chosen packet are from the same flow, both
   are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it
   needs to access packets in queue randomly. It has a minimal class
   interface to allow overriding the builtin flow classifier with
   filters.

   Source:
   R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
   Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
   IEEE INFOCOM, 2000.

   A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
   Characteristics", IEEE/ACM Transactions on Networking, 2004

 */
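
/*
 * Example configuration (iproute2 syntax; the parameter values are
 * purely illustrative, not recommendations):
 *
 *   tc qdisc add dev eth0 root choke limit 1000 bandwidth 10mbit \
 *      min 83 max 250 avpkt 1000 ecn
 *
 * "min"/"max" correspond to qth_min/qth_max in the red_parms below,
 * and "ecn" sets TC_RED_ECN in the flags.
 */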

/* Upper bound on size of sk_buff table (packets) */
#define CHOKE_MAX_QUEUE (128*1024 - 1)
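/* Sizing note: with limit == CHOKE_MAX_QUEUE, choke_change() rounds
 * limit + 1 (128*1024, already a power of two) up to itself, so the
 * packet table never exceeds 128K slots.
 */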

struct choke_sched_data {
/* Parameters */
        u32              limit;
        unsigned char    flags;

        struct red_parms parms;

/* Variables */
        struct red_vars  vars;
        struct tcf_proto __rcu *filter_list;
        struct {
                u32     prob_drop;      /* Early probability drops */
                u32     prob_mark;      /* Early probability marks */
                u32     forced_drop;    /* Forced drops, qavg > max_thresh */
                u32     forced_mark;    /* Forced marks, qavg > max_thresh */
                u32     pdrop;          /* Drops due to queue limits */
                u32     other;          /* Drops due to drop() calls */
                u32     matched;        /* Drops due to flow match */
        } stats;

        unsigned int     head;
        unsigned int     tail;

        unsigned int     tab_mask; /* size - 1 */

        struct sk_buff **tab;
};

/* number of elements in queue including holes */
static unsigned int choke_len(const struct choke_sched_data *q)
{
        return (q->tail - q->head) & q->tab_mask;
}
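
/*
 * Worked example: with tab_mask == 7 (an 8-slot table), head == 6 and
 * tail == 2, choke_len() == (2 - 6) & 7 == 4: slots 6, 7, 0 and 1 are
 * in use, holes included.
 */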

/* Is ECN parameter configured */
static int use_ecn(const struct choke_sched_data *q)
{
        return q->flags & TC_RED_ECN;
}

/* Should packets over max just be dropped (versus marked) */
static int use_harddrop(const struct choke_sched_data *q)
{
        return q->flags & TC_RED_HARDDROP;
}

/* Move head pointer forward to skip over holes */
static void choke_zap_head_holes(struct choke_sched_data *q)
{
        do {
                q->head = (q->head + 1) & q->tab_mask;
                if (q->head == q->tail)
                        break;
        } while (q->tab[q->head] == NULL);
}

/* Move tail pointer backwards to reuse holes */
static void choke_zap_tail_holes(struct choke_sched_data *q)
{
        do {
                q->tail = (q->tail - 1) & q->tab_mask;
                if (q->head == q->tail)
                        break;
        } while (q->tab[q->tail] == NULL);
}

/* Drop packet from queue array by creating a "hole" */
static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx)
{
        struct choke_sched_data *q = qdisc_priv(sch);
        struct sk_buff *skb = q->tab[idx];

        q->tab[idx] = NULL;

        if (idx == q->head)
                choke_zap_head_holes(q);
        if (idx == q->tail)
                choke_zap_tail_holes(q);

        qdisc_qstats_backlog_dec(sch, skb);
        qdisc_drop(skb, sch);
        qdisc_tree_decrease_qlen(sch, 1);
        --sch->q.qlen;
}
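
/*
 * Example: with head == 0, tail == 3 and tab == {A, B, C, NULL},
 * choke_drop_by_idx(sch, 1) leaves tab == {A, NULL, C, NULL}; the hole
 * at index 1 is skipped later by choke_zap_head_holes() as the queue
 * drains.
 */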

struct choke_skb_cb {
        u16                     classid;
        u8                      keys_valid;
        struct                  flow_keys_digest keys;
};

static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb)
{
        qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb));
        return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data;
}

static inline void choke_set_classid(struct sk_buff *skb, u16 classid)
{
        choke_skb_cb(skb)->classid = classid;
}

static u16 choke_get_classid(const struct sk_buff *skb)
{
        return choke_skb_cb(skb)->classid;
}

/*
 * Compare the flows of two packets.
 *  Returns true only if source and destination addresses and ports match;
 *          false for special cases.
 */
static bool choke_match_flow(struct sk_buff *skb1,
                             struct sk_buff *skb2)
{
        struct flow_keys temp;

        if (skb1->protocol != skb2->protocol)
                return false;

        if (!choke_skb_cb(skb1)->keys_valid) {
                choke_skb_cb(skb1)->keys_valid = 1;
                skb_flow_dissect_flow_keys(skb1, &temp, 0);
                make_flow_keys_digest(&choke_skb_cb(skb1)->keys, &temp);
        }

        if (!choke_skb_cb(skb2)->keys_valid) {
                choke_skb_cb(skb2)->keys_valid = 1;
                skb_flow_dissect_flow_keys(skb2, &temp, 0);
                make_flow_keys_digest(&choke_skb_cb(skb2)->keys, &temp);
        }

        return !memcmp(&choke_skb_cb(skb1)->keys,
                       &choke_skb_cb(skb2)->keys,
                       sizeof(choke_skb_cb(skb1)->keys));
}

/*
 * Classify flow using either:
 *  1. pre-existing classification result in skb
 *  2. fast internal classification
 *  3. use TC filter based classification
 */
static bool choke_classify(struct sk_buff *skb,
                           struct Qdisc *sch, int *qerr)
{
        struct choke_sched_data *q = qdisc_priv(sch);
        struct tcf_result res;
        struct tcf_proto *fl;
        int result;

        fl = rcu_dereference_bh(q->filter_list);
        result = tc_classify(skb, fl, &res, false);
        if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
                switch (result) {
                case TC_ACT_STOLEN:
                case TC_ACT_QUEUED:
                        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
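                        /* fall through */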
                case TC_ACT_SHOT:
                        return false;
                }
#endif
                choke_set_classid(skb, TC_H_MIN(res.classid));
                return true;
        }

        return false;
}

/*
 * Select a packet at random from the queue.
 * HACK: since the queue can have holes from previous deletions, retry
 *   several times to find a random skb, then just give up and return the
 *   head. Will return NULL if the queue is empty (q->head == q->tail).
 */
static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
                                         unsigned int *pidx)
{
        struct sk_buff *skb;
        int retries = 3;

        do {
                *pidx = (q->head + prandom_u32_max(choke_len(q))) & q->tab_mask;
                skb = q->tab[*pidx];
                if (skb)
                        return skb;
        } while (--retries > 0);

        return q->tab[*pidx = q->head];
}
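
/*
 * Illustrative odds: if half the slots counted by choke_len() were holes,
 * each random probe would miss with probability 1/2, so the fallback to
 * q->head would be taken about (1/2)^3 = 1/8 of the time.
 */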

/*
 * Compare new packet with random packet in queue
 * returns true if matched and sets *pidx
 */
static bool choke_match_random(const struct choke_sched_data *q,
                               struct sk_buff *nskb,
                               unsigned int *pidx)
{
        struct sk_buff *oskb;

        if (q->head == q->tail)
                return false;

        oskb = choke_peek_random(q, pidx);
        if (rcu_access_pointer(q->filter_list))
                return choke_get_classid(nskb) == choke_get_classid(oskb);

        return choke_match_flow(oskb, nskb);
}

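/*
 * Enqueue decision ladder, in order (qavg follows the RED average):
 *   1. qavg <= qth_min: no AQM action, packet is admitted.
 *   2. otherwise a random victim is compared first; a flow match drops
 *      both the new packet and the victim (the CHOKe step).
 *   3. qavg > qth_max: forced mark or drop.
 *   4. qth_min < qavg <= qth_max: probabilistic early mark or drop.
 * A surviving packet is still tail-dropped once sch->q.qlen hits limit.
 */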
static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
        int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
        struct choke_sched_data *q = qdisc_priv(sch);
        const struct red_parms *p = &q->parms;

        if (rcu_access_pointer(q->filter_list)) {
                /* If using external classifiers, get result and record it. */
                if (!choke_classify(skb, sch, &ret))
                        goto other_drop;        /* Packet was eaten by filter */
        }

        choke_skb_cb(skb)->keys_valid = 0;
        /* Compute average queue usage (see RED) */
        q->vars.qavg = red_calc_qavg(p, &q->vars, sch->q.qlen);
        if (red_is_idling(&q->vars))
                red_end_of_idle_period(&q->vars);

        /* Is queue small? */
        if (q->vars.qavg <= p->qth_min)
                q->vars.qcount = -1;
        else {
                unsigned int idx;

                /* Draw a packet at random from queue and compare flow */
                if (choke_match_random(q, skb, &idx)) {
                        q->stats.matched++;
                        choke_drop_by_idx(sch, idx);
                        goto congestion_drop;
                }

                /* Queue is large, always mark/drop */
                if (q->vars.qavg > p->qth_max) {
                        q->vars.qcount = -1;

                        qdisc_qstats_overlimit(sch);
                        if (use_harddrop(q) || !use_ecn(q) ||
                            !INET_ECN_set_ce(skb)) {
                                q->stats.forced_drop++;
                                goto congestion_drop;
                        }

                        q->stats.forced_mark++;
                } else if (++q->vars.qcount) {
                        if (red_mark_probability(p, &q->vars, q->vars.qavg)) {
                                q->vars.qcount = 0;
                                q->vars.qR = red_random(p);

                                qdisc_qstats_overlimit(sch);
                                if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
                                        q->stats.prob_drop++;
                                        goto congestion_drop;
                                }

                                q->stats.prob_mark++;
                        }
                } else
                        q->vars.qR = red_random(p);
        }

        /* Admit new packet */
        if (sch->q.qlen < q->limit) {
                q->tab[q->tail] = skb;
                q->tail = (q->tail + 1) & q->tab_mask;
                ++sch->q.qlen;
                qdisc_qstats_backlog_inc(sch, skb);
                return NET_XMIT_SUCCESS;
        }

        q->stats.pdrop++;
        return qdisc_drop(skb, sch);

congestion_drop:
        qdisc_drop(skb, sch);
        return NET_XMIT_CN;

other_drop:
        if (ret & __NET_XMIT_BYPASS)
                qdisc_qstats_drop(sch);
        kfree_skb(skb);
        return ret;
}

static struct sk_buff *choke_dequeue(struct Qdisc *sch)
{
        struct choke_sched_data *q = qdisc_priv(sch);
        struct sk_buff *skb;

        if (q->head == q->tail) {
                if (!red_is_idling(&q->vars))
                        red_start_of_idle_period(&q->vars);
                return NULL;
        }

        skb = q->tab[q->head];
        q->tab[q->head] = NULL;
        choke_zap_head_holes(q);
        --sch->q.qlen;
        qdisc_qstats_backlog_dec(sch, skb);
        qdisc_bstats_update(sch, skb);

        return skb;
}

static unsigned int choke_drop(struct Qdisc *sch)
{
        struct choke_sched_data *q = qdisc_priv(sch);
        unsigned int len;

        len = qdisc_queue_drop(sch);
        if (len > 0)
                q->stats.other++;
        else {
                if (!red_is_idling(&q->vars))
                        red_start_of_idle_period(&q->vars);
        }

        return len;
}

static void choke_reset(struct Qdisc *sch)
{
        struct choke_sched_data *q = qdisc_priv(sch);

        while (q->head != q->tail) {
                struct sk_buff *skb = q->tab[q->head];

                q->head = (q->head + 1) & q->tab_mask;
                if (!skb)
                        continue;
                qdisc_qstats_backlog_dec(sch, skb);
                --sch->q.qlen;
                qdisc_drop(skb, sch);
        }

        memset(q->tab, 0, (q->tab_mask + 1) * sizeof(struct sk_buff *));
        q->head = q->tail = 0;
        red_restart(&q->vars);
}

static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
        [TCA_CHOKE_PARMS]       = { .len = sizeof(struct tc_red_qopt) },
        [TCA_CHOKE_STAB]        = { .len = RED_STAB_SIZE },
        [TCA_CHOKE_MAX_P]       = { .type = NLA_U32 },
};

static void choke_free(void *addr)
{
        kvfree(addr);
}

static int choke_change(struct Qdisc *sch, struct nlattr *opt)
{
        struct choke_sched_data *q = qdisc_priv(sch);
        struct nlattr *tb[TCA_CHOKE_MAX + 1];
        const struct tc_red_qopt *ctl;
        int err;
        struct sk_buff **old = NULL;
        unsigned int mask;
        u32 max_P;

        if (opt == NULL)
                return -EINVAL;

        err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy);
        if (err < 0)
                return err;

        if (tb[TCA_CHOKE_PARMS] == NULL ||
            tb[TCA_CHOKE_STAB] == NULL)
                return -EINVAL;

        max_P = tb[TCA_CHOKE_MAX_P] ? nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0;

        ctl = nla_data(tb[TCA_CHOKE_PARMS]);

        if (ctl->limit > CHOKE_MAX_QUEUE)
                return -EINVAL;

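        /* Sizing example (illustrative): ctl->limit == 1000 gives
         * roundup_pow_of_two(1001) == 1024, so mask == 1023 and the
         * table below holds 1024 slots.
         */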
        mask = roundup_pow_of_two(ctl->limit + 1) - 1;
        if (mask != q->tab_mask) {
                struct sk_buff **ntab;

                ntab = kcalloc(mask + 1, sizeof(struct sk_buff *),
                               GFP_KERNEL | __GFP_NOWARN);
                if (!ntab)
                        ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *));
                if (!ntab)
                        return -ENOMEM;

                sch_tree_lock(sch);
                old = q->tab;
                if (old) {
                        unsigned int oqlen = sch->q.qlen, tail = 0;

                        while (q->head != q->tail) {
                                struct sk_buff *skb = q->tab[q->head];

                                q->head = (q->head + 1) & q->tab_mask;
                                if (!skb)
                                        continue;
                                if (tail < mask) {
                                        ntab[tail++] = skb;
                                        continue;
                                }
                                qdisc_qstats_backlog_dec(sch, skb);
                                --sch->q.qlen;
                                qdisc_drop(skb, sch);
                        }
                        qdisc_tree_decrease_qlen(sch, oqlen - sch->q.qlen);
                        q->head = 0;
                        q->tail = tail;
                }

                q->tab_mask = mask;
                q->tab = ntab;
        } else
                sch_tree_lock(sch);

        q->flags = ctl->flags;
        q->limit = ctl->limit;

        red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
                      ctl->Plog, ctl->Scell_log,
                      nla_data(tb[TCA_CHOKE_STAB]),
                      max_P);
        red_set_vars(&q->vars);

        if (q->head == q->tail)
                red_end_of_idle_period(&q->vars);

        sch_tree_unlock(sch);
        choke_free(old);
        return 0;
}

static int choke_init(struct Qdisc *sch, struct nlattr *opt)
{
        return choke_change(sch, opt);
}

static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
{
        struct choke_sched_data *q = qdisc_priv(sch);
        struct nlattr *opts = NULL;
        struct tc_red_qopt opt = {
                .limit          = q->limit,
                .flags          = q->flags,
                .qth_min        = q->parms.qth_min >> q->parms.Wlog,
                .qth_max        = q->parms.qth_max >> q->parms.Wlog,
                .Wlog           = q->parms.Wlog,
                .Plog           = q->parms.Plog,
                .Scell_log      = q->parms.Scell_log,
        };

        opts = nla_nest_start(skb, TCA_OPTIONS);
        if (opts == NULL)
                goto nla_put_failure;

        if (nla_put(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt) ||
            nla_put_u32(skb, TCA_CHOKE_MAX_P, q->parms.max_P))
                goto nla_put_failure;
        return nla_nest_end(skb, opts);

nla_put_failure:
        nla_nest_cancel(skb, opts);
        return -EMSGSIZE;
}

static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
        struct choke_sched_data *q = qdisc_priv(sch);
        struct tc_choke_xstats st = {
                .early  = q->stats.prob_drop + q->stats.forced_drop,
                .marked = q->stats.prob_mark + q->stats.forced_mark,
                .pdrop  = q->stats.pdrop,
                .other  = q->stats.other,
                .matched = q->stats.matched,
        };

        return gnet_stats_copy_app(d, &st, sizeof(st));
}

static void choke_destroy(struct Qdisc *sch)
{
        struct choke_sched_data *q = qdisc_priv(sch);

        tcf_destroy_chain(&q->filter_list);
        choke_free(q->tab);
}

static struct Qdisc *choke_leaf(struct Qdisc *sch, unsigned long arg)
{
        return NULL;
}

static unsigned long choke_get(struct Qdisc *sch, u32 classid)
{
        return 0;
}

static void choke_put(struct Qdisc *q, unsigned long cl)
{
}

static unsigned long choke_bind(struct Qdisc *sch, unsigned long parent,
                                u32 classid)
{
        return 0;
}

static struct tcf_proto __rcu **choke_find_tcf(struct Qdisc *sch,
                                               unsigned long cl)
{
        struct choke_sched_data *q = qdisc_priv(sch);

        if (cl)
                return NULL;
        return &q->filter_list;
}

static int choke_dump_class(struct Qdisc *sch, unsigned long cl,
                            struct sk_buff *skb, struct tcmsg *tcm)
{
        tcm->tcm_handle |= TC_H_MIN(cl);
        return 0;
}

static void choke_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
        if (!arg->stop) {
                if (arg->fn(sch, 1, arg) < 0) {
                        arg->stop = 1;
                        return;
                }
                arg->count++;
        }
}

static const struct Qdisc_class_ops choke_class_ops = {
        .leaf           =       choke_leaf,
        .get            =       choke_get,
        .put            =       choke_put,
        .tcf_chain      =       choke_find_tcf,
        .bind_tcf       =       choke_bind,
        .unbind_tcf     =       choke_put,
        .dump           =       choke_dump_class,
        .walk           =       choke_walk,
};

static struct sk_buff *choke_peek_head(struct Qdisc *sch)
{
        struct choke_sched_data *q = qdisc_priv(sch);

        return (q->head != q->tail) ? q->tab[q->head] : NULL;
}

static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
        .id             =       "choke",
        .priv_size      =       sizeof(struct choke_sched_data),

        .enqueue        =       choke_enqueue,
        .dequeue        =       choke_dequeue,
        .peek           =       choke_peek_head,
        .drop           =       choke_drop,
        .init           =       choke_init,
        .destroy        =       choke_destroy,
        .reset          =       choke_reset,
        .change         =       choke_change,
        .dump           =       choke_dump,
        .dump_stats     =       choke_dump_stats,
        .owner          =       THIS_MODULE,
};

static int __init choke_module_init(void)
{
        return register_qdisc(&choke_qdisc_ops);
}

static void __exit choke_module_exit(void)
{
        unregister_qdisc(&choke_qdisc_ops);
}

module_init(choke_module_init)
module_exit(choke_module_exit)

MODULE_LICENSE("GPL");