linux/net/sched/sch_sfq.c
/*
 * net/sched/sch_sfq.c  Stochastic Fairness Queueing discipline.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/flow_keys.h>
#include <net/red.h>


/*      Stochastic Fairness Queuing algorithm.
        =======================================

        Source:
        Paul E. McKenney "Stochastic Fairness Queuing",
        IEEE INFOCOM'90 Proceedings, San Francisco, 1990.

        Paul E. McKenney "Stochastic Fairness Queuing",
        "Internetworking: Research and Experience", v.2, 1991, p.113-131.


        See also:
        M. Shreedhar and George Varghese "Efficient Fair
        Queuing using Deficit Round Robin", Proc. SIGCOMM 95.


        This is not the thing that is usually called (W)FQ nowadays.
        It does not use any timestamp mechanism, but instead
        processes queues in round-robin order.

        ADVANTAGE:

        - It is very cheap. Both CPU and memory requirements are minimal.

        DRAWBACKS:

        - "Stochastic" -> It is not 100% fair.
        When hash collisions occur, several flows are considered as one.

        - "Round-robin" -> It introduces larger delays than virtual clock
        based schemes, and should not be used for isolating interactive
        traffic from non-interactive. This means that this scheduler
        should be used as a leaf of CBQ or P3, which put interactive
        traffic into a higher priority band.

        We still need true WFQ for the top-level CSZ, but using WFQ
        for best-effort traffic is absolutely pointless:
        SFQ is superior for this purpose.

        IMPLEMENTATION:
        This implementation limits:
        - maximal queue length per flow to 127 packets,
        - maximal mtu to 2^18-1,
        - maximal number of flows to 65408,
        - number of hash buckets to 65536.

        It is easy to increase these values, but not in flight.  */
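
/* For illustration, SFQ is typically attached from user space with something
 * like:
 *
 *      tc qdisc add dev eth0 root sfq perturb 10
 *
 * which rehashes flows every 10 seconds; quantum, limit, divisor, flows,
 * depth and headdrop (see below) can be tuned the same way.
 */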

#define SFQ_MAX_DEPTH           127 /* max number of packets per flow */
#define SFQ_DEFAULT_FLOWS       128
#define SFQ_MAX_FLOWS           (0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
#define SFQ_EMPTY_SLOT          0xffff
#define SFQ_DEFAULT_HASH_DIVISOR 1024

/* We use 16 bits to store allot, and want to handle packets up to 64K.
 * Scale allot by 8 (1<<3) so that no overflow occurs.
 */
#define SFQ_ALLOT_SHIFT         3
#define SFQ_ALLOT_SIZE(X)       DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
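
/* Worked example (illustrative): SFQ_ALLOT_SIZE(1514) = DIV_ROUND_UP(1514, 8)
 * = 190, so one MTU-sized packet consumes 190 units of credit, while even a
 * 64KB packet scales to DIV_ROUND_UP(65535, 8) = 8192, which fits comfortably
 * in the signed 16-bit 'allot' field of struct sfq_slot below.
 */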

/* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
typedef u16 sfq_index;

/*
 * We don't use pointers, to save space.
 * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
 * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
 * are 'pointers' to dep[] array
 */
struct sfq_head {
        sfq_index       next;
        sfq_index       prev;
};

struct sfq_slot {
        struct sk_buff  *skblist_next;
        struct sk_buff  *skblist_prev;
        sfq_index       qlen; /* number of skbs in skblist */
        sfq_index       next; /* next slot in sfq RR chain */
        struct sfq_head dep; /* anchor in dep[] chains */
        unsigned short  hash; /* hash value (index in ht[]) */
        short           allot; /* credit for this slot */

        unsigned int    backlog;
        struct red_vars vars;
};

struct sfq_sched_data {
/* frequently used fields */
        int             limit;          /* limit of total number of packets in this qdisc */
        unsigned int    divisor;        /* number of slots in hash table */
        u8              headdrop;
        u8              maxdepth;       /* limit of packets per flow */

        u32             perturbation;
        u8              cur_depth;      /* depth of longest slot */
        u8              flags;
        unsigned short  scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
        struct tcf_proto __rcu *filter_list;
        sfq_index       *ht;            /* Hash table ('divisor' slots) */
        struct sfq_slot *slots;         /* Flows table ('maxflows' entries) */

        struct red_parms *red_parms;
        struct tc_sfqred_stats stats;
        struct sfq_slot *tail;          /* current slot in round */

        struct sfq_head dep[SFQ_MAX_DEPTH + 1];
                                        /* Linked lists of slots, indexed by depth
                                         * dep[0] : list of unused flows
                                         * dep[1] : list of flows with 1 packet
                                         * dep[X] : list of flows with X packets
                                         */

        unsigned int    maxflows;       /* number of flows in flows array */
        int             perturb_period;
        unsigned int    quantum;        /* Allotment per round: MUST BE >= MTU */
        struct timer_list perturb_timer;
};

/*
 * sfq_head are either in a sfq_slot or in dep[] array
 */
static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
{
        if (val < SFQ_MAX_FLOWS)
                return &q->slots[val].dep;
        return &q->dep[val - SFQ_MAX_FLOWS];
}
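
/* Example of the index encoding (illustrative): sfq_dep_head(q, 5) yields
 * &q->slots[5].dep, the anchor embedded in flow slot 5, while
 * sfq_dep_head(q, SFQ_MAX_FLOWS + 3) yields &q->dep[3], the head of the
 * list of flows currently holding exactly 3 packets.
 */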

/*
 * In order to be able to quickly rehash our queue when timer changes
 * q->perturbation, we store flow_keys in skb->cb[]
 */
struct sfq_skb_cb {
        struct flow_keys        keys;
};

static inline struct sfq_skb_cb *sfq_skb_cb(const struct sk_buff *skb)
{
        qdisc_cb_private_validate(skb, sizeof(struct sfq_skb_cb));
        return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data;
}

static unsigned int sfq_hash(const struct sfq_sched_data *q,
                             const struct sk_buff *skb)
{
        const struct flow_keys *keys = &sfq_skb_cb(skb)->keys;
        unsigned int hash;

        hash = jhash_3words((__force u32)keys->dst,
                            (__force u32)keys->src ^ keys->ip_proto,
                            (__force u32)keys->ports, q->perturbation);
        return hash & (q->divisor - 1);
}
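
/* Note: the final '& (q->divisor - 1)' only behaves as a modulo because the
 * divisor is constrained to a power of two (see sfq_change()). For example,
 * with the default divisor of 1024 the mask is 0x3ff, keeping the low
 * 10 bits of the jhash value.
 */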

static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
                                 int *qerr)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        struct tcf_result res;
        struct tcf_proto *fl;
        int result;

        if (TC_H_MAJ(skb->priority) == sch->handle &&
            TC_H_MIN(skb->priority) > 0 &&
            TC_H_MIN(skb->priority) <= q->divisor)
                return TC_H_MIN(skb->priority);

        fl = rcu_dereference_bh(q->filter_list);
        if (!fl) {
                skb_flow_dissect(skb, &sfq_skb_cb(skb)->keys);
                return sfq_hash(q, skb) + 1;
        }

        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
        result = tc_classify(skb, fl, &res);
        if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
                switch (result) {
                case TC_ACT_STOLEN:
                case TC_ACT_QUEUED:
                        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
                        /* fall through */
                case TC_ACT_SHOT:
                        return 0;
                }
#endif
                if (TC_H_MIN(res.classid) <= q->divisor)
                        return TC_H_MIN(res.classid);
        }
        return 0;
}

/*
 * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
 */
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
        sfq_index p, n;
        struct sfq_slot *slot = &q->slots[x];
        int qlen = slot->qlen;

        p = qlen + SFQ_MAX_FLOWS;
        n = q->dep[qlen].next;

        slot->dep.next = n;
        slot->dep.prev = p;

        q->dep[qlen].next = x;          /* sfq_dep_head(q, p)->next = x */
        sfq_dep_head(q, n)->prev = x;
}

#define sfq_unlink(q, x, n, p)                  \
        do {                                    \
                n = q->slots[x].dep.next;       \
                p = q->slots[x].dep.prev;       \
                sfq_dep_head(q, p)->next = n;   \
                sfq_dep_head(q, n)->prev = p;   \
        } while (0)


static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
        sfq_index p, n;
        int d;

        sfq_unlink(q, x, n, p);

        d = q->slots[x].qlen--;
        if (n == p && q->cur_depth == d)
                q->cur_depth--;
        sfq_link(q, x);
}

static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
        sfq_index p, n;
        int d;

        sfq_unlink(q, x, n, p);

        d = ++q->slots[x].qlen;
        if (q->cur_depth < d)
                q->cur_depth = d;
        sfq_link(q, x);
}

/* helper functions : might be changed when/if skb use a standard list_head */

/* remove one skb from tail of slot queue */
static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
{
        struct sk_buff *skb = slot->skblist_prev;

        slot->skblist_prev = skb->prev;
        skb->prev->next = (struct sk_buff *)slot;
        skb->next = skb->prev = NULL;
        return skb;
}

/* remove one skb from head of slot queue */
static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
{
        struct sk_buff *skb = slot->skblist_next;

        slot->skblist_next = skb->next;
        skb->next->prev = (struct sk_buff *)slot;
        skb->next = skb->prev = NULL;
        return skb;
}

static inline void slot_queue_init(struct sfq_slot *slot)
{
        memset(slot, 0, sizeof(*slot));
        slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
}

/* add skb to slot queue (tail add) */
static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
{
        skb->prev = slot->skblist_prev;
        skb->next = (struct sk_buff *)slot;
        slot->skblist_prev->next = skb;
        slot->skblist_prev = skb;
}
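
/* The (struct sk_buff *)slot casts in the helpers above rely on skblist_next
 * and skblist_prev being the first two members of struct sfq_slot, laid out
 * like the next/prev pointers at the start of struct sk_buff, so an empty
 * slot can act as its own circular list head.
 */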

static unsigned int sfq_drop(struct Qdisc *sch)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        sfq_index x, d = q->cur_depth;
        struct sk_buff *skb;
        unsigned int len;
        struct sfq_slot *slot;

        /* Queue is full! Find the longest slot and drop tail packet from it */
        if (d > 1) {
                x = q->dep[d].next;
                slot = &q->slots[x];
drop:
                skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
                len = qdisc_pkt_len(skb);
                slot->backlog -= len;
                sfq_dec(q, x);
                sch->q.qlen--;
                qdisc_qstats_drop(sch);
                /* account backlog before freeing the skb */
                qdisc_qstats_backlog_dec(sch, skb);
                kfree_skb(skb);
                return len;
        }

        if (d == 1) {
                /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
                x = q->tail->next;
                slot = &q->slots[x];
                q->tail->next = slot->next;
                q->ht[slot->hash] = SFQ_EMPTY_SLOT;
                goto drop;
        }

        return 0;
}

/* Is ECN parameter configured */
static int sfq_prob_mark(const struct sfq_sched_data *q)
{
        return q->flags & TC_RED_ECN;
}

/* Should packets over max threshold just be marked */
static int sfq_hard_mark(const struct sfq_sched_data *q)
{
        return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
}

static int sfq_headdrop(const struct sfq_sched_data *q)
{
        return q->headdrop;
}
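
/* Summary of the RED/ECN flag combinations handled below in sfq_enqueue():
 * with TC_RED_ECN alone, packets selected in the probabilistic region and
 * packets above the max threshold are ECN-marked instead of dropped;
 * adding TC_RED_HARDDROP keeps probabilistic marking but forces drops above
 * the max threshold; with neither flag set, both cases drop.
 */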

static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        unsigned int hash;
        sfq_index x, qlen;
        struct sfq_slot *slot;
        int uninitialized_var(ret);
        struct sk_buff *head;
        int delta;

        hash = sfq_classify(skb, sch, &ret);
        if (hash == 0) {
                if (ret & __NET_XMIT_BYPASS)
                        qdisc_qstats_drop(sch);
                kfree_skb(skb);
                return ret;
        }
        hash--;

        x = q->ht[hash];
        slot = &q->slots[x];
        if (x == SFQ_EMPTY_SLOT) {
                x = q->dep[0].next; /* get a free slot */
                if (x >= SFQ_MAX_FLOWS)
                        return qdisc_drop(skb, sch);
                q->ht[hash] = x;
                slot = &q->slots[x];
                slot->hash = hash;
                slot->backlog = 0; /* should already be 0 anyway... */
                red_set_vars(&slot->vars);
                goto enqueue;
        }
        if (q->red_parms) {
                slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
                                                        &slot->vars,
                                                        slot->backlog);
                switch (red_action(q->red_parms,
                                   &slot->vars,
                                   slot->vars.qavg)) {
                case RED_DONT_MARK:
                        break;

                case RED_PROB_MARK:
                        qdisc_qstats_overlimit(sch);
                        if (sfq_prob_mark(q)) {
                                /* We know we have at least one packet in queue */
                                if (sfq_headdrop(q) &&
                                    INET_ECN_set_ce(slot->skblist_next)) {
                                        q->stats.prob_mark_head++;
                                        break;
                                }
                                if (INET_ECN_set_ce(skb)) {
                                        q->stats.prob_mark++;
                                        break;
                                }
                        }
                        q->stats.prob_drop++;
                        goto congestion_drop;

                case RED_HARD_MARK:
                        qdisc_qstats_overlimit(sch);
                        if (sfq_hard_mark(q)) {
                                /* We know we have at least one packet in queue */
                                if (sfq_headdrop(q) &&
                                    INET_ECN_set_ce(slot->skblist_next)) {
                                        q->stats.forced_mark_head++;
                                        break;
                                }
                                if (INET_ECN_set_ce(skb)) {
                                        q->stats.forced_mark++;
                                        break;
                                }
                        }
                        q->stats.forced_drop++;
                        goto congestion_drop;
                }
        }

        if (slot->qlen >= q->maxdepth) {
congestion_drop:
                if (!sfq_headdrop(q))
                        return qdisc_drop(skb, sch);

                /* We know we have at least one packet in queue */
                head = slot_dequeue_head(slot);
                delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
                sch->qstats.backlog -= delta;
                slot->backlog -= delta;
                qdisc_drop(head, sch);

                slot_queue_add(slot, skb);
                return NET_XMIT_CN;
        }

enqueue:
        qdisc_qstats_backlog_inc(sch, skb);
        slot->backlog += qdisc_pkt_len(skb);
        slot_queue_add(slot, skb);
        sfq_inc(q, x);
        if (slot->qlen == 1) {          /* The flow is new */
                if (q->tail == NULL) {  /* It is the first flow */
                        slot->next = x;
                } else {
                        slot->next = q->tail->next;
                        q->tail->next = x;
                }
                /* We put this flow at the end of our flow list.
                 * This might sound unfair, as the new flow has to wait behind
                 * old ones, but otherwise we could end up servicing new flows
                 * only and freezing old ones.
                 */
                q->tail = slot;
                /* We could use a bigger initial quantum for new flows */
                slot->allot = q->scaled_quantum;
        }
        if (++sch->q.qlen <= q->limit)
                return NET_XMIT_SUCCESS;

        qlen = slot->qlen;
        sfq_drop(sch);
        /* Return Congestion Notification only if we dropped a packet
         * from this flow.
         */
        if (qlen != slot->qlen)
                return NET_XMIT_CN;

        /* As we dropped a packet, better let upper stack know this */
        qdisc_tree_decrease_qlen(sch, 1);
        return NET_XMIT_SUCCESS;
}

static struct sk_buff *
sfq_dequeue(struct Qdisc *sch)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        struct sk_buff *skb;
        sfq_index a, next_a;
        struct sfq_slot *slot;

        /* No active slots */
        if (q->tail == NULL)
                return NULL;

next_slot:
        a = q->tail->next;
        slot = &q->slots[a];
        if (slot->allot <= 0) {
                q->tail = slot;
                slot->allot += q->scaled_quantum;
                goto next_slot;
        }
        skb = slot_dequeue_head(slot);
        sfq_dec(q, a);
        qdisc_bstats_update(sch, skb);
        sch->q.qlen--;
        qdisc_qstats_backlog_dec(sch, skb);
        slot->backlog -= qdisc_pkt_len(skb);
        /* Is the slot empty? */
        if (slot->qlen == 0) {
                q->ht[slot->hash] = SFQ_EMPTY_SLOT;
                next_a = slot->next;
                if (a == next_a) {
                        q->tail = NULL; /* no more active slots */
                        return skb;
                }
                q->tail->next = next_a;
        } else {
                slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
        }
        return skb;
}
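
/* Deficit round-robin accounting, by example (illustrative, assuming the
 * default quantum of one MTU, e.g. 1514 bytes on Ethernet, so scaled_quantum
 * is SFQ_ALLOT_SIZE(1514) = 190): a slot that dequeues a 1514-byte packet has
 * its allot reduced by 190; once allot drops to zero or below, the slot
 * becomes the new tail, is refilled with scaled_quantum, and service moves on
 * to the next slot in the round-robin ring.
 */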

static void
sfq_reset(struct Qdisc *sch)
{
        struct sk_buff *skb;

        while ((skb = sfq_dequeue(sch)) != NULL)
                kfree_skb(skb);
}

/*
 * When q->perturbation is changed, we rehash all queued skbs
 * to avoid OOO (Out Of Order) effects.
 * We don't use sfq_dequeue()/sfq_enqueue() because we don't want to change
 * counters.
 */
static void sfq_rehash(struct Qdisc *sch)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        struct sk_buff *skb;
        int i;
        struct sfq_slot *slot;
        struct sk_buff_head list;
        int dropped = 0;

        __skb_queue_head_init(&list);

        for (i = 0; i < q->maxflows; i++) {
                slot = &q->slots[i];
                if (!slot->qlen)
                        continue;
                while (slot->qlen) {
                        skb = slot_dequeue_head(slot);
                        sfq_dec(q, i);
                        __skb_queue_tail(&list, skb);
                }
                slot->backlog = 0;
                red_set_vars(&slot->vars);
                q->ht[slot->hash] = SFQ_EMPTY_SLOT;
        }
        q->tail = NULL;

        while ((skb = __skb_dequeue(&list)) != NULL) {
                unsigned int hash = sfq_hash(q, skb);
                sfq_index x = q->ht[hash];

                slot = &q->slots[x];
                if (x == SFQ_EMPTY_SLOT) {
                        x = q->dep[0].next; /* get a free slot */
                        if (x >= SFQ_MAX_FLOWS) {
drop:
                                qdisc_qstats_backlog_dec(sch, skb);
                                kfree_skb(skb);
                                dropped++;
                                continue;
                        }
                        q->ht[hash] = x;
                        slot = &q->slots[x];
                        slot->hash = hash;
                }
                if (slot->qlen >= q->maxdepth)
                        goto drop;
                slot_queue_add(slot, skb);
                if (q->red_parms)
                        slot->vars.qavg = red_calc_qavg(q->red_parms,
                                                        &slot->vars,
                                                        slot->backlog);
                slot->backlog += qdisc_pkt_len(skb);
                sfq_inc(q, x);
                if (slot->qlen == 1) {          /* The flow is new */
                        if (q->tail == NULL) {  /* It is the first flow */
                                slot->next = x;
                        } else {
                                slot->next = q->tail->next;
                                q->tail->next = x;
                        }
                        q->tail = slot;
                        slot->allot = q->scaled_quantum;
                }
        }
        sch->q.qlen -= dropped;
        qdisc_tree_decrease_qlen(sch, dropped);
}

static void sfq_perturbation(unsigned long arg)
{
        struct Qdisc *sch = (struct Qdisc *)arg;
        struct sfq_sched_data *q = qdisc_priv(sch);
        spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));

        spin_lock(root_lock);
        q->perturbation = prandom_u32();
        if (!q->filter_list && q->tail)
                sfq_rehash(sch);
        spin_unlock(root_lock);

        if (q->perturb_period)
                mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
}

static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        struct tc_sfq_qopt *ctl = nla_data(opt);
        struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
        unsigned int qlen;
        struct red_parms *p = NULL;

        if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
                return -EINVAL;
        if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
                ctl_v1 = nla_data(opt);
        if (ctl->divisor &&
            (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
                return -EINVAL;
        if (ctl_v1 && ctl_v1->qth_min) {
                p = kmalloc(sizeof(*p), GFP_KERNEL);
                if (!p)
                        return -ENOMEM;
        }
        sch_tree_lock(sch);
        if (ctl->quantum) {
                q->quantum = ctl->quantum;
                q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
        }
        q->perturb_period = ctl->perturb_period * HZ;
        if (ctl->flows)
                q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
        if (ctl->divisor) {
                q->divisor = ctl->divisor;
                q->maxflows = min_t(u32, q->maxflows, q->divisor);
        }
        if (ctl_v1) {
                if (ctl_v1->depth)
                        q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
                if (p) {
                        swap(q->red_parms, p);
                        red_set_parms(q->red_parms,
                                      ctl_v1->qth_min, ctl_v1->qth_max,
                                      ctl_v1->Wlog,
                                      ctl_v1->Plog, ctl_v1->Scell_log,
                                      NULL,
                                      ctl_v1->max_P);
                }
                q->flags = ctl_v1->flags;
                q->headdrop = ctl_v1->headdrop;
        }
        if (ctl->limit) {
                q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
                q->maxflows = min_t(u32, q->maxflows, q->limit);
        }

        qlen = sch->q.qlen;
        while (sch->q.qlen > q->limit)
                sfq_drop(sch);
        qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);

        del_timer(&q->perturb_timer);
        if (q->perturb_period) {
                mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
                q->perturbation = prandom_u32();
        }
        sch_tree_unlock(sch);
        kfree(p);
        return 0;
}

static void *sfq_alloc(size_t sz)
{
        void *ptr = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN);

        if (!ptr)
                ptr = vmalloc(sz);
        return ptr;
}
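
/* Note: the hash table and flow table can be sizable (up to 65536 u16
 * entries and up to 65408 struct sfq_slot entries), so sfq_alloc() above
 * tries kmalloc() first for a physically contiguous buffer and quietly
 * falls back to vmalloc() when that fails.
 */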

static void sfq_free(void *addr)
{
        kvfree(addr);
}

static void sfq_destroy(struct Qdisc *sch)
{
        struct sfq_sched_data *q = qdisc_priv(sch);

        tcf_destroy_chain(&q->filter_list);
        q->perturb_period = 0;
        del_timer_sync(&q->perturb_timer);
        sfq_free(q->ht);
        sfq_free(q->slots);
        kfree(q->red_parms);
}

static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        int i;

        q->perturb_timer.function = sfq_perturbation;
        q->perturb_timer.data = (unsigned long)sch;
        init_timer_deferrable(&q->perturb_timer);

        for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
                q->dep[i].next = i + SFQ_MAX_FLOWS;
                q->dep[i].prev = i + SFQ_MAX_FLOWS;
        }

        q->limit = SFQ_MAX_DEPTH;
        q->maxdepth = SFQ_MAX_DEPTH;
        q->cur_depth = 0;
        q->tail = NULL;
        q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
        q->maxflows = SFQ_DEFAULT_FLOWS;
        q->quantum = psched_mtu(qdisc_dev(sch));
        q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
        q->perturb_period = 0;
        q->perturbation = prandom_u32();

        if (opt) {
                int err = sfq_change(sch, opt);
                if (err)
                        return err;
        }

        q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
        q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
        if (!q->ht || !q->slots) {
                sfq_destroy(sch);
                return -ENOMEM;
        }
        for (i = 0; i < q->divisor; i++)
                q->ht[i] = SFQ_EMPTY_SLOT;

        for (i = 0; i < q->maxflows; i++) {
                slot_queue_init(&q->slots[i]);
                sfq_link(q, i);
        }
        if (q->limit >= 1)
                sch->flags |= TCQ_F_CAN_BYPASS;
        else
                sch->flags &= ~TCQ_F_CAN_BYPASS;
        return 0;
}

static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        unsigned char *b = skb_tail_pointer(skb);
        struct tc_sfq_qopt_v1 opt;
        struct red_parms *p = q->red_parms;

        memset(&opt, 0, sizeof(opt));
        opt.v0.quantum  = q->quantum;
        opt.v0.perturb_period = q->perturb_period / HZ;
        opt.v0.limit    = q->limit;
        opt.v0.divisor  = q->divisor;
        opt.v0.flows    = q->maxflows;
        opt.depth       = q->maxdepth;
        opt.headdrop    = q->headdrop;

        if (p) {
                opt.qth_min     = p->qth_min >> p->Wlog;
                opt.qth_max     = p->qth_max >> p->Wlog;
                opt.Wlog        = p->Wlog;
                opt.Plog        = p->Plog;
                opt.Scell_log   = p->Scell_log;
                opt.max_P       = p->max_P;
        }
        memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
        opt.flags       = q->flags;

        if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
                goto nla_put_failure;

        return skb->len;

nla_put_failure:
        nlmsg_trim(skb, b);
        return -1;
}

static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
{
        return NULL;
}

static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
{
        return 0;
}

static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
                              u32 classid)
{
        /* we cannot bypass queue discipline anymore */
        sch->flags &= ~TCQ_F_CAN_BYPASS;
        return 0;
}

static void sfq_put(struct Qdisc *q, unsigned long cl)
{
}

static struct tcf_proto __rcu **sfq_find_tcf(struct Qdisc *sch,
                                             unsigned long cl)
{
        struct sfq_sched_data *q = qdisc_priv(sch);

        if (cl)
                return NULL;
        return &q->filter_list;
}

static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
                          struct sk_buff *skb, struct tcmsg *tcm)
{
        tcm->tcm_handle |= TC_H_MIN(cl);
        return 0;
}

static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
                                struct gnet_dump *d)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        sfq_index idx = q->ht[cl - 1];
        struct gnet_stats_queue qs = { 0 };
        struct tc_sfq_xstats xstats = { 0 };

        if (idx != SFQ_EMPTY_SLOT) {
                const struct sfq_slot *slot = &q->slots[idx];

                xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
                qs.qlen = slot->qlen;
                qs.backlog = slot->backlog;
        }
        if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
                return -1;
        return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}

static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        unsigned int i;

        if (arg->stop)
                return;

        for (i = 0; i < q->divisor; i++) {
                if (q->ht[i] == SFQ_EMPTY_SLOT ||
                    arg->count < arg->skip) {
                        arg->count++;
                        continue;
                }
                if (arg->fn(sch, i + 1, arg) < 0) {
                        arg->stop = 1;
                        break;
                }
                arg->count++;
        }
}

static const struct Qdisc_class_ops sfq_class_ops = {
        .leaf           =       sfq_leaf,
        .get            =       sfq_get,
        .put            =       sfq_put,
        .tcf_chain      =       sfq_find_tcf,
        .bind_tcf       =       sfq_bind,
        .unbind_tcf     =       sfq_put,
        .dump           =       sfq_dump_class,
        .dump_stats     =       sfq_dump_class_stats,
        .walk           =       sfq_walk,
};

static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
        .cl_ops         =       &sfq_class_ops,
        .id             =       "sfq",
        .priv_size      =       sizeof(struct sfq_sched_data),
        .enqueue        =       sfq_enqueue,
        .dequeue        =       sfq_dequeue,
        .peek           =       qdisc_peek_dequeued,
        .drop           =       sfq_drop,
        .init           =       sfq_init,
        .reset          =       sfq_reset,
        .destroy        =       sfq_destroy,
        .change         =       NULL,
        .dump           =       sfq_dump,
        .owner          =       THIS_MODULE,
};

static int __init sfq_module_init(void)
{
        return register_qdisc(&sfq_qdisc_ops);
}
static void __exit sfq_module_exit(void)
{
        unregister_qdisc(&sfq_qdisc_ops);
}
module_init(sfq_module_init)
module_exit(sfq_module_exit)
MODULE_LICENSE("GPL");