   1/*
   2 * net/sched/sch_sfq.c  Stochastic Fairness Queueing discipline.
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10 */
  11
  12#include <linux/module.h>
  13#include <linux/types.h>
  14#include <linux/kernel.h>
  15#include <linux/jiffies.h>
  16#include <linux/string.h>
  17#include <linux/in.h>
  18#include <linux/errno.h>
  19#include <linux/init.h>
  20#include <linux/skbuff.h>
  21#include <linux/jhash.h>
  22#include <linux/slab.h>
  23#include <linux/vmalloc.h>
  24#include <net/netlink.h>
  25#include <net/pkt_sched.h>
  26#include <net/flow_keys.h>
  27#include <net/red.h>
  28
  29
  30/*      Stochastic Fairness Queuing algorithm.
  31        =======================================
  32
  33        Source:
  34        Paul E. McKenney "Stochastic Fairness Queuing",
  35        IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.
  36
  37        Paul E. McKenney "Stochastic Fairness Queuing",
  38        "Interworking: Research and Experience", v.2, 1991, p.113-131.
  39
  40
  41        See also:
  42        M. Shreedhar and George Varghese "Efficient Fair
  43        Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
  44
  45
  46        This is not the thing that is usually called (W)FQ nowadays.
  47        It does not use any timestamp mechanism, but instead
  48        processes queues in round-robin order.
  49
  50        ADVANTAGE:
  51
  52        - It is very cheap. Both CPU and memory requirements are minimal.
  53
  54        DRAWBACKS:
  55
  56        - "Stochastic" -> It is not 100% fair.
  57        When hash collisions occur, several flows are considered as one.
  58
  59        - "Round-robin" -> It introduces larger delays than virtual clock
  60        based schemes, and should not be used for isolating interactive
   61        traffic from non-interactive. It means that this scheduler
   62        should be used as a leaf of CBQ or P3, which puts interactive traffic
   63        into a higher priority band.
  64
  65        We still need true WFQ for top level CSZ, but using WFQ
  66        for the best effort traffic is absolutely pointless:
  67        SFQ is superior for this purpose.
  68
  69        IMPLEMENTATION:
   70        This implementation limits:
   71        - maximal queue length per flow to 127 packets,
   72        - max mtu to 2^18-1,
   73        - max number of flows to 65408,
   74        - number of hash buckets to 65536.
  75
  76        It is easy to increase these values, but not in flight.  */
  77
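/*
 * A minimal, self-contained sketch of the idea described above.  It is
 * not part of this qdisc; every toy_* name is made up for illustration,
 * and details (timers, statistics, per-flow limits) are omitted.
 * Packets are hashed onto a fixed set of FIFO slots, and the scheduler
 * walks the slots round-robin, spending up to 'quantum' bytes of credit
 * on each before moving on.
 */
#if 0
struct toy_slot {
        struct sk_buff_head     q;      /* per-flow FIFO */
        int                     allot;  /* remaining byte credit this round */
};

static u32 toy_bucket(u32 flow_hash, u32 perturbation, u32 nslots)
{
        /* nslots must be a power of two, as enforced for q->divisor below */
        return jhash_1word(flow_hash, perturbation) & (nslots - 1);
}

static struct sk_buff *toy_dequeue(struct toy_slot *slots, u32 nslots,
                                   u32 *cur, int quantum)
{
        u32 scanned;

        /* two passes suffice: one to refill exhausted credits, one to serve */
        for (scanned = 0; scanned < 2 * nslots; scanned++) {
                struct toy_slot *s = &slots[*cur];

                if (!skb_queue_empty(&s->q) && s->allot > 0) {
                        struct sk_buff *skb = __skb_dequeue(&s->q);

                        s->allot -= qdisc_pkt_len(skb);
                        return skb;             /* keep serving this flow */
                }
                s->allot = quantum;             /* exhausted or idle: refill */
                *cur = (*cur + 1) % nslots;     /* and move to the next slot */
        }
        return NULL;                            /* every slot was empty */
}
#endif
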
  78#define SFQ_MAX_DEPTH           127 /* max number of packets per flow */
  79#define SFQ_DEFAULT_FLOWS       128
  80#define SFQ_MAX_FLOWS           (0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
  81#define SFQ_EMPTY_SLOT          0xffff
  82#define SFQ_DEFAULT_HASH_DIVISOR 1024
  83
   84/* We use 16 bits to store allot, and want to handle packets up to 64K.
  85 * Scale allot by 8 (1<<3) so that no overflow occurs.
  86 */
  87#define SFQ_ALLOT_SHIFT         3
  88#define SFQ_ALLOT_SIZE(X)       DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
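
/* Worked example of the scaling above (illustrative arithmetic only):
 * with SFQ_ALLOT_SHIFT = 3, a maximal 64KB packet costs
 * SFQ_ALLOT_SIZE(65535) = DIV_ROUND_UP(65535, 8) = 8192 allot units and
 * a 1514 byte Ethernet frame costs DIV_ROUND_UP(1514, 8) = 190 units,
 * both comfortably inside the +/-32767 range of the signed 16-bit
 * 'allot' field in struct sfq_slot.
 */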
  89
  90/* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
  91typedef u16 sfq_index;
  92
  93/*
   94 * We don't use pointers, in order to save space.
  95 * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
  96 * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
  97 * are 'pointers' to dep[] array
  98 */
  99struct sfq_head {
 100        sfq_index       next;
 101        sfq_index       prev;
 102};
 103
 104struct sfq_slot {
 105        struct sk_buff  *skblist_next;
 106        struct sk_buff  *skblist_prev;
 107        sfq_index       qlen; /* number of skbs in skblist */
 108        sfq_index       next; /* next slot in sfq RR chain */
 109        struct sfq_head dep; /* anchor in dep[] chains */
 110        unsigned short  hash; /* hash value (index in ht[]) */
 111        short           allot; /* credit for this slot */
 112
 113        unsigned int    backlog;
 114        struct red_vars vars;
 115};
 116
 117struct sfq_sched_data {
 118/* frequently used fields */
 119        int             limit;          /* limit of total number of packets in this qdisc */
 120        unsigned int    divisor;        /* number of slots in hash table */
 121        u8              headdrop;
 122        u8              maxdepth;       /* limit of packets per flow */
 123
 124        u32             perturbation;
 125        u8              cur_depth;      /* depth of longest slot */
 126        u8              flags;
 127        unsigned short  scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
 128        struct tcf_proto *filter_list;
 129        sfq_index       *ht;            /* Hash table ('divisor' slots) */
 130        struct sfq_slot *slots;         /* Flows table ('maxflows' entries) */
 131
 132        struct red_parms *red_parms;
 133        struct tc_sfqred_stats stats;
 134        struct sfq_slot *tail;          /* current slot in round */
 135
 136        struct sfq_head dep[SFQ_MAX_DEPTH + 1];
 137                                        /* Linked lists of slots, indexed by depth
 138                                         * dep[0] : list of unused flows
 139                                         * dep[1] : list of flows with 1 packet
 140                                         * dep[X] : list of flows with X packets
 141                                         */
 142
 143        unsigned int    maxflows;       /* number of flows in flows array */
 144        int             perturb_period;
 145        unsigned int    quantum;        /* Allotment per round: MUST BE >= MTU */
 146        struct timer_list perturb_timer;
 147};
 148
 149/*
 150 * sfq_head are either in a sfq_slot or in dep[] array
 151 */
 152static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
 153{
 154        if (val < SFQ_MAX_FLOWS)
 155                return &q->slots[val].dep;
 156        return &q->dep[val - SFQ_MAX_FLOWS];
 157}
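
/* Illustrative reading of the index convention (example values only):
 * with SFQ_MAX_FLOWS == 65408, val == 5 refers to &q->slots[5].dep,
 * while val == SFQ_MAX_FLOWS + 3 refers to &q->dep[3], the anchor of
 * the list of flows currently holding exactly 3 packets; q->dep[0]
 * anchors the list of unused slots that the enqueue path allocates from.
 */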
 158
 159/*
  160 * In order to be able to quickly rehash our queue when the timer changes
 161 * q->perturbation, we store flow_keys in skb->cb[]
 162 */
 163struct sfq_skb_cb {
 164       struct flow_keys        keys;
 165};
 166
 167static inline struct sfq_skb_cb *sfq_skb_cb(const struct sk_buff *skb)
 168{
 169        qdisc_cb_private_validate(skb, sizeof(struct sfq_skb_cb));
 170        return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data;
 171}
 172
 173static unsigned int sfq_hash(const struct sfq_sched_data *q,
 174                             const struct sk_buff *skb)
 175{
 176        const struct flow_keys *keys = &sfq_skb_cb(skb)->keys;
 177        unsigned int hash;
 178
 179        hash = jhash_3words((__force u32)keys->dst,
 180                            (__force u32)keys->src ^ keys->ip_proto,
 181                            (__force u32)keys->ports, q->perturbation);
 182        return hash & (q->divisor - 1);
 183}
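
/* Example of the final masking (illustrative numbers): q->divisor is
 * required to be a power of two by sfq_change(), so with the default
 * divisor of 1024 the jhash result is reduced with 'hash & 1023' rather
 * than a modulo.  q->perturbation is remixed into the hash so that a
 * periodic reseed breaks up long-lived hash collisions between flows.
 */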
 184
 185static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
 186                                 int *qerr)
 187{
 188        struct sfq_sched_data *q = qdisc_priv(sch);
 189        struct tcf_result res;
 190        int result;
 191
 192        if (TC_H_MAJ(skb->priority) == sch->handle &&
 193            TC_H_MIN(skb->priority) > 0 &&
 194            TC_H_MIN(skb->priority) <= q->divisor)
 195                return TC_H_MIN(skb->priority);
 196
 197        if (!q->filter_list) {
 198                skb_flow_dissect(skb, &sfq_skb_cb(skb)->keys);
 199                return sfq_hash(q, skb) + 1;
 200        }
 201
 202        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 203        result = tc_classify(skb, q->filter_list, &res);
 204        if (result >= 0) {
 205#ifdef CONFIG_NET_CLS_ACT
 206                switch (result) {
 207                case TC_ACT_STOLEN:
 208                case TC_ACT_QUEUED:
 209                        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
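                        /* fall through */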
 210                case TC_ACT_SHOT:
 211                        return 0;
 212                }
 213#endif
 214                if (TC_H_MIN(res.classid) <= q->divisor)
 215                        return TC_H_MIN(res.classid);
 216        }
 217        return 0;
 218}
 219
 220/*
 221 * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
 222 */
 223static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
 224{
 225        sfq_index p, n;
 226        struct sfq_slot *slot = &q->slots[x];
 227        int qlen = slot->qlen;
 228
 229        p = qlen + SFQ_MAX_FLOWS;
 230        n = q->dep[qlen].next;
 231
 232        slot->dep.next = n;
 233        slot->dep.prev = p;
 234
 235        q->dep[qlen].next = x;          /* sfq_dep_head(q, p)->next = x */
 236        sfq_dep_head(q, n)->prev = x;
 237}
 238
 239#define sfq_unlink(q, x, n, p)                  \
 240        n = q->slots[x].dep.next;               \
 241        p = q->slots[x].dep.prev;               \
 242        sfq_dep_head(q, p)->next = n;           \
 243        sfq_dep_head(q, n)->prev = p
 244
 245
 246static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
 247{
 248        sfq_index p, n;
 249        int d;
 250
 251        sfq_unlink(q, x, n, p);
 252
 253        d = q->slots[x].qlen--;
 254        if (n == p && q->cur_depth == d)
 255                q->cur_depth--;
 256        sfq_link(q, x);
 257}
 258
 259static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
 260{
 261        sfq_index p, n;
 262        int d;
 263
 264        sfq_unlink(q, x, n, p);
 265
 266        d = ++q->slots[x].qlen;
 267        if (q->cur_depth < d)
 268                q->cur_depth = d;
 269        sfq_link(q, x);
 270}
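
/* Worked example of the dep[] bookkeeping (illustrative): a slot whose
 * qlen grows from 2 to 3 is unlinked from the dep[2] chain and relinked
 * at the head of dep[3], and cur_depth is raised to 3 if that is now the
 * deepest chain.  In sfq_dec(), "n == p" after unlinking means the slot
 * was the only one at its old depth, so cur_depth may drop by one when
 * that depth was the current maximum.  sfq_drop() relies on this to find
 * the longest flow in O(1) as q->dep[q->cur_depth].next.
 */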
 271
  272/* helper functions: might be changed when/if skb uses a standard list_head */
 273
 274/* remove one skb from tail of slot queue */
 275static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
 276{
 277        struct sk_buff *skb = slot->skblist_prev;
 278
 279        slot->skblist_prev = skb->prev;
 280        skb->prev->next = (struct sk_buff *)slot;
 281        skb->next = skb->prev = NULL;
 282        return skb;
 283}
 284
 285/* remove one skb from head of slot queue */
 286static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
 287{
 288        struct sk_buff *skb = slot->skblist_next;
 289
 290        slot->skblist_next = skb->next;
 291        skb->next->prev = (struct sk_buff *)slot;
 292        skb->next = skb->prev = NULL;
 293        return skb;
 294}
 295
 296static inline void slot_queue_init(struct sfq_slot *slot)
 297{
 298        memset(slot, 0, sizeof(*slot));
 299        slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
 300}
 301
 302/* add skb to slot queue (tail add) */
 303static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
 304{
 305        skb->prev = slot->skblist_prev;
 306        skb->next = (struct sk_buff *)slot;
 307        slot->skblist_prev->next = skb;
 308        slot->skblist_prev = skb;
 309}
 310
 311#define slot_queue_walk(slot, skb)              \
 312        for (skb = slot->skblist_next;          \
 313             skb != (struct sk_buff *)slot;     \
 314             skb = skb->next)
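
/* A note on the (struct sk_buff *)slot casts above: skblist_next and
 * skblist_prev are the first two members of struct sfq_slot, mirroring
 * the next/prev pointers that lead struct sk_buff, so the slot itself
 * can double as the sentinel node of its own circular skb list -- the
 * same layout trick used by struct sk_buff_head.
 */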
 315
 316static unsigned int sfq_drop(struct Qdisc *sch)
 317{
 318        struct sfq_sched_data *q = qdisc_priv(sch);
 319        sfq_index x, d = q->cur_depth;
 320        struct sk_buff *skb;
 321        unsigned int len;
 322        struct sfq_slot *slot;
 323
  324        /* Queue is full! Find the longest slot and drop one packet from it (tail, or head if q->headdrop is set) */
 325        if (d > 1) {
 326                x = q->dep[d].next;
 327                slot = &q->slots[x];
 328drop:
 329                skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
 330                len = qdisc_pkt_len(skb);
 331                slot->backlog -= len;
 332                sfq_dec(q, x);
 333                kfree_skb(skb);
 334                sch->q.qlen--;
 335                sch->qstats.drops++;
 336                sch->qstats.backlog -= len;
 337                return len;
 338        }
 339
 340        if (d == 1) {
 341                /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
 342                x = q->tail->next;
 343                slot = &q->slots[x];
 344                q->tail->next = slot->next;
 345                q->ht[slot->hash] = SFQ_EMPTY_SLOT;
 346                goto drop;
 347        }
 348
 349        return 0;
 350}
 351
 352/* Is ECN parameter configured */
 353static int sfq_prob_mark(const struct sfq_sched_data *q)
 354{
 355        return q->flags & TC_RED_ECN;
 356}
 357
 358/* Should packets over max threshold just be marked */
 359static int sfq_hard_mark(const struct sfq_sched_data *q)
 360{
 361        return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
 362}
 363
 364static int sfq_headdrop(const struct sfq_sched_data *q)
 365{
 366        return q->headdrop;
 367}
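
/* Illustrative decoding of the flag helpers above: with
 * q->flags == TC_RED_ECN both sfq_prob_mark() and sfq_hard_mark() are
 * true, so congestion below and above the max threshold is signalled by
 * ECN marking; with TC_RED_ECN | TC_RED_HARDDROP probabilistic marking
 * is kept but packets over the max threshold are dropped; with neither
 * flag set RED always drops.
 */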
 368
 369static int
 370sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 371{
 372        struct sfq_sched_data *q = qdisc_priv(sch);
 373        unsigned int hash;
 374        sfq_index x, qlen;
 375        struct sfq_slot *slot;
 376        int uninitialized_var(ret);
 377        struct sk_buff *head;
 378        int delta;
 379
 380        hash = sfq_classify(skb, sch, &ret);
 381        if (hash == 0) {
 382                if (ret & __NET_XMIT_BYPASS)
 383                        sch->qstats.drops++;
 384                kfree_skb(skb);
 385                return ret;
 386        }
 387        hash--;
 388
 389        x = q->ht[hash];
 390        slot = &q->slots[x];
 391        if (x == SFQ_EMPTY_SLOT) {
 392                x = q->dep[0].next; /* get a free slot */
 393                if (x >= SFQ_MAX_FLOWS)
 394                        return qdisc_drop(skb, sch);
 395                q->ht[hash] = x;
 396                slot = &q->slots[x];
 397                slot->hash = hash;
 398                slot->backlog = 0; /* should already be 0 anyway... */
 399                red_set_vars(&slot->vars);
 400                goto enqueue;
 401        }
 402        if (q->red_parms) {
 403                slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
 404                                                        &slot->vars,
 405                                                        slot->backlog);
 406                switch (red_action(q->red_parms,
 407                                   &slot->vars,
 408                                   slot->vars.qavg)) {
 409                case RED_DONT_MARK:
 410                        break;
 411
 412                case RED_PROB_MARK:
 413                        sch->qstats.overlimits++;
 414                        if (sfq_prob_mark(q)) {
 415                                /* We know we have at least one packet in queue */
 416                                if (sfq_headdrop(q) &&
 417                                    INET_ECN_set_ce(slot->skblist_next)) {
 418                                        q->stats.prob_mark_head++;
 419                                        break;
 420                                }
 421                                if (INET_ECN_set_ce(skb)) {
 422                                        q->stats.prob_mark++;
 423                                        break;
 424                                }
 425                        }
 426                        q->stats.prob_drop++;
 427                        goto congestion_drop;
 428
 429                case RED_HARD_MARK:
 430                        sch->qstats.overlimits++;
 431                        if (sfq_hard_mark(q)) {
 432                                /* We know we have at least one packet in queue */
 433                                if (sfq_headdrop(q) &&
 434                                    INET_ECN_set_ce(slot->skblist_next)) {
 435                                        q->stats.forced_mark_head++;
 436                                        break;
 437                                }
 438                                if (INET_ECN_set_ce(skb)) {
 439                                        q->stats.forced_mark++;
 440                                        break;
 441                                }
 442                        }
 443                        q->stats.forced_drop++;
 444                        goto congestion_drop;
 445                }
 446        }
 447
 448        if (slot->qlen >= q->maxdepth) {
 449congestion_drop:
 450                if (!sfq_headdrop(q))
 451                        return qdisc_drop(skb, sch);
 452
 453                /* We know we have at least one packet in queue */
 454                head = slot_dequeue_head(slot);
 455                delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
 456                sch->qstats.backlog -= delta;
 457                slot->backlog -= delta;
 458                qdisc_drop(head, sch);
 459
 460                slot_queue_add(slot, skb);
 461                return NET_XMIT_CN;
 462        }
 463
 464enqueue:
 465        sch->qstats.backlog += qdisc_pkt_len(skb);
 466        slot->backlog += qdisc_pkt_len(skb);
 467        slot_queue_add(slot, skb);
 468        sfq_inc(q, x);
 469        if (slot->qlen == 1) {          /* The flow is new */
 470                if (q->tail == NULL) {  /* It is the first flow */
 471                        slot->next = x;
 472                } else {
 473                        slot->next = q->tail->next;
 474                        q->tail->next = x;
 475                }
 476                /* We put this flow at the end of our flow list.
  477                 * This might sound unfair, making a new flow wait behind old ones,
  478                 * but we could end up servicing new flows only, and freeze old ones.
 479                 */
 480                q->tail = slot;
 481                /* We could use a bigger initial quantum for new flows */
 482                slot->allot = q->scaled_quantum;
 483        }
 484        if (++sch->q.qlen <= q->limit)
 485                return NET_XMIT_SUCCESS;
 486
 487        qlen = slot->qlen;
 488        sfq_drop(sch);
 489        /* Return Congestion Notification only if we dropped a packet
 490         * from this flow.
 491         */
 492        if (qlen != slot->qlen)
 493                return NET_XMIT_CN;
 494
 495        /* As we dropped a packet, better let upper stack know this */
 496        qdisc_tree_decrease_qlen(sch, 1);
 497        return NET_XMIT_SUCCESS;
 498}
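
/* Worked overflow scenario for the tail of sfq_enqueue() (illustrative):
 * with the default limit of 127, the packet that makes sch->q.qlen reach
 * 128 triggers sfq_drop(), which removes one packet from the currently
 * longest flow.  If that longest flow is the slot we just enqueued on,
 * its qlen changed and NET_XMIT_CN tells the caller this flow was
 * throttled; otherwise another flow paid for the overshoot, the parent
 * qdiscs are told about the lost packet via qdisc_tree_decrease_qlen(),
 * and NET_XMIT_SUCCESS is returned for the packet that did get queued.
 */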
 499
 500static struct sk_buff *
 501sfq_dequeue(struct Qdisc *sch)
 502{
 503        struct sfq_sched_data *q = qdisc_priv(sch);
 504        struct sk_buff *skb;
 505        sfq_index a, next_a;
 506        struct sfq_slot *slot;
 507
 508        /* No active slots */
 509        if (q->tail == NULL)
 510                return NULL;
 511
 512next_slot:
 513        a = q->tail->next;
 514        slot = &q->slots[a];
 515        if (slot->allot <= 0) {
 516                q->tail = slot;
 517                slot->allot += q->scaled_quantum;
 518                goto next_slot;
 519        }
 520        skb = slot_dequeue_head(slot);
 521        sfq_dec(q, a);
 522        qdisc_bstats_update(sch, skb);
 523        sch->q.qlen--;
 524        sch->qstats.backlog -= qdisc_pkt_len(skb);
 525        slot->backlog -= qdisc_pkt_len(skb);
 526        /* Is the slot empty? */
 527        if (slot->qlen == 0) {
 528                q->ht[slot->hash] = SFQ_EMPTY_SLOT;
 529                next_a = slot->next;
 530                if (a == next_a) {
 531                        q->tail = NULL; /* no more active slots */
 532                        return skb;
 533                }
 534                q->tail->next = next_a;
 535        } else {
 536                slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
 537        }
 538        return skb;
 539}
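
/* Illustrative allot arithmetic for the dequeue path: with a 1514 byte
 * quantum, scaled_quantum = SFQ_ALLOT_SIZE(1514) = 190.  A newly active
 * slot starts with 190 units of credit, a dequeued 1500 byte packet
 * costs SFQ_ALLOT_SIZE(1500) = 188 of them, and once the credit drops to
 * zero or below the slot becomes the new tail, receives another
 * scaled_quantum, and service moves on to the next slot in the ring.
 */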
 540
 541static void
 542sfq_reset(struct Qdisc *sch)
 543{
 544        struct sk_buff *skb;
 545
 546        while ((skb = sfq_dequeue(sch)) != NULL)
 547                kfree_skb(skb);
 548}
 549
 550/*
 551 * When q->perturbation is changed, we rehash all queued skbs
 552 * to avoid OOO (Out Of Order) effects.
  553 * We don't use sfq_dequeue()/sfq_enqueue() because we don't want to change
 554 * counters.
 555 */
 556static void sfq_rehash(struct Qdisc *sch)
 557{
 558        struct sfq_sched_data *q = qdisc_priv(sch);
 559        struct sk_buff *skb;
 560        int i;
 561        struct sfq_slot *slot;
 562        struct sk_buff_head list;
 563        int dropped = 0;
 564
 565        __skb_queue_head_init(&list);
 566
 567        for (i = 0; i < q->maxflows; i++) {
 568                slot = &q->slots[i];
 569                if (!slot->qlen)
 570                        continue;
 571                while (slot->qlen) {
 572                        skb = slot_dequeue_head(slot);
 573                        sfq_dec(q, i);
 574                        __skb_queue_tail(&list, skb);
 575                }
 576                slot->backlog = 0;
 577                red_set_vars(&slot->vars);
 578                q->ht[slot->hash] = SFQ_EMPTY_SLOT;
 579        }
 580        q->tail = NULL;
 581
 582        while ((skb = __skb_dequeue(&list)) != NULL) {
 583                unsigned int hash = sfq_hash(q, skb);
 584                sfq_index x = q->ht[hash];
 585
 586                slot = &q->slots[x];
 587                if (x == SFQ_EMPTY_SLOT) {
 588                        x = q->dep[0].next; /* get a free slot */
 589                        if (x >= SFQ_MAX_FLOWS) {
 590drop:                           sch->qstats.backlog -= qdisc_pkt_len(skb);
 591                                kfree_skb(skb);
 592                                dropped++;
 593                                continue;
 594                        }
 595                        q->ht[hash] = x;
 596                        slot = &q->slots[x];
 597                        slot->hash = hash;
 598                }
 599                if (slot->qlen >= q->maxdepth)
 600                        goto drop;
 601                slot_queue_add(slot, skb);
 602                if (q->red_parms)
 603                        slot->vars.qavg = red_calc_qavg(q->red_parms,
 604                                                        &slot->vars,
 605                                                        slot->backlog);
 606                slot->backlog += qdisc_pkt_len(skb);
 607                sfq_inc(q, x);
 608                if (slot->qlen == 1) {          /* The flow is new */
 609                        if (q->tail == NULL) {  /* It is the first flow */
 610                                slot->next = x;
 611                        } else {
 612                                slot->next = q->tail->next;
 613                                q->tail->next = x;
 614                        }
 615                        q->tail = slot;
 616                        slot->allot = q->scaled_quantum;
 617                }
 618        }
 619        sch->q.qlen -= dropped;
 620        qdisc_tree_decrease_qlen(sch, dropped);
 621}
 622
 623static void sfq_perturbation(unsigned long arg)
 624{
 625        struct Qdisc *sch = (struct Qdisc *)arg;
 626        struct sfq_sched_data *q = qdisc_priv(sch);
 627        spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
 628
 629        spin_lock(root_lock);
 630        q->perturbation = net_random();
 631        if (!q->filter_list && q->tail)
 632                sfq_rehash(sch);
 633        spin_unlock(root_lock);
 634
 635        if (q->perturb_period)
 636                mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
 637}
 638
 639static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
 640{
 641        struct sfq_sched_data *q = qdisc_priv(sch);
 642        struct tc_sfq_qopt *ctl = nla_data(opt);
 643        struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
 644        unsigned int qlen;
 645        struct red_parms *p = NULL;
 646
 647        if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
 648                return -EINVAL;
 649        if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
 650                ctl_v1 = nla_data(opt);
 651        if (ctl->divisor &&
 652            (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
 653                return -EINVAL;
 654        if (ctl_v1 && ctl_v1->qth_min) {
 655                p = kmalloc(sizeof(*p), GFP_KERNEL);
 656                if (!p)
 657                        return -ENOMEM;
 658        }
 659        sch_tree_lock(sch);
 660        if (ctl->quantum) {
 661                q->quantum = ctl->quantum;
 662                q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
 663        }
 664        q->perturb_period = ctl->perturb_period * HZ;
 665        if (ctl->flows)
 666                q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
 667        if (ctl->divisor) {
 668                q->divisor = ctl->divisor;
 669                q->maxflows = min_t(u32, q->maxflows, q->divisor);
 670        }
 671        if (ctl_v1) {
 672                if (ctl_v1->depth)
 673                        q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
 674                if (p) {
 675                        swap(q->red_parms, p);
 676                        red_set_parms(q->red_parms,
 677                                      ctl_v1->qth_min, ctl_v1->qth_max,
 678                                      ctl_v1->Wlog,
 679                                      ctl_v1->Plog, ctl_v1->Scell_log,
 680                                      NULL,
 681                                      ctl_v1->max_P);
 682                }
 683                q->flags = ctl_v1->flags;
 684                q->headdrop = ctl_v1->headdrop;
 685        }
 686        if (ctl->limit) {
 687                q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
 688                q->maxflows = min_t(u32, q->maxflows, q->limit);
 689        }
 690
 691        qlen = sch->q.qlen;
 692        while (sch->q.qlen > q->limit)
 693                sfq_drop(sch);
 694        qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
 695
 696        del_timer(&q->perturb_timer);
 697        if (q->perturb_period) {
 698                mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
 699                q->perturbation = net_random();
 700        }
 701        sch_tree_unlock(sch);
 702        kfree(p);
 703        return 0;
 704}
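
/* Example of how the limits above interact (illustrative numbers): a
 * request with flows = 1024 and divisor = 256 first clamps maxflows to
 * min(1024, SFQ_MAX_FLOWS) = 1024 and then to the divisor, giving 256;
 * with depth = 64 and limit = 100000 the final limit becomes
 * min(100000, 64 * 256) = 16384, and maxflows stays at min(256, 16384).
 * Packets queued beyond the new limit are dropped immediately under the
 * tree lock and reported upward via qdisc_tree_decrease_qlen().
 */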
 705
 706static void *sfq_alloc(size_t sz)
 707{
 708        void *ptr = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN);
 709
 710        if (!ptr)
 711                ptr = vmalloc(sz);
 712        return ptr;
 713}
 714
 715static void sfq_free(void *addr)
 716{
 717        if (addr) {
 718                if (is_vmalloc_addr(addr))
 719                        vfree(addr);
 720                else
 721                        kfree(addr);
 722        }
 723}
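
/* Why the vmalloc() fallback matters (illustrative sizing): with the
 * maximum divisor of 65536 the hash table alone needs
 * 65536 * sizeof(sfq_index) = 128KB, and a large 'flows' setting makes
 * the slots array far bigger, so a physically contiguous kmalloc() of
 * that size can fail on a fragmented machine.  vmalloc() trades
 * contiguity for reliability, and sfq_free() picks the matching release
 * path with is_vmalloc_addr().
 */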
 724
 725static void sfq_destroy(struct Qdisc *sch)
 726{
 727        struct sfq_sched_data *q = qdisc_priv(sch);
 728
 729        tcf_destroy_chain(&q->filter_list);
 730        q->perturb_period = 0;
 731        del_timer_sync(&q->perturb_timer);
 732        sfq_free(q->ht);
 733        sfq_free(q->slots);
 734        kfree(q->red_parms);
 735}
 736
 737static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
 738{
 739        struct sfq_sched_data *q = qdisc_priv(sch);
 740        int i;
 741
 742        q->perturb_timer.function = sfq_perturbation;
 743        q->perturb_timer.data = (unsigned long)sch;
 744        init_timer_deferrable(&q->perturb_timer);
 745
 746        for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
 747                q->dep[i].next = i + SFQ_MAX_FLOWS;
 748                q->dep[i].prev = i + SFQ_MAX_FLOWS;
 749        }
 750
 751        q->limit = SFQ_MAX_DEPTH;
 752        q->maxdepth = SFQ_MAX_DEPTH;
 753        q->cur_depth = 0;
 754        q->tail = NULL;
 755        q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
 756        q->maxflows = SFQ_DEFAULT_FLOWS;
 757        q->quantum = psched_mtu(qdisc_dev(sch));
 758        q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
 759        q->perturb_period = 0;
 760        q->perturbation = net_random();
 761
 762        if (opt) {
 763                int err = sfq_change(sch, opt);
 764                if (err)
 765                        return err;
 766        }
 767
 768        q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
 769        q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
 770        if (!q->ht || !q->slots) {
 771                sfq_destroy(sch);
 772                return -ENOMEM;
 773        }
 774        for (i = 0; i < q->divisor; i++)
 775                q->ht[i] = SFQ_EMPTY_SLOT;
 776
 777        for (i = 0; i < q->maxflows; i++) {
 778                slot_queue_init(&q->slots[i]);
 779                sfq_link(q, i);
 780        }
 781        if (q->limit >= 1)
 782                sch->flags |= TCQ_F_CAN_BYPASS;
 783        else
 784                sch->flags &= ~TCQ_F_CAN_BYPASS;
 785        return 0;
 786}
 787
 788static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
 789{
 790        struct sfq_sched_data *q = qdisc_priv(sch);
 791        unsigned char *b = skb_tail_pointer(skb);
 792        struct tc_sfq_qopt_v1 opt;
 793        struct red_parms *p = q->red_parms;
 794
 795        memset(&opt, 0, sizeof(opt));
 796        opt.v0.quantum  = q->quantum;
 797        opt.v0.perturb_period = q->perturb_period / HZ;
 798        opt.v0.limit    = q->limit;
 799        opt.v0.divisor  = q->divisor;
 800        opt.v0.flows    = q->maxflows;
 801        opt.depth       = q->maxdepth;
 802        opt.headdrop    = q->headdrop;
 803
 804        if (p) {
 805                opt.qth_min     = p->qth_min >> p->Wlog;
 806                opt.qth_max     = p->qth_max >> p->Wlog;
 807                opt.Wlog        = p->Wlog;
 808                opt.Plog        = p->Plog;
 809                opt.Scell_log   = p->Scell_log;
 810                opt.max_P       = p->max_P;
 811        }
 812        memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
 813        opt.flags       = q->flags;
 814
 815        if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
 816                goto nla_put_failure;
 817
 818        return skb->len;
 819
 820nla_put_failure:
 821        nlmsg_trim(skb, b);
 822        return -1;
 823}
 824
 825static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
 826{
 827        return NULL;
 828}
 829
 830static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
 831{
 832        return 0;
 833}
 834
 835static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
 836                              u32 classid)
 837{
 838        /* we cannot bypass queue discipline anymore */
 839        sch->flags &= ~TCQ_F_CAN_BYPASS;
 840        return 0;
 841}
 842
 843static void sfq_put(struct Qdisc *q, unsigned long cl)
 844{
 845}
 846
 847static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl)
 848{
 849        struct sfq_sched_data *q = qdisc_priv(sch);
 850
 851        if (cl)
 852                return NULL;
 853        return &q->filter_list;
 854}
 855
 856static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
 857                          struct sk_buff *skb, struct tcmsg *tcm)
 858{
 859        tcm->tcm_handle |= TC_H_MIN(cl);
 860        return 0;
 861}
 862
 863static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
 864                                struct gnet_dump *d)
 865{
 866        struct sfq_sched_data *q = qdisc_priv(sch);
 867        sfq_index idx = q->ht[cl - 1];
 868        struct gnet_stats_queue qs = { 0 };
 869        struct tc_sfq_xstats xstats = { 0 };
 870
 871        if (idx != SFQ_EMPTY_SLOT) {
 872                const struct sfq_slot *slot = &q->slots[idx];
 873
 874                xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
 875                qs.qlen = slot->qlen;
 876                qs.backlog = slot->backlog;
 877        }
 878        if (gnet_stats_copy_queue(d, &qs) < 0)
 879                return -1;
 880        return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
 881}
 882
 883static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
 884{
 885        struct sfq_sched_data *q = qdisc_priv(sch);
 886        unsigned int i;
 887
 888        if (arg->stop)
 889                return;
 890
 891        for (i = 0; i < q->divisor; i++) {
 892                if (q->ht[i] == SFQ_EMPTY_SLOT ||
 893                    arg->count < arg->skip) {
 894                        arg->count++;
 895                        continue;
 896                }
 897                if (arg->fn(sch, i + 1, arg) < 0) {
 898                        arg->stop = 1;
 899                        break;
 900                }
 901                arg->count++;
 902        }
 903}
 904
 905static const struct Qdisc_class_ops sfq_class_ops = {
 906        .leaf           =       sfq_leaf,
 907        .get            =       sfq_get,
 908        .put            =       sfq_put,
 909        .tcf_chain      =       sfq_find_tcf,
 910        .bind_tcf       =       sfq_bind,
 911        .unbind_tcf     =       sfq_put,
 912        .dump           =       sfq_dump_class,
 913        .dump_stats     =       sfq_dump_class_stats,
 914        .walk           =       sfq_walk,
 915};
 916
 917static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
 918        .cl_ops         =       &sfq_class_ops,
 919        .id             =       "sfq",
 920        .priv_size      =       sizeof(struct sfq_sched_data),
 921        .enqueue        =       sfq_enqueue,
 922        .dequeue        =       sfq_dequeue,
 923        .peek           =       qdisc_peek_dequeued,
 924        .drop           =       sfq_drop,
 925        .init           =       sfq_init,
 926        .reset          =       sfq_reset,
 927        .destroy        =       sfq_destroy,
 928        .change         =       NULL,
 929        .dump           =       sfq_dump,
 930        .owner          =       THIS_MODULE,
 931};
 932
 933static int __init sfq_module_init(void)
 934{
 935        return register_qdisc(&sfq_qdisc_ops);
 936}
 937static void __exit sfq_module_exit(void)
 938{
 939        unregister_qdisc(&sfq_qdisc_ops);
 940}
 941module_init(sfq_module_init)
 942module_exit(sfq_module_exit)
 943MODULE_LICENSE("GPL");
 944