linux/net/sched/sch_sfq.c
/*
 * net/sched/sch_sfq.c  Stochastic Fairness Queueing discipline.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/red.h>


/*      Stochastic Fairness Queuing algorithm.
        =======================================

        Source:
        Paul E. McKenney "Stochastic Fairness Queuing",
        IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.

        Paul E. McKenney "Stochastic Fairness Queuing",
        "Interworking: Research and Experience", v.2, 1991, p.113-131.


        See also:
        M. Shreedhar and George Varghese "Efficient Fair
        Queuing using Deficit Round Robin", Proc. SIGCOMM 95.


        This is not the thing that is usually called (W)FQ nowadays.
        It does not use any timestamp mechanism, but instead
        processes queues in round-robin order.

        ADVANTAGE:

        - It is very cheap. Both CPU and memory requirements are minimal.

        DRAWBACKS:

        - "Stochastic" -> It is not 100% fair.
        When hash collisions occur, several flows are considered as one.

        - "Round-robin" -> It introduces larger delays than virtual clock
        based schemes, and should not be used for isolating interactive
        traffic from non-interactive. This means that this scheduler
        should be used as a leaf of CBQ or P3, which place interactive traffic
        in a higher priority band.

        We still need true WFQ for top level CSZ, but using WFQ
        for the best effort traffic is absolutely pointless:
        SFQ is superior for this purpose.

        IMPLEMENTATION:
        This implementation limits:
        - maximal queue length per flow to 127 packets;
        - max mtu to 2^18-1;
        - max 65408 flows;
        - number of hash buckets to 65536.

        It is easy to increase these values, but not in flight.  */

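/* Example (illustrative; not part of the original comment): SFQ is usually
 * attached from user space with something like
 *
 *      tc qdisc add dev eth0 root handle 1: sfq perturb 10
 *
 * which keeps the defaults set up in sfq_init() below (128 flows, 1024 hash
 * buckets, a limit of 127 packets) and rehashes flows every 10 seconds via
 * the perturbation timer.
 */
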
#define SFQ_MAX_DEPTH           127 /* max number of packets per flow */
#define SFQ_DEFAULT_FLOWS       128
#define SFQ_MAX_FLOWS           (0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
#define SFQ_EMPTY_SLOT          0xffff
#define SFQ_DEFAULT_HASH_DIVISOR 1024

/* We use 16 bits to store allot, and want to handle packets up to 64K.
 * Scale allot by 8 (1<<3) so that no overflow occurs.
 */
#define SFQ_ALLOT_SHIFT         3
#define SFQ_ALLOT_SIZE(X)       DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
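/* For illustration: with SFQ_ALLOT_SHIFT == 3, a full sized 1514 byte frame
 * is charged DIV_ROUND_UP(1514, 8) == 190 scaled units, and a maximal 64K
 * packet costs 8192 units, which still fits comfortably in the 16 bit
 * signed 'allot' field of a slot.
 */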

/* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
typedef u16 sfq_index;

/*
 * To save space, we don't use pointers.
 * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
 * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
 * are 'pointers' to dep[] array
 */
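/* For example, a slot currently holding 5 packets is linked into the dep[5]
 * chain through index SFQ_MAX_FLOWS + 5 (see sfq_link() and sfq_dep_head()).
 */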
struct sfq_head {
        sfq_index       next;
        sfq_index       prev;
};

struct sfq_slot {
        struct sk_buff  *skblist_next;
        struct sk_buff  *skblist_prev;
        sfq_index       qlen; /* number of skbs in skblist */
        sfq_index       next; /* next slot in sfq RR chain */
        struct sfq_head dep; /* anchor in dep[] chains */
        unsigned short  hash; /* hash value (index in ht[]) */
        short           allot; /* credit for this slot */

        unsigned int    backlog;
        struct red_vars vars;
};

struct sfq_sched_data {
/* frequently used fields */
        int             limit;          /* limit of total number of packets in this qdisc */
        unsigned int    divisor;        /* number of slots in hash table */
        u8              headdrop;
        u8              maxdepth;       /* limit of packets per flow */

        u32             perturbation;
        u8              cur_depth;      /* depth of longest slot */
        u8              flags;
        unsigned short  scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
        struct tcf_proto __rcu *filter_list;
        struct tcf_block *block;
        sfq_index       *ht;            /* Hash table ('divisor' slots) */
        struct sfq_slot *slots;         /* Flows table ('maxflows' entries) */

        struct red_parms *red_parms;
        struct tc_sfqred_stats stats;
        struct sfq_slot *tail;          /* current slot in round */

        struct sfq_head dep[SFQ_MAX_DEPTH + 1];
                                        /* Linked lists of slots, indexed by depth
                                         * dep[0] : list of unused flows
                                         * dep[1] : list of flows with 1 packet
                                         * dep[X] : list of flows with X packets
                                         */

        unsigned int    maxflows;       /* number of flows in flows array */
        int             perturb_period;
        unsigned int    quantum;        /* Allotment per round: MUST BE >= MTU */
        struct timer_list perturb_timer;
};

/*
 * sfq_head are either in a sfq_slot or in dep[] array
 */
static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
{
        if (val < SFQ_MAX_FLOWS)
                return &q->slots[val].dep;
        return &q->dep[val - SFQ_MAX_FLOWS];
}

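/* Flow hash: skb_get_hash_perturb() folds the packet's flow keys together
 * with q->perturbation, and the result is masked down to one of the
 * 'divisor' buckets (sfq_change() enforces that divisor is a power of two).
 */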
static unsigned int sfq_hash(const struct sfq_sched_data *q,
                             const struct sk_buff *skb)
{
        return skb_get_hash_perturb(skb, q->perturbation) & (q->divisor - 1);
}

static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
                                 int *qerr)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        struct tcf_result res;
        struct tcf_proto *fl;
        int result;

        if (TC_H_MAJ(skb->priority) == sch->handle &&
            TC_H_MIN(skb->priority) > 0 &&
            TC_H_MIN(skb->priority) <= q->divisor)
                return TC_H_MIN(skb->priority);

        fl = rcu_dereference_bh(q->filter_list);
        if (!fl)
                return sfq_hash(q, skb) + 1;

        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
        result = tcf_classify(skb, fl, &res, false);
        if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
                switch (result) {
                case TC_ACT_STOLEN:
                case TC_ACT_QUEUED:
                case TC_ACT_TRAP:
                        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
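                        /* fall through */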
                case TC_ACT_SHOT:
                        return 0;
                }
#endif
                if (TC_H_MIN(res.classid) <= q->divisor)
                        return TC_H_MIN(res.classid);
        }
        return 0;
}

/*
 * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
 */
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
        sfq_index p, n;
        struct sfq_slot *slot = &q->slots[x];
        int qlen = slot->qlen;

        p = qlen + SFQ_MAX_FLOWS;
        n = q->dep[qlen].next;

        slot->dep.next = n;
        slot->dep.prev = p;

        q->dep[qlen].next = x;          /* sfq_dep_head(q, p)->next = x */
        sfq_dep_head(q, n)->prev = x;
}

#define sfq_unlink(q, x, n, p)                  \
        do {                                    \
                n = q->slots[x].dep.next;       \
                p = q->slots[x].dep.prev;       \
                sfq_dep_head(q, p)->next = n;   \
                sfq_dep_head(q, n)->prev = p;   \
        } while (0)


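/* sfq_dec()/sfq_inc() adjust a slot's qlen and relink it into the dep[]
 * list matching its new depth, tracking q->cur_depth so that sfq_drop()
 * can find the longest flow in O(1).
 */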
static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
        sfq_index p, n;
        int d;

        sfq_unlink(q, x, n, p);

        d = q->slots[x].qlen--;
        if (n == p && q->cur_depth == d)
                q->cur_depth--;
        sfq_link(q, x);
}

static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
        sfq_index p, n;
        int d;

        sfq_unlink(q, x, n, p);

        d = ++q->slots[x].qlen;
        if (q->cur_depth < d)
                q->cur_depth = d;
        sfq_link(q, x);
}

/* helper functions : might be changed when/if skb use a standard list_head */
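/* The slot itself acts as the list terminator: it is cast to
 * (struct sk_buff *), which relies on skblist_next/skblist_prev being the
 * first members of sfq_slot so they line up with an skb's next/prev.
 */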

/* remove one skb from tail of slot queue */
static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
{
        struct sk_buff *skb = slot->skblist_prev;

        slot->skblist_prev = skb->prev;
        skb->prev->next = (struct sk_buff *)slot;
        skb->next = skb->prev = NULL;
        return skb;
}

/* remove one skb from head of slot queue */
static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
{
        struct sk_buff *skb = slot->skblist_next;

        slot->skblist_next = skb->next;
        skb->next->prev = (struct sk_buff *)slot;
        skb->next = skb->prev = NULL;
        return skb;
}

static inline void slot_queue_init(struct sfq_slot *slot)
{
        memset(slot, 0, sizeof(*slot));
        slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
}

/* add skb to slot queue (tail add) */
static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
{
        skb->prev = slot->skblist_prev;
        skb->next = (struct sk_buff *)slot;
        slot->skblist_prev->next = skb;
        slot->skblist_prev = skb;
}

static unsigned int sfq_drop(struct Qdisc *sch)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        sfq_index x, d = q->cur_depth;
        struct sk_buff *skb;
        unsigned int len;
        struct sfq_slot *slot;

        /* Queue is full! Find the longest slot and drop tail packet from it */
        if (d > 1) {
                x = q->dep[d].next;
                slot = &q->slots[x];
drop:
                skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
                len = qdisc_pkt_len(skb);
                slot->backlog -= len;
                sfq_dec(q, x);
                sch->q.qlen--;
                qdisc_qstats_drop(sch);
                qdisc_qstats_backlog_dec(sch, skb);
                kfree_skb(skb);
                return len;
        }

        if (d == 1) {
                /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
                x = q->tail->next;
                slot = &q->slots[x];
                q->tail->next = slot->next;
                q->ht[slot->hash] = SFQ_EMPTY_SLOT;
                goto drop;
        }

        return 0;
}

/* Is ECN parameter configured */
static int sfq_prob_mark(const struct sfq_sched_data *q)
{
        return q->flags & TC_RED_ECN;
}

/* Should packets over max threshold just be marked */
static int sfq_hard_mark(const struct sfq_sched_data *q)
{
        return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
}

static int sfq_headdrop(const struct sfq_sched_data *q)
{
        return q->headdrop;
}

static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        unsigned int hash, dropped;
        sfq_index x, qlen;
        struct sfq_slot *slot;
        int uninitialized_var(ret);
        struct sk_buff *head;
        int delta;

        hash = sfq_classify(skb, sch, &ret);
        if (hash == 0) {
                if (ret & __NET_XMIT_BYPASS)
                        qdisc_qstats_drop(sch);
                kfree_skb(skb);
                return ret;
        }
        hash--;

        x = q->ht[hash];
        slot = &q->slots[x];
        if (x == SFQ_EMPTY_SLOT) {
                x = q->dep[0].next; /* get a free slot */
                if (x >= SFQ_MAX_FLOWS)
                        return qdisc_drop(skb, sch, to_free);
                q->ht[hash] = x;
                slot = &q->slots[x];
                slot->hash = hash;
                slot->backlog = 0; /* should already be 0 anyway... */
                red_set_vars(&slot->vars);
                goto enqueue;
        }
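        /* Optional per-flow RED/ECN: when RED parameters are configured,
         * compute the average backlog of this flow and either ECN-mark
         * (the queue head if headdrop is enabled, otherwise the arriving
         * packet) or drop, according to red_action().
         */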
        if (q->red_parms) {
                slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
                                                        &slot->vars,
                                                        slot->backlog);
                switch (red_action(q->red_parms,
                                   &slot->vars,
                                   slot->vars.qavg)) {
                case RED_DONT_MARK:
                        break;

                case RED_PROB_MARK:
                        qdisc_qstats_overlimit(sch);
                        if (sfq_prob_mark(q)) {
                                /* We know we have at least one packet in queue */
                                if (sfq_headdrop(q) &&
                                    INET_ECN_set_ce(slot->skblist_next)) {
                                        q->stats.prob_mark_head++;
                                        break;
                                }
                                if (INET_ECN_set_ce(skb)) {
                                        q->stats.prob_mark++;
                                        break;
                                }
                        }
                        q->stats.prob_drop++;
                        goto congestion_drop;

                case RED_HARD_MARK:
                        qdisc_qstats_overlimit(sch);
                        if (sfq_hard_mark(q)) {
                                /* We know we have at least one packet in queue */
                                if (sfq_headdrop(q) &&
                                    INET_ECN_set_ce(slot->skblist_next)) {
                                        q->stats.forced_mark_head++;
                                        break;
                                }
                                if (INET_ECN_set_ce(skb)) {
                                        q->stats.forced_mark++;
                                        break;
                                }
                        }
                        q->stats.forced_drop++;
                        goto congestion_drop;
                }
        }

        if (slot->qlen >= q->maxdepth) {
congestion_drop:
                if (!sfq_headdrop(q))
                        return qdisc_drop(skb, sch, to_free);

                /* We know we have at least one packet in queue */
                head = slot_dequeue_head(slot);
                delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
                sch->qstats.backlog -= delta;
                slot->backlog -= delta;
                qdisc_drop(head, sch, to_free);

                slot_queue_add(slot, skb);
                qdisc_tree_reduce_backlog(sch, 0, delta);
                return NET_XMIT_CN;
        }

enqueue:
        qdisc_qstats_backlog_inc(sch, skb);
        slot->backlog += qdisc_pkt_len(skb);
        slot_queue_add(slot, skb);
        sfq_inc(q, x);
        if (slot->qlen == 1) {          /* The flow is new */
                if (q->tail == NULL) {  /* It is the first flow */
                        slot->next = x;
                } else {
                        slot->next = q->tail->next;
                        q->tail->next = x;
                }
                /* We put this flow at the end of our flow list.
                 * This might sound unfair for a new flow to wait behind old ones,
                 * but we could otherwise end up servicing new flows only, and
                 * freezing the old ones.
                 */
                q->tail = slot;
                /* We could use a bigger initial quantum for new flows */
                slot->allot = q->scaled_quantum;
        }
        if (++sch->q.qlen <= q->limit)
                return NET_XMIT_SUCCESS;

        qlen = slot->qlen;
        dropped = sfq_drop(sch);
        /* Return Congestion Notification only if we dropped a packet
         * from this flow.
         */
        if (qlen != slot->qlen) {
                qdisc_tree_reduce_backlog(sch, 0, dropped - qdisc_pkt_len(skb));
                return NET_XMIT_CN;
        }

        /* As we dropped a packet, better let upper stack know this */
        qdisc_tree_reduce_backlog(sch, 1, dropped);
        return NET_XMIT_SUCCESS;
}

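/* Deficit round robin dequeue: each active slot gets scaled_quantum credit
 * per round; a slot whose allot has gone non-positive is skipped (and
 * recredited) until a slot with remaining credit is found, then one skb is
 * dequeued from its head and charged against the allot.
 */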
static struct sk_buff *
sfq_dequeue(struct Qdisc *sch)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        struct sk_buff *skb;
        sfq_index a, next_a;
        struct sfq_slot *slot;

        /* No active slots */
        if (q->tail == NULL)
                return NULL;

next_slot:
        a = q->tail->next;
        slot = &q->slots[a];
        if (slot->allot <= 0) {
                q->tail = slot;
                slot->allot += q->scaled_quantum;
                goto next_slot;
        }
        skb = slot_dequeue_head(slot);
        sfq_dec(q, a);
        qdisc_bstats_update(sch, skb);
        sch->q.qlen--;
        qdisc_qstats_backlog_dec(sch, skb);
        slot->backlog -= qdisc_pkt_len(skb);
        /* Is the slot empty? */
        if (slot->qlen == 0) {
                q->ht[slot->hash] = SFQ_EMPTY_SLOT;
                next_a = slot->next;
                if (a == next_a) {
                        q->tail = NULL; /* no more active slots */
                        return skb;
                }
                q->tail->next = next_a;
        } else {
                slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
        }
        return skb;
}

static void
sfq_reset(struct Qdisc *sch)
{
        struct sk_buff *skb;

        while ((skb = sfq_dequeue(sch)) != NULL)
                rtnl_kfree_skbs(skb, skb);
}

/*
 * When q->perturbation is changed, we rehash all queued skbs
 * to avoid OOO (Out Of Order) effects.
 * We don't use sfq_dequeue()/sfq_enqueue() because we don't want to change
 * counters.
 */
static void sfq_rehash(struct Qdisc *sch)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        struct sk_buff *skb;
        int i;
        struct sfq_slot *slot;
        struct sk_buff_head list;
        int dropped = 0;
        unsigned int drop_len = 0;

        __skb_queue_head_init(&list);

        for (i = 0; i < q->maxflows; i++) {
                slot = &q->slots[i];
                if (!slot->qlen)
                        continue;
                while (slot->qlen) {
                        skb = slot_dequeue_head(slot);
                        sfq_dec(q, i);
                        __skb_queue_tail(&list, skb);
                }
                slot->backlog = 0;
                red_set_vars(&slot->vars);
                q->ht[slot->hash] = SFQ_EMPTY_SLOT;
        }
        q->tail = NULL;

        while ((skb = __skb_dequeue(&list)) != NULL) {
                unsigned int hash = sfq_hash(q, skb);
                sfq_index x = q->ht[hash];

                slot = &q->slots[x];
                if (x == SFQ_EMPTY_SLOT) {
                        x = q->dep[0].next; /* get a free slot */
                        if (x >= SFQ_MAX_FLOWS) {
drop:
                                qdisc_qstats_backlog_dec(sch, skb);
                                drop_len += qdisc_pkt_len(skb);
                                kfree_skb(skb);
                                dropped++;
                                continue;
                        }
                        q->ht[hash] = x;
                        slot = &q->slots[x];
                        slot->hash = hash;
                }
                if (slot->qlen >= q->maxdepth)
                        goto drop;
                slot_queue_add(slot, skb);
                if (q->red_parms)
                        slot->vars.qavg = red_calc_qavg(q->red_parms,
                                                        &slot->vars,
                                                        slot->backlog);
                slot->backlog += qdisc_pkt_len(skb);
                sfq_inc(q, x);
                if (slot->qlen == 1) {          /* The flow is new */
                        if (q->tail == NULL) {  /* It is the first flow */
                                slot->next = x;
                        } else {
                                slot->next = q->tail->next;
                                q->tail->next = x;
                        }
                        q->tail = slot;
                        slot->allot = q->scaled_quantum;
                }
        }
        sch->q.qlen -= dropped;
        qdisc_tree_reduce_backlog(sch, dropped, drop_len);
}

static void sfq_perturbation(unsigned long arg)
{
        struct Qdisc *sch = (struct Qdisc *)arg;
        struct sfq_sched_data *q = qdisc_priv(sch);
        spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));

        spin_lock(root_lock);
        q->perturbation = prandom_u32();
        if (!q->filter_list && q->tail)
                sfq_rehash(sch);
        spin_unlock(root_lock);

        if (q->perturb_period)
                mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
}

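/* Parse a tc_sfq_qopt (optionally the larger tc_sfq_qopt_v1, which carries
 * depth, headdrop and RED/ECN parameters), apply it under the qdisc tree
 * lock, and drop packets if the new limit is below the current queue length.
 */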
static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        struct tc_sfq_qopt *ctl = nla_data(opt);
        struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
        unsigned int qlen, dropped = 0;
        struct red_parms *p = NULL;

        if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
                return -EINVAL;
        if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
                ctl_v1 = nla_data(opt);
        if (ctl->divisor &&
            (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
                return -EINVAL;
        if (ctl_v1 && ctl_v1->qth_min) {
                p = kmalloc(sizeof(*p), GFP_KERNEL);
                if (!p)
                        return -ENOMEM;
        }
        sch_tree_lock(sch);
        if (ctl->quantum) {
                q->quantum = ctl->quantum;
                q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
        }
        q->perturb_period = ctl->perturb_period * HZ;
        if (ctl->flows)
                q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
        if (ctl->divisor) {
                q->divisor = ctl->divisor;
                q->maxflows = min_t(u32, q->maxflows, q->divisor);
        }
        if (ctl_v1) {
                if (ctl_v1->depth)
                        q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
                if (p) {
                        swap(q->red_parms, p);
                        red_set_parms(q->red_parms,
                                      ctl_v1->qth_min, ctl_v1->qth_max,
                                      ctl_v1->Wlog,
                                      ctl_v1->Plog, ctl_v1->Scell_log,
                                      NULL,
                                      ctl_v1->max_P);
                }
                q->flags = ctl_v1->flags;
                q->headdrop = ctl_v1->headdrop;
        }
        if (ctl->limit) {
                q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
                q->maxflows = min_t(u32, q->maxflows, q->limit);
        }

        qlen = sch->q.qlen;
        while (sch->q.qlen > q->limit)
                dropped += sfq_drop(sch);
        qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);

        del_timer(&q->perturb_timer);
        if (q->perturb_period) {
                mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
                q->perturbation = prandom_u32();
        }
        sch_tree_unlock(sch);
        kfree(p);
        return 0;
}

static void *sfq_alloc(size_t sz)
{
        return  kvmalloc(sz, GFP_KERNEL);
}

static void sfq_free(void *addr)
{
        kvfree(addr);
}

static void sfq_destroy(struct Qdisc *sch)
{
        struct sfq_sched_data *q = qdisc_priv(sch);

        tcf_block_put(q->block);
        q->perturb_period = 0;
        del_timer_sync(&q->perturb_timer);
        sfq_free(q->ht);
        sfq_free(q->slots);
        kfree(q->red_parms);
}

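/* Set up defaults (limit and depth of 127 packets, 128 flows, 1024 hash
 * buckets, quantum = interface MTU), then apply any netlink options via
 * sfq_change() and allocate the hash and slot tables.
 */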
static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        int i;
        int err;

        setup_deferrable_timer(&q->perturb_timer, sfq_perturbation,
                               (unsigned long)sch);

        err = tcf_block_get(&q->block, &q->filter_list);
        if (err)
                return err;

        for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
                q->dep[i].next = i + SFQ_MAX_FLOWS;
                q->dep[i].prev = i + SFQ_MAX_FLOWS;
        }

        q->limit = SFQ_MAX_DEPTH;
        q->maxdepth = SFQ_MAX_DEPTH;
        q->cur_depth = 0;
        q->tail = NULL;
        q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
        q->maxflows = SFQ_DEFAULT_FLOWS;
        q->quantum = psched_mtu(qdisc_dev(sch));
        q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
        q->perturb_period = 0;
        q->perturbation = prandom_u32();

        if (opt) {
                int err = sfq_change(sch, opt);
                if (err)
                        return err;
        }

        q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
        q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
        if (!q->ht || !q->slots) {
                /* Note: sfq_destroy() will be called by our caller */
                return -ENOMEM;
        }

        for (i = 0; i < q->divisor; i++)
                q->ht[i] = SFQ_EMPTY_SLOT;

        for (i = 0; i < q->maxflows; i++) {
                slot_queue_init(&q->slots[i]);
                sfq_link(q, i);
        }
        if (q->limit >= 1)
                sch->flags |= TCQ_F_CAN_BYPASS;
        else
                sch->flags &= ~TCQ_F_CAN_BYPASS;
        return 0;
}

static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        unsigned char *b = skb_tail_pointer(skb);
        struct tc_sfq_qopt_v1 opt;
        struct red_parms *p = q->red_parms;

        memset(&opt, 0, sizeof(opt));
        opt.v0.quantum  = q->quantum;
        opt.v0.perturb_period = q->perturb_period / HZ;
        opt.v0.limit    = q->limit;
        opt.v0.divisor  = q->divisor;
        opt.v0.flows    = q->maxflows;
        opt.depth       = q->maxdepth;
        opt.headdrop    = q->headdrop;

        if (p) {
                opt.qth_min     = p->qth_min >> p->Wlog;
                opt.qth_max     = p->qth_max >> p->Wlog;
                opt.Wlog        = p->Wlog;
                opt.Plog        = p->Plog;
                opt.Scell_log   = p->Scell_log;
                opt.max_P       = p->max_P;
        }
        memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
        opt.flags       = q->flags;

        if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
                goto nla_put_failure;

        return skb->len;

nla_put_failure:
        nlmsg_trim(skb, b);
        return -1;
}

static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
{
        return NULL;
}

static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
{
        return 0;
}

static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
                              u32 classid)
{
        /* we cannot bypass queue discipline anymore */
        sch->flags &= ~TCQ_F_CAN_BYPASS;
        return 0;
}

static void sfq_put(struct Qdisc *q, unsigned long cl)
{
}

static struct tcf_block *sfq_tcf_block(struct Qdisc *sch, unsigned long cl)
{
        struct sfq_sched_data *q = qdisc_priv(sch);

        if (cl)
                return NULL;
        return q->block;
}

static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
                          struct sk_buff *skb, struct tcmsg *tcm)
{
        tcm->tcm_handle |= TC_H_MIN(cl);
        return 0;
}

static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
                                struct gnet_dump *d)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        sfq_index idx = q->ht[cl - 1];
        struct gnet_stats_queue qs = { 0 };
        struct tc_sfq_xstats xstats = { 0 };

        if (idx != SFQ_EMPTY_SLOT) {
                const struct sfq_slot *slot = &q->slots[idx];

                xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
                qs.qlen = slot->qlen;
                qs.backlog = slot->backlog;
        }
        if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
                return -1;
        return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}

static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        unsigned int i;

        if (arg->stop)
                return;

        for (i = 0; i < q->divisor; i++) {
                if (q->ht[i] == SFQ_EMPTY_SLOT ||
                    arg->count < arg->skip) {
                        arg->count++;
                        continue;
                }
                if (arg->fn(sch, i + 1, arg) < 0) {
                        arg->stop = 1;
                        break;
                }
                arg->count++;
        }
}

static const struct Qdisc_class_ops sfq_class_ops = {
        .leaf           =       sfq_leaf,
        .get            =       sfq_get,
        .put            =       sfq_put,
        .tcf_block      =       sfq_tcf_block,
        .bind_tcf       =       sfq_bind,
        .unbind_tcf     =       sfq_put,
        .dump           =       sfq_dump_class,
        .dump_stats     =       sfq_dump_class_stats,
        .walk           =       sfq_walk,
};

static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
        .cl_ops         =       &sfq_class_ops,
        .id             =       "sfq",
        .priv_size      =       sizeof(struct sfq_sched_data),
        .enqueue        =       sfq_enqueue,
        .dequeue        =       sfq_dequeue,
        .peek           =       qdisc_peek_dequeued,
        .init           =       sfq_init,
        .reset          =       sfq_reset,
        .destroy        =       sfq_destroy,
        .change         =       NULL,
        .dump           =       sfq_dump,
        .owner          =       THIS_MODULE,
};

static int __init sfq_module_init(void)
{
        return register_qdisc(&sfq_qdisc_ops);
}
static void __exit sfq_module_exit(void)
{
        unregister_qdisc(&sfq_qdisc_ops);
}
module_init(sfq_module_init)
module_exit(sfq_module_exit)
MODULE_LICENSE("GPL");