/*
 * net/sched/sch_sfq.c  Stochastic Fairness Queueing discipline.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/red.h>


/*      Stochastic Fairness Queuing algorithm.
        =======================================

        Source:
        Paul E. McKenney "Stochastic Fairness Queuing",
        IEEE INFOCOM '90 Proceedings, San Francisco, 1990.

        Paul E. McKenney "Stochastic Fairness Queuing",
        "Interworking: Research and Experience", v.2, 1991, p.113-131.


        See also:
        M. Shreedhar and George Varghese "Efficient Fair
        Queuing using Deficit Round Robin", Proc. SIGCOMM '95.


        This is not the thing that is usually called (W)FQ nowadays.
        It does not use any timestamp mechanism, but instead
        processes queues in round-robin order.

        ADVANTAGE:

        - It is very cheap. Both CPU and memory requirements are minimal.

        DRAWBACKS:

        - "Stochastic" -> It is not 100% fair.
        When hash collisions occur, several flows are considered as one.

        - "Round-robin" -> It introduces larger delays than virtual clock
        based schemes, and should not be used for isolating interactive
        traffic from non-interactive traffic. This means that this scheduler
        should be used as a leaf of CBQ or P3, which puts interactive traffic
        into a higher priority band.

        We still need true WFQ for top level CSZ, but using WFQ
        for the best effort traffic is absolutely pointless:
        SFQ is superior for this purpose.

        IMPLEMENTATION:
        This implementation limits:
        - maximal queue length per flow to 127 packets;
        - max mtu to 2^18-1;
        - max 65408 flows;
        - number of hash buckets to 65536.

        It is easy to increase these values, but not in flight.  */

#define SFQ_MAX_DEPTH           127 /* max number of packets per flow */
#define SFQ_DEFAULT_FLOWS       128
#define SFQ_MAX_FLOWS           (0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
#define SFQ_EMPTY_SLOT          0xffff
#define SFQ_DEFAULT_HASH_DIVISOR 1024

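/*
 * For example: SFQ_MAX_FLOWS = 0x10000 - 127 - 1 = 65408, matching the
 * "max 65408 flows" limit quoted in the header comment. Together with
 * the SFQ_MAX_DEPTH + 1 dep[] anchors this gives 65408 + 128 = 65536
 * values, exactly the range of the 16-bit sfq_index defined below.
 */
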
/* We use 16 bits to store allot, and want to handle packets up to 64K.
 * Scale allot by 8 (1<<3) so that no overflow occurs.
 */
#define SFQ_ALLOT_SHIFT         3
#define SFQ_ALLOT_SIZE(X)       DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)

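/*
 * Worked example: SFQ_ALLOT_SIZE(65535) = DIV_ROUND_UP(65535, 8) = 8192
 * scaled units for the largest handled packet, comfortably inside the
 * signed 16-bit 'allot' field (max 32767); unscaled, byte counts near
 * 64K would not fit in 16 bits once a slot's credit goes negative.
 */
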
/* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
typedef u16 sfq_index;

/*
 * We don't use pointers to save space.
 * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
 * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
 * are 'pointers' to dep[] array
 */
struct sfq_head {
        sfq_index       next;
        sfq_index       prev;
};

struct sfq_slot {
        struct sk_buff  *skblist_next;
        struct sk_buff  *skblist_prev;
        sfq_index       qlen; /* number of skbs in skblist */
        sfq_index       next; /* next slot in sfq RR chain */
        struct sfq_head dep; /* anchor in dep[] chains */
        unsigned short  hash; /* hash value (index in ht[]) */
        short           allot; /* credit for this slot */

        unsigned int    backlog;
        struct red_vars vars;
};

struct sfq_sched_data {
/* frequently used fields */
        int             limit;          /* limit of total number of packets in this qdisc */
        unsigned int    divisor;        /* number of slots in hash table */
        u8              headdrop;
        u8              maxdepth;       /* limit of packets per flow */

        u32             perturbation;
        u8              cur_depth;      /* depth of longest slot */
        u8              flags;
        unsigned short  scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
        struct tcf_proto __rcu *filter_list;
        struct tcf_block *block;
        sfq_index       *ht;            /* Hash table ('divisor' slots) */
        struct sfq_slot *slots;         /* Flows table ('maxflows' entries) */

        struct red_parms *red_parms;
        struct tc_sfqred_stats stats;
        struct sfq_slot *tail;          /* current slot in round */

        struct sfq_head dep[SFQ_MAX_DEPTH + 1];
                                        /* Linked lists of slots, indexed by depth
                                         * dep[0] : list of unused flows
                                         * dep[1] : list of flows with 1 packet
                                         * dep[X] : list of flows with X packets
                                         */

        unsigned int    maxflows;       /* number of flows in flows array */
        int             perturb_period;
        unsigned int    quantum;        /* Allotment per round: MUST BE >= MTU */
        struct timer_list perturb_timer;
        struct Qdisc    *sch;
};

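/*
 * Rough sizing illustration (exact numbers depend on the architecture):
 * with the defaults set up in sfq_init(), ht[] takes
 * 1024 * sizeof(sfq_index) = 2KB and slots[] takes
 * 128 * sizeof(struct sfq_slot), a few KB more, which is what the
 * header comment means by "memory requirements are minimal".
 */
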
/*
 * sfq_head entries are either in an sfq_slot or in the dep[] array
 */
static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
{
        if (val < SFQ_MAX_FLOWS)
                return &q->slots[val].dep;
        return &q->dep[val - SFQ_MAX_FLOWS];
}
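
/*
 * Example of the index encoding (using the constants above): val = 5
 * resolves to &q->slots[5].dep, while val = SFQ_MAX_FLOWS + 3 = 65411
 * resolves to &q->dep[3], the anchor of the list of flows currently
 * holding exactly three packets.
 */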

static unsigned int sfq_hash(const struct sfq_sched_data *q,
                             const struct sk_buff *skb)
{
        return skb_get_hash_perturb(skb, q->perturbation) & (q->divisor - 1);
}
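
/*
 * The '& (q->divisor - 1)' masking only works because sfq_change()
 * rejects divisors that are not powers of two: with the default
 * divisor of 1024, for instance, the mask is 0x3ff and the bucket is
 * the low 10 bits of the perturbed flow hash.
 */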

static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
                                 int *qerr)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        struct tcf_result res;
        struct tcf_proto *fl;
        int result;

        if (TC_H_MAJ(skb->priority) == sch->handle &&
            TC_H_MIN(skb->priority) > 0 &&
            TC_H_MIN(skb->priority) <= q->divisor)
                return TC_H_MIN(skb->priority);

        fl = rcu_dereference_bh(q->filter_list);
        if (!fl)
                return sfq_hash(q, skb) + 1;

        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
        result = tcf_classify(skb, fl, &res, false);
        if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
                switch (result) {
                case TC_ACT_STOLEN:
                case TC_ACT_QUEUED:
                case TC_ACT_TRAP:
                        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
                        /* fall through */
                case TC_ACT_SHOT:
                        return 0;
                }
#endif
                if (TC_H_MIN(res.classid) <= q->divisor)
                        return TC_H_MIN(res.classid);
        }
        return 0;
}

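/*
 * Note on the fast path above: a packet whose skb->priority carries
 * this qdisc's major handle and a minor number in [1 .. divisor]
 * selects its bucket directly, skipping both the filters and the hash;
 * a return value of 0 tells sfq_enqueue() to drop the packet.
 */
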
/*
 * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
 */
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
        sfq_index p, n;
        struct sfq_slot *slot = &q->slots[x];
        int qlen = slot->qlen;

        p = qlen + SFQ_MAX_FLOWS;
        n = q->dep[qlen].next;

        slot->dep.next = n;
        slot->dep.prev = p;

        q->dep[qlen].next = x;          /* sfq_dep_head(q, p)->next = x */
        sfq_dep_head(q, n)->prev = x;
}

#define sfq_unlink(q, x, n, p)                  \
        do {                                    \
                n = q->slots[x].dep.next;       \
                p = q->slots[x].dep.prev;       \
                sfq_dep_head(q, p)->next = n;   \
                sfq_dep_head(q, n)->prev = p;   \
        } while (0)


static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
        sfq_index p, n;
        int d;

        sfq_unlink(q, x, n, p);

        d = q->slots[x].qlen--;
        if (n == p && q->cur_depth == d)
                q->cur_depth--;
        sfq_link(q, x);
}

static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
        sfq_index p, n;
        int d;

        sfq_unlink(q, x, n, p);

        d = ++q->slots[x].qlen;
        if (q->cur_depth < d)
                q->cur_depth = d;
        sfq_link(q, x);
}
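
/*
 * Illustration of the cur_depth bookkeeping: dep[d] chains every slot
 * holding exactly d packets, so after sfq_unlink() the test 'n == p'
 * in sfq_dec() means dep[d] just became empty; if d was also the
 * maximum depth, cur_depth drops by one. Conversely, sfq_inc() only
 * raises cur_depth when a slot outgrows the previous maximum.
 */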

/* helper functions: might be changed when/if skbs use a standard list_head */

/* remove one skb from tail of slot queue */
static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
{
        struct sk_buff *skb = slot->skblist_prev;

        slot->skblist_prev = skb->prev;
        skb->prev->next = (struct sk_buff *)slot;
        skb->next = skb->prev = NULL;
        return skb;
}

/* remove one skb from head of slot queue */
static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
{
        struct sk_buff *skb = slot->skblist_next;

        slot->skblist_next = skb->next;
        skb->next->prev = (struct sk_buff *)slot;
        skb->next = skb->prev = NULL;
        return skb;
}

static inline void slot_queue_init(struct sfq_slot *slot)
{
        memset(slot, 0, sizeof(*slot));
        slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
}

/* add skb to slot queue (tail add) */
static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
{
        skb->prev = slot->skblist_prev;
        skb->next = (struct sk_buff *)slot;
        slot->skblist_prev->next = skb;
        slot->skblist_prev = skb;
}
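
/*
 * These helpers treat the slot itself as the sentinel head of a
 * circular doubly-linked skb list: skblist_next/skblist_prev are laid
 * out like skb->next/skb->prev, so the (struct sk_buff *)slot casts
 * above hand back a fake node, the same trick a standard list_head
 * would otherwise hide.
 */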

static unsigned int sfq_drop(struct Qdisc *sch, struct sk_buff **to_free)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        sfq_index x, d = q->cur_depth;
        struct sk_buff *skb;
        unsigned int len;
        struct sfq_slot *slot;

        /* Queue is full! Find the longest slot and drop tail packet from it */
        if (d > 1) {
                x = q->dep[d].next;
                slot = &q->slots[x];
drop:
                skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
                len = qdisc_pkt_len(skb);
                slot->backlog -= len;
                sfq_dec(q, x);
                sch->q.qlen--;
                qdisc_qstats_backlog_dec(sch, skb);
                qdisc_drop(skb, sch, to_free);
                return len;
        }

        if (d == 1) {
                /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
                x = q->tail->next;
                slot = &q->slots[x];
                q->tail->next = slot->next;
                q->ht[slot->hash] = SFQ_EMPTY_SLOT;
                goto drop;
        }

        return 0;
}
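
/*
 * Example of the two branches above: while cur_depth > 1 the victim is
 * the first slot on dep[cur_depth], i.e. one of the longest flows, and
 * a single packet is shed from it; when cur_depth == 1 every active
 * flow holds one packet, so the slot after the current tail is emptied
 * and unlinked from the round-robin ring as well.
 */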

/* Is the ECN parameter configured? */
static int sfq_prob_mark(const struct sfq_sched_data *q)
{
        return q->flags & TC_RED_ECN;
}

/* Should packets over the max threshold just be marked? */
static int sfq_hard_mark(const struct sfq_sched_data *q)
{
        return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
}

static int sfq_headdrop(const struct sfq_sched_data *q)
{
        return q->headdrop;
}
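
/*
 * The resulting flag combinations: TC_RED_ECN alone marks at both the
 * probabilistic and the hard threshold; TC_RED_ECN | TC_RED_HARDDROP
 * marks probabilistically but still drops above qth_max; without
 * TC_RED_ECN, RED drops in both regimes.
 */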

static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        unsigned int hash, dropped;
        sfq_index x, qlen;
        struct sfq_slot *slot;
        int uninitialized_var(ret);
        struct sk_buff *head;
        int delta;

        hash = sfq_classify(skb, sch, &ret);
        if (hash == 0) {
                if (ret & __NET_XMIT_BYPASS)
                        qdisc_qstats_drop(sch);
                __qdisc_drop(skb, to_free);
                return ret;
        }
        hash--;

        x = q->ht[hash];
        slot = &q->slots[x];
        if (x == SFQ_EMPTY_SLOT) {
                x = q->dep[0].next; /* get a free slot */
                if (x >= SFQ_MAX_FLOWS)
                        return qdisc_drop(skb, sch, to_free);
                q->ht[hash] = x;
                slot = &q->slots[x];
                slot->hash = hash;
                slot->backlog = 0; /* should already be 0 anyway... */
                red_set_vars(&slot->vars);
                goto enqueue;
        }
        if (q->red_parms) {
                slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
                                                        &slot->vars,
                                                        slot->backlog);
                switch (red_action(q->red_parms,
                                   &slot->vars,
                                   slot->vars.qavg)) {
                case RED_DONT_MARK:
                        break;

                case RED_PROB_MARK:
                        qdisc_qstats_overlimit(sch);
                        if (sfq_prob_mark(q)) {
                                /* We know we have at least one packet in queue */
                                if (sfq_headdrop(q) &&
                                    INET_ECN_set_ce(slot->skblist_next)) {
                                        q->stats.prob_mark_head++;
                                        break;
                                }
                                if (INET_ECN_set_ce(skb)) {
                                        q->stats.prob_mark++;
                                        break;
                                }
                        }
                        q->stats.prob_drop++;
                        goto congestion_drop;

                case RED_HARD_MARK:
                        qdisc_qstats_overlimit(sch);
                        if (sfq_hard_mark(q)) {
                                /* We know we have at least one packet in queue */
                                if (sfq_headdrop(q) &&
                                    INET_ECN_set_ce(slot->skblist_next)) {
                                        q->stats.forced_mark_head++;
                                        break;
                                }
                                if (INET_ECN_set_ce(skb)) {
                                        q->stats.forced_mark++;
                                        break;
                                }
                        }
                        q->stats.forced_drop++;
                        goto congestion_drop;
                }
        }

        if (slot->qlen >= q->maxdepth) {
congestion_drop:
                if (!sfq_headdrop(q))
                        return qdisc_drop(skb, sch, to_free);

                /* We know we have at least one packet in queue */
                head = slot_dequeue_head(slot);
                delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
                sch->qstats.backlog -= delta;
                slot->backlog -= delta;
                qdisc_drop(head, sch, to_free);

                slot_queue_add(slot, skb);
                qdisc_tree_reduce_backlog(sch, 0, delta);
                return NET_XMIT_CN;
        }

enqueue:
        qdisc_qstats_backlog_inc(sch, skb);
        slot->backlog += qdisc_pkt_len(skb);
        slot_queue_add(slot, skb);
        sfq_inc(q, x);
        if (slot->qlen == 1) {          /* The flow is new */
                if (q->tail == NULL) {  /* It is the first flow */
                        slot->next = x;
                } else {
                        slot->next = q->tail->next;
                        q->tail->next = x;
                }
                /* We put this flow at the end of our flow list.
                 * It might sound unfair for a new flow to wait behind old ones,
                 * but we could end up servicing new flows only, and freeze old ones.
                 */
                q->tail = slot;
                /* We could use a bigger initial quantum for new flows */
                slot->allot = q->scaled_quantum;
        }
        if (++sch->q.qlen <= q->limit)
                return NET_XMIT_SUCCESS;

        qlen = slot->qlen;
        dropped = sfq_drop(sch, to_free);
        /* Return Congestion Notification only if we dropped a packet
         * from this flow.
         */
        if (qlen != slot->qlen) {
                qdisc_tree_reduce_backlog(sch, 0, dropped - qdisc_pkt_len(skb));
                return NET_XMIT_CN;
        }

        /* As we dropped a packet, better let upper stack know this */
        qdisc_tree_reduce_backlog(sch, 1, dropped);
        return NET_XMIT_SUCCESS;
}

static struct sk_buff *
sfq_dequeue(struct Qdisc *sch)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        struct sk_buff *skb;
        sfq_index a, next_a;
        struct sfq_slot *slot;

        /* No active slots */
        if (q->tail == NULL)
                return NULL;

next_slot:
        a = q->tail->next;
        slot = &q->slots[a];
        if (slot->allot <= 0) {
                q->tail = slot;
                slot->allot += q->scaled_quantum;
                goto next_slot;
        }
        skb = slot_dequeue_head(slot);
        sfq_dec(q, a);
        qdisc_bstats_update(sch, skb);
        sch->q.qlen--;
        qdisc_qstats_backlog_dec(sch, skb);
        slot->backlog -= qdisc_pkt_len(skb);
        /* Is the slot empty? */
        if (slot->qlen == 0) {
                q->ht[slot->hash] = SFQ_EMPTY_SLOT;
                next_a = slot->next;
                if (a == next_a) {
                        q->tail = NULL; /* no more active slots */
                        return skb;
                }
                q->tail->next = next_a;
        } else {
                slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
        }
        return skb;
}
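
/*
 * DRR accounting example: with the default quantum of one MTU (about
 * 1514 bytes on Ethernet), dequeuing a 1514-byte packet costs the slot
 * SFQ_ALLOT_SIZE(1514) = 190 scaled units; once 'allot' reaches zero
 * or below, the slot is rotated to the tail of the ring and refilled
 * with scaled_quantum, so each active flow sends roughly one quantum
 * of bytes per round.
 */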

static void
sfq_reset(struct Qdisc *sch)
{
        struct sk_buff *skb;

        while ((skb = sfq_dequeue(sch)) != NULL)
                rtnl_kfree_skbs(skb, skb);
}

/*
 * When q->perturbation is changed, we rehash all queued skbs
 * to avoid OOO (Out Of Order) effects.
 * We don't use sfq_dequeue()/sfq_enqueue() because we don't want to change
 * counters.
 */
static void sfq_rehash(struct Qdisc *sch)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        struct sk_buff *skb;
        int i;
        struct sfq_slot *slot;
        struct sk_buff_head list;
        int dropped = 0;
        unsigned int drop_len = 0;

        __skb_queue_head_init(&list);

        for (i = 0; i < q->maxflows; i++) {
                slot = &q->slots[i];
                if (!slot->qlen)
                        continue;
                while (slot->qlen) {
                        skb = slot_dequeue_head(slot);
                        sfq_dec(q, i);
                        __skb_queue_tail(&list, skb);
                }
                slot->backlog = 0;
                red_set_vars(&slot->vars);
                q->ht[slot->hash] = SFQ_EMPTY_SLOT;
        }
        q->tail = NULL;

        while ((skb = __skb_dequeue(&list)) != NULL) {
                unsigned int hash = sfq_hash(q, skb);
                sfq_index x = q->ht[hash];

                slot = &q->slots[x];
                if (x == SFQ_EMPTY_SLOT) {
                        x = q->dep[0].next; /* get a free slot */
                        if (x >= SFQ_MAX_FLOWS) {
drop:
                                qdisc_qstats_backlog_dec(sch, skb);
                                drop_len += qdisc_pkt_len(skb);
                                kfree_skb(skb);
                                dropped++;
                                continue;
                        }
                        q->ht[hash] = x;
                        slot = &q->slots[x];
                        slot->hash = hash;
                }
                if (slot->qlen >= q->maxdepth)
                        goto drop;
                slot_queue_add(slot, skb);
                if (q->red_parms)
                        slot->vars.qavg = red_calc_qavg(q->red_parms,
                                                        &slot->vars,
                                                        slot->backlog);
                slot->backlog += qdisc_pkt_len(skb);
                sfq_inc(q, x);
                if (slot->qlen == 1) {          /* The flow is new */
                        if (q->tail == NULL) {  /* It is the first flow */
                                slot->next = x;
                        } else {
                                slot->next = q->tail->next;
                                q->tail->next = x;
                        }
                        q->tail = slot;
                        slot->allot = q->scaled_quantum;
                }
        }
        sch->q.qlen -= dropped;
        qdisc_tree_reduce_backlog(sch, dropped, drop_len);
}

static void sfq_perturbation(struct timer_list *t)
{
        struct sfq_sched_data *q = from_timer(q, t, perturb_timer);
        struct Qdisc *sch = q->sch;
        spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));

        spin_lock(root_lock);
        q->perturbation = prandom_u32();
        if (!q->filter_list && q->tail)
                sfq_rehash(sch);
        spin_unlock(root_lock);

        if (q->perturb_period)
                mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
}

static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        struct tc_sfq_qopt *ctl = nla_data(opt);
        struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
        unsigned int qlen, dropped = 0;
        struct red_parms *p = NULL;
        struct sk_buff *to_free = NULL;
        struct sk_buff *tail = NULL;

        if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
                return -EINVAL;
        if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
                ctl_v1 = nla_data(opt);
        if (ctl->divisor &&
            (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
                return -EINVAL;
        if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max,
                                        ctl_v1->Wlog))
                return -EINVAL;
        if (ctl_v1 && ctl_v1->qth_min) {
                p = kmalloc(sizeof(*p), GFP_KERNEL);
                if (!p)
                        return -ENOMEM;
        }
        sch_tree_lock(sch);
        if (ctl->quantum) {
                q->quantum = ctl->quantum;
                q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
        }
        q->perturb_period = ctl->perturb_period * HZ;
        if (ctl->flows)
                q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
        if (ctl->divisor) {
                q->divisor = ctl->divisor;
                q->maxflows = min_t(u32, q->maxflows, q->divisor);
        }
        if (ctl_v1) {
                if (ctl_v1->depth)
                        q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
                if (p) {
                        swap(q->red_parms, p);
                        red_set_parms(q->red_parms,
                                      ctl_v1->qth_min, ctl_v1->qth_max,
                                      ctl_v1->Wlog,
                                      ctl_v1->Plog, ctl_v1->Scell_log,
                                      NULL,
                                      ctl_v1->max_P);
                }
                q->flags = ctl_v1->flags;
                q->headdrop = ctl_v1->headdrop;
        }
        if (ctl->limit) {
                q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
                q->maxflows = min_t(u32, q->maxflows, q->limit);
        }

        qlen = sch->q.qlen;
        while (sch->q.qlen > q->limit) {
                dropped += sfq_drop(sch, &to_free);
                if (!tail)
                        tail = to_free;
        }

        rtnl_kfree_skbs(to_free, tail);
        qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);

        del_timer(&q->perturb_timer);
        if (q->perturb_period) {
                mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
                q->perturbation = prandom_u32();
        }
        sch_tree_unlock(sch);
        kfree(p);
        return 0;
}
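
/*
 * For illustration, a typical command such as
 *      tc qdisc add dev eth0 root sfq perturb 10 quantum 3000 limit 64
 * reaches this function as a tc_sfq_qopt with perturb_period = 10,
 * quantum = 3000 and limit = 64; the longer tc_sfq_qopt_v1 layout
 * additionally carries depth, headdrop and the RED/ECN parameters
 * handled above.
 */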

static void *sfq_alloc(size_t sz)
{
        return kvmalloc(sz, GFP_KERNEL);
}

static void sfq_free(void *addr)
{
        kvfree(addr);
}

static void sfq_destroy(struct Qdisc *sch)
{
        struct sfq_sched_data *q = qdisc_priv(sch);

        tcf_block_put(q->block);
        q->perturb_period = 0;
        del_timer_sync(&q->perturb_timer);
        sfq_free(q->ht);
        sfq_free(q->slots);
        kfree(q->red_parms);
}

static int sfq_init(struct Qdisc *sch, struct nlattr *opt,
                    struct netlink_ext_ack *extack)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        int i;
        int err;

        q->sch = sch;
        timer_setup(&q->perturb_timer, sfq_perturbation, TIMER_DEFERRABLE);

        err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
        if (err)
                return err;

        for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
                q->dep[i].next = i + SFQ_MAX_FLOWS;
                q->dep[i].prev = i + SFQ_MAX_FLOWS;
        }

        q->limit = SFQ_MAX_DEPTH;
        q->maxdepth = SFQ_MAX_DEPTH;
        q->cur_depth = 0;
        q->tail = NULL;
        q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
        q->maxflows = SFQ_DEFAULT_FLOWS;
        q->quantum = psched_mtu(qdisc_dev(sch));
        q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
        q->perturb_period = 0;
        q->perturbation = prandom_u32();

        if (opt) {
                int err = sfq_change(sch, opt);

                if (err)
                        return err;
        }

        q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
        q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
        if (!q->ht || !q->slots) {
                /* Note: sfq_destroy() will be called by our caller */
                return -ENOMEM;
        }

        for (i = 0; i < q->divisor; i++)
                q->ht[i] = SFQ_EMPTY_SLOT;

        for (i = 0; i < q->maxflows; i++) {
                slot_queue_init(&q->slots[i]);
                sfq_link(q, i);
        }
        if (q->limit >= 1)
                sch->flags |= TCQ_F_CAN_BYPASS;
        else
                sch->flags &= ~TCQ_F_CAN_BYPASS;
        return 0;
}

static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        unsigned char *b = skb_tail_pointer(skb);
        struct tc_sfq_qopt_v1 opt;
        struct red_parms *p = q->red_parms;

        memset(&opt, 0, sizeof(opt));
        opt.v0.quantum  = q->quantum;
        opt.v0.perturb_period = q->perturb_period / HZ;
        opt.v0.limit    = q->limit;
        opt.v0.divisor  = q->divisor;
        opt.v0.flows    = q->maxflows;
        opt.depth       = q->maxdepth;
        opt.headdrop    = q->headdrop;

        if (p) {
                opt.qth_min     = p->qth_min >> p->Wlog;
                opt.qth_max     = p->qth_max >> p->Wlog;
                opt.Wlog        = p->Wlog;
                opt.Plog        = p->Plog;
                opt.Scell_log   = p->Scell_log;
                opt.max_P       = p->max_P;
        }
        memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
        opt.flags       = q->flags;

        if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
                goto nla_put_failure;

        return skb->len;

nla_put_failure:
        nlmsg_trim(skb, b);
        return -1;
}

static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
{
        return NULL;
}

static unsigned long sfq_find(struct Qdisc *sch, u32 classid)
{
        return 0;
}

static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
                              u32 classid)
{
        /* we cannot bypass queue discipline anymore */
        sch->flags &= ~TCQ_F_CAN_BYPASS;
        return 0;
}

static void sfq_unbind(struct Qdisc *q, unsigned long cl)
{
}

static struct tcf_block *sfq_tcf_block(struct Qdisc *sch, unsigned long cl,
                                       struct netlink_ext_ack *extack)
{
        struct sfq_sched_data *q = qdisc_priv(sch);

        if (cl)
                return NULL;
        return q->block;
}

static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
                          struct sk_buff *skb, struct tcmsg *tcm)
{
        tcm->tcm_handle |= TC_H_MIN(cl);
        return 0;
}

static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
                                struct gnet_dump *d)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        sfq_index idx = q->ht[cl - 1];
        struct gnet_stats_queue qs = { 0 };
        struct tc_sfq_xstats xstats = { 0 };

        if (idx != SFQ_EMPTY_SLOT) {
                const struct sfq_slot *slot = &q->slots[idx];

                xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
                qs.qlen = slot->qlen;
                qs.backlog = slot->backlog;
        }
        if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
                return -1;
        return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}

static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        unsigned int i;

        if (arg->stop)
                return;

        for (i = 0; i < q->divisor; i++) {
                if (q->ht[i] == SFQ_EMPTY_SLOT ||
                    arg->count < arg->skip) {
                        arg->count++;
                        continue;
                }
                if (arg->fn(sch, i + 1, arg) < 0) {
                        arg->stop = 1;
                        break;
                }
                arg->count++;
        }
}

static const struct Qdisc_class_ops sfq_class_ops = {
        .leaf           =       sfq_leaf,
        .find           =       sfq_find,
        .tcf_block      =       sfq_tcf_block,
        .bind_tcf       =       sfq_bind,
        .unbind_tcf     =       sfq_unbind,
        .dump           =       sfq_dump_class,
        .dump_stats     =       sfq_dump_class_stats,
        .walk           =       sfq_walk,
};

static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
        .cl_ops         =       &sfq_class_ops,
        .id             =       "sfq",
        .priv_size      =       sizeof(struct sfq_sched_data),
        .enqueue        =       sfq_enqueue,
        .dequeue        =       sfq_dequeue,
        .peek           =       qdisc_peek_dequeued,
        .init           =       sfq_init,
        .reset          =       sfq_reset,
        .destroy        =       sfq_destroy,
        .change         =       NULL,
        .dump           =       sfq_dump,
        .owner          =       THIS_MODULE,
};

static int __init sfq_module_init(void)
{
        return register_qdisc(&sfq_qdisc_ops);
}
static void __exit sfq_module_exit(void)
{
        unregister_qdisc(&sfq_qdisc_ops);
}
module_init(sfq_module_init)
module_exit(sfq_module_exit)
MODULE_LICENSE("GPL");