// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_sfq.c  Stochastic Fairness Queueing discipline.
 *
 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/siphash.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/red.h>


/*      Stochastic Fairness Queuing algorithm.
        =======================================

        Source:
        Paul E. McKenney "Stochastic Fairness Queuing",
        IEEE INFOCOM'90 Proceedings, San Francisco, 1990.

        Paul E. McKenney "Stochastic Fairness Queuing",
        "Interworking: Research and Experience", v.2, 1991, p.113-131.


        See also:
        M. Shreedhar and George Varghese "Efficient Fair
        Queuing using Deficit Round Robin", Proc. SIGCOMM 95.


        This is not the thing that is usually called (W)FQ nowadays.
        It does not use any timestamp mechanism, but instead
        processes queues in round-robin order.

        ADVANTAGE:

        - It is very cheap. Both CPU and memory requirements are minimal.

        DRAWBACKS:

        - "Stochastic" -> It is not 100% fair.
        When hash collisions occur, several flows are considered as one.

        - "Round-robin" -> It introduces larger delays than virtual clock
        based schemes, and should not be used for isolating interactive
        traffic from non-interactive. This means that this scheduler
        should be used as a leaf of CBQ or P3, which put interactive
        traffic into a higher priority band.

        We still need true WFQ for the top-level CSZ, but using WFQ
        for best-effort traffic is absolutely pointless:
        SFQ is superior for this purpose.

        IMPLEMENTATION:
        This implementation limits:
        - maximal queue length per flow to 127 packets,
        - max mtu to 2^18-1,
        - max 65408 flows,
        - number of hash buckets to 65536.

        It is easy to increase these values, but not in flight.  */

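/* Example (illustrative only; iproute2 syntax per tc-sfq(8)): this qdisc
 * is typically configured from user space with something like
 *
 *      tc qdisc add dev eth0 root sfq perturb 10
 *      tc qdisc add dev eth0 root sfq limit 3000 quantum 1514 \
 *              divisor 1024 flows 128 depth 127 headdrop
 *
 * where 'perturb' maps to perturb_period, 'quantum' to the per-round
 * byte allotment, and 'depth' to the per-flow packet cap below.
 */
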
#define SFQ_MAX_DEPTH           127 /* max number of packets per flow */
#define SFQ_DEFAULT_FLOWS       128
#define SFQ_MAX_FLOWS           (0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
#define SFQ_EMPTY_SLOT          0xffff
#define SFQ_DEFAULT_HASH_DIVISOR 1024

/* We use 16 bits to store allot, and want to handle packets up to 64K.
 * Scale allot by 8 (1<<3) so that no overflow occurs.
 */
#define SFQ_ALLOT_SHIFT         3
#define SFQ_ALLOT_SIZE(X)       DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
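
/* Example: for a quantum of 1514 bytes, SFQ_ALLOT_SIZE(1514) =
 * DIV_ROUND_UP(1514, 8) = 190 credit units, and even a 64KB packet
 * scales to DIV_ROUND_UP(65535, 8) = 8192, well within a short.
 */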

/* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
typedef u16 sfq_index;

/*
 * We don't use pointers, to save space.
 * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to the slots[] array,
 * while the following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
 * are 'pointers' into the dep[] array.
 */
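/* Example: with SFQ_MAX_FLOWS == 65408, index 70 means &slots[70].dep,
 * while index 65408 + 3 means &dep[3], the anchor of the list of flows
 * currently holding exactly 3 packets (see sfq_dep_head() below).
 */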
struct sfq_head {
        sfq_index       next;
        sfq_index       prev;
};

struct sfq_slot {
        struct sk_buff  *skblist_next;
        struct sk_buff  *skblist_prev;
        sfq_index       qlen; /* number of skbs in skblist */
        sfq_index       next; /* next slot in sfq RR chain */
        struct sfq_head dep; /* anchor in dep[] chains */
        unsigned short  hash; /* hash value (index in ht[]) */
        short           allot; /* credit for this slot */

        unsigned int    backlog;
        struct red_vars vars;
};

struct sfq_sched_data {
/* frequently used fields */
        int             limit;          /* limit of total number of packets in this qdisc */
        unsigned int    divisor;        /* number of slots in hash table */
        u8              headdrop;
        u8              maxdepth;       /* limit of packets per flow */

        siphash_key_t   perturbation;
        u8              cur_depth;      /* depth of longest slot */
        u8              flags;
        unsigned short  scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
        struct tcf_proto __rcu *filter_list;
        struct tcf_block *block;
        sfq_index       *ht;            /* Hash table ('divisor' slots) */
        struct sfq_slot *slots;         /* Flows table ('maxflows' entries) */

        struct red_parms *red_parms;
        struct tc_sfqred_stats stats;
        struct sfq_slot *tail;          /* current slot in round */

        struct sfq_head dep[SFQ_MAX_DEPTH + 1];
                                        /* Linked lists of slots, indexed by depth
                                         * dep[0] : list of unused flows
                                         * dep[1] : list of flows with 1 packet
                                         * dep[X] : list of flows with X packets
                                         */

        unsigned int    maxflows;       /* number of flows in flows array */
        int             perturb_period;
        unsigned int    quantum;        /* Allotment per round: MUST BE >= MTU */
        struct timer_list perturb_timer;
        struct Qdisc    *sch;
};
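
/* Invariant maintained by sfq_link()/sfq_unlink(): a slot holding N
 * packets always sits on the dep[N] list, so dep[0] doubles as the free
 * list and dep[cur_depth] yields a longest flow, which is what
 * sfq_drop() picks as its victim in O(1).
 */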

/*
 * An sfq_head is either embedded in an sfq_slot or in the dep[] array
 */
static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
{
        if (val < SFQ_MAX_FLOWS)
                return &q->slots[val].dep;
        return &q->dep[val - SFQ_MAX_FLOWS];
}

static unsigned int sfq_hash(const struct sfq_sched_data *q,
                             const struct sk_buff *skb)
{
        return skb_get_hash_perturb(skb, &q->perturbation) & (q->divisor - 1);
}
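
/* Because sfq_change() only accepts power-of-two divisors, the
 * '& (q->divisor - 1)' above is an exact modulo: with the default
 * divisor of 1024, only the low 10 bits of the hash are kept.
 */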

static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
                                 int *qerr)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        struct tcf_result res;
        struct tcf_proto *fl;
        int result;

        if (TC_H_MAJ(skb->priority) == sch->handle &&
            TC_H_MIN(skb->priority) > 0 &&
            TC_H_MIN(skb->priority) <= q->divisor)
                return TC_H_MIN(skb->priority);

        fl = rcu_dereference_bh(q->filter_list);
        if (!fl)
                return sfq_hash(q, skb) + 1;

        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
        result = tcf_classify(skb, fl, &res, false);
        if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
                switch (result) {
                case TC_ACT_STOLEN:
                case TC_ACT_QUEUED:
                case TC_ACT_TRAP:
                        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
                        fallthrough;
                case TC_ACT_SHOT:
                        return 0;
                }
#endif
                if (TC_H_MIN(res.classid) <= q->divisor)
                        return TC_H_MIN(res.classid);
        }
        return 0;
}

/*
 * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
 */
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
        sfq_index p, n;
        struct sfq_slot *slot = &q->slots[x];
        int qlen = slot->qlen;

        p = qlen + SFQ_MAX_FLOWS;
        n = q->dep[qlen].next;

        slot->dep.next = n;
        slot->dep.prev = p;

        q->dep[qlen].next = x;          /* sfq_dep_head(q, p)->next = x */
        sfq_dep_head(q, n)->prev = x;
}

#define sfq_unlink(q, x, n, p)                  \
        do {                                    \
                n = q->slots[x].dep.next;       \
                p = q->slots[x].dep.prev;       \
                sfq_dep_head(q, p)->next = n;   \
                sfq_dep_head(q, n)->prev = p;   \
        } while (0)


static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
        sfq_index p, n;
        int d;

        sfq_unlink(q, x, n, p);

        d = q->slots[x].qlen--;
        if (n == p && q->cur_depth == d)
                q->cur_depth--;
        sfq_link(q, x);
}

static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
        sfq_index p, n;
        int d;

        sfq_unlink(q, x, n, p);

        d = ++q->slots[x].qlen;
        if (q->cur_depth < d)
                q->cur_depth = d;
        sfq_link(q, x);
}
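
/* Example: when a third packet joins a flow, sfq_inc() unlinks the slot
 * from the dep[2] chain, bumps qlen to 3, relinks it on dep[3], and
 * raises cur_depth to 3 if no deeper flow exists; sfq_dec() performs
 * the mirror operation on dequeue or drop.
 */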

/* Helper functions: might be changed when/if skb uses a standard list_head */

/* remove one skb from tail of slot queue */
static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
{
        struct sk_buff *skb = slot->skblist_prev;

        slot->skblist_prev = skb->prev;
        skb->prev->next = (struct sk_buff *)slot;
        skb->next = skb->prev = NULL;
        return skb;
}

/* remove one skb from head of slot queue */
static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
{
        struct sk_buff *skb = slot->skblist_next;

        slot->skblist_next = skb->next;
        skb->next->prev = (struct sk_buff *)slot;
        skb->next = skb->prev = NULL;
        return skb;
}

static inline void slot_queue_init(struct sfq_slot *slot)
{
        memset(slot, 0, sizeof(*slot));
        slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
}

/* add skb to slot queue (tail add) */
static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
{
        skb->prev = slot->skblist_prev;
        skb->next = (struct sk_buff *)slot;
        slot->skblist_prev->next = skb;
        slot->skblist_prev = skb;
}
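
/* The casts above rely on struct sfq_slot starting with skblist_next
 * and skblist_prev, which overlay the next and prev pointers at the
 * start of struct sk_buff: the slot itself acts as the circular list
 * head, so an empty queue is skblist_next == (struct sk_buff *)slot.
 */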

static unsigned int sfq_drop(struct Qdisc *sch, struct sk_buff **to_free)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        sfq_index x, d = q->cur_depth;
        struct sk_buff *skb;
        unsigned int len;
        struct sfq_slot *slot;

        /* Queue is full! Find the longest slot and drop tail packet from it */
        if (d > 1) {
                x = q->dep[d].next;
                slot = &q->slots[x];
drop:
                skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
                len = qdisc_pkt_len(skb);
                slot->backlog -= len;
                sfq_dec(q, x);
                sch->q.qlen--;
                qdisc_qstats_backlog_dec(sch, skb);
                qdisc_drop(skb, sch, to_free);
                return len;
        }

        if (d == 1) {
                /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
                x = q->tail->next;
                slot = &q->slots[x];
                q->tail->next = slot->next;
                q->ht[slot->hash] = SFQ_EMPTY_SLOT;
                goto drop;
        }

        return 0;
}
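
/* With the default tail-drop policy the longest flow loses its newest
 * packet; with 'headdrop' it loses its oldest, which tends to signal
 * congestion to TCP senders sooner, since the following packet then
 * triggers duplicate ACKs.
 */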

/* Is the ECN parameter configured? */
static int sfq_prob_mark(const struct sfq_sched_data *q)
{
        return q->flags & TC_RED_ECN;
}

/* Should packets over the max threshold just be marked? */
static int sfq_hard_mark(const struct sfq_sched_data *q)
{
        return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
}

static int sfq_headdrop(const struct sfq_sched_data *q)
{
        return q->headdrop;
}

static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        unsigned int hash, dropped;
        sfq_index x, qlen;
        struct sfq_slot *slot;
        int ret;
        struct sk_buff *head;
        int delta;

        hash = sfq_classify(skb, sch, &ret);
        if (hash == 0) {
                if (ret & __NET_XMIT_BYPASS)
                        qdisc_qstats_drop(sch);
                __qdisc_drop(skb, to_free);
                return ret;
        }
        hash--;

        x = q->ht[hash];
        slot = &q->slots[x];
        if (x == SFQ_EMPTY_SLOT) {
                x = q->dep[0].next; /* get a free slot */
                if (x >= SFQ_MAX_FLOWS)
                        return qdisc_drop(skb, sch, to_free);
                q->ht[hash] = x;
                slot = &q->slots[x];
                slot->hash = hash;
                slot->backlog = 0; /* should already be 0 anyway... */
                red_set_vars(&slot->vars);
                goto enqueue;
        }
        if (q->red_parms) {
                slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
                                                        &slot->vars,
                                                        slot->backlog);
                switch (red_action(q->red_parms,
                                   &slot->vars,
                                   slot->vars.qavg)) {
                case RED_DONT_MARK:
                        break;

                case RED_PROB_MARK:
                        qdisc_qstats_overlimit(sch);
                        if (sfq_prob_mark(q)) {
                                /* We know we have at least one packet in queue */
                                if (sfq_headdrop(q) &&
                                    INET_ECN_set_ce(slot->skblist_next)) {
                                        q->stats.prob_mark_head++;
                                        break;
                                }
                                if (INET_ECN_set_ce(skb)) {
                                        q->stats.prob_mark++;
                                        break;
                                }
                        }
                        q->stats.prob_drop++;
                        goto congestion_drop;

                case RED_HARD_MARK:
                        qdisc_qstats_overlimit(sch);
                        if (sfq_hard_mark(q)) {
                                /* We know we have at least one packet in queue */
                                if (sfq_headdrop(q) &&
                                    INET_ECN_set_ce(slot->skblist_next)) {
                                        q->stats.forced_mark_head++;
                                        break;
                                }
                                if (INET_ECN_set_ce(skb)) {
                                        q->stats.forced_mark++;
                                        break;
                                }
                        }
                        q->stats.forced_drop++;
                        goto congestion_drop;
                }
        }

        if (slot->qlen >= q->maxdepth) {
congestion_drop:
                if (!sfq_headdrop(q))
                        return qdisc_drop(skb, sch, to_free);

                /* We know we have at least one packet in queue */
                head = slot_dequeue_head(slot);
                delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
                sch->qstats.backlog -= delta;
                slot->backlog -= delta;
                qdisc_drop(head, sch, to_free);

                slot_queue_add(slot, skb);
                qdisc_tree_reduce_backlog(sch, 0, delta);
                return NET_XMIT_CN;
        }

enqueue:
        qdisc_qstats_backlog_inc(sch, skb);
        slot->backlog += qdisc_pkt_len(skb);
        slot_queue_add(slot, skb);
        sfq_inc(q, x);
        if (slot->qlen == 1) {          /* The flow is new */
                if (q->tail == NULL) {  /* It is the first flow */
                        slot->next = x;
                } else {
                        slot->next = q->tail->next;
                        q->tail->next = x;
                }
                /* We put this flow at the end of our flow list.
                 * This might sound unfair for a new flow to wait after old ones,
                 * but we could end up servicing new flows only, and freeze old ones.
                 */
                q->tail = slot;
                /* We could use a bigger initial quantum for new flows */
                slot->allot = q->scaled_quantum;
        }
        if (++sch->q.qlen <= q->limit)
                return NET_XMIT_SUCCESS;

        qlen = slot->qlen;
        dropped = sfq_drop(sch, to_free);
        /* Return Congestion Notification only if we dropped a packet
         * from this flow.
         */
        if (qlen != slot->qlen) {
                qdisc_tree_reduce_backlog(sch, 0, dropped - qdisc_pkt_len(skb));
                return NET_XMIT_CN;
        }

        /* As we dropped a packet, better let upper stack know this */
        qdisc_tree_reduce_backlog(sch, 1, dropped);
        return NET_XMIT_SUCCESS;
}
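
/* Note the over-limit tail of sfq_enqueue() above: NET_XMIT_CN is
 * returned only when sfq_drop() happened to shorten the very flow that
 * just enqueued (its qlen changed); drops taken from other flows are
 * not a congestion signal for this sender.
 */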

static struct sk_buff *
sfq_dequeue(struct Qdisc *sch)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        struct sk_buff *skb;
        sfq_index a, next_a;
        struct sfq_slot *slot;

        /* No active slots */
        if (q->tail == NULL)
                return NULL;

next_slot:
        a = q->tail->next;
        slot = &q->slots[a];
        if (slot->allot <= 0) {
                q->tail = slot;
                slot->allot += q->scaled_quantum;
                goto next_slot;
        }
        skb = slot_dequeue_head(slot);
        sfq_dec(q, a);
        qdisc_bstats_update(sch, skb);
        sch->q.qlen--;
        qdisc_qstats_backlog_dec(sch, skb);
        slot->backlog -= qdisc_pkt_len(skb);
        /* Is the slot empty? */
        if (slot->qlen == 0) {
                q->ht[slot->hash] = SFQ_EMPTY_SLOT;
                next_a = slot->next;
                if (a == next_a) {
                        q->tail = NULL; /* no more active slots */
                        return skb;
                }
                q->tail->next = next_a;
        } else {
                slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
        }
        return skb;
}
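
/* Deficit round robin in numbers: with quantum 1514 (scaled_quantum
 * 190), dequeuing a 1514-byte packet costs SFQ_ALLOT_SIZE(1514) = 190
 * credits; once allot reaches 0 or below, the slot is rotated to the
 * tail of the round and its allot is replenished by scaled_quantum.
 */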

static void
sfq_reset(struct Qdisc *sch)
{
        struct sk_buff *skb;

        while ((skb = sfq_dequeue(sch)) != NULL)
                rtnl_kfree_skbs(skb, skb);
}

/*
 * When q->perturbation is changed, we rehash all queued skbs
 * to avoid OOO (Out Of Order) effects.
 * We don't use sfq_dequeue()/sfq_enqueue() because we don't want to change
 * counters.
 */
static void sfq_rehash(struct Qdisc *sch)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        struct sk_buff *skb;
        int i;
        struct sfq_slot *slot;
        struct sk_buff_head list;
        int dropped = 0;
        unsigned int drop_len = 0;

        __skb_queue_head_init(&list);

        for (i = 0; i < q->maxflows; i++) {
                slot = &q->slots[i];
                if (!slot->qlen)
                        continue;
                while (slot->qlen) {
                        skb = slot_dequeue_head(slot);
                        sfq_dec(q, i);
                        __skb_queue_tail(&list, skb);
                }
                slot->backlog = 0;
                red_set_vars(&slot->vars);
                q->ht[slot->hash] = SFQ_EMPTY_SLOT;
        }
        q->tail = NULL;

        while ((skb = __skb_dequeue(&list)) != NULL) {
                unsigned int hash = sfq_hash(q, skb);
                sfq_index x = q->ht[hash];

                slot = &q->slots[x];
                if (x == SFQ_EMPTY_SLOT) {
                        x = q->dep[0].next; /* get a free slot */
                        if (x >= SFQ_MAX_FLOWS) {
drop:
                                qdisc_qstats_backlog_dec(sch, skb);
                                drop_len += qdisc_pkt_len(skb);
                                kfree_skb(skb);
                                dropped++;
                                continue;
                        }
                        q->ht[hash] = x;
                        slot = &q->slots[x];
                        slot->hash = hash;
                }
                if (slot->qlen >= q->maxdepth)
                        goto drop;
                slot_queue_add(slot, skb);
                if (q->red_parms)
                        slot->vars.qavg = red_calc_qavg(q->red_parms,
                                                        &slot->vars,
                                                        slot->backlog);
                slot->backlog += qdisc_pkt_len(skb);
                sfq_inc(q, x);
                if (slot->qlen == 1) {          /* The flow is new */
                        if (q->tail == NULL) {  /* It is the first flow */
                                slot->next = x;
                        } else {
                                slot->next = q->tail->next;
                                q->tail->next = x;
                        }
                        q->tail = slot;
                        slot->allot = q->scaled_quantum;
                }
        }
        sch->q.qlen -= dropped;
        qdisc_tree_reduce_backlog(sch, dropped, drop_len);
}

static void sfq_perturbation(struct timer_list *t)
{
        struct sfq_sched_data *q = from_timer(q, t, perturb_timer);
        struct Qdisc *sch = q->sch;
        spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
        siphash_key_t nkey;

        get_random_bytes(&nkey, sizeof(nkey));
        spin_lock(root_lock);
        q->perturbation = nkey;
        if (!q->filter_list && q->tail)
                sfq_rehash(sch);
        spin_unlock(root_lock);

        if (q->perturb_period)
                mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
}

static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        struct tc_sfq_qopt *ctl = nla_data(opt);
        struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
        unsigned int qlen, dropped = 0;
        struct red_parms *p = NULL;
        struct sk_buff *to_free = NULL;
        struct sk_buff *tail = NULL;

        if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
                return -EINVAL;
        if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
                ctl_v1 = nla_data(opt);
        if (ctl->divisor &&
            (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
                return -EINVAL;

        /* slot->allot is a short, make sure quantum is not too big. */
        if (ctl->quantum) {
                unsigned int scaled = SFQ_ALLOT_SIZE(ctl->quantum);

                if (scaled <= 0 || scaled > SHRT_MAX)
                        return -EINVAL;
        }

        if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max,
                                        ctl_v1->Wlog))
                return -EINVAL;
        if (ctl_v1 && ctl_v1->qth_min) {
                p = kmalloc(sizeof(*p), GFP_KERNEL);
                if (!p)
                        return -ENOMEM;
        }
        sch_tree_lock(sch);
        if (ctl->quantum) {
                q->quantum = ctl->quantum;
                q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
        }
        q->perturb_period = ctl->perturb_period * HZ;
        if (ctl->flows)
                q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
        if (ctl->divisor) {
                q->divisor = ctl->divisor;
                q->maxflows = min_t(u32, q->maxflows, q->divisor);
        }
        if (ctl_v1) {
                if (ctl_v1->depth)
                        q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
                if (p) {
                        swap(q->red_parms, p);
                        red_set_parms(q->red_parms,
                                      ctl_v1->qth_min, ctl_v1->qth_max,
                                      ctl_v1->Wlog,
                                      ctl_v1->Plog, ctl_v1->Scell_log,
                                      NULL,
                                      ctl_v1->max_P);
                }
                q->flags = ctl_v1->flags;
                q->headdrop = ctl_v1->headdrop;
        }
        if (ctl->limit) {
                q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
                q->maxflows = min_t(u32, q->maxflows, q->limit);
        }

        qlen = sch->q.qlen;
        while (sch->q.qlen > q->limit) {
                dropped += sfq_drop(sch, &to_free);
                if (!tail)
                        tail = to_free;
        }

        rtnl_kfree_skbs(to_free, tail);
        qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);

        del_timer(&q->perturb_timer);
        if (q->perturb_period) {
                mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
                get_random_bytes(&q->perturbation, sizeof(q->perturbation));
        }
        sch_tree_unlock(sch);
        kfree(p);
        return 0;
}
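
/* Example of the clamping above: with the defaults (maxdepth 127,
 * maxflows 128) a requested limit is capped at 127 * 128 = 16256
 * packets, and maxflows never exceeds the limit or the divisor.
 */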

static void *sfq_alloc(size_t sz)
{
        return kvmalloc(sz, GFP_KERNEL);
}

static void sfq_free(void *addr)
{
        kvfree(addr);
}

static void sfq_destroy(struct Qdisc *sch)
{
        struct sfq_sched_data *q = qdisc_priv(sch);

        tcf_block_put(q->block);
        q->perturb_period = 0;
        del_timer_sync(&q->perturb_timer);
        sfq_free(q->ht);
        sfq_free(q->slots);
        kfree(q->red_parms);
}

static int sfq_init(struct Qdisc *sch, struct nlattr *opt,
                    struct netlink_ext_ack *extack)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        int i;
        int err;

        q->sch = sch;
        timer_setup(&q->perturb_timer, sfq_perturbation, TIMER_DEFERRABLE);

        err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
        if (err)
                return err;

        for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
                q->dep[i].next = i + SFQ_MAX_FLOWS;
                q->dep[i].prev = i + SFQ_MAX_FLOWS;
        }

        q->limit = SFQ_MAX_DEPTH;
        q->maxdepth = SFQ_MAX_DEPTH;
        q->cur_depth = 0;
        q->tail = NULL;
        q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
        q->maxflows = SFQ_DEFAULT_FLOWS;
        q->quantum = psched_mtu(qdisc_dev(sch));
        q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
        q->perturb_period = 0;
        get_random_bytes(&q->perturbation, sizeof(q->perturbation));

        if (opt) {
                int err = sfq_change(sch, opt);

                if (err)
                        return err;
        }

        q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
        q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
        if (!q->ht || !q->slots) {
                /* Note: sfq_destroy() will be called by our caller */
                return -ENOMEM;
        }

        for (i = 0; i < q->divisor; i++)
                q->ht[i] = SFQ_EMPTY_SLOT;

        for (i = 0; i < q->maxflows; i++) {
                slot_queue_init(&q->slots[i]);
                sfq_link(q, i);
        }
        if (q->limit >= 1)
                sch->flags |= TCQ_F_CAN_BYPASS;
        else
                sch->flags &= ~TCQ_F_CAN_BYPASS;
        return 0;
}

static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        unsigned char *b = skb_tail_pointer(skb);
        struct tc_sfq_qopt_v1 opt;
        struct red_parms *p = q->red_parms;

        memset(&opt, 0, sizeof(opt));
        opt.v0.quantum  = q->quantum;
        opt.v0.perturb_period = q->perturb_period / HZ;
        opt.v0.limit    = q->limit;
        opt.v0.divisor  = q->divisor;
        opt.v0.flows    = q->maxflows;
        opt.depth       = q->maxdepth;
        opt.headdrop    = q->headdrop;

        if (p) {
                opt.qth_min     = p->qth_min >> p->Wlog;
                opt.qth_max     = p->qth_max >> p->Wlog;
                opt.Wlog        = p->Wlog;
                opt.Plog        = p->Plog;
                opt.Scell_log   = p->Scell_log;
                opt.max_P       = p->max_P;
        }
        memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
        opt.flags       = q->flags;

        if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
                goto nla_put_failure;

        return skb->len;

nla_put_failure:
        nlmsg_trim(skb, b);
        return -1;
}

static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
{
        return NULL;
}

static unsigned long sfq_find(struct Qdisc *sch, u32 classid)
{
        return 0;
}

static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
                              u32 classid)
{
        return 0;
}

static void sfq_unbind(struct Qdisc *q, unsigned long cl)
{
}

static struct tcf_block *sfq_tcf_block(struct Qdisc *sch, unsigned long cl,
                                       struct netlink_ext_ack *extack)
{
        struct sfq_sched_data *q = qdisc_priv(sch);

        if (cl)
                return NULL;
        return q->block;
}

static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
                          struct sk_buff *skb, struct tcmsg *tcm)
{
        tcm->tcm_handle |= TC_H_MIN(cl);
        return 0;
}

static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
                                struct gnet_dump *d)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        sfq_index idx = q->ht[cl - 1];
        struct gnet_stats_queue qs = { 0 };
        struct tc_sfq_xstats xstats = { 0 };

        if (idx != SFQ_EMPTY_SLOT) {
                const struct sfq_slot *slot = &q->slots[idx];

                xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
                qs.qlen = slot->qlen;
                qs.backlog = slot->backlog;
        }
        if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
                return -1;
        return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}

static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        unsigned int i;

        if (arg->stop)
                return;

        for (i = 0; i < q->divisor; i++) {
                if (q->ht[i] == SFQ_EMPTY_SLOT ||
                    arg->count < arg->skip) {
                        arg->count++;
                        continue;
                }
                if (arg->fn(sch, i + 1, arg) < 0) {
                        arg->stop = 1;
                        break;
                }
                arg->count++;
        }
}

static const struct Qdisc_class_ops sfq_class_ops = {
        .leaf           =       sfq_leaf,
        .find           =       sfq_find,
        .tcf_block      =       sfq_tcf_block,
        .bind_tcf       =       sfq_bind,
        .unbind_tcf     =       sfq_unbind,
        .dump           =       sfq_dump_class,
        .dump_stats     =       sfq_dump_class_stats,
        .walk           =       sfq_walk,
};

static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
        .cl_ops         =       &sfq_class_ops,
        .id             =       "sfq",
        .priv_size      =       sizeof(struct sfq_sched_data),
        .enqueue        =       sfq_enqueue,
        .dequeue        =       sfq_dequeue,
        .peek           =       qdisc_peek_dequeued,
        .init           =       sfq_init,
        .reset          =       sfq_reset,
        .destroy        =       sfq_destroy,
        .change         =       NULL,
        .dump           =       sfq_dump,
        .owner          =       THIS_MODULE,
};

static int __init sfq_module_init(void)
{
        return register_qdisc(&sfq_qdisc_ops);
}

static void __exit sfq_module_exit(void)
{
        unregister_qdisc(&sfq_qdisc_ops);
}

module_init(sfq_module_init)
module_exit(sfq_module_exit)
MODULE_LICENSE("GPL");