linux/net/sched/sch_netem.c
   1/*
   2 * net/sched/sch_netem.c        Network emulator
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License.
   8 *
   9 *              Many of the algorithms and ideas for this came from
   10 *              NIST Net, which is not copyrighted.
  11 *
  12 * Authors:     Stephen Hemminger <shemminger@osdl.org>
  13 *              Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
  14 */
  15
  16#include <linux/mm.h>
  17#include <linux/module.h>
  18#include <linux/slab.h>
  19#include <linux/types.h>
  20#include <linux/kernel.h>
  21#include <linux/errno.h>
  22#include <linux/skbuff.h>
  23#include <linux/vmalloc.h>
  24#include <linux/rtnetlink.h>
  25#include <linux/reciprocal_div.h>
  26#include <linux/rbtree.h>
  27
  28#include <net/netlink.h>
  29#include <net/pkt_sched.h>
  30#include <net/inet_ecn.h>
  31
  32#define VERSION "1.3"
  33
  34/*      Network Emulation Queuing algorithm.
  35        ====================================
  36
  37        Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
   38                 Network Emulation Tool"
  39                 [2] Luigi Rizzo, DummyNet for FreeBSD
  40
  41         ----------------------------------------------------------------
  42
  43         This started out as a simple way to delay outgoing packets to
  44         test TCP but has grown to include most of the functionality
  45         of a full blown network emulator like NISTnet. It can delay
  46         packets and add random jitter (and correlation). The random
  47         distribution can be loaded from a table as well to provide
  48         normal, Pareto, or experimental curves. Packet loss,
  49         duplication, and reordering can also be emulated.
  50
   51         This qdisc does not do classification; that can be handled by
   52         layering other disciplines.  It does not need to do bandwidth
   53         control either, since that can be handled by using token
   54         bucket or other rate control.
  55
  56     Correlated Loss Generator models
  57
   58        Added generation of correlated loss according to a 4-state Markov
   59        chain (the GI model) and to the 2-state "Gilbert-Elliot" model.
  60
  61        References:
  62        [1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
  63        [2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
  64        and intuitive loss model for packet networks and its implementation
  65        in the Netem module in the Linux kernel", available in [1]
  66
   67        Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
  68                 Fabio Ludovici <fabio.ludovici at yahoo.it>
  69*/
  70
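/* Illustrative usage (a sketch only: option spellings and availability
 * depend on the iproute2 version installed, and "eth0" is a placeholder
 * device).  The features described above are normally driven from
 * userspace with tc, roughly along these lines:
 *
 *   # 100ms delay, 10ms jitter with 25% correlation, using the precomputed
 *   # "normal" delay distribution table, plus 0.3% random loss:
 *   tc qdisc add dev eth0 root netem delay 100ms 10ms 25% \
 *           distribution normal loss 0.3%
 *
 *   # duplication, corruption and reordering:
 *   tc qdisc change dev eth0 root netem duplicate 1% corrupt 0.1% \
 *           delay 10ms reorder 25% 50%
 *
 *   # correlated loss via the 4-state (GI) or Gilbert-Elliot generators:
 *   tc qdisc change dev eth0 root netem loss state 1% 10% 70% 0.1%
 *   tc qdisc change dev eth0 root netem loss gemodel 1% 10% 70% 0.1%
 *
 *   # bandwidth emulation, either built in or (starting from a clean
 *   # device) by layering another discipline below netem, which is
 *   # classful with a single child slot:
 *   tc qdisc change dev eth0 root netem rate 5mbit delay 20ms
 *   tc qdisc add dev eth0 root handle 1: netem delay 50ms
 *   tc qdisc add dev eth0 parent 1:1 handle 10: tbf rate 256kbit \
 *           buffer 1600 limit 3000
 */
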
  71struct disttable {
  72        u32  size;
   73        s16 table[];
  74};
  75
  76struct netem_sched_data {
  77        /* internal t(ime)fifo qdisc uses t_root and sch->limit */
  78        struct rb_root t_root;
  79
  80        /* optional qdisc for classful handling (NULL at netem init) */
  81        struct Qdisc    *qdisc;
  82
  83        struct qdisc_watchdog watchdog;
  84
  85        s64 latency;
  86        s64 jitter;
  87
  88        u32 loss;
  89        u32 ecn;
  90        u32 limit;
  91        u32 counter;
  92        u32 gap;
  93        u32 duplicate;
  94        u32 reorder;
  95        u32 corrupt;
  96        u64 rate;
  97        s32 packet_overhead;
  98        u32 cell_size;
  99        struct reciprocal_value cell_size_reciprocal;
 100        s32 cell_overhead;
 101
 102        struct crndstate {
 103                u32 last;
 104                u32 rho;
 105        } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
 106
 107        struct disttable *delay_dist;
 108
 109        enum  {
 110                CLG_RANDOM,
 111                CLG_4_STATES,
 112                CLG_GILB_ELL,
 113        } loss_model;
 114
 115        enum {
 116                TX_IN_GAP_PERIOD = 1,
 117                TX_IN_BURST_PERIOD,
 118                LOST_IN_GAP_PERIOD,
 119                LOST_IN_BURST_PERIOD,
 120        } _4_state_model;
 121
 122        enum {
 123                GOOD_STATE = 1,
 124                BAD_STATE,
 125        } GE_state_model;
 126
 127        /* Correlated Loss Generation models */
 128        struct clgstate {
 129                /* state of the Markov chain */
 130                u8 state;
 131
 132                /* 4-states and Gilbert-Elliot models */
 133                u32 a1; /* p13 for 4-states or p for GE */
 134                u32 a2; /* p31 for 4-states or r for GE */
 135                u32 a3; /* p32 for 4-states or h for GE */
 136                u32 a4; /* p14 for 4-states or 1-k for GE */
 137                u32 a5; /* p23 used only in 4-states */
 138        } clg;
 139
 140        struct tc_netem_slot slot_config;
 141        struct slotstate {
 142                u64 slot_next;
 143                s32 packets_left;
 144                s32 bytes_left;
 145        } slot;
 146
 147        struct disttable *slot_dist;
 148};
 149
 150/* Time stamp put into socket buffer control block
 151 * Only valid when skbs are in our internal t(ime)fifo queue.
 152 *
  153 * As skb->rbnode uses the same storage as skb->next, skb->prev and skb->tstamp,
 154 * and skb->next & skb->prev are scratch space for a qdisc,
 155 * we save skb->tstamp value in skb->cb[] before destroying it.
 156 */
 157struct netem_skb_cb {
 158        u64             time_to_send;
 159};
 160
 161static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
 162{
 163        /* we assume we can use skb next/prev/tstamp as storage for rb_node */
 164        qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
 165        return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
 166}
 167
 168/* init_crandom - initialize correlated random number generator
 169 * Use entropy source for initial seed.
 170 */
 171static void init_crandom(struct crndstate *state, unsigned long rho)
 172{
 173        state->rho = rho;
 174        state->last = prandom_u32();
 175}
 176
 177/* get_crandom - correlated random number generator
 178 * Next number depends on last value.
 179 * rho is scaled to avoid floating point.
 180 */
 181static u32 get_crandom(struct crndstate *state)
 182{
 183        u64 value, rho;
 184        unsigned long answer;
 185
 186        if (!state || state->rho == 0)  /* no correlation */
 187                return prandom_u32();
 188
 189        value = prandom_u32();
 190        rho = (u64)state->rho + 1;
 191        answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
 192        state->last = answer;
 193        return answer;
 194}
 195
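/* A sketch of the arithmetic above, with illustrative numbers: rho holds the
 * correlation scaled to the 32-bit range (as programmed via init_crandom()
 * from the netlink correlation values), so the update is a fixed-point
 * weighted average renormalized by the final >> 32:
 *
 *   answer ~= (1 - c) * value + c * last,   where c = rho / 2^32
 *
 * e.g. a 25% correlation gives rho ~= 0x40000000, so each output keeps about
 * a quarter of the previous value and draws the rest from the fresh
 * pseudo-random number.
 */
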
 196/* loss_4state - 4-state model loss generator
 197 * Generates losses according to the 4-state Markov chain adopted in
 198 * the GI (General and Intuitive) loss model.
 199 */
 200static bool loss_4state(struct netem_sched_data *q)
 201{
 202        struct clgstate *clg = &q->clg;
 203        u32 rnd = prandom_u32();
 204
 205        /*
 206         * Makes a comparison between rnd and the transition
 207         * probabilities outgoing from the current state, then decides the
 208         * next state and if the next packet has to be transmitted or lost.
 209         * The four states correspond to:
 210         *   TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period
 211         *   LOST_IN_BURST_PERIOD => isolated losses within a gap period
 212         *   LOST_IN_GAP_PERIOD => lost packets within a burst period
  213         *   TX_IN_BURST_PERIOD => successfully transmitted packets within a burst period
 214         */
 215        switch (clg->state) {
 216        case TX_IN_GAP_PERIOD:
 217                if (rnd < clg->a4) {
 218                        clg->state = LOST_IN_BURST_PERIOD;
 219                        return true;
 220                } else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
 221                        clg->state = LOST_IN_GAP_PERIOD;
 222                        return true;
 223                } else if (clg->a1 + clg->a4 < rnd) {
 224                        clg->state = TX_IN_GAP_PERIOD;
 225                }
 226
 227                break;
 228        case TX_IN_BURST_PERIOD:
 229                if (rnd < clg->a5) {
 230                        clg->state = LOST_IN_GAP_PERIOD;
 231                        return true;
 232                } else {
 233                        clg->state = TX_IN_BURST_PERIOD;
 234                }
 235
 236                break;
 237        case LOST_IN_GAP_PERIOD:
  238                if (rnd < clg->a3) {
  239                        clg->state = TX_IN_BURST_PERIOD;
  240                } else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
 241                        clg->state = TX_IN_GAP_PERIOD;
 242                } else if (clg->a2 + clg->a3 < rnd) {
 243                        clg->state = LOST_IN_GAP_PERIOD;
 244                        return true;
 245                }
 246                break;
 247        case LOST_IN_BURST_PERIOD:
 248                clg->state = TX_IN_GAP_PERIOD;
 249                break;
 250        }
 251
 252        return false;
 253}
 254
 255/* loss_gilb_ell - Gilbert-Elliot model loss generator
 256 * Generates losses according to the Gilbert-Elliot loss model or
 257 * its special cases  (Gilbert or Simple Gilbert)
 258 *
 259 * Makes a comparison between random number and the transition
 260 * probabilities outgoing from the current state, then decides the
 261 * next state. A second random number is extracted and the comparison
 262 * with the loss probability of the current state decides if the next
 263 * packet will be transmitted or lost.
 264 */
 265static bool loss_gilb_ell(struct netem_sched_data *q)
 266{
 267        struct clgstate *clg = &q->clg;
 268
 269        switch (clg->state) {
 270        case GOOD_STATE:
 271                if (prandom_u32() < clg->a1)
 272                        clg->state = BAD_STATE;
 273                if (prandom_u32() < clg->a4)
 274                        return true;
 275                break;
 276        case BAD_STATE:
 277                if (prandom_u32() < clg->a2)
 278                        clg->state = GOOD_STATE;
 279                if (prandom_u32() > clg->a3)
 280                        return true;
 281        }
 282
 283        return false;
 284}
 285
 286static bool loss_event(struct netem_sched_data *q)
 287{
 288        switch (q->loss_model) {
 289        case CLG_RANDOM:
 290                /* Random packet drop 0 => none, ~0 => all */
 291                return q->loss && q->loss >= get_crandom(&q->loss_cor);
 292
 293        case CLG_4_STATES:
  294                /* 4-state loss model algorithm (used also for the GI model).
  295                 * Extracts a decision from the 4-state Markov loss
  296                 * generator; a true return value means the current
  297                 * packet is dropped.
  298                 */
 299                return loss_4state(q);
 300
 301        case CLG_GILB_ELL:
  302                /* Gilbert-Elliot loss model algorithm.
  303                 * Extracts a decision from the Gilbert-Elliot loss
  304                 * generator; a true return value means the current
  305                 * packet is dropped.
  306                 */
 307                return loss_gilb_ell(q);
 308        }
 309
 310        return false;   /* not reached */
 311}
 312
 313
 314/* tabledist - return a pseudo-randomly distributed value with mean mu and
 315 * std deviation sigma.  Uses table lookup to approximate the desired
 316 * distribution, and a uniformly-distributed pseudo-random source.
 317 */
 318static s64 tabledist(s64 mu, s32 sigma,
 319                     struct crndstate *state,
 320                     const struct disttable *dist)
 321{
 322        s64 x;
 323        long t;
 324        u32 rnd;
 325
 326        if (sigma == 0)
 327                return mu;
 328
 329        rnd = get_crandom(state);
 330
 331        /* default uniform distribution */
 332        if (dist == NULL)
 333                return ((rnd % (2 * sigma)) + mu) - sigma;
 334
 335        t = dist->table[rnd % dist->size];
 336        x = (sigma % NETEM_DIST_SCALE) * t;
 337        if (x >= 0)
 338                x += NETEM_DIST_SCALE/2;
 339        else
 340                x -= NETEM_DIST_SCALE/2;
 341
 342        return  x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
 343}
 344
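/* A worked example of the fixed-point math above (illustrative values,
 * assuming NETEM_DIST_SCALE is 8192 as in the pkt_sched UAPI header): the
 * table entry t is a signed sample of the normalized distribution, so the
 * result is approximately
 *
 *   mu + t * sigma / NETEM_DIST_SCALE
 *
 * with sigma split into quotient and remainder to preserve precision in
 * integer arithmetic.  For mu = 100ms, sigma = 10ms and t = 4096 (half the
 * scale) this comes out to about 105ms.
 */
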
 345static u64 packet_time_ns(u64 len, const struct netem_sched_data *q)
 346{
 347        len += q->packet_overhead;
 348
 349        if (q->cell_size) {
 350                u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);
 351
 352                if (len > cells * q->cell_size) /* extra cell needed for remainder */
 353                        cells++;
 354                len = cells * (q->cell_size + q->cell_overhead);
 355        }
 356
 357        return div64_u64(len * NSEC_PER_SEC, q->rate);
 358}
 359
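/* Illustrative numbers for the transmission-time calculation above (q->rate
 * is in bytes per second as programmed by tc): at 1 Mbit/s, i.e. 125000
 * bytes/s, a 1000 byte packet with no overheads takes
 * 1000 * NSEC_PER_SEC / 125000 = 8,000,000 ns.  If, say, an ATM-like
 * cell_size of 48 and cell_overhead of 5 were configured, the packet would
 * be rounded up to 21 cells of 53 bytes (1113 bytes), about 8,904,000 ns.
 */
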
 360static void tfifo_reset(struct Qdisc *sch)
 361{
 362        struct netem_sched_data *q = qdisc_priv(sch);
 363        struct rb_node *p = rb_first(&q->t_root);
 364
 365        while (p) {
 366                struct sk_buff *skb = rb_to_skb(p);
 367
 368                p = rb_next(p);
 369                rb_erase(&skb->rbnode, &q->t_root);
 370                rtnl_kfree_skbs(skb, skb);
 371        }
 372}
 373
 374static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
 375{
 376        struct netem_sched_data *q = qdisc_priv(sch);
 377        u64 tnext = netem_skb_cb(nskb)->time_to_send;
 378        struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
 379
 380        while (*p) {
 381                struct sk_buff *skb;
 382
 383                parent = *p;
 384                skb = rb_to_skb(parent);
 385                if (tnext >= netem_skb_cb(skb)->time_to_send)
 386                        p = &parent->rb_right;
 387                else
 388                        p = &parent->rb_left;
 389        }
 390        rb_link_node(&nskb->rbnode, parent, p);
 391        rb_insert_color(&nskb->rbnode, &q->t_root);
 392        sch->q.qlen++;
 393}
 394
  395/* netem can't properly corrupt a megapacket (like we get from GSO), so
  396 * when we statistically choose to corrupt one, we instead segment it, returning
 397 * the first packet to be corrupted, and re-enqueue the remaining frames
 398 */
 399static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
 400                                     struct sk_buff **to_free)
 401{
 402        struct sk_buff *segs;
 403        netdev_features_t features = netif_skb_features(skb);
 404
 405        segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
 406
 407        if (IS_ERR_OR_NULL(segs)) {
 408                qdisc_drop(skb, sch, to_free);
 409                return NULL;
 410        }
 411        consume_skb(skb);
 412        return segs;
 413}
 414
 415static void netem_enqueue_skb_head(struct qdisc_skb_head *qh, struct sk_buff *skb)
 416{
 417        skb->next = qh->head;
 418
 419        if (!qh->head)
 420                qh->tail = skb;
 421        qh->head = skb;
 422        qh->qlen++;
 423}
 424
 425/*
 426 * Insert one skb into qdisc.
 427 * Note: parent depends on return value to account for queue length.
 428 *      NET_XMIT_DROP: queue length didn't change.
 429 *      NET_XMIT_SUCCESS: one skb was queued.
 430 */
 431static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 432                         struct sk_buff **to_free)
 433{
 434        struct netem_sched_data *q = qdisc_priv(sch);
 435        /* We don't fill cb now as skb_unshare() may invalidate it */
 436        struct netem_skb_cb *cb;
 437        struct sk_buff *skb2;
 438        struct sk_buff *segs = NULL;
 439        unsigned int len = 0, last_len, prev_len = qdisc_pkt_len(skb);
 440        int nb = 0;
 441        int count = 1;
 442        int rc = NET_XMIT_SUCCESS;
 443
 444        /* Random duplication */
 445        if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
 446                ++count;
 447
 448        /* Drop packet? */
 449        if (loss_event(q)) {
 450                if (q->ecn && INET_ECN_set_ce(skb))
 451                        qdisc_qstats_drop(sch); /* mark packet */
 452                else
 453                        --count;
 454        }
 455        if (count == 0) {
 456                qdisc_qstats_drop(sch);
 457                __qdisc_drop(skb, to_free);
 458                return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 459        }
 460
 461        /* If a delay is expected, orphan the skb. (orphaning usually takes
 462         * place at TX completion time, so _before_ the link transit delay)
 463         */
 464        if (q->latency || q->jitter || q->rate)
 465                skb_orphan_partial(skb);
 466
 467        /*
 468         * If we need to duplicate packet, then re-insert at top of the
 469         * qdisc tree, since parent queuer expects that only one
 470         * skb will be queued.
 471         */
 472        if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
 473                struct Qdisc *rootq = qdisc_root(sch);
 474                u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
 475
 476                q->duplicate = 0;
 477                rootq->enqueue(skb2, rootq, to_free);
 478                q->duplicate = dupsave;
 479        }
 480
 481        /*
 482         * Randomized packet corruption.
 483         * Make copy if needed since we are modifying
 484         * If packet is going to be hardware checksummed, then
 485         * do it now in software before we mangle it.
 486         */
 487        if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
 488                if (skb_is_gso(skb)) {
 489                        segs = netem_segment(skb, sch, to_free);
 490                        if (!segs)
 491                                return NET_XMIT_DROP;
 492                } else {
 493                        segs = skb;
 494                }
 495
 496                skb = segs;
 497                segs = segs->next;
 498
 499                skb = skb_unshare(skb, GFP_ATOMIC);
 500                if (unlikely(!skb)) {
 501                        qdisc_qstats_drop(sch);
 502                        goto finish_segs;
 503                }
 504                if (skb->ip_summed == CHECKSUM_PARTIAL &&
 505                    skb_checksum_help(skb)) {
 506                        qdisc_drop(skb, sch, to_free);
 507                        goto finish_segs;
 508                }
 509
 510                skb->data[prandom_u32() % skb_headlen(skb)] ^=
 511                        1<<(prandom_u32() % 8);
 512        }
 513
 514        if (unlikely(sch->q.qlen >= sch->limit))
 515                return qdisc_drop_all(skb, sch, to_free);
 516
 517        qdisc_qstats_backlog_inc(sch, skb);
 518
 519        cb = netem_skb_cb(skb);
 520        if (q->gap == 0 ||              /* not doing reordering */
 521            q->counter < q->gap - 1 ||  /* inside last reordering gap */
 522            q->reorder < get_crandom(&q->reorder_cor)) {
 523                u64 now;
 524                s64 delay;
 525
 526                delay = tabledist(q->latency, q->jitter,
 527                                  &q->delay_cor, q->delay_dist);
 528
 529                now = ktime_get_ns();
 530
 531                if (q->rate) {
 532                        struct netem_skb_cb *last = NULL;
 533
 534                        if (sch->q.tail)
 535                                last = netem_skb_cb(sch->q.tail);
 536                        if (q->t_root.rb_node) {
 537                                struct sk_buff *t_skb;
 538                                struct netem_skb_cb *t_last;
 539
 540                                t_skb = skb_rb_last(&q->t_root);
 541                                t_last = netem_skb_cb(t_skb);
 542                                if (!last ||
 543                                    t_last->time_to_send > last->time_to_send) {
 544                                        last = t_last;
 545                                }
 546                        }
 547
 548                        if (last) {
 549                                /*
 550                                 * Last packet in queue is reference point (now),
 551                                 * calculate this time bonus and subtract
 552                                 * from delay.
 553                                 */
 554                                delay -= last->time_to_send - now;
 555                                delay = max_t(s64, 0, delay);
 556                                now = last->time_to_send;
 557                        }
 558
 559                        delay += packet_time_ns(qdisc_pkt_len(skb), q);
 560                }
 561
 562                cb->time_to_send = now + delay;
 563                ++q->counter;
 564                tfifo_enqueue(skb, sch);
 565        } else {
 566                /*
 567                 * Do re-ordering by putting one out of N packets at the front
 568                 * of the queue.
 569                 */
 570                cb->time_to_send = ktime_get_ns();
 571                q->counter = 0;
 572
 573                netem_enqueue_skb_head(&sch->q, skb);
 574                sch->qstats.requeues++;
 575        }
 576
 577finish_segs:
 578        if (segs) {
 579                while (segs) {
 580                        skb2 = segs->next;
 581                        segs->next = NULL;
 582                        qdisc_skb_cb(segs)->pkt_len = segs->len;
 583                        last_len = segs->len;
 584                        rc = qdisc_enqueue(segs, sch, to_free);
 585                        if (rc != NET_XMIT_SUCCESS) {
 586                                if (net_xmit_drop_count(rc))
 587                                        qdisc_qstats_drop(sch);
 588                        } else {
 589                                nb++;
 590                                len += last_len;
 591                        }
 592                        segs = skb2;
 593                }
 594                sch->q.qlen += nb;
 595                if (nb > 1)
 596                        qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
 597        }
 598        return NET_XMIT_SUCCESS;
 599}
 600
  601/* Start the next slot at a new future time and reset the
  602 * per-slot packet and byte budgets.
 603 */
 604
 605static void get_slot_next(struct netem_sched_data *q, u64 now)
 606{
 607        s64 next_delay;
 608
 609        if (!q->slot_dist)
 610                next_delay = q->slot_config.min_delay +
 611                                (prandom_u32() *
 612                                 (q->slot_config.max_delay -
 613                                  q->slot_config.min_delay) >> 32);
 614        else
 615                next_delay = tabledist(q->slot_config.dist_delay,
 616                                       (s32)(q->slot_config.dist_jitter),
 617                                       NULL, q->slot_dist);
 618
 619        q->slot.slot_next = now + next_delay;
 620        q->slot.packets_left = q->slot_config.max_packets;
 621        q->slot.bytes_left = q->slot_config.max_bytes;
 622}
 623
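/* Slotting approximates shared-media links (e.g. WiFi aggregation) that
 * deliver accumulated packets in bursts.  An illustrative configuration,
 * with syntax that may differ between iproute2 releases:
 *
 *   tc qdisc add dev eth0 root netem slot 800us 1ms packets 32 bytes 64k
 *
 * i.e. choose each slot start uniformly between 800us and 1ms in the future
 * and release at most 32 packets / 64 KB per slot.
 */
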
 624static struct sk_buff *netem_dequeue(struct Qdisc *sch)
 625{
 626        struct netem_sched_data *q = qdisc_priv(sch);
 627        struct sk_buff *skb;
 628        struct rb_node *p;
 629
 630tfifo_dequeue:
 631        skb = __qdisc_dequeue_head(&sch->q);
 632        if (skb) {
 633                qdisc_qstats_backlog_dec(sch, skb);
 634deliver:
 635                qdisc_bstats_update(sch, skb);
 636                return skb;
 637        }
 638        p = rb_first(&q->t_root);
 639        if (p) {
 640                u64 time_to_send;
 641                u64 now = ktime_get_ns();
 642
 643                skb = rb_to_skb(p);
 644
  645                /* has its send time arrived yet? */
 646                time_to_send = netem_skb_cb(skb)->time_to_send;
 647                if (q->slot.slot_next && q->slot.slot_next < time_to_send)
 648                        get_slot_next(q, now);
 649
 650                if (time_to_send <= now &&  q->slot.slot_next <= now) {
 651                        rb_erase(p, &q->t_root);
 652                        sch->q.qlen--;
 653                        qdisc_qstats_backlog_dec(sch, skb);
 654                        skb->next = NULL;
 655                        skb->prev = NULL;
 656                        /* skb->dev shares skb->rbnode area,
 657                         * we need to restore its value.
 658                         */
 659                        skb->dev = qdisc_dev(sch);
 660
 661#ifdef CONFIG_NET_CLS_ACT
 662                        /*
 663                         * If it's at ingress let's pretend the delay is
 664                         * from the network (tstamp will be updated).
 665                         */
 666                        if (skb->tc_redirected && skb->tc_from_ingress)
 667                                skb->tstamp = 0;
 668#endif
 669
 670                        if (q->slot.slot_next) {
 671                                q->slot.packets_left--;
 672                                q->slot.bytes_left -= qdisc_pkt_len(skb);
 673                                if (q->slot.packets_left <= 0 ||
 674                                    q->slot.bytes_left <= 0)
 675                                        get_slot_next(q, now);
 676                        }
 677
 678                        if (q->qdisc) {
 679                                unsigned int pkt_len = qdisc_pkt_len(skb);
 680                                struct sk_buff *to_free = NULL;
 681                                int err;
 682
 683                                err = qdisc_enqueue(skb, q->qdisc, &to_free);
 684                                kfree_skb_list(to_free);
 685                                if (err != NET_XMIT_SUCCESS &&
 686                                    net_xmit_drop_count(err)) {
 687                                        qdisc_qstats_drop(sch);
 688                                        qdisc_tree_reduce_backlog(sch, 1,
 689                                                                  pkt_len);
 690                                }
 691                                goto tfifo_dequeue;
 692                        }
 693                        goto deliver;
 694                }
 695
 696                if (q->qdisc) {
 697                        skb = q->qdisc->ops->dequeue(q->qdisc);
 698                        if (skb)
 699                                goto deliver;
 700                }
 701
 702                qdisc_watchdog_schedule_ns(&q->watchdog,
 703                                           max(time_to_send,
 704                                               q->slot.slot_next));
 705        }
 706
 707        if (q->qdisc) {
 708                skb = q->qdisc->ops->dequeue(q->qdisc);
 709                if (skb)
 710                        goto deliver;
 711        }
 712        return NULL;
 713}
 714
 715static void netem_reset(struct Qdisc *sch)
 716{
 717        struct netem_sched_data *q = qdisc_priv(sch);
 718
 719        qdisc_reset_queue(sch);
 720        tfifo_reset(sch);
 721        if (q->qdisc)
 722                qdisc_reset(q->qdisc);
 723        qdisc_watchdog_cancel(&q->watchdog);
 724}
 725
 726static void dist_free(struct disttable *d)
 727{
 728        kvfree(d);
 729}
 730
 731/*
 732 * Distribution data is a variable size payload containing
 733 * signed 16 bit values.
 734 */
 735
 736static int get_dist_table(struct Qdisc *sch, struct disttable **tbl,
 737                          const struct nlattr *attr)
 738{
 739        size_t n = nla_len(attr)/sizeof(__s16);
 740        const __s16 *data = nla_data(attr);
 741        spinlock_t *root_lock;
 742        struct disttable *d;
 743        int i;
 744
 745        if (n > NETEM_DIST_MAX)
 746                return -EINVAL;
 747
 748        d = kvmalloc(sizeof(struct disttable) + n * sizeof(s16), GFP_KERNEL);
 749        if (!d)
 750                return -ENOMEM;
 751
 752        d->size = n;
 753        for (i = 0; i < n; i++)
 754                d->table[i] = data[i];
 755
 756        root_lock = qdisc_root_sleeping_lock(sch);
 757
 758        spin_lock_bh(root_lock);
 759        swap(*tbl, d);
 760        spin_unlock_bh(root_lock);
 761
 762        dist_free(d);
 763        return 0;
 764}
 765
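/* The tables themselves normally come from userspace: iproute2 ships
 * precomputed distributions (normal.dist, pareto.dist, paretonormal.dist,
 * experimental.dist, commonly installed under /usr/lib/tc) generated by its
 * maketable tool, and tc passes the selected table down in
 * TCA_NETEM_DELAY_DIST or TCA_NETEM_SLOT_DIST.
 */
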
 766static void get_slot(struct netem_sched_data *q, const struct nlattr *attr)
 767{
 768        const struct tc_netem_slot *c = nla_data(attr);
 769
 770        q->slot_config = *c;
 771        if (q->slot_config.max_packets == 0)
 772                q->slot_config.max_packets = INT_MAX;
 773        if (q->slot_config.max_bytes == 0)
 774                q->slot_config.max_bytes = INT_MAX;
 775        q->slot.packets_left = q->slot_config.max_packets;
 776        q->slot.bytes_left = q->slot_config.max_bytes;
 777        if (q->slot_config.min_delay | q->slot_config.max_delay |
 778            q->slot_config.dist_jitter)
 779                q->slot.slot_next = ktime_get_ns();
 780        else
 781                q->slot.slot_next = 0;
 782}
 783
 784static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr)
 785{
 786        const struct tc_netem_corr *c = nla_data(attr);
 787
 788        init_crandom(&q->delay_cor, c->delay_corr);
 789        init_crandom(&q->loss_cor, c->loss_corr);
 790        init_crandom(&q->dup_cor, c->dup_corr);
 791}
 792
 793static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr)
 794{
 795        const struct tc_netem_reorder *r = nla_data(attr);
 796
 797        q->reorder = r->probability;
 798        init_crandom(&q->reorder_cor, r->correlation);
 799}
 800
 801static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr)
 802{
 803        const struct tc_netem_corrupt *r = nla_data(attr);
 804
 805        q->corrupt = r->probability;
 806        init_crandom(&q->corrupt_cor, r->correlation);
 807}
 808
 809static void get_rate(struct netem_sched_data *q, const struct nlattr *attr)
 810{
 811        const struct tc_netem_rate *r = nla_data(attr);
 812
 813        q->rate = r->rate;
 814        q->packet_overhead = r->packet_overhead;
 815        q->cell_size = r->cell_size;
 816        q->cell_overhead = r->cell_overhead;
 817        if (q->cell_size)
 818                q->cell_size_reciprocal = reciprocal_value(q->cell_size);
 819        else
 820                q->cell_size_reciprocal = (struct reciprocal_value) { 0 };
 821}
 822
 823static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr)
 824{
 825        const struct nlattr *la;
 826        int rem;
 827
 828        nla_for_each_nested(la, attr, rem) {
 829                u16 type = nla_type(la);
 830
 831                switch (type) {
 832                case NETEM_LOSS_GI: {
 833                        const struct tc_netem_gimodel *gi = nla_data(la);
 834
 835                        if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
 836                                pr_info("netem: incorrect gi model size\n");
 837                                return -EINVAL;
 838                        }
 839
 840                        q->loss_model = CLG_4_STATES;
 841
 842                        q->clg.state = TX_IN_GAP_PERIOD;
 843                        q->clg.a1 = gi->p13;
 844                        q->clg.a2 = gi->p31;
 845                        q->clg.a3 = gi->p32;
 846                        q->clg.a4 = gi->p14;
 847                        q->clg.a5 = gi->p23;
 848                        break;
 849                }
 850
 851                case NETEM_LOSS_GE: {
 852                        const struct tc_netem_gemodel *ge = nla_data(la);
 853
 854                        if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
 855                                pr_info("netem: incorrect ge model size\n");
 856                                return -EINVAL;
 857                        }
 858
 859                        q->loss_model = CLG_GILB_ELL;
 860                        q->clg.state = GOOD_STATE;
 861                        q->clg.a1 = ge->p;
 862                        q->clg.a2 = ge->r;
 863                        q->clg.a3 = ge->h;
 864                        q->clg.a4 = ge->k1;
 865                        break;
 866                }
 867
 868                default:
 869                        pr_info("netem: unknown loss type %u\n", type);
 870                        return -EINVAL;
 871                }
 872        }
 873
 874        return 0;
 875}
 876
 877static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
 878        [TCA_NETEM_CORR]        = { .len = sizeof(struct tc_netem_corr) },
 879        [TCA_NETEM_REORDER]     = { .len = sizeof(struct tc_netem_reorder) },
 880        [TCA_NETEM_CORRUPT]     = { .len = sizeof(struct tc_netem_corrupt) },
 881        [TCA_NETEM_RATE]        = { .len = sizeof(struct tc_netem_rate) },
 882        [TCA_NETEM_LOSS]        = { .type = NLA_NESTED },
 883        [TCA_NETEM_ECN]         = { .type = NLA_U32 },
 884        [TCA_NETEM_RATE64]      = { .type = NLA_U64 },
 885        [TCA_NETEM_LATENCY64]   = { .type = NLA_S64 },
 886        [TCA_NETEM_JITTER64]    = { .type = NLA_S64 },
 887        [TCA_NETEM_SLOT]        = { .len = sizeof(struct tc_netem_slot) },
 888};
 889
 890static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
 891                      const struct nla_policy *policy, int len)
 892{
 893        int nested_len = nla_len(nla) - NLA_ALIGN(len);
 894
 895        if (nested_len < 0) {
 896                pr_info("netem: invalid attributes len %d\n", nested_len);
 897                return -EINVAL;
 898        }
 899
 900        if (nested_len >= nla_attr_size(0))
 901                return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
 902                                 nested_len, policy, NULL);
 903
 904        memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
 905        return 0;
 906}
 907
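/* For reference, a sketch (derived from the parsing above) of how the
 * TCA_OPTIONS payload handled here is laid out:
 *
 *   +---------------------------+  <- nla_data(opt)
 *   | struct tc_netem_qopt      |     legacy fixed header: latency, limit,
 *   |                           |     loss, gap, duplicate, jitter
 *   +---------------------------+  <- nla_data(opt) + NLA_ALIGN(sizeof(*qopt))
 *   | TCA_NETEM_CORR, _REORDER, |     optional netlink attributes, parsed
 *   | _LOSS, _RATE, _SLOT, ...  |     against netem_policy by parse_attr()
 *   +---------------------------+
 */
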
 908/* Parse netlink message to set options */
 909static int netem_change(struct Qdisc *sch, struct nlattr *opt,
 910                        struct netlink_ext_ack *extack)
 911{
 912        struct netem_sched_data *q = qdisc_priv(sch);
 913        struct nlattr *tb[TCA_NETEM_MAX + 1];
 914        struct tc_netem_qopt *qopt;
 915        struct clgstate old_clg;
 916        int old_loss_model = CLG_RANDOM;
 917        int ret;
 918
 919        if (opt == NULL)
 920                return -EINVAL;
 921
 922        qopt = nla_data(opt);
 923        ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
 924        if (ret < 0)
 925                return ret;
 926
 927        /* backup q->clg and q->loss_model */
 928        old_clg = q->clg;
 929        old_loss_model = q->loss_model;
 930
 931        if (tb[TCA_NETEM_LOSS]) {
 932                ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]);
 933                if (ret) {
 934                        q->loss_model = old_loss_model;
 935                        return ret;
 936                }
 937        } else {
 938                q->loss_model = CLG_RANDOM;
 939        }
 940
 941        if (tb[TCA_NETEM_DELAY_DIST]) {
 942                ret = get_dist_table(sch, &q->delay_dist,
 943                                     tb[TCA_NETEM_DELAY_DIST]);
 944                if (ret)
 945                        goto get_table_failure;
 946        }
 947
 948        if (tb[TCA_NETEM_SLOT_DIST]) {
 949                ret = get_dist_table(sch, &q->slot_dist,
 950                                     tb[TCA_NETEM_SLOT_DIST]);
 951                if (ret)
 952                        goto get_table_failure;
 953        }
 954
 955        sch->limit = qopt->limit;
 956
 957        q->latency = PSCHED_TICKS2NS(qopt->latency);
 958        q->jitter = PSCHED_TICKS2NS(qopt->jitter);
 959        q->limit = qopt->limit;
 960        q->gap = qopt->gap;
 961        q->counter = 0;
 962        q->loss = qopt->loss;
 963        q->duplicate = qopt->duplicate;
 964
 965        /* for compatibility with earlier versions.
 966         * if gap is set, need to assume 100% probability
 967         */
 968        if (q->gap)
 969                q->reorder = ~0;
 970
 971        if (tb[TCA_NETEM_CORR])
 972                get_correlation(q, tb[TCA_NETEM_CORR]);
 973
 974        if (tb[TCA_NETEM_REORDER])
 975                get_reorder(q, tb[TCA_NETEM_REORDER]);
 976
 977        if (tb[TCA_NETEM_CORRUPT])
 978                get_corrupt(q, tb[TCA_NETEM_CORRUPT]);
 979
 980        if (tb[TCA_NETEM_RATE])
 981                get_rate(q, tb[TCA_NETEM_RATE]);
 982
 983        if (tb[TCA_NETEM_RATE64])
 984                q->rate = max_t(u64, q->rate,
 985                                nla_get_u64(tb[TCA_NETEM_RATE64]));
 986
 987        if (tb[TCA_NETEM_LATENCY64])
 988                q->latency = nla_get_s64(tb[TCA_NETEM_LATENCY64]);
 989
 990        if (tb[TCA_NETEM_JITTER64])
 991                q->jitter = nla_get_s64(tb[TCA_NETEM_JITTER64]);
 992
 993        if (tb[TCA_NETEM_ECN])
 994                q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);
 995
 996        if (tb[TCA_NETEM_SLOT])
 997                get_slot(q, tb[TCA_NETEM_SLOT]);
 998
 999        return ret;
1000
1001get_table_failure:
 1002        /* recover clg and loss_model, in case
1003         * q->clg and q->loss_model were modified
1004         * in get_loss_clg()
1005         */
1006        q->clg = old_clg;
1007        q->loss_model = old_loss_model;
1008        return ret;
1009}
1010
1011static int netem_init(struct Qdisc *sch, struct nlattr *opt,
1012                      struct netlink_ext_ack *extack)
1013{
1014        struct netem_sched_data *q = qdisc_priv(sch);
1015        int ret;
1016
1017        qdisc_watchdog_init(&q->watchdog, sch);
1018
1019        if (!opt)
1020                return -EINVAL;
1021
1022        q->loss_model = CLG_RANDOM;
1023        ret = netem_change(sch, opt, extack);
1024        if (ret)
1025                pr_info("netem: change failed\n");
1026        return ret;
1027}
1028
1029static void netem_destroy(struct Qdisc *sch)
1030{
1031        struct netem_sched_data *q = qdisc_priv(sch);
1032
1033        qdisc_watchdog_cancel(&q->watchdog);
1034        if (q->qdisc)
1035                qdisc_destroy(q->qdisc);
1036        dist_free(q->delay_dist);
1037        dist_free(q->slot_dist);
1038}
1039
1040static int dump_loss_model(const struct netem_sched_data *q,
1041                           struct sk_buff *skb)
1042{
1043        struct nlattr *nest;
1044
1045        nest = nla_nest_start(skb, TCA_NETEM_LOSS);
1046        if (nest == NULL)
1047                goto nla_put_failure;
1048
1049        switch (q->loss_model) {
1050        case CLG_RANDOM:
1051                /* legacy loss model */
1052                nla_nest_cancel(skb, nest);
1053                return 0;       /* no data */
1054
1055        case CLG_4_STATES: {
1056                struct tc_netem_gimodel gi = {
1057                        .p13 = q->clg.a1,
1058                        .p31 = q->clg.a2,
1059                        .p32 = q->clg.a3,
1060                        .p14 = q->clg.a4,
1061                        .p23 = q->clg.a5,
1062                };
1063
1064                if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
1065                        goto nla_put_failure;
1066                break;
1067        }
1068        case CLG_GILB_ELL: {
1069                struct tc_netem_gemodel ge = {
1070                        .p = q->clg.a1,
1071                        .r = q->clg.a2,
1072                        .h = q->clg.a3,
1073                        .k1 = q->clg.a4,
1074                };
1075
1076                if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
1077                        goto nla_put_failure;
1078                break;
1079        }
1080        }
1081
1082        nla_nest_end(skb, nest);
1083        return 0;
1084
1085nla_put_failure:
1086        nla_nest_cancel(skb, nest);
1087        return -1;
1088}
1089
1090static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
1091{
1092        const struct netem_sched_data *q = qdisc_priv(sch);
1093        struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
1094        struct tc_netem_qopt qopt;
1095        struct tc_netem_corr cor;
1096        struct tc_netem_reorder reorder;
1097        struct tc_netem_corrupt corrupt;
1098        struct tc_netem_rate rate;
1099        struct tc_netem_slot slot;
1100
1101        qopt.latency = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->latency),
1102                             UINT_MAX);
1103        qopt.jitter = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->jitter),
1104                            UINT_MAX);
1105        qopt.limit = q->limit;
1106        qopt.loss = q->loss;
1107        qopt.gap = q->gap;
1108        qopt.duplicate = q->duplicate;
1109        if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
1110                goto nla_put_failure;
1111
1112        if (nla_put(skb, TCA_NETEM_LATENCY64, sizeof(q->latency), &q->latency))
1113                goto nla_put_failure;
1114
1115        if (nla_put(skb, TCA_NETEM_JITTER64, sizeof(q->jitter), &q->jitter))
1116                goto nla_put_failure;
1117
1118        cor.delay_corr = q->delay_cor.rho;
1119        cor.loss_corr = q->loss_cor.rho;
1120        cor.dup_corr = q->dup_cor.rho;
1121        if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
1122                goto nla_put_failure;
1123
1124        reorder.probability = q->reorder;
1125        reorder.correlation = q->reorder_cor.rho;
1126        if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
1127                goto nla_put_failure;
1128
1129        corrupt.probability = q->corrupt;
1130        corrupt.correlation = q->corrupt_cor.rho;
1131        if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
1132                goto nla_put_failure;
1133
1134        if (q->rate >= (1ULL << 32)) {
1135                if (nla_put_u64_64bit(skb, TCA_NETEM_RATE64, q->rate,
1136                                      TCA_NETEM_PAD))
1137                        goto nla_put_failure;
1138                rate.rate = ~0U;
1139        } else {
1140                rate.rate = q->rate;
1141        }
1142        rate.packet_overhead = q->packet_overhead;
1143        rate.cell_size = q->cell_size;
1144        rate.cell_overhead = q->cell_overhead;
1145        if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
1146                goto nla_put_failure;
1147
1148        if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
1149                goto nla_put_failure;
1150
1151        if (dump_loss_model(q, skb) != 0)
1152                goto nla_put_failure;
1153
1154        if (q->slot_config.min_delay | q->slot_config.max_delay |
1155            q->slot_config.dist_jitter) {
1156                slot = q->slot_config;
1157                if (slot.max_packets == INT_MAX)
1158                        slot.max_packets = 0;
1159                if (slot.max_bytes == INT_MAX)
1160                        slot.max_bytes = 0;
1161                if (nla_put(skb, TCA_NETEM_SLOT, sizeof(slot), &slot))
1162                        goto nla_put_failure;
1163        }
1164
1165        return nla_nest_end(skb, nla);
1166
1167nla_put_failure:
1168        nlmsg_trim(skb, nla);
1169        return -1;
1170}
1171
1172static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
1173                          struct sk_buff *skb, struct tcmsg *tcm)
1174{
1175        struct netem_sched_data *q = qdisc_priv(sch);
1176
1177        if (cl != 1 || !q->qdisc)       /* only one class */
1178                return -ENOENT;
1179
1180        tcm->tcm_handle |= TC_H_MIN(1);
1181        tcm->tcm_info = q->qdisc->handle;
1182
1183        return 0;
1184}
1185
1186static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1187                     struct Qdisc **old, struct netlink_ext_ack *extack)
1188{
1189        struct netem_sched_data *q = qdisc_priv(sch);
1190
1191        *old = qdisc_replace(sch, new, &q->qdisc);
1192        return 0;
1193}
1194
1195static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
1196{
1197        struct netem_sched_data *q = qdisc_priv(sch);
1198        return q->qdisc;
1199}
1200
1201static unsigned long netem_find(struct Qdisc *sch, u32 classid)
1202{
1203        return 1;
1204}
1205
1206static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
1207{
1208        if (!walker->stop) {
1209                if (walker->count >= walker->skip)
1210                        if (walker->fn(sch, 1, walker) < 0) {
1211                                walker->stop = 1;
1212                                return;
1213                        }
1214                walker->count++;
1215        }
1216}
1217
1218static const struct Qdisc_class_ops netem_class_ops = {
1219        .graft          =       netem_graft,
1220        .leaf           =       netem_leaf,
1221        .find           =       netem_find,
1222        .walk           =       netem_walk,
1223        .dump           =       netem_dump_class,
1224};
1225
1226static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
1227        .id             =       "netem",
1228        .cl_ops         =       &netem_class_ops,
1229        .priv_size      =       sizeof(struct netem_sched_data),
1230        .enqueue        =       netem_enqueue,
1231        .dequeue        =       netem_dequeue,
1232        .peek           =       qdisc_peek_dequeued,
1233        .init           =       netem_init,
1234        .reset          =       netem_reset,
1235        .destroy        =       netem_destroy,
1236        .change         =       netem_change,
1237        .dump           =       netem_dump,
1238        .owner          =       THIS_MODULE,
1239};
1240
1241
1242static int __init netem_module_init(void)
1243{
1244        pr_info("netem: version " VERSION "\n");
1245        return register_qdisc(&netem_qdisc_ops);
1246}
1247static void __exit netem_module_exit(void)
1248{
1249        unregister_qdisc(&netem_qdisc_ops);
1250}
1251module_init(netem_module_init)
1252module_exit(netem_module_exit)
1253MODULE_LICENSE("GPL");
1254