LXR linux/net/sched/sch

   1/*
   2 * net/sched/sch_netem.c        Network emulator
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License.
   8 *
   9 *              Many of the algorithms and ideas for this came from
  10 *              NIST Net which is not copyrighted.
  11 *
  12 * Authors:     Stephen Hemminger <shemminger@osdl.org>
  13 *              Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
  14 */
  15
  16#include <linux/mm.h>
  17#include <linux/module.h>
  18#include <linux/slab.h>
  19#include <linux/types.h>
  20#include <linux/kernel.h>
  21#include <linux/errno.h>
  22#include <linux/skbuff.h>
  23#include <linux/vmalloc.h>
  24#include <linux/rtnetlink.h>
  25#include <linux/reciprocal_div.h>
  26
  27#include <net/netlink.h>
  28#include <net/pkt_sched.h>
  29#include <net/inet_ecn.h>
  30
  31#define VERSION "1.3"
  32
  33/*      Network Emulation Queuing algorithm.
  34        ====================================
  35
  36        Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
  37                 Network Emulation Tool"
  38                 [2] Luigi Rizzo, DummyNet for FreeBSD
  39
  40         ----------------------------------------------------------------
  41
  42         This started out as a simple way to delay outgoing packets to
  43         test TCP but has grown to include most of the functionality
  44         of a full blown network emulator like NISTnet. It can delay
  45         packets and add random jitter (and correlation). The random
  46         distribution can be loaded from a table as well to provide
  47         normal, Pareto, or experimental curves. Packet loss,
  48         duplication, and reordering can also be emulated.
  49
  50         This qdisc does not do classification that can be handled in
  51         layering other disciplines.  It does not need to do bandwidth
  52         control either since that can be handled by using token
  53         bucket or other rate control.
  54
  55     Correlated Loss Generator models
  56
  57        Added generation of correlated loss according to a 4-state
  58        Markov model and to the "Gilbert-Elliot" model.
  59
  60        References:
  61        [1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
  62        [2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
  63        and intuitive loss model for packet networks and its implementation
  64        in the Netem module in the Linux kernel", available in [1]
  65
  66        Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
  67                 Fabio Ludovici <fabio.ludovici at yahoo.it>
  68*/
  69
  70struct netem_sched_data {
  71        /* internal t(ime)fifo qdisc uses sch->q and sch->limit */
  72
  73        /* optional qdisc for classful handling (NULL at netem init) */
  74        struct Qdisc    *qdisc;
  75
  76        struct qdisc_watchdog watchdog;
  77
  78        psched_tdiff_t latency;
  79        psched_tdiff_t jitter;
  80
  81        u32 loss;
  82        u32 ecn;
  83        u32 limit;
  84        u32 counter;
  85        u32 gap;
  86        u32 duplicate;
  87        u32 reorder;
  88        u32 corrupt;
  89        u32 rate;
  90        s32 packet_overhead;
  91        u32 cell_size;
  92        u32 cell_size_reciprocal;
  93        s32 cell_overhead;
  94
  95        struct crndstate {
  96                u32 last;
  97                u32 rho;
  98        } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
  99
 100        struct disttable {
 101                u32  size;
 102                s16 table[0];
 103        } *delay_dist;
 104
 105        enum  {
 106                CLG_RANDOM,
 107                CLG_4_STATES,
 108                CLG_GILB_ELL,
 109        } loss_model;
 110
 111        /* Correlated Loss Generation models */
 112        struct clgstate {
 113                /* state of the Markov chain */
 114                u8 state;
 115
 116                /* 4-states and Gilbert-Elliot models */
 117                u32 a1; /* p13 for 4-states or p for GE */
 118                u32 a2; /* p31 for 4-states or r for GE */
 119                u32 a3; /* p32 for 4-states or h for GE */
 120                u32 a4; /* p14 for 4-states or 1-k for GE */
 121                u32 a5; /* p23 used only in 4-states */
 122        } clg;
 123
 124};
 125
 126/* Per-packet time stamp kept in the socket buffer control block.
 127 * Only valid when skbs are in our internal t(ime)fifo queue.
 128 */
 129struct netem_skb_cb {
 130        psched_time_t   time_to_send;  /* netem_dequeue() releases the skb once this time is reached */
 131};
 132
 133static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
 134{
 135        qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
 136        return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
 137}
 138
 139/* init_crandom - initialize correlated random number generator
 140 * Use entropy source for initial seed.
 141 */
 142static void init_crandom(struct crndstate *state, unsigned long rho)
 143{
 144        state->rho = rho;
 145        state->last = net_random();
 146}
 147
 148/* get_crandom - correlated random number generator
 149 * Next number depends on last value.
 150 * rho is scaled to avoid floating point.
 151 */
 152static u32 get_crandom(struct crndstate *state)
 153{
 154        u64 value, rho;
 155        unsigned long answer;
 156
 157        if (state->rho == 0)    /* no correlation */
 158                return net_random();
 159
 160        value = net_random();
 161        rho = (u64)state->rho + 1;
 162        answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
 163        state->last = answer;
 164        return answer;
 165}
 166
 167/* loss_4state - 4-state model loss generator
 168 * Generates losses according to the 4-state Markov chain adopted in
 169 * the GI (General and Intuitive) loss model.
 170 */
 171static bool loss_4state(struct netem_sched_data *q)
 172{
 173        struct clgstate *clg = &q->clg;
 174        u32 rnd = net_random();
 175
 176        /*
 177         * Makes a comparison between rnd and the transition
 178         * probabilities outgoing from the current state, then decides the
 179         * next state and if the next packet has to be transmitted or lost.
 180         * The four states correspond to:
 181         *   1 => successfully transmitted packets within a gap period
 182         *   4 => isolated losses within a gap period
 183         *   3 => lost packets within a burst period
 184         *   2 => successfully transmitted packets within a burst period
 185         */
 186        switch (clg->state) {
 187        case 1:
 188                if (rnd < clg->a4) {
 189                        clg->state = 4;
 190                        return true;
 191                } else if (clg->a4 < rnd && rnd < clg->a1) {
 192                        clg->state = 3;
 193                        return true;
 194                } else if (clg->a1 < rnd)
 195                        clg->state = 1;
 196
 197                break;
 198        case 2:
 199                if (rnd < clg->a5) {
 200                        clg->state = 3;
 201                        return true;
 202                } else
 203                        clg->state = 2;
 204
 205                break;
 206        case 3:
 207                if (rnd < clg->a3)
 208                        clg->state = 2;
 209                else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
 210                        clg->state = 1;
 211                        return true;
 212                } else if (clg->a2 + clg->a3 < rnd) {
 213                        clg->state = 3;
 214                        return true;
 215                }
 216                break;
 217        case 4:
 218                clg->state = 1;
 219                break;
 220        }
 221
 222        return false;
 223}
 224
 225/* loss_gilb_ell - Gilbert-Elliot model loss generator
 226 * Generates losses according to the Gilbert-Elliot loss model or
 227 * its special cases  (Gilbert or Simple Gilbert)
 228 *
 229 * Makes a comparison between random number and the transition
 230 * probabilities outgoing from the current state, then decides the
 231 * next state. A second random number is extracted and the comparison
 232 * with the loss probability of the current state decides if the next
 233 * packet will be transmitted or lost.
 234 */
 235static bool loss_gilb_ell(struct netem_sched_data *q)
 236{
 237        struct clgstate *clg = &q->clg;
 238
 239        switch (clg->state) {
 240        case 1:
 241                if (net_random() < clg->a1)
 242                        clg->state = 2;
 243                if (net_random() < clg->a4)
 244                        return true;
 245        case 2:
 246                if (net_random() < clg->a2)
 247                        clg->state = 1;
 248                if (clg->a3 > net_random())
 249                        return true;
 250        }
 251
 252        return false;
 253}
 254
 255static bool loss_event(struct netem_sched_data *q)
 256{
 257        switch (q->loss_model) {
 258        case CLG_RANDOM:
 259                /* Random packet drop 0 => none, ~0 => all */
 260                return q->loss && q->loss >= get_crandom(&q->loss_cor);
 261
 262        case CLG_4_STATES:
 263                /* 4state loss model algorithm (used also for GI model)
 264                * Extracts a value from the markov 4 state loss generator,
 265                * if it is 1 drops a packet and if needed writes the event in
 266                * the kernel logs
 267                */
 268                return loss_4state(q);
 269
 270        case CLG_GILB_ELL:
 271                /* Gilbert-Elliot loss model algorithm
 272                * Extracts a value from the Gilbert-Elliot loss generator,
 273                * if it is 1 drops a packet and if needed writes the event in
 274                * the kernel logs
 275                */
 276                return loss_gilb_ell(q);
 277        }
 278
 279        return false;   /* not reached */
 280}
 281
 282
 283/* tabledist - return a pseudo-randomly distributed value with mean mu and
 284 * std deviation sigma.  Uses table lookup to approximate the desired
 285 * distribution, and a uniformly-distributed pseudo-random source.
 286 */
 287static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma,
 288                                struct crndstate *state,
 289                                const struct disttable *dist)
 290{
 291        psched_tdiff_t x;
 292        long t;
 293        u32 rnd;
 294
 295        if (sigma == 0)
 296                return mu;
 297
 298        rnd = get_crandom(state);
 299
 300        /* default uniform distribution */
 301        if (dist == NULL)
 302                return (rnd % (2*sigma)) - sigma + mu;
 303
 304        t = dist->table[rnd % dist->size];
 305        x = (sigma % NETEM_DIST_SCALE) * t;
 306        if (x >= 0)
 307                x += NETEM_DIST_SCALE/2;
 308        else
 309                x -= NETEM_DIST_SCALE/2;
 310
 311        return  x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
 312}
 313
 314static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sched_data *q)
 315{
 316        u64 ticks;
 317
 318        len += q->packet_overhead;
 319
 320        if (q->cell_size) {
 321                u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);
 322
 323                if (len > cells * q->cell_size) /* extra cell needed for remainder */
 324                        cells++;
 325                len = cells * (q->cell_size + q->cell_overhead);
 326        }
 327
 328        ticks = (u64)len * NSEC_PER_SEC;
 329
 330        do_div(ticks, q->rate);
 331        return PSCHED_NS2TICKS(ticks);
 332}
 333
 334static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
 335{
 336        struct sk_buff_head *list = &sch->q;
 337        psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
 338        struct sk_buff *skb = skb_peek_tail(list);
 339
 340        /* Optimize for add at tail */
 341        if (likely(!skb || tnext >= netem_skb_cb(skb)->time_to_send))
 342                return __skb_queue_tail(list, nskb);
 343
 344        skb_queue_reverse_walk(list, skb) {
 345                if (tnext >= netem_skb_cb(skb)->time_to_send)
 346                        break;
 347        }
 348
 349        __skb_queue_after(list, skb, nskb);
 350}
 351
 352/*
 353 * Insert one skb into qdisc.
 354 * Note: parent depends on return value to account for queue length.
 355 *      NET_XMIT_DROP: queue length didn't change.
 356 *      NET_XMIT_SUCCESS: one skb was queued.
     *
     * Steps, in this order: duplication decision, loss decision (ECN CE
     * mark instead of a drop when q->ecn is set and marking succeeds),
     * re-enqueue of a clone at the root qdisc when duplicating, optional
     * single-bit corruption, queue limit check, and finally either a
     * delayed insertion into the tfifo or, when reordering, an immediate
     * insertion at its head.
 357 */
 358static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 359{
 360        struct netem_sched_data *q = qdisc_priv(sch);
 361        /* We don't fill cb now as skb_unshare() may invalidate it */
 362        struct netem_skb_cb *cb;
 363        struct sk_buff *skb2;
 364        int count = 1;  /* copies to queue: 2 = duplicate, 0 = lost */
 365
 366        /* Random duplication */
 367        if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
 368                ++count;
 369
 370        /* Drop packet? */
 371        if (loss_event(q)) {
 372                if (q->ecn && INET_ECN_set_ce(skb))
 373                        sch->qstats.drops++; /* mark packet */
 374                else
 375                        --count;
 376        }
            /* Lost and not duplicated: free the skb, nothing is queued. */
 377        if (count == 0) {
 378                sch->qstats.drops++;
 379                kfree_skb(skb);
 380                return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 381        }
 382
 383        /* If a delay is expected, orphan the skb. (orphaning usually takes
 384         * place at TX completion time, so _before_ the link transit delay)
 385         * Ideally, this orphaning should be done after the rate limiting
 386         * module, because this breaks TCP Small Queue, and other mechanisms
 387         * based on socket sk_wmem_alloc.
 388         */
 389        if (q->latency || q->jitter)
 390                skb_orphan(skb);
 391
 392        /*
 393         * If we need to duplicate packet, then re-insert at top of the
 394         * qdisc tree, since parent queuer expects that only one
 395         * skb will be queued.
 396         */
 397        if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
 398                struct Qdisc *rootq = qdisc_root(sch);
 399                u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
 400                q->duplicate = 0;
 401
 402                qdisc_enqueue_root(skb2, rootq);
 403                q->duplicate = dupsave;
 404        }
 405
 406        /*
 407         * Randomized packet corruption.
 408         * Make copy if needed since we are modifying
 409         * If packet is going to be hardware checksummed, then
 410         * do it now in software before we mangle it.
 411         */
 412        if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
 413                if (!(skb = skb_unshare(skb, GFP_ATOMIC)) ||
 414                    (skb->ip_summed == CHECKSUM_PARTIAL &&
 415                     skb_checksum_help(skb)))
 416                        return qdisc_drop(skb, sch);
 417
                    /*
                     * Flip one randomly chosen bit within the first
                     * skb_headlen(skb) bytes of the packet data.
                     * NOTE(review): the modulo assumes skb_headlen(skb)
                     * is non-zero - confirm.
                     */
 418                skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8);
 419        }
 420
            /* The tfifo already holds sch->limit packets. */
 421        if (unlikely(skb_queue_len(&sch->q) >= sch->limit))
 422                return qdisc_reshape_fail(skb, sch);
 423
 424        sch->qstats.backlog += qdisc_pkt_len(skb);
 425
 426        cb = netem_skb_cb(skb);
 427        if (q->gap == 0 ||              /* not doing reordering */
 428            q->counter < q->gap - 1 ||  /* inside last reordering gap */
 429            q->reorder < get_crandom(&q->reorder_cor)) {
 430                psched_time_t now;
 431                psched_tdiff_t delay;
 432
 433                delay = tabledist(q->latency, q->jitter,
 434                                  &q->delay_cor, q->delay_dist);
 435
 436                now = psched_get_time();
 437
 438                if (q->rate) {
 439                        struct sk_buff_head *list = &sch->q;
 440
 441                        if (!skb_queue_empty(list)) {
 442                                /*
 443                                 * Last packet in queue is reference point (now),
 444                                 * calculate this time bonus and subtract
 445                                 * from delay.
 446                                 */
 447                                delay -= netem_skb_cb(skb_peek_tail(list))->time_to_send - now;
 448                                delay = max_t(psched_tdiff_t, 0, delay);
 449                                now = netem_skb_cb(skb_peek_tail(list))->time_to_send;
 450                        }
 451
                            /* add the transmission time of this packet at q->rate */
 452                        delay += packet_len_2_sched_time(skb->len, q);
 453                }
 454
                    /* absolute release time, checked by netem_dequeue() */
 455                cb->time_to_send = now + delay;
 456                ++q->counter;
 457                tfifo_enqueue(skb, sch);
 458        } else {
 459                /*
 460                 * Do re-ordering by putting one out of N packets at the front
 461                 * of the queue.
 462                 */
 463                cb->time_to_send = psched_get_time();
 464                q->counter = 0;
 465
 466                __skb_queue_head(&sch->q, skb);
 467                sch->qstats.requeues++;
 468        }
 469
 470        return NET_XMIT_SUCCESS;
 471}
 472
 473static unsigned int netem_drop(struct Qdisc *sch)
 474{
 475        struct netem_sched_data *q = qdisc_priv(sch);
 476        unsigned int len;
 477
 478        len = qdisc_queue_drop(sch);
 479        if (!len && q->qdisc && q->qdisc->ops->drop)
 480            len = q->qdisc->ops->drop(q->qdisc);
 481        if (len)
 482                sch->qstats.drops++;
 483
 484        return len;
 485}
 486
 /*
  * Hand out the next packet that is ready to be sent, or NULL.
  *
  * Packets leave the tfifo once their time_to_send has been reached.
  * Without a child qdisc such a packet is returned directly; with a child
  * qdisc it is enqueued there and the packets actually returned are the
  * ones dequeued from the child. When the head of the tfifo is not due
  * yet and the child has nothing to offer, the watchdog is scheduled for
  * the head's send time.
  */
 487static struct sk_buff *netem_dequeue(struct Qdisc *sch)
 488{
 489        struct netem_sched_data *q = qdisc_priv(sch);
 490        struct sk_buff *skb;
 491
 492        if (qdisc_is_throttled(sch))
 493                return NULL;
 494
 495tfifo_dequeue:
 496        skb = qdisc_peek_head(sch);
 497        if (skb) {
 498                const struct netem_skb_cb *cb = netem_skb_cb(skb);
 499
 500                /* has the send time of the head packet been reached? */
 501                if (cb->time_to_send <= psched_get_time()) {
 502                        __skb_unlink(skb, &sch->q);
 503                        sch->qstats.backlog -= qdisc_pkt_len(skb);
 504
 505#ifdef CONFIG_NET_CLS_ACT
 506                        /*
 507                         * If it's at ingress let's pretend the delay is
 508                         * from the network (tstamp will be updated).
 509                         */
 510                        if (G_TC_FROM(skb->tc_verd) & AT_INGRESS)
 511                                skb->tstamp.tv64 = 0;
 512#endif
 513
                            /*
                             * With a child qdisc the packet is handed over to
                             * it and the next tfifo entry is examined.
                             */
 514                        if (q->qdisc) {
 515                                int err = qdisc_enqueue(skb, q->qdisc);
 516
 517                                if (unlikely(err != NET_XMIT_SUCCESS)) {
 518                                        if (net_xmit_drop_count(err)) {
 519                                                sch->qstats.drops++;
 520                                                qdisc_tree_decrease_qlen(sch, 1);
 521                                        }
 522                                }
 523                                goto tfifo_dequeue;
 524                        }
                            /* also entered by goto from the child dequeue paths below */
 525deliver:
 526                        qdisc_unthrottled(sch);
 527                        qdisc_bstats_update(sch, skb);
 528                        return skb;
 529                }
 530
                    /* head packet is not due yet: serve the child qdisc first */
 531                if (q->qdisc) {
 532                        skb = q->qdisc->ops->dequeue(q->qdisc);
 533                        if (skb)
 534                                goto deliver;
 535                }
                    /* nothing to send now: wake up at the head's send time */
 536                qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send);
 537        }
 538
            /* reached when the tfifo is empty or its head is not yet due */
 539        if (q->qdisc) {
 540                skb = q->qdisc->ops->dequeue(q->qdisc);
 541                if (skb)
 542                        goto deliver;
 543        }
 544        return NULL;
 545}
 546
 547static void netem_reset(struct Qdisc *sch)
 548{
 549        struct netem_sched_data *q = qdisc_priv(sch);
 550
 551        qdisc_reset_queue(sch);
 552        if (q->qdisc)
 553                qdisc_reset(q->qdisc);
 554        qdisc_watchdog_cancel(&q->watchdog);
 555}
 556
 /* dist_free - release a distribution table
  * @d: table to free, may be NULL
  *
  * The table may have been allocated with either kmalloc() or vmalloc()
  * (see get_dist_table()), so the matching release function is chosen
  * from the address.
  */
static void dist_free(struct disttable *d)
{
	if (!d)
		return;

	if (!is_vmalloc_addr(d)) {
		kfree(d);
		return;
	}

	vfree(d);
}
 566
 567/*
 568 * Distribution data is a variable size payload containing
 569 * signed 16 bit values.
 570 */
 571static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
 572{
 573        struct netem_sched_data *q = qdisc_priv(sch);
 574        size_t n = nla_len(attr)/sizeof(__s16);
 575        const __s16 *data = nla_data(attr);
 576        spinlock_t *root_lock;
 577        struct disttable *d;
 578        int i;
 579        size_t s;
 580
 581        if (n > NETEM_DIST_MAX)
 582                return -EINVAL;
 583
 584        s = sizeof(struct disttable) + n * sizeof(s16);
 585        d = kmalloc(s, GFP_KERNEL | __GFP_NOWARN);
 586        if (!d)
 587                d = vmalloc(s);
 588        if (!d)
 589                return -ENOMEM;
 590
 591        d->size = n;
 592        for (i = 0; i < n; i++)
 593                d->table[i] = data[i];
 594
 595        root_lock = qdisc_root_sleeping_lock(sch);
 596
 597        spin_lock_bh(root_lock);
 598        swap(q->delay_dist, d);
 599        spin_unlock_bh(root_lock);
 600
 601        dist_free(d);
 602        return 0;
 603}
 604
 605static void get_correlation(struct Qdisc *sch, const struct nlattr *attr)
 606{
 607        struct netem_sched_data *q = qdisc_priv(sch);
 608        const struct tc_netem_corr *c = nla_data(attr);
 609
 610        init_crandom(&q->delay_cor, c->delay_corr);
 611        init_crandom(&q->loss_cor, c->loss_corr);
 612        init_crandom(&q->dup_cor, c->dup_corr);
 613}
 614
 615static void get_reorder(struct Qdisc *sch, const struct nlattr *attr)
 616{
 617        struct netem_sched_data *q = qdisc_priv(sch);
 618        const struct tc_netem_reorder *r = nla_data(attr);
 619
 620        q->reorder = r->probability;
 621        init_crandom(&q->reorder_cor, r->correlation);
 622}
 623
 624static void get_corrupt(struct Qdisc *sch, const struct nlattr *attr)
 625{
 626        struct netem_sched_data *q = qdisc_priv(sch);
 627        const struct tc_netem_corrupt *r = nla_data(attr);
 628
 629        q->corrupt = r->probability;
 630        init_crandom(&q->corrupt_cor, r->correlation);
 631}
 632
 633static void get_rate(struct Qdisc *sch, const struct nlattr *attr)
 634{
 635        struct netem_sched_data *q = qdisc_priv(sch);
 636        const struct tc_netem_rate *r = nla_data(attr);
 637
 638        q->rate = r->rate;
 639        q->packet_overhead = r->packet_overhead;
 640        q->cell_size = r->cell_size;
 641        if (q->cell_size)
 642                q->cell_size_reciprocal = reciprocal_value(q->cell_size);
 643        q->cell_overhead = r->cell_overhead;
 644}
 645
 646static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr)
 647{
 648        struct netem_sched_data *q = qdisc_priv(sch);
 649        const struct nlattr *la;
 650        int rem;
 651
 652        nla_for_each_nested(la, attr, rem) {
 653                u16 type = nla_type(la);
 654
 655                switch(type) {
 656                case NETEM_LOSS_GI: {
 657                        const struct tc_netem_gimodel *gi = nla_data(la);
 658
 659                        if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
 660                                pr_info("netem: incorrect gi model size\n");
 661                                return -EINVAL;
 662                        }
 663
 664                        q->loss_model = CLG_4_STATES;
 665
 666                        q->clg.state = 1;
 667                        q->clg.a1 = gi->p13;
 668                        q->clg.a2 = gi->p31;
 669                        q->clg.a3 = gi->p32;
 670                        q->clg.a4 = gi->p14;
 671                        q->clg.a5 = gi->p23;
 672                        break;
 673                }
 674
 675                case NETEM_LOSS_GE: {
 676                        const struct tc_netem_gemodel *ge = nla_data(la);
 677
 678                        if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
 679                                pr_info("netem: incorrect ge model size\n");
 680                                return -EINVAL;
 681                        }
 682
 683                        q->loss_model = CLG_GILB_ELL;
 684                        q->clg.state = 1;
 685                        q->clg.a1 = ge->p;
 686                        q->clg.a2 = ge->r;
 687                        q->clg.a3 = ge->h;
 688                        q->clg.a4 = ge->k1;
 689                        break;
 690                }
 691
 692                default:
 693                        pr_info("netem: unknown loss type %u\n", type);
 694                        return -EINVAL;
 695                }
 696        }
 697
 698        return 0;
 699}
 700
 /*
  * Policy for the TCA_NETEM_* attributes that follow the fixed
  * struct tc_netem_qopt header; netem_change() passes it to nla_parse()
  * through parse_attr(). The .len entries name the payload structure of
  * the attribute. TCA_NETEM_DELAY_DIST has no entry here: the size of
  * the distribution table is checked in get_dist_table().
  */
 701static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
 702        [TCA_NETEM_CORR]        = { .len = sizeof(struct tc_netem_corr) },
 703        [TCA_NETEM_REORDER]     = { .len = sizeof(struct tc_netem_reorder) },
 704        [TCA_NETEM_CORRUPT]     = { .len = sizeof(struct tc_netem_corrupt) },
 705        [TCA_NETEM_RATE]        = { .len = sizeof(struct tc_netem_rate) },
 706        [TCA_NETEM_LOSS]        = { .type = NLA_NESTED },
 707        [TCA_NETEM_ECN]         = { .type = NLA_U32 },
 708};
 709
 710static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
 711                      const struct nla_policy *policy, int len)
 712{
 713        int nested_len = nla_len(nla) - NLA_ALIGN(len);
 714
 715        if (nested_len < 0) {
 716                pr_info("netem: invalid attributes len %d\n", nested_len);
 717                return -EINVAL;
 718        }
 719
 720        if (nested_len >= nla_attr_size(0))
 721                return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
 722                                 nested_len, policy);
 723
 724        memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
 725        return 0;
 726}
 727
 728/* Parse netlink message to set options */
 729static int netem_change(struct Qdisc *sch, struct nlattr *opt)
 730{
 731        struct netem_sched_data *q = qdisc_priv(sch);
 732        struct nlattr *tb[TCA_NETEM_MAX + 1];
 733        struct tc_netem_qopt *qopt;
 734        int ret;
 735
 736        if (opt == NULL)
 737                return -EINVAL;
 738
 739        qopt = nla_data(opt);
 740        ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
 741        if (ret < 0)
 742                return ret;
 743
 744        sch->limit = qopt->limit;
 745
 746        q->latency = qopt->latency;
 747        q->jitter = qopt->jitter;
 748        q->limit = qopt->limit;
 749        q->gap = qopt->gap;
 750        q->counter = 0;
 751        q->loss = qopt->loss;
 752        q->duplicate = qopt->duplicate;
 753
 754        /* for compatibility with earlier versions.
 755         * if gap is set, need to assume 100% probability
 756         */
 757        if (q->gap)
 758                q->reorder = ~0;
 759
 760        if (tb[TCA_NETEM_CORR])
 761                get_correlation(sch, tb[TCA_NETEM_CORR]);
 762
 763        if (tb[TCA_NETEM_DELAY_DIST]) {
 764                ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]);
 765                if (ret)
 766                        return ret;
 767        }
 768
 769        if (tb[TCA_NETEM_REORDER])
 770                get_reorder(sch, tb[TCA_NETEM_REORDER]);
 771
 772        if (tb[TCA_NETEM_CORRUPT])
 773                get_corrupt(sch, tb[TCA_NETEM_CORRUPT]);
 774
 775        if (tb[TCA_NETEM_RATE])
 776                get_rate(sch, tb[TCA_NETEM_RATE]);
 777
 778        if (tb[TCA_NETEM_ECN])
 779                q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);
 780
 781        q->loss_model = CLG_RANDOM;
 782        if (tb[TCA_NETEM_LOSS])
 783                ret = get_loss_clg(sch, tb[TCA_NETEM_LOSS]);
 784
 785        return ret;
 786}
 787
 788static int netem_init(struct Qdisc *sch, struct nlattr *opt)
 789{
 790        struct netem_sched_data *q = qdisc_priv(sch);
 791        int ret;
 792
 793        if (!opt)
 794                return -EINVAL;
 795
 796        qdisc_watchdog_init(&q->watchdog, sch);
 797
 798        q->loss_model = CLG_RANDOM;
 799        ret = netem_change(sch, opt);
 800        if (ret)
 801                pr_info("netem: change failed\n");
 802        return ret;
 803}
 804
 805static void netem_destroy(struct Qdisc *sch)
 806{
 807        struct netem_sched_data *q = qdisc_priv(sch);
 808
 809        qdisc_watchdog_cancel(&q->watchdog);
 810        if (q->qdisc)
 811                qdisc_destroy(q->qdisc);
 812        dist_free(q->delay_dist);
 813}
 814
 815static int dump_loss_model(const struct netem_sched_data *q,
 816                           struct sk_buff *skb)
 817{
 818        struct nlattr *nest;
 819
 820        nest = nla_nest_start(skb, TCA_NETEM_LOSS);
 821        if (nest == NULL)
 822                goto nla_put_failure;
 823
 824        switch (q->loss_model) {
 825        case CLG_RANDOM:
 826                /* legacy loss model */
 827                nla_nest_cancel(skb, nest);
 828                return 0;       /* no data */
 829
 830        case CLG_4_STATES: {
 831                struct tc_netem_gimodel gi = {
 832                        .p13 = q->clg.a1,
 833                        .p31 = q->clg.a2,
 834                        .p32 = q->clg.a3,
 835                        .p14 = q->clg.a4,
 836                        .p23 = q->clg.a5,
 837                };
 838
 839                if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
 840                        goto nla_put_failure;
 841                break;
 842        }
 843        case CLG_GILB_ELL: {
 844                struct tc_netem_gemodel ge = {
 845                        .p = q->clg.a1,
 846                        .r = q->clg.a2,
 847                        .h = q->clg.a3,
 848                        .k1 = q->clg.a4,
 849                };
 850
 851                if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
 852                        goto nla_put_failure;
 853                break;
 854        }
 855        }
 856
 857        nla_nest_end(skb, nest);
 858        return 0;
 859
 860nla_put_failure:
 861        nla_nest_cancel(skb, nest);
 862        return -1;
 863}
 864
 865static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
 866{
 867        const struct netem_sched_data *q = qdisc_priv(sch);
 868        struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
 869        struct tc_netem_qopt qopt;
 870        struct tc_netem_corr cor;
 871        struct tc_netem_reorder reorder;
 872        struct tc_netem_corrupt corrupt;
 873        struct tc_netem_rate rate;
 874
 875        qopt.latency = q->latency;
 876        qopt.jitter = q->jitter;
 877        qopt.limit = q->limit;
 878        qopt.loss = q->loss;
 879        qopt.gap = q->gap;
 880        qopt.duplicate = q->duplicate;
 881        if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
 882                goto nla_put_failure;
 883
 884        cor.delay_corr = q->delay_cor.rho;
 885        cor.loss_corr = q->loss_cor.rho;
 886        cor.dup_corr = q->dup_cor.rho;
 887        if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
 888                goto nla_put_failure;
 889
 890        reorder.probability = q->reorder;
 891        reorder.correlation = q->reorder_cor.rho;
 892        if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
 893                goto nla_put_failure;
 894
 895        corrupt.probability = q->corrupt;
 896        corrupt.correlation = q->corrupt_cor.rho;
 897        if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
 898                goto nla_put_failure;
 899
 900        rate.rate = q->rate;
 901        rate.packet_overhead = q->packet_overhead;
 902        rate.cell_size = q->cell_size;
 903        rate.cell_overhead = q->cell_overhead;
 904        if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
 905                goto nla_put_failure;
 906
 907        if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
 908                goto nla_put_failure;
 909
 910        if (dump_loss_model(q, skb) != 0)
 911                goto nla_put_failure;
 912
 913        return nla_nest_end(skb, nla);
 914
 915nla_put_failure:
 916        nlmsg_trim(skb, nla);
 917        return -1;
 918}
 919
 920static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
 921                          struct sk_buff *skb, struct tcmsg *tcm)
 922{
 923        struct netem_sched_data *q = qdisc_priv(sch);
 924
 925        if (cl != 1 || !q->qdisc)       /* only one class */
 926                return -ENOENT;
 927
 928        tcm->tcm_handle |= TC_H_MIN(1);
 929        tcm->tcm_info = q->qdisc->handle;
 930
 931        return 0;
 932}
 933
 934static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 935                     struct Qdisc **old)
 936{
 937        struct netem_sched_data *q = qdisc_priv(sch);
 938
 939        sch_tree_lock(sch);
 940        *old = q->qdisc;
 941        q->qdisc = new;
 942        if (*old) {
 943                qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
 944                qdisc_reset(*old);
 945        }
 946        sch_tree_unlock(sch);
 947
 948        return 0;
 949}
 950
 951static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
 952{
 953        struct netem_sched_data *q = qdisc_priv(sch);
 954        return q->qdisc;
 955}
 956
 957static unsigned long netem_get(struct Qdisc *sch, u32 classid)
 958{
 959        return 1;
 960}
 961
 962static void netem_put(struct Qdisc *sch, unsigned long arg)
 963{
 964}
 965
 966static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
 967{
 968        if (!walker->stop) {
 969                if (walker->count >= walker->skip)
 970                        if (walker->fn(sch, 1, walker) < 0) {
 971                                walker->stop = 1;
 972                                return;
 973                        }
 974                walker->count++;
 975        }
 976}
 977
 978static const struct Qdisc_class_ops netem_class_ops = {
 979        .graft          =       netem_graft,
 980        .leaf           =       netem_leaf,
 981        .get            =       netem_get,
 982        .put            =       netem_put,
 983        .walk           =       netem_walk,
 984        .dump           =       netem_dump_class,
 985};
 986
 987static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
 988        .id             =       "netem",
 989        .cl_ops         =       &netem_class_ops,
 990        .priv_size      =       sizeof(struct netem_sched_data),
 991        .enqueue        =       netem_enqueue,
 992        .dequeue        =       netem_dequeue,
 993        .peek           =       qdisc_peek_dequeued,
 994        .drop           =       netem_drop,
 995        .init           =       netem_init,
 996        .reset          =       netem_reset,
 997        .destroy        =       netem_destroy,
 998        .change         =       netem_change,
 999        .dump           =       netem_dump,
1000        .owner          =       THIS_MODULE,

1001};
1002
1003
1004static int __init netem_module_init(void)
1005{
1006        pr_info("netem: version " VERSION "\n");
1007        return register_qdisc(&netem_qdisc_ops);
1008}
1009static void __exit netem_module_exit(void)
1010{
1011        unregister_qdisc(&netem_qdisc_ops);
1012}
1013module_init(netem_module_init)
1014module_exit(netem_module_exit)
1015MODULE_LICENSE("GPL");
1016