linux/net/core/gen_estimator.c
<<
>>
Prefs
   1/*
   2 * net/sched/gen_estimator.c    Simple rate estimator.
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10 *
  11 * Changes:
  12 *              Jamal Hadi Salim - moved it to net/core and reshulfed
  13 *              names to make it usable in general net subsystem.
  14 */
  15
  16#include <asm/uaccess.h>
  17#include <linux/bitops.h>
  18#include <linux/module.h>
  19#include <linux/types.h>
  20#include <linux/kernel.h>
  21#include <linux/jiffies.h>
  22#include <linux/string.h>
  23#include <linux/mm.h>
  24#include <linux/socket.h>
  25#include <linux/sockios.h>
  26#include <linux/in.h>
  27#include <linux/errno.h>
  28#include <linux/interrupt.h>
  29#include <linux/netdevice.h>
  30#include <linux/skbuff.h>
  31#include <linux/rtnetlink.h>
  32#include <linux/init.h>
  33#include <linux/rbtree.h>
  34#include <linux/slab.h>
  35#include <net/sock.h>
  36#include <net/gen_stats.h>
  37
  38/*
  39   This code is NOT intended to be used for statistics collection,
  40   its purpose is to provide a base for statistical multiplexing
  41   for controlled load service.
  42   If you need only statistics, run a user level daemon which
  43   periodically reads byte counters.
  44
  45   Unfortunately, rate estimation is not a very easy task.
  46   F.e. I did not find a simple way to estimate the current peak rate
  47   and even failed to formulate the problem 8)8)
  48
  49   So I preferred not to built an estimator into the scheduler,
  50   but run this task separately.
  51   Ideally, it should be kernel thread(s), but for now it runs
  52   from timers, which puts apparent top bounds on the number of rated
  53   flows, has minimal overhead on small, but is enough
  54   to handle controlled load service, sets of aggregates.
  55
  56   We measure rate over A=(1<<interval) seconds and evaluate EWMA:
  57
  58   avrate = avrate*(1-W) + rate*W
  59
  60   where W is chosen as negative power of 2: W = 2^(-ewma_log)
  61
  62   The resulting time constant is:
  63
  64   T = A/(-ln(1-W))
  65
  66
  67   NOTES.
  68
  69   * avbps is scaled by 2^5, avpps is scaled by 2^10.
  70   * both values are reported as 32 bit unsigned values. bps can
  71     overflow for fast links : max speed being 34360Mbit/sec
  72   * Minimal interval is HZ/4=250msec (it is the greatest common divisor
  73     for HZ=100 and HZ=1024 8)), maximal interval
  74     is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals
  75     are too expensive, longer ones can be implemented
  76     at user level painlessly.
  77 */
  78
  79#define EST_MAX_INTERVAL        5
  80
  81struct gen_estimator
  82{
  83        struct list_head        list;
  84        struct gnet_stats_basic_packed  *bstats;
  85        struct gnet_stats_rate_est64    *rate_est;
  86        spinlock_t              *stats_lock;
  87        int                     ewma_log;
  88        u64                     last_bytes;
  89        u64                     avbps;
  90        u32                     last_packets;
  91        u32                     avpps;
  92        struct rcu_head         e_rcu;
  93        struct rb_node          node;
  94};
  95
  96struct gen_estimator_head
  97{
  98        struct timer_list       timer;
  99        struct list_head        list;
 100};
 101
 102static struct gen_estimator_head elist[EST_MAX_INTERVAL+1];
 103
 104/* Protects against NULL dereference */
 105static DEFINE_RWLOCK(est_lock);
 106
 107/* Protects against soft lockup during large deletion */
 108static struct rb_root est_root = RB_ROOT;
 109static DEFINE_SPINLOCK(est_tree_lock);
 110
 111static void est_timer(unsigned long arg)
 112{
 113        int idx = (int)arg;
 114        struct gen_estimator *e;
 115
 116        rcu_read_lock();
 117        list_for_each_entry_rcu(e, &elist[idx].list, list) {
 118                u64 nbytes;
 119                u64 brate;
 120                u32 npackets;
 121                u32 rate;
 122
 123                spin_lock(e->stats_lock);
 124                read_lock(&est_lock);
 125                if (e->bstats == NULL)
 126                        goto skip;
 127
 128                nbytes = e->bstats->bytes;
 129                npackets = e->bstats->packets;
 130                brate = (nbytes - e->last_bytes)<<(7 - idx);
 131                e->last_bytes = nbytes;
 132                e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log);
 133                e->rate_est->bps = (e->avbps+0xF)>>5;
 134
 135                rate = (npackets - e->last_packets)<<(12 - idx);
 136                e->last_packets = npackets;
 137                e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log);
 138                e->rate_est->pps = (e->avpps+0x1FF)>>10;
 139skip:
 140                read_unlock(&est_lock);
 141                spin_unlock(e->stats_lock);
 142        }
 143
 144        if (!list_empty(&elist[idx].list))
 145                mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx));
 146        rcu_read_unlock();
 147}
 148
 149static void gen_add_node(struct gen_estimator *est)
 150{
 151        struct rb_node **p = &est_root.rb_node, *parent = NULL;
 152
 153        while (*p) {
 154                struct gen_estimator *e;
 155
 156                parent = *p;
 157                e = rb_entry(parent, struct gen_estimator, node);
 158
 159                if (est->bstats > e->bstats)
 160                        p = &parent->rb_right;
 161                else
 162                        p = &parent->rb_left;
 163        }
 164        rb_link_node(&est->node, parent, p);
 165        rb_insert_color(&est->node, &est_root);
 166}
 167
 168static
 169struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats,
 170                                    const struct gnet_stats_rate_est64 *rate_est)
 171{
 172        struct rb_node *p = est_root.rb_node;
 173
 174        while (p) {
 175                struct gen_estimator *e;
 176
 177                e = rb_entry(p, struct gen_estimator, node);
 178
 179                if (bstats > e->bstats)
 180                        p = p->rb_right;
 181                else if (bstats < e->bstats || rate_est != e->rate_est)
 182                        p = p->rb_left;
 183                else
 184                        return e;
 185        }
 186        return NULL;
 187}
 188
 189/**
 190 * gen_new_estimator - create a new rate estimator
 191 * @bstats: basic statistics
 192 * @rate_est: rate estimator statistics
 193 * @stats_lock: statistics lock
 194 * @opt: rate estimator configuration TLV
 195 *
 196 * Creates a new rate estimator with &bstats as source and &rate_est
 197 * as destination. A new timer with the interval specified in the
 198 * configuration TLV is created. Upon each interval, the latest statistics
 199 * will be read from &bstats and the estimated rate will be stored in
 200 * &rate_est with the statistics lock grabed during this period.
 201 *
 202 * Returns 0 on success or a negative error code.
 203 *
 204 */
 205int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
 206                      struct gnet_stats_rate_est64 *rate_est,
 207                      spinlock_t *stats_lock,
 208                      struct nlattr *opt)
 209{
 210        struct gen_estimator *est;
 211        struct gnet_estimator *parm = nla_data(opt);
 212        int idx;
 213
 214        if (nla_len(opt) < sizeof(*parm))
 215                return -EINVAL;
 216
 217        if (parm->interval < -2 || parm->interval > 3)
 218                return -EINVAL;
 219
 220        est = kzalloc(sizeof(*est), GFP_KERNEL);
 221        if (est == NULL)
 222                return -ENOBUFS;
 223
 224        idx = parm->interval + 2;
 225        est->bstats = bstats;
 226        est->rate_est = rate_est;
 227        est->stats_lock = stats_lock;
 228        est->ewma_log = parm->ewma_log;
 229        est->last_bytes = bstats->bytes;
 230        est->avbps = rate_est->bps<<5;
 231        est->last_packets = bstats->packets;
 232        est->avpps = rate_est->pps<<10;
 233
 234        spin_lock_bh(&est_tree_lock);
 235        if (!elist[idx].timer.function) {
 236                INIT_LIST_HEAD(&elist[idx].list);
 237                setup_timer(&elist[idx].timer, est_timer, idx);
 238        }
 239
 240        if (list_empty(&elist[idx].list))
 241                mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx));
 242
 243        list_add_rcu(&est->list, &elist[idx].list);
 244        gen_add_node(est);
 245        spin_unlock_bh(&est_tree_lock);
 246
 247        return 0;
 248}
 249EXPORT_SYMBOL(gen_new_estimator);
 250
 251/**
 252 * gen_kill_estimator - remove a rate estimator
 253 * @bstats: basic statistics
 254 * @rate_est: rate estimator statistics
 255 *
 256 * Removes the rate estimator specified by &bstats and &rate_est.
 257 *
 258 * Note : Caller should respect an RCU grace period before freeing stats_lock
 259 */
 260void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
 261                        struct gnet_stats_rate_est64 *rate_est)
 262{
 263        struct gen_estimator *e;
 264
 265        spin_lock_bh(&est_tree_lock);
 266        while ((e = gen_find_node(bstats, rate_est))) {
 267                rb_erase(&e->node, &est_root);
 268
 269                write_lock(&est_lock);
 270                e->bstats = NULL;
 271                write_unlock(&est_lock);
 272
 273                list_del_rcu(&e->list);
 274                kfree_rcu(e, e_rcu);
 275        }
 276        spin_unlock_bh(&est_tree_lock);
 277}
 278EXPORT_SYMBOL(gen_kill_estimator);
 279
 280/**
 281 * gen_replace_estimator - replace rate estimator configuration
 282 * @bstats: basic statistics
 283 * @rate_est: rate estimator statistics
 284 * @stats_lock: statistics lock
 285 * @opt: rate estimator configuration TLV
 286 *
 287 * Replaces the configuration of a rate estimator by calling
 288 * gen_kill_estimator() and gen_new_estimator().
 289 *
 290 * Returns 0 on success or a negative error code.
 291 */
 292int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
 293                          struct gnet_stats_rate_est64 *rate_est,
 294                          spinlock_t *stats_lock, struct nlattr *opt)
 295{
 296        gen_kill_estimator(bstats, rate_est);
 297        return gen_new_estimator(bstats, rate_est, stats_lock, opt);
 298}
 299EXPORT_SYMBOL(gen_replace_estimator);
 300
 301/**
 302 * gen_estimator_active - test if estimator is currently in use
 303 * @bstats: basic statistics
 304 * @rate_est: rate estimator statistics
 305 *
 306 * Returns true if estimator is active, and false if not.
 307 */
 308bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
 309                          const struct gnet_stats_rate_est64 *rate_est)
 310{
 311        bool res;
 312
 313        ASSERT_RTNL();
 314
 315        spin_lock_bh(&est_tree_lock);
 316        res = gen_find_node(bstats, rate_est) != NULL;
 317        spin_unlock_bh(&est_tree_lock);
 318
 319        return res;
 320}
 321EXPORT_SYMBOL(gen_estimator_active);
 322