linux/net/core/gen_estimator.c
/*
 * net/core/gen_estimator.c     Simple rate estimator.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Changes:
 *              Jamal Hadi Salim - moved it to net/core and reshuffled
 *              names to make it usable in general net subsystem.
 */

#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <net/sock.h>
#include <net/gen_stats.h>

/*
   This code is NOT intended to be used for statistics collection,
   its purpose is to provide a base for statistical multiplexing
   for controlled load service.
   If you need only statistics, run a user level daemon which
   periodically reads byte counters.

   Unfortunately, rate estimation is not a very easy task.
   For example, I did not find a simple way to estimate the current
   peak rate and even failed to formulate the problem 8)8)

   So I preferred not to build an estimator into the scheduler,
   but to run this task separately.
   Ideally, it should be kernel thread(s), but for now it runs
   from timers, which puts an apparent upper bound on the number of
   rated flows, has minimal overhead for a small number of flows,
   but is enough to handle controlled load service and sets of
   aggregates.

   We measure rate over A=(1<<interval) seconds and evaluate EWMA:

   avrate = avrate*(1-W) + rate*W

   where W is chosen as negative power of 2: W = 2^(-ewma_log)

   The resulting time constant is:

   T = A/(-ln(1-W))


   NOTES.

   * avbps and avpps are scaled by 2^5.
   * both values are reported via struct gnet_stats_rate_est64 as
     64 bit unsigned values; the historical 32 bit report limited
     bps to a max speed of 34360Mbit/sec.
   * Minimal interval is HZ/4=250msec (it is the greatest common divisor
     for HZ=100 and HZ=1024 8)), maximal interval
     is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals
     are too expensive, longer ones can be implemented
     at user level painlessly.
 */
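
/* A worked example of the time constant above (illustrative numbers,
 * not mandated anywhere): with interval = 2 the measurement period is
 * A = (1<<2) = 4 seconds, and with ewma_log = 3 the weight is
 * W = 2^-3 = 1/8.  Then:
 *
 *      T = A/(-ln(1-W)) = 4/(-ln(7/8)) ~= 4/0.1335 ~= 30 seconds,
 *
 * which for small W is close to the approximation
 * T ~= A*2^ewma_log = 4*8 = 32 seconds.
 */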

#define EST_MAX_INTERVAL        5

struct gen_estimator
{
        struct list_head        list;
        struct gnet_stats_basic_packed  *bstats;
        struct gnet_stats_rate_est64    *rate_est;
        spinlock_t              *stats_lock;
        seqcount_t              *running;
        int                     ewma_log;
        u32                     last_packets;
        unsigned long           avpps;
        u64                     last_bytes;
        u64                     avbps;
        struct rcu_head         e_rcu;
        struct rb_node          node;
        struct gnet_stats_basic_cpu __percpu *cpu_bstats;
        struct rcu_head         head;
};

struct gen_estimator_head
{
        struct timer_list       timer;
        struct list_head        list;
};

static struct gen_estimator_head elist[EST_MAX_INTERVAL+1];

/* Protects against NULL dereference of e->bstats while est_timer() runs */
static DEFINE_RWLOCK(est_lock);

/* The rbtree lets gen_kill_estimator() locate entries without scanning
 * every list; this protects against soft lockup during large deletion.
 */
static struct rb_root est_root = RB_ROOT;
static DEFINE_SPINLOCK(est_tree_lock);

static void est_timer(unsigned long arg)
{
        int idx = (int)arg;
        struct gen_estimator *e;

        rcu_read_lock();
        list_for_each_entry_rcu(e, &elist[idx].list, list) {
                struct gnet_stats_basic_packed b = {0};
                unsigned long rate;
                u64 brate;

                if (e->stats_lock)
                        spin_lock(e->stats_lock);
                read_lock(&est_lock);
                if (e->bstats == NULL)
                        goto skip;

                __gnet_stats_copy_basic(e->running, &b, e->cpu_bstats, e->bstats);

                /* The sampling period is 2^(idx-2) seconds, so dividing
                 * the byte delta by the period and scaling by 2^5 is a
                 * single shift by (7 - idx).
                 */
                brate = (b.bytes - e->last_bytes)<<(7 - idx);
                e->last_bytes = b.bytes;
                e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log);
                WRITE_ONCE(e->rate_est->bps, (e->avbps + 0xF) >> 5);

                rate = b.packets - e->last_packets;
                rate <<= (7 - idx);
                e->last_packets = b.packets;
                e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log);
                WRITE_ONCE(e->rate_est->pps, (e->avpps + 0xF) >> 5);
skip:
                read_unlock(&est_lock);
                if (e->stats_lock)
                        spin_unlock(e->stats_lock);
        }

        if (!list_empty(&elist[idx].list))
                mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx));
        rcu_read_unlock();
}
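
/* The two updates in est_timer() above are the fixed-point form of the
 * EWMA from the header comment: with W = 2^-ewma_log,
 *
 *      avbps += (brate >> ewma_log) - (avbps >> ewma_log)
 *
 * computes avbps = avbps*(1-W) + brate*W using shifts instead of
 * multiplications; the (value + 0xF) >> 5 on the WRITE_ONCE() lines
 * then removes the 2^5 scaling with rounding before the result is
 * published.
 */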

static void gen_add_node(struct gen_estimator *est)
{
        struct rb_node **p = &est_root.rb_node, *parent = NULL;

        while (*p) {
                struct gen_estimator *e;

                parent = *p;
                e = rb_entry(parent, struct gen_estimator, node);

                if (est->bstats > e->bstats)
                        p = &parent->rb_right;
                else
                        p = &parent->rb_left;
        }
        rb_link_node(&est->node, parent, p);
        rb_insert_color(&est->node, &est_root);
}

static
struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats,
                                    const struct gnet_stats_rate_est64 *rate_est)
{
        struct rb_node *p = est_root.rb_node;

        while (p) {
                struct gen_estimator *e;

                e = rb_entry(p, struct gen_estimator, node);

                if (bstats > e->bstats)
                        p = p->rb_right;
                else if (bstats < e->bstats || rate_est != e->rate_est)
                        p = p->rb_left;
                else
                        return e;
        }
        return NULL;
}
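
/* A note on the tree above: nodes are ordered by the bstats pointer
 * alone, and gen_add_node() sends duplicates to the left, so several
 * estimators sharing one bstats block can coexist.  gen_find_node()
 * keeps walking left on a bstats match until rate_est also matches,
 * which is why gen_kill_estimator() below loops until no match
 * remains.
 */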

/**
 * gen_new_estimator - create a new rate estimator
 * @bstats: basic statistics
 * @cpu_bstats: bstats per cpu
 * @rate_est: rate estimator statistics
 * @stats_lock: statistics lock
 * @running: qdisc running seqcount
 * @opt: rate estimator configuration TLV
 *
 * Creates a new rate estimator with &bstats as source and &rate_est
 * as destination. A new timer with the interval specified in the
 * configuration TLV is created. Upon each interval, the latest statistics
 * will be read from &bstats and the estimated rate will be stored in
 * &rate_est with the statistics lock grabbed during this period.
 *
 * Returns 0 on success or a negative error code.
 */
int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
                      struct gnet_stats_basic_cpu __percpu *cpu_bstats,
                      struct gnet_stats_rate_est64 *rate_est,
                      spinlock_t *stats_lock,
                      seqcount_t *running,
                      struct nlattr *opt)
{
        struct gen_estimator *est;
        struct gnet_estimator *parm = nla_data(opt);
        struct gnet_stats_basic_packed b = {0};
        int idx;

        if (nla_len(opt) < sizeof(*parm))
                return -EINVAL;

        /* interval -2..3 maps to idx 0..5, i.e. periods of 250msec..8sec */
        if (parm->interval < -2 || parm->interval > 3)
                return -EINVAL;

        est = kzalloc(sizeof(*est), GFP_KERNEL);
        if (est == NULL)
                return -ENOBUFS;

        __gnet_stats_copy_basic(running, &b, cpu_bstats, bstats);

        idx = parm->interval + 2;
        est->bstats = bstats;
        est->rate_est = rate_est;
        est->stats_lock = stats_lock;
        est->running  = running;
        est->ewma_log = parm->ewma_log;
        est->last_bytes = b.bytes;
        est->avbps = rate_est->bps<<5;
        est->last_packets = b.packets;
        est->avpps = rate_est->pps<<5;  /* scaled by 2^5, like avbps */
        est->cpu_bstats = cpu_bstats;

        spin_lock_bh(&est_tree_lock);
        if (!elist[idx].timer.function) {
                INIT_LIST_HEAD(&elist[idx].list);
                setup_timer(&elist[idx].timer, est_timer, idx);
        }

        if (list_empty(&elist[idx].list))
                mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx));

        list_add_rcu(&est->list, &elist[idx].list);
        gen_add_node(est);
        spin_unlock_bh(&est_tree_lock);

        return 0;
}
EXPORT_SYMBOL(gen_new_estimator);
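
/* An illustrative sketch (hypothetical caller, not code from this
 * file): a qdisc class that owns bstats/rate_est would typically
 * attach an estimator while parsing a netlink TCA_RATE attribute,
 * whose payload is the struct gnet_estimator validated above:
 *
 *      if (tca[TCA_RATE]) {
 *              err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
 *                                      NULL,
 *                                      qdisc_root_sleeping_running(sch),
 *                                      tca[TCA_RATE]);
 *              if (err)
 *                      return err;
 *      }
 *
 * "cl", "sch" and "tca" stand in for the caller's own class, qdisc
 * and attribute table.
 */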

/**
 * gen_kill_estimator - remove a rate estimator
 * @bstats: basic statistics
 * @rate_est: rate estimator statistics
 *
 * Removes the rate estimator specified by &bstats and &rate_est.
 *
 * Note: Caller should respect an RCU grace period before freeing stats_lock
 */
void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
                        struct gnet_stats_rate_est64 *rate_est)
{
        struct gen_estimator *e;

        spin_lock_bh(&est_tree_lock);
        while ((e = gen_find_node(bstats, rate_est))) {
                rb_erase(&e->node, &est_root);

                /* Clearing e->bstats under est_lock makes a concurrent
                 * est_timer() skip this entry instead of dereferencing
                 * soon-to-be-freed statistics.
                 */
                write_lock(&est_lock);
                e->bstats = NULL;
                write_unlock(&est_lock);

                list_del_rcu(&e->list);
                kfree_rcu(e, e_rcu);
        }
        spin_unlock_bh(&est_tree_lock);
}
EXPORT_SYMBOL(gen_kill_estimator);
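
/* The matching teardown (same hypothetical caller as the sketch after
 * gen_new_estimator() above) is simply:
 *
 *      gen_kill_estimator(&cl->bstats, &cl->rate_est);
 *
 * after which, per the note above, the caller still owes an RCU grace
 * period before freeing what the estimator pointed at.
 */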

/**
 * gen_replace_estimator - replace rate estimator configuration
 * @bstats: basic statistics
 * @cpu_bstats: bstats per cpu
 * @rate_est: rate estimator statistics
 * @stats_lock: statistics lock
 * @running: qdisc running seqcount (might be NULL)
 * @opt: rate estimator configuration TLV
 *
 * Replaces the configuration of a rate estimator by calling
 * gen_kill_estimator() and gen_new_estimator().
 *
 * Returns 0 on success or a negative error code.
 */
int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
                          struct gnet_stats_basic_cpu __percpu *cpu_bstats,
                          struct gnet_stats_rate_est64 *rate_est,
                          spinlock_t *stats_lock,
                          seqcount_t *running, struct nlattr *opt)
{
        gen_kill_estimator(bstats, rate_est);
        return gen_new_estimator(bstats, cpu_bstats, rate_est,
                                 stats_lock, running, opt);
}
EXPORT_SYMBOL(gen_replace_estimator);

/**
 * gen_estimator_active - test if estimator is currently in use
 * @bstats: basic statistics
 * @rate_est: rate estimator statistics
 *
 * Returns true if estimator is active, and false if not.
 */
bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
                          const struct gnet_stats_rate_est64 *rate_est)
{
        bool res;

        ASSERT_RTNL();

        spin_lock_bh(&est_tree_lock);
        res = gen_find_node(bstats, rate_est) != NULL;
        spin_unlock_bh(&est_tree_lock);

        return res;
}
EXPORT_SYMBOL(gen_estimator_active);
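
/* Illustrative use (hypothetical caller, as in the earlier sketches):
 * a dump or configuration path holding the RTNL lock, as the
 * ASSERT_RTNL() above requires, can check whether rate_est is being
 * refreshed before trusting its contents:
 *
 *      if (gen_estimator_active(&cl->bstats, &cl->rate_est))
 *              ;       // cl->rate_est.bps/pps are live est_timer() output
 */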