/* flow.c: Generic flow cache.
 *
 * Copyright (C) 2003 Alexey N. Kuznetsov (kuznet@ms2.inr.ac.ru)
 * Copyright (C) 2003 David S. Miller (davem@redhat.com)
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/list.h>
#include <linux/jhash.h>
#include <linux/interrupt.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/smp.h>
#include <linux/completion.h>
#include <linux/percpu.h>
#include <linux/bitops.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/mutex.h>
#include <net/flow.h>
#include <linux/atomic.h>
#include <linux/security.h>
#include <net/net_namespace.h>

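/* A flow_cache_entry lives on exactly one list at a time: while it is
 * resident it sits on a per-cpu hash chain via u.hlist, and once it is
 * evicted it moves to a garbage-collection list via u.gc_list, hence
 * the union below.  ->genid records net->flow_cache_genid at resolve
 * time, so bumping the global generation counter lazily invalidates
 * every cached object without walking the hash tables.
 */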
struct flow_cache_entry {
        union {
                struct hlist_node       hlist;
                struct list_head        gc_list;
        } u;
        struct net                      *net;
        u16                             family;
        u8                              dir;
        u32                             genid;
        struct flowi                    key;
        struct flow_cache_object        *object;
};

struct flow_flush_info {
        struct flow_cache               *cache;
        atomic_t                        cpuleft;
        struct completion               completion;
};

static struct kmem_cache *flow_cachep __read_mostly;

#define flow_cache_hash_size(cache)     (1 << (cache)->hash_shift)
#define FLOW_HASH_RND_PERIOD            (10 * 60 * HZ)

static void flow_cache_new_hashrnd(unsigned long arg)
{
        struct flow_cache *fc = (void *) arg;
        int i;

        for_each_possible_cpu(i)
                per_cpu_ptr(fc->percpu, i)->hash_rnd_recalc = 1;

        fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
        add_timer(&fc->rnd_timer);
}

static int flow_entry_valid(struct flow_cache_entry *fle,
                                struct net *net)
{
        if (atomic_read(&net->flow_cache_genid) != fle->genid)
                return 0;
        if (fle->object && !fle->object->ops->check(fle->object))
                return 0;
        return 1;
}

static void flow_entry_kill(struct flow_cache_entry *fle,
                                struct net *net)
{
        if (fle->object)
                fle->object->ops->delete(fle->object);
        kmem_cache_free(flow_cachep, fle);
}

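/* Evicted entries are not freed in place: __flow_cache_shrink() and the
 * flush tasklet can run in softirq context, so they only unhook entries
 * and queue them on net->flow_cache_gc_list, leaving the ->delete()
 * callbacks and kmem_cache frees to this work item in process context.
 */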
static void flow_cache_gc_task(struct work_struct *work)
{
        struct list_head gc_list;
        struct flow_cache_entry *fce, *n;
        struct net *net = container_of(work, struct net, flow_cache_gc_work);

        INIT_LIST_HEAD(&gc_list);
        spin_lock_bh(&net->flow_cache_gc_lock);
        list_splice_tail_init(&net->flow_cache_gc_list, &gc_list);
        spin_unlock_bh(&net->flow_cache_gc_lock);

        list_for_each_entry_safe(fce, n, &gc_list, u.gc_list)
                flow_entry_kill(fce, net);
}

static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp,
                                     int deleted, struct list_head *gc_list,
                                     struct net *net)
{
        if (deleted) {
                fcp->hash_count -= deleted;
                spin_lock_bh(&net->flow_cache_gc_lock);
                list_splice_tail(gc_list, &net->flow_cache_gc_list);
                spin_unlock_bh(&net->flow_cache_gc_lock);
                schedule_work(&net->flow_cache_gc_work);
        }
}

static void __flow_cache_shrink(struct flow_cache *fc,
                                struct flow_cache_percpu *fcp,
                                int shrink_to)
{
        struct flow_cache_entry *fle;
        struct hlist_node *tmp;
        LIST_HEAD(gc_list);
        int i, deleted = 0;
        struct net *net = container_of(fc, struct net, flow_cache_global);

        for (i = 0; i < flow_cache_hash_size(fc); i++) {
                int saved = 0;

                hlist_for_each_entry_safe(fle, tmp,
                                          &fcp->hash_table[i], u.hlist) {
                        if (saved < shrink_to &&
                            flow_entry_valid(fle, net)) {
                                saved++;
                        } else {
                                deleted++;
                                hlist_del(&fle->u.hlist);
                                list_add_tail(&fle->u.gc_list, &gc_list);
                        }
                }
        }

        flow_cache_queue_garbage(fcp, deleted, &gc_list, net);
}

static void flow_cache_shrink(struct flow_cache *fc,
                              struct flow_cache_percpu *fcp)
{
        int shrink_to = fc->low_watermark / flow_cache_hash_size(fc);

        __flow_cache_shrink(fc, fcp, shrink_to);
}

static void flow_new_hash_rnd(struct flow_cache *fc,
                              struct flow_cache_percpu *fcp)
{
        get_random_bytes(&fcp->hash_rnd, sizeof(u32));
        fcp->hash_rnd_recalc = 0;
        __flow_cache_shrink(fc, fcp, 0);
}

static u32 flow_hash_code(struct flow_cache *fc,
                          struct flow_cache_percpu *fcp,
                          const struct flowi *key,
                          size_t keysize)
{
        const u32 *k = (const u32 *) key;
        const u32 length = keysize * sizeof(flow_compare_t) / sizeof(u32);

        return jhash2(k, length, fcp->hash_rnd)
                & (flow_cache_hash_size(fc) - 1);
}

/* I hear what you're saying, use memcmp.  But memcmp cannot make
 * important assumptions that we can here, such as alignment.
 */
static int flow_key_compare(const struct flowi *key1, const struct flowi *key2,
                            size_t keysize)
{
        const flow_compare_t *k1, *k1_lim, *k2;

        k1 = (const flow_compare_t *) key1;
        k1_lim = k1 + keysize;

        k2 = (const flow_compare_t *) key2;

        do {
                if (*k1++ != *k2++)
                        return 1;
        } while (k1 < k1_lim);

        return 0;
}
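
/* Note that keysize is expressed in flow_compare_t units, as returned
 * by flow_key_size(family) in flow_cache_lookup() below, so both
 * flow_hash_code() and flow_key_compare() walk the flowi a word at a
 * time; the hash table size is a power of two, so masking with
 * (flow_cache_hash_size(fc) - 1) selects the bucket.
 */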

struct flow_cache_object *
flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir,
                  flow_resolve_t resolver, void *ctx)
{
        struct flow_cache *fc = &net->flow_cache_global;
        struct flow_cache_percpu *fcp;
        struct flow_cache_entry *fle, *tfle;
        struct flow_cache_object *flo;
        size_t keysize;
        unsigned int hash;

        local_bh_disable();
        fcp = this_cpu_ptr(fc->percpu);

        fle = NULL;
        flo = NULL;

        keysize = flow_key_size(family);
        if (!keysize)
                goto nocache;

        /* Packet really early in init?  Making flow_cache_init a
         * pre-smp initcall would solve this.  --RR */
        if (!fcp->hash_table)
                goto nocache;

        if (fcp->hash_rnd_recalc)
                flow_new_hash_rnd(fc, fcp);

        hash = flow_hash_code(fc, fcp, key, keysize);
        hlist_for_each_entry(tfle, &fcp->hash_table[hash], u.hlist) {
                if (tfle->net == net &&
                    tfle->family == family &&
                    tfle->dir == dir &&
                    flow_key_compare(key, &tfle->key, keysize) == 0) {
                        fle = tfle;
                        break;
                }
        }

        if (unlikely(!fle)) {
                if (fcp->hash_count > fc->high_watermark)
                        flow_cache_shrink(fc, fcp);

                fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC);
                if (fle) {
                        fle->net = net;
                        fle->family = family;
                        fle->dir = dir;
                        memcpy(&fle->key, key, keysize * sizeof(flow_compare_t));
                        fle->object = NULL;
                        hlist_add_head(&fle->u.hlist, &fcp->hash_table[hash]);
                        fcp->hash_count++;
                }
        } else if (likely(fle->genid == atomic_read(&net->flow_cache_genid))) {
                flo = fle->object;
                if (!flo)
                        goto ret_object;
                flo = flo->ops->get(flo);
                if (flo)
                        goto ret_object;
        } else if (fle->object) {
                flo = fle->object;
                flo->ops->delete(flo);
                fle->object = NULL;
        }

nocache:
        flo = NULL;
        if (fle) {
                flo = fle->object;
                fle->object = NULL;
        }
        flo = resolver(net, key, family, dir, flo, ctx);
        if (fle) {
                fle->genid = atomic_read(&net->flow_cache_genid);
                if (!IS_ERR(flo))
                        fle->object = flo;
                else
                        fle->genid--;
        } else {
                if (!IS_ERR_OR_NULL(flo))
                        flo->ops->delete(flo);
        }
ret_object:
        local_bh_enable();
        return flo;
}
EXPORT_SYMBOL(flow_cache_lookup);
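
/* Illustrative sketch (not part of this file): a caller resolves a key
 * through the cache by supplying a flow_resolve_t callback, which is
 * invoked on a miss or when the cached object is stale; the resolver is
 * handed the old object (if any) to reuse or drop, and its return value
 * is cached for subsequent lookups.  my_resolver()/my_get_object() are
 * hypothetical names; in practice the xfrm code is the user of this
 * interface.
 *
 *	static struct flow_cache_object *
 *	my_resolver(struct net *net, const struct flowi *key, u16 family,
 *		    u8 dir, struct flow_cache_object *old_obj, void *ctx)
 *	{
 *		if (old_obj)
 *			old_obj->ops->delete(old_obj);
 *		return my_get_object(net, key, family, dir, ctx);
 *	}
 *
 *	obj = flow_cache_lookup(net, &fl, family, dir, my_resolver, ctx);
 *	if (IS_ERR(obj))
 *		return PTR_ERR(obj);
 */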

static void flow_cache_flush_tasklet(unsigned long data)
{
        struct flow_flush_info *info = (void *)data;
        struct flow_cache *fc = info->cache;
        struct flow_cache_percpu *fcp;
        struct flow_cache_entry *fle;
        struct hlist_node *tmp;
        LIST_HEAD(gc_list);
        int i, deleted = 0;
        struct net *net = container_of(fc, struct net, flow_cache_global);

        fcp = this_cpu_ptr(fc->percpu);
        for (i = 0; i < flow_cache_hash_size(fc); i++) {
                hlist_for_each_entry_safe(fle, tmp,
                                          &fcp->hash_table[i], u.hlist) {
                        if (flow_entry_valid(fle, net))
                                continue;

                        deleted++;
                        hlist_del(&fle->u.hlist);
                        list_add_tail(&fle->u.gc_list, &gc_list);
                }
        }

        flow_cache_queue_garbage(fcp, deleted, &gc_list, net);

        if (atomic_dec_and_test(&info->cpuleft))
                complete(&info->completion);
}

/*
 * Return whether a cpu's flow cache is empty, i.e. whether it can be
 * skipped when flushing.  Conservatively, we assume the presence of
 * any entries means the core may require flushing, since the
 * flow_cache_ops.check() function may assume it's running on the same
 * core as the per-cpu cache component.
 */
static int flow_cache_percpu_empty(struct flow_cache *fc, int cpu)
{
        struct flow_cache_percpu *fcp;
        int i;

        fcp = per_cpu_ptr(fc->percpu, cpu);
        for (i = 0; i < flow_cache_hash_size(fc); i++)
                if (!hlist_empty(&fcp->hash_table[i]))
                        return 0;
        return 1;
}

static void flow_cache_flush_per_cpu(void *data)
{
        struct flow_flush_info *info = data;
        struct tasklet_struct *tasklet;

        tasklet = &this_cpu_ptr(info->cache->percpu)->flush_tasklet;
        tasklet->data = (unsigned long)info;
        tasklet_schedule(tasklet);
}

void flow_cache_flush(struct net *net)
{
        struct flow_flush_info info;
        cpumask_var_t mask;
        int i, self;

        /* Track which cpus need flushing to avoid disturbing all cores. */
        if (!alloc_cpumask_var(&mask, GFP_KERNEL))
                return;
        cpumask_clear(mask);

        /* Don't want cpus going down or up during this. */
        get_online_cpus();
        mutex_lock(&net->flow_flush_sem);
        info.cache = &net->flow_cache_global;
        for_each_online_cpu(i)
                if (!flow_cache_percpu_empty(info.cache, i))
                        cpumask_set_cpu(i, mask);
        atomic_set(&info.cpuleft, cpumask_weight(mask));
        if (atomic_read(&info.cpuleft) == 0)
                goto done;

        init_completion(&info.completion);

        local_bh_disable();
        self = cpumask_test_and_clear_cpu(smp_processor_id(), mask);
        on_each_cpu_mask(mask, flow_cache_flush_per_cpu, &info, 0);
        if (self)
                flow_cache_flush_tasklet((unsigned long)&info);
        local_bh_enable();

        wait_for_completion(&info.completion);

done:
        mutex_unlock(&net->flow_flush_sem);
        put_online_cpus();
        free_cpumask_var(mask);
}
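
/* The flush sequence above: take the per-netns flush mutex, build a
 * cpumask of the online cpus that actually hold entries, use
 * on_each_cpu_mask() to schedule each remote cpu's flush tasklet (the
 * local cpu, if it is in the mask, runs flow_cache_flush_tasklet()
 * directly), and finally wait on the completion until the last cpu has
 * dropped its stale entries.
 */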

static void flow_cache_flush_task(struct work_struct *work)
{
        struct net *net = container_of(work, struct net, flow_cache_flush_work);

        flow_cache_flush(net);
}

void flow_cache_flush_deferred(struct net *net)
{
        schedule_work(&net->flow_cache_flush_work);
}

static int flow_cache_cpu_prepare(struct flow_cache *fc, int cpu)
{
        struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
        size_t sz = sizeof(struct hlist_head) * flow_cache_hash_size(fc);

        if (!fcp->hash_table) {
                fcp->hash_table = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu));
                if (!fcp->hash_table) {
                        pr_err("NET: failed to allocate flow cache sz %zu\n", sz);
                        return -ENOMEM;
                }
                fcp->hash_rnd_recalc = 1;
                fcp->hash_count = 0;
                tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0);
        }
        return 0;
}

static int flow_cache_cpu(struct notifier_block *nfb,
                          unsigned long action,
                          void *hcpu)
{
        struct flow_cache *fc = container_of(nfb, struct flow_cache,
                                                hotcpu_notifier);
        int res, cpu = (unsigned long) hcpu;
        struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);

        switch (action) {
        case CPU_UP_PREPARE:
        case CPU_UP_PREPARE_FROZEN:
                res = flow_cache_cpu_prepare(fc, cpu);
                if (res)
                        return notifier_from_errno(res);
                break;
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
                __flow_cache_shrink(fc, fcp, 0);
                break;
        }
        return NOTIFY_OK;
}

int flow_cache_init(struct net *net)
{
        int i;
        struct flow_cache *fc = &net->flow_cache_global;

        if (!flow_cachep)
                flow_cachep = kmem_cache_create("flow_cache",
                                                sizeof(struct flow_cache_entry),
                                                0, SLAB_PANIC, NULL);
        spin_lock_init(&net->flow_cache_gc_lock);
        INIT_LIST_HEAD(&net->flow_cache_gc_list);
        INIT_WORK(&net->flow_cache_gc_work, flow_cache_gc_task);
        INIT_WORK(&net->flow_cache_flush_work, flow_cache_flush_task);
        mutex_init(&net->flow_flush_sem);

        fc->hash_shift = 10;
        fc->low_watermark = 2 * flow_cache_hash_size(fc);
        fc->high_watermark = 4 * flow_cache_hash_size(fc);

        fc->percpu = alloc_percpu(struct flow_cache_percpu);
        if (!fc->percpu)
                return -ENOMEM;

        cpu_notifier_register_begin();

        for_each_online_cpu(i) {
                if (flow_cache_cpu_prepare(fc, i))
                        goto err;
        }
        fc->hotcpu_notifier = (struct notifier_block){
                .notifier_call = flow_cache_cpu,
        };
        __register_hotcpu_notifier(&fc->hotcpu_notifier);

        cpu_notifier_register_done();

        setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd,
                    (unsigned long) fc);
        fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
        add_timer(&fc->rnd_timer);

        return 0;

err:
        for_each_possible_cpu(i) {
                struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i);
                kfree(fcp->hash_table);
                fcp->hash_table = NULL;
        }

        cpu_notifier_register_done();

        free_percpu(fc->percpu);
        fc->percpu = NULL;

        return -ENOMEM;
}
EXPORT_SYMBOL(flow_cache_init);
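
/* Pairing note: flow_cache_init() and flow_cache_fini() are expected to
 * be called once per network namespace by the subsystem that uses the
 * cache (in practice the xfrm/IPsec code), typically from its pernet
 * init/exit hooks, along the lines of the illustrative sketch below;
 * my_net_init/my_net_exit are placeholder names, not functions in this
 * tree.
 *
 *	static __net_init int my_net_init(struct net *net)
 *	{
 *		return flow_cache_init(net);
 *	}
 *
 *	static __net_exit void my_net_exit(struct net *net)
 *	{
 *		flow_cache_fini(net);
 *	}
 */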

void flow_cache_fini(struct net *net)
{
        int i;
        struct flow_cache *fc = &net->flow_cache_global;

        del_timer_sync(&fc->rnd_timer);
        unregister_hotcpu_notifier(&fc->hotcpu_notifier);

        for_each_possible_cpu(i) {
                struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i);
                kfree(fcp->hash_table);
                fcp->hash_table = NULL;
        }

        free_percpu(fc->percpu);
        fc->percpu = NULL;
}
EXPORT_SYMBOL(flow_cache_fini);