linux/net/core/flow.c
/* flow.c: Generic flow cache.
 *
 * Copyright (C) 2003 Alexey N. Kuznetsov (kuznet@ms2.inr.ac.ru)
 * Copyright (C) 2003 David S. Miller (davem@redhat.com)
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/list.h>
#include <linux/jhash.h>
#include <linux/interrupt.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/smp.h>
#include <linux/completion.h>
#include <linux/percpu.h>
#include <linux/bitops.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/mutex.h>
#include <net/flow.h>
#include <linux/atomic.h>
#include <linux/security.h>
#include <net/net_namespace.h>

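/* A single cached flow.  While live it sits on a per-cpu hash chain;
 * once evicted it is moved onto the per-namespace GC list (hence the
 * union).  'genid' snapshots the xfrm generation counter so stale
 * entries can be detected lazily at lookup time.
 */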
struct flow_cache_entry {
        union {
                struct hlist_node       hlist;
                struct list_head        gc_list;
        } u;
        struct net                      *net;
        u16                             family;
        u8                              dir;
        u32                             genid;
        struct flowi                    key;
        struct flow_cache_object        *object;
};

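/* Book-keeping for one cache flush: a completion shared by all CPUs
 * that still hold entries, counted down as each CPU's flush finishes.
 */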
struct flow_flush_info {
        struct flow_cache               *cache;
        atomic_t                        cpuleft;
        struct completion               completion;
};

static struct kmem_cache *flow_cachep __read_mostly;

#define flow_cache_hash_size(cache)     (1 << (cache)->hash_shift)
#define FLOW_HASH_RND_PERIOD            (10 * 60 * HZ)

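/* Timer callback: ask every CPU to pick a fresh hash seed on its next
 * lookup, so the jhash seed does not stay predictable, then re-arm the
 * timer for another FLOW_HASH_RND_PERIOD.
 */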
static void flow_cache_new_hashrnd(unsigned long arg)
{
        struct flow_cache *fc = (void *) arg;
        int i;

        for_each_possible_cpu(i)
                per_cpu_ptr(fc->percpu, i)->hash_rnd_recalc = 1;

        fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
        add_timer(&fc->rnd_timer);
}

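/* An entry is valid only if its generation matches the current xfrm
 * generation and its cached object (if any) still passes the owner's
 * ->check() hook.
 */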
static int flow_entry_valid(struct flow_cache_entry *fle,
                                struct netns_xfrm *xfrm)
{
        if (atomic_read(&xfrm->flow_cache_genid) != fle->genid)
                return 0;
        if (fle->object && !fle->object->ops->check(fle->object))
                return 0;
        return 1;
}

static void flow_entry_kill(struct flow_cache_entry *fle,
                                struct netns_xfrm *xfrm)
{
        if (fle->object)
                fle->object->ops->delete(fle->object);
        kmem_cache_free(flow_cachep, fle);
}

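/* Deferred reclaim: splice the per-namespace GC list onto a private
 * list under the lock, then kill the entries outside of it.
 */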
static void flow_cache_gc_task(struct work_struct *work)
{
        struct list_head gc_list;
        struct flow_cache_entry *fce, *n;
        struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm,
                                                flow_cache_gc_work);

        INIT_LIST_HEAD(&gc_list);
        spin_lock_bh(&xfrm->flow_cache_gc_lock);
        list_splice_tail_init(&xfrm->flow_cache_gc_list, &gc_list);
        spin_unlock_bh(&xfrm->flow_cache_gc_lock);

        list_for_each_entry_safe(fce, n, &gc_list, u.gc_list) {
                flow_entry_kill(fce, xfrm);
                atomic_dec(&xfrm->flow_cache_gc_count);
                WARN_ON(atomic_read(&xfrm->flow_cache_gc_count) < 0);
        }
}

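/* Hand a batch of evicted entries to the GC work item and update the
 * per-cpu hash count; a no-op when nothing was deleted.
 */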
static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp,
                                     int deleted, struct list_head *gc_list,
                                     struct netns_xfrm *xfrm)
{
        if (deleted) {
                atomic_add(deleted, &xfrm->flow_cache_gc_count);
                fcp->hash_count -= deleted;
                spin_lock_bh(&xfrm->flow_cache_gc_lock);
                list_splice_tail(gc_list, &xfrm->flow_cache_gc_list);
                spin_unlock_bh(&xfrm->flow_cache_gc_lock);
                schedule_work(&xfrm->flow_cache_gc_work);
        }
}

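/* Walk every hash chain of this CPU's table, keep at most 'shrink_to'
 * still-valid entries per chain and queue the rest for GC.  Called
 * with shrink_to == 0 to empty the table completely (hash reseed,
 * CPU_DEAD).
 */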
static void __flow_cache_shrink(struct flow_cache *fc,
                                struct flow_cache_percpu *fcp,
                                int shrink_to)
{
        struct flow_cache_entry *fle;
        struct hlist_node *tmp;
        LIST_HEAD(gc_list);
        int i, deleted = 0;
        struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm,
                                                flow_cache_global);

        for (i = 0; i < flow_cache_hash_size(fc); i++) {
                int saved = 0;

                hlist_for_each_entry_safe(fle, tmp,
                                          &fcp->hash_table[i], u.hlist) {
                        if (saved < shrink_to &&
                            flow_entry_valid(fle, xfrm)) {
                                saved++;
                        } else {
                                deleted++;
                                hlist_del(&fle->u.hlist);
                                list_add_tail(&fle->u.gc_list, &gc_list);
                        }
                }
        }

        flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm);
}

static void flow_cache_shrink(struct flow_cache *fc,
                              struct flow_cache_percpu *fcp)
{
        int shrink_to = fc->low_watermark / flow_cache_hash_size(fc);

        __flow_cache_shrink(fc, fcp, shrink_to);
}

static void flow_new_hash_rnd(struct flow_cache *fc,
                              struct flow_cache_percpu *fcp)
{
        get_random_bytes(&fcp->hash_rnd, sizeof(u32));
        fcp->hash_rnd_recalc = 0;
        __flow_cache_shrink(fc, fcp, 0);
}

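/* Hash the key (treated as an array of u32 words) with this CPU's
 * random seed and mask the result down to a table index.
 */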
static u32 flow_hash_code(struct flow_cache *fc,
                          struct flow_cache_percpu *fcp,
                          const struct flowi *key,
                          size_t keysize)
{
        const u32 *k = (const u32 *) key;
        const u32 length = keysize * sizeof(flow_compare_t) / sizeof(u32);

        return jhash2(k, length, fcp->hash_rnd)
                & (flow_cache_hash_size(fc) - 1);
}

/* I hear what you're saying, use memcmp.  But memcmp cannot make
 * important assumptions that we can here, such as alignment.
 */
static int flow_key_compare(const struct flowi *key1, const struct flowi *key2,
                            size_t keysize)
{
        const flow_compare_t *k1, *k1_lim, *k2;

        k1 = (const flow_compare_t *) key1;
        k1_lim = k1 + keysize;

        k2 = (const flow_compare_t *) key2;

        do {
                if (*k1++ != *k2++)
                        return 1;
        } while (k1 < k1_lim);

        return 0;
}

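/* Main lookup entry point.  BHs are disabled so this CPU's table stays
 * stable.  On a hit with a current generation, return the cached
 * object via its ->get() hook.  On a miss, a stale hit, or a ->get()
 * failure, fall back to the resolver, which builds or refreshes the
 * object; the result is cached in the entry unless resolution failed.
 * When the table or the GC backlog grows past the hard limits, the
 * genid is bumped (making existing entries stale) and -ENOBUFS is
 * returned.
 */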
struct flow_cache_object *
flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir,
                  flow_resolve_t resolver, void *ctx)
{
        struct flow_cache *fc = &net->xfrm.flow_cache_global;
        struct flow_cache_percpu *fcp;
        struct flow_cache_entry *fle, *tfle;
        struct flow_cache_object *flo;
        size_t keysize;
        unsigned int hash;

        local_bh_disable();
        fcp = this_cpu_ptr(fc->percpu);

        fle = NULL;
        flo = NULL;

        keysize = flow_key_size(family);
        if (!keysize)
                goto nocache;

        /* Packet really early in init?  Making flow_cache_init a
         * pre-smp initcall would solve this.  --RR */
        if (!fcp->hash_table)
                goto nocache;

        if (fcp->hash_rnd_recalc)
                flow_new_hash_rnd(fc, fcp);

        hash = flow_hash_code(fc, fcp, key, keysize);
        hlist_for_each_entry(tfle, &fcp->hash_table[hash], u.hlist) {
                if (tfle->net == net &&
                    tfle->family == family &&
                    tfle->dir == dir &&
                    flow_key_compare(key, &tfle->key, keysize) == 0) {
                        fle = tfle;
                        break;
                }
        }

        if (unlikely(!fle)) {
                if (fcp->hash_count > fc->high_watermark)
                        flow_cache_shrink(fc, fcp);

                if (fcp->hash_count > 2 * fc->high_watermark ||
                    atomic_read(&net->xfrm.flow_cache_gc_count) > fc->high_watermark) {
                        atomic_inc(&net->xfrm.flow_cache_genid);
                        flo = ERR_PTR(-ENOBUFS);
                        goto ret_object;
                }

                fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC);
                if (fle) {
                        fle->net = net;
                        fle->family = family;
                        fle->dir = dir;
                        memcpy(&fle->key, key, keysize * sizeof(flow_compare_t));
                        fle->object = NULL;
                        hlist_add_head(&fle->u.hlist, &fcp->hash_table[hash]);
                        fcp->hash_count++;
                }
        } else if (likely(fle->genid == atomic_read(&net->xfrm.flow_cache_genid))) {
                flo = fle->object;
                if (!flo)
                        goto ret_object;
                flo = flo->ops->get(flo);
                if (flo)
                        goto ret_object;
        } else if (fle->object) {
                flo = fle->object;
                flo->ops->delete(flo);
                fle->object = NULL;
        }

nocache:
        flo = NULL;
        if (fle) {
                flo = fle->object;
                fle->object = NULL;
        }
        flo = resolver(net, key, family, dir, flo, ctx);
        if (fle) {
                fle->genid = atomic_read(&net->xfrm.flow_cache_genid);
                if (!IS_ERR(flo))
                        fle->object = flo;
                else
                        fle->genid--;
        } else {
                if (!IS_ERR_OR_NULL(flo))
                        flo->ops->delete(flo);
        }
ret_object:
        local_bh_enable();
        return flo;
}
EXPORT_SYMBOL(flow_cache_lookup);

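/* Per-cpu half of a flush: drop every entry in this CPU's table that
 * no longer passes flow_entry_valid(), then signal completion once the
 * last participating CPU has finished.
 */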
static void flow_cache_flush_tasklet(unsigned long data)
{
        struct flow_flush_info *info = (void *)data;
        struct flow_cache *fc = info->cache;
        struct flow_cache_percpu *fcp;
        struct flow_cache_entry *fle;
        struct hlist_node *tmp;
        LIST_HEAD(gc_list);
        int i, deleted = 0;
        struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm,
                                                flow_cache_global);

        fcp = this_cpu_ptr(fc->percpu);
        for (i = 0; i < flow_cache_hash_size(fc); i++) {
                hlist_for_each_entry_safe(fle, tmp,
                                          &fcp->hash_table[i], u.hlist) {
                        if (flow_entry_valid(fle, xfrm))
                                continue;

                        deleted++;
                        hlist_del(&fle->u.hlist);
                        list_add_tail(&fle->u.gc_list, &gc_list);
                }
        }

        flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm);

        if (atomic_dec_and_test(&info->cpuleft))
                complete(&info->completion);
}

/*
 * Return whether a cpu needs flushing.  Conservatively, we assume
 * the presence of any entries means the core may require flushing,
 * since the flow_cache_ops.check() function may assume it's running
 * on the same core as the per-cpu cache component.
 */
static int flow_cache_percpu_empty(struct flow_cache *fc, int cpu)
{
        struct flow_cache_percpu *fcp;
        int i;

        fcp = per_cpu_ptr(fc->percpu, cpu);
        for (i = 0; i < flow_cache_hash_size(fc); i++)
                if (!hlist_empty(&fcp->hash_table[i]))
                        return 0;
        return 1;
}

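/* Cross-call callback: schedule this CPU's flush tasklet so the actual
 * table walk runs in softirq context on the right CPU.
 */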
static void flow_cache_flush_per_cpu(void *data)
{
        struct flow_flush_info *info = data;
        struct tasklet_struct *tasklet;

        tasklet = &this_cpu_ptr(info->cache->percpu)->flush_tasklet;
        tasklet->data = (unsigned long)info;
        tasklet_schedule(tasklet);
}

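/* Synchronously flush the flow cache for a namespace: find the CPUs
 * that actually hold entries, kick their flush tasklets (running the
 * local one inline), and wait until they have all completed.  CPU
 * hotplug is held off for the duration.
 */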
void flow_cache_flush(struct net *net)
{
        struct flow_flush_info info;
        cpumask_var_t mask;
        int i, self;

        /* Track which cpus need flushing to avoid disturbing all cores. */
        if (!alloc_cpumask_var(&mask, GFP_KERNEL))
                return;
        cpumask_clear(mask);

        /* Don't want cpus going down or up during this. */
        get_online_cpus();
        mutex_lock(&net->xfrm.flow_flush_sem);
        info.cache = &net->xfrm.flow_cache_global;
        for_each_online_cpu(i)
                if (!flow_cache_percpu_empty(info.cache, i))
                        cpumask_set_cpu(i, mask);
        atomic_set(&info.cpuleft, cpumask_weight(mask));
        if (atomic_read(&info.cpuleft) == 0)
                goto done;

        init_completion(&info.completion);

        local_bh_disable();
        self = cpumask_test_and_clear_cpu(smp_processor_id(), mask);
        on_each_cpu_mask(mask, flow_cache_flush_per_cpu, &info, 0);
        if (self)
                flow_cache_flush_tasklet((unsigned long)&info);
        local_bh_enable();

        wait_for_completion(&info.completion);

done:
        mutex_unlock(&net->xfrm.flow_flush_sem);
        put_online_cpus();
        free_cpumask_var(mask);
}

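/* Asynchronous flush: flow_cache_flush_deferred() queues the flush to
 * a workqueue for callers that cannot block, and the work item simply
 * calls flow_cache_flush() from process context.
 */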
static void flow_cache_flush_task(struct work_struct *work)
{
        struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm,
                                                flow_cache_flush_work);
        struct net *net = container_of(xfrm, struct net, xfrm);

        flow_cache_flush(net);
}

void flow_cache_flush_deferred(struct net *net)
{
        schedule_work(&net->xfrm.flow_cache_flush_work);
}

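/* Allocate and initialise one CPU's hash table (idempotent: a table
 * that already exists is left untouched).  Used both at init time and
 * from the CPU hotplug notifier.
 */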
static int flow_cache_cpu_prepare(struct flow_cache *fc, int cpu)
{
        struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
        size_t sz = sizeof(struct hlist_head) * flow_cache_hash_size(fc);

        if (!fcp->hash_table) {
                fcp->hash_table = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu));
                if (!fcp->hash_table) {
                        pr_err("NET: failed to allocate flow cache sz %zu\n", sz);
                        return -ENOMEM;
                }
                fcp->hash_rnd_recalc = 1;
                fcp->hash_count = 0;
                tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0);
        }
        return 0;
}

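/* CPU hotplug notifier: allocate the per-cpu table before a CPU comes
 * up, and empty it once a CPU is dead.
 */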
static int flow_cache_cpu(struct notifier_block *nfb,
                          unsigned long action,
                          void *hcpu)
{
        struct flow_cache *fc = container_of(nfb, struct flow_cache,
                                                hotcpu_notifier);
        int res, cpu = (unsigned long) hcpu;
        struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);

        switch (action) {
        case CPU_UP_PREPARE:
        case CPU_UP_PREPARE_FROZEN:
                res = flow_cache_cpu_prepare(fc, cpu);
                if (res)
                        return notifier_from_errno(res);
                break;
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
                __flow_cache_shrink(fc, fcp, 0);
                break;
        }
        return NOTIFY_OK;
}

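/* Per-namespace initialisation: set up the GC and flush machinery,
 * size the cache (1024 buckets per CPU, low/high watermarks at 2x/4x
 * the bucket count), allocate per-cpu state, register the hotplug
 * notifier and start the periodic hash-reseed timer.
 */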
int flow_cache_init(struct net *net)
{
        int i;
        struct flow_cache *fc = &net->xfrm.flow_cache_global;

        if (!flow_cachep)
                flow_cachep = kmem_cache_create("flow_cache",
                                                sizeof(struct flow_cache_entry),
                                                0, SLAB_PANIC, NULL);
        spin_lock_init(&net->xfrm.flow_cache_gc_lock);
        INIT_LIST_HEAD(&net->xfrm.flow_cache_gc_list);
        INIT_WORK(&net->xfrm.flow_cache_gc_work, flow_cache_gc_task);
        INIT_WORK(&net->xfrm.flow_cache_flush_work, flow_cache_flush_task);
        mutex_init(&net->xfrm.flow_flush_sem);
        atomic_set(&net->xfrm.flow_cache_gc_count, 0);

        fc->hash_shift = 10;
        fc->low_watermark = 2 * flow_cache_hash_size(fc);
        fc->high_watermark = 4 * flow_cache_hash_size(fc);

        fc->percpu = alloc_percpu(struct flow_cache_percpu);
        if (!fc->percpu)
                return -ENOMEM;

        cpu_notifier_register_begin();

        for_each_online_cpu(i) {
                if (flow_cache_cpu_prepare(fc, i))
                        goto err;
        }
        fc->hotcpu_notifier = (struct notifier_block){
                .notifier_call = flow_cache_cpu,
        };
        __register_hotcpu_notifier(&fc->hotcpu_notifier);

        cpu_notifier_register_done();

        setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd,
                    (unsigned long) fc);
        fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
        add_timer(&fc->rnd_timer);

        return 0;

err:
        for_each_possible_cpu(i) {
                struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i);
                kfree(fcp->hash_table);
                fcp->hash_table = NULL;
        }

        cpu_notifier_register_done();

        free_percpu(fc->percpu);
        fc->percpu = NULL;

        return -ENOMEM;
}
EXPORT_SYMBOL(flow_cache_init);

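/* Per-namespace teardown: stop the reseed timer, drop the hotplug
 * notifier and free every CPU's hash table along with the per-cpu
 * state.
 */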
void flow_cache_fini(struct net *net)
{
        int i;
        struct flow_cache *fc = &net->xfrm.flow_cache_global;

        del_timer_sync(&fc->rnd_timer);
        unregister_hotcpu_notifier(&fc->hotcpu_notifier);

        for_each_possible_cpu(i) {
                struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i);
                kfree(fcp->hash_table);
                fcp->hash_table = NULL;
        }

        free_percpu(fc->percpu);
        fc->percpu = NULL;
}
EXPORT_SYMBOL(flow_cache_fini);