/* flow.c: Generic flow cache.
 *
 * Copyright (C) 2003 Alexey N. Kuznetsov (kuznet@ms2.inr.ac.ru)
 * Copyright (C) 2003 David S. Miller (davem@redhat.com)
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/list.h>
#include <linux/jhash.h>
#include <linux/interrupt.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/smp.h>
#include <linux/completion.h>
#include <linux/percpu.h>
#include <linux/bitops.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/mutex.h>
#include <net/flow.h>
#include <linux/atomic.h>
#include <linux/security.h>
#include <net/net_namespace.h>

struct flow_cache_entry {
        union {
                struct hlist_node       hlist;
                struct list_head        gc_list;
        } u;
        struct net                      *net;
        u16                             family;
        u8                              dir;
        u32                             genid;
        struct flowi                    key;
        struct flow_cache_object        *object;
};

struct flow_flush_info {
        struct flow_cache               *cache;
        atomic_t                        cpuleft;
        struct completion               completion;
};

static struct kmem_cache *flow_cachep __read_mostly;

#define flow_cache_hash_size(cache)     (1 << (cache)->hash_shift)
#define FLOW_HASH_RND_PERIOD            (10 * 60 * HZ)

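/* Timer callback: periodically ask every CPU to pick a new hash seed.
 * The seed itself is regenerated lazily, per CPU, the next time a lookup
 * runs there (see flow_new_hash_rnd()).
 */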
static void flow_cache_new_hashrnd(unsigned long arg)
{
        struct flow_cache *fc = (void *) arg;
        int i;

        for_each_possible_cpu(i)
                per_cpu_ptr(fc->percpu, i)->hash_rnd_recalc = 1;

        fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
        add_timer(&fc->rnd_timer);
}

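/* An entry is stale once the global flow_cache_genid has moved past the
 * genid it was cached with, or once its attached object fails ->check().
 * flow_entry_kill() releases a stale entry and its object.
 */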
static int flow_entry_valid(struct flow_cache_entry *fle,
                                struct netns_xfrm *xfrm)
{
        if (atomic_read(&xfrm->flow_cache_genid) != fle->genid)
                return 0;
        if (fle->object && !fle->object->ops->check(fle->object))
                return 0;
        return 1;
}

static void flow_entry_kill(struct flow_cache_entry *fle,
                                struct netns_xfrm *xfrm)
{
        if (fle->object)
                fle->object->ops->delete(fle->object);
        kmem_cache_free(flow_cachep, fle);
}

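/* Deferred free of evicted entries: splice the pending list under the
 * GC lock, then release each entry (and its object) outside of it.
 */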
static void flow_cache_gc_task(struct work_struct *work)
{
        struct list_head gc_list;
        struct flow_cache_entry *fce, *n;
        struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm,
                                                flow_cache_gc_work);

        INIT_LIST_HEAD(&gc_list);
        spin_lock_bh(&xfrm->flow_cache_gc_lock);
        list_splice_tail_init(&xfrm->flow_cache_gc_list, &gc_list);
        spin_unlock_bh(&xfrm->flow_cache_gc_lock);

        list_for_each_entry_safe(fce, n, &gc_list, u.gc_list)
                flow_entry_kill(fce, xfrm);
}

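/* Hand a batch of unlinked entries over to the GC work item and adjust
 * this CPU's entry count accordingly.
 */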
static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp,
                                     int deleted, struct list_head *gc_list,
                                     struct netns_xfrm *xfrm)
{
        if (deleted) {
                fcp->hash_count -= deleted;
                spin_lock_bh(&xfrm->flow_cache_gc_lock);
                list_splice_tail(gc_list, &xfrm->flow_cache_gc_list);
                spin_unlock_bh(&xfrm->flow_cache_gc_lock);
                schedule_work(&xfrm->flow_cache_gc_work);
        }
}

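/* Walk every hash chain of this CPU's table, keeping at most shrink_to
 * still-valid entries per chain and queueing the rest for garbage
 * collection.  shrink_to == 0 therefore empties the per-cpu cache.
 */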
static void __flow_cache_shrink(struct flow_cache *fc,
                                struct flow_cache_percpu *fcp,
                                int shrink_to)
{
        struct flow_cache_entry *fle;
        struct hlist_node *tmp;
        LIST_HEAD(gc_list);
        int i, deleted = 0;
        struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm,
                                                flow_cache_global);

        for (i = 0; i < flow_cache_hash_size(fc); i++) {
                int saved = 0;

                hlist_for_each_entry_safe(fle, tmp,
                                          &fcp->hash_table[i], u.hlist) {
                        if (saved < shrink_to &&
                            flow_entry_valid(fle, xfrm)) {
                                saved++;
                        } else {
                                deleted++;
                                hlist_del(&fle->u.hlist);
                                list_add_tail(&fle->u.gc_list, &gc_list);
                        }
                }
        }

        flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm);
}

static void flow_cache_shrink(struct flow_cache *fc,
                              struct flow_cache_percpu *fcp)
{
        int shrink_to = fc->low_watermark / flow_cache_hash_size(fc);

        __flow_cache_shrink(fc, fcp, shrink_to);
}

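/* Pick a fresh per-cpu hash seed and drop all existing entries, which
 * were hashed with the old seed.
 */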
static void flow_new_hash_rnd(struct flow_cache *fc,
                              struct flow_cache_percpu *fcp)
{
        get_random_bytes(&fcp->hash_rnd, sizeof(u32));
        fcp->hash_rnd_recalc = 0;
        __flow_cache_shrink(fc, fcp, 0);
}

static u32 flow_hash_code(struct flow_cache *fc,
                          struct flow_cache_percpu *fcp,
                          const struct flowi *key,
                          size_t keysize)
{
        const u32 *k = (const u32 *) key;
        const u32 length = keysize * sizeof(flow_compare_t) / sizeof(u32);

        return jhash2(k, length, fcp->hash_rnd)
                & (flow_cache_hash_size(fc) - 1);
}

/* I hear what you're saying, use memcmp.  But memcmp cannot make
 * important assumptions that we can here, such as alignment.
 */
static int flow_key_compare(const struct flowi *key1, const struct flowi *key2,
                            size_t keysize)
{
        const flow_compare_t *k1, *k1_lim, *k2;

        k1 = (const flow_compare_t *) key1;
        k1_lim = k1 + keysize;

        k2 = (const flow_compare_t *) key2;

        do {
                if (*k1++ != *k2++)
                        return 1;
        } while (k1 < k1_lim);

        return 0;
}

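/* Look up @key in this CPU's hash table (BHs disabled throughout).  On a
 * hit with a current genid, return the cached object via its ->get() hook;
 * otherwise fall through to @resolver, which (re)resolves the flow, and
 * cache its result.  On a miss a new entry is allocated, shrinking the
 * table first if it has grown past the high watermark.
 *
 * Illustrative caller sketch (not taken from this file); an xfrm-style
 * user would do roughly:
 *
 *	flo = flow_cache_lookup(net, &fl, family, dir, my_resolver, ctx);
 *	if (IS_ERR(flo))
 *		return PTR_ERR(flo);
 *
 * where the hypothetical my_resolver() builds a new flow_cache_object on
 * a cache miss.
 */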
struct flow_cache_object *
flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir,
                  flow_resolve_t resolver, void *ctx)
{
        struct flow_cache *fc = &net->xfrm.flow_cache_global;
        struct flow_cache_percpu *fcp;
        struct flow_cache_entry *fle, *tfle;
        struct flow_cache_object *flo;
        size_t keysize;
        unsigned int hash;

        local_bh_disable();
        fcp = this_cpu_ptr(fc->percpu);

        fle = NULL;
        flo = NULL;

        keysize = flow_key_size(family);
        if (!keysize)
                goto nocache;

        /* Packet really early in init?  Making flow_cache_init a
         * pre-smp initcall would solve this.  --RR */
        if (!fcp->hash_table)
                goto nocache;

        if (fcp->hash_rnd_recalc)
                flow_new_hash_rnd(fc, fcp);

        hash = flow_hash_code(fc, fcp, key, keysize);
        hlist_for_each_entry(tfle, &fcp->hash_table[hash], u.hlist) {
                if (tfle->net == net &&
                    tfle->family == family &&
                    tfle->dir == dir &&
                    flow_key_compare(key, &tfle->key, keysize) == 0) {
                        fle = tfle;
                        break;
                }
        }

        if (unlikely(!fle)) {
                if (fcp->hash_count > fc->high_watermark)
                        flow_cache_shrink(fc, fcp);

                fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC);
                if (fle) {
                        fle->net = net;
                        fle->family = family;
                        fle->dir = dir;
                        memcpy(&fle->key, key, keysize * sizeof(flow_compare_t));
                        fle->object = NULL;
                        hlist_add_head(&fle->u.hlist, &fcp->hash_table[hash]);
                        fcp->hash_count++;
                }
        } else if (likely(fle->genid == atomic_read(&net->xfrm.flow_cache_genid))) {
                flo = fle->object;
                if (!flo)
                        goto ret_object;
                flo = flo->ops->get(flo);
                if (flo)
                        goto ret_object;
        } else if (fle->object) {
                flo = fle->object;
                flo->ops->delete(flo);
                fle->object = NULL;
        }

nocache:
        flo = NULL;
        if (fle) {
                flo = fle->object;
                fle->object = NULL;
        }
        flo = resolver(net, key, family, dir, flo, ctx);
        if (fle) {
                fle->genid = atomic_read(&net->xfrm.flow_cache_genid);
                if (!IS_ERR(flo))
                        fle->object = flo;
                else
                        fle->genid--;
        } else {
                if (!IS_ERR_OR_NULL(flo))
                        flo->ops->delete(flo);
        }
ret_object:
        local_bh_enable();
        return flo;
}
EXPORT_SYMBOL(flow_cache_lookup);

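/* Per-cpu part of a flush: runs in tasklet context on each CPU that holds
 * entries, evicts everything that is no longer valid, and signals the
 * waiter once the last CPU has finished.
 */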
static void flow_cache_flush_tasklet(unsigned long data)
{
        struct flow_flush_info *info = (void *)data;
        struct flow_cache *fc = info->cache;
        struct flow_cache_percpu *fcp;
        struct flow_cache_entry *fle;
        struct hlist_node *tmp;
        LIST_HEAD(gc_list);
        int i, deleted = 0;
        struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm,
                                                flow_cache_global);

        fcp = this_cpu_ptr(fc->percpu);
        for (i = 0; i < flow_cache_hash_size(fc); i++) {
                hlist_for_each_entry_safe(fle, tmp,
                                          &fcp->hash_table[i], u.hlist) {
                        if (flow_entry_valid(fle, xfrm))
                                continue;

                        deleted++;
                        hlist_del(&fle->u.hlist);
                        list_add_tail(&fle->u.gc_list, &gc_list);
                }
        }

        flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm);

        if (atomic_dec_and_test(&info->cpuleft))
                complete(&info->completion);
}

/*
 * Return whether a cpu needs flushing.  Conservatively, we assume
 * the presence of any entries means the core may require flushing,
 * since the flow_cache_ops.check() function may assume it's running
 * on the same core as the per-cpu cache component.
 */
static int flow_cache_percpu_empty(struct flow_cache *fc, int cpu)
{
        struct flow_cache_percpu *fcp;
        int i;

        fcp = per_cpu_ptr(fc->percpu, cpu);
        for (i = 0; i < flow_cache_hash_size(fc); i++)
                if (!hlist_empty(&fcp->hash_table[i]))
                        return 0;
        return 1;
}

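/* Cross-call callback: point this CPU's flush tasklet at the flush_info
 * and schedule it.
 */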
static void flow_cache_flush_per_cpu(void *data)
{
        struct flow_flush_info *info = data;
        struct tasklet_struct *tasklet;

        tasklet = &this_cpu_ptr(info->cache->percpu)->flush_tasklet;
        tasklet->data = (unsigned long)info;
        tasklet_schedule(tasklet);
}

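/* Synchronously flush stale entries on every CPU that currently holds any.
 * Runs under the per-netns flush mutex with CPU hotplug excluded; the
 * local CPU is flushed directly, the others via their flush tasklets.
 */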
void flow_cache_flush(struct net *net)
{
        struct flow_flush_info info;
        cpumask_var_t mask;
        int i, self;

        /* Track which cpus need flushing to avoid disturbing all cores. */
        if (!alloc_cpumask_var(&mask, GFP_KERNEL))
                return;
        cpumask_clear(mask);

        /* Don't want cpus going down or up during this. */
        get_online_cpus();
        mutex_lock(&net->xfrm.flow_flush_sem);
        info.cache = &net->xfrm.flow_cache_global;
        for_each_online_cpu(i)
                if (!flow_cache_percpu_empty(info.cache, i))
                        cpumask_set_cpu(i, mask);
        atomic_set(&info.cpuleft, cpumask_weight(mask));
        if (atomic_read(&info.cpuleft) == 0)
                goto done;

        init_completion(&info.completion);

        local_bh_disable();
        self = cpumask_test_and_clear_cpu(smp_processor_id(), mask);
        on_each_cpu_mask(mask, flow_cache_flush_per_cpu, &info, 0);
        if (self)
                flow_cache_flush_tasklet((unsigned long)&info);
        local_bh_enable();

        wait_for_completion(&info.completion);

done:
        mutex_unlock(&net->xfrm.flow_flush_sem);
        put_online_cpus();
        free_cpumask_var(mask);
}

static void flow_cache_flush_task(struct work_struct *work)
{
        struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm,
                                                flow_cache_flush_work);
        struct net *net = container_of(xfrm, struct net, xfrm);

        flow_cache_flush(net);
}

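/* Defer a full flush to process context via the per-netns flush work;
 * flow_cache_flush() itself can sleep.
 */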
void flow_cache_flush_deferred(struct net *net)
{
        schedule_work(&net->xfrm.flow_cache_flush_work);
}

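/* Allocate and initialise a CPU's hash table on first use (at init time
 * or when the CPU is brought online).
 */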
static int flow_cache_cpu_prepare(struct flow_cache *fc, int cpu)
{
        struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
        size_t sz = sizeof(struct hlist_head) * flow_cache_hash_size(fc);

        if (!fcp->hash_table) {
                fcp->hash_table = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu));
                if (!fcp->hash_table) {
                        pr_err("NET: failed to allocate flow cache sz %zu\n", sz);
                        return -ENOMEM;
                }
                fcp->hash_rnd_recalc = 1;
                fcp->hash_count = 0;
                tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0);
        }
        return 0;
}

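/* CPU hotplug notifier: allocate the per-cpu table before a CPU comes up,
 * and drop all of its entries once it is dead.
 */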
static int flow_cache_cpu(struct notifier_block *nfb,
                          unsigned long action,
                          void *hcpu)
{
        struct flow_cache *fc = container_of(nfb, struct flow_cache,
                                                hotcpu_notifier);
        int res, cpu = (unsigned long) hcpu;
        struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);

        switch (action) {
        case CPU_UP_PREPARE:
        case CPU_UP_PREPARE_FROZEN:
                res = flow_cache_cpu_prepare(fc, cpu);
                if (res)
                        return notifier_from_errno(res);
                break;
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
                __flow_cache_shrink(fc, fcp, 0);
                break;
        }
        return NOTIFY_OK;
}

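/* Per-namespace setup: create the shared entry slab on first use, set up
 * the GC and flush machinery, size the watermarks from the hash table
 * size, prepare each online CPU and register for hotplug events, then
 * start the periodic hash-seed rotation timer.
 */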
int flow_cache_init(struct net *net)
{
        int i;
        struct flow_cache *fc = &net->xfrm.flow_cache_global;

        if (!flow_cachep)
                flow_cachep = kmem_cache_create("flow_cache",
                                                sizeof(struct flow_cache_entry),
                                                0, SLAB_PANIC, NULL);
        spin_lock_init(&net->xfrm.flow_cache_gc_lock);
        INIT_LIST_HEAD(&net->xfrm.flow_cache_gc_list);
        INIT_WORK(&net->xfrm.flow_cache_gc_work, flow_cache_gc_task);
        INIT_WORK(&net->xfrm.flow_cache_flush_work, flow_cache_flush_task);
        mutex_init(&net->xfrm.flow_flush_sem);

        fc->hash_shift = 10;
        fc->low_watermark = 2 * flow_cache_hash_size(fc);
        fc->high_watermark = 4 * flow_cache_hash_size(fc);

        fc->percpu = alloc_percpu(struct flow_cache_percpu);
        if (!fc->percpu)
                return -ENOMEM;

        cpu_notifier_register_begin();

        for_each_online_cpu(i) {
                if (flow_cache_cpu_prepare(fc, i))
                        goto err;
        }
        fc->hotcpu_notifier = (struct notifier_block){
                .notifier_call = flow_cache_cpu,
        };
        __register_hotcpu_notifier(&fc->hotcpu_notifier);

        cpu_notifier_register_done();

        setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd,
                    (unsigned long) fc);
        fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
        add_timer(&fc->rnd_timer);

        return 0;

err:
        for_each_possible_cpu(i) {
                struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i);
                kfree(fcp->hash_table);
                fcp->hash_table = NULL;
        }

        cpu_notifier_register_done();

        free_percpu(fc->percpu);
        fc->percpu = NULL;

        return -ENOMEM;
}
EXPORT_SYMBOL(flow_cache_init);

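/* Per-namespace teardown: stop the seed-rotation timer, unregister the
 * hotplug notifier and release all per-cpu state.
 */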
void flow_cache_fini(struct net *net)
{
        int i;
        struct flow_cache *fc = &net->xfrm.flow_cache_global;

        del_timer_sync(&fc->rnd_timer);
        unregister_hotcpu_notifier(&fc->hotcpu_notifier);

        for_each_possible_cpu(i) {
                struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i);
                kfree(fcp->hash_table);
                fcp->hash_table = NULL;
        }

        free_percpu(fc->percpu);
        fc->percpu = NULL;
}
EXPORT_SYMBOL(flow_cache_fini);