/*
 * net/core/dst.c       Protocol independent destination cache.
 *
 * Authors:             Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 */

#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/string.h>
#include <linux/types.h>
#include <net/net_namespace.h>
#include <linux/sched.h>
#include <linux/prefetch.h>
#include <net/lwtunnel.h>
#include <net/xfrm.h>

#include <net/dst.h>
#include <net/dst_metadata.h>

/*
 * Theory of operations:
 * 1) dst entries are reference counted via __refcnt.  dst_release()
 *    defers the actual free to an RCU callback, so readers that found
 *    an entry under rcu_read_lock() can keep using it safely.
 * 2) dst_release_immediate() frees without waiting for a grace period
 *    and must only be used on entries that were never made visible to
 *    RCU readers.
 * 3) On device unregistration, dst_dev_put() reparents live entries to
 *    the loopback device and turns them into packet sinks.
 */
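
/* A minimal lifecycle sketch (dst_hold() is defined in
 * include/net/dst.h; error handling omitted):
 *
 *      dst = dst_alloc(ops, dev, 1, DST_OBSOLETE_NONE, 0);
 *      ...
 *      dst_hold(dst);          -- extra reference for a new user
 *      ...
 *      dst_release(dst);       -- drop it; freed via RCU at zero
 */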

/*
 * dst_discard_out() is installed as the ->output handler of dead or
 * not fully set up dst entries: it silently drops the packet.
 */
int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        kfree_skb(skb);
        return 0;
}
EXPORT_SYMBOL(dst_discard_out);

const struct dst_metrics dst_default_metrics = {
        /* This initializer is needed to force the linker to place this
         * variable into the const section.  Otherwise it might end up in
         * the bss section.  We really want to avoid false sharing on this
         * variable, and to catch any writes to it.
         */
        .refcnt = REFCOUNT_INIT(1),
};

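/*
 * dst_init - set up a pre-allocated dst_entry
 * @initial_ref:      starting value for dst->__refcnt
 * @initial_obsolete: one of the DST_OBSOLETE_* values
 *
 * Holds a reference on @dev (if any), points input/output at the
 * discard handlers until the owner overrides them, and accounts the
 * entry against @ops unless DST_NOCOUNT is set.
 */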
void dst_init(struct dst_entry *dst, struct dst_ops *ops,
              struct net_device *dev, int initial_ref, int initial_obsolete,
              unsigned short flags)
{
        dst->dev = dev;
        if (dev)
                dev_hold(dev);
        dst->ops = ops;
        dst_init_metrics(dst, dst_default_metrics.metrics, true);
        dst->expires = 0UL;
#ifdef CONFIG_XFRM
        dst->xfrm = NULL;
#endif
        dst->input = dst_discard;
        dst->output = dst_discard_out;
        dst->error = 0;
        dst->obsolete = initial_obsolete;
        dst->header_len = 0;
        dst->trailer_len = 0;
#ifdef CONFIG_IP_ROUTE_CLASSID
        dst->tclassid = 0;
#endif
        dst->lwtstate = NULL;
        atomic_set(&dst->__refcnt, initial_ref);
        dst->__use = 0;
        dst->lastuse = jiffies;
        dst->flags = flags;
        if (!(flags & DST_NOCOUNT))
                dst_entries_add(ops, 1);
}
EXPORT_SYMBOL(dst_init);

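/*
 * dst_alloc - allocate and initialize a dst_entry
 *
 * If the protocol supplies a ->gc() hook and its fast entry count is
 * above gc_thresh, garbage collection runs first; allocation fails if
 * gc cannot make room.  GFP_ATOMIC is used since dst entries are
 * typically allocated from packet processing context.
 */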
void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
                int initial_ref, int initial_obsolete, unsigned short flags)
{
        struct dst_entry *dst;

        if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) {
                if (ops->gc(ops))
                        return NULL;
        }

        dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC);
        if (!dst)
                return NULL;

        dst_init(dst, ops, dev, initial_ref, initial_obsolete, flags);

        return dst;
}
EXPORT_SYMBOL(dst_alloc);

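/*
 * dst_destroy - tear down a dst_entry whose last reference is gone
 *
 * Reverses dst_init(): drops the ops accounting, invokes the
 * protocol's ->destroy() hook, releases the device and lwtunnel
 * state, and frees the entry (metadata dsts take the kfree() path).
 * For an xfrm bundle, the child dst is released as well.  Always
 * returns NULL.
 */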
struct dst_entry *dst_destroy(struct dst_entry *dst)
{
        struct dst_entry *child = NULL;

        smp_rmb();

#ifdef CONFIG_XFRM
        if (dst->xfrm) {
                struct xfrm_dst *xdst = (struct xfrm_dst *) dst;

                child = xdst->child;
        }
#endif
        if (!(dst->flags & DST_NOCOUNT))
                dst_entries_add(dst->ops, -1);

        if (dst->ops->destroy)
                dst->ops->destroy(dst);
        if (dst->dev)
                dev_put(dst->dev);

        lwtstate_put(dst->lwtstate);

        if (dst->flags & DST_METADATA)
                metadata_dst_free((struct metadata_dst *)dst);
        else
                kmem_cache_free(dst->ops->kmem_cachep, dst);

        dst = child;
        if (dst)
                dst_release_immediate(dst);
        return NULL;
}
EXPORT_SYMBOL(dst_destroy);

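/* RCU callback: runs once a full grace period has elapsed after the
 * refcount hit zero, so no rcu_read_lock() reader can still be using
 * the entry.  dst_destroy() always returns NULL here.
 */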
static void dst_destroy_rcu(struct rcu_head *head)
{
        struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head);

        dst = dst_destroy(dst);
}

/* Operations to mark dst as DEAD and clean up the net device referenced
 * by dst:
 * 1. put the dst under loopback interface and discard all tx/rx packets
 *    on this route.
 * 2. release the net_device
 * This function should be called when removing routes from the fib tree
 * in preparation for a NETDEV_DOWN/NETDEV_UNREGISTER event and also to
 * make the next dst_ops->check() fail.
 */
void dst_dev_put(struct dst_entry *dst)
{
        struct net_device *dev = dst->dev;

        dst->obsolete = DST_OBSOLETE_DEAD;
        if (dst->ops->ifdown)
                dst->ops->ifdown(dst, dev, true);
        dst->input = dst_discard;
        dst->output = dst_discard_out;
        dst->dev = dev_net(dst->dev)->loopback_dev;
        dev_hold(dst->dev);
        dev_put(dev);
}
EXPORT_SYMBOL(dst_dev_put);

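/*
 * dst_release - drop a reference to a dst_entry
 *
 * When the count reaches zero the entry is freed through an RCU
 * callback, keeping concurrent rcu_read_lock() readers safe.  A
 * negative count indicates a refcounting bug and is reported
 * (rate limited) instead of crashing.
 */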
void dst_release(struct dst_entry *dst)
{
        if (dst) {
                int newrefcnt;

                newrefcnt = atomic_dec_return(&dst->__refcnt);
                if (unlikely(newrefcnt < 0))
                        net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
                                             __func__, dst, newrefcnt);
                if (!newrefcnt)
                        call_rcu(&dst->rcu_head, dst_destroy_rcu);
        }
}
EXPORT_SYMBOL(dst_release);

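/*
 * dst_release_immediate - like dst_release(), but frees the entry
 * without waiting for an RCU grace period.  Only safe on entries that
 * were never published where an RCU reader could find them (e.g. a
 * freshly built entry torn down on an error path).
 */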
void dst_release_immediate(struct dst_entry *dst)
{
        if (dst) {
                int newrefcnt;

                newrefcnt = atomic_dec_return(&dst->__refcnt);
                if (unlikely(newrefcnt < 0))
                        net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
                                             __func__, dst, newrefcnt);
                if (!newrefcnt)
                        dst_destroy(dst);
        }
}
EXPORT_SYMBOL(dst_release_immediate);

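/*
 * dst_cow_metrics_generic - copy-on-write of the shared metrics array
 *
 * Allocates a private, refcounted copy of the current (possibly
 * read-only) metrics and installs it with cmpxchg().  If another CPU
 * races and wins, the copy is freed and the winner's writable metrics
 * are returned instead; NULL means the winner's metrics are read-only
 * and the caller must not write.
 */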
u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old)
{
        struct dst_metrics *p = kmalloc(sizeof(*p), GFP_ATOMIC);

        if (p) {
                struct dst_metrics *old_p = (struct dst_metrics *)__DST_METRICS_PTR(old);
                unsigned long prev, new;

                refcount_set(&p->refcnt, 1);
                memcpy(p->metrics, old_p->metrics, sizeof(p->metrics));

                new = (unsigned long) p;
                prev = cmpxchg(&dst->_metrics, old, new);

                if (prev != old) {
                        kfree(p);
                        p = (struct dst_metrics *)__DST_METRICS_PTR(prev);
                        if (prev & DST_METRICS_READ_ONLY)
                                p = NULL;
                } else if (prev & DST_METRICS_REFCOUNTED) {
                        if (refcount_dec_and_test(&old_p->refcnt))
                                kfree(old_p);
                }
        }
        BUILD_BUG_ON(offsetof(struct dst_metrics, metrics) != 0);
        return (u32 *)p;
}
EXPORT_SYMBOL(dst_cow_metrics_generic);

/* Caller asserts that dst_metrics_read_only(dst) is false.  */
void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old)
{
        unsigned long prev, new;

        new = ((unsigned long) &dst_default_metrics) | DST_METRICS_READ_ONLY;
        prev = cmpxchg(&dst->_metrics, old, new);
        if (prev == old)
                kfree(__DST_METRICS_PTR(old));
}
EXPORT_SYMBOL(__dst_destroy_metrics_generic);

static struct dst_ops md_dst_ops = {
        .family =               AF_UNSPEC,
};

static int dst_md_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        WARN_ONCE(1, "Attempting to call output on metadata dst\n");
        kfree_skb(skb);
        return 0;
}

static int dst_md_discard(struct sk_buff *skb)
{
        WARN_ONCE(1, "Attempting to call input on metadata dst\n");
        kfree_skb(skb);
        return 0;
}

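/* Common initializer for metadata dsts: no device, a single reference,
 * no ops accounting (DST_NOCOUNT), and warning discard handlers, since
 * a metadata dst only carries per-packet tunnel state and must never
 * be used for actual input or output.  The memset() clears everything
 * past the embedded dst_entry, including the trailing options area.
 */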
static void __metadata_dst_init(struct metadata_dst *md_dst,
                                enum metadata_type type, u8 optslen)
{
        struct dst_entry *dst;

        dst = &md_dst->dst;
        dst_init(dst, &md_dst_ops, NULL, 1, DST_OBSOLETE_NONE,
                 DST_METADATA | DST_NOCOUNT);

        dst->input = dst_md_discard;
        dst->output = dst_md_discard_out;

        memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst));
        md_dst->type = type;
}

struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type,
                                        gfp_t flags)
{
        struct metadata_dst *md_dst;

        md_dst = kmalloc(sizeof(*md_dst) + optslen, flags);
        if (!md_dst)
                return NULL;

        __metadata_dst_init(md_dst, type, optslen);

        return md_dst;
}
EXPORT_SYMBOL_GPL(metadata_dst_alloc);

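/* Free a metadata dst from metadata_dst_alloc() or dst_destroy().  The
 * IP tunnel dst_cache holds per-cpu dst references that must be
 * released before the containing structure goes away.
 */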
void metadata_dst_free(struct metadata_dst *md_dst)
{
#ifdef CONFIG_DST_CACHE
        if (md_dst->type == METADATA_IP_TUNNEL)
                dst_cache_destroy(&md_dst->u.tun_info.dst_cache);
#endif
        kfree(md_dst);
}

struct metadata_dst __percpu *
metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags)
{
        int cpu;
        struct metadata_dst __percpu *md_dst;

        md_dst = __alloc_percpu_gfp(sizeof(struct metadata_dst) + optslen,
                                    __alignof__(struct metadata_dst), flags);
        if (!md_dst)
                return NULL;

        for_each_possible_cpu(cpu)
                __metadata_dst_init(per_cpu_ptr(md_dst, cpu), type, optslen);

        return md_dst;
}
EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu);

void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst)
{
#ifdef CONFIG_DST_CACHE
        int cpu;

        for_each_possible_cpu(cpu) {
                struct metadata_dst *one_md_dst = per_cpu_ptr(md_dst, cpu);

                if (one_md_dst->type == METADATA_IP_TUNNEL)
                        dst_cache_destroy(&one_md_dst->u.tun_info.dst_cache);
        }
#endif
        free_percpu(md_dst);
}
EXPORT_SYMBOL_GPL(metadata_dst_free_percpu);