/*
 * net/core/dst.c       Protocol independent destination cache.
 *
 * Authors:             Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 */

#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/string.h>
#include <linux/types.h>
#include <net/net_namespace.h>
#include <linux/sched.h>
#include <linux/prefetch.h>
#include <net/lwtunnel.h>

#include <net/dst.h>
#include <net/dst_metadata.h>
/*
 * Theory of operations:
 * dst entries are reference counted.  Once the last reference is
 * dropped via dst_release(), the entry is freed after an RCU grace
 * period (see dst_destroy_rcu()), so lockless readers under
 * rcu_read_lock() can keep using it in the meantime.
 * dst_release_immediate() skips the grace period and must only be
 * used when no such readers can exist.
 * (The old gc list/spinlock machinery and __dst_free() are gone;
 * this file no longer runs a garbage-collect work item of its own.)
 */

int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        kfree_skb(skb);
        return 0;
}
EXPORT_SYMBOL(dst_discard_out);

const struct dst_metrics dst_default_metrics = {
        /* This initializer is needed to force the linker to place this
         * variable into a const section.  Otherwise it might end up in
         * the bss section.  We really want to avoid false sharing on
         * this variable, and to catch any writes to it.
         */
        .refcnt = REFCOUNT_INIT(1),
};

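/* Initialize a pre-allocated dst_entry: take a reference on @dev (if
 * any), point the metrics at the shared read-only defaults, install
 * discard handlers until the owner sets real input/output routines,
 * and account the entry against @ops unless DST_NOCOUNT is set.
 */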
void dst_init(struct dst_entry *dst, struct dst_ops *ops,
              struct net_device *dev, int initial_ref, int initial_obsolete,
              unsigned short flags)
{
        dst->child = NULL;
        dst->dev = dev;
        if (dev)
                dev_hold(dev);
        dst->ops = ops;
        dst_init_metrics(dst, dst_default_metrics.metrics, true);
        dst->expires = 0UL;
        dst->path = dst;
        dst->from = NULL;
#ifdef CONFIG_XFRM
        dst->xfrm = NULL;
#endif
        dst->input = dst_discard;
        dst->output = dst_discard_out;
        dst->error = 0;
        dst->obsolete = initial_obsolete;
        dst->header_len = 0;
        dst->trailer_len = 0;
#ifdef CONFIG_IP_ROUTE_CLASSID
        dst->tclassid = 0;
#endif
        dst->lwtstate = NULL;
        atomic_set(&dst->__refcnt, initial_ref);
        dst->__use = 0;
        dst->lastuse = jiffies;
        dst->flags = flags;
        dst->next = NULL;
        if (!(flags & DST_NOCOUNT))
                dst_entries_add(ops, 1);
}
EXPORT_SYMBOL(dst_init);

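/* Allocate and initialize a dst_entry from @ops->kmem_cachep.  If the
 * protocol provides a garbage collector and the entry count is above
 * gc_thresh, let gc try to make room first; a non-zero gc return, or
 * a failed GFP_ATOMIC allocation, yields NULL.
 */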
void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
                int initial_ref, int initial_obsolete, unsigned short flags)
{
        struct dst_entry *dst;

        if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) {
                if (ops->gc(ops))
                        return NULL;
        }

        dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC);
        if (!dst)
                return NULL;

        dst_init(dst, ops, dev, initial_ref, initial_obsolete, flags);

        return dst;
}
EXPORT_SYMBOL(dst_alloc);

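/* Final teardown, run once the refcount has hit zero: undo the
 * accounting and device reference taken in dst_init(), invoke the
 * protocol's destroy hook, and free the entry (metadata dsts have
 * their own free path).  A child dst, if any, loses one reference
 * via dst_release_immediate().
 */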
struct dst_entry *dst_destroy(struct dst_entry *dst)
{
        struct dst_entry *child;

        smp_rmb();

        child = dst->child;

        if (!(dst->flags & DST_NOCOUNT))
                dst_entries_add(dst->ops, -1);

        if (dst->ops->destroy)
                dst->ops->destroy(dst);
        if (dst->dev)
                dev_put(dst->dev);

        lwtstate_put(dst->lwtstate);

        if (dst->flags & DST_METADATA)
                metadata_dst_free((struct metadata_dst *)dst);
        else
                kmem_cache_free(dst->ops->kmem_cachep, dst);

        dst = child;
        if (dst)
                dst_release_immediate(dst);
        return NULL;
}
EXPORT_SYMBOL(dst_destroy);

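/* RCU callback: actually tear the dst down once readers are done. */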
static void dst_destroy_rcu(struct rcu_head *head)
{
        struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head);

        dst_destroy(dst);
}

/* Operations to mark dst as DEAD and clean up the net device referenced
 * by dst:
 * 1. put the dst under the loopback interface and discard all tx/rx
 *    packets on this route.
 * 2. release the net_device
 * This function should be called when removing routes from the fib tree
 * in preparation for a NETDEV_DOWN/NETDEV_UNREGISTER event and also to
 * make the next dst_ops->check() fail.
 */
void dst_dev_put(struct dst_entry *dst)
{
        struct net_device *dev = dst->dev;

        dst->obsolete = DST_OBSOLETE_DEAD;
        if (dst->ops->ifdown)
                dst->ops->ifdown(dst, dev, true);
        dst->input = dst_discard;
        dst->output = dst_discard_out;
        dst->dev = dev_net(dst->dev)->loopback_dev;
        dev_hold(dst->dev);
        dev_put(dev);
}
EXPORT_SYMBOL(dst_dev_put);

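/* Drop a reference.  When the count reaches zero, freeing is deferred
 * by an RCU grace period, so lockless readers that looked the entry up
 * under rcu_read_lock() may still dereference it until then.
 */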
void dst_release(struct dst_entry *dst)
{
        if (dst) {
                int newrefcnt;

                newrefcnt = atomic_dec_return(&dst->__refcnt);
                if (unlikely(newrefcnt < 0))
                        net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
                                             __func__, dst, newrefcnt);
                if (!newrefcnt)
                        call_rcu(&dst->rcu_head, dst_destroy_rcu);
        }
}
EXPORT_SYMBOL(dst_release);

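/* Like dst_release(), but frees the entry synchronously.  Only safe
 * when the dst cannot be reached by concurrent RCU readers.
 */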
void dst_release_immediate(struct dst_entry *dst)
{
        if (dst) {
                int newrefcnt;

                newrefcnt = atomic_dec_return(&dst->__refcnt);
                if (unlikely(newrefcnt < 0))
                        net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
                                             __func__, dst, newrefcnt);
                if (!newrefcnt)
                        dst_destroy(dst);
        }
}
EXPORT_SYMBOL(dst_release_immediate);

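/* Copy-on-write helper for dst metrics.  Metrics usually point at the
 * shared read-only dst_default_metrics; the first write allocates a
 * private refcounted copy and installs it with cmpxchg().  Returns the
 * writable metrics array, or NULL if the allocation failed or a racing
 * writer installed a read-only pointer.
 */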
u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old)
{
        struct dst_metrics *p = kmalloc(sizeof(*p), GFP_ATOMIC);

        if (p) {
                struct dst_metrics *old_p = (struct dst_metrics *)__DST_METRICS_PTR(old);
                unsigned long prev, new;

                refcount_set(&p->refcnt, 1);
                memcpy(p->metrics, old_p->metrics, sizeof(p->metrics));

                new = (unsigned long) p;
                prev = cmpxchg(&dst->_metrics, old, new);

                if (prev != old) {
                        /* Lost the race: drop our copy and use whatever
                         * the winner installed, unless it is read-only.
                         */
                        kfree(p);
                        p = (struct dst_metrics *)__DST_METRICS_PTR(prev);
                        if (prev & DST_METRICS_READ_ONLY)
                                p = NULL;
                } else if (prev & DST_METRICS_REFCOUNTED) {
                        /* We replaced a refcounted set: drop its reference. */
                        if (refcount_dec_and_test(&old_p->refcnt))
                                kfree(old_p);
                }
        }
        BUILD_BUG_ON(offsetof(struct dst_metrics, metrics) != 0);
        return (u32 *)p;
}
EXPORT_SYMBOL(dst_cow_metrics_generic);

/* Caller asserts that dst_metrics_read_only(dst) is false.  */
void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old)
{
        unsigned long prev, new;

        new = ((unsigned long) &dst_default_metrics) | DST_METRICS_READ_ONLY;
        prev = cmpxchg(&dst->_metrics, old, new);
        if (prev == old)
                kfree(__DST_METRICS_PTR(old));
}
EXPORT_SYMBOL(__dst_destroy_metrics_generic);

static struct dst_ops md_dst_ops = {
        .family =               AF_UNSPEC,
};

static int dst_md_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        WARN_ONCE(1, "Attempting to call output on metadata dst\n");
        kfree_skb(skb);
        return 0;
}

static int dst_md_discard(struct sk_buff *skb)
{
        WARN_ONCE(1, "Attempting to call input on metadata dst\n");
        kfree_skb(skb);
        return 0;
}

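/* A metadata dst carries tunnel/flow metadata on an skb instead of
 * describing a real route: it holds no device, is exempt from entry
 * accounting (DST_NOCOUNT), and its input/output handlers warn and
 * drop, since a metadata dst is consumed by the tunnel code rather
 * than used for forwarding.
 */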
static void __metadata_dst_init(struct metadata_dst *md_dst,
                                enum metadata_type type, u8 optslen)
{
        struct dst_entry *dst;

        dst = &md_dst->dst;
        dst_init(dst, &md_dst_ops, NULL, 1, DST_OBSOLETE_NONE,
                 DST_METADATA | DST_NOCOUNT);

        dst->input = dst_md_discard;
        dst->output = dst_md_discard_out;

        /* Zero everything past the embedded dst_entry: the rest of
         * struct metadata_dst plus the trailing option area.
         */
        memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst));
        md_dst->type = type;
}

struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type,
                                        gfp_t flags)
{
        struct metadata_dst *md_dst;

        md_dst = kmalloc(sizeof(*md_dst) + optslen, flags);
        if (!md_dst)
                return NULL;

        __metadata_dst_init(md_dst, type, optslen);

        return md_dst;
}
EXPORT_SYMBOL_GPL(metadata_dst_alloc);
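
/* Usage sketch (illustrative, not lifted from a specific driver): a
 * tunnel in collect_md mode could attach receive metadata like
 *
 *	md = metadata_dst_alloc(0, METADATA_IP_TUNNEL, GFP_ATOMIC);
 *	if (!md)
 *		goto drop;
 *	md->u.tun_info.key.tun_id = tun_id;
 *	skb_dst_set(skb, &md->dst);
 *
 * where "tun_id" stands in for the tunnel-specific fields a real
 * driver would fill in.
 */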

void metadata_dst_free(struct metadata_dst *md_dst)
{
#ifdef CONFIG_DST_CACHE
        if (md_dst->type == METADATA_IP_TUNNEL)
                dst_cache_destroy(&md_dst->u.tun_info.dst_cache);
#endif
        kfree(md_dst);
}

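/* Per-CPU variant of metadata_dst_alloc(): one initialized
 * metadata_dst per possible CPU, so hot paths can fill in their local
 * copy without cross-CPU synchronization.
 */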
struct metadata_dst __percpu *
metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags)
{
        int cpu;
        struct metadata_dst __percpu *md_dst;

        md_dst = __alloc_percpu_gfp(sizeof(struct metadata_dst) + optslen,
                                    __alignof__(struct metadata_dst), flags);
        if (!md_dst)
                return NULL;

        for_each_possible_cpu(cpu)
                __metadata_dst_init(per_cpu_ptr(md_dst, cpu), type, optslen);

        return md_dst;
}
EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu);