/*
 * net/core/dst.c       Protocol independent destination cache.
 *
 * Authors:             Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 */

#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/string.h>
#include <linux/types.h>
#include <net/net_namespace.h>
#include <linux/sched.h>
#include <linux/prefetch.h>
#include <net/lwtunnel.h>
#include <net/xfrm.h>

#include <net/dst.h>
#include <net/dst_metadata.h>

/*
 * Theory of operations:
 * dst entries were once garbage collected from a spinlock-protected
 * list by a workqueue task, but that machinery is gone.  Lifetime is
 * now driven entirely by the per-entry refcount: the last dst_release()
 * defers destruction to an RCU callback (dst_destroy_rcu()), and
 * dst_dev_put() detaches a dying entry from its device by re-pointing
 * it at the loopback device.
 */

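/*
 * Illustrative sketch (not part of this file): a typical refcounted
 * lifetime under this scheme.
 *
 *      dst = dst_alloc(ops, dev, 1, DST_OBSOLETE_NONE, 0);
 *      dst_hold(dst);          // a second user takes a reference
 *      ...
 *      dst_release(dst);       // refcount 2 -> 1
 *      dst_release(dst);       // 1 -> 0: dst_destroy_rcu() runs after RCU
 */
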
/* Fallback output handler: drop the packet.  Installed by dst_init()
 * and by dst_dev_put() for dead routes.
 */
int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        kfree_skb(skb);
        return 0;
}
EXPORT_SYMBOL(dst_discard_out);

const struct dst_metrics dst_default_metrics = {
        /* This initializer is needed to force the linker to place this
         * variable into the const section.  Otherwise it might end up in
         * the bss section.  We really want to avoid false sharing on this
         * variable, and to catch any writes to it.
         */
        .refcnt = REFCOUNT_INIT(1),
};
EXPORT_SYMBOL(dst_default_metrics);

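/*
 * Illustrative sketch (not part of this file): dst->_metrics packs the
 * metrics pointer and flag bits into one word.  Paraphrased from
 * include/net/dst.h, dst_init_metrics() (used by dst_init() below) tags
 * dst_default_metrics as read-only roughly like this:
 *
 *      #define DST_METRICS_READ_ONLY   0x1UL
 *      #define DST_METRICS_REFCOUNTED  0x2UL
 *      #define __DST_METRICS_PTR(Y)    ((u32 *)((Y) & ~DST_METRICS_FLAGS))
 *
 *      dst->_metrics = ((unsigned long) metrics) | DST_METRICS_READ_ONLY;
 */
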
void dst_init(struct dst_entry *dst, struct dst_ops *ops,
              struct net_device *dev, int initial_ref, int initial_obsolete,
              unsigned short flags)
{
        dst->dev = dev;
        if (dev)
                dev_hold(dev);
        dst->ops = ops;
        dst_init_metrics(dst, dst_default_metrics.metrics, true);
        dst->expires = 0UL;
#ifdef CONFIG_XFRM
        dst->xfrm = NULL;
#endif
        dst->input = dst_discard;
        dst->output = dst_discard_out;
        dst->error = 0;
        dst->obsolete = initial_obsolete;
        dst->header_len = 0;
        dst->trailer_len = 0;
#ifdef CONFIG_IP_ROUTE_CLASSID
        dst->tclassid = 0;
#endif
        dst->lwtstate = NULL;
        atomic_set(&dst->__refcnt, initial_ref);
        dst->__use = 0;
        dst->lastuse = jiffies;
        dst->flags = flags;
        if (!(flags & DST_NOCOUNT))
                dst_entries_add(ops, 1);
}
EXPORT_SYMBOL(dst_init);

void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
                int initial_ref, int initial_obsolete, unsigned short flags)
{
        struct dst_entry *dst;

        if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) {
                if (ops->gc(ops)) {
                        printk_ratelimited(KERN_NOTICE "Route cache is full: "
                                           "consider increasing sysctl "
                                           "net.ipv[4|6].route.max_size.\n");
                        return NULL;
                }
        }

        dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC);
        if (!dst)
                return NULL;

        dst_init(dst, ops, dev, initial_ref, initial_obsolete, flags);

        return dst;
}
EXPORT_SYMBOL(dst_alloc);

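/*
 * Illustrative sketch (not part of this file): protocols layer their
 * own route structure on top of dst_entry and allocate it through
 * dst_alloc().  Paraphrased from the IPv4 rt_dst_alloc() helper in
 * net/ipv4/route.c:
 *
 *      struct rtable *rt;
 *
 *      rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
 *                     (nopolicy ? DST_NOPOLICY : 0) |
 *                     (noxfrm ? DST_NOXFRM : 0));
 *      if (rt) {
 *              // initialize the protocol-private fields of *rt
 *      }
 */
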
struct dst_entry *dst_destroy(struct dst_entry *dst)
{
        struct dst_entry *child = NULL;

        smp_rmb();

#ifdef CONFIG_XFRM
        if (dst->xfrm) {
                struct xfrm_dst *xdst = (struct xfrm_dst *) dst;

                child = xdst->child;
        }
#endif
        if (!(dst->flags & DST_NOCOUNT))
                dst_entries_add(dst->ops, -1);

        if (dst->ops->destroy)
                dst->ops->destroy(dst);
        if (dst->dev)
                dev_put(dst->dev);

        lwtstate_put(dst->lwtstate);

        if (dst->flags & DST_METADATA)
                metadata_dst_free((struct metadata_dst *)dst);
        else
                kmem_cache_free(dst->ops->kmem_cachep, dst);

        dst = child;
        if (dst)
                dst_release_immediate(dst);
        return NULL;
}
EXPORT_SYMBOL(dst_destroy);

static void dst_destroy_rcu(struct rcu_head *head)
{
        struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head);

        dst = dst_destroy(dst);
}

/* Operations to mark dst as DEAD and clean up the net device referenced
 * by dst:
 * 1. put the dst under the loopback interface and discard all tx/rx
 *    packets on this route.
 * 2. release the net_device
 * This function should be called when removing routes from the fib tree
 * in preparation for a NETDEV_DOWN/NETDEV_UNREGISTER event and also to
 * make the next dst_ops->check() fail.
 */
void dst_dev_put(struct dst_entry *dst)
{
        struct net_device *dev = dst->dev;

        dst->obsolete = DST_OBSOLETE_DEAD;
        if (dst->ops->ifdown)
                dst->ops->ifdown(dst, dev, true);
        dst->input = dst_discard;
        dst->output = dst_discard_out;
        dst->dev = dev_net(dst->dev)->loopback_dev;
        dev_hold(dst->dev);
        dev_put(dev);
}
EXPORT_SYMBOL(dst_dev_put);

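/*
 * Illustrative sketch (not part of this file): a caller tearing down a
 * cached route on device removal typically pairs dst_dev_put() with a
 * final reference drop, e.g. (pattern as in net/ipv6/ip6_fib.c):
 *
 *      dst_dev_put(&rt->dst);
 *      dst_release(&rt->dst);
 */
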
void dst_release(struct dst_entry *dst)
{
        if (dst) {
                int newrefcnt;

                newrefcnt = atomic_dec_return(&dst->__refcnt);
                if (unlikely(newrefcnt < 0))
                        net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
                                             __func__, dst, newrefcnt);
                if (!newrefcnt)
                        call_rcu(&dst->rcu_head, dst_destroy_rcu);
        }
}
EXPORT_SYMBOL(dst_release);

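/*
 * Illustrative sketch (not part of this file): references are taken
 * with dst_hold()/dst_hold_safe() and dropped with dst_release();
 * because the final free is deferred via call_rcu(), lockless readers
 * under rcu_read_lock() never see a freed entry.
 *
 *      if (dst_hold_safe(dst))
 *              skb_dst_set(skb, dst);  // skb owns the reference;
 *                                      // kfree_skb() ends in dst_release()
 *
 * dst_release_immediate() below skips the RCU grace period and must
 * only be used when no such lockless readers can exist.
 */
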
void dst_release_immediate(struct dst_entry *dst)
{
        if (dst) {
                int newrefcnt;

                newrefcnt = atomic_dec_return(&dst->__refcnt);
                if (unlikely(newrefcnt < 0))
                        net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
                                             __func__, dst, newrefcnt);
                if (!newrefcnt)
                        dst_destroy(dst);
        }
}
EXPORT_SYMBOL(dst_release_immediate);

u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old)
{
        struct dst_metrics *p = kmalloc(sizeof(*p), GFP_ATOMIC);

        if (p) {
                struct dst_metrics *old_p = (struct dst_metrics *)__DST_METRICS_PTR(old);
                unsigned long prev, new;

                refcount_set(&p->refcnt, 1);
                memcpy(p->metrics, old_p->metrics, sizeof(p->metrics));

                new = (unsigned long) p;
                prev = cmpxchg(&dst->_metrics, old, new);

                if (prev != old) {
                        kfree(p);
                        p = (struct dst_metrics *)__DST_METRICS_PTR(prev);
                        if (prev & DST_METRICS_READ_ONLY)
                                p = NULL;
                } else if (prev & DST_METRICS_REFCOUNTED) {
                        if (refcount_dec_and_test(&old_p->refcnt))
                                kfree(old_p);
                }
        }
        BUILD_BUG_ON(offsetof(struct dst_metrics, metrics) != 0);
        return (u32 *)p;
}
EXPORT_SYMBOL(dst_cow_metrics_generic);

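/*
 * Illustrative sketch (not part of this file): writers reach the COW
 * path above via dst_metrics_write_ptr().  Paraphrased from
 * dst_metric_set() in include/net/dst.h:
 *
 *      u32 *p = dst_metrics_write_ptr(dst);   // may call ops->cow_metrics
 *
 *      if (p)
 *              p[metric - 1] = val;
 */
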
/* Caller asserts that dst_metrics_read_only(dst) is false.  */
void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old)
{
        unsigned long prev, new;

        new = ((unsigned long) &dst_default_metrics) | DST_METRICS_READ_ONLY;
        prev = cmpxchg(&dst->_metrics, old, new);
        if (prev == old)
                kfree(__DST_METRICS_PTR(old));
}
EXPORT_SYMBOL(__dst_destroy_metrics_generic);

static struct dst_ops md_dst_ops = {
        .family =               AF_UNSPEC,
};

static int dst_md_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        WARN_ONCE(1, "Attempting to call output on metadata dst\n");
        kfree_skb(skb);
        return 0;
}

static int dst_md_discard(struct sk_buff *skb)
{
        WARN_ONCE(1, "Attempting to call input on metadata dst\n");
        kfree_skb(skb);
        return 0;
}

static void __metadata_dst_init(struct metadata_dst *md_dst,
                                enum metadata_type type, u8 optslen)
{
        struct dst_entry *dst;

        dst = &md_dst->dst;
        dst_init(dst, &md_dst_ops, NULL, 1, DST_OBSOLETE_NONE,
                 DST_METADATA | DST_NOCOUNT);

        dst->input = dst_md_discard;
        dst->output = dst_md_discard_out;

        memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst));
        md_dst->type = type;
}

struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type,
                                        gfp_t flags)
{
        struct metadata_dst *md_dst;

        md_dst = kmalloc(sizeof(*md_dst) + optslen, flags);
        if (!md_dst)
                return NULL;

        __metadata_dst_init(md_dst, type, optslen);

        return md_dst;
}
EXPORT_SYMBOL_GPL(metadata_dst_alloc);

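/*
 * Illustrative sketch (not part of this file): collect_md tunnel
 * drivers attach per-packet tunnel metadata on receive roughly like
 * this (pattern as in the ip_tunnels.h rx helpers):
 *
 *      struct metadata_dst *tun_dst;
 *
 *      tun_dst = metadata_dst_alloc(md_size, METADATA_IP_TUNNEL, GFP_ATOMIC);
 *      if (!tun_dst)
 *              return NULL;
 *      // fill in tun_dst->u.tun_info, then hand it to the skb:
 *      skb_dst_set(skb, &tun_dst->dst);
 */
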
void metadata_dst_free(struct metadata_dst *md_dst)
{
#ifdef CONFIG_DST_CACHE
        if (md_dst->type == METADATA_IP_TUNNEL)
                dst_cache_destroy(&md_dst->u.tun_info.dst_cache);
#endif
        kfree(md_dst);
}
EXPORT_SYMBOL_GPL(metadata_dst_free);

struct metadata_dst __percpu *
metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags)
{
        int cpu;
        struct metadata_dst __percpu *md_dst;

        md_dst = __alloc_percpu_gfp(sizeof(struct metadata_dst) + optslen,
                                    __alignof__(struct metadata_dst), flags);
        if (!md_dst)
                return NULL;

        for_each_possible_cpu(cpu)
                __metadata_dst_init(per_cpu_ptr(md_dst, cpu), type, optslen);

        return md_dst;
}
EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu);

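/*
 * Illustrative sketch (not part of this file): users that need a
 * scratch metadata dst per CPU (the BPF tunnel-key helpers do this)
 * allocate once at setup and pick the local copy in the datapath:
 *
 *      md = metadata_dst_alloc_percpu(optslen, METADATA_IP_TUNNEL,
 *                                     GFP_KERNEL);
 *      ...
 *      this_md = this_cpu_ptr(md);     // per-packet use on this CPU
 */
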
void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst)
{
#ifdef CONFIG_DST_CACHE
        int cpu;

        for_each_possible_cpu(cpu) {
                struct metadata_dst *one_md_dst = per_cpu_ptr(md_dst, cpu);

                if (one_md_dst->type == METADATA_IP_TUNNEL)
                        dst_cache_destroy(&one_md_dst->u.tun_info.dst_cache);
        }
#endif
        free_percpu(md_dst);
}
EXPORT_SYMBOL_GPL(metadata_dst_free_percpu);