linux/drivers/net/vxlan.c
<<
>>
Prefs
   1/*
   2 * VXLAN: Virtual eXtensible Local Area Network
   3 *
   4 * Copyright (c) 2012-2013 Vyatta Inc.
   5 *
   6 * This program is free software; you can redistribute it and/or modify
   7 * it under the terms of the GNU General Public License version 2 as
   8 * published by the Free Software Foundation.
   9 *
  10 * TODO
  11 *  - IPv6 (not in RFC)
  12 */
  13
  14#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  15
  16#include <linux/kernel.h>
  17#include <linux/types.h>
  18#include <linux/module.h>
  19#include <linux/errno.h>
  20#include <linux/slab.h>
  21#include <linux/skbuff.h>
  22#include <linux/rculist.h>
  23#include <linux/netdevice.h>
  24#include <linux/in.h>
  25#include <linux/ip.h>
  26#include <linux/udp.h>
  27#include <linux/igmp.h>
  28#include <linux/etherdevice.h>
  29#include <linux/if_ether.h>
  30#include <linux/hash.h>
  31#include <linux/ethtool.h>
  32#include <net/arp.h>
  33#include <net/ndisc.h>
  34#include <net/ip.h>
  35#include <net/ip_tunnels.h>
  36#include <net/icmp.h>
  37#include <net/udp.h>
  38#include <net/rtnetlink.h>
  39#include <net/route.h>
  40#include <net/dsfield.h>
  41#include <net/inet_ecn.h>
  42#include <net/net_namespace.h>
  43#include <net/netns/generic.h>
  44
  45#define VXLAN_VERSION   "0.1"
  46
  47#define VNI_HASH_BITS   10
  48#define VNI_HASH_SIZE   (1<<VNI_HASH_BITS)
  49#define FDB_HASH_BITS   8
  50#define FDB_HASH_SIZE   (1<<FDB_HASH_BITS)
  51#define FDB_AGE_DEFAULT 300 /* 5 min */
  52#define FDB_AGE_INTERVAL (10 * HZ)      /* rescan interval */
  53
  54#define VXLAN_N_VID     (1u << 24)
  55#define VXLAN_VID_MASK  (VXLAN_N_VID - 1)
  56/* IP header + UDP + VXLAN + Ethernet header */
  57#define VXLAN_HEADROOM (20 + 8 + 8 + 14)
  58
  59#define VXLAN_FLAGS 0x08000000  /* struct vxlanhdr.vx_flags required value. */
  60
  61/* VXLAN protocol header */
  62struct vxlanhdr {
  63        __be32 vx_flags;
  64        __be32 vx_vni;
  65};
  66
  67/* UDP port for VXLAN traffic.
  68 * The IANA assigned port is 4789, but the Linux default is 8472
  69 * for compatability with early adopters.
  70 */
  71static unsigned int vxlan_port __read_mostly = 8472;
  72module_param_named(udp_port, vxlan_port, uint, 0444);
  73MODULE_PARM_DESC(udp_port, "Destination UDP port");
  74
  75static bool log_ecn_error = true;
  76module_param(log_ecn_error, bool, 0644);
  77MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
  78
  79/* per-net private data for this module */
  80static unsigned int vxlan_net_id;
  81struct vxlan_net {
  82        struct socket     *sock;        /* UDP encap socket */
  83        struct hlist_head vni_list[VNI_HASH_SIZE];
  84};
  85
  86struct vxlan_rdst {
  87        struct rcu_head          rcu;
  88        __be32                   remote_ip;
  89        __be16                   remote_port;
  90        u32                      remote_vni;
  91        u32                      remote_ifindex;
  92        struct vxlan_rdst       *remote_next;
  93};
  94
  95/* Forwarding table entry */
  96struct vxlan_fdb {
  97        struct hlist_node hlist;        /* linked list of entries */
  98        struct rcu_head   rcu;
  99        unsigned long     updated;      /* jiffies */
 100        unsigned long     used;
 101        struct vxlan_rdst remote;
 102        u16               state;        /* see ndm_state */
 103        u8                flags;        /* see ndm_flags */
 104        u8                eth_addr[ETH_ALEN];
 105};
 106
 107/* Pseudo network device */
 108struct vxlan_dev {
 109        struct hlist_node hlist;
 110        struct net_device *dev;
 111        struct vxlan_rdst default_dst;  /* default destination */
 112        __be32            saddr;        /* source address */
 113        __be16            dst_port;
 114        __u16             port_min;     /* source port range */
 115        __u16             port_max;
 116        __u8              tos;          /* TOS override */
 117        __u8              ttl;
 118        u32               flags;        /* VXLAN_F_* below */
 119
 120        unsigned long     age_interval;
 121        struct timer_list age_timer;
 122        spinlock_t        hash_lock;
 123        unsigned int      addrcnt;
 124        unsigned int      addrmax;
 125
 126        struct hlist_head fdb_head[FDB_HASH_SIZE];
 127};
 128
 129#define VXLAN_F_LEARN   0x01
 130#define VXLAN_F_PROXY   0x02
 131#define VXLAN_F_RSC     0x04
 132#define VXLAN_F_L2MISS  0x08
 133#define VXLAN_F_L3MISS  0x10
 134
 135/* salt for hash table */
 136static u32 vxlan_salt __read_mostly;
 137
 138static inline struct hlist_head *vni_head(struct net *net, u32 id)
 139{
 140        struct vxlan_net *vn = net_generic(net, vxlan_net_id);
 141
 142        return &vn->vni_list[hash_32(id, VNI_HASH_BITS)];
 143}
 144
 145/* Look up VNI in a per net namespace table */
 146static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id)
 147{
 148        struct vxlan_dev *vxlan;
 149
 150        hlist_for_each_entry_rcu(vxlan, vni_head(net, id), hlist) {
 151                if (vxlan->default_dst.remote_vni == id)
 152                        return vxlan;
 153        }
 154
 155        return NULL;
 156}
 157
 158/* Fill in neighbour message in skbuff. */
 159static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
 160                           const struct vxlan_fdb *fdb,
 161                           u32 portid, u32 seq, int type, unsigned int flags,
 162                           const struct vxlan_rdst *rdst)
 163{
 164        unsigned long now = jiffies;
 165        struct nda_cacheinfo ci;
 166        struct nlmsghdr *nlh;
 167        struct ndmsg *ndm;
 168        bool send_ip, send_eth;
 169
 170        nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
 171        if (nlh == NULL)
 172                return -EMSGSIZE;
 173
 174        ndm = nlmsg_data(nlh);
 175        memset(ndm, 0, sizeof(*ndm));
 176
 177        send_eth = send_ip = true;
 178
 179        if (type == RTM_GETNEIGH) {
 180                ndm->ndm_family = AF_INET;
 181                send_ip = rdst->remote_ip != htonl(INADDR_ANY);
 182                send_eth = !is_zero_ether_addr(fdb->eth_addr);
 183        } else
 184                ndm->ndm_family = AF_BRIDGE;
 185        ndm->ndm_state = fdb->state;
 186        ndm->ndm_ifindex = vxlan->dev->ifindex;
 187        ndm->ndm_flags = fdb->flags;
 188        ndm->ndm_type = NDA_DST;
 189
 190        if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
 191                goto nla_put_failure;
 192
 193        if (send_ip && nla_put_be32(skb, NDA_DST, rdst->remote_ip))
 194                goto nla_put_failure;
 195
 196        if (rdst->remote_port && rdst->remote_port != vxlan->dst_port &&
 197            nla_put_be16(skb, NDA_PORT, rdst->remote_port))
 198                goto nla_put_failure;
 199        if (rdst->remote_vni != vxlan->default_dst.remote_vni &&
 200            nla_put_be32(skb, NDA_VNI, rdst->remote_vni))
 201                goto nla_put_failure;
 202        if (rdst->remote_ifindex &&
 203            nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex))
 204                goto nla_put_failure;
 205
 206        ci.ndm_used      = jiffies_to_clock_t(now - fdb->used);
 207        ci.ndm_confirmed = 0;
 208        ci.ndm_updated   = jiffies_to_clock_t(now - fdb->updated);
 209        ci.ndm_refcnt    = 0;
 210
 211        if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
 212                goto nla_put_failure;
 213
 214        return nlmsg_end(skb, nlh);
 215
 216nla_put_failure:
 217        nlmsg_cancel(skb, nlh);
 218        return -EMSGSIZE;
 219}
 220
 221static inline size_t vxlan_nlmsg_size(void)
 222{
 223        return NLMSG_ALIGN(sizeof(struct ndmsg))
 224                + nla_total_size(ETH_ALEN) /* NDA_LLADDR */
 225                + nla_total_size(sizeof(__be32)) /* NDA_DST */
 226                + nla_total_size(sizeof(__be16)) /* NDA_PORT */
 227                + nla_total_size(sizeof(__be32)) /* NDA_VNI */
 228                + nla_total_size(sizeof(__u32)) /* NDA_IFINDEX */
 229                + nla_total_size(sizeof(struct nda_cacheinfo));
 230}
 231
 232static void vxlan_fdb_notify(struct vxlan_dev *vxlan,
 233                             const struct vxlan_fdb *fdb, int type)
 234{
 235        struct net *net = dev_net(vxlan->dev);
 236        struct sk_buff *skb;
 237        int err = -ENOBUFS;
 238
 239        skb = nlmsg_new(vxlan_nlmsg_size(), GFP_ATOMIC);
 240        if (skb == NULL)
 241                goto errout;
 242
 243        err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0, &fdb->remote);
 244        if (err < 0) {
 245                /* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */
 246                WARN_ON(err == -EMSGSIZE);
 247                kfree_skb(skb);
 248                goto errout;
 249        }
 250
 251        rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
 252        return;
 253errout:
 254        if (err < 0)
 255                rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
 256}
 257
 258static void vxlan_ip_miss(struct net_device *dev, __be32 ipa)
 259{
 260        struct vxlan_dev *vxlan = netdev_priv(dev);
 261        struct vxlan_fdb f;
 262
 263        memset(&f, 0, sizeof f);
 264        f.state = NUD_STALE;
 265        f.remote.remote_ip = ipa; /* goes to NDA_DST */
 266        f.remote.remote_vni = VXLAN_N_VID;
 267
 268        vxlan_fdb_notify(vxlan, &f, RTM_GETNEIGH);
 269}
 270
 271static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN])
 272{
 273        struct vxlan_fdb        f;
 274
 275        memset(&f, 0, sizeof f);
 276        f.state = NUD_STALE;
 277        memcpy(f.eth_addr, eth_addr, ETH_ALEN);
 278
 279        vxlan_fdb_notify(vxlan, &f, RTM_GETNEIGH);
 280}
 281
 282/* Hash Ethernet address */
 283static u32 eth_hash(const unsigned char *addr)
 284{
 285        u64 value = get_unaligned((u64 *)addr);
 286
 287        /* only want 6 bytes */
 288#ifdef __BIG_ENDIAN
 289        value >>= 16;
 290#else
 291        value <<= 16;
 292#endif
 293        return hash_64(value, FDB_HASH_BITS);
 294}
 295
 296/* Hash chain to use given mac address */
 297static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan,
 298                                                const u8 *mac)
 299{
 300        return &vxlan->fdb_head[eth_hash(mac)];
 301}
 302
 303/* Look up Ethernet address in forwarding table */
 304static struct vxlan_fdb *__vxlan_find_mac(struct vxlan_dev *vxlan,
 305                                        const u8 *mac)
 306
 307{
 308        struct hlist_head *head = vxlan_fdb_head(vxlan, mac);
 309        struct vxlan_fdb *f;
 310
 311        hlist_for_each_entry_rcu(f, head, hlist) {
 312                if (compare_ether_addr(mac, f->eth_addr) == 0)
 313                        return f;
 314        }
 315
 316        return NULL;
 317}
 318
 319static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan,
 320                                        const u8 *mac)
 321{
 322        struct vxlan_fdb *f;
 323
 324        f = __vxlan_find_mac(vxlan, mac);
 325        if (f)
 326                f->used = jiffies;
 327
 328        return f;
 329}
 330
 331/* Add/update destinations for multicast */
 332static int vxlan_fdb_append(struct vxlan_fdb *f,
 333                            __be32 ip, __be16 port, __u32 vni, __u32 ifindex)
 334{
 335        struct vxlan_rdst *rd_prev, *rd;
 336
 337        rd_prev = NULL;
 338        for (rd = &f->remote; rd; rd = rd->remote_next) {
 339                if (rd->remote_ip == ip &&
 340                    rd->remote_port == port &&
 341                    rd->remote_vni == vni &&
 342                    rd->remote_ifindex == ifindex)
 343                        return 0;
 344                rd_prev = rd;
 345        }
 346        rd = kmalloc(sizeof(*rd), GFP_ATOMIC);
 347        if (rd == NULL)
 348                return -ENOBUFS;
 349        rd->remote_ip = ip;
 350        rd->remote_port = port;
 351        rd->remote_vni = vni;
 352        rd->remote_ifindex = ifindex;
 353        rd->remote_next = NULL;
 354        rd_prev->remote_next = rd;
 355        return 1;
 356}
 357
 358/* Add new entry to forwarding table -- assumes lock held */
 359static int vxlan_fdb_create(struct vxlan_dev *vxlan,
 360                            const u8 *mac, __be32 ip,
 361                            __u16 state, __u16 flags,
 362                            __be16 port, __u32 vni, __u32 ifindex,
 363                            __u8 ndm_flags)
 364{
 365        struct vxlan_fdb *f;
 366        int notify = 0;
 367
 368        f = __vxlan_find_mac(vxlan, mac);
 369        if (f) {
 370                if (flags & NLM_F_EXCL) {
 371                        netdev_dbg(vxlan->dev,
 372                                   "lost race to create %pM\n", mac);
 373                        return -EEXIST;
 374                }
 375                if (f->state != state) {
 376                        f->state = state;
 377                        f->updated = jiffies;
 378                        notify = 1;
 379                }
 380                if (f->flags != ndm_flags) {
 381                        f->flags = ndm_flags;
 382                        f->updated = jiffies;
 383                        notify = 1;
 384                }
 385                if ((flags & NLM_F_APPEND) &&
 386                    is_multicast_ether_addr(f->eth_addr)) {
 387                        int rc = vxlan_fdb_append(f, ip, port, vni, ifindex);
 388
 389                        if (rc < 0)
 390                                return rc;
 391                        notify |= rc;
 392                }
 393        } else {
 394                if (!(flags & NLM_F_CREATE))
 395                        return -ENOENT;
 396
 397                if (vxlan->addrmax && vxlan->addrcnt >= vxlan->addrmax)
 398                        return -ENOSPC;
 399
 400                netdev_dbg(vxlan->dev, "add %pM -> %pI4\n", mac, &ip);
 401                f = kmalloc(sizeof(*f), GFP_ATOMIC);
 402                if (!f)
 403                        return -ENOMEM;
 404
 405                notify = 1;
 406                f->remote.remote_ip = ip;
 407                f->remote.remote_port = port;
 408                f->remote.remote_vni = vni;
 409                f->remote.remote_ifindex = ifindex;
 410                f->remote.remote_next = NULL;
 411                f->state = state;
 412                f->flags = ndm_flags;
 413                f->updated = f->used = jiffies;
 414                memcpy(f->eth_addr, mac, ETH_ALEN);
 415
 416                ++vxlan->addrcnt;
 417                hlist_add_head_rcu(&f->hlist,
 418                                   vxlan_fdb_head(vxlan, mac));
 419        }
 420
 421        if (notify)
 422                vxlan_fdb_notify(vxlan, f, RTM_NEWNEIGH);
 423
 424        return 0;
 425}
 426
 427static void vxlan_fdb_free(struct rcu_head *head)
 428{
 429        struct vxlan_fdb *f = container_of(head, struct vxlan_fdb, rcu);
 430
 431        while (f->remote.remote_next) {
 432                struct vxlan_rdst *rd = f->remote.remote_next;
 433
 434                f->remote.remote_next = rd->remote_next;
 435                kfree(rd);
 436        }
 437        kfree(f);
 438}
 439
 440static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f)
 441{
 442        netdev_dbg(vxlan->dev,
 443                    "delete %pM\n", f->eth_addr);
 444
 445        --vxlan->addrcnt;
 446        vxlan_fdb_notify(vxlan, f, RTM_DELNEIGH);
 447
 448        hlist_del_rcu(&f->hlist);
 449        call_rcu(&f->rcu, vxlan_fdb_free);
 450}
 451
 452/* Add static entry (via netlink) */
 453static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 454                         struct net_device *dev,
 455                         const unsigned char *addr, u16 flags)
 456{
 457        struct vxlan_dev *vxlan = netdev_priv(dev);
 458        struct net *net = dev_net(vxlan->dev);
 459        __be32 ip;
 460        __be16 port;
 461        u32 vni, ifindex;
 462        int err;
 463
 464        if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) {
 465                pr_info("RTM_NEWNEIGH with invalid state %#x\n",
 466                        ndm->ndm_state);
 467                return -EINVAL;
 468        }
 469
 470        if (tb[NDA_DST] == NULL)
 471                return -EINVAL;
 472
 473        if (nla_len(tb[NDA_DST]) != sizeof(__be32))
 474                return -EAFNOSUPPORT;
 475
 476        ip = nla_get_be32(tb[NDA_DST]);
 477
 478        if (tb[NDA_PORT]) {
 479                if (nla_len(tb[NDA_PORT]) != sizeof(__be16))
 480                        return -EINVAL;
 481                port = nla_get_be16(tb[NDA_PORT]);
 482        } else
 483                port = vxlan->dst_port;
 484
 485        if (tb[NDA_VNI]) {
 486                if (nla_len(tb[NDA_VNI]) != sizeof(u32))
 487                        return -EINVAL;
 488                vni = nla_get_u32(tb[NDA_VNI]);
 489        } else
 490                vni = vxlan->default_dst.remote_vni;
 491
 492        if (tb[NDA_IFINDEX]) {
 493                struct net_device *tdev;
 494
 495                if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32))
 496                        return -EINVAL;
 497                ifindex = nla_get_u32(tb[NDA_IFINDEX]);
 498                tdev = dev_get_by_index(net, ifindex);
 499                if (!tdev)
 500                        return -EADDRNOTAVAIL;
 501                dev_put(tdev);
 502        } else
 503                ifindex = 0;
 504
 505        spin_lock_bh(&vxlan->hash_lock);
 506        err = vxlan_fdb_create(vxlan, addr, ip, ndm->ndm_state, flags,
 507                               port, vni, ifindex, ndm->ndm_flags);
 508        spin_unlock_bh(&vxlan->hash_lock);
 509
 510        return err;
 511}
 512
 513/* Delete entry (via netlink) */
 514static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
 515                            struct net_device *dev,
 516                            const unsigned char *addr)
 517{
 518        struct vxlan_dev *vxlan = netdev_priv(dev);
 519        struct vxlan_fdb *f;
 520        int err = -ENOENT;
 521
 522        spin_lock_bh(&vxlan->hash_lock);
 523        f = vxlan_find_mac(vxlan, addr);
 524        if (f) {
 525                vxlan_fdb_destroy(vxlan, f);
 526                err = 0;
 527        }
 528        spin_unlock_bh(&vxlan->hash_lock);
 529
 530        return err;
 531}
 532
 533/* Dump forwarding table */
 534static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
 535                          struct net_device *dev, int idx)
 536{
 537        struct vxlan_dev *vxlan = netdev_priv(dev);
 538        unsigned int h;
 539
 540        for (h = 0; h < FDB_HASH_SIZE; ++h) {
 541                struct vxlan_fdb *f;
 542                int err;
 543
 544                hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) {
 545                        struct vxlan_rdst *rd;
 546                        for (rd = &f->remote; rd; rd = rd->remote_next) {
 547                                if (idx < cb->args[0])
 548                                        goto skip;
 549
 550                                err = vxlan_fdb_info(skb, vxlan, f,
 551                                                     NETLINK_CB(cb->skb).portid,
 552                                                     cb->nlh->nlmsg_seq,
 553                                                     RTM_NEWNEIGH,
 554                                                     NLM_F_MULTI, rd);
 555                                if (err < 0)
 556                                        break;
 557skip:
 558                                ++idx;
 559                        }
 560                }
 561        }
 562
 563        return idx;
 564}
 565
 566/* Watch incoming packets to learn mapping between Ethernet address
 567 * and Tunnel endpoint.
 568 * Return true if packet is bogus and should be droppped.
 569 */
 570static bool vxlan_snoop(struct net_device *dev,
 571                        __be32 src_ip, const u8 *src_mac)
 572{
 573        struct vxlan_dev *vxlan = netdev_priv(dev);
 574        struct vxlan_fdb *f;
 575
 576        f = vxlan_find_mac(vxlan, src_mac);
 577        if (likely(f)) {
 578                if (likely(f->remote.remote_ip == src_ip))
 579                        return false;
 580
 581                /* Don't migrate static entries, drop packets */
 582                if (f->state & NUD_NOARP)
 583                        return true;
 584
 585                if (net_ratelimit())
 586                        netdev_info(dev,
 587                                    "%pM migrated from %pI4 to %pI4\n",
 588                                    src_mac, &f->remote.remote_ip, &src_ip);
 589
 590                f->remote.remote_ip = src_ip;
 591                f->updated = jiffies;
 592        } else {
 593                /* learned new entry */
 594                spin_lock(&vxlan->hash_lock);
 595
 596                /* close off race between vxlan_flush and incoming packets */
 597                if (netif_running(dev))
 598                        vxlan_fdb_create(vxlan, src_mac, src_ip,
 599                                         NUD_REACHABLE,
 600                                         NLM_F_EXCL|NLM_F_CREATE,
 601                                         vxlan->dst_port,
 602                                         vxlan->default_dst.remote_vni,
 603                                         0, NTF_SELF);
 604                spin_unlock(&vxlan->hash_lock);
 605        }
 606
 607        return false;
 608}
 609
 610
 611/* See if multicast group is already in use by other ID */
 612static bool vxlan_group_used(struct vxlan_net *vn,
 613                             const struct vxlan_dev *this)
 614{
 615        const struct vxlan_dev *vxlan;
 616        unsigned h;
 617
 618        for (h = 0; h < VNI_HASH_SIZE; ++h)
 619                hlist_for_each_entry(vxlan, &vn->vni_list[h], hlist) {
 620                        if (vxlan == this)
 621                                continue;
 622
 623                        if (!netif_running(vxlan->dev))
 624                                continue;
 625
 626                        if (vxlan->default_dst.remote_ip == this->default_dst.remote_ip)
 627                                return true;
 628                }
 629
 630        return false;
 631}
 632
 633/* kernel equivalent to IP_ADD_MEMBERSHIP */
 634static int vxlan_join_group(struct net_device *dev)
 635{
 636        struct vxlan_dev *vxlan = netdev_priv(dev);
 637        struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
 638        struct sock *sk = vn->sock->sk;
 639        struct ip_mreqn mreq = {
 640                .imr_multiaddr.s_addr   = vxlan->default_dst.remote_ip,
 641                .imr_ifindex            = vxlan->default_dst.remote_ifindex,
 642        };
 643        int err;
 644
 645        /* Already a member of group */
 646        if (vxlan_group_used(vn, vxlan))
 647                return 0;
 648
 649        /* Need to drop RTNL to call multicast join */
 650        rtnl_unlock();
 651        lock_sock(sk);
 652        err = ip_mc_join_group(sk, &mreq);
 653        release_sock(sk);
 654        rtnl_lock();
 655
 656        return err;
 657}
 658
 659
 660/* kernel equivalent to IP_DROP_MEMBERSHIP */
 661static int vxlan_leave_group(struct net_device *dev)
 662{
 663        struct vxlan_dev *vxlan = netdev_priv(dev);
 664        struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
 665        int err = 0;
 666        struct sock *sk = vn->sock->sk;
 667        struct ip_mreqn mreq = {
 668                .imr_multiaddr.s_addr   = vxlan->default_dst.remote_ip,
 669                .imr_ifindex            = vxlan->default_dst.remote_ifindex,
 670        };
 671
 672        /* Only leave group when last vxlan is done. */
 673        if (vxlan_group_used(vn, vxlan))
 674                return 0;
 675
 676        /* Need to drop RTNL to call multicast leave */
 677        rtnl_unlock();
 678        lock_sock(sk);
 679        err = ip_mc_leave_group(sk, &mreq);
 680        release_sock(sk);
 681        rtnl_lock();
 682
 683        return err;
 684}
 685
 686/* Callback from net/ipv4/udp.c to receive packets */
 687static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 688{
 689        struct iphdr *oip;
 690        struct vxlanhdr *vxh;
 691        struct vxlan_dev *vxlan;
 692        struct pcpu_tstats *stats;
 693        __u32 vni;
 694        int err;
 695
 696        /* pop off outer UDP header */
 697        __skb_pull(skb, sizeof(struct udphdr));
 698
 699        /* Need Vxlan and inner Ethernet header to be present */
 700        if (!pskb_may_pull(skb, sizeof(struct vxlanhdr)))
 701                goto error;
 702
 703        /* Drop packets with reserved bits set */
 704        vxh = (struct vxlanhdr *) skb->data;
 705        if (vxh->vx_flags != htonl(VXLAN_FLAGS) ||
 706            (vxh->vx_vni & htonl(0xff))) {
 707                netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n",
 708                           ntohl(vxh->vx_flags), ntohl(vxh->vx_vni));
 709                goto error;
 710        }
 711
 712        __skb_pull(skb, sizeof(struct vxlanhdr));
 713
 714        /* Is this VNI defined? */
 715        vni = ntohl(vxh->vx_vni) >> 8;
 716        vxlan = vxlan_find_vni(sock_net(sk), vni);
 717        if (!vxlan) {
 718                netdev_dbg(skb->dev, "unknown vni %d\n", vni);
 719                goto drop;
 720        }
 721
 722        if (!pskb_may_pull(skb, ETH_HLEN)) {
 723                vxlan->dev->stats.rx_length_errors++;
 724                vxlan->dev->stats.rx_errors++;
 725                goto drop;
 726        }
 727
 728        skb_reset_mac_header(skb);
 729
 730        /* Re-examine inner Ethernet packet */
 731        oip = ip_hdr(skb);
 732        skb->protocol = eth_type_trans(skb, vxlan->dev);
 733
 734        /* Ignore packet loops (and multicast echo) */
 735        if (compare_ether_addr(eth_hdr(skb)->h_source,
 736                               vxlan->dev->dev_addr) == 0)
 737                goto drop;
 738
 739        if ((vxlan->flags & VXLAN_F_LEARN) &&
 740            vxlan_snoop(skb->dev, oip->saddr, eth_hdr(skb)->h_source))
 741                goto drop;
 742
 743        __skb_tunnel_rx(skb, vxlan->dev);
 744        skb_reset_network_header(skb);
 745
 746        /* If the NIC driver gave us an encapsulated packet with
 747         * CHECKSUM_UNNECESSARY and Rx checksum feature is enabled,
 748         * leave the CHECKSUM_UNNECESSARY, the device checksummed it
 749         * for us. Otherwise force the upper layers to verify it.
 750         */
 751        if (skb->ip_summed != CHECKSUM_UNNECESSARY || !skb->encapsulation ||
 752            !(vxlan->dev->features & NETIF_F_RXCSUM))
 753                skb->ip_summed = CHECKSUM_NONE;
 754
 755        skb->encapsulation = 0;
 756
 757        err = IP_ECN_decapsulate(oip, skb);
 758        if (unlikely(err)) {
 759                if (log_ecn_error)
 760                        net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
 761                                             &oip->saddr, oip->tos);
 762                if (err > 1) {
 763                        ++vxlan->dev->stats.rx_frame_errors;
 764                        ++vxlan->dev->stats.rx_errors;
 765                        goto drop;
 766                }
 767        }
 768
 769        stats = this_cpu_ptr(vxlan->dev->tstats);
 770        u64_stats_update_begin(&stats->syncp);
 771        stats->rx_packets++;
 772        stats->rx_bytes += skb->len;
 773        u64_stats_update_end(&stats->syncp);
 774
 775        netif_rx(skb);
 776
 777        return 0;
 778error:
 779        /* Put UDP header back */
 780        __skb_push(skb, sizeof(struct udphdr));
 781
 782        return 1;
 783drop:
 784        /* Consume bad packet */
 785        kfree_skb(skb);
 786        return 0;
 787}
 788
 789static int arp_reduce(struct net_device *dev, struct sk_buff *skb)
 790{
 791        struct vxlan_dev *vxlan = netdev_priv(dev);
 792        struct arphdr *parp;
 793        u8 *arpptr, *sha;
 794        __be32 sip, tip;
 795        struct neighbour *n;
 796
 797        if (dev->flags & IFF_NOARP)
 798                goto out;
 799
 800        if (!pskb_may_pull(skb, arp_hdr_len(dev))) {
 801                dev->stats.tx_dropped++;
 802                goto out;
 803        }
 804        parp = arp_hdr(skb);
 805
 806        if ((parp->ar_hrd != htons(ARPHRD_ETHER) &&
 807             parp->ar_hrd != htons(ARPHRD_IEEE802)) ||
 808            parp->ar_pro != htons(ETH_P_IP) ||
 809            parp->ar_op != htons(ARPOP_REQUEST) ||
 810            parp->ar_hln != dev->addr_len ||
 811            parp->ar_pln != 4)
 812                goto out;
 813        arpptr = (u8 *)parp + sizeof(struct arphdr);
 814        sha = arpptr;
 815        arpptr += dev->addr_len;        /* sha */
 816        memcpy(&sip, arpptr, sizeof(sip));
 817        arpptr += sizeof(sip);
 818        arpptr += dev->addr_len;        /* tha */
 819        memcpy(&tip, arpptr, sizeof(tip));
 820
 821        if (ipv4_is_loopback(tip) ||
 822            ipv4_is_multicast(tip))
 823                goto out;
 824
 825        n = neigh_lookup(&arp_tbl, &tip, dev);
 826
 827        if (n) {
 828                struct vxlan_fdb *f;
 829                struct sk_buff  *reply;
 830
 831                if (!(n->nud_state & NUD_CONNECTED)) {
 832                        neigh_release(n);
 833                        goto out;
 834                }
 835
 836                f = vxlan_find_mac(vxlan, n->ha);
 837                if (f && f->remote.remote_ip == htonl(INADDR_ANY)) {
 838                        /* bridge-local neighbor */
 839                        neigh_release(n);
 840                        goto out;
 841                }
 842
 843                reply = arp_create(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha,
 844                                n->ha, sha);
 845
 846                neigh_release(n);
 847
 848                skb_reset_mac_header(reply);
 849                __skb_pull(reply, skb_network_offset(reply));
 850                reply->ip_summed = CHECKSUM_UNNECESSARY;
 851                reply->pkt_type = PACKET_HOST;
 852
 853                if (netif_rx_ni(reply) == NET_RX_DROP)
 854                        dev->stats.rx_dropped++;
 855        } else if (vxlan->flags & VXLAN_F_L3MISS)
 856                vxlan_ip_miss(dev, tip);
 857out:
 858        consume_skb(skb);
 859        return NETDEV_TX_OK;
 860}
 861
 862static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
 863{
 864        struct vxlan_dev *vxlan = netdev_priv(dev);
 865        struct neighbour *n;
 866        struct iphdr *pip;
 867
 868        if (is_multicast_ether_addr(eth_hdr(skb)->h_dest))
 869                return false;
 870
 871        n = NULL;
 872        switch (ntohs(eth_hdr(skb)->h_proto)) {
 873        case ETH_P_IP:
 874                if (!pskb_may_pull(skb, sizeof(struct iphdr)))
 875                        return false;
 876                pip = ip_hdr(skb);
 877                n = neigh_lookup(&arp_tbl, &pip->daddr, dev);
 878                break;
 879        default:
 880                return false;
 881        }
 882
 883        if (n) {
 884                bool diff;
 885
 886                diff = compare_ether_addr(eth_hdr(skb)->h_dest, n->ha) != 0;
 887                if (diff) {
 888                        memcpy(eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
 889                                dev->addr_len);
 890                        memcpy(eth_hdr(skb)->h_dest, n->ha, dev->addr_len);
 891                }
 892                neigh_release(n);
 893                return diff;
 894        } else if (vxlan->flags & VXLAN_F_L3MISS)
 895                vxlan_ip_miss(dev, pip->daddr);
 896        return false;
 897}
 898
 899static void vxlan_sock_free(struct sk_buff *skb)
 900{
 901        sock_put(skb->sk);
 902}
 903
 904/* On transmit, associate with the tunnel socket */
 905static void vxlan_set_owner(struct net_device *dev, struct sk_buff *skb)
 906{
 907        struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
 908        struct sock *sk = vn->sock->sk;
 909
 910        skb_orphan(skb);
 911        sock_hold(sk);
 912        skb->sk = sk;
 913        skb->destructor = vxlan_sock_free;
 914}
 915
 916/* Compute source port for outgoing packet
 917 *   first choice to use L4 flow hash since it will spread
 918 *     better and maybe available from hardware
 919 *   secondary choice is to use jhash on the Ethernet header
 920 */
 921static __be16 vxlan_src_port(const struct vxlan_dev *vxlan, struct sk_buff *skb)
 922{
 923        unsigned int range = (vxlan->port_max - vxlan->port_min) + 1;
 924        u32 hash;
 925
 926        hash = skb_get_rxhash(skb);
 927        if (!hash)
 928                hash = jhash(skb->data, 2 * ETH_ALEN,
 929                             (__force u32) skb->protocol);
 930
 931        return htons((((u64) hash * range) >> 32) + vxlan->port_min);
 932}
 933
 934static int handle_offloads(struct sk_buff *skb)
 935{
 936        if (skb_is_gso(skb)) {
 937                int err = skb_unclone(skb, GFP_ATOMIC);
 938                if (unlikely(err))
 939                        return err;
 940
 941                skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
 942        } else if (skb->ip_summed != CHECKSUM_PARTIAL)
 943                skb->ip_summed = CHECKSUM_NONE;
 944
 945        return 0;
 946}
 947
 948/* Bypass encapsulation if the destination is local */
 949static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
 950                               struct vxlan_dev *dst_vxlan)
 951{
 952        struct pcpu_tstats *tx_stats = this_cpu_ptr(src_vxlan->dev->tstats);
 953        struct pcpu_tstats *rx_stats = this_cpu_ptr(dst_vxlan->dev->tstats);
 954
 955        skb->pkt_type = PACKET_HOST;
 956        skb->encapsulation = 0;
 957        skb->dev = dst_vxlan->dev;
 958        __skb_pull(skb, skb_network_offset(skb));
 959
 960        if (dst_vxlan->flags & VXLAN_F_LEARN)
 961                vxlan_snoop(skb->dev, htonl(INADDR_LOOPBACK),
 962                            eth_hdr(skb)->h_source);
 963
 964        u64_stats_update_begin(&tx_stats->syncp);
 965        tx_stats->tx_packets++;
 966        tx_stats->tx_bytes += skb->len;
 967        u64_stats_update_end(&tx_stats->syncp);
 968
 969        if (netif_rx(skb) == NET_RX_SUCCESS) {
 970                u64_stats_update_begin(&rx_stats->syncp);
 971                rx_stats->rx_packets++;
 972                rx_stats->rx_bytes += skb->len;
 973                u64_stats_update_end(&rx_stats->syncp);
 974        } else {
 975                skb->dev->stats.rx_dropped++;
 976        }
 977}
 978
 979static netdev_tx_t vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 980                                  struct vxlan_rdst *rdst, bool did_rsc)
 981{
 982        struct vxlan_dev *vxlan = netdev_priv(dev);
 983        struct rtable *rt;
 984        const struct iphdr *old_iph;
 985        struct iphdr *iph;
 986        struct vxlanhdr *vxh;
 987        struct udphdr *uh;
 988        struct flowi4 fl4;
 989        __be32 dst;
 990        __be16 src_port, dst_port;
 991        u32 vni;
 992        __be16 df = 0;
 993        __u8 tos, ttl;
 994
 995        dst_port = rdst->remote_port ? rdst->remote_port : vxlan->dst_port;
 996        vni = rdst->remote_vni;
 997        dst = rdst->remote_ip;
 998
 999        if (!dst) {
1000                if (did_rsc) {
1001                        /* short-circuited back to local bridge */
1002                        vxlan_encap_bypass(skb, vxlan, vxlan);
1003                        return NETDEV_TX_OK;
1004                }
1005                goto drop;
1006        }
1007
1008        if (!skb->encapsulation) {
1009                skb_reset_inner_headers(skb);
1010                skb->encapsulation = 1;
1011        }
1012
1013        /* Need space for new headers (invalidates iph ptr) */
1014        if (skb_cow_head(skb, VXLAN_HEADROOM))
1015                goto drop;
1016
1017        old_iph = ip_hdr(skb);
1018
1019        ttl = vxlan->ttl;
1020        if (!ttl && IN_MULTICAST(ntohl(dst)))
1021                ttl = 1;
1022
1023        tos = vxlan->tos;
1024        if (tos == 1)
1025                tos = ip_tunnel_get_dsfield(old_iph, skb);
1026
1027        src_port = vxlan_src_port(vxlan, skb);
1028
1029        memset(&fl4, 0, sizeof(fl4));
1030        fl4.flowi4_oif = rdst->remote_ifindex;
1031        fl4.flowi4_tos = RT_TOS(tos);
1032        fl4.daddr = dst;
1033        fl4.saddr = vxlan->saddr;
1034
1035        rt = ip_route_output_key(dev_net(dev), &fl4);
1036        if (IS_ERR(rt)) {
1037                netdev_dbg(dev, "no route to %pI4\n", &dst);
1038                dev->stats.tx_carrier_errors++;
1039                goto tx_error;
1040        }
1041
1042        if (rt->dst.dev == dev) {
1043                netdev_dbg(dev, "circular route to %pI4\n", &dst);
1044                ip_rt_put(rt);
1045                dev->stats.collisions++;
1046                goto tx_error;
1047        }
1048
1049        /* Bypass encapsulation if the destination is local */
1050        if (rt->rt_flags & RTCF_LOCAL &&
1051            !(rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) {
1052                struct vxlan_dev *dst_vxlan;
1053
1054                ip_rt_put(rt);
1055                dst_vxlan = vxlan_find_vni(dev_net(dev), vni);
1056                if (!dst_vxlan)
1057                        goto tx_error;
1058                vxlan_encap_bypass(skb, vxlan, dst_vxlan);
1059                return NETDEV_TX_OK;
1060        }
1061
1062        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1063        IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
1064                              IPSKB_REROUTED);
1065        skb_dst_drop(skb);
1066        skb_dst_set(skb, &rt->dst);
1067
1068        vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
1069        vxh->vx_flags = htonl(VXLAN_FLAGS);
1070        vxh->vx_vni = htonl(vni << 8);
1071
1072        __skb_push(skb, sizeof(*uh));
1073        skb_reset_transport_header(skb);
1074        uh = udp_hdr(skb);
1075
1076        uh->dest = dst_port;
1077        uh->source = src_port;
1078
1079        uh->len = htons(skb->len);
1080        uh->check = 0;
1081
1082        __skb_push(skb, sizeof(*iph));
1083        skb_reset_network_header(skb);
1084        iph             = ip_hdr(skb);
1085        iph->version    = 4;
1086        iph->ihl        = sizeof(struct iphdr) >> 2;
1087        iph->frag_off   = df;
1088        iph->protocol   = IPPROTO_UDP;
1089        iph->tos        = ip_tunnel_ecn_encap(tos, old_iph, skb);
1090        iph->daddr      = dst;
1091        iph->saddr      = fl4.saddr;
1092        iph->ttl        = ttl ? : ip4_dst_hoplimit(&rt->dst);
1093        tunnel_ip_select_ident(skb, old_iph, &rt->dst);
1094
1095        nf_reset(skb);
1096
1097        vxlan_set_owner(dev, skb);
1098
1099        if (handle_offloads(skb))
1100                goto drop;
1101
1102        iptunnel_xmit(skb, dev);
1103        return NETDEV_TX_OK;
1104
1105drop:
1106        dev->stats.tx_dropped++;
1107        goto tx_free;
1108
1109tx_error:
1110        dev->stats.tx_errors++;
1111tx_free:
1112        dev_kfree_skb(skb);
1113        return NETDEV_TX_OK;
1114}
1115
1116/* Transmit local packets over Vxlan
1117 *
1118 * Outer IP header inherits ECN and DF from inner header.
1119 * Outer UDP destination is the VXLAN assigned port.
1120 *           source port is based on hash of flow
1121 */
1122static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
1123{
1124        struct vxlan_dev *vxlan = netdev_priv(dev);
1125        struct ethhdr *eth;
1126        bool did_rsc = false;
1127        struct vxlan_rdst *rdst0, *rdst;
1128        struct vxlan_fdb *f;
1129        int rc1, rc;
1130
1131        skb_reset_mac_header(skb);
1132        eth = eth_hdr(skb);
1133
1134        if ((vxlan->flags & VXLAN_F_PROXY) && ntohs(eth->h_proto) == ETH_P_ARP)
1135                return arp_reduce(dev, skb);
1136
1137        f = vxlan_find_mac(vxlan, eth->h_dest);
1138        did_rsc = false;
1139
1140        if (f && (f->flags & NTF_ROUTER) && (vxlan->flags & VXLAN_F_RSC) &&
1141            ntohs(eth->h_proto) == ETH_P_IP) {
1142                did_rsc = route_shortcircuit(dev, skb);
1143                if (did_rsc)
1144                        f = vxlan_find_mac(vxlan, eth->h_dest);
1145        }
1146
1147        if (f == NULL) {
1148                rdst0 = &vxlan->default_dst;
1149
1150                if (rdst0->remote_ip == htonl(INADDR_ANY) &&
1151                    (vxlan->flags & VXLAN_F_L2MISS) &&
1152                    !is_multicast_ether_addr(eth->h_dest))
1153                        vxlan_fdb_miss(vxlan, eth->h_dest);
1154        } else
1155                rdst0 = &f->remote;
1156
1157        rc = NETDEV_TX_OK;
1158
1159        /* if there are multiple destinations, send copies */
1160        for (rdst = rdst0->remote_next; rdst; rdst = rdst->remote_next) {
1161                struct sk_buff *skb1;
1162
1163                skb1 = skb_clone(skb, GFP_ATOMIC);
1164                if (skb1) {
1165                        rc1 = vxlan_xmit_one(skb1, dev, rdst, did_rsc);
1166                        if (rc == NETDEV_TX_OK)
1167                                rc = rc1;
1168                }
1169        }
1170
1171        rc1 = vxlan_xmit_one(skb, dev, rdst0, did_rsc);
1172        if (rc == NETDEV_TX_OK)
1173                rc = rc1;
1174        return rc;
1175}
1176
1177/* Walk the forwarding table and purge stale entries */
1178static void vxlan_cleanup(unsigned long arg)
1179{
1180        struct vxlan_dev *vxlan = (struct vxlan_dev *) arg;
1181        unsigned long next_timer = jiffies + FDB_AGE_INTERVAL;
1182        unsigned int h;
1183
1184        if (!netif_running(vxlan->dev))
1185                return;
1186
1187        spin_lock_bh(&vxlan->hash_lock);
1188        for (h = 0; h < FDB_HASH_SIZE; ++h) {
1189                struct hlist_node *p, *n;
1190                hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
1191                        struct vxlan_fdb *f
1192                                = container_of(p, struct vxlan_fdb, hlist);
1193                        unsigned long timeout;
1194
1195                        if (f->state & NUD_PERMANENT)
1196                                continue;
1197
1198                        timeout = f->used + vxlan->age_interval * HZ;
1199                        if (time_before_eq(timeout, jiffies)) {
1200                                netdev_dbg(vxlan->dev,
1201                                           "garbage collect %pM\n",
1202                                           f->eth_addr);
1203                                f->state = NUD_STALE;
1204                                vxlan_fdb_destroy(vxlan, f);
1205                        } else if (time_before(timeout, next_timer))
1206                                next_timer = timeout;
1207                }
1208        }
1209        spin_unlock_bh(&vxlan->hash_lock);
1210
1211        mod_timer(&vxlan->age_timer, next_timer);
1212}
1213
1214/* Setup stats when device is created */
1215static int vxlan_init(struct net_device *dev)
1216{
1217        dev->tstats = alloc_percpu(struct pcpu_tstats);
1218        if (!dev->tstats)
1219                return -ENOMEM;
1220
1221        return 0;
1222}
1223
1224/* Start ageing timer and join group when device is brought up */
1225static int vxlan_open(struct net_device *dev)
1226{
1227        struct vxlan_dev *vxlan = netdev_priv(dev);
1228        int err;
1229
1230        if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip))) {
1231                err = vxlan_join_group(dev);
1232                if (err)
1233                        return err;
1234        }
1235
1236        if (vxlan->age_interval)
1237                mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL);
1238
1239        return 0;
1240}
1241
1242/* Purge the forwarding table */
1243static void vxlan_flush(struct vxlan_dev *vxlan)
1244{
1245        unsigned h;
1246
1247        spin_lock_bh(&vxlan->hash_lock);
1248        for (h = 0; h < FDB_HASH_SIZE; ++h) {
1249                struct hlist_node *p, *n;
1250                hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
1251                        struct vxlan_fdb *f
1252                                = container_of(p, struct vxlan_fdb, hlist);
1253                        vxlan_fdb_destroy(vxlan, f);
1254                }
1255        }
1256        spin_unlock_bh(&vxlan->hash_lock);
1257}
1258
1259/* Cleanup timer and forwarding table on shutdown */
1260static int vxlan_stop(struct net_device *dev)
1261{
1262        struct vxlan_dev *vxlan = netdev_priv(dev);
1263
1264        if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip)))
1265                vxlan_leave_group(dev);
1266
1267        del_timer_sync(&vxlan->age_timer);
1268
1269        vxlan_flush(vxlan);
1270
1271        return 0;
1272}
1273
/* ndo_set_rx_mode stub: intentionally empty, nothing needs to be done. */
static void vxlan_set_multicast_list(struct net_device *dev)
{
}
1278
1279static const struct net_device_ops vxlan_netdev_ops = {
1280        .ndo_init               = vxlan_init,
1281        .ndo_open               = vxlan_open,
1282        .ndo_stop               = vxlan_stop,
1283        .ndo_start_xmit         = vxlan_xmit,
1284        .ndo_get_stats64        = ip_tunnel_get_stats64,
1285        .ndo_set_rx_mode        = vxlan_set_multicast_list,
1286        .ndo_change_mtu         = eth_change_mtu,
1287        .ndo_validate_addr      = eth_validate_addr,
1288        .ndo_set_mac_address    = eth_mac_addr,
1289        .ndo_fdb_add            = vxlan_fdb_add,
1290        .ndo_fdb_del            = vxlan_fdb_delete,
1291        .ndo_fdb_dump           = vxlan_fdb_dump,
1292};
1293
1294/* Info for udev, that this is a virtual tunnel endpoint */
1295static struct device_type vxlan_type = {
1296        .name = "vxlan",
1297};
1298
1299static void vxlan_free(struct net_device *dev)
1300{
1301        free_percpu(dev->tstats);
1302        free_netdev(dev);
1303}
1304
1305/* Initialize the device structure. */
1306static void vxlan_setup(struct net_device *dev)
1307{
1308        struct vxlan_dev *vxlan = netdev_priv(dev);
1309        unsigned h;
1310        int low, high;
1311
1312        eth_hw_addr_random(dev);
1313        ether_setup(dev);
1314        dev->hard_header_len = ETH_HLEN + VXLAN_HEADROOM;
1315
1316        dev->netdev_ops = &vxlan_netdev_ops;
1317        dev->destructor = vxlan_free;
1318        SET_NETDEV_DEVTYPE(dev, &vxlan_type);
1319
1320        dev->tx_queue_len = 0;
1321        dev->features   |= NETIF_F_LLTX;
1322        dev->features   |= NETIF_F_NETNS_LOCAL;
1323        dev->features   |= NETIF_F_SG | NETIF_F_HW_CSUM;
1324        dev->features   |= NETIF_F_RXCSUM;
1325        dev->features   |= NETIF_F_GSO_SOFTWARE;
1326
1327        dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
1328        dev->hw_features |= NETIF_F_GSO_SOFTWARE;
1329        dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
1330        dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
1331
1332        spin_lock_init(&vxlan->hash_lock);
1333
1334        init_timer_deferrable(&vxlan->age_timer);
1335        vxlan->age_timer.function = vxlan_cleanup;
1336        vxlan->age_timer.data = (unsigned long) vxlan;
1337
1338        inet_get_local_port_range(&low, &high);
1339        vxlan->port_min = low;
1340        vxlan->port_max = high;
1341        vxlan->dst_port = htons(vxlan_port);
1342
1343        vxlan->dev = dev;
1344
1345        for (h = 0; h < FDB_HASH_SIZE; ++h)
1346                INIT_HLIST_HEAD(&vxlan->fdb_head[h]);
1347}
1348
1349static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
1350        [IFLA_VXLAN_ID]         = { .type = NLA_U32 },
1351        [IFLA_VXLAN_GROUP]      = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1352        [IFLA_VXLAN_LINK]       = { .type = NLA_U32 },
1353        [IFLA_VXLAN_LOCAL]      = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1354        [IFLA_VXLAN_TOS]        = { .type = NLA_U8 },
1355        [IFLA_VXLAN_TTL]        = { .type = NLA_U8 },
1356        [IFLA_VXLAN_LEARNING]   = { .type = NLA_U8 },
1357        [IFLA_VXLAN_AGEING]     = { .type = NLA_U32 },
1358        [IFLA_VXLAN_LIMIT]      = { .type = NLA_U32 },
1359        [IFLA_VXLAN_PORT_RANGE] = { .len  = sizeof(struct ifla_vxlan_port_range) },
1360        [IFLA_VXLAN_PROXY]      = { .type = NLA_U8 },
1361        [IFLA_VXLAN_RSC]        = { .type = NLA_U8 },
1362        [IFLA_VXLAN_L2MISS]     = { .type = NLA_U8 },
1363        [IFLA_VXLAN_L3MISS]     = { .type = NLA_U8 },
1364        [IFLA_VXLAN_PORT]       = { .type = NLA_U16 },
1365};
1366
1367static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[])
1368{
1369        if (tb[IFLA_ADDRESS]) {
1370                if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
1371                        pr_debug("invalid link address (not ethernet)\n");
1372                        return -EINVAL;
1373                }
1374
1375                if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
1376                        pr_debug("invalid all zero ethernet address\n");
1377                        return -EADDRNOTAVAIL;
1378                }
1379        }
1380
1381        if (!data)
1382                return -EINVAL;
1383
1384        if (data[IFLA_VXLAN_ID]) {
1385                __u32 id = nla_get_u32(data[IFLA_VXLAN_ID]);
1386                if (id >= VXLAN_VID_MASK)
1387                        return -ERANGE;
1388        }
1389
1390        if (data[IFLA_VXLAN_PORT_RANGE]) {
1391                const struct ifla_vxlan_port_range *p
1392                        = nla_data(data[IFLA_VXLAN_PORT_RANGE]);
1393
1394                if (ntohs(p->high) < ntohs(p->low)) {
1395                        pr_debug("port range %u .. %u not valid\n",
1396                                 ntohs(p->low), ntohs(p->high));
1397                        return -EINVAL;
1398                }
1399        }
1400
1401        return 0;
1402}
1403
1404static void vxlan_get_drvinfo(struct net_device *netdev,
1405                              struct ethtool_drvinfo *drvinfo)
1406{
1407        strlcpy(drvinfo->version, VXLAN_VERSION, sizeof(drvinfo->version));
1408        strlcpy(drvinfo->driver, "vxlan", sizeof(drvinfo->driver));
1409}
1410
1411static const struct ethtool_ops vxlan_ethtool_ops = {
1412        .get_drvinfo    = vxlan_get_drvinfo,
1413        .get_link       = ethtool_op_get_link,
1414};
1415
1416static int vxlan_newlink(struct net *net, struct net_device *dev,
1417                         struct nlattr *tb[], struct nlattr *data[])
1418{
1419        struct vxlan_dev *vxlan = netdev_priv(dev);
1420        struct vxlan_rdst *dst = &vxlan->default_dst;
1421        __u32 vni;
1422        int err;
1423
1424        if (!data[IFLA_VXLAN_ID])
1425                return -EINVAL;
1426
1427        vni = nla_get_u32(data[IFLA_VXLAN_ID]);
1428        if (vxlan_find_vni(net, vni)) {
1429                pr_info("duplicate VNI %u\n", vni);
1430                return -EEXIST;
1431        }
1432        dst->remote_vni = vni;
1433
1434        if (data[IFLA_VXLAN_GROUP])
1435                dst->remote_ip = nla_get_be32(data[IFLA_VXLAN_GROUP]);
1436
1437        if (data[IFLA_VXLAN_LOCAL])
1438                vxlan->saddr = nla_get_be32(data[IFLA_VXLAN_LOCAL]);
1439
1440        if (data[IFLA_VXLAN_LINK] &&
1441            (dst->remote_ifindex = nla_get_u32(data[IFLA_VXLAN_LINK]))) {
1442                struct net_device *lowerdev
1443                         = __dev_get_by_index(net, dst->remote_ifindex);
1444
1445                if (!lowerdev) {
1446                        pr_info("ifindex %d does not exist\n", dst->remote_ifindex);
1447                        return -ENODEV;
1448                }
1449
1450                if (!tb[IFLA_MTU])
1451                        dev->mtu = lowerdev->mtu - VXLAN_HEADROOM;
1452
1453                /* update header length based on lower device */
1454                dev->hard_header_len = lowerdev->hard_header_len +
1455                                       VXLAN_HEADROOM;
1456        }
1457
1458        if (data[IFLA_VXLAN_TOS])
1459                vxlan->tos  = nla_get_u8(data[IFLA_VXLAN_TOS]);
1460
1461        if (data[IFLA_VXLAN_TTL])
1462                vxlan->ttl = nla_get_u8(data[IFLA_VXLAN_TTL]);
1463
1464        if (!data[IFLA_VXLAN_LEARNING] || nla_get_u8(data[IFLA_VXLAN_LEARNING]))
1465                vxlan->flags |= VXLAN_F_LEARN;
1466
1467        if (data[IFLA_VXLAN_AGEING])
1468                vxlan->age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]);
1469        else
1470                vxlan->age_interval = FDB_AGE_DEFAULT;
1471
1472        if (data[IFLA_VXLAN_PROXY] && nla_get_u8(data[IFLA_VXLAN_PROXY]))
1473                vxlan->flags |= VXLAN_F_PROXY;
1474
1475        if (data[IFLA_VXLAN_RSC] && nla_get_u8(data[IFLA_VXLAN_RSC]))
1476                vxlan->flags |= VXLAN_F_RSC;
1477
1478        if (data[IFLA_VXLAN_L2MISS] && nla_get_u8(data[IFLA_VXLAN_L2MISS]))
1479                vxlan->flags |= VXLAN_F_L2MISS;
1480
1481        if (data[IFLA_VXLAN_L3MISS] && nla_get_u8(data[IFLA_VXLAN_L3MISS]))
1482                vxlan->flags |= VXLAN_F_L3MISS;
1483
1484        if (data[IFLA_VXLAN_LIMIT])
1485                vxlan->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]);
1486
1487        if (data[IFLA_VXLAN_PORT_RANGE]) {
1488                const struct ifla_vxlan_port_range *p
1489                        = nla_data(data[IFLA_VXLAN_PORT_RANGE]);
1490                vxlan->port_min = ntohs(p->low);
1491                vxlan->port_max = ntohs(p->high);
1492        }
1493
1494        if (data[IFLA_VXLAN_PORT])
1495                vxlan->dst_port = nla_get_be16(data[IFLA_VXLAN_PORT]);
1496
1497        SET_ETHTOOL_OPS(dev, &vxlan_ethtool_ops);
1498
1499        err = register_netdevice(dev);
1500        if (!err)
1501                hlist_add_head_rcu(&vxlan->hlist, vni_head(net, dst->remote_vni));
1502
1503        return err;
1504}
1505
1506static void vxlan_dellink(struct net_device *dev, struct list_head *head)
1507{
1508        struct vxlan_dev *vxlan = netdev_priv(dev);
1509
1510        hlist_del_rcu(&vxlan->hlist);
1511
1512        unregister_netdevice_queue(dev, head);
1513}
1514
1515static size_t vxlan_get_size(const struct net_device *dev)
1516{
1517
1518        return nla_total_size(sizeof(__u32)) +  /* IFLA_VXLAN_ID */
1519                nla_total_size(sizeof(__be32)) +/* IFLA_VXLAN_GROUP */
1520                nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LINK */
1521                nla_total_size(sizeof(__be32))+ /* IFLA_VXLAN_LOCAL */
1522                nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_TTL */
1523                nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_TOS */
1524                nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_LEARNING */
1525                nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_PROXY */
1526                nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_RSC */
1527                nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_L2MISS */
1528                nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_L3MISS */
1529                nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_AGEING */
1530                nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LIMIT */
1531                nla_total_size(sizeof(struct ifla_vxlan_port_range)) +
1532                nla_total_size(sizeof(__be16))+ /* IFLA_VXLAN_PORT */
1533                0;
1534}
1535
1536static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
1537{
1538        const struct vxlan_dev *vxlan = netdev_priv(dev);
1539        const struct vxlan_rdst *dst = &vxlan->default_dst;
1540        struct ifla_vxlan_port_range ports = {
1541                .low =  htons(vxlan->port_min),
1542                .high = htons(vxlan->port_max),
1543        };
1544
1545        if (nla_put_u32(skb, IFLA_VXLAN_ID, dst->remote_vni))
1546                goto nla_put_failure;
1547
1548        if (dst->remote_ip && nla_put_be32(skb, IFLA_VXLAN_GROUP, dst->remote_ip))
1549                goto nla_put_failure;
1550
1551        if (dst->remote_ifindex && nla_put_u32(skb, IFLA_VXLAN_LINK, dst->remote_ifindex))
1552                goto nla_put_failure;
1553
1554        if (vxlan->saddr && nla_put_be32(skb, IFLA_VXLAN_LOCAL, vxlan->saddr))
1555                goto nla_put_failure;
1556
1557        if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->ttl) ||
1558            nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->tos) ||
1559            nla_put_u8(skb, IFLA_VXLAN_LEARNING,
1560                        !!(vxlan->flags & VXLAN_F_LEARN)) ||
1561            nla_put_u8(skb, IFLA_VXLAN_PROXY,
1562                        !!(vxlan->flags & VXLAN_F_PROXY)) ||
1563            nla_put_u8(skb, IFLA_VXLAN_RSC, !!(vxlan->flags & VXLAN_F_RSC)) ||
1564            nla_put_u8(skb, IFLA_VXLAN_L2MISS,
1565                        !!(vxlan->flags & VXLAN_F_L2MISS)) ||
1566            nla_put_u8(skb, IFLA_VXLAN_L3MISS,
1567                        !!(vxlan->flags & VXLAN_F_L3MISS)) ||
1568            nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->age_interval) ||
1569            nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->addrmax) ||
1570            nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->dst_port))
1571                goto nla_put_failure;
1572
1573        if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports))
1574                goto nla_put_failure;
1575
1576        return 0;
1577
1578nla_put_failure:
1579        return -EMSGSIZE;
1580}
1581
1582static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
1583        .kind           = "vxlan",
1584        .maxtype        = IFLA_VXLAN_MAX,
1585        .policy         = vxlan_policy,
1586        .priv_size      = sizeof(struct vxlan_dev),
1587        .setup          = vxlan_setup,
1588        .validate       = vxlan_validate,
1589        .newlink        = vxlan_newlink,
1590        .dellink        = vxlan_dellink,
1591        .get_size       = vxlan_get_size,
1592        .fill_info      = vxlan_fill_info,
1593};
1594
1595static __net_init int vxlan_init_net(struct net *net)
1596{
1597        struct vxlan_net *vn = net_generic(net, vxlan_net_id);
1598        struct sock *sk;
1599        struct sockaddr_in vxlan_addr = {
1600                .sin_family = AF_INET,
1601                .sin_addr.s_addr = htonl(INADDR_ANY),
1602        };
1603        int rc;
1604        unsigned h;
1605
1606        /* Create UDP socket for encapsulation receive. */
1607        rc = sock_create_kern(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &vn->sock);
1608        if (rc < 0) {
1609                pr_debug("UDP socket create failed\n");
1610                return rc;
1611        }
1612        /* Put in proper namespace */
1613        sk = vn->sock->sk;
1614        sk_change_net(sk, net);
1615
1616        vxlan_addr.sin_port = htons(vxlan_port);
1617
1618        rc = kernel_bind(vn->sock, (struct sockaddr *) &vxlan_addr,
1619                         sizeof(vxlan_addr));
1620        if (rc < 0) {
1621                pr_debug("bind for UDP socket %pI4:%u (%d)\n",
1622                         &vxlan_addr.sin_addr, ntohs(vxlan_addr.sin_port), rc);
1623                sk_release_kernel(sk);
1624                vn->sock = NULL;
1625                return rc;
1626        }
1627
1628        /* Disable multicast loopback */
1629        inet_sk(sk)->mc_loop = 0;
1630
1631        /* Mark socket as an encapsulation socket. */
1632        udp_sk(sk)->encap_type = 1;
1633        udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv;
1634        udp_encap_enable();
1635
1636        for (h = 0; h < VNI_HASH_SIZE; ++h)
1637                INIT_HLIST_HEAD(&vn->vni_list[h]);
1638
1639        return 0;
1640}
1641
1642static __net_exit void vxlan_exit_net(struct net *net)
1643{
1644        struct vxlan_net *vn = net_generic(net, vxlan_net_id);
1645        struct vxlan_dev *vxlan;
1646        unsigned h;
1647
1648        rtnl_lock();
1649        for (h = 0; h < VNI_HASH_SIZE; ++h)
1650                hlist_for_each_entry(vxlan, &vn->vni_list[h], hlist)
1651                        dev_close(vxlan->dev);
1652        rtnl_unlock();
1653
1654        if (vn->sock) {
1655                sk_release_kernel(vn->sock->sk);
1656                vn->sock = NULL;
1657        }
1658}
1659
1660static struct pernet_operations vxlan_net_ops = {
1661        .init = vxlan_init_net,
1662        .exit = vxlan_exit_net,
1663        .id   = &vxlan_net_id,
1664        .size = sizeof(struct vxlan_net),
1665};
1666
1667static int __init vxlan_init_module(void)
1668{
1669        int rc;
1670
1671        get_random_bytes(&vxlan_salt, sizeof(vxlan_salt));
1672
1673        rc = register_pernet_device(&vxlan_net_ops);
1674        if (rc)
1675                goto out1;
1676
1677        rc = rtnl_link_register(&vxlan_link_ops);
1678        if (rc)
1679                goto out2;
1680
1681        return 0;
1682
1683out2:
1684        unregister_pernet_device(&vxlan_net_ops);
1685out1:
1686        return rc;
1687}
1688module_init(vxlan_init_module);
1689
1690static void __exit vxlan_cleanup_module(void)
1691{
1692        rtnl_link_unregister(&vxlan_link_ops);
1693        unregister_pernet_device(&vxlan_net_ops);
1694        rcu_barrier();
1695}
1696module_exit(vxlan_cleanup_module);
1697
1698MODULE_LICENSE("GPL");
1699MODULE_VERSION(VXLAN_VERSION);
1700MODULE_AUTHOR("Stephen Hemminger <stephen@networkplumber.org>");
1701MODULE_ALIAS_RTNL_LINK("vxlan");
1702