linux/drivers/net/macvtap.c
#include <linux/etherdevice.h>
#include <linux/if_macvlan.h>
#include <linux/if_vlan.h>
#include <linux/interrupt.h>
#include <linux/nsproxy.h>
#include <linux/compat.h>
#include <linux/if_tun.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/cache.h>
#include <linux/sched.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/wait.h>
#include <linux/cdev.h>
#include <linux/idr.h>
#include <linux/fs.h>

#include <net/net_namespace.h>
#include <net/rtnetlink.h>
#include <net/sock.h>
#include <linux/virtio_net.h>

/*
 * A macvtap queue is the central object of this driver; it connects
 * an open character device to a macvlan interface. There can be
 * multiple queues on one interface, which map back to queues
 * implemented in hardware on the underlying device.
 *
 * macvtap_proto is used to allocate queues through the sock allocation
 * mechanism.
 *
 * TODO: multiqueue support is currently not implemented, even though
 * macvtap is basically prepared for it. We will need to add it
 * here as well as in virtio-net and qemu to get line rate on 10gbit
 * adapters from a guest.
 */
struct macvtap_queue {
        struct sock sk;
        struct socket sock;
        struct socket_wq wq;
        int vnet_hdr_sz;
        struct macvlan_dev __rcu *vlan;
        struct file *file;
        unsigned int flags;
};

static struct proto macvtap_proto = {
        .name = "macvtap",
        .owner = THIS_MODULE,
        .obj_size = sizeof(struct macvtap_queue),
};

/*
 * Variables for dealing with macvtap device numbers.
 */
static dev_t macvtap_major;
#define MACVTAP_NUM_DEVS (1U << MINORBITS)
static DEFINE_MUTEX(minor_lock);
static DEFINE_IDR(minor_idr);

#define GOODCOPY_LEN 128
static struct class *macvtap_class;
static struct cdev macvtap_cdev;

static const struct proto_ops macvtap_socket_ops;

/*
 * RCU usage:
 * The macvtap_queue and the macvlan_dev are loosely coupled, the
 * pointers from one to the other can only be read while rcu_read_lock
 * or macvtap_lock is held.
 *
 * Both the file and the macvlan_dev hold a reference on the macvtap_queue
 * through sock_hold(&q->sk). When the macvlan_dev goes away first,
 * q->vlan becomes inaccessible. When the file gets closed,
 * macvtap_get_queue() fails.
 *
 * There may still be references to the struct sock inside of the
 * queue from outbound SKBs, but these never reference back to the
 * file or the dev. The data structure is freed through __sk_free
 * when both our references and any pending SKBs are gone.
 */
static DEFINE_SPINLOCK(macvtap_lock);
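
/*
 * Reader-side sketch (illustrative only; it mirrors macvtap_get_queue()
 * below and is not additional driver code):
 *
 *      rcu_read_lock();
 *      q = rcu_dereference(vlan->taps[i]);
 *      if (q)
 *              ...use q; it stays valid until rcu_read_unlock()...
 *      rcu_read_unlock();
 *
 * Writers take macvtap_lock, publish with rcu_assign_pointer() or
 * clear with RCU_INIT_POINTER(), and call synchronize_rcu() before
 * dropping their sock reference.
 */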

/*
 * get_slot: return a [unused/occupied] slot in vlan->taps[]:
 *      - if 'q' is NULL, return the first empty slot;
 *      - otherwise, return the slot this pointer occupies.
 */
static int get_slot(struct macvlan_dev *vlan, struct macvtap_queue *q)
{
        int i;

        for (i = 0; i < MAX_MACVTAP_QUEUES; i++) {
                if (rcu_dereference_protected(vlan->taps[i],
                                              lockdep_is_held(&macvtap_lock)) == q)
                        return i;
        }

        /* Should never happen */
        BUG_ON(1);
}

static int macvtap_set_queue(struct net_device *dev, struct file *file,
                             struct macvtap_queue *q)
{
        struct macvlan_dev *vlan = netdev_priv(dev);
        int index;
        int err = -EBUSY;

        spin_lock(&macvtap_lock);
        if (vlan->numvtaps == MAX_MACVTAP_QUEUES)
                goto out;

        err = 0;
        index = get_slot(vlan, NULL);
        rcu_assign_pointer(q->vlan, vlan);
        rcu_assign_pointer(vlan->taps[index], q);
        sock_hold(&q->sk);

        q->file = file;
        file->private_data = q;

        vlan->numvtaps++;

out:
        spin_unlock(&macvtap_lock);
        return err;
}

/*
 * The file owning the queue got closed, give up both
 * the reference that the file holds as well as the
 * one from the macvlan_dev if that still exists.
 *
 * Using the spinlock makes sure that we don't get
 * to the queue again after destroying it.
 */
static void macvtap_put_queue(struct macvtap_queue *q)
{
        struct macvlan_dev *vlan;

        spin_lock(&macvtap_lock);
        vlan = rcu_dereference_protected(q->vlan,
                                         lockdep_is_held(&macvtap_lock));
        if (vlan) {
                int index = get_slot(vlan, q);

                RCU_INIT_POINTER(vlan->taps[index], NULL);
                RCU_INIT_POINTER(q->vlan, NULL);
                sock_put(&q->sk);
                --vlan->numvtaps;
        }

        spin_unlock(&macvtap_lock);

        synchronize_rcu();
        sock_put(&q->sk);
}

/*
 * Select a queue based on the skb's flow hash. If there is no hash,
 * fall back to the rx queue recorded on the skb. If that fails too,
 * find the first available queue.
 * Cache vlan->numvtaps since it can become zero during the execution
 * of this function.
 */
static struct macvtap_queue *macvtap_get_queue(struct net_device *dev,
                                               struct sk_buff *skb)
{
        struct macvlan_dev *vlan = netdev_priv(dev);
        struct macvtap_queue *tap = NULL;
        int numvtaps = vlan->numvtaps;
        __u32 rxq;

        if (!numvtaps)
                goto out;

        /* Check if we can use flow to select a queue */
        rxq = skb_get_rxhash(skb);
        if (rxq) {
                tap = rcu_dereference(vlan->taps[rxq % numvtaps]);
                if (tap)
                        goto out;
        }

        if (likely(skb_rx_queue_recorded(skb))) {
                rxq = skb_get_rx_queue(skb);

                while (unlikely(rxq >= numvtaps))
                        rxq -= numvtaps;

                tap = rcu_dereference(vlan->taps[rxq]);
                if (tap)
                        goto out;
        }

        /* Everything failed - find first available queue */
        for (rxq = 0; rxq < MAX_MACVTAP_QUEUES; rxq++) {
                tap = rcu_dereference(vlan->taps[rxq]);
                if (tap)
                        break;
        }

out:
        return tap;
}
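
/*
 * Worked example (illustrative): with numvtaps == 4 and a flow hash
 * of 0x1d2f3a01, the hash path picks taps[0x1d2f3a01 % 4] == taps[1].
 * If there is no hash and the lower device recorded rx queue 6, the
 * subtraction loop reduces 6 to 2 and taps[2] is used instead.
 */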

/*
 * The net_device is going away, give up the reference
 * that it holds on all queues and safely set the pointer
 * from the queues to NULL.
 */
static void macvtap_del_queues(struct net_device *dev)
{
        struct macvlan_dev *vlan = netdev_priv(dev);
        struct macvtap_queue *q, *qlist[MAX_MACVTAP_QUEUES];
        int i, j = 0;

        /* macvtap_put_queue can free some slots, so go through all slots */
        spin_lock(&macvtap_lock);
        for (i = 0; i < MAX_MACVTAP_QUEUES && vlan->numvtaps; i++) {
                q = rcu_dereference_protected(vlan->taps[i],
                                              lockdep_is_held(&macvtap_lock));
                if (q) {
                        qlist[j++] = q;
                        RCU_INIT_POINTER(vlan->taps[i], NULL);
                        RCU_INIT_POINTER(q->vlan, NULL);
                        vlan->numvtaps--;
                }
        }
        BUG_ON(vlan->numvtaps != 0);
        /* guarantee that any future macvtap_set_queue will fail */
        vlan->numvtaps = MAX_MACVTAP_QUEUES;
        spin_unlock(&macvtap_lock);

        synchronize_rcu();

        for (--j; j >= 0; j--)
                sock_put(&qlist[j]->sk);
}

/*
 * Forward happens for data that gets sent from one macvlan
 * endpoint to another one in bridge mode. We just take
 * the skb and put it into the receive queue.
 */
static int macvtap_forward(struct net_device *dev, struct sk_buff *skb)
{
        struct macvtap_queue *q = macvtap_get_queue(dev, skb);
        if (!q)
                goto drop;

        if (skb_queue_len(&q->sk.sk_receive_queue) >= dev->tx_queue_len)
                goto drop;

        skb_queue_tail(&q->sk.sk_receive_queue, skb);
        wake_up_interruptible_poll(sk_sleep(&q->sk), POLLIN | POLLRDNORM | POLLRDBAND);
        return NET_RX_SUCCESS;

drop:
        kfree_skb(skb);
        return NET_RX_DROP;
}

/*
 * Receive is for data from the external interface (lowerdev);
 * for macvtap we can treat it the same way as forward, which
 * plain macvlan cannot.
 */
static int macvtap_receive(struct sk_buff *skb)
{
        skb_push(skb, ETH_HLEN);
        return macvtap_forward(skb->dev, skb);
}

static int macvtap_get_minor(struct macvlan_dev *vlan)
{
        int retval = -ENOMEM;

        mutex_lock(&minor_lock);
        retval = idr_alloc(&minor_idr, vlan, 1, MACVTAP_NUM_DEVS, GFP_KERNEL);
        if (retval >= 0) {
                vlan->minor = retval;
        } else if (retval == -ENOSPC) {
                printk(KERN_ERR "too many macvtap devices\n");
                retval = -EINVAL;
        }
        mutex_unlock(&minor_lock);
        return retval < 0 ? retval : 0;
}

static void macvtap_free_minor(struct macvlan_dev *vlan)
{
        mutex_lock(&minor_lock);
        if (vlan->minor) {
                idr_remove(&minor_idr, vlan->minor);
                vlan->minor = 0;
        }
        mutex_unlock(&minor_lock);
}

static struct net_device *dev_get_by_macvtap_minor(int minor)
{
        struct net_device *dev = NULL;
        struct macvlan_dev *vlan;

        mutex_lock(&minor_lock);
        vlan = idr_find(&minor_idr, minor);
        if (vlan) {
                dev = vlan->dev;
                dev_hold(dev);
        }
        mutex_unlock(&minor_lock);
        return dev;
}

static int macvtap_newlink(struct net *src_net,
                           struct net_device *dev,
                           struct nlattr *tb[],
                           struct nlattr *data[])
{
        /* Don't put anything that may fail after macvlan_common_newlink
         * because we can't undo what it does.
         */
        return macvlan_common_newlink(src_net, dev, tb, data,
                                      macvtap_receive, macvtap_forward);
}

static void macvtap_dellink(struct net_device *dev,
                            struct list_head *head)
{
        macvtap_del_queues(dev);
        macvlan_dellink(dev, head);
}

static void macvtap_setup(struct net_device *dev)
{
        macvlan_common_setup(dev);
        dev->tx_queue_len = TUN_READQ_SIZE;
}

static struct rtnl_link_ops macvtap_link_ops __read_mostly = {
        .kind           = "macvtap",
        .setup          = macvtap_setup,
        .newlink        = macvtap_newlink,
        .dellink        = macvtap_dellink,
};

static void macvtap_sock_write_space(struct sock *sk)
{
        wait_queue_head_t *wqueue;

        if (!sock_writeable(sk) ||
            !test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags))
                return;

        wqueue = sk_sleep(sk);
        if (wqueue && waitqueue_active(wqueue))
                wake_up_interruptible_poll(wqueue, POLLOUT | POLLWRNORM | POLLWRBAND);
}

static void macvtap_sock_destruct(struct sock *sk)
{
        skb_queue_purge(&sk->sk_receive_queue);
}

static int macvtap_open(struct inode *inode, struct file *file)
{
        struct net *net = current->nsproxy->net_ns;
        struct net_device *dev = dev_get_by_macvtap_minor(iminor(inode));
        struct macvtap_queue *q;
        int err;

        err = -ENODEV;
        if (!dev)
                goto out;

        err = -ENOMEM;
        q = (struct macvtap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
                                             &macvtap_proto);
        if (!q)
                goto out;

        q->sock.wq = &q->wq;
        init_waitqueue_head(&q->wq.wait);
        q->sock.type = SOCK_RAW;
        q->sock.state = SS_CONNECTED;
        q->sock.file = file;
        q->sock.ops = &macvtap_socket_ops;
        sock_init_data(&q->sock, &q->sk);
        q->sk.sk_write_space = macvtap_sock_write_space;
        q->sk.sk_destruct = macvtap_sock_destruct;
        q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP;
        q->vnet_hdr_sz = sizeof(struct virtio_net_hdr);

        /*
         * So far only KVM's virtio_net uses macvtap; enable zero copy
         * between the guest kernel and the host kernel when the lower
         * device supports zerocopy.
         *
         * Macvlan supports zerocopy iff the lower device supports zero
         * copy, so we don't have to look at the lower device directly.
         */
        if ((dev->features & NETIF_F_HIGHDMA) && (dev->features & NETIF_F_SG))
                sock_set_flag(&q->sk, SOCK_ZEROCOPY);

        err = macvtap_set_queue(dev, file, q);
        if (err)
                sock_put(&q->sk);

out:
        if (dev)
                dev_put(dev);

        return err;
}

static int macvtap_release(struct inode *inode, struct file *file)
{
        struct macvtap_queue *q = file->private_data;
        macvtap_put_queue(q);
        return 0;
}

static unsigned int macvtap_poll(struct file *file, poll_table *wait)
{
        struct macvtap_queue *q = file->private_data;
        unsigned int mask = POLLERR;

        if (!q)
                goto out;

        mask = 0;
        poll_wait(file, &q->wq.wait, wait);

        if (!skb_queue_empty(&q->sk.sk_receive_queue))
                mask |= POLLIN | POLLRDNORM;

        if (sock_writeable(&q->sk) ||
            (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &q->sock.flags) &&
             sock_writeable(&q->sk)))
                mask |= POLLOUT | POLLWRNORM;

out:
        return mask;
}

static inline struct sk_buff *macvtap_alloc_skb(struct sock *sk, size_t prepad,
                                                size_t len, size_t linear,
                                                int noblock, int *err)
{
        struct sk_buff *skb;

        /* Under a page?  Don't bother with paged skb. */
        if (prepad + len < PAGE_SIZE || !linear)
                linear = len;

        skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
                                   err);
        if (!skb)
                return NULL;

        skb_reserve(skb, prepad);
        skb_put(skb, linear);
        skb->data_len = len - linear;
        skb->len += len - linear;

        return skb;
}
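
/*
 * Sizing example (illustrative only): a 9000-byte request with
 * linear == 66 (a typical vnet_hdr.hdr_len hint) fails the
 * "prepad + len < PAGE_SIZE" test on 4K pages, so the skb gets a
 * 66-byte linear area plus 8934 bytes in page frags; the same
 * request with linear == 0 would be allocated fully linear.
 */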

/* set skb frags from iovec, this can move to core network code for reuse */
static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
                                  int offset, size_t count)
{
        int len = iov_length(from, count) - offset;
        int copy = skb_headlen(skb);
        int size, offset1 = 0;
        int i = 0;

        /* Skip over from offset */
        while (count && (offset >= from->iov_len)) {
                offset -= from->iov_len;
                ++from;
                --count;
        }

        /* copy up to skb headlen */
        while (count && (copy > 0)) {
                size = min_t(unsigned int, copy, from->iov_len - offset);
                if (copy_from_user(skb->data + offset1, from->iov_base + offset,
                                   size))
                        return -EFAULT;
                if (copy > size) {
                        ++from;
                        --count;
                        offset = 0;
                } else
                        offset += size;
                copy -= size;
                offset1 += size;
        }

        if (len == offset1)
                return 0;

        while (count--) {
                struct page *page[MAX_SKB_FRAGS];
                int num_pages;
                unsigned long base;
                unsigned long truesize;

                len = from->iov_len - offset;
                if (!len) {
                        offset = 0;
                        ++from;
                        continue;
                }
                base = (unsigned long)from->iov_base + offset;
                size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
                if (i + size > MAX_SKB_FRAGS)
                        return -EMSGSIZE;
                num_pages = get_user_pages_fast(base, size, 0, &page[i]);
                if (num_pages != size) {
                        for (i = 0; i < num_pages; i++)
                                put_page(page[i]);
                        return -EFAULT;
                }
                truesize = size * PAGE_SIZE;
                skb->data_len += len;
                skb->len += len;
                skb->truesize += truesize;
                /* increase sk_wmem_alloc by the pinned pages' truesize */
                atomic_add(truesize, &skb->sk->sk_wmem_alloc);
                while (len) {
                        int off = base & ~PAGE_MASK;
                        int size = min_t(int, len, PAGE_SIZE - off);
                        __skb_fill_page_desc(skb, i, page[i], off, size);
                        skb_shinfo(skb)->nr_frags++;
                        base += size;
                        len -= size;
                        i++;
                }
                offset = 0;
                ++from;
        }
        return 0;
}
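
/*
 * Page-count arithmetic above, worked example (illustrative only):
 * with 4K pages, base == 0x1ff0 and len == 0x30 straddle one page
 * boundary, so
 *      ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT
 *      == (0xff0 + 0x30 + 0xfff) >> 12 == 2
 * and get_user_pages_fast() pins two pages.
 */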

/*
 * macvtap_skb_from_vnet_hdr and macvtap_skb_to_vnet_hdr should
 * be shared with the tun/tap driver.
 */
static int macvtap_skb_from_vnet_hdr(struct sk_buff *skb,
                                     struct virtio_net_hdr *vnet_hdr)
{
        unsigned short gso_type = 0;

        if (vnet_hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
                switch (vnet_hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
                case VIRTIO_NET_HDR_GSO_TCPV4:
                        gso_type = SKB_GSO_TCPV4;
                        break;
                case VIRTIO_NET_HDR_GSO_TCPV6:
                        gso_type = SKB_GSO_TCPV6;
                        break;
                case VIRTIO_NET_HDR_GSO_UDP:
                        gso_type = SKB_GSO_UDP;
                        break;
                default:
                        return -EINVAL;
                }

                if (vnet_hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN)
                        gso_type |= SKB_GSO_TCP_ECN;

                if (vnet_hdr->gso_size == 0)
                        return -EINVAL;
        }

        if (vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
                if (!skb_partial_csum_set(skb, vnet_hdr->csum_start,
                                          vnet_hdr->csum_offset))
                        return -EINVAL;
        }

        if (vnet_hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
                skb_shinfo(skb)->gso_size = vnet_hdr->gso_size;
                skb_shinfo(skb)->gso_type = gso_type;

                /* Header must be checked, and gso_segs computed. */
                skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
                skb_shinfo(skb)->gso_segs = 0;
        }
        return 0;
}

static int macvtap_skb_to_vnet_hdr(const struct sk_buff *skb,
                                   struct virtio_net_hdr *vnet_hdr)
{
        memset(vnet_hdr, 0, sizeof(*vnet_hdr));

        if (skb_is_gso(skb)) {
                struct skb_shared_info *sinfo = skb_shinfo(skb);

                /* This is a hint as to how much should be linear. */
                vnet_hdr->hdr_len = skb_headlen(skb);
                vnet_hdr->gso_size = sinfo->gso_size;
                if (sinfo->gso_type & SKB_GSO_TCPV4)
                        vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
                else if (sinfo->gso_type & SKB_GSO_TCPV6)
                        vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
                else if (sinfo->gso_type & SKB_GSO_UDP)
                        vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP;
                else
                        BUG();
                if (sinfo->gso_type & SKB_GSO_TCP_ECN)
                        vnet_hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN;
        } else
                vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;

        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                vnet_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
                vnet_hdr->csum_start = skb_checksum_start_offset(skb);
                vnet_hdr->csum_offset = skb->csum_offset;
        } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
                vnet_hdr->flags = VIRTIO_NET_HDR_F_DATA_VALID;
        } /* else everything is zero */

        return 0;
}
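
/*
 * Example (illustrative): a TCPv4 GSO skb with a partial checksum
 * leaves here with gso_type == VIRTIO_NET_HDR_GSO_TCPV4, gso_size
 * set to the MSS, flags == VIRTIO_NET_HDR_F_NEEDS_CSUM, and
 * csum_start/csum_offset addressing the TCP checksum field.
 */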

/* Get packet from user space buffer */
static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m,
                                const struct iovec *iv, unsigned long total_len,
                                size_t count, int noblock)
{
        struct sk_buff *skb;
        struct macvlan_dev *vlan;
        unsigned long len = total_len;
        int err;
        struct virtio_net_hdr vnet_hdr = { 0 };
        int vnet_hdr_len = 0;
        int copylen = 0;
        bool zerocopy = false;

        if (q->flags & IFF_VNET_HDR) {
                vnet_hdr_len = q->vnet_hdr_sz;

                err = -EINVAL;
                if (len < vnet_hdr_len)
                        goto err;
                len -= vnet_hdr_len;

                err = memcpy_fromiovecend((void *)&vnet_hdr, iv, 0,
                                          sizeof(vnet_hdr));
                if (err < 0)
                        goto err;
                if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
                    vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
                                                        vnet_hdr.hdr_len)
                        vnet_hdr.hdr_len = vnet_hdr.csum_start +
                                                vnet_hdr.csum_offset + 2;
                err = -EINVAL;
                if (vnet_hdr.hdr_len > len)
                        goto err;
        }

        err = -EINVAL;
        if (unlikely(len < ETH_HLEN))
                goto err;

        err = -EMSGSIZE;
        if (unlikely(count > UIO_MAXIOV))
                goto err;

        if (m && m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY))
                zerocopy = true;

        if (zerocopy) {
                /* Userspace may produce vectors with count greater than
                 * MAX_SKB_FRAGS, so we need to linearize part of the skb
                 * so that the rest of the data fits into the frags.
                 */
                if (count > MAX_SKB_FRAGS) {
                        copylen = iov_length(iv, count - MAX_SKB_FRAGS);
                        if (copylen < vnet_hdr_len)
                                copylen = 0;
                        else
                                copylen -= vnet_hdr_len;
                }
                /* There are 256 bytes to be copied into the skb, so there
                 * is enough room to expand the skb head if needed.
                 * The rest of the buffer is mapped from userspace.
                 */
                if (copylen < vnet_hdr.hdr_len)
                        copylen = vnet_hdr.hdr_len;
                if (!copylen)
                        copylen = GOODCOPY_LEN;
        } else
                copylen = len;

        skb = macvtap_alloc_skb(&q->sk, NET_IP_ALIGN, copylen,
                                vnet_hdr.hdr_len, noblock, &err);
        if (!skb)
                goto err;

        if (zerocopy)
                err = zerocopy_sg_from_iovec(skb, iv, vnet_hdr_len, count);
        else
                err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len,
                                                   len);
        if (err)
                goto err_kfree;

        skb_set_network_header(skb, ETH_HLEN);
        skb_reset_mac_header(skb);
        skb->protocol = eth_hdr(skb)->h_proto;

        if (vnet_hdr_len) {
                err = macvtap_skb_from_vnet_hdr(skb, &vnet_hdr);
                if (err)
                        goto err_kfree;
        }

        rcu_read_lock_bh();
        vlan = rcu_dereference_bh(q->vlan);
        /* copy skb_ubuf_info for callback when skb has no error */
        if (zerocopy) {
                skb_shinfo(skb)->destructor_arg = m->msg_control;
                skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
                skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
        }
        if (vlan)
                macvlan_start_xmit(skb, vlan->dev);
        else
                kfree_skb(skb);
        rcu_read_unlock_bh();

        return total_len;

err_kfree:
        kfree_skb(skb);

err:
        rcu_read_lock_bh();
        vlan = rcu_dereference_bh(q->vlan);
        if (vlan)
                vlan->dev->stats.tx_dropped++;
        rcu_read_unlock_bh();

        return err;
}

static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv,
                                 unsigned long count, loff_t pos)
{
        struct file *file = iocb->ki_filp;
        ssize_t result = -ENOLINK;
        struct macvtap_queue *q = file->private_data;

        result = macvtap_get_user(q, NULL, iv, iov_length(iv, count), count,
                                  file->f_flags & O_NONBLOCK);
        return result;
}

/* Put packet to the user space buffer */
static ssize_t macvtap_put_user(struct macvtap_queue *q,
                                const struct sk_buff *skb,
                                const struct iovec *iv, int len)
{
        struct macvlan_dev *vlan;
        int ret;
        int vnet_hdr_len = 0;
        int vlan_offset = 0;
        int copied;

        if (q->flags & IFF_VNET_HDR) {
                struct virtio_net_hdr vnet_hdr;
                vnet_hdr_len = q->vnet_hdr_sz;
                if ((len -= vnet_hdr_len) < 0)
                        return -EINVAL;

                ret = macvtap_skb_to_vnet_hdr(skb, &vnet_hdr);
                if (ret)
                        return ret;

                if (memcpy_toiovecend(iv, (void *)&vnet_hdr, 0, sizeof(vnet_hdr)))
                        return -EFAULT;
        }
        copied = vnet_hdr_len;

        if (!vlan_tx_tag_present(skb))
                len = min_t(int, skb->len, len);
        else {
                int copy;
                struct {
                        __be16 h_vlan_proto;
                        __be16 h_vlan_TCI;
                } veth;
                veth.h_vlan_proto = htons(ETH_P_8021Q);
                veth.h_vlan_TCI = htons(vlan_tx_tag_get(skb));

                vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
                len = min_t(int, skb->len + VLAN_HLEN, len);

                copy = min_t(int, vlan_offset, len);
                ret = skb_copy_datagram_const_iovec(skb, 0, iv, copied, copy);
                len -= copy;
                copied += copy;
                if (ret || !len)
                        goto done;

                copy = min_t(int, sizeof(veth), len);
                ret = memcpy_toiovecend(iv, (void *)&veth, copied, copy);
                len -= copy;
                copied += copy;
                if (ret || !len)
                        goto done;
        }

        ret = skb_copy_datagram_const_iovec(skb, vlan_offset, iv, copied, len);
        copied += len;

done:
        rcu_read_lock_bh();
        vlan = rcu_dereference_bh(q->vlan);
        if (vlan)
                macvlan_count_rx(vlan, copied - vnet_hdr_len, ret == 0, 0);
        rcu_read_unlock_bh();

        return ret ? ret : copied;
}
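
/*
 * Layout note (illustrative): for a tagged skb the bytes handed to
 * user space are
 *      [dst MAC][src MAC][0x8100][TCI][orig ethertype][payload...]
 * i.e. the 4-byte 802.1Q header is re-inserted after the 12 MAC
 * address bytes (vlan_offset == 12 above).
 */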

static ssize_t macvtap_do_read(struct macvtap_queue *q, struct kiocb *iocb,
                               const struct iovec *iv, unsigned long len,
                               int noblock)
{
        DEFINE_WAIT(wait);
        struct sk_buff *skb;
        ssize_t ret = 0;

        while (len) {
                prepare_to_wait(sk_sleep(&q->sk), &wait, TASK_INTERRUPTIBLE);

                /* Read frames from the queue */
                skb = skb_dequeue(&q->sk.sk_receive_queue);
                if (!skb) {
                        if (noblock) {
                                ret = -EAGAIN;
                                break;
                        }
                        if (signal_pending(current)) {
                                ret = -ERESTARTSYS;
                                break;
                        }
                        /* Nothing to read, let's sleep */
                        schedule();
                        continue;
                }
                ret = macvtap_put_user(q, skb, iv, len);
                kfree_skb(skb);
                break;
        }

        finish_wait(sk_sleep(&q->sk), &wait);
        return ret;
}

static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv,
                                unsigned long count, loff_t pos)
{
        struct file *file = iocb->ki_filp;
        struct macvtap_queue *q = file->private_data;
        ssize_t len, ret = 0;

        len = iov_length(iv, count);
        if (len < 0) {
                ret = -EINVAL;
                goto out;
        }

        ret = macvtap_do_read(q, iocb, iv, len, file->f_flags & O_NONBLOCK);
        ret = min_t(ssize_t, ret, len); /* XXX copied from tun.c. Why? */
out:
        return ret;
}

/*
 * provide compatibility with generic tun/tap interface
 */
static long macvtap_ioctl(struct file *file, unsigned int cmd,
                          unsigned long arg)
{
        struct macvtap_queue *q = file->private_data;
        struct macvlan_dev *vlan;
        void __user *argp = (void __user *)arg;
        struct ifreq __user *ifr = argp;
        unsigned int __user *up = argp;
        unsigned int u;
        int __user *sp = argp;
        int s;
        int ret;

        switch (cmd) {
        case TUNSETIFF:
                /* ignore the name, just look at flags */
                if (get_user(u, &ifr->ifr_flags))
                        return -EFAULT;

                ret = 0;
                if ((u & ~IFF_VNET_HDR) != (IFF_NO_PI | IFF_TAP))
                        ret = -EINVAL;
                else
                        q->flags = u;

                return ret;

        case TUNGETIFF:
                rcu_read_lock_bh();
                vlan = rcu_dereference_bh(q->vlan);
                if (vlan)
                        dev_hold(vlan->dev);
                rcu_read_unlock_bh();

                if (!vlan)
                        return -ENOLINK;

                ret = 0;
                if (copy_to_user(&ifr->ifr_name, vlan->dev->name, IFNAMSIZ) ||
                    put_user(q->flags, &ifr->ifr_flags))
                        ret = -EFAULT;
                dev_put(vlan->dev);
                return ret;

        case TUNGETFEATURES:
                if (put_user(IFF_TAP | IFF_NO_PI | IFF_VNET_HDR, up))
                        return -EFAULT;
                return 0;

        case TUNSETSNDBUF:
                if (get_user(u, up))
                        return -EFAULT;

                q->sk.sk_sndbuf = u;
                return 0;

        case TUNGETVNETHDRSZ:
                s = q->vnet_hdr_sz;
                if (put_user(s, sp))
                        return -EFAULT;
                return 0;

        case TUNSETVNETHDRSZ:
                if (get_user(s, sp))
                        return -EFAULT;
                if (s < (int)sizeof(struct virtio_net_hdr))
                        return -EINVAL;

                q->vnet_hdr_sz = s;
                return 0;

        case TUNSETOFFLOAD:
                /* let the user check for future flags */
                if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
                            TUN_F_TSO_ECN | TUN_F_UFO))
                        return -EINVAL;

                /* TODO: only accept frames with the features that
                 *       got enabled for forwarded frames */
                if (!(q->flags & IFF_VNET_HDR))
                        return -EINVAL;
                return 0;

        default:
                return -EINVAL;
        }
}

#ifdef CONFIG_COMPAT
static long macvtap_compat_ioctl(struct file *file, unsigned int cmd,
                                 unsigned long arg)
{
        return macvtap_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
}
#endif

static const struct file_operations macvtap_fops = {
        .owner          = THIS_MODULE,
        .open           = macvtap_open,
        .release        = macvtap_release,
        .aio_read       = macvtap_aio_read,
        .aio_write      = macvtap_aio_write,
        .poll           = macvtap_poll,
        .llseek         = no_llseek,
        .unlocked_ioctl = macvtap_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = macvtap_compat_ioctl,
#endif
};

static int macvtap_sendmsg(struct kiocb *iocb, struct socket *sock,
                           struct msghdr *m, size_t total_len)
{
        struct macvtap_queue *q = container_of(sock, struct macvtap_queue, sock);
        return macvtap_get_user(q, m, m->msg_iov, total_len, m->msg_iovlen,
                                m->msg_flags & MSG_DONTWAIT);
}

static int macvtap_recvmsg(struct kiocb *iocb, struct socket *sock,
                           struct msghdr *m, size_t total_len,
                           int flags)
{
        struct macvtap_queue *q = container_of(sock, struct macvtap_queue, sock);
        int ret;

        if (flags & ~(MSG_DONTWAIT|MSG_TRUNC))
                return -EINVAL;
        ret = macvtap_do_read(q, iocb, m->msg_iov, total_len,
                              flags & MSG_DONTWAIT);
        if (ret > total_len) {
                m->msg_flags |= MSG_TRUNC;
                ret = flags & MSG_TRUNC ? ret : total_len;
        }
        return ret;
}

/* Ops structure to mimic raw sockets with tun */
static const struct proto_ops macvtap_socket_ops = {
        .sendmsg = macvtap_sendmsg,
        .recvmsg = macvtap_recvmsg,
};

/* Get the underlying socket object from a macvtap file.  Returns an error
 * unless the file is attached to a device.  The returned object works like
 * a packet socket, it can be used for sock_sendmsg/sock_recvmsg.  The
 * caller is responsible for holding a reference to the file for as long as
 * the socket is in use. */
struct socket *macvtap_get_socket(struct file *file)
{
        struct macvtap_queue *q;

        if (file->f_op != &macvtap_fops)
                return ERR_PTR(-EINVAL);
        q = file->private_data;
        if (!q)
                return ERR_PTR(-EBADFD);
        return &q->sock;
}
EXPORT_SYMBOL_GPL(macvtap_get_socket);
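
/*
 * Hypothetical caller sketch (not part of this file; vhost-net does
 * something along these lines):
 *
 *      struct socket *sock = macvtap_get_socket(file);
 *      if (IS_ERR(sock))
 *              return PTR_ERR(sock);
 *      ...sock_sendmsg()/sock_recvmsg() on sock, while holding file...
 */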

static int macvtap_device_event(struct notifier_block *unused,
                                unsigned long event, void *ptr)
{
        struct net_device *dev = ptr;
        struct macvlan_dev *vlan;
        struct device *classdev;
        dev_t devt;
        int err;

        if (dev->rtnl_link_ops != &macvtap_link_ops)
                return NOTIFY_DONE;

        vlan = netdev_priv(dev);

        switch (event) {
        case NETDEV_REGISTER:
                /* Create the device node here after the network device has
                 * been registered but before register_netdevice has
                 * finished running.
                 */
                err = macvtap_get_minor(vlan);
                if (err)
                        return notifier_from_errno(err);

                devt = MKDEV(MAJOR(macvtap_major), vlan->minor);
                classdev = device_create(macvtap_class, &dev->dev, devt,
                                         dev, "tap%d", dev->ifindex);
                if (IS_ERR(classdev)) {
                        macvtap_free_minor(vlan);
                        return notifier_from_errno(PTR_ERR(classdev));
                }
                break;
        case NETDEV_UNREGISTER:
                devt = MKDEV(MAJOR(macvtap_major), vlan->minor);
                device_destroy(macvtap_class, devt);
                macvtap_free_minor(vlan);
                break;
        }

        return NOTIFY_DONE;
}
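
/*
 * Example (illustrative): "ip link add link eth0 name macvtap0 type
 * macvtap" registers a new netdev; the NETDEV_REGISTER case above
 * then creates /dev/tapN, where N is the new interface's ifindex.
 */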

static struct notifier_block macvtap_notifier_block __read_mostly = {
        .notifier_call  = macvtap_device_event,
};

static int macvtap_init(void)
{
        int err;

        err = alloc_chrdev_region(&macvtap_major, 0,
                                  MACVTAP_NUM_DEVS, "macvtap");
        if (err)
                goto out1;

        cdev_init(&macvtap_cdev, &macvtap_fops);
        err = cdev_add(&macvtap_cdev, macvtap_major, MACVTAP_NUM_DEVS);
        if (err)
                goto out2;

        macvtap_class = class_create(THIS_MODULE, "macvtap");
        if (IS_ERR(macvtap_class)) {
                err = PTR_ERR(macvtap_class);
                goto out3;
        }

        err = register_netdevice_notifier(&macvtap_notifier_block);
        if (err)
                goto out4;

        err = macvlan_link_register(&macvtap_link_ops);
        if (err)
                goto out5;

        return 0;

out5:
        unregister_netdevice_notifier(&macvtap_notifier_block);
out4:
        class_unregister(macvtap_class);
out3:
        cdev_del(&macvtap_cdev);
out2:
        unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
out1:
        return err;
}
module_init(macvtap_init);

static void macvtap_exit(void)
{
        rtnl_link_unregister(&macvtap_link_ops);
        unregister_netdevice_notifier(&macvtap_notifier_block);
        class_unregister(macvtap_class);
        cdev_del(&macvtap_cdev);
        unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
}
module_exit(macvtap_exit);

MODULE_ALIAS_RTNL_LINK("macvtap");
MODULE_AUTHOR("Arnd Bergmann <arnd@arndb.de>");
MODULE_LICENSE("GPL");