linux/drivers/net/tun.c
<<
>>
Prefs
   1/*
   2 *  TUN - Universal TUN/TAP device driver.
   3 *  Copyright (C) 1999-2002 Maxim Krasnyansky <maxk@qualcomm.com>
   4 *
   5 *  This program is free software; you can redistribute it and/or modify
   6 *  it under the terms of the GNU General Public License as published by
   7 *  the Free Software Foundation; either version 2 of the License, or
   8 *  (at your option) any later version.
   9 *
  10 *  This program is distributed in the hope that it will be useful,
  11 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13 *  GNU General Public License for more details.
  14 *
  15 *  $Id: tun.c,v 1.15 2002/03/01 02:44:24 maxk Exp $
  16 */
  17
  18/*
  19 *  Changes:
  20 *
  21 *  Mike Kershaw <dragorn@kismetwireless.net> 2005/08/14
  22 *    Add TUNSETLINK ioctl to set the link encapsulation
  23 *
  24 *  Mark Smith <markzzzsmith@yahoo.com.au>
  25 *    Use eth_random_addr() for tap MAC address.
  26 *
  27 *  Harald Roelle <harald.roelle@ifi.lmu.de>  2004/04/20
  28 *    Fixes in packet dropping, queue length setting and queue wakeup.
  29 *    Increased default tx queue length.
  30 *    Added ethtool API.
  31 *    Minor cleanups
  32 *
  33 *  Daniel Podlejski <underley@underley.eu.org>
  34 *    Modifications for 2.3.99-pre5 kernel.
  35 */
  36
  37#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  38
  39#define DRV_NAME        "tun"
  40#define DRV_VERSION     "1.6"
  41#define DRV_DESCRIPTION "Universal TUN/TAP device driver"
  42#define DRV_COPYRIGHT   "(C) 1999-2004 Max Krasnyansky <maxk@qualcomm.com>"
  43
  44#include <linux/module.h>
  45#include <linux/errno.h>
  46#include <linux/kernel.h>
  47#include <linux/major.h>
  48#include <linux/slab.h>
  49#include <linux/poll.h>
  50#include <linux/fcntl.h>
  51#include <linux/init.h>
  52#include <linux/skbuff.h>
  53#include <linux/netdevice.h>
  54#include <linux/etherdevice.h>
  55#include <linux/miscdevice.h>
  56#include <linux/ethtool.h>
  57#include <linux/rtnetlink.h>
  58#include <linux/compat.h>
  59#include <linux/if.h>
  60#include <linux/if_arp.h>
  61#include <linux/if_ether.h>
  62#include <linux/if_tun.h>
  63#include <linux/if_vlan.h>
  64#include <linux/crc32.h>
  65#include <linux/nsproxy.h>
  66#include <linux/virtio_net.h>
  67#include <linux/rcupdate.h>
  68#include <net/ipv6.h>
  69#include <net/net_namespace.h>
  70#include <net/netns/generic.h>
  71#include <net/rtnetlink.h>
  72#include <net/sock.h>
  73#include <linux/seq_file.h>
  74
  75#include <asm/uaccess.h>
  76
  77/* Uncomment to enable debugging */
  78/* #define TUN_DEBUG 1 */
  79
#ifdef TUN_DEBUG
/* Global debug level; DBG1() prints only when it equals 2. */
static int debug;

/* Per-device debug print: forwards to netdev_printk() when tun->debug is set. */
#define tun_debug(level, tun, fmt, args...)			\
do {								\
	if (tun->debug)						\
		netdev_printk(level, tun->dev, fmt, ##args);	\
} while (0)
/* Device-independent debug print, gated on the global debug level. */
#define DBG1(level, fmt, args...)				\
do {								\
	if (debug == 2)						\
		printk(level fmt, ##args);			\
} while (0)
#else
/*
 * Debugging disabled: the calls sit behind if (0), so they are never
 * executed but are still seen by the compiler.
 */
#define tun_debug(level, tun, fmt, args...)			\
do {								\
	if (0)							\
		netdev_printk(level, tun->dev, fmt, ##args);	\
} while (0)
#define DBG1(level, fmt, args...)				\
do {								\
	if (0)							\
		printk(level fmt, ##args);			\
} while (0)
#endif
 105
 106#define GOODCOPY_LEN 128
 107
/* Number of destination addresses that can be matched exactly. */
#define FLT_EXACT_COUNT 8
/*
 * TAP transmit filter: up to FLT_EXACT_COUNT exact destination addresses
 * plus a 64-bit hash mask (two u32 words) for the remaining multicast
 * addresses.
 */
struct tap_filter {
	unsigned int    count;    /* Number of addrs. Zero means disabled */
	u32             mask[2];  /* Mask of the hashed addrs */
	unsigned char   addr[FLT_EXACT_COUNT][ETH_ALEN];
};
 114
/* DEFAULT_MAX_NUM_RSS_QUEUES was chosen so that the rx/tx queues allocated for
 * the netdevice fit in one page, which ensures that the memory allocation
 * succeeds. TODO: increase the limit. */
 118#define MAX_TAP_QUEUES DEFAULT_MAX_NUM_RSS_QUEUES
 119#define MAX_TAP_FLOWS  4096
 120
 121#define TUN_FLOW_EXPIRE (3 * HZ)
 122
 123/* A tun_file connects an open character device to a tuntap netdevice. It
 124 * also contains all socket related structures (except sock_fprog and tap_filter)
 125 * to serve as one transmit queue for tuntap device. The sock_fprog and
 126 * tap_filter were kept in tun_struct since they were used for filtering for the
 127 * netdevice not for a specific queue (at least I didn't see the requirement for
 128 * this).
 129 *
 130 * RCU usage:
 131 * The tun_file and tun_struct are loosely coupled, the pointer from one to the
 132 * other can only be read while rcu_read_lock or rtnl_lock is held.
 133 */
struct tun_file {
	/* Socket related structures backing this queue. */
	struct sock sk;
	struct socket socket;
	struct socket_wq wq;
	/* Device this file is attached to; NULL once detached for good. */
	struct tun_struct __rcu *tun;
	struct net *net;
	struct fasync_struct *fasync;
	/* only used for fasync */
	unsigned int flags;
	union {
		/* Slot of this file in tun_struct::tfiles while attached. */
		u16 queue_index;
		unsigned int ifindex;
	};
	/* Linkage on tun_struct::disabled while the queue is disabled. */
	struct list_head next;
	/* Device the queue was disabled on; NULL while the queue is enabled. */
	struct tun_struct *detached;
};
 150
/* One entry of the per-device flow table, mapping a flow hash to a queue. */
struct tun_flow_entry {
	struct hlist_node hash_link;	/* linkage in a tun_struct::flows bucket */
	struct rcu_head rcu;		/* used by kfree_rcu() on deletion */
	struct tun_struct *tun;		/* owning device */

	u32 rxhash;			/* lookup key compared by tun_flow_find() */
	u32 rps_rxhash;			/* hash last saved by tun_flow_save_rps_rxhash() */
	int queue_index;		/* queue the flow was last seen on */
	unsigned long updated;		/* jiffies of creation or last update */
};
 161
 162#define TUN_NUM_FLOW_ENTRIES 1024
 163
/* Since the socket was moved to tun_file, the socket filter, sndbuf and vnet
 * header size are kept here and restored when a file is attached to a
 * persistent device, to preserve the behavior of persistent devices.
 */
struct tun_struct {
	/* Attached queues; slots [0, numqueues) are populated. */
	struct tun_file __rcu   *tfiles[MAX_TAP_QUEUES];
	unsigned int            numqueues;
	unsigned int            flags;
	kuid_t                  owner;
	kgid_t                  group;

	struct net_device       *dev;
	netdev_features_t       set_features;
/* Feature bits filtered through set_features in tun_net_fix_features(). */
#define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \
			  NETIF_F_TSO6)

	int                     vnet_hdr_sz;
	int                     sndbuf;
	struct tap_filter       txflt;
	struct sock_fprog       fprog;
	/* protected by rtnl lock */
	bool                    filter_attached;
#ifdef TUN_DEBUG
	int debug;
#endif
	/* Taken (with BHs disabled) around flow table modifications. */
	spinlock_t lock;
	/* Flow hash table, indexed by tun_hashfn(). */
	struct hlist_head flows[TUN_NUM_FLOW_ENTRIES];
	/* Timer running tun_flow_cleanup() to expire idle flows. */
	struct timer_list flow_gc_timer;
	/* Idle time, in jiffies, after which a flow entry expires. */
	unsigned long ageing_time;
	/* Count and list of queues disabled via tun_disable_queue(). */
	unsigned int numdisabled;
	struct list_head disabled;
	void *security;
	/* Number of entries currently in flows[]. */
	u32 flow_count;
};
 198
 199static inline u32 tun_hashfn(u32 rxhash)
 200{
 201        return rxhash & 0x3ff;
 202}
 203
 204static struct tun_flow_entry *tun_flow_find(struct hlist_head *head, u32 rxhash)
 205{
 206        struct tun_flow_entry *e;
 207
 208        hlist_for_each_entry_rcu(e, head, hash_link) {
 209                if (e->rxhash == rxhash)
 210                        return e;
 211        }
 212        return NULL;
 213}
 214
 215static struct tun_flow_entry *tun_flow_create(struct tun_struct *tun,
 216                                              struct hlist_head *head,
 217                                              u32 rxhash, u16 queue_index)
 218{
 219        struct tun_flow_entry *e = kmalloc(sizeof(*e), GFP_ATOMIC);
 220
 221        if (e) {
 222                tun_debug(KERN_INFO, tun, "create flow: hash %u index %u\n",
 223                          rxhash, queue_index);
 224                e->updated = jiffies;
 225                e->rxhash = rxhash;
 226                e->rps_rxhash = 0;
 227                e->queue_index = queue_index;
 228                e->tun = tun;
 229                hlist_add_head_rcu(&e->hash_link, head);
 230                ++tun->flow_count;
 231        }
 232        return e;
 233}
 234
 235static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e)
 236{
 237        tun_debug(KERN_INFO, tun, "delete flow: hash %u index %u\n",
 238                  e->rxhash, e->queue_index);
 239        sock_rps_reset_flow_hash(e->rps_rxhash);
 240        hlist_del_rcu(&e->hash_link);
 241        kfree_rcu(e, rcu);
 242        --tun->flow_count;
 243}
 244
 245static void tun_flow_flush(struct tun_struct *tun)
 246{
 247        int i;
 248
 249        spin_lock_bh(&tun->lock);
 250        for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
 251                struct tun_flow_entry *e;
 252                struct hlist_node *n;
 253
 254                hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link)
 255                        tun_flow_delete(tun, e);
 256        }
 257        spin_unlock_bh(&tun->lock);
 258}
 259
 260static void tun_flow_delete_by_queue(struct tun_struct *tun, u16 queue_index)
 261{
 262        int i;
 263
 264        spin_lock_bh(&tun->lock);
 265        for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
 266                struct tun_flow_entry *e;
 267                struct hlist_node *n;
 268
 269                hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) {
 270                        if (e->queue_index == queue_index)
 271                                tun_flow_delete(tun, e);
 272                }
 273        }
 274        spin_unlock_bh(&tun->lock);
 275}
 276
 277static void tun_flow_cleanup(unsigned long data)
 278{
 279        struct tun_struct *tun = (struct tun_struct *)data;
 280        unsigned long delay = tun->ageing_time;
 281        unsigned long next_timer = jiffies + delay;
 282        unsigned long count = 0;
 283        int i;
 284
 285        tun_debug(KERN_INFO, tun, "tun_flow_cleanup\n");
 286
 287        spin_lock_bh(&tun->lock);
 288        for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
 289                struct tun_flow_entry *e;
 290                struct hlist_node *n;
 291
 292                hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) {
 293                        unsigned long this_timer;
 294                        count++;
 295                        this_timer = e->updated + delay;
 296                        if (time_before_eq(this_timer, jiffies))
 297                                tun_flow_delete(tun, e);
 298                        else if (time_before(this_timer, next_timer))
 299                                next_timer = this_timer;
 300                }
 301        }
 302
 303        if (count)
 304                mod_timer(&tun->flow_gc_timer, round_jiffies_up(next_timer));
 305        spin_unlock_bh(&tun->lock);
 306}
 307
 308static void tun_flow_update(struct tun_struct *tun, u32 rxhash,
 309                            struct tun_file *tfile)
 310{
 311        struct hlist_head *head;
 312        struct tun_flow_entry *e;
 313        unsigned long delay = tun->ageing_time;
 314        u16 queue_index = tfile->queue_index;
 315
 316        if (!rxhash)
 317                return;
 318        else
 319                head = &tun->flows[tun_hashfn(rxhash)];
 320
 321        rcu_read_lock();
 322
 323        /* We may get a very small possibility of OOO during switching, not
 324         * worth to optimize.*/
 325        if (tun->numqueues == 1 || tfile->detached)
 326                goto unlock;
 327
 328        e = tun_flow_find(head, rxhash);
 329        if (likely(e)) {
 330                /* TODO: keep queueing to old queue until it's empty? */
 331                e->queue_index = queue_index;
 332                e->updated = jiffies;
 333                sock_rps_record_flow_hash(e->rps_rxhash);
 334        } else {
 335                spin_lock_bh(&tun->lock);
 336                if (!tun_flow_find(head, rxhash) &&
 337                    tun->flow_count < MAX_TAP_FLOWS)
 338                        tun_flow_create(tun, head, rxhash, queue_index);
 339
 340                if (!timer_pending(&tun->flow_gc_timer))
 341                        mod_timer(&tun->flow_gc_timer,
 342                                  round_jiffies_up(jiffies + delay));
 343                spin_unlock_bh(&tun->lock);
 344        }
 345
 346unlock:
 347        rcu_read_unlock();
 348}
 349
 350/**
 351 * Save the hash received in the stack receive path and update the
 352 * flow_hash table accordingly.
 353 */
 354static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash)
 355{
 356        if (unlikely(e->rps_rxhash != hash)) {
 357                sock_rps_reset_flow_hash(e->rps_rxhash);
 358                e->rps_rxhash = hash;
 359        }
 360}
 361
 362/* We try to identify a flow through its rxhash first. The reason that
 363 * we do not check rxq no. is because some cards(e.g 82599), chooses
 364 * the rxq based on the txq where the last packet of the flow comes. As
 365 * the userspace application move between processors, we may get a
 366 * different rxq no. here. If we could not get rxhash, then we would
 367 * hope the rxq no. may help here.
 368 */
 369static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
 370                            void *accel_priv, select_queue_fallback_t fallback)
 371{
 372        struct tun_struct *tun = netdev_priv(dev);
 373        struct tun_flow_entry *e;
 374        u32 txq = 0;
 375        u32 numqueues = 0;
 376
 377        rcu_read_lock();
 378        numqueues = ACCESS_ONCE(tun->numqueues);
 379
 380        txq = skb_get_hash(skb);
 381        if (txq) {
 382                e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq);
 383                if (e) {
 384                        tun_flow_save_rps_rxhash(e, txq);
 385                        txq = e->queue_index;
 386                } else
 387                        /* use multiply and shift instead of expensive divide */
 388                        txq = ((u64)txq * numqueues) >> 32;
 389        } else if (likely(skb_rx_queue_recorded(skb))) {
 390                txq = skb_get_rx_queue(skb);
 391                while (unlikely(txq >= numqueues))
 392                        txq -= numqueues;
 393        }
 394
 395        rcu_read_unlock();
 396        return txq;
 397}
 398
 399static inline bool tun_not_capable(struct tun_struct *tun)
 400{
 401        const struct cred *cred = current_cred();
 402        struct net *net = dev_net(tun->dev);
 403
 404        return ((uid_valid(tun->owner) && !uid_eq(cred->euid, tun->owner)) ||
 405                  (gid_valid(tun->group) && !in_egroup_p(tun->group))) &&
 406                !ns_capable(net->user_ns, CAP_NET_ADMIN);
 407}
 408
 409static void tun_set_real_num_queues(struct tun_struct *tun)
 410{
 411        netif_set_real_num_tx_queues(tun->dev, tun->numqueues);
 412        netif_set_real_num_rx_queues(tun->dev, tun->numqueues);
 413}
 414
 415static void tun_disable_queue(struct tun_struct *tun, struct tun_file *tfile)
 416{
 417        tfile->detached = tun;
 418        list_add_tail(&tfile->next, &tun->disabled);
 419        ++tun->numdisabled;
 420}
 421
 422static struct tun_struct *tun_enable_queue(struct tun_file *tfile)
 423{
 424        struct tun_struct *tun = tfile->detached;
 425
 426        tfile->detached = NULL;
 427        list_del_init(&tfile->next);
 428        --tun->numdisabled;
 429        return tun;
 430}
 431
 432static void tun_queue_purge(struct tun_file *tfile)
 433{
 434        skb_queue_purge(&tfile->sk.sk_receive_queue);
 435        skb_queue_purge(&tfile->sk.sk_error_queue);
 436}
 437
 438static void __tun_detach(struct tun_file *tfile, bool clean)
 439{
 440        struct tun_file *ntfile;
 441        struct tun_struct *tun;
 442
 443        tun = rtnl_dereference(tfile->tun);
 444
 445        if (tun && !tfile->detached) {
 446                u16 index = tfile->queue_index;
 447                BUG_ON(index >= tun->numqueues);
 448
 449                rcu_assign_pointer(tun->tfiles[index],
 450                                   tun->tfiles[tun->numqueues - 1]);
 451                ntfile = rtnl_dereference(tun->tfiles[index]);
 452                ntfile->queue_index = index;
 453
 454                --tun->numqueues;
 455                if (clean) {
 456                        RCU_INIT_POINTER(tfile->tun, NULL);
 457                        sock_put(&tfile->sk);
 458                } else
 459                        tun_disable_queue(tun, tfile);
 460
 461                synchronize_net();
 462                tun_flow_delete_by_queue(tun, tun->numqueues + 1);
 463                /* Drop read queue */
 464                tun_queue_purge(tfile);
 465                tun_set_real_num_queues(tun);
 466        } else if (tfile->detached && clean) {
 467                tun = tun_enable_queue(tfile);
 468                sock_put(&tfile->sk);
 469        }
 470
 471        if (clean) {
 472                if (tun && tun->numqueues == 0 && tun->numdisabled == 0) {
 473                        netif_carrier_off(tun->dev);
 474
 475                        if (!(tun->flags & TUN_PERSIST) &&
 476                            tun->dev->reg_state == NETREG_REGISTERED)
 477                                unregister_netdevice(tun->dev);
 478                }
 479
 480                BUG_ON(!test_bit(SOCK_EXTERNALLY_ALLOCATED,
 481                                 &tfile->socket.flags));
 482                sk_release_kernel(&tfile->sk);
 483        }
 484}
 485
 486static void tun_detach(struct tun_file *tfile, bool clean)
 487{
 488        rtnl_lock();
 489        __tun_detach(tfile, clean);
 490        rtnl_unlock();
 491}
 492
 493static void tun_detach_all(struct net_device *dev)
 494{
 495        struct tun_struct *tun = netdev_priv(dev);
 496        struct tun_file *tfile, *tmp;
 497        int i, n = tun->numqueues;
 498
 499        for (i = 0; i < n; i++) {
 500                tfile = rtnl_dereference(tun->tfiles[i]);
 501                BUG_ON(!tfile);
 502                tfile->socket.sk->sk_data_ready(tfile->socket.sk);
 503                RCU_INIT_POINTER(tfile->tun, NULL);
 504                --tun->numqueues;
 505        }
 506        list_for_each_entry(tfile, &tun->disabled, next) {
 507                tfile->socket.sk->sk_data_ready(tfile->socket.sk);
 508                RCU_INIT_POINTER(tfile->tun, NULL);
 509        }
 510        BUG_ON(tun->numqueues != 0);
 511
 512        synchronize_net();
 513        for (i = 0; i < n; i++) {
 514                tfile = rtnl_dereference(tun->tfiles[i]);
 515                /* Drop read queue */
 516                tun_queue_purge(tfile);
 517                sock_put(&tfile->sk);
 518        }
 519        list_for_each_entry_safe(tfile, tmp, &tun->disabled, next) {
 520                tun_enable_queue(tfile);
 521                tun_queue_purge(tfile);
 522                sock_put(&tfile->sk);
 523        }
 524        BUG_ON(tun->numdisabled != 0);
 525
 526        if (tun->flags & TUN_PERSIST)
 527                module_put(THIS_MODULE);
 528}
 529
 530static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filter)
 531{
 532        struct tun_file *tfile = file->private_data;
 533        int err;
 534
 535        err = security_tun_dev_attach(tfile->socket.sk, tun->security);
 536        if (err < 0)
 537                goto out;
 538
 539        err = -EINVAL;
 540        if (rtnl_dereference(tfile->tun) && !tfile->detached)
 541                goto out;
 542
 543        err = -EBUSY;
 544        if (!(tun->flags & TUN_TAP_MQ) && tun->numqueues == 1)
 545                goto out;
 546
 547        err = -E2BIG;
 548        if (!tfile->detached &&
 549            tun->numqueues + tun->numdisabled == MAX_TAP_QUEUES)
 550                goto out;
 551
 552        err = 0;
 553
 554        /* Re-attach the filter to persist device */
 555        if (!skip_filter && (tun->filter_attached == true)) {
 556                err = sk_attach_filter(&tun->fprog, tfile->socket.sk);
 557                if (!err)
 558                        goto out;
 559        }
 560        tfile->queue_index = tun->numqueues;
 561        rcu_assign_pointer(tfile->tun, tun);
 562        rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
 563        tun->numqueues++;
 564
 565        if (tfile->detached)
 566                tun_enable_queue(tfile);
 567        else
 568                sock_hold(&tfile->sk);
 569
 570        tun_set_real_num_queues(tun);
 571
 572        /* device is allowed to go away first, so no need to hold extra
 573         * refcnt.
 574         */
 575
 576out:
 577        return err;
 578}
 579
 580static struct tun_struct *__tun_get(struct tun_file *tfile)
 581{
 582        struct tun_struct *tun;
 583
 584        rcu_read_lock();
 585        tun = rcu_dereference(tfile->tun);
 586        if (tun)
 587                dev_hold(tun->dev);
 588        rcu_read_unlock();
 589
 590        return tun;
 591}
 592
 593static struct tun_struct *tun_get(struct file *file)
 594{
 595        return __tun_get(file->private_data);
 596}
 597
 598static void tun_put(struct tun_struct *tun)
 599{
 600        dev_put(tun->dev);
 601}
 602
 603/* TAP filtering */
 604static void addr_hash_set(u32 *mask, const u8 *addr)
 605{
 606        int n = ether_crc(ETH_ALEN, addr) >> 26;
 607        mask[n >> 5] |= (1 << (n & 31));
 608}
 609
 610static unsigned int addr_hash_test(const u32 *mask, const u8 *addr)
 611{
 612        int n = ether_crc(ETH_ALEN, addr) >> 26;
 613        return mask[n >> 5] & (1 << (n & 31));
 614}
 615
 616static int update_filter(struct tap_filter *filter, void __user *arg)
 617{
 618        struct { u8 u[ETH_ALEN]; } *addr;
 619        struct tun_filter uf;
 620        int err, alen, n, nexact;
 621
 622        if (copy_from_user(&uf, arg, sizeof(uf)))
 623                return -EFAULT;
 624
 625        if (!uf.count) {
 626                /* Disabled */
 627                filter->count = 0;
 628                return 0;
 629        }
 630
 631        alen = ETH_ALEN * uf.count;
 632        addr = kmalloc(alen, GFP_KERNEL);
 633        if (!addr)
 634                return -ENOMEM;
 635
 636        if (copy_from_user(addr, arg + sizeof(uf), alen)) {
 637                err = -EFAULT;
 638                goto done;
 639        }
 640
 641        /* The filter is updated without holding any locks. Which is
 642         * perfectly safe. We disable it first and in the worst
 643         * case we'll accept a few undesired packets. */
 644        filter->count = 0;
 645        wmb();
 646
 647        /* Use first set of addresses as an exact filter */
 648        for (n = 0; n < uf.count && n < FLT_EXACT_COUNT; n++)
 649                memcpy(filter->addr[n], addr[n].u, ETH_ALEN);
 650
 651        nexact = n;
 652
 653        /* Remaining multicast addresses are hashed,
 654         * unicast will leave the filter disabled. */
 655        memset(filter->mask, 0, sizeof(filter->mask));
 656        for (; n < uf.count; n++) {
 657                if (!is_multicast_ether_addr(addr[n].u)) {
 658                        err = 0; /* no filter */
 659                        goto done;
 660                }
 661                addr_hash_set(filter->mask, addr[n].u);
 662        }
 663
 664        /* For ALLMULTI just set the mask to all ones.
 665         * This overrides the mask populated above. */
 666        if ((uf.flags & TUN_FLT_ALLMULTI))
 667                memset(filter->mask, ~0, sizeof(filter->mask));
 668
 669        /* Now enable the filter */
 670        wmb();
 671        filter->count = nexact;
 672
 673        /* Return the number of exact filters */
 674        err = nexact;
 675
 676done:
 677        kfree(addr);
 678        return err;
 679}
 680
 681/* Returns: 0 - drop, !=0 - accept */
 682static int run_filter(struct tap_filter *filter, const struct sk_buff *skb)
 683{
 684        /* Cannot use eth_hdr(skb) here because skb_mac_hdr() is incorrect
 685         * at this point. */
 686        struct ethhdr *eh = (struct ethhdr *) skb->data;
 687        int i;
 688
 689        /* Exact match */
 690        for (i = 0; i < filter->count; i++)
 691                if (ether_addr_equal(eh->h_dest, filter->addr[i]))
 692                        return 1;
 693
 694        /* Inexact match (multicast only) */
 695        if (is_multicast_ether_addr(eh->h_dest))
 696                return addr_hash_test(filter->mask, eh->h_dest);
 697
 698        return 0;
 699}
 700
 701/*
 702 * Checks whether the packet is accepted or not.
 703 * Returns: 0 - drop, !=0 - accept
 704 */
 705static int check_filter(struct tap_filter *filter, const struct sk_buff *skb)
 706{
 707        if (!filter->count)
 708                return 1;
 709
 710        return run_filter(filter, skb);
 711}
 712
 713/* Network device part of the driver */
 714
 715static const struct ethtool_ops tun_ethtool_ops;
 716
 717/* Net device detach from fd. */
 718static void tun_net_uninit(struct net_device *dev)
 719{
 720        tun_detach_all(dev);
 721}
 722
 723/* Net device open. */
 724static int tun_net_open(struct net_device *dev)
 725{
 726        netif_tx_start_all_queues(dev);
 727        return 0;
 728}
 729
 730/* Net device close. */
 731static int tun_net_close(struct net_device *dev)
 732{
 733        netif_tx_stop_all_queues(dev);
 734        return 0;
 735}
 736
 737/* Net device start xmit */
 738static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 739{
 740        struct tun_struct *tun = netdev_priv(dev);
 741        int txq = skb->queue_mapping;
 742        struct tun_file *tfile;
 743        u32 numqueues = 0;
 744
 745        rcu_read_lock();
 746        tfile = rcu_dereference(tun->tfiles[txq]);
 747        numqueues = ACCESS_ONCE(tun->numqueues);
 748
 749        /* Drop packet if interface is not attached */
 750        if (txq >= numqueues)
 751                goto drop;
 752
 753        if (numqueues == 1) {
 754                /* Select queue was not called for the skbuff, so we extract the
 755                 * RPS hash and save it into the flow_table here.
 756                 */
 757                __u32 rxhash;
 758
 759                rxhash = skb_get_hash(skb);
 760                if (rxhash) {
 761                        struct tun_flow_entry *e;
 762                        e = tun_flow_find(&tun->flows[tun_hashfn(rxhash)],
 763                                        rxhash);
 764                        if (e)
 765                                tun_flow_save_rps_rxhash(e, rxhash);
 766                }
 767        }
 768
 769        tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len);
 770
 771        BUG_ON(!tfile);
 772
 773        /* Drop if the filter does not like it.
 774         * This is a noop if the filter is disabled.
 775         * Filter can be enabled only for the TAP devices. */
 776        if (!check_filter(&tun->txflt, skb))
 777                goto drop;
 778
 779        if (tfile->socket.sk->sk_filter &&
 780            sk_filter(tfile->socket.sk, skb))
 781                goto drop;
 782
 783        /* Limit the number of packets queued by dividing txq length with the
 784         * number of queues.
 785         */
 786        if (skb_queue_len(&tfile->socket.sk->sk_receive_queue) * numqueues
 787                          >= dev->tx_queue_len)
 788                goto drop;
 789
 790        if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
 791                goto drop;
 792
 793        if (skb->sk) {
 794                sock_tx_timestamp(skb->sk, &skb_shinfo(skb)->tx_flags);
 795                sw_tx_timestamp(skb);
 796        }
 797
 798        /* Orphan the skb - required as we might hang on to it
 799         * for indefinite time.
 800         */
 801        skb_orphan(skb);
 802
 803        nf_reset(skb);
 804
 805        /* Enqueue packet */
 806        skb_queue_tail(&tfile->socket.sk->sk_receive_queue, skb);
 807
 808        /* Notify and wake up reader process */
 809        if (tfile->flags & TUN_FASYNC)
 810                kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
 811        tfile->socket.sk->sk_data_ready(tfile->socket.sk);
 812
 813        rcu_read_unlock();
 814        return NETDEV_TX_OK;
 815
 816drop:
 817        dev->stats.tx_dropped++;
 818        skb_tx_error(skb);
 819        kfree_skb(skb);
 820        rcu_read_unlock();
 821        return NETDEV_TX_OK;
 822}
 823
 824static void tun_net_mclist(struct net_device *dev)
 825{
 826        /*
 827         * This callback is supposed to deal with mc filter in
 828         * _rx_ path and has nothing to do with the _tx_ path.
 829         * In rx path we always accept everything userspace gives us.
 830         */
 831}
 832
 833#define MIN_MTU 68
 834#define MAX_MTU 65535
 835
 836static int
 837tun_net_change_mtu(struct net_device *dev, int new_mtu)
 838{
 839        if (new_mtu < MIN_MTU || new_mtu + dev->hard_header_len > MAX_MTU)
 840                return -EINVAL;
 841        dev->mtu = new_mtu;
 842        return 0;
 843}
 844
 845static netdev_features_t tun_net_fix_features(struct net_device *dev,
 846        netdev_features_t features)
 847{
 848        struct tun_struct *tun = netdev_priv(dev);
 849
 850        return (features & tun->set_features) | (features & ~TUN_USER_FEATURES);
 851}
 852#ifdef CONFIG_NET_POLL_CONTROLLER
 853static void tun_poll_controller(struct net_device *dev)
 854{
 855        /*
 856         * Tun only receives frames when:
 857         * 1) the char device endpoint gets data from user space
 858         * 2) the tun socket gets a sendmsg call from user space
 859         * Since both of those are synchronous operations, we are guaranteed
 860         * never to have pending data when we poll for it
 861         * so there is nothing to do here but return.
 862         * We need this though so netpoll recognizes us as an interface that
 863         * supports polling, which enables bridge devices in virt setups to
 864         * still use netconsole
 865         */
 866        return;
 867}
 868#endif
/* Net device operations installed by tun_net_init() for TUN_TUN_DEV devices. */
static const struct net_device_ops tun_netdev_ops = {
	.ndo_uninit             = tun_net_uninit,
	.ndo_open               = tun_net_open,
	.ndo_stop               = tun_net_close,
	.ndo_start_xmit         = tun_net_xmit,
	.ndo_change_mtu         = tun_net_change_mtu,
	.ndo_fix_features       = tun_net_fix_features,
	.ndo_select_queue       = tun_select_queue,
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_poll_controller    = tun_poll_controller,
#endif
};
 881
/*
 * Net device operations installed by tun_net_init() for TUN_TAP_DEV
 * devices. Same handlers as tun_netdev_ops plus the rx-mode and Ethernet
 * address callbacks.
 */
static const struct net_device_ops tap_netdev_ops = {
	.ndo_uninit             = tun_net_uninit,
	.ndo_open               = tun_net_open,
	.ndo_stop               = tun_net_close,
	.ndo_start_xmit         = tun_net_xmit,
	.ndo_change_mtu         = tun_net_change_mtu,
	.ndo_fix_features       = tun_net_fix_features,
	.ndo_set_rx_mode        = tun_net_mclist,
	.ndo_set_mac_address    = eth_mac_addr,
	.ndo_validate_addr      = eth_validate_addr,
	.ndo_select_queue       = tun_select_queue,
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_poll_controller    = tun_poll_controller,
#endif
};
 897
 898static void tun_flow_init(struct tun_struct *tun)
 899{
 900        int i;
 901
 902        for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++)
 903                INIT_HLIST_HEAD(&tun->flows[i]);
 904
 905        tun->ageing_time = TUN_FLOW_EXPIRE;
 906        setup_timer(&tun->flow_gc_timer, tun_flow_cleanup, (unsigned long)tun);
 907        mod_timer(&tun->flow_gc_timer,
 908                  round_jiffies_up(jiffies + tun->ageing_time));
 909}
 910
 911static void tun_flow_uninit(struct tun_struct *tun)
 912{
 913        del_timer_sync(&tun->flow_gc_timer);
 914        tun_flow_flush(tun);
 915}
 916
 917/* Initialize net device. */
 918static void tun_net_init(struct net_device *dev)
 919{
 920        struct tun_struct *tun = netdev_priv(dev);
 921
 922        switch (tun->flags & TUN_TYPE_MASK) {
 923        case TUN_TUN_DEV:
 924                dev->netdev_ops = &tun_netdev_ops;
 925
 926                /* Point-to-Point TUN Device */
 927                dev->hard_header_len = 0;
 928                dev->addr_len = 0;
 929                dev->mtu = 1500;
 930
 931                /* Zero header length */
 932                dev->type = ARPHRD_NONE;
 933                dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
 934                dev->tx_queue_len = TUN_READQ_SIZE;  /* We prefer our own queue length */
 935                break;
 936
 937        case TUN_TAP_DEV:
 938                dev->netdev_ops = &tap_netdev_ops;
 939                /* Ethernet TAP Device */
 940                ether_setup(dev);
 941                dev->priv_flags &= ~IFF_TX_SKB_SHARING;
 942                dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
 943
 944                eth_hw_addr_random(dev);
 945
 946                dev->tx_queue_len = TUN_READQ_SIZE;  /* We prefer our own queue length */
 947                break;
 948        }
 949}
 950
 951/* Character device part */
 952
 953/* Poll */
 954static unsigned int tun_chr_poll(struct file *file, poll_table *wait)
 955{
 956        struct tun_file *tfile = file->private_data;
 957        struct tun_struct *tun = __tun_get(tfile);
 958        struct sock *sk;
 959        unsigned int mask = 0;
 960
 961        if (!tun)
 962                return POLLERR;
 963
 964        sk = tfile->socket.sk;
 965
 966        tun_debug(KERN_INFO, tun, "tun_chr_poll\n");
 967
 968        poll_wait(file, sk_sleep(sk), wait);
 969
 970        if (!skb_queue_empty(&sk->sk_receive_queue))
 971                mask |= POLLIN | POLLRDNORM;
 972
 973        if (sock_writeable(sk) ||
 974            (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags) &&
 975             sock_writeable(sk)))
 976                mask |= POLLOUT | POLLWRNORM;
 977
 978        if (tun->dev->reg_state != NETREG_REGISTERED)
 979                mask = POLLERR;
 980
 981        tun_put(tun);
 982        return mask;
 983}
 984
 985/* prepad is the amount to reserve at front.  len is length after that.
 986 * linear is a hint as to how much to copy (usually headers). */
 987static struct sk_buff *tun_alloc_skb(struct tun_file *tfile,
 988                                     size_t prepad, size_t len,
 989                                     size_t linear, int noblock)
 990{
 991        struct sock *sk = tfile->socket.sk;
 992        struct sk_buff *skb;
 993        int err;
 994
 995        /* Under a page?  Don't bother with paged skb. */
 996        if (prepad + len < PAGE_SIZE || !linear)
 997                linear = len;
 998
 999        skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
1000                                   &err, 0);
1001        if (!skb)
1002                return ERR_PTR(err);
1003
1004        skb_reserve(skb, prepad);
1005        skb_put(skb, linear);
1006        skb->data_len = len - linear;
1007        skb->len += len - linear;
1008
1009        return skb;
1010}
1011
1012/* Get packet from user space buffer */
1013static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
1014                            void *msg_control, const struct iovec *iv,
1015                            size_t total_len, size_t count, int noblock)
1016{
1017        struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) };
1018        struct sk_buff *skb;
1019        size_t len = total_len, align = NET_SKB_PAD, linear;
1020        struct virtio_net_hdr gso = { 0 };
1021        int good_linear;
1022        int offset = 0;
1023        int copylen;
1024        bool zerocopy = false;
1025        int err;
1026        u32 rxhash;
1027
1028        if (!(tun->flags & TUN_NO_PI)) {
1029                if (len < sizeof(pi))
1030                        return -EINVAL;
1031                len -= sizeof(pi);
1032
1033                if (memcpy_fromiovecend((void *)&pi, iv, 0, sizeof(pi)))
1034                        return -EFAULT;
1035                offset += sizeof(pi);
1036        }
1037
1038        if (tun->flags & TUN_VNET_HDR) {
1039                if (len < tun->vnet_hdr_sz)
1040                        return -EINVAL;
1041                len -= tun->vnet_hdr_sz;
1042
1043                if (memcpy_fromiovecend((void *)&gso, iv, offset, sizeof(gso)))
1044                        return -EFAULT;
1045
1046                if ((gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
1047                    gso.csum_start + gso.csum_offset + 2 > gso.hdr_len)
1048                        gso.hdr_len = gso.csum_start + gso.csum_offset + 2;
1049
1050                if (gso.hdr_len > len)
1051                        return -EINVAL;
1052                offset += tun->vnet_hdr_sz;
1053        }
1054
1055        if ((tun->flags & TUN_TYPE_MASK) == TUN_TAP_DEV) {
1056                align += NET_IP_ALIGN;
1057                if (unlikely(len < ETH_HLEN ||
1058                             (gso.hdr_len && gso.hdr_len < ETH_HLEN)))
1059                        return -EINVAL;
1060        }
1061
1062        good_linear = SKB_MAX_HEAD(align);
1063
1064        if (msg_control) {
1065                /* There are 256 bytes to be copied in skb, so there is
1066                 * enough room for skb expand head in case it is used.
1067                 * The rest of the buffer is mapped from userspace.
1068                 */
1069                copylen = gso.hdr_len ? gso.hdr_len : GOODCOPY_LEN;
1070                if (copylen > good_linear)
1071                        copylen = good_linear;
1072                linear = copylen;
1073                if (iov_pages(iv, offset + copylen, count) <= MAX_SKB_FRAGS)
1074                        zerocopy = true;
1075        }
1076
1077        if (!zerocopy) {
1078                copylen = len;
1079                if (gso.hdr_len > good_linear)
1080                        linear = good_linear;
1081                else
1082                        linear = gso.hdr_len;
1083        }
1084
1085        skb = tun_alloc_skb(tfile, align, copylen, linear, noblock);
1086        if (IS_ERR(skb)) {
1087                if (PTR_ERR(skb) != -EAGAIN)
1088                        tun->dev->stats.rx_dropped++;
1089                return PTR_ERR(skb);
1090        }
1091
1092        if (zerocopy)
1093                err = zerocopy_sg_from_iovec(skb, iv, offset, count);
1094        else {
1095                err = skb_copy_datagram_from_iovec(skb, 0, iv, offset, len);
1096                if (!err && msg_control) {
1097                        struct ubuf_info *uarg = msg_control;
1098                        uarg->callback(uarg, false);
1099                }
1100        }
1101
1102        if (err) {
1103                tun->dev->stats.rx_dropped++;
1104                kfree_skb(skb);
1105                return -EFAULT;
1106        }
1107
1108        if (gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
1109                if (!skb_partial_csum_set(skb, gso.csum_start,
1110                                          gso.csum_offset)) {
1111                        tun->dev->stats.rx_frame_errors++;
1112                        kfree_skb(skb);
1113                        return -EINVAL;
1114                }
1115        }
1116
1117        switch (tun->flags & TUN_TYPE_MASK) {
1118        case TUN_TUN_DEV:
1119                if (tun->flags & TUN_NO_PI) {
1120                        switch (skb->data[0] & 0xf0) {
1121                        case 0x40:
1122                                pi.proto = htons(ETH_P_IP);
1123                                break;
1124                        case 0x60:
1125                                pi.proto = htons(ETH_P_IPV6);
1126                                break;
1127                        default:
1128                                tun->dev->stats.rx_dropped++;
1129                                kfree_skb(skb);
1130                                return -EINVAL;
1131                        }
1132                }
1133
1134                skb_reset_mac_header(skb);
1135                skb->protocol = pi.proto;
1136                skb->dev = tun->dev;
1137                break;
1138        case TUN_TAP_DEV:
1139                skb->protocol = eth_type_trans(skb, tun->dev);
1140                break;
1141        }
1142
1143        skb_reset_network_header(skb);
1144
1145        if (gso.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
1146                pr_debug("GSO!\n");
1147                switch (gso.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
1148                case VIRTIO_NET_HDR_GSO_TCPV4:
1149                        skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
1150                        break;
1151                case VIRTIO_NET_HDR_GSO_TCPV6:
1152                        skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6;
1153                        break;
1154                case VIRTIO_NET_HDR_GSO_UDP:
1155                {
1156                        static bool warned;
1157
1158                        if (!warned) {
1159                                warned = true;
1160                                netdev_warn(tun->dev,
1161                                            "%s: using disabled UFO feature; please fix this program\n",
1162                                            current->comm);
1163                        }
1164                        skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1165                        if (skb->protocol == htons(ETH_P_IPV6))
1166                                ipv6_proxy_select_ident(skb);
1167                        break;
1168                }
1169                default:
1170                        tun->dev->stats.rx_frame_errors++;
1171                        kfree_skb(skb);
1172                        return -EINVAL;
1173                }
1174
1175                if (gso.gso_type & VIRTIO_NET_HDR_GSO_ECN)
1176                        skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
1177
1178                skb_shinfo(skb)->gso_size = gso.gso_size;
1179                if (skb_shinfo(skb)->gso_size == 0) {
1180                        tun->dev->stats.rx_frame_errors++;
1181                        kfree_skb(skb);
1182                        return -EINVAL;
1183                }
1184
1185                /* Header must be checked, and gso_segs computed. */
1186                skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
1187                skb_shinfo(skb)->gso_segs = 0;
1188        }
1189
1190        /* copy skb_ubuf_info for callback when skb has no error */
1191        if (zerocopy) {
1192                skb_shinfo(skb)->destructor_arg = msg_control;
1193                skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
1194                skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
1195        }
1196
1197        skb_probe_transport_header(skb, 0);
1198
1199        rxhash = skb_get_hash(skb);
1200        netif_rx_ni(skb);
1201
1202        tun->dev->stats.rx_packets++;
1203        tun->dev->stats.rx_bytes += len;
1204
1205        tun_flow_update(tun, rxhash, tfile);
1206        return total_len;
1207}
1208
1209static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
1210                              unsigned long count, loff_t pos)
1211{
1212        struct file *file = iocb->ki_filp;
1213        struct tun_struct *tun = tun_get(file);
1214        struct tun_file *tfile = file->private_data;
1215        ssize_t result;
1216
1217        if (!tun)
1218                return -EBADFD;
1219
1220        tun_debug(KERN_INFO, tun, "tun_chr_write %ld\n", count);
1221
1222        result = tun_get_user(tun, tfile, NULL, iv, iov_length(iv, count),
1223                              count, file->f_flags & O_NONBLOCK);
1224
1225        tun_put(tun);
1226        return result;
1227}
1228
1229/* Put packet to the user space buffer */
1230static ssize_t tun_put_user(struct tun_struct *tun,
1231                            struct tun_file *tfile,
1232                            struct sk_buff *skb,
1233                            const struct iovec *iv, int len)
1234{
1235        struct tun_pi pi = { 0, skb->protocol };
1236        ssize_t total = 0;
1237        int vlan_offset = 0, copied;
1238        int vlan_hlen = 0;
1239        int vnet_hdr_sz = 0;
1240
1241        if (vlan_tx_tag_present(skb))
1242                vlan_hlen = VLAN_HLEN;
1243
1244        if (tun->flags & TUN_VNET_HDR)
1245                vnet_hdr_sz = tun->vnet_hdr_sz;
1246
1247        if (!(tun->flags & TUN_NO_PI)) {
1248                if ((len -= sizeof(pi)) < 0)
1249                        return -EINVAL;
1250
1251                if (len < skb->len + vlan_hlen + vnet_hdr_sz) {
1252                        /* Packet will be striped */
1253                        pi.flags |= TUN_PKT_STRIP;
1254                }
1255
1256                if (memcpy_toiovecend(iv, (void *) &pi, 0, sizeof(pi)))
1257                        return -EFAULT;
1258                total += sizeof(pi);
1259        }
1260
1261        if (vnet_hdr_sz) {
1262                struct virtio_net_hdr gso = { 0 }; /* no info leak */
1263                if ((len -= vnet_hdr_sz) < 0)
1264                        return -EINVAL;
1265
1266                if (skb_is_gso(skb)) {
1267                        struct skb_shared_info *sinfo = skb_shinfo(skb);
1268
1269                        /* This is a hint as to how much should be linear. */
1270                        gso.hdr_len = skb_headlen(skb);
1271                        gso.gso_size = sinfo->gso_size;
1272                        if (sinfo->gso_type & SKB_GSO_TCPV4)
1273                                gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
1274                        else if (sinfo->gso_type & SKB_GSO_TCPV6)
1275                                gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
1276                        else {
1277                                pr_err("unexpected GSO type: "
1278                                       "0x%x, gso_size %d, hdr_len %d\n",
1279                                       sinfo->gso_type, gso.gso_size,
1280                                       gso.hdr_len);
1281                                print_hex_dump(KERN_ERR, "tun: ",
1282                                               DUMP_PREFIX_NONE,
1283                                               16, 1, skb->head,
1284                                               min((int)gso.hdr_len, 64), true);
1285                                WARN_ON_ONCE(1);
1286                                return -EINVAL;
1287                        }
1288                        if (sinfo->gso_type & SKB_GSO_TCP_ECN)
1289                                gso.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
1290                } else
1291                        gso.gso_type = VIRTIO_NET_HDR_GSO_NONE;
1292
1293                if (skb->ip_summed == CHECKSUM_PARTIAL) {
1294                        gso.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
1295                        gso.csum_start = skb_checksum_start_offset(skb) +
1296                                         vlan_hlen;
1297                        gso.csum_offset = skb->csum_offset;
1298                } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
1299                        gso.flags = VIRTIO_NET_HDR_F_DATA_VALID;
1300                } /* else everything is zero */
1301
1302                if (unlikely(memcpy_toiovecend(iv, (void *)&gso, total,
1303                                               sizeof(gso))))
1304                        return -EFAULT;
1305                total += vnet_hdr_sz;
1306        }
1307
1308        copied = total;
1309        len = min_t(int, skb->len + vlan_hlen, len);
1310        total += skb->len + vlan_hlen;
1311        if (vlan_hlen) {
1312                int copy, ret;
1313                struct {
1314                        __be16 h_vlan_proto;
1315                        __be16 h_vlan_TCI;
1316                } veth;
1317
1318                veth.h_vlan_proto = skb->vlan_proto;
1319                veth.h_vlan_TCI = htons(vlan_tx_tag_get(skb));
1320
1321                vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
1322
1323                copy = min_t(int, vlan_offset, len);
1324                ret = skb_copy_datagram_const_iovec(skb, 0, iv, copied, copy);
1325                len -= copy;
1326                copied += copy;
1327                if (ret || !len)
1328                        goto done;
1329
1330                copy = min_t(int, sizeof(veth), len);
1331                ret = memcpy_toiovecend(iv, (void *)&veth, copied, copy);
1332                len -= copy;
1333                copied += copy;
1334                if (ret || !len)
1335                        goto done;
1336        }
1337
1338        skb_copy_datagram_const_iovec(skb, vlan_offset, iv, copied, len);
1339
1340done:
1341        tun->dev->stats.tx_packets++;
1342        tun->dev->stats.tx_bytes += len;
1343
1344        return total;
1345}
1346
1347static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
1348                           const struct iovec *iv, ssize_t len, int noblock)
1349{
1350        struct sk_buff *skb;
1351        ssize_t ret = 0;
1352        int peeked, err, off = 0;
1353
1354        tun_debug(KERN_INFO, tun, "tun_do_read\n");
1355
1356        if (!len)
1357                return ret;
1358
1359        if (tun->dev->reg_state != NETREG_REGISTERED)
1360                return -EIO;
1361
1362        /* Read frames from queue */
1363        skb = __skb_recv_datagram(tfile->socket.sk, noblock ? MSG_DONTWAIT : 0,
1364                                  &peeked, &off, &err);
1365        if (skb) {
1366                ret = tun_put_user(tun, tfile, skb, iv, len);
1367                kfree_skb(skb);
1368        } else
1369                ret = err;
1370
1371        return ret;
1372}
1373
1374static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv,
1375                            unsigned long count, loff_t pos)
1376{
1377        struct file *file = iocb->ki_filp;
1378        struct tun_file *tfile = file->private_data;
1379        struct tun_struct *tun = __tun_get(tfile);
1380        ssize_t len, ret;
1381
1382        if (!tun)
1383                return -EBADFD;
1384        len = iov_length(iv, count);
1385        if (len < 0) {
1386                ret = -EINVAL;
1387                goto out;
1388        }
1389
1390        ret = tun_do_read(tun, tfile, iv, len,
1391                          file->f_flags & O_NONBLOCK);
1392        ret = min_t(ssize_t, ret, len);
1393        if (ret > 0)
1394                iocb->ki_pos = ret;
1395out:
1396        tun_put(tun);
1397        return ret;
1398}
1399
1400static void tun_free_netdev(struct net_device *dev)
1401{
1402        struct tun_struct *tun = netdev_priv(dev);
1403
1404        BUG_ON(!(list_empty(&tun->disabled)));
1405        tun_flow_uninit(tun);
1406        security_tun_dev_free_security(tun->security);
1407        free_netdev(dev);
1408}
1409
1410static void tun_setup(struct net_device *dev)
1411{
1412        struct tun_struct *tun = netdev_priv(dev);
1413
1414        tun->owner = INVALID_UID;
1415        tun->group = INVALID_GID;
1416
1417        dev->ethtool_ops = &tun_ethtool_ops;
1418        dev->destructor = tun_free_netdev;
1419}
1420
1421/* Trivial set of netlink ops to allow deleting tun or tap
1422 * device with netlink.
1423 */
1424static int tun_validate(struct nlattr *tb[], struct nlattr *data[])
1425{
1426        return -EINVAL;
1427}
1428
1429static struct rtnl_link_ops tun_link_ops __read_mostly = {
1430        .kind           = DRV_NAME,
1431        .priv_size      = sizeof(struct tun_struct),
1432        .setup          = tun_setup,
1433        .validate       = tun_validate,
1434};
1435
1436static void tun_sock_write_space(struct sock *sk)
1437{
1438        struct tun_file *tfile;
1439        wait_queue_head_t *wqueue;
1440
1441        if (!sock_writeable(sk))
1442                return;
1443
1444        if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags))
1445                return;
1446
1447        wqueue = sk_sleep(sk);
1448        if (wqueue && waitqueue_active(wqueue))
1449                wake_up_interruptible_sync_poll(wqueue, POLLOUT |
1450                                                POLLWRNORM | POLLWRBAND);
1451
1452        tfile = container_of(sk, struct tun_file, sk);
1453        kill_fasync(&tfile->fasync, SIGIO, POLL_OUT);
1454}
1455
1456static int tun_sendmsg(struct kiocb *iocb, struct socket *sock,
1457                       struct msghdr *m, size_t total_len)
1458{
1459        int ret;
1460        struct tun_file *tfile = container_of(sock, struct tun_file, socket);
1461        struct tun_struct *tun = __tun_get(tfile);
1462
1463        if (!tun)
1464                return -EBADFD;
1465        ret = tun_get_user(tun, tfile, m->msg_control, m->msg_iov, total_len,
1466                           m->msg_iovlen, m->msg_flags & MSG_DONTWAIT);
1467        tun_put(tun);
1468        return ret;
1469}
1470
1471static int tun_recvmsg(struct kiocb *iocb, struct socket *sock,
1472                       struct msghdr *m, size_t total_len,
1473                       int flags)
1474{
1475        struct tun_file *tfile = container_of(sock, struct tun_file, socket);
1476        struct tun_struct *tun = __tun_get(tfile);
1477        int ret;
1478
1479        if (!tun)
1480                return -EBADFD;
1481
1482        if (flags & ~(MSG_DONTWAIT|MSG_TRUNC|MSG_ERRQUEUE)) {
1483                ret = -EINVAL;
1484                goto out;
1485        }
1486        if (flags & MSG_ERRQUEUE) {
1487                ret = sock_recv_errqueue(sock->sk, m, total_len,
1488                                         SOL_PACKET, TUN_TX_TIMESTAMP);
1489                goto out;
1490        }
1491        ret = tun_do_read(tun, tfile, m->msg_iov, total_len,
1492                          flags & MSG_DONTWAIT);
1493        if (ret > total_len) {
1494                m->msg_flags |= MSG_TRUNC;
1495                ret = flags & MSG_TRUNC ? ret : total_len;
1496        }
1497out:
1498        tun_put(tun);
1499        return ret;
1500}
1501
1502static int tun_release(struct socket *sock)
1503{
1504        if (sock->sk)
1505                sock_put(sock->sk);
1506        return 0;
1507}
1508
1509/* Ops structure to mimic raw sockets with tun */
1510static const struct proto_ops tun_socket_ops = {
1511        .sendmsg = tun_sendmsg,
1512        .recvmsg = tun_recvmsg,
1513        .release = tun_release,
1514};
1515
1516static struct proto tun_proto = {
1517        .name           = "tun",
1518        .owner          = THIS_MODULE,
1519        .obj_size       = sizeof(struct tun_file),
1520};
1521
1522static int tun_flags(struct tun_struct *tun)
1523{
1524        int flags = 0;
1525
1526        if (tun->flags & TUN_TUN_DEV)
1527                flags |= IFF_TUN;
1528        else
1529                flags |= IFF_TAP;
1530
1531        if (tun->flags & TUN_NO_PI)
1532                flags |= IFF_NO_PI;
1533
1534        /* This flag has no real effect.  We track the value for backwards
1535         * compatibility.
1536         */
1537        if (tun->flags & TUN_ONE_QUEUE)
1538                flags |= IFF_ONE_QUEUE;
1539
1540        if (tun->flags & TUN_VNET_HDR)
1541                flags |= IFF_VNET_HDR;
1542
1543        if (tun->flags & TUN_TAP_MQ)
1544                flags |= IFF_MULTI_QUEUE;
1545
1546        if (tun->flags & TUN_PERSIST)
1547                flags |= IFF_PERSIST;
1548
1549        return flags;
1550}
1551
1552static ssize_t tun_show_flags(struct device *dev, struct device_attribute *attr,
1553                              char *buf)
1554{
1555        struct tun_struct *tun = netdev_priv(to_net_dev(dev));
1556        return sprintf(buf, "0x%x\n", tun_flags(tun));
1557}
1558
1559static ssize_t tun_show_owner(struct device *dev, struct device_attribute *attr,
1560                              char *buf)
1561{
1562        struct tun_struct *tun = netdev_priv(to_net_dev(dev));
1563        return uid_valid(tun->owner)?
1564                sprintf(buf, "%u\n",
1565                        from_kuid_munged(current_user_ns(), tun->owner)):
1566                sprintf(buf, "-1\n");
1567}
1568
1569static ssize_t tun_show_group(struct device *dev, struct device_attribute *attr,
1570                              char *buf)
1571{
1572        struct tun_struct *tun = netdev_priv(to_net_dev(dev));
1573        return gid_valid(tun->group) ?
1574                sprintf(buf, "%u\n",
1575                        from_kgid_munged(current_user_ns(), tun->group)):
1576                sprintf(buf, "-1\n");
1577}
1578
1579static DEVICE_ATTR(tun_flags, 0444, tun_show_flags, NULL);
1580static DEVICE_ATTR(owner, 0444, tun_show_owner, NULL);
1581static DEVICE_ATTR(group, 0444, tun_show_group, NULL);
1582
1583static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
1584{
1585        struct tun_struct *tun;
1586        struct tun_file *tfile = file->private_data;
1587        struct net_device *dev;
1588        int err;
1589
1590        if (tfile->detached)
1591                return -EINVAL;
1592
1593        dev = __dev_get_by_name(net, ifr->ifr_name);
1594        if (dev) {
1595                if (ifr->ifr_flags & IFF_TUN_EXCL)
1596                        return -EBUSY;
1597                if ((ifr->ifr_flags & IFF_TUN) && dev->netdev_ops == &tun_netdev_ops)
1598                        tun = netdev_priv(dev);
1599                else if ((ifr->ifr_flags & IFF_TAP) && dev->netdev_ops == &tap_netdev_ops)
1600                        tun = netdev_priv(dev);
1601                else
1602                        return -EINVAL;
1603
1604                if (!!(ifr->ifr_flags & IFF_MULTI_QUEUE) !=
1605                    !!(tun->flags & TUN_TAP_MQ))
1606                        return -EINVAL;
1607
1608                if (tun_not_capable(tun))
1609                        return -EPERM;
1610                err = security_tun_dev_open(tun->security);
1611                if (err < 0)
1612                        return err;
1613
1614                err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER);
1615                if (err < 0)
1616                        return err;
1617
1618                if (tun->flags & TUN_TAP_MQ &&
1619                    (tun->numqueues + tun->numdisabled > 1)) {
1620                        /* One or more queue has already been attached, no need
1621                         * to initialize the device again.
1622                         */
1623                        return 0;
1624                }
1625        }
1626        else {
1627                char *name;
1628                unsigned long flags = 0;
1629                int queues = ifr->ifr_flags & IFF_MULTI_QUEUE ?
1630                             MAX_TAP_QUEUES : 1;
1631
1632                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
1633                        return -EPERM;
1634                err = security_tun_dev_create();
1635                if (err < 0)
1636                        return err;
1637
1638                /* Set dev type */
1639                if (ifr->ifr_flags & IFF_TUN) {
1640                        /* TUN device */
1641                        flags |= TUN_TUN_DEV;
1642                        name = "tun%d";
1643                } else if (ifr->ifr_flags & IFF_TAP) {
1644                        /* TAP device */
1645                        flags |= TUN_TAP_DEV;
1646                        name = "tap%d";
1647                } else
1648                        return -EINVAL;
1649
1650                if (*ifr->ifr_name)
1651                        name = ifr->ifr_name;
1652
1653                dev = alloc_netdev_mqs(sizeof(struct tun_struct), name,
1654                                       NET_NAME_UNKNOWN, tun_setup, queues,
1655                                       queues);
1656
1657                if (!dev)
1658                        return -ENOMEM;
1659
1660                dev_net_set(dev, net);
1661                dev->rtnl_link_ops = &tun_link_ops;
1662                dev->ifindex = tfile->ifindex;
1663
1664                tun = netdev_priv(dev);
1665                tun->dev = dev;
1666                tun->flags = flags;
1667                tun->txflt.count = 0;
1668                tun->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
1669
1670                tun->filter_attached = false;
1671                tun->sndbuf = tfile->socket.sk->sk_sndbuf;
1672
1673                spin_lock_init(&tun->lock);
1674
1675                err = security_tun_dev_alloc_security(&tun->security);
1676                if (err < 0)
1677                        goto err_free_dev;
1678
1679                tun_net_init(dev);
1680                tun_flow_init(tun);
1681
1682                dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
1683                                   TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX |
1684                                   NETIF_F_HW_VLAN_STAG_TX;
1685                dev->features = dev->hw_features;
1686                dev->vlan_features = dev->features &
1687                                     ~(NETIF_F_HW_VLAN_CTAG_TX |
1688                                       NETIF_F_HW_VLAN_STAG_TX);
1689
1690                INIT_LIST_HEAD(&tun->disabled);
1691                err = tun_attach(tun, file, false);
1692                if (err < 0)
1693                        goto err_free_flow;
1694
1695                err = register_netdevice(tun->dev);
1696                if (err < 0)
1697                        goto err_detach;
1698
1699                if (device_create_file(&tun->dev->dev, &dev_attr_tun_flags) ||
1700                    device_create_file(&tun->dev->dev, &dev_attr_owner) ||
1701                    device_create_file(&tun->dev->dev, &dev_attr_group))
1702                        pr_err("Failed to create tun sysfs files\n");
1703        }
1704
1705        netif_carrier_on(tun->dev);
1706
1707        tun_debug(KERN_INFO, tun, "tun_set_iff\n");
1708
1709        if (ifr->ifr_flags & IFF_NO_PI)
1710                tun->flags |= TUN_NO_PI;
1711        else
1712                tun->flags &= ~TUN_NO_PI;
1713
1714        /* This flag has no real effect.  We track the value for backwards
1715         * compatibility.
1716         */
1717        if (ifr->ifr_flags & IFF_ONE_QUEUE)
1718                tun->flags |= TUN_ONE_QUEUE;
1719        else
1720                tun->flags &= ~TUN_ONE_QUEUE;
1721
1722        if (ifr->ifr_flags & IFF_VNET_HDR)
1723                tun->flags |= TUN_VNET_HDR;
1724        else
1725                tun->flags &= ~TUN_VNET_HDR;
1726
1727        if (ifr->ifr_flags & IFF_MULTI_QUEUE)
1728                tun->flags |= TUN_TAP_MQ;
1729        else
1730                tun->flags &= ~TUN_TAP_MQ;
1731
1732        /* Make sure persistent devices do not get stuck in
1733         * xoff state.
1734         */
1735        if (netif_running(tun->dev))
1736                netif_tx_wake_all_queues(tun->dev);
1737
1738        strcpy(ifr->ifr_name, tun->dev->name);
1739        return 0;
1740
1741err_detach:
1742        tun_detach_all(dev);
1743err_free_flow:
1744        tun_flow_uninit(tun);
1745        security_tun_dev_free_security(tun->security);
1746err_free_dev:
1747        free_netdev(dev);
1748        return err;
1749}
1750
1751static void tun_get_iff(struct net *net, struct tun_struct *tun,
1752                       struct ifreq *ifr)
1753{
1754        tun_debug(KERN_INFO, tun, "tun_get_iff\n");
1755
1756        strcpy(ifr->ifr_name, tun->dev->name);
1757
1758        ifr->ifr_flags = tun_flags(tun);
1759
1760}
1761
1762/* This is like a cut-down ethtool ops, except done via tun fd so no
1763 * privs required. */
1764static int set_offload(struct tun_struct *tun, unsigned long arg)
1765{
1766        netdev_features_t features = 0;
1767
1768        if (arg & TUN_F_CSUM) {
1769                features |= NETIF_F_HW_CSUM;
1770                arg &= ~TUN_F_CSUM;
1771
1772                if (arg & (TUN_F_TSO4|TUN_F_TSO6)) {
1773                        if (arg & TUN_F_TSO_ECN) {
1774                                features |= NETIF_F_TSO_ECN;
1775                                arg &= ~TUN_F_TSO_ECN;
1776                        }
1777                        if (arg & TUN_F_TSO4)
1778                                features |= NETIF_F_TSO;
1779                        if (arg & TUN_F_TSO6)
1780                                features |= NETIF_F_TSO6;
1781                        arg &= ~(TUN_F_TSO4|TUN_F_TSO6);
1782                }
1783        }
1784
1785        /* This gives the user a way to test for new features in future by
1786         * trying to set them. */
1787        if (arg)
1788                return -EINVAL;
1789
1790        tun->set_features = features;
1791        netdev_update_features(tun->dev);
1792
1793        return 0;
1794}
1795
1796static void tun_detach_filter(struct tun_struct *tun, int n)
1797{
1798        int i;
1799        struct tun_file *tfile;
1800
1801        for (i = 0; i < n; i++) {
1802                tfile = rtnl_dereference(tun->tfiles[i]);
1803                sk_detach_filter(tfile->socket.sk);
1804        }
1805
1806        tun->filter_attached = false;
1807}
1808
1809static int tun_attach_filter(struct tun_struct *tun)
1810{
1811        int i, ret = 0;
1812        struct tun_file *tfile;
1813
1814        for (i = 0; i < tun->numqueues; i++) {
1815                tfile = rtnl_dereference(tun->tfiles[i]);
1816                ret = sk_attach_filter(&tun->fprog, tfile->socket.sk);
1817                if (ret) {
1818                        tun_detach_filter(tun, i);
1819                        return ret;
1820                }
1821        }
1822
1823        tun->filter_attached = true;
1824        return ret;
1825}
1826
1827static void tun_set_sndbuf(struct tun_struct *tun)
1828{
1829        struct tun_file *tfile;
1830        int i;
1831
1832        for (i = 0; i < tun->numqueues; i++) {
1833                tfile = rtnl_dereference(tun->tfiles[i]);
1834                tfile->socket.sk->sk_sndbuf = tun->sndbuf;
1835        }
1836}
1837
1838static int tun_set_queue(struct file *file, struct ifreq *ifr)
1839{
1840        struct tun_file *tfile = file->private_data;
1841        struct tun_struct *tun;
1842        int ret = 0;
1843
1844        rtnl_lock();
1845
1846        if (ifr->ifr_flags & IFF_ATTACH_QUEUE) {
1847                tun = tfile->detached;
1848                if (!tun) {
1849                        ret = -EINVAL;
1850                        goto unlock;
1851                }
1852                ret = security_tun_dev_attach_queue(tun->security);
1853                if (ret < 0)
1854                        goto unlock;
1855                ret = tun_attach(tun, file, false);
1856        } else if (ifr->ifr_flags & IFF_DETACH_QUEUE) {
1857                tun = rtnl_dereference(tfile->tun);
1858                if (!tun || !(tun->flags & TUN_TAP_MQ) || tfile->detached)
1859                        ret = -EINVAL;
1860                else
1861                        __tun_detach(tfile, false);
1862        } else
1863                ret = -EINVAL;
1864
1865unlock:
1866        rtnl_unlock();
1867        return ret;
1868}
1869
1870static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
1871                            unsigned long arg, int ifreq_len)
1872{
1873        struct tun_file *tfile = file->private_data;
1874        struct tun_struct *tun;
1875        void __user* argp = (void __user*)arg;
1876        struct ifreq ifr;
1877        kuid_t owner;
1878        kgid_t group;
1879        int sndbuf;
1880        int vnet_hdr_sz;
1881        unsigned int ifindex;
1882        int ret;
1883
1884        if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || _IOC_TYPE(cmd) == 0x89) {
1885                if (copy_from_user(&ifr, argp, ifreq_len))
1886                        return -EFAULT;
1887        } else {
1888                memset(&ifr, 0, sizeof(ifr));
1889        }
1890        if (cmd == TUNGETFEATURES) {
1891                /* Currently this just means: "what IFF flags are valid?".
1892                 * This is needed because we never checked for invalid flags on
1893                 * TUNSETIFF. */
1894                return put_user(IFF_TUN | IFF_TAP | IFF_NO_PI | IFF_ONE_QUEUE |
1895                                IFF_VNET_HDR | IFF_MULTI_QUEUE,
1896                                (unsigned int __user*)argp);
1897        } else if (cmd == TUNSETQUEUE)
1898                return tun_set_queue(file, &ifr);
1899
1900        ret = 0;
1901        rtnl_lock();
1902
1903        tun = __tun_get(tfile);
1904        if (cmd == TUNSETIFF && !tun) {
1905                ifr.ifr_name[IFNAMSIZ-1] = '\0';
1906
1907                ret = tun_set_iff(tfile->net, file, &ifr);
1908
1909                if (ret)
1910                        goto unlock;
1911
1912                if (copy_to_user(argp, &ifr, ifreq_len))
1913                        ret = -EFAULT;
1914                goto unlock;
1915        }
1916        if (cmd == TUNSETIFINDEX) {
1917                ret = -EPERM;
1918                if (tun)
1919                        goto unlock;
1920
1921                ret = -EFAULT;
1922                if (copy_from_user(&ifindex, argp, sizeof(ifindex)))
1923                        goto unlock;
1924
1925                ret = 0;
1926                tfile->ifindex = ifindex;
1927                goto unlock;
1928        }
1929
1930        ret = -EBADFD;
1931        if (!tun)
1932                goto unlock;
1933
1934        tun_debug(KERN_INFO, tun, "tun_chr_ioctl cmd %u\n", cmd);
1935
1936        ret = 0;
1937        switch (cmd) {
1938        case TUNGETIFF:
1939                tun_get_iff(current->nsproxy->net_ns, tun, &ifr);
1940
1941                if (tfile->detached)
1942                        ifr.ifr_flags |= IFF_DETACH_QUEUE;
1943                if (!tfile->socket.sk->sk_filter)
1944                        ifr.ifr_flags |= IFF_NOFILTER;
1945
1946                if (copy_to_user(argp, &ifr, ifreq_len))
1947                        ret = -EFAULT;
1948                break;
1949
1950        case TUNSETNOCSUM:
1951                /* Disable/Enable checksum */
1952
1953                /* [unimplemented] */
1954                tun_debug(KERN_INFO, tun, "ignored: set checksum %s\n",
1955                          arg ? "disabled" : "enabled");
1956                break;
1957
1958        case TUNSETPERSIST:
1959                /* Disable/Enable persist mode. Keep an extra reference to the
1960                 * module to prevent the module being unprobed.
1961                 */
1962                if (arg && !(tun->flags & TUN_PERSIST)) {
1963                        tun->flags |= TUN_PERSIST;
1964                        __module_get(THIS_MODULE);
1965                }
1966                if (!arg && (tun->flags & TUN_PERSIST)) {
1967                        tun->flags &= ~TUN_PERSIST;
1968                        module_put(THIS_MODULE);
1969                }
1970
1971                tun_debug(KERN_INFO, tun, "persist %s\n",
1972                          arg ? "enabled" : "disabled");
1973                break;
1974
1975        case TUNSETOWNER:
1976                /* Set owner of the device */
1977                owner = make_kuid(current_user_ns(), arg);
1978                if (!uid_valid(owner)) {
1979                        ret = -EINVAL;
1980                        break;
1981                }
1982                tun->owner = owner;
1983                tun_debug(KERN_INFO, tun, "owner set to %u\n",
1984                          from_kuid(&init_user_ns, tun->owner));
1985                break;
1986
1987        case TUNSETGROUP:
1988                /* Set group of the device */
1989                group = make_kgid(current_user_ns(), arg);
1990                if (!gid_valid(group)) {
1991                        ret = -EINVAL;
1992                        break;
1993                }
1994                tun->group = group;
1995                tun_debug(KERN_INFO, tun, "group set to %u\n",
1996                          from_kgid(&init_user_ns, tun->group));
1997                break;
1998
1999        case TUNSETLINK:
2000                /* Only allow setting the type when the interface is down */
2001                if (tun->dev->flags & IFF_UP) {
2002                        tun_debug(KERN_INFO, tun,
2003                                  "Linktype set failed because interface is up\n");
2004                        ret = -EBUSY;
2005                } else {
2006                        tun->dev->type = (int) arg;
2007                        tun_debug(KERN_INFO, tun, "linktype set to %d\n",
2008                                  tun->dev->type);
2009                        ret = 0;
2010                }
2011                break;
2012
2013#ifdef TUN_DEBUG
2014        case TUNSETDEBUG:
2015                tun->debug = arg;
2016                break;
2017#endif
2018        case TUNSETOFFLOAD:
2019                ret = set_offload(tun, arg);
2020                break;
2021
2022        case TUNSETTXFILTER:
2023                /* Can be set only for TAPs */
2024                ret = -EINVAL;
2025                if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV)
2026                        break;
2027                ret = update_filter(&tun->txflt, (void __user *)arg);
2028                break;
2029
2030        case SIOCGIFHWADDR:
2031                /* Get hw address */
2032                memcpy(ifr.ifr_hwaddr.sa_data, tun->dev->dev_addr, ETH_ALEN);
2033                ifr.ifr_hwaddr.sa_family = tun->dev->type;
2034                if (copy_to_user(argp, &ifr, ifreq_len))
2035                        ret = -EFAULT;
2036                break;
2037
2038        case SIOCSIFHWADDR:
2039                /* Set hw address */
2040                tun_debug(KERN_DEBUG, tun, "set hw address: %pM\n",
2041                          ifr.ifr_hwaddr.sa_data);
2042
2043                ret = dev_set_mac_address(tun->dev, &ifr.ifr_hwaddr);
2044                break;
2045
2046        case TUNGETSNDBUF:
2047                sndbuf = tfile->socket.sk->sk_sndbuf;
2048                if (copy_to_user(argp, &sndbuf, sizeof(sndbuf)))
2049                        ret = -EFAULT;
2050                break;
2051
2052        case TUNSETSNDBUF:
2053                if (copy_from_user(&sndbuf, argp, sizeof(sndbuf))) {
2054                        ret = -EFAULT;
2055                        break;
2056                }
2057
2058                tun->sndbuf = sndbuf;
2059                tun_set_sndbuf(tun);
2060                break;
2061
2062        case TUNGETVNETHDRSZ:
2063                vnet_hdr_sz = tun->vnet_hdr_sz;
2064                if (copy_to_user(argp, &vnet_hdr_sz, sizeof(vnet_hdr_sz)))
2065                        ret = -EFAULT;
2066                break;
2067
2068        case TUNSETVNETHDRSZ:
2069                if (copy_from_user(&vnet_hdr_sz, argp, sizeof(vnet_hdr_sz))) {
2070                        ret = -EFAULT;
2071                        break;
2072                }
2073                if (vnet_hdr_sz < (int)sizeof(struct virtio_net_hdr)) {
2074                        ret = -EINVAL;
2075                        break;
2076                }
2077
2078                tun->vnet_hdr_sz = vnet_hdr_sz;
2079                break;
2080
2081        case TUNATTACHFILTER:
2082                /* Can be set only for TAPs */
2083                ret = -EINVAL;
2084                if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV)
2085                        break;
2086                ret = -EFAULT;
2087                if (copy_from_user(&tun->fprog, argp, sizeof(tun->fprog)))
2088                        break;
2089
2090                ret = tun_attach_filter(tun);
2091                break;
2092
2093        case TUNDETACHFILTER:
2094                /* Can be set only for TAPs */
2095                ret = -EINVAL;
2096                if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV)
2097                        break;
2098                ret = 0;
2099                tun_detach_filter(tun, tun->numqueues);
2100                break;
2101
2102        case TUNGETFILTER:
2103                ret = -EINVAL;
2104                if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV)
2105                        break;
2106                ret = -EFAULT;
2107                if (copy_to_user(argp, &tun->fprog, sizeof(tun->fprog)))
2108                        break;
2109                ret = 0;
2110                break;
2111
2112        default:
2113                ret = -EINVAL;
2114                break;
2115        }
2116
2117unlock:
2118        rtnl_unlock();
2119        if (tun)
2120                tun_put(tun);
2121        return ret;
2122}
2123
2124static long tun_chr_ioctl(struct file *file,
2125                          unsigned int cmd, unsigned long arg)
2126{
2127        return __tun_chr_ioctl(file, cmd, arg, sizeof (struct ifreq));
2128}
2129
#ifdef CONFIG_COMPAT
/* Compat ioctl entry point.
 *
 * The commands listed in the switch have their argument converted with
 * compat_ptr(); for every other command the argument is truncated to
 * compat_ulong_t.  The request is then forwarded to __tun_chr_ioctl().
 *
 * compat_ifreq is shorter than ifreq, so we must not access beyond the end
 * of that structure.  All fields that are used in this driver are compatible
 * though, so the contents need no conversion; only the copy length differs.
 */
static long tun_chr_compat_ioctl(struct file *file,
			 unsigned int cmd, unsigned long arg)
{
	const int compat_len = sizeof(struct compat_ifreq);

	switch (cmd) {
	case TUNSETIFF:
	case TUNGETIFF:
	case TUNSETTXFILTER:
	case TUNGETSNDBUF:
	case TUNSETSNDBUF:
	case SIOCGIFHWADDR:
	case SIOCSIFHWADDR:
		return __tun_chr_ioctl(file, cmd,
				       (unsigned long)compat_ptr(arg),
				       compat_len);
	default:
		return __tun_chr_ioctl(file, cmd, (compat_ulong_t)arg,
				       compat_len);
	}
}
#endif /* CONFIG_COMPAT */
2158
2159static int tun_chr_fasync(int fd, struct file *file, int on)
2160{
2161        struct tun_file *tfile = file->private_data;
2162        int ret;
2163
2164        if ((ret = fasync_helper(fd, file, on, &tfile->fasync)) < 0)
2165                goto out;
2166
2167        if (on) {
2168                __f_setown(file, task_pid(current), PIDTYPE_PID, 0);
2169                tfile->flags |= TUN_FASYNC;
2170        } else
2171                tfile->flags &= ~TUN_FASYNC;
2172        ret = 0;
2173out:
2174        return ret;
2175}
2176
2177static int tun_chr_open(struct inode *inode, struct file * file)
2178{
2179        struct tun_file *tfile;
2180
2181        DBG1(KERN_INFO, "tunX: tun_chr_open\n");
2182
2183        tfile = (struct tun_file *)sk_alloc(&init_net, AF_UNSPEC, GFP_KERNEL,
2184                                            &tun_proto);
2185        if (!tfile)
2186                return -ENOMEM;
2187        RCU_INIT_POINTER(tfile->tun, NULL);
2188        tfile->net = get_net(current->nsproxy->net_ns);
2189        tfile->flags = 0;
2190        tfile->ifindex = 0;
2191
2192        init_waitqueue_head(&tfile->wq.wait);
2193        RCU_INIT_POINTER(tfile->socket.wq, &tfile->wq);
2194
2195        tfile->socket.file = file;
2196        tfile->socket.ops = &tun_socket_ops;
2197
2198        sock_init_data(&tfile->socket, &tfile->sk);
2199        sk_change_net(&tfile->sk, tfile->net);
2200
2201        tfile->sk.sk_write_space = tun_sock_write_space;
2202        tfile->sk.sk_sndbuf = INT_MAX;
2203
2204        file->private_data = tfile;
2205        set_bit(SOCK_EXTERNALLY_ALLOCATED, &tfile->socket.flags);
2206        INIT_LIST_HEAD(&tfile->next);
2207
2208        sock_set_flag(&tfile->sk, SOCK_ZEROCOPY);
2209
2210        return 0;
2211}
2212
2213static int tun_chr_close(struct inode *inode, struct file *file)
2214{
2215        struct tun_file *tfile = file->private_data;
2216        struct net *net = tfile->net;
2217
2218        tun_detach(tfile, true);
2219        put_net(net);
2220
2221        return 0;
2222}
2223
#ifdef CONFIG_PROC_FS
/* show_fdinfo file operation: print an "iff:" line with the name of the
 * interface attached to @f.  When no device is attached the name printed is
 * the empty string, because the ifreq is zeroed beforehand.
 *
 * Returns whatever seq_printf() returns.
 */
static int tun_chr_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct tun_struct *tun;
	struct ifreq ifr;

	memset(&ifr, 0, sizeof(ifr));

	rtnl_lock();
	tun = tun_get(f);
	if (!tun) {
		rtnl_unlock();
	} else {
		tun_get_iff(current->nsproxy->net_ns, tun, &ifr);
		rtnl_unlock();
		/* the reference is dropped outside of the rtnl lock */
		tun_put(tun);
	}

	return seq_printf(m, "iff:\t%s\n", ifr.ifr_name);
}
#endif
2244
2245static const struct file_operations tun_fops = {
2246        .owner  = THIS_MODULE,
2247        .llseek = no_llseek,
2248        .read  = do_sync_read,
2249        .aio_read  = tun_chr_aio_read,
2250        .write = do_sync_write,
2251        .aio_write = tun_chr_aio_write,
2252        .poll   = tun_chr_poll,
2253        .unlocked_ioctl = tun_chr_ioctl,
2254#ifdef CONFIG_COMPAT
2255        .compat_ioctl = tun_chr_compat_ioctl,
2256#endif
2257        .open   = tun_chr_open,
2258        .release = tun_chr_close,
2259        .fasync = tun_chr_fasync,
2260#ifdef CONFIG_PROC_FS
2261        .show_fdinfo = tun_chr_show_fdinfo,
2262#endif
2263};
2264
2265static struct miscdevice tun_miscdev = {
2266        .minor = TUN_MINOR,
2267        .name = "tun",
2268        .nodename = "net/tun",
2269        .fops = &tun_fops,
2270};
2271
2272/* ethtool interface */
2273
2274static int tun_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
2275{
2276        cmd->supported          = 0;
2277        cmd->advertising        = 0;
2278        ethtool_cmd_speed_set(cmd, SPEED_10);
2279        cmd->duplex             = DUPLEX_FULL;
2280        cmd->port               = PORT_TP;
2281        cmd->phy_address        = 0;
2282        cmd->transceiver        = XCVR_INTERNAL;
2283        cmd->autoneg            = AUTONEG_DISABLE;
2284        cmd->maxtxpkt           = 0;
2285        cmd->maxrxpkt           = 0;
2286        return 0;
2287}
2288
2289static void tun_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
2290{
2291        struct tun_struct *tun = netdev_priv(dev);
2292
2293        strlcpy(info->driver, DRV_NAME, sizeof(info->driver));
2294        strlcpy(info->version, DRV_VERSION, sizeof(info->version));
2295
2296        switch (tun->flags & TUN_TYPE_MASK) {
2297        case TUN_TUN_DEV:
2298                strlcpy(info->bus_info, "tun", sizeof(info->bus_info));
2299                break;
2300        case TUN_TAP_DEV:
2301                strlcpy(info->bus_info, "tap", sizeof(info->bus_info));
2302                break;
2303        }
2304}
2305
2306static u32 tun_get_msglevel(struct net_device *dev)
2307{
2308#ifdef TUN_DEBUG
2309        struct tun_struct *tun = netdev_priv(dev);
2310        return tun->debug;
2311#else
2312        return -EOPNOTSUPP;
2313#endif
2314}
2315
2316static void tun_set_msglevel(struct net_device *dev, u32 value)
2317{
2318#ifdef TUN_DEBUG
2319        struct tun_struct *tun = netdev_priv(dev);
2320        tun->debug = value;
2321#endif
2322}
2323
2324static const struct ethtool_ops tun_ethtool_ops = {
2325        .get_settings   = tun_get_settings,
2326        .get_drvinfo    = tun_get_drvinfo,
2327        .get_msglevel   = tun_get_msglevel,
2328        .set_msglevel   = tun_set_msglevel,
2329        .get_link       = ethtool_op_get_link,
2330        .get_ts_info    = ethtool_op_get_ts_info,
2331};
2332
2333
2334static int __init tun_init(void)
2335{
2336        int ret = 0;
2337
2338        pr_info("%s, %s\n", DRV_DESCRIPTION, DRV_VERSION);
2339        pr_info("%s\n", DRV_COPYRIGHT);
2340
2341        ret = rtnl_link_register(&tun_link_ops);
2342        if (ret) {
2343                pr_err("Can't register link_ops\n");
2344                goto err_linkops;
2345        }
2346
2347        ret = misc_register(&tun_miscdev);
2348        if (ret) {
2349                pr_err("Can't register misc device %d\n", TUN_MINOR);
2350                goto err_misc;
2351        }
2352        return  0;
2353err_misc:
2354        rtnl_link_unregister(&tun_link_ops);
2355err_linkops:
2356        return ret;
2357}
2358
2359static void tun_cleanup(void)
2360{
2361        misc_deregister(&tun_miscdev);
2362        rtnl_link_unregister(&tun_link_ops);
2363}
2364
2365/* Get an underlying socket object from tun file.  Returns error unless file is
2366 * attached to a device.  The returned object works like a packet socket, it
2367 * can be used for sock_sendmsg/sock_recvmsg.  The caller is responsible for
2368 * holding a reference to the file for as long as the socket is in use. */
2369struct socket *tun_get_socket(struct file *file)
2370{
2371        struct tun_file *tfile;
2372        if (file->f_op != &tun_fops)
2373                return ERR_PTR(-EINVAL);
2374        tfile = file->private_data;
2375        if (!tfile)
2376                return ERR_PTR(-EBADFD);
2377        return &tfile->socket;
2378}
2379EXPORT_SYMBOL_GPL(tun_get_socket);
2380
2381module_init(tun_init);
2382module_exit(tun_cleanup);
2383MODULE_DESCRIPTION(DRV_DESCRIPTION);
2384MODULE_AUTHOR(DRV_COPYRIGHT);
2385MODULE_LICENSE("GPL");
2386MODULE_ALIAS_MISCDEV(TUN_MINOR);
2387MODULE_ALIAS("devname:net/tun");
2388