linux/drivers/net/tun.c
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 *  TUN - Universal TUN/TAP device driver.
   4 *  Copyright (C) 1999-2002 Maxim Krasnyansky <maxk@qualcomm.com>
   5 *
   6 *  $Id: tun.c,v 1.15 2002/03/01 02:44:24 maxk Exp $
   7 */
   8
   9/*
  10 *  Changes:
  11 *
  12 *  Mike Kershaw <dragorn@kismetwireless.net> 2005/08/14
  13 *    Add TUNSETLINK ioctl to set the link encapsulation
  14 *
  15 *  Mark Smith <markzzzsmith@yahoo.com.au>
  16 *    Use eth_random_addr() for tap MAC address.
  17 *
  18 *  Harald Roelle <harald.roelle@ifi.lmu.de>  2004/04/20
  19 *    Fixes in packet dropping, queue length setting and queue wakeup.
  20 *    Increased default tx queue length.
  21 *    Added ethtool API.
  22 *    Minor cleanups
  23 *
  24 *  Daniel Podlejski <underley@underley.eu.org>
  25 *    Modifications for 2.3.99-pre5 kernel.
  26 */
  27
  28#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  29
  30#define DRV_NAME        "tun"
  31#define DRV_VERSION     "1.6"
  32#define DRV_DESCRIPTION "Universal TUN/TAP device driver"
  33#define DRV_COPYRIGHT   "(C) 1999-2004 Max Krasnyansky <maxk@qualcomm.com>"
  34
  35#include <linux/module.h>
  36#include <linux/errno.h>
  37#include <linux/kernel.h>
  38#include <linux/sched/signal.h>
  39#include <linux/major.h>
  40#include <linux/slab.h>
  41#include <linux/poll.h>
  42#include <linux/fcntl.h>
  43#include <linux/init.h>
  44#include <linux/skbuff.h>
  45#include <linux/netdevice.h>
  46#include <linux/etherdevice.h>
  47#include <linux/miscdevice.h>
  48#include <linux/ethtool.h>
  49#include <linux/rtnetlink.h>
  50#include <linux/compat.h>
  51#include <linux/if.h>
  52#include <linux/if_arp.h>
  53#include <linux/if_ether.h>
  54#include <linux/if_tun.h>
  55#include <linux/if_vlan.h>
  56#include <linux/crc32.h>
  57#include <linux/nsproxy.h>
  58#include <linux/virtio_net.h>
  59#include <linux/rcupdate.h>
  60#include <net/net_namespace.h>
  61#include <net/netns/generic.h>
  62#include <net/rtnetlink.h>
  63#include <net/sock.h>
  64#include <net/xdp.h>
  65#include <linux/seq_file.h>
  66#include <linux/uio.h>
  67#include <linux/skb_array.h>
  68#include <linux/bpf.h>
  69#include <linux/bpf_trace.h>
  70#include <linux/mutex.h>
  71
  72#include <linux/uaccess.h>
  73#include <linux/proc_fs.h>
  74
  75static void tun_default_link_ksettings(struct net_device *dev,
  76                                       struct ethtool_link_ksettings *cmd);
  77
  78#define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
  79
  80/* TUN device flags */
  81
   82/* IFF_ATTACH_QUEUE is never stored in device flags, so we
   83 * overload it to mean fasync when stored in tfile->flags.
   84 */
  85#define TUN_FASYNC      IFF_ATTACH_QUEUE
  86/* High bits in flags field are unused. */
  87#define TUN_VNET_LE     0x80000000
  88#define TUN_VNET_BE     0x40000000
  89
  90#define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \
  91                      IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS)
  92
  93#define GOODCOPY_LEN 128
  94
  95#define FLT_EXACT_COUNT 8
  96struct tap_filter {
  97        unsigned int    count;    /* Number of addrs. Zero means disabled */
  98        u32             mask[2];  /* Mask of the hashed addrs */
  99        unsigned char   addr[FLT_EXACT_COUNT][ETH_ALEN];
 100};
 101
  102/* MAX_TAP_QUEUES 256 is chosen to allow the rx/tx queue count to match
  103 * the maximum number of vCPUs in a guest. */
 104#define MAX_TAP_QUEUES 256
 105#define MAX_TAP_FLOWS  4096
 106
 107#define TUN_FLOW_EXPIRE (3 * HZ)
 108
 109struct tun_pcpu_stats {
 110        u64_stats_t rx_packets;
 111        u64_stats_t rx_bytes;
 112        u64_stats_t tx_packets;
 113        u64_stats_t tx_bytes;
 114        struct u64_stats_sync syncp;
 115        u32 rx_dropped;
 116        u32 tx_dropped;
 117        u32 rx_frame_errors;
 118};
 119
  120/* A tun_file connects an open character device to a tuntap netdevice. It
  121 * also contains all socket related structures (except sock_fprog and
  122 * tap_filter) and serves as one transmit queue for the tuntap device. The
  123 * sock_fprog and tap_filter are kept in tun_struct since they are used to
  124 * filter for the netdevice as a whole, not for a specific queue (at least
  125 * no per-queue requirement has come up so far).
 126 *
 127 * RCU usage:
 128 * The tun_file and tun_struct are loosely coupled, the pointer from one to the
 129 * other can only be read while rcu_read_lock or rtnl_lock is held.
 130 */
 131struct tun_file {
 132        struct sock sk;
 133        struct socket socket;
 134        struct tun_struct __rcu *tun;
 135        struct fasync_struct *fasync;
  136        /* only used for fasync */
 137        unsigned int flags;
 138        union {
 139                u16 queue_index;
 140                unsigned int ifindex;
 141        };
 142        struct napi_struct napi;
 143        bool napi_enabled;
 144        bool napi_frags_enabled;
 145        struct mutex napi_mutex;        /* Protects access to the above napi */
 146        struct list_head next;
 147        struct tun_struct *detached;
 148        struct ptr_ring tx_ring;
 149        struct xdp_rxq_info xdp_rxq;
 150};
 151
 152struct tun_page {
 153        struct page *page;
 154        int count;
 155};
 156
 157struct tun_flow_entry {
 158        struct hlist_node hash_link;
 159        struct rcu_head rcu;
 160        struct tun_struct *tun;
 161
 162        u32 rxhash;
 163        u32 rps_rxhash;
 164        int queue_index;
 165        unsigned long updated ____cacheline_aligned_in_smp;
 166};
 167
 168#define TUN_NUM_FLOW_ENTRIES 1024
 169#define TUN_MASK_FLOW_ENTRIES (TUN_NUM_FLOW_ENTRIES - 1)
 170
 171struct tun_prog {
 172        struct rcu_head rcu;
 173        struct bpf_prog *prog;
 174};
 175
  176/* Since the socket was moved to tun_file, to preserve the behavior of a
  177 * persistent device the socket filter, sndbuf and vnet header size are
  178 * restored when a file is attached to that device.
  179 */
 180struct tun_struct {
 181        struct tun_file __rcu   *tfiles[MAX_TAP_QUEUES];
 182        unsigned int            numqueues;
 183        unsigned int            flags;
 184        kuid_t                  owner;
 185        kgid_t                  group;
 186
 187        struct net_device       *dev;
 188        netdev_features_t       set_features;
 189#define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \
 190                          NETIF_F_TSO6)
 191
 192        int                     align;
 193        int                     vnet_hdr_sz;
 194        int                     sndbuf;
 195        struct tap_filter       txflt;
 196        struct sock_fprog       fprog;
 197        /* protected by rtnl lock */
 198        bool                    filter_attached;
 199        u32                     msg_enable;
 200        spinlock_t lock;
 201        struct hlist_head flows[TUN_NUM_FLOW_ENTRIES];
 202        struct timer_list flow_gc_timer;
 203        unsigned long ageing_time;
 204        unsigned int numdisabled;
 205        struct list_head disabled;
 206        void *security;
 207        u32 flow_count;
 208        u32 rx_batched;
 209        struct tun_pcpu_stats __percpu *pcpu_stats;
 210        struct bpf_prog __rcu *xdp_prog;
 211        struct tun_prog __rcu *steering_prog;
 212        struct tun_prog __rcu *filter_prog;
 213        struct ethtool_link_ksettings link_ksettings;
 214};
 215
 216struct veth {
 217        __be16 h_vlan_proto;
 218        __be16 h_vlan_TCI;
 219};
 220
 221bool tun_is_xdp_frame(void *ptr)
 222{
 223        return (unsigned long)ptr & TUN_XDP_FLAG;
 224}
 225EXPORT_SYMBOL(tun_is_xdp_frame);
 226
 227void *tun_xdp_to_ptr(void *ptr)
 228{
 229        return (void *)((unsigned long)ptr | TUN_XDP_FLAG);
 230}
 231EXPORT_SYMBOL(tun_xdp_to_ptr);
 232
 233void *tun_ptr_to_xdp(void *ptr)
 234{
 235        return (void *)((unsigned long)ptr & ~TUN_XDP_FLAG);
 236}
 237EXPORT_SYMBOL(tun_ptr_to_xdp);
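
/* The tx_ring carries both sk_buffs and xdp_frames.  The helpers above tag
 * xdp_frame pointers by setting TUN_XDP_FLAG in the otherwise unused low
 * bit of the pointer, so a consumer can tell the two apart.  A sketch of
 * the expected consumer pattern (illustrative only; tun_ptr_free() below is
 * a real in-tree example):
 *
 *      void *ptr = ptr_ring_consume(&tfile->tx_ring);
 *
 *      if (tun_is_xdp_frame(ptr)) {
 *              struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
 *              // e.g. xdp_return_frame(xdpf);
 *      } else {
 *              struct sk_buff *skb = ptr;
 *              // e.g. kfree_skb(skb);
 *      }
 */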
 238
 239static int tun_napi_receive(struct napi_struct *napi, int budget)
 240{
 241        struct tun_file *tfile = container_of(napi, struct tun_file, napi);
 242        struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
 243        struct sk_buff_head process_queue;
 244        struct sk_buff *skb;
 245        int received = 0;
 246
 247        __skb_queue_head_init(&process_queue);
 248
 249        spin_lock(&queue->lock);
 250        skb_queue_splice_tail_init(queue, &process_queue);
 251        spin_unlock(&queue->lock);
 252
 253        while (received < budget && (skb = __skb_dequeue(&process_queue))) {
 254                napi_gro_receive(napi, skb);
 255                ++received;
 256        }
 257
 258        if (!skb_queue_empty(&process_queue)) {
 259                spin_lock(&queue->lock);
 260                skb_queue_splice(&process_queue, queue);
 261                spin_unlock(&queue->lock);
 262        }
 263
 264        return received;
 265}
 266
 267static int tun_napi_poll(struct napi_struct *napi, int budget)
 268{
 269        unsigned int received;
 270
 271        received = tun_napi_receive(napi, budget);
 272
 273        if (received < budget)
 274                napi_complete_done(napi, received);
 275
 276        return received;
 277}
 278
 279static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile,
 280                          bool napi_en, bool napi_frags)
 281{
 282        tfile->napi_enabled = napi_en;
 283        tfile->napi_frags_enabled = napi_en && napi_frags;
 284        if (napi_en) {
 285                netif_tx_napi_add(tun->dev, &tfile->napi, tun_napi_poll,
 286                                  NAPI_POLL_WEIGHT);
 287                napi_enable(&tfile->napi);
 288        }
 289}
 290
 291static void tun_napi_disable(struct tun_file *tfile)
 292{
 293        if (tfile->napi_enabled)
 294                napi_disable(&tfile->napi);
 295}
 296
 297static void tun_napi_del(struct tun_file *tfile)
 298{
 299        if (tfile->napi_enabled)
 300                netif_napi_del(&tfile->napi);
 301}
 302
 303static bool tun_napi_frags_enabled(const struct tun_file *tfile)
 304{
 305        return tfile->napi_frags_enabled;
 306}
 307
 308#ifdef CONFIG_TUN_VNET_CROSS_LE
 309static inline bool tun_legacy_is_little_endian(struct tun_struct *tun)
 310{
 311        return tun->flags & TUN_VNET_BE ? false :
 312                virtio_legacy_is_little_endian();
 313}
 314
 315static long tun_get_vnet_be(struct tun_struct *tun, int __user *argp)
 316{
 317        int be = !!(tun->flags & TUN_VNET_BE);
 318
 319        if (put_user(be, argp))
 320                return -EFAULT;
 321
 322        return 0;
 323}
 324
 325static long tun_set_vnet_be(struct tun_struct *tun, int __user *argp)
 326{
 327        int be;
 328
 329        if (get_user(be, argp))
 330                return -EFAULT;
 331
 332        if (be)
 333                tun->flags |= TUN_VNET_BE;
 334        else
 335                tun->flags &= ~TUN_VNET_BE;
 336
 337        return 0;
 338}
 339#else
 340static inline bool tun_legacy_is_little_endian(struct tun_struct *tun)
 341{
 342        return virtio_legacy_is_little_endian();
 343}
 344
 345static long tun_get_vnet_be(struct tun_struct *tun, int __user *argp)
 346{
 347        return -EINVAL;
 348}
 349
 350static long tun_set_vnet_be(struct tun_struct *tun, int __user *argp)
 351{
 352        return -EINVAL;
 353}
 354#endif /* CONFIG_TUN_VNET_CROSS_LE */
 355
 356static inline bool tun_is_little_endian(struct tun_struct *tun)
 357{
 358        return tun->flags & TUN_VNET_LE ||
 359                tun_legacy_is_little_endian(tun);
 360}
 361
 362static inline u16 tun16_to_cpu(struct tun_struct *tun, __virtio16 val)
 363{
 364        return __virtio16_to_cpu(tun_is_little_endian(tun), val);
 365}
 366
 367static inline __virtio16 cpu_to_tun16(struct tun_struct *tun, u16 val)
 368{
 369        return __cpu_to_virtio16(tun_is_little_endian(tun), val);
 370}
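
/* All multi-byte fields of the virtio_net_hdr exchanged with userspace go
 * through the helpers above, so that both the legacy big-endian layout
 * (TUN_VNET_BE) and the little-endian layout (TUN_VNET_LE) are handled.  A
 * rough sketch of how userspace opts into the little-endian layout
 * (illustrative, error handling omitted):
 *
 *      int le = 1;
 *
 *      ioctl(tun_fd, TUNSETVNETLE, &le);
 */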
 371
 372static inline u32 tun_hashfn(u32 rxhash)
 373{
 374        return rxhash & TUN_MASK_FLOW_ENTRIES;
 375}
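
/* TUN_NUM_FLOW_ENTRIES is a power of two, so masking with
 * TUN_MASK_FLOW_ENTRIES (the table size minus one) above is equivalent to,
 * and cheaper than, rxhash % TUN_NUM_FLOW_ENTRIES.  For example
 * (illustrative value):
 *
 *      tun_hashfn(0x12345678) == 0x12345678 & 0x3ff == 0x278
 */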
 376
 377static struct tun_flow_entry *tun_flow_find(struct hlist_head *head, u32 rxhash)
 378{
 379        struct tun_flow_entry *e;
 380
 381        hlist_for_each_entry_rcu(e, head, hash_link) {
 382                if (e->rxhash == rxhash)
 383                        return e;
 384        }
 385        return NULL;
 386}
 387
 388static struct tun_flow_entry *tun_flow_create(struct tun_struct *tun,
 389                                              struct hlist_head *head,
 390                                              u32 rxhash, u16 queue_index)
 391{
 392        struct tun_flow_entry *e = kmalloc(sizeof(*e), GFP_ATOMIC);
 393
 394        if (e) {
 395                netif_info(tun, tx_queued, tun->dev,
 396                           "create flow: hash %u index %u\n",
 397                           rxhash, queue_index);
 398                e->updated = jiffies;
 399                e->rxhash = rxhash;
 400                e->rps_rxhash = 0;
 401                e->queue_index = queue_index;
 402                e->tun = tun;
 403                hlist_add_head_rcu(&e->hash_link, head);
 404                ++tun->flow_count;
 405        }
 406        return e;
 407}
 408
 409static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e)
 410{
 411        netif_info(tun, tx_queued, tun->dev, "delete flow: hash %u index %u\n",
 412                   e->rxhash, e->queue_index);
 413        hlist_del_rcu(&e->hash_link);
 414        kfree_rcu(e, rcu);
 415        --tun->flow_count;
 416}
 417
 418static void tun_flow_flush(struct tun_struct *tun)
 419{
 420        int i;
 421
 422        spin_lock_bh(&tun->lock);
 423        for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
 424                struct tun_flow_entry *e;
 425                struct hlist_node *n;
 426
 427                hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link)
 428                        tun_flow_delete(tun, e);
 429        }
 430        spin_unlock_bh(&tun->lock);
 431}
 432
 433static void tun_flow_delete_by_queue(struct tun_struct *tun, u16 queue_index)
 434{
 435        int i;
 436
 437        spin_lock_bh(&tun->lock);
 438        for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
 439                struct tun_flow_entry *e;
 440                struct hlist_node *n;
 441
 442                hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) {
 443                        if (e->queue_index == queue_index)
 444                                tun_flow_delete(tun, e);
 445                }
 446        }
 447        spin_unlock_bh(&tun->lock);
 448}
 449
 450static void tun_flow_cleanup(struct timer_list *t)
 451{
 452        struct tun_struct *tun = from_timer(tun, t, flow_gc_timer);
 453        unsigned long delay = tun->ageing_time;
 454        unsigned long next_timer = jiffies + delay;
 455        unsigned long count = 0;
 456        int i;
 457
 458        spin_lock(&tun->lock);
 459        for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
 460                struct tun_flow_entry *e;
 461                struct hlist_node *n;
 462
 463                hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) {
 464                        unsigned long this_timer;
 465
 466                        this_timer = e->updated + delay;
 467                        if (time_before_eq(this_timer, jiffies)) {
 468                                tun_flow_delete(tun, e);
 469                                continue;
 470                        }
 471                        count++;
 472                        if (time_before(this_timer, next_timer))
 473                                next_timer = this_timer;
 474                }
 475        }
 476
 477        if (count)
 478                mod_timer(&tun->flow_gc_timer, round_jiffies_up(next_timer));
 479        spin_unlock(&tun->lock);
 480}
 481
 482static void tun_flow_update(struct tun_struct *tun, u32 rxhash,
 483                            struct tun_file *tfile)
 484{
 485        struct hlist_head *head;
 486        struct tun_flow_entry *e;
 487        unsigned long delay = tun->ageing_time;
 488        u16 queue_index = tfile->queue_index;
 489
 490        head = &tun->flows[tun_hashfn(rxhash)];
 491
 492        rcu_read_lock();
 493
 494        e = tun_flow_find(head, rxhash);
 495        if (likely(e)) {
 496                /* TODO: keep queueing to old queue until it's empty? */
 497                if (READ_ONCE(e->queue_index) != queue_index)
 498                        WRITE_ONCE(e->queue_index, queue_index);
 499                if (e->updated != jiffies)
 500                        e->updated = jiffies;
 501                sock_rps_record_flow_hash(e->rps_rxhash);
 502        } else {
 503                spin_lock_bh(&tun->lock);
 504                if (!tun_flow_find(head, rxhash) &&
 505                    tun->flow_count < MAX_TAP_FLOWS)
 506                        tun_flow_create(tun, head, rxhash, queue_index);
 507
 508                if (!timer_pending(&tun->flow_gc_timer))
 509                        mod_timer(&tun->flow_gc_timer,
 510                                  round_jiffies_up(jiffies + delay));
 511                spin_unlock_bh(&tun->lock);
 512        }
 513
 514        rcu_read_unlock();
 515}
 516
 517/* Save the hash received in the stack receive path and update the
 518 * flow_hash table accordingly.
 519 */
 520static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash)
 521{
 522        if (unlikely(e->rps_rxhash != hash))
 523                e->rps_rxhash = hash;
 524}
 525
  526/* We try to identify a flow through its rxhash. The reason we do not
  527 * check the rxq number is that some cards (e.g. the 82599) choose the
  528 * rxq based on the txq on which the last packet of the flow was sent. As
  529 * the userspace application moves between processors, we may see a
  530 * different rxq number here.
  531 */
 532static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
 533{
 534        struct tun_flow_entry *e;
 535        u32 txq = 0;
 536        u32 numqueues = 0;
 537
 538        numqueues = READ_ONCE(tun->numqueues);
 539
 540        txq = __skb_get_hash_symmetric(skb);
 541        e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq);
 542        if (e) {
 543                tun_flow_save_rps_rxhash(e, txq);
 544                txq = e->queue_index;
 545        } else {
 546                /* use multiply and shift instead of expensive divide */
 547                txq = ((u64)txq * numqueues) >> 32;
 548        }
 549
 550        return txq;
 551}
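
/* The multiply-and-shift above maps a 32-bit hash roughly uniformly onto
 * [0, numqueues) without a division: the product is a 64-bit value whose
 * upper 32 bits are the scaled result.  Illustrative numbers, with
 * numqueues == 4:
 *
 *      ((u64)0x80000000 * 4) >> 32 == 2
 *      ((u64)0xffffffff * 4) >> 32 == 3
 */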
 552
 553static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb)
 554{
 555        struct tun_prog *prog;
 556        u32 numqueues;
 557        u16 ret = 0;
 558
 559        numqueues = READ_ONCE(tun->numqueues);
 560        if (!numqueues)
 561                return 0;
 562
 563        prog = rcu_dereference(tun->steering_prog);
 564        if (prog)
 565                ret = bpf_prog_run_clear_cb(prog->prog, skb);
 566
 567        return ret % numqueues;
 568}
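
/* Steering programs are installed from userspace via TUNSETSTEERINGEBPF,
 * which takes the fd of a loaded BPF_PROG_TYPE_SOCKET_FILTER program;
 * whatever the program returns is reduced modulo the current number of
 * queues above.  A minimal sketch of the userspace side (illustrative;
 * assumes prog_fd already refers to a loaded program):
 *
 *      ioctl(tun_fd, TUNSETSTEERINGEBPF, &prog_fd);
 */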
 569
 570static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
 571                            struct net_device *sb_dev)
 572{
 573        struct tun_struct *tun = netdev_priv(dev);
 574        u16 ret;
 575
 576        rcu_read_lock();
 577        if (rcu_dereference(tun->steering_prog))
 578                ret = tun_ebpf_select_queue(tun, skb);
 579        else
 580                ret = tun_automq_select_queue(tun, skb);
 581        rcu_read_unlock();
 582
 583        return ret;
 584}
 585
 586static inline bool tun_not_capable(struct tun_struct *tun)
 587{
 588        const struct cred *cred = current_cred();
 589        struct net *net = dev_net(tun->dev);
 590
 591        return ((uid_valid(tun->owner) && !uid_eq(cred->euid, tun->owner)) ||
 592                  (gid_valid(tun->group) && !in_egroup_p(tun->group))) &&
 593                !ns_capable(net->user_ns, CAP_NET_ADMIN);
 594}
 595
 596static void tun_set_real_num_queues(struct tun_struct *tun)
 597{
 598        netif_set_real_num_tx_queues(tun->dev, tun->numqueues);
 599        netif_set_real_num_rx_queues(tun->dev, tun->numqueues);
 600}
 601
 602static void tun_disable_queue(struct tun_struct *tun, struct tun_file *tfile)
 603{
 604        tfile->detached = tun;
 605        list_add_tail(&tfile->next, &tun->disabled);
 606        ++tun->numdisabled;
 607}
 608
 609static struct tun_struct *tun_enable_queue(struct tun_file *tfile)
 610{
 611        struct tun_struct *tun = tfile->detached;
 612
 613        tfile->detached = NULL;
 614        list_del_init(&tfile->next);
 615        --tun->numdisabled;
 616        return tun;
 617}
 618
 619void tun_ptr_free(void *ptr)
 620{
 621        if (!ptr)
 622                return;
 623        if (tun_is_xdp_frame(ptr)) {
 624                struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
 625
 626                xdp_return_frame(xdpf);
 627        } else {
 628                __skb_array_destroy_skb(ptr);
 629        }
 630}
 631EXPORT_SYMBOL_GPL(tun_ptr_free);
 632
 633static void tun_queue_purge(struct tun_file *tfile)
 634{
 635        void *ptr;
 636
 637        while ((ptr = ptr_ring_consume(&tfile->tx_ring)) != NULL)
 638                tun_ptr_free(ptr);
 639
 640        skb_queue_purge(&tfile->sk.sk_write_queue);
 641        skb_queue_purge(&tfile->sk.sk_error_queue);
 642}
 643
 644static void __tun_detach(struct tun_file *tfile, bool clean)
 645{
 646        struct tun_file *ntfile;
 647        struct tun_struct *tun;
 648
 649        tun = rtnl_dereference(tfile->tun);
 650
 651        if (tun && clean) {
 652                tun_napi_disable(tfile);
 653                tun_napi_del(tfile);
 654        }
 655
 656        if (tun && !tfile->detached) {
 657                u16 index = tfile->queue_index;
 658                BUG_ON(index >= tun->numqueues);
 659
 660                rcu_assign_pointer(tun->tfiles[index],
 661                                   tun->tfiles[tun->numqueues - 1]);
 662                ntfile = rtnl_dereference(tun->tfiles[index]);
 663                ntfile->queue_index = index;
 664                rcu_assign_pointer(tun->tfiles[tun->numqueues - 1],
 665                                   NULL);
 666
 667                --tun->numqueues;
 668                if (clean) {
 669                        RCU_INIT_POINTER(tfile->tun, NULL);
 670                        sock_put(&tfile->sk);
 671                } else
 672                        tun_disable_queue(tun, tfile);
 673
 674                synchronize_net();
 675                tun_flow_delete_by_queue(tun, tun->numqueues + 1);
 676                /* Drop read queue */
 677                tun_queue_purge(tfile);
 678                tun_set_real_num_queues(tun);
 679        } else if (tfile->detached && clean) {
 680                tun = tun_enable_queue(tfile);
 681                sock_put(&tfile->sk);
 682        }
 683
 684        if (clean) {
 685                if (tun && tun->numqueues == 0 && tun->numdisabled == 0) {
 686                        netif_carrier_off(tun->dev);
 687
 688                        if (!(tun->flags & IFF_PERSIST) &&
 689                            tun->dev->reg_state == NETREG_REGISTERED)
 690                                unregister_netdevice(tun->dev);
 691                }
 692                if (tun)
 693                        xdp_rxq_info_unreg(&tfile->xdp_rxq);
 694                ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free);
 695                sock_put(&tfile->sk);
 696        }
 697}
 698
 699static void tun_detach(struct tun_file *tfile, bool clean)
 700{
 701        struct tun_struct *tun;
 702        struct net_device *dev;
 703
 704        rtnl_lock();
 705        tun = rtnl_dereference(tfile->tun);
 706        dev = tun ? tun->dev : NULL;
 707        __tun_detach(tfile, clean);
 708        if (dev)
 709                netdev_state_change(dev);
 710        rtnl_unlock();
 711}
 712
 713static void tun_detach_all(struct net_device *dev)
 714{
 715        struct tun_struct *tun = netdev_priv(dev);
 716        struct tun_file *tfile, *tmp;
 717        int i, n = tun->numqueues;
 718
 719        for (i = 0; i < n; i++) {
 720                tfile = rtnl_dereference(tun->tfiles[i]);
 721                BUG_ON(!tfile);
 722                tun_napi_disable(tfile);
 723                tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN;
 724                tfile->socket.sk->sk_data_ready(tfile->socket.sk);
 725                RCU_INIT_POINTER(tfile->tun, NULL);
 726                --tun->numqueues;
 727        }
 728        list_for_each_entry(tfile, &tun->disabled, next) {
 729                tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN;
 730                tfile->socket.sk->sk_data_ready(tfile->socket.sk);
 731                RCU_INIT_POINTER(tfile->tun, NULL);
 732        }
 733        BUG_ON(tun->numqueues != 0);
 734
 735        synchronize_net();
 736        for (i = 0; i < n; i++) {
 737                tfile = rtnl_dereference(tun->tfiles[i]);
 738                tun_napi_del(tfile);
 739                /* Drop read queue */
 740                tun_queue_purge(tfile);
 741                xdp_rxq_info_unreg(&tfile->xdp_rxq);
 742                sock_put(&tfile->sk);
 743        }
 744        list_for_each_entry_safe(tfile, tmp, &tun->disabled, next) {
 745                tun_enable_queue(tfile);
 746                tun_queue_purge(tfile);
 747                xdp_rxq_info_unreg(&tfile->xdp_rxq);
 748                sock_put(&tfile->sk);
 749        }
 750        BUG_ON(tun->numdisabled != 0);
 751
 752        if (tun->flags & IFF_PERSIST)
 753                module_put(THIS_MODULE);
 754}
 755
 756static int tun_attach(struct tun_struct *tun, struct file *file,
 757                      bool skip_filter, bool napi, bool napi_frags,
 758                      bool publish_tun)
 759{
 760        struct tun_file *tfile = file->private_data;
 761        struct net_device *dev = tun->dev;
 762        int err;
 763
 764        err = security_tun_dev_attach(tfile->socket.sk, tun->security);
 765        if (err < 0)
 766                goto out;
 767
 768        err = -EINVAL;
 769        if (rtnl_dereference(tfile->tun) && !tfile->detached)
 770                goto out;
 771
 772        err = -EBUSY;
 773        if (!(tun->flags & IFF_MULTI_QUEUE) && tun->numqueues == 1)
 774                goto out;
 775
 776        err = -E2BIG;
 777        if (!tfile->detached &&
 778            tun->numqueues + tun->numdisabled == MAX_TAP_QUEUES)
 779                goto out;
 780
 781        err = 0;
 782
 783        /* Re-attach the filter to persist device */
 784        if (!skip_filter && (tun->filter_attached == true)) {
 785                lock_sock(tfile->socket.sk);
 786                err = sk_attach_filter(&tun->fprog, tfile->socket.sk);
 787                release_sock(tfile->socket.sk);
  788                if (err)
 789                        goto out;
 790        }
 791
 792        if (!tfile->detached &&
 793            ptr_ring_resize(&tfile->tx_ring, dev->tx_queue_len,
 794                            GFP_KERNEL, tun_ptr_free)) {
 795                err = -ENOMEM;
 796                goto out;
 797        }
 798
 799        tfile->queue_index = tun->numqueues;
 800        tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN;
 801
 802        if (tfile->detached) {
 803                /* Re-attach detached tfile, updating XDP queue_index */
 804                WARN_ON(!xdp_rxq_info_is_reg(&tfile->xdp_rxq));
 805
 806                if (tfile->xdp_rxq.queue_index    != tfile->queue_index)
 807                        tfile->xdp_rxq.queue_index = tfile->queue_index;
 808        } else {
 809                /* Setup XDP RX-queue info, for new tfile getting attached */
 810                err = xdp_rxq_info_reg(&tfile->xdp_rxq,
 811                                       tun->dev, tfile->queue_index);
 812                if (err < 0)
 813                        goto out;
 814                err = xdp_rxq_info_reg_mem_model(&tfile->xdp_rxq,
 815                                                 MEM_TYPE_PAGE_SHARED, NULL);
 816                if (err < 0) {
 817                        xdp_rxq_info_unreg(&tfile->xdp_rxq);
 818                        goto out;
 819                }
 820                err = 0;
 821        }
 822
 823        if (tfile->detached) {
 824                tun_enable_queue(tfile);
 825        } else {
 826                sock_hold(&tfile->sk);
 827                tun_napi_init(tun, tfile, napi, napi_frags);
 828        }
 829
 830        if (rtnl_dereference(tun->xdp_prog))
 831                sock_set_flag(&tfile->sk, SOCK_XDP);
 832
 833        /* device is allowed to go away first, so no need to hold extra
 834         * refcnt.
 835         */
 836
 837        /* Publish tfile->tun and tun->tfiles only after we've fully
 838         * initialized tfile; otherwise we risk using half-initialized
 839         * object.
 840         */
 841        if (publish_tun)
 842                rcu_assign_pointer(tfile->tun, tun);
 843        rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
 844        tun->numqueues++;
 845        tun_set_real_num_queues(tun);
 846out:
 847        return err;
 848}
 849
 850static struct tun_struct *tun_get(struct tun_file *tfile)
 851{
 852        struct tun_struct *tun;
 853
 854        rcu_read_lock();
 855        tun = rcu_dereference(tfile->tun);
 856        if (tun)
 857                dev_hold(tun->dev);
 858        rcu_read_unlock();
 859
 860        return tun;
 861}
 862
 863static void tun_put(struct tun_struct *tun)
 864{
 865        dev_put(tun->dev);
 866}
 867
 868/* TAP filtering */
 869static void addr_hash_set(u32 *mask, const u8 *addr)
 870{
 871        int n = ether_crc(ETH_ALEN, addr) >> 26;
 872        mask[n >> 5] |= (1 << (n & 31));
 873}
 874
 875static unsigned int addr_hash_test(const u32 *mask, const u8 *addr)
 876{
 877        int n = ether_crc(ETH_ALEN, addr) >> 26;
 878        return mask[n >> 5] & (1 << (n & 31));
 879}
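
/* The two helpers above implement a 64-bit multicast hash filter in the
 * style of classic Ethernet NICs: the top six bits of the CRC-32 of the
 * address (n = crc >> 26, i.e. 0..63) select one bit in mask[], with
 * n >> 5 picking the u32 word and n & 31 the bit within that word.
 */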
 880
 881static int update_filter(struct tap_filter *filter, void __user *arg)
 882{
 883        struct { u8 u[ETH_ALEN]; } *addr;
 884        struct tun_filter uf;
 885        int err, alen, n, nexact;
 886
 887        if (copy_from_user(&uf, arg, sizeof(uf)))
 888                return -EFAULT;
 889
 890        if (!uf.count) {
 891                /* Disabled */
 892                filter->count = 0;
 893                return 0;
 894        }
 895
 896        alen = ETH_ALEN * uf.count;
 897        addr = memdup_user(arg + sizeof(uf), alen);
 898        if (IS_ERR(addr))
 899                return PTR_ERR(addr);
 900
  901        /* The filter is updated without holding any locks, which is
  902         * perfectly safe: we disable it first and in the worst
  903         * case we'll accept a few undesired packets. */
 904        filter->count = 0;
 905        wmb();
 906
 907        /* Use first set of addresses as an exact filter */
 908        for (n = 0; n < uf.count && n < FLT_EXACT_COUNT; n++)
 909                memcpy(filter->addr[n], addr[n].u, ETH_ALEN);
 910
 911        nexact = n;
 912
  913        /* Remaining multicast addresses are hashed; a
  914         * unicast address will leave the filter disabled. */
 915        memset(filter->mask, 0, sizeof(filter->mask));
 916        for (; n < uf.count; n++) {
 917                if (!is_multicast_ether_addr(addr[n].u)) {
 918                        err = 0; /* no filter */
 919                        goto free_addr;
 920                }
 921                addr_hash_set(filter->mask, addr[n].u);
 922        }
 923
 924        /* For ALLMULTI just set the mask to all ones.
 925         * This overrides the mask populated above. */
 926        if ((uf.flags & TUN_FLT_ALLMULTI))
 927                memset(filter->mask, ~0, sizeof(filter->mask));
 928
 929        /* Now enable the filter */
 930        wmb();
 931        filter->count = nexact;
 932
 933        /* Return the number of exact filters */
 934        err = nexact;
 935free_addr:
 936        kfree(addr);
 937        return err;
 938}
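
/* update_filter() is reached via the TUNSETTXFILTER ioctl.  Userspace
 * passes a struct tun_filter immediately followed by 'count' Ethernet
 * addresses.  A minimal sketch of a caller (illustrative only, error
 * handling omitted):
 *
 *      struct {
 *              struct tun_filter fl;
 *              unsigned char addrs[1][ETH_ALEN];
 *      } req = {
 *              .fl    = { .flags = 0, .count = 1 },
 *              .addrs = { { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x01 } },
 *      };
 *
 *      ioctl(tap_fd, TUNSETTXFILTER, &req);
 */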
 939
 940/* Returns: 0 - drop, !=0 - accept */
 941static int run_filter(struct tap_filter *filter, const struct sk_buff *skb)
 942{
  943        /* Cannot use eth_hdr(skb) here because the mac header has not
  944         * been set yet at this point. */
 945        struct ethhdr *eh = (struct ethhdr *) skb->data;
 946        int i;
 947
 948        /* Exact match */
 949        for (i = 0; i < filter->count; i++)
 950                if (ether_addr_equal(eh->h_dest, filter->addr[i]))
 951                        return 1;
 952
 953        /* Inexact match (multicast only) */
 954        if (is_multicast_ether_addr(eh->h_dest))
 955                return addr_hash_test(filter->mask, eh->h_dest);
 956
 957        return 0;
 958}
 959
 960/*
 961 * Checks whether the packet is accepted or not.
 962 * Returns: 0 - drop, !=0 - accept
 963 */
 964static int check_filter(struct tap_filter *filter, const struct sk_buff *skb)
 965{
 966        if (!filter->count)
 967                return 1;
 968
 969        return run_filter(filter, skb);
 970}
 971
 972/* Network device part of the driver */
 973
 974static const struct ethtool_ops tun_ethtool_ops;
 975
 976/* Net device detach from fd. */
 977static void tun_net_uninit(struct net_device *dev)
 978{
 979        tun_detach_all(dev);
 980}
 981
 982/* Net device open. */
 983static int tun_net_open(struct net_device *dev)
 984{
 985        netif_tx_start_all_queues(dev);
 986
 987        return 0;
 988}
 989
 990/* Net device close. */
 991static int tun_net_close(struct net_device *dev)
 992{
 993        netif_tx_stop_all_queues(dev);
 994        return 0;
 995}
 996
 997/* Net device start xmit */
 998static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb)
 999{
1000#ifdef CONFIG_RPS
1001        if (tun->numqueues == 1 && static_branch_unlikely(&rps_needed)) {
1002                /* Select queue was not called for the skbuff, so we extract the
1003                 * RPS hash and save it into the flow_table here.
1004                 */
1005                struct tun_flow_entry *e;
1006                __u32 rxhash;
1007
1008                rxhash = __skb_get_hash_symmetric(skb);
1009                e = tun_flow_find(&tun->flows[tun_hashfn(rxhash)], rxhash);
1010                if (e)
1011                        tun_flow_save_rps_rxhash(e, rxhash);
1012        }
1013#endif
1014}
1015
1016static unsigned int run_ebpf_filter(struct tun_struct *tun,
1017                                    struct sk_buff *skb,
1018                                    int len)
1019{
1020        struct tun_prog *prog = rcu_dereference(tun->filter_prog);
1021
1022        if (prog)
1023                len = bpf_prog_run_clear_cb(prog->prog, skb);
1024
1025        return len;
1026}
1027
1028/* Net device start xmit */
1029static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
1030{
1031        struct tun_struct *tun = netdev_priv(dev);
1032        int txq = skb->queue_mapping;
1033        struct tun_file *tfile;
1034        int len = skb->len;
1035
1036        rcu_read_lock();
1037        tfile = rcu_dereference(tun->tfiles[txq]);
1038
1039        /* Drop packet if interface is not attached */
1040        if (!tfile)
1041                goto drop;
1042
1043        if (!rcu_dereference(tun->steering_prog))
1044                tun_automq_xmit(tun, skb);
1045
1046        netif_info(tun, tx_queued, tun->dev, "%s %d\n", __func__, skb->len);
1047
1048        /* Drop if the filter does not like it.
1049         * This is a noop if the filter is disabled.
1050         * Filter can be enabled only for the TAP devices. */
1051        if (!check_filter(&tun->txflt, skb))
1052                goto drop;
1053
1054        if (tfile->socket.sk->sk_filter &&
1055            sk_filter(tfile->socket.sk, skb))
1056                goto drop;
1057
1058        len = run_ebpf_filter(tun, skb, len);
1059        if (len == 0 || pskb_trim(skb, len))
1060                goto drop;
1061
1062        if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
1063                goto drop;
1064
1065        skb_tx_timestamp(skb);
1066
 1067        /* Orphan the skb - required as we might hang on to it
 1068         * for an indefinite time.
 1069         */
1070        skb_orphan(skb);
1071
1072        nf_reset_ct(skb);
1073
1074        if (ptr_ring_produce(&tfile->tx_ring, skb))
1075                goto drop;
1076
1077        /* Notify and wake up reader process */
1078        if (tfile->flags & TUN_FASYNC)
1079                kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
1080        tfile->socket.sk->sk_data_ready(tfile->socket.sk);
1081
1082        rcu_read_unlock();
1083        return NETDEV_TX_OK;
1084
1085drop:
1086        this_cpu_inc(tun->pcpu_stats->tx_dropped);
1087        skb_tx_error(skb);
1088        kfree_skb(skb);
1089        rcu_read_unlock();
1090        return NET_XMIT_DROP;
1091}
1092
1093static void tun_net_mclist(struct net_device *dev)
1094{
1095        /*
1096         * This callback is supposed to deal with mc filter in
1097         * _rx_ path and has nothing to do with the _tx_ path.
1098         * In rx path we always accept everything userspace gives us.
1099         */
1100}
1101
1102static netdev_features_t tun_net_fix_features(struct net_device *dev,
1103        netdev_features_t features)
1104{
1105        struct tun_struct *tun = netdev_priv(dev);
1106
1107        return (features & tun->set_features) | (features & ~TUN_USER_FEATURES);
1108}
1109
1110static void tun_set_headroom(struct net_device *dev, int new_hr)
1111{
1112        struct tun_struct *tun = netdev_priv(dev);
1113
1114        if (new_hr < NET_SKB_PAD)
1115                new_hr = NET_SKB_PAD;
1116
1117        tun->align = new_hr;
1118}
1119
1120static void
1121tun_net_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
1122{
1123        u32 rx_dropped = 0, tx_dropped = 0, rx_frame_errors = 0;
1124        struct tun_struct *tun = netdev_priv(dev);
1125        struct tun_pcpu_stats *p;
1126        int i;
1127
1128        for_each_possible_cpu(i) {
1129                u64 rxpackets, rxbytes, txpackets, txbytes;
1130                unsigned int start;
1131
1132                p = per_cpu_ptr(tun->pcpu_stats, i);
1133                do {
1134                        start = u64_stats_fetch_begin(&p->syncp);
1135                        rxpackets       = u64_stats_read(&p->rx_packets);
1136                        rxbytes         = u64_stats_read(&p->rx_bytes);
1137                        txpackets       = u64_stats_read(&p->tx_packets);
1138                        txbytes         = u64_stats_read(&p->tx_bytes);
1139                } while (u64_stats_fetch_retry(&p->syncp, start));
1140
1141                stats->rx_packets       += rxpackets;
1142                stats->rx_bytes         += rxbytes;
1143                stats->tx_packets       += txpackets;
1144                stats->tx_bytes         += txbytes;
1145
1146                /* u32 counters */
1147                rx_dropped      += p->rx_dropped;
1148                rx_frame_errors += p->rx_frame_errors;
1149                tx_dropped      += p->tx_dropped;
1150        }
1151        stats->rx_dropped  = rx_dropped;
1152        stats->rx_frame_errors = rx_frame_errors;
1153        stats->tx_dropped = tx_dropped;
1154}
1155
1156static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog,
1157                       struct netlink_ext_ack *extack)
1158{
1159        struct tun_struct *tun = netdev_priv(dev);
1160        struct tun_file *tfile;
1161        struct bpf_prog *old_prog;
1162        int i;
1163
1164        old_prog = rtnl_dereference(tun->xdp_prog);
1165        rcu_assign_pointer(tun->xdp_prog, prog);
1166        if (old_prog)
1167                bpf_prog_put(old_prog);
1168
1169        for (i = 0; i < tun->numqueues; i++) {
1170                tfile = rtnl_dereference(tun->tfiles[i]);
1171                if (prog)
1172                        sock_set_flag(&tfile->sk, SOCK_XDP);
1173                else
1174                        sock_reset_flag(&tfile->sk, SOCK_XDP);
1175        }
1176        list_for_each_entry(tfile, &tun->disabled, next) {
1177                if (prog)
1178                        sock_set_flag(&tfile->sk, SOCK_XDP);
1179                else
1180                        sock_reset_flag(&tfile->sk, SOCK_XDP);
1181        }
1182
1183        return 0;
1184}
1185
1186static u32 tun_xdp_query(struct net_device *dev)
1187{
1188        struct tun_struct *tun = netdev_priv(dev);
1189        const struct bpf_prog *xdp_prog;
1190
1191        xdp_prog = rtnl_dereference(tun->xdp_prog);
1192        if (xdp_prog)
1193                return xdp_prog->aux->id;
1194
1195        return 0;
1196}
1197
1198static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp)
1199{
1200        switch (xdp->command) {
1201        case XDP_SETUP_PROG:
1202                return tun_xdp_set(dev, xdp->prog, xdp->extack);
1203        case XDP_QUERY_PROG:
1204                xdp->prog_id = tun_xdp_query(dev);
1205                return 0;
1206        default:
1207                return -EINVAL;
1208        }
1209}
1210
1211static int tun_net_change_carrier(struct net_device *dev, bool new_carrier)
1212{
1213        if (new_carrier) {
1214                struct tun_struct *tun = netdev_priv(dev);
1215
1216                if (!tun->numqueues)
1217                        return -EPERM;
1218
1219                netif_carrier_on(dev);
1220        } else {
1221                netif_carrier_off(dev);
1222        }
1223        return 0;
1224}
1225
1226static const struct net_device_ops tun_netdev_ops = {
1227        .ndo_uninit             = tun_net_uninit,
1228        .ndo_open               = tun_net_open,
1229        .ndo_stop               = tun_net_close,
1230        .ndo_start_xmit         = tun_net_xmit,
1231        .ndo_fix_features       = tun_net_fix_features,
1232        .ndo_select_queue       = tun_select_queue,
1233        .ndo_set_rx_headroom    = tun_set_headroom,
1234        .ndo_get_stats64        = tun_net_get_stats64,
1235        .ndo_change_carrier     = tun_net_change_carrier,
1236};
1237
1238static void __tun_xdp_flush_tfile(struct tun_file *tfile)
1239{
1240        /* Notify and wake up reader process */
1241        if (tfile->flags & TUN_FASYNC)
1242                kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
1243        tfile->socket.sk->sk_data_ready(tfile->socket.sk);
1244}
1245
1246static int tun_xdp_xmit(struct net_device *dev, int n,
1247                        struct xdp_frame **frames, u32 flags)
1248{
1249        struct tun_struct *tun = netdev_priv(dev);
1250        struct tun_file *tfile;
1251        u32 numqueues;
1252        int drops = 0;
1253        int cnt = n;
1254        int i;
1255
1256        if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
1257                return -EINVAL;
1258
1259        rcu_read_lock();
1260
1261resample:
1262        numqueues = READ_ONCE(tun->numqueues);
1263        if (!numqueues) {
1264                rcu_read_unlock();
1265                return -ENXIO; /* Caller will free/return all frames */
1266        }
1267
1268        tfile = rcu_dereference(tun->tfiles[smp_processor_id() %
1269                                            numqueues]);
1270        if (unlikely(!tfile))
1271                goto resample;
1272
1273        spin_lock(&tfile->tx_ring.producer_lock);
1274        for (i = 0; i < n; i++) {
1275                struct xdp_frame *xdp = frames[i];
 1276                /* Encode the XDP flag into the lowest bit so the consumer
 1277                 * can distinguish an XDP frame from an sk_buff.
 1278                 */
1279                void *frame = tun_xdp_to_ptr(xdp);
1280
1281                if (__ptr_ring_produce(&tfile->tx_ring, frame)) {
1282                        this_cpu_inc(tun->pcpu_stats->tx_dropped);
1283                        xdp_return_frame_rx_napi(xdp);
1284                        drops++;
1285                }
1286        }
1287        spin_unlock(&tfile->tx_ring.producer_lock);
1288
1289        if (flags & XDP_XMIT_FLUSH)
1290                __tun_xdp_flush_tfile(tfile);
1291
1292        rcu_read_unlock();
1293        return cnt - drops;
1294}
1295
1296static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
1297{
1298        struct xdp_frame *frame = convert_to_xdp_frame(xdp);
1299
1300        if (unlikely(!frame))
1301                return -EOVERFLOW;
1302
1303        return tun_xdp_xmit(dev, 1, &frame, XDP_XMIT_FLUSH);
1304}
1305
1306static const struct net_device_ops tap_netdev_ops = {
1307        .ndo_uninit             = tun_net_uninit,
1308        .ndo_open               = tun_net_open,
1309        .ndo_stop               = tun_net_close,
1310        .ndo_start_xmit         = tun_net_xmit,
1311        .ndo_fix_features       = tun_net_fix_features,
1312        .ndo_set_rx_mode        = tun_net_mclist,
1313        .ndo_set_mac_address    = eth_mac_addr,
1314        .ndo_validate_addr      = eth_validate_addr,
1315        .ndo_select_queue       = tun_select_queue,
1316        .ndo_features_check     = passthru_features_check,
1317        .ndo_set_rx_headroom    = tun_set_headroom,
1318        .ndo_get_stats64        = tun_net_get_stats64,
1319        .ndo_bpf                = tun_xdp,
1320        .ndo_xdp_xmit           = tun_xdp_xmit,
1321        .ndo_change_carrier     = tun_net_change_carrier,
1322};
1323
1324static void tun_flow_init(struct tun_struct *tun)
1325{
1326        int i;
1327
1328        for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++)
1329                INIT_HLIST_HEAD(&tun->flows[i]);
1330
1331        tun->ageing_time = TUN_FLOW_EXPIRE;
1332        timer_setup(&tun->flow_gc_timer, tun_flow_cleanup, 0);
1333        mod_timer(&tun->flow_gc_timer,
1334                  round_jiffies_up(jiffies + tun->ageing_time));
1335}
1336
1337static void tun_flow_uninit(struct tun_struct *tun)
1338{
1339        del_timer_sync(&tun->flow_gc_timer);
1340        tun_flow_flush(tun);
1341}
1342
1343#define MIN_MTU 68
1344#define MAX_MTU 65535
1345
1346/* Initialize net device. */
1347static void tun_net_init(struct net_device *dev)
1348{
1349        struct tun_struct *tun = netdev_priv(dev);
1350
1351        switch (tun->flags & TUN_TYPE_MASK) {
1352        case IFF_TUN:
1353                dev->netdev_ops = &tun_netdev_ops;
1354
1355                /* Point-to-Point TUN Device */
1356                dev->hard_header_len = 0;
1357                dev->addr_len = 0;
1358                dev->mtu = 1500;
1359
1360                /* Zero header length */
1361                dev->type = ARPHRD_NONE;
1362                dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
1363                break;
1364
1365        case IFF_TAP:
1366                dev->netdev_ops = &tap_netdev_ops;
1367                /* Ethernet TAP Device */
1368                ether_setup(dev);
1369                dev->priv_flags &= ~IFF_TX_SKB_SHARING;
1370                dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
1371
1372                eth_hw_addr_random(dev);
1373
1374                break;
1375        }
1376
1377        dev->min_mtu = MIN_MTU;
1378        dev->max_mtu = MAX_MTU - dev->hard_header_len;
1379}
1380
1381static bool tun_sock_writeable(struct tun_struct *tun, struct tun_file *tfile)
1382{
1383        struct sock *sk = tfile->socket.sk;
1384
1385        return (tun->dev->flags & IFF_UP) && sock_writeable(sk);
1386}
1387
1388/* Character device part */
1389
1390/* Poll */
1391static __poll_t tun_chr_poll(struct file *file, poll_table *wait)
1392{
1393        struct tun_file *tfile = file->private_data;
1394        struct tun_struct *tun = tun_get(tfile);
1395        struct sock *sk;
1396        __poll_t mask = 0;
1397
1398        if (!tun)
1399                return EPOLLERR;
1400
1401        sk = tfile->socket.sk;
1402
1403        poll_wait(file, sk_sleep(sk), wait);
1404
1405        if (!ptr_ring_empty(&tfile->tx_ring))
1406                mask |= EPOLLIN | EPOLLRDNORM;
1407
 1408        /* Make sure SOCKWQ_ASYNC_NOSPACE is set if the socket is not
 1409         * writable, to guarantee that EPOLLOUT is raised either here or by
 1410         * tun_sock_write_space(). That way a process still gets a
 1411         * notification after writing to a device that is down and getting
 1412         * -EIO.
 1413         */
1413        if (tun_sock_writeable(tun, tfile) ||
1414            (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags) &&
1415             tun_sock_writeable(tun, tfile)))
1416                mask |= EPOLLOUT | EPOLLWRNORM;
1417
1418        if (tun->dev->reg_state != NETREG_REGISTERED)
1419                mask = EPOLLERR;
1420
1421        tun_put(tun);
1422        return mask;
1423}
1424
1425static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile,
1426                                            size_t len,
1427                                            const struct iov_iter *it)
1428{
1429        struct sk_buff *skb;
1430        size_t linear;
1431        int err;
1432        int i;
1433
1434        if (it->nr_segs > MAX_SKB_FRAGS + 1)
1435                return ERR_PTR(-ENOMEM);
1436
1437        local_bh_disable();
1438        skb = napi_get_frags(&tfile->napi);
1439        local_bh_enable();
1440        if (!skb)
1441                return ERR_PTR(-ENOMEM);
1442
1443        linear = iov_iter_single_seg_count(it);
1444        err = __skb_grow(skb, linear);
1445        if (err)
1446                goto free;
1447
1448        skb->len = len;
1449        skb->data_len = len - linear;
1450        skb->truesize += skb->data_len;
1451
1452        for (i = 1; i < it->nr_segs; i++) {
1453                size_t fragsz = it->iov[i].iov_len;
1454                struct page *page;
1455                void *frag;
1456
1457                if (fragsz == 0 || fragsz > PAGE_SIZE) {
1458                        err = -EINVAL;
1459                        goto free;
1460                }
1461                frag = netdev_alloc_frag(fragsz);
1462                if (!frag) {
1463                        err = -ENOMEM;
1464                        goto free;
1465                }
1466                page = virt_to_head_page(frag);
1467                skb_fill_page_desc(skb, i - 1, page,
1468                                   frag - page_address(page), fragsz);
1469        }
1470
1471        return skb;
1472free:
1473        /* frees skb and all frags allocated with napi_alloc_frag() */
1474        napi_free_frags(&tfile->napi);
1475        return ERR_PTR(err);
1476}
1477
1478/* prepad is the amount to reserve at front.  len is length after that.
1479 * linear is a hint as to how much to copy (usually headers). */
1480static struct sk_buff *tun_alloc_skb(struct tun_file *tfile,
1481                                     size_t prepad, size_t len,
1482                                     size_t linear, int noblock)
1483{
1484        struct sock *sk = tfile->socket.sk;
1485        struct sk_buff *skb;
1486        int err;
1487
1488        /* Under a page?  Don't bother with paged skb. */
1489        if (prepad + len < PAGE_SIZE || !linear)
1490                linear = len;
1491
1492        skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
1493                                   &err, 0);
1494        if (!skb)
1495                return ERR_PTR(err);
1496
1497        skb_reserve(skb, prepad);
1498        skb_put(skb, linear);
1499        skb->data_len = len - linear;
1500        skb->len += len - linear;
1501
1502        return skb;
1503}
1504
1505static void tun_rx_batched(struct tun_struct *tun, struct tun_file *tfile,
1506                           struct sk_buff *skb, int more)
1507{
1508        struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
1509        struct sk_buff_head process_queue;
1510        u32 rx_batched = tun->rx_batched;
1511        bool rcv = false;
1512
1513        if (!rx_batched || (!more && skb_queue_empty(queue))) {
1514                local_bh_disable();
1515                skb_record_rx_queue(skb, tfile->queue_index);
1516                netif_receive_skb(skb);
1517                local_bh_enable();
1518                return;
1519        }
1520
1521        spin_lock(&queue->lock);
1522        if (!more || skb_queue_len(queue) == rx_batched) {
1523                __skb_queue_head_init(&process_queue);
1524                skb_queue_splice_tail_init(queue, &process_queue);
1525                rcv = true;
1526        } else {
1527                __skb_queue_tail(queue, skb);
1528        }
1529        spin_unlock(&queue->lock);
1530
1531        if (rcv) {
1532                struct sk_buff *nskb;
1533
1534                local_bh_disable();
1535                while ((nskb = __skb_dequeue(&process_queue))) {
1536                        skb_record_rx_queue(nskb, tfile->queue_index);
1537                        netif_receive_skb(nskb);
1538                }
1539                skb_record_rx_queue(skb, tfile->queue_index);
1540                netif_receive_skb(skb);
1541                local_bh_enable();
1542        }
1543}
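
/* tun_rx_batched() trades a little latency for throughput: while the writer
 * keeps signalling that more data follows (the 'more' flag, driven by
 * MSG_MORE on sendmsg), packets accumulate on sk_write_queue until the
 * batch reaches tun->rx_batched, and only then is the whole batch pushed
 * through netif_receive_skb() inside a single local_bh_disable() section.
 */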
1544
1545static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile,
1546                              int len, int noblock, bool zerocopy)
1547{
1548        if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
1549                return false;
1550
1551        if (tfile->socket.sk->sk_sndbuf != INT_MAX)
1552                return false;
1553
1554        if (!noblock)
1555                return false;
1556
1557        if (zerocopy)
1558                return false;
1559
1560        if (SKB_DATA_ALIGN(len + TUN_RX_PAD) +
1561            SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE)
1562                return false;
1563
1564        return true;
1565}
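
/* The checks above restrict the build_skb() fast path to the case where
 * the padded payload plus the trailing struct skb_shared_info still fit in
 * one page, since tun_build_skb() backs the skb with a single page
 * fragment taken from current->task_frag.
 */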
1566
1567static struct sk_buff *__tun_build_skb(struct tun_file *tfile,
1568                                       struct page_frag *alloc_frag, char *buf,
1569                                       int buflen, int len, int pad)
1570{
1571        struct sk_buff *skb = build_skb(buf, buflen);
1572
1573        if (!skb)
1574                return ERR_PTR(-ENOMEM);
1575
1576        skb_reserve(skb, pad);
1577        skb_put(skb, len);
1578        skb_set_owner_w(skb, tfile->socket.sk);
1579
1580        get_page(alloc_frag->page);
1581        alloc_frag->offset += buflen;
1582
1583        return skb;
1584}
1585
1586static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog,
1587                       struct xdp_buff *xdp, u32 act)
1588{
1589        int err;
1590
1591        switch (act) {
1592        case XDP_REDIRECT:
1593                err = xdp_do_redirect(tun->dev, xdp, xdp_prog);
1594                if (err)
1595                        return err;
1596                break;
1597        case XDP_TX:
1598                err = tun_xdp_tx(tun->dev, xdp);
1599                if (err < 0)
1600                        return err;
1601                break;
1602        case XDP_PASS:
1603                break;
1604        default:
1605                bpf_warn_invalid_xdp_action(act);
1606                /* fall through */
1607        case XDP_ABORTED:
1608                trace_xdp_exception(tun->dev, xdp_prog, act);
1609                /* fall through */
1610        case XDP_DROP:
1611                this_cpu_inc(tun->pcpu_stats->rx_dropped);
1612                break;
1613        }
1614
1615        return act;
1616}
1617
1618static struct sk_buff *tun_build_skb(struct tun_struct *tun,
1619                                     struct tun_file *tfile,
1620                                     struct iov_iter *from,
1621                                     struct virtio_net_hdr *hdr,
1622                                     int len, int *skb_xdp)
1623{
1624        struct page_frag *alloc_frag = &current->task_frag;
1625        struct bpf_prog *xdp_prog;
1626        int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
1627        char *buf;
1628        size_t copied;
1629        int pad = TUN_RX_PAD;
1630        int err = 0;
1631
1632        rcu_read_lock();
1633        xdp_prog = rcu_dereference(tun->xdp_prog);
1634        if (xdp_prog)
1635                pad += XDP_PACKET_HEADROOM;
1636        buflen += SKB_DATA_ALIGN(len + pad);
1637        rcu_read_unlock();
1638
1639        alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
1640        if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL)))
1641                return ERR_PTR(-ENOMEM);
1642
1643        buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
1644        copied = copy_page_from_iter(alloc_frag->page,
1645                                     alloc_frag->offset + pad,
1646                                     len, from);
1647        if (copied != len)
1648                return ERR_PTR(-EFAULT);
1649
1650        /* There's a small window in which an XDP program may be attached
1651         * after the check of xdp_prog above; this should be rare, and for
1652         * simplicity we run XDP on the skb in case the headroom is not enough.
1653         */
1654        if (hdr->gso_type || !xdp_prog) {
1655                *skb_xdp = 1;
1656                return __tun_build_skb(tfile, alloc_frag, buf, buflen, len,
1657                                       pad);
1658        }
1659
1660        *skb_xdp = 0;
1661
1662        local_bh_disable();
1663        rcu_read_lock();
1664        xdp_prog = rcu_dereference(tun->xdp_prog);
1665        if (xdp_prog) {
1666                struct xdp_buff xdp;
1667                u32 act;
1668
1669                xdp.data_hard_start = buf;
1670                xdp.data = buf + pad;
1671                xdp_set_data_meta_invalid(&xdp);
1672                xdp.data_end = xdp.data + len;
1673                xdp.rxq = &tfile->xdp_rxq;
1674
1675                act = bpf_prog_run_xdp(xdp_prog, &xdp);
1676                if (act == XDP_REDIRECT || act == XDP_TX) {
1677                        get_page(alloc_frag->page);
1678                        alloc_frag->offset += buflen;
1679                }
1680                err = tun_xdp_act(tun, xdp_prog, &xdp, act);
1681                if (err < 0) {
1682                        if (act == XDP_REDIRECT || act == XDP_TX)
1683                                put_page(alloc_frag->page);
1684                        goto out;
1685                }
1686
1687                if (err == XDP_REDIRECT)
1688                        xdp_do_flush();
1689                if (err != XDP_PASS)
1690                        goto out;
1691
1692                pad = xdp.data - xdp.data_hard_start;
1693                len = xdp.data_end - xdp.data;
1694        }
1695        rcu_read_unlock();
1696        local_bh_enable();
1697
1698        return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, pad);
1699
1700out:
1701        rcu_read_unlock();
1702        local_bh_enable();
1703        return NULL;
1704}
1705
1706/* Get packet from user space buffer */
1707static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
1708                            void *msg_control, struct iov_iter *from,
1709                            int noblock, bool more)
1710{
1711        struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) };
1712        struct sk_buff *skb;
1713        size_t total_len = iov_iter_count(from);
1714        size_t len = total_len, align = tun->align, linear;
1715        struct virtio_net_hdr gso = { 0 };
1716        struct tun_pcpu_stats *stats;
1717        int good_linear;
1718        int copylen;
1719        bool zerocopy = false;
1720        int err;
1721        u32 rxhash = 0;
1722        int skb_xdp = 1;
1723        bool frags = tun_napi_frags_enabled(tfile);
1724
1725        if (!(tun->flags & IFF_NO_PI)) {
1726                if (len < sizeof(pi))
1727                        return -EINVAL;
1728                len -= sizeof(pi);
1729
1730                if (!copy_from_iter_full(&pi, sizeof(pi), from))
1731                        return -EFAULT;
1732        }
1733
1734        if (tun->flags & IFF_VNET_HDR) {
1735                int vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
1736
1737                if (len < vnet_hdr_sz)
1738                        return -EINVAL;
1739                len -= vnet_hdr_sz;
1740
1741                if (!copy_from_iter_full(&gso, sizeof(gso), from))
1742                        return -EFAULT;
1743
1744                if ((gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
1745                    tun16_to_cpu(tun, gso.csum_start) + tun16_to_cpu(tun, gso.csum_offset) + 2 > tun16_to_cpu(tun, gso.hdr_len))
1746                        gso.hdr_len = cpu_to_tun16(tun, tun16_to_cpu(tun, gso.csum_start) + tun16_to_cpu(tun, gso.csum_offset) + 2);
1747
1748                if (tun16_to_cpu(tun, gso.hdr_len) > len)
1749                        return -EINVAL;
1750                iov_iter_advance(from, vnet_hdr_sz - sizeof(gso));
1751        }
1752
1753        if ((tun->flags & TUN_TYPE_MASK) == IFF_TAP) {
1754                align += NET_IP_ALIGN;
1755                if (unlikely(len < ETH_HLEN ||
1756                             (gso.hdr_len && tun16_to_cpu(tun, gso.hdr_len) < ETH_HLEN)))
1757                        return -EINVAL;
1758        }
1759
1760        good_linear = SKB_MAX_HEAD(align);
1761
1762        if (msg_control) {
1763                struct iov_iter i = *from;
1764
1765        /* There are 256 bytes to be copied into the skb, so there is
1766         * enough room to expand the skb head in case it is needed.
1767         * The rest of the buffer is mapped from userspace.
1768         */
1769                copylen = gso.hdr_len ? tun16_to_cpu(tun, gso.hdr_len) : GOODCOPY_LEN;
1770                if (copylen > good_linear)
1771                        copylen = good_linear;
1772                linear = copylen;
1773                iov_iter_advance(&i, copylen);
1774                if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS)
1775                        zerocopy = true;
1776        }
1777
1778        if (!frags && tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
1779                /* For packets that are not easy to process here
1780                 * (e.g. GSO or jumbo packets), do it after the skb
1781                 * has been created, with the generic XDP routine.
1782                 */
1783                skb = tun_build_skb(tun, tfile, from, &gso, len, &skb_xdp);
1784                if (IS_ERR(skb)) {
1785                        this_cpu_inc(tun->pcpu_stats->rx_dropped);
1786                        return PTR_ERR(skb);
1787                }
1788                if (!skb)
1789                        return total_len;
1790        } else {
1791                if (!zerocopy) {
1792                        copylen = len;
1793                        if (tun16_to_cpu(tun, gso.hdr_len) > good_linear)
1794                                linear = good_linear;
1795                        else
1796                                linear = tun16_to_cpu(tun, gso.hdr_len);
1797                }
1798
1799                if (frags) {
1800                        mutex_lock(&tfile->napi_mutex);
1801                        skb = tun_napi_alloc_frags(tfile, copylen, from);
1802                        /* tun_napi_alloc_frags() enforces a layout for the skb.
1803                         * If zerocopy is enabled, then this layout will be
1804                         * overwritten by zerocopy_sg_from_iter().
1805                         */
1806                        zerocopy = false;
1807                } else {
1808                        skb = tun_alloc_skb(tfile, align, copylen, linear,
1809                                            noblock);
1810                }
1811
1812                if (IS_ERR(skb)) {
1813                        if (PTR_ERR(skb) != -EAGAIN)
1814                                this_cpu_inc(tun->pcpu_stats->rx_dropped);
1815                        if (frags)
1816                                mutex_unlock(&tfile->napi_mutex);
1817                        return PTR_ERR(skb);
1818                }
1819
1820                if (zerocopy)
1821                        err = zerocopy_sg_from_iter(skb, from);
1822                else
1823                        err = skb_copy_datagram_from_iter(skb, 0, from, len);
1824
1825                if (err) {
1826                        err = -EFAULT;
1827drop:
1828                        this_cpu_inc(tun->pcpu_stats->rx_dropped);
1829                        kfree_skb(skb);
1830                        if (frags) {
1831                                tfile->napi.skb = NULL;
1832                                mutex_unlock(&tfile->napi_mutex);
1833                        }
1834
1835                        return err;
1836                }
1837        }
1838
1839        if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun))) {
1840                this_cpu_inc(tun->pcpu_stats->rx_frame_errors);
1841                kfree_skb(skb);
1842                if (frags) {
1843                        tfile->napi.skb = NULL;
1844                        mutex_unlock(&tfile->napi_mutex);
1845                }
1846
1847                return -EINVAL;
1848        }
1849
1850        switch (tun->flags & TUN_TYPE_MASK) {
1851        case IFF_TUN:
1852                if (tun->flags & IFF_NO_PI) {
1853                        u8 ip_version = skb->len ? (skb->data[0] >> 4) : 0;
1854
1855                        switch (ip_version) {
1856                        case 4:
1857                                pi.proto = htons(ETH_P_IP);
1858                                break;
1859                        case 6:
1860                                pi.proto = htons(ETH_P_IPV6);
1861                                break;
1862                        default:
1863                                this_cpu_inc(tun->pcpu_stats->rx_dropped);
1864                                kfree_skb(skb);
1865                                return -EINVAL;
1866                        }
1867                }
1868
1869                skb_reset_mac_header(skb);
1870                skb->protocol = pi.proto;
1871                skb->dev = tun->dev;
1872                break;
1873        case IFF_TAP:
1874                if (!frags)
1875                        skb->protocol = eth_type_trans(skb, tun->dev);
1876                break;
1877        }
1878
1879        /* copy skb_ubuf_info for callback when skb has no error */
1880        if (zerocopy) {
1881                skb_shinfo(skb)->destructor_arg = msg_control;
1882                skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
1883                skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
1884        } else if (msg_control) {
1885                struct ubuf_info *uarg = msg_control;
1886                uarg->callback(uarg, false);
1887        }
1888
1889        skb_reset_network_header(skb);
1890        skb_probe_transport_header(skb);
1891        skb_record_rx_queue(skb, tfile->queue_index);
1892
1893        if (skb_xdp) {
1894                struct bpf_prog *xdp_prog;
1895                int ret;
1896
1897                local_bh_disable();
1898                rcu_read_lock();
1899                xdp_prog = rcu_dereference(tun->xdp_prog);
1900                if (xdp_prog) {
1901                        ret = do_xdp_generic(xdp_prog, skb);
1902                        if (ret != XDP_PASS) {
1903                                rcu_read_unlock();
1904                                local_bh_enable();
1905                                if (frags) {
1906                                        tfile->napi.skb = NULL;
1907                                        mutex_unlock(&tfile->napi_mutex);
1908                                }
1909                                return total_len;
1910                        }
1911                }
1912                rcu_read_unlock();
1913                local_bh_enable();
1914        }
1915
1916        /* Compute the costly rx hash only if needed for flow updates.
1917         * There is a very small possibility of out-of-order delivery while
1918         * switching, but it is not worth optimizing for.
1919         */
1920        if (!rcu_access_pointer(tun->steering_prog) && tun->numqueues > 1 &&
1921            !tfile->detached)
1922                rxhash = __skb_get_hash_symmetric(skb);
1923
1924        rcu_read_lock();
1925        if (unlikely(!(tun->dev->flags & IFF_UP))) {
1926                err = -EIO;
1927                rcu_read_unlock();
1928                goto drop;
1929        }
1930
1931        if (frags) {
1932                /* Exercise flow dissector code path. */
1933                u32 headlen = eth_get_headlen(tun->dev, skb->data,
1934                                              skb_headlen(skb));
1935
1936                if (unlikely(headlen > skb_headlen(skb))) {
1937                        this_cpu_inc(tun->pcpu_stats->rx_dropped);
1938                        napi_free_frags(&tfile->napi);
1939                        rcu_read_unlock();
1940                        mutex_unlock(&tfile->napi_mutex);
1941                        WARN_ON(1);
1942                        return -ENOMEM;
1943                }
1944
1945                local_bh_disable();
1946                napi_gro_frags(&tfile->napi);
1947                local_bh_enable();
1948                mutex_unlock(&tfile->napi_mutex);
1949        } else if (tfile->napi_enabled) {
1950                struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
1951                int queue_len;
1952
1953                spin_lock_bh(&queue->lock);
1954                __skb_queue_tail(queue, skb);
1955                queue_len = skb_queue_len(queue);
1956                spin_unlock(&queue->lock);
1957
1958                if (!more || queue_len > NAPI_POLL_WEIGHT)
1959                        napi_schedule(&tfile->napi);
1960
1961                local_bh_enable();
1962        } else if (!IS_ENABLED(CONFIG_4KSTACKS)) {
1963                tun_rx_batched(tun, tfile, skb, more);
1964        } else {
1965                netif_rx_ni(skb);
1966        }
1967        rcu_read_unlock();
1968
1969        stats = get_cpu_ptr(tun->pcpu_stats);
1970        u64_stats_update_begin(&stats->syncp);
1971        u64_stats_inc(&stats->rx_packets);
1972        u64_stats_add(&stats->rx_bytes, len);
1973        u64_stats_update_end(&stats->syncp);
1974        put_cpu_ptr(stats);
1975
1976        if (rxhash)
1977                tun_flow_update(tun, rxhash, tfile);
1978
1979        return total_len;
1980}
1981
1982static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from)
1983{
1984        struct file *file = iocb->ki_filp;
1985        struct tun_file *tfile = file->private_data;
1986        struct tun_struct *tun = tun_get(tfile);
1987        ssize_t result;
1988
1989        if (!tun)
1990                return -EBADFD;
1991
1992        result = tun_get_user(tun, tfile, NULL, from,
1993                              file->f_flags & O_NONBLOCK, false);
1994
1995        tun_put(tun);
1996        return result;
1997}
1998
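    /* Copy an XDP frame out to user space, prepending a zeroed virtio_net
     * header when IFF_VNET_HDR is set, and account it in the per-CPU tx
     * stats.
     */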
1999static ssize_t tun_put_user_xdp(struct tun_struct *tun,
2000                                struct tun_file *tfile,
2001                                struct xdp_frame *xdp_frame,
2002                                struct iov_iter *iter)
2003{
2004        int vnet_hdr_sz = 0;
2005        size_t size = xdp_frame->len;
2006        struct tun_pcpu_stats *stats;
2007        size_t ret;
2008
2009        if (tun->flags & IFF_VNET_HDR) {
2010                struct virtio_net_hdr gso = { 0 };
2011
2012                vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
2013                if (unlikely(iov_iter_count(iter) < vnet_hdr_sz))
2014                        return -EINVAL;
2015                if (unlikely(copy_to_iter(&gso, sizeof(gso), iter) !=
2016                             sizeof(gso)))
2017                        return -EFAULT;
2018                iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
2019        }
2020
2021        ret = copy_to_iter(xdp_frame->data, size, iter) + vnet_hdr_sz;
2022
2023        stats = get_cpu_ptr(tun->pcpu_stats);
2024        u64_stats_update_begin(&stats->syncp);
2025        u64_stats_inc(&stats->tx_packets);
2026        u64_stats_add(&stats->tx_bytes, ret);
2027        u64_stats_update_end(&stats->syncp);
2028        put_cpu_ptr(tun->pcpu_stats);
2029
2030        return ret;
2031}
2032
2033/* Put packet to the user space buffer */
2034static ssize_t tun_put_user(struct tun_struct *tun,
2035                            struct tun_file *tfile,
2036                            struct sk_buff *skb,
2037                            struct iov_iter *iter)
2038{
2039        struct tun_pi pi = { 0, skb->protocol };
2040        struct tun_pcpu_stats *stats;
2041        ssize_t total;
2042        int vlan_offset = 0;
2043        int vlan_hlen = 0;
2044        int vnet_hdr_sz = 0;
2045
2046        if (skb_vlan_tag_present(skb))
2047                vlan_hlen = VLAN_HLEN;
2048
2049        if (tun->flags & IFF_VNET_HDR)
2050                vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
2051
2052        total = skb->len + vlan_hlen + vnet_hdr_sz;
2053
2054        if (!(tun->flags & IFF_NO_PI)) {
2055                if (iov_iter_count(iter) < sizeof(pi))
2056                        return -EINVAL;
2057
2058                total += sizeof(pi);
2059                if (iov_iter_count(iter) < total) {
2060                        /* Packet will be stripped */
2061                        pi.flags |= TUN_PKT_STRIP;
2062                }
2063
2064                if (copy_to_iter(&pi, sizeof(pi), iter) != sizeof(pi))
2065                        return -EFAULT;
2066        }
2067
2068        if (vnet_hdr_sz) {
2069                struct virtio_net_hdr gso;
2070
2071                if (iov_iter_count(iter) < vnet_hdr_sz)
2072                        return -EINVAL;
2073
2074                if (virtio_net_hdr_from_skb(skb, &gso,
2075                                            tun_is_little_endian(tun), true,
2076                                            vlan_hlen)) {
2077                        struct skb_shared_info *sinfo = skb_shinfo(skb);
2078                        pr_err("unexpected GSO type: "
2079                               "0x%x, gso_size %d, hdr_len %d\n",
2080                               sinfo->gso_type, tun16_to_cpu(tun, gso.gso_size),
2081                               tun16_to_cpu(tun, gso.hdr_len));
2082                        print_hex_dump(KERN_ERR, "tun: ",
2083                                       DUMP_PREFIX_NONE,
2084                                       16, 1, skb->head,
2085                                       min((int)tun16_to_cpu(tun, gso.hdr_len), 64), true);
2086                        WARN_ON_ONCE(1);
2087                        return -EINVAL;
2088                }
2089
2090                if (copy_to_iter(&gso, sizeof(gso), iter) != sizeof(gso))
2091                        return -EFAULT;
2092
2093                iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
2094        }
2095
2096        if (vlan_hlen) {
2097                int ret;
2098                struct veth veth;
2099
2100                veth.h_vlan_proto = skb->vlan_proto;
2101                veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb));
2102
2103                vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
2104
2105                ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset);
2106                if (ret || !iov_iter_count(iter))
2107                        goto done;
2108
2109                ret = copy_to_iter(&veth, sizeof(veth), iter);
2110                if (ret != sizeof(veth) || !iov_iter_count(iter))
2111                        goto done;
2112        }
2113
2114        skb_copy_datagram_iter(skb, vlan_offset, iter, skb->len - vlan_offset);
2115
2116done:
2117        /* caller is in process context, so use get_cpu_ptr() for stats */
2118        stats = get_cpu_ptr(tun->pcpu_stats);
2119        u64_stats_update_begin(&stats->syncp);
2120        u64_stats_inc(&stats->tx_packets);
2121        u64_stats_add(&stats->tx_bytes, skb->len + vlan_hlen);
2122        u64_stats_update_end(&stats->syncp);
2123        put_cpu_ptr(tun->pcpu_stats);
2124
2125        return total;
2126}
2127
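    /* Pull one packet pointer off the tx ptr_ring, sleeping interruptibly
     * while the ring is empty unless the caller asked for non-blocking
     * behaviour.
     */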
2128static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
2129{
2130        DECLARE_WAITQUEUE(wait, current);
2131        void *ptr = NULL;
2132        int error = 0;
2133
2134        ptr = ptr_ring_consume(&tfile->tx_ring);
2135        if (ptr)
2136                goto out;
2137        if (noblock) {
2138                error = -EAGAIN;
2139                goto out;
2140        }
2141
2142        add_wait_queue(&tfile->socket.wq.wait, &wait);
2143
2144        while (1) {
2145                set_current_state(TASK_INTERRUPTIBLE);
2146                ptr = ptr_ring_consume(&tfile->tx_ring);
2147                if (ptr)
2148                        break;
2149                if (signal_pending(current)) {
2150                        error = -ERESTARTSYS;
2151                        break;
2152                }
2153                if (tfile->socket.sk->sk_shutdown & RCV_SHUTDOWN) {
2154                        error = -EFAULT;
2155                        break;
2156                }
2157
2158                schedule();
2159        }
2160
2161        __set_current_state(TASK_RUNNING);
2162        remove_wait_queue(&tfile->socket.wq.wait, &wait);
2163
2164out:
2165        *err = error;
2166        return ptr;
2167}
2168
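    /* Common read path for read(2) and recvmsg(): fetch a pointer from the
     * ring unless one was handed in, then copy it out either as an XDP frame
     * or as an skb.
     */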
2169static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
2170                           struct iov_iter *to,
2171                           int noblock, void *ptr)
2172{
2173        ssize_t ret;
2174        int err;
2175
2176        if (!iov_iter_count(to)) {
2177                tun_ptr_free(ptr);
2178                return 0;
2179        }
2180
2181        if (!ptr) {
2182                /* Read frames from ring */
2183                ptr = tun_ring_recv(tfile, noblock, &err);
2184                if (!ptr)
2185                        return err;
2186        }
2187
2188        if (tun_is_xdp_frame(ptr)) {
2189                struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
2190
2191                ret = tun_put_user_xdp(tun, tfile, xdpf, to);
2192                xdp_return_frame(xdpf);
2193        } else {
2194                struct sk_buff *skb = ptr;
2195
2196                ret = tun_put_user(tun, tfile, skb, to);
2197                if (unlikely(ret < 0))
2198                        kfree_skb(skb);
2199                else
2200                        consume_skb(skb);
2201        }
2202
2203        return ret;
2204}
2205
2206static ssize_t tun_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
2207{
2208        struct file *file = iocb->ki_filp;
2209        struct tun_file *tfile = file->private_data;
2210        struct tun_struct *tun = tun_get(tfile);
2211        ssize_t len = iov_iter_count(to), ret;
2212
2213        if (!tun)
2214                return -EBADFD;
2215        ret = tun_do_read(tun, tfile, to, file->f_flags & O_NONBLOCK, NULL);
2216        ret = min_t(ssize_t, ret, len);
2217        if (ret > 0)
2218                iocb->ki_pos = ret;
2219        tun_put(tun);
2220        return ret;
2221}
2222
2223static void tun_prog_free(struct rcu_head *rcu)
2224{
2225        struct tun_prog *prog = container_of(rcu, struct tun_prog, rcu);
2226
2227        bpf_prog_destroy(prog->prog);
2228        kfree(prog);
2229}
2230
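    /* Swap one of the per-device BPF programs (steering or filter) under
     * tun->lock and free the old one after an RCU grace period.
     */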
2231static int __tun_set_ebpf(struct tun_struct *tun,
2232                          struct tun_prog __rcu **prog_p,
2233                          struct bpf_prog *prog)
2234{
2235        struct tun_prog *old, *new = NULL;
2236
2237        if (prog) {
2238                new = kmalloc(sizeof(*new), GFP_KERNEL);
2239                if (!new)
2240                        return -ENOMEM;
2241                new->prog = prog;
2242        }
2243
2244        spin_lock_bh(&tun->lock);
2245        old = rcu_dereference_protected(*prog_p,
2246                                        lockdep_is_held(&tun->lock));
2247        rcu_assign_pointer(*prog_p, new);
2248        spin_unlock_bh(&tun->lock);
2249
2250        if (old)
2251                call_rcu(&old->rcu, tun_prog_free);
2252
2253        return 0;
2254}
2255
2256static void tun_free_netdev(struct net_device *dev)
2257{
2258        struct tun_struct *tun = netdev_priv(dev);
2259
2260        BUG_ON(!(list_empty(&tun->disabled)));
2261
2262        free_percpu(tun->pcpu_stats);
2263        /* We clear pcpu_stats so that tun_set_iff() can tell if
2264         * tun_free_netdev() has been called from register_netdevice().
2265         */
2266        tun->pcpu_stats = NULL;
2267
2268        tun_flow_uninit(tun);
2269        security_tun_dev_free_security(tun->security);
2270        __tun_set_ebpf(tun, &tun->steering_prog, NULL);
2271        __tun_set_ebpf(tun, &tun->filter_prog, NULL);
2272}
2273
2274static void tun_setup(struct net_device *dev)
2275{
2276        struct tun_struct *tun = netdev_priv(dev);
2277
2278        tun->owner = INVALID_UID;
2279        tun->group = INVALID_GID;
2280        tun_default_link_ksettings(dev, &tun->link_ksettings);
2281
2282        dev->ethtool_ops = &tun_ethtool_ops;
2283        dev->needs_free_netdev = true;
2284        dev->priv_destructor = tun_free_netdev;
2285        /* We prefer our own queue length */
2286        dev->tx_queue_len = TUN_READQ_SIZE;
2287}
2288
2289/* Trivial set of netlink ops to allow deleting a tun or tap
2290 * device via netlink.
2291 */
2292static int tun_validate(struct nlattr *tb[], struct nlattr *data[],
2293                        struct netlink_ext_ack *extack)
2294{
2295        NL_SET_ERR_MSG(extack,
2296                       "tun/tap creation via rtnetlink is not supported.");
2297        return -EOPNOTSUPP;
2298}
2299
2300static size_t tun_get_size(const struct net_device *dev)
2301{
2302        BUILD_BUG_ON(sizeof(u32) != sizeof(uid_t));
2303        BUILD_BUG_ON(sizeof(u32) != sizeof(gid_t));
2304
2305        return nla_total_size(sizeof(uid_t)) + /* OWNER */
2306               nla_total_size(sizeof(gid_t)) + /* GROUP */
2307               nla_total_size(sizeof(u8)) + /* TYPE */
2308               nla_total_size(sizeof(u8)) + /* PI */
2309               nla_total_size(sizeof(u8)) + /* VNET_HDR */
2310               nla_total_size(sizeof(u8)) + /* PERSIST */
2311               nla_total_size(sizeof(u8)) + /* MULTI_QUEUE */
2312               nla_total_size(sizeof(u32)) + /* NUM_QUEUES */
2313               nla_total_size(sizeof(u32)) + /* NUM_DISABLED_QUEUES */
2314               0;
2315}
2316
2317static int tun_fill_info(struct sk_buff *skb, const struct net_device *dev)
2318{
2319        struct tun_struct *tun = netdev_priv(dev);
2320
2321        if (nla_put_u8(skb, IFLA_TUN_TYPE, tun->flags & TUN_TYPE_MASK))
2322                goto nla_put_failure;
2323        if (uid_valid(tun->owner) &&
2324            nla_put_u32(skb, IFLA_TUN_OWNER,
2325                        from_kuid_munged(current_user_ns(), tun->owner)))
2326                goto nla_put_failure;
2327        if (gid_valid(tun->group) &&
2328            nla_put_u32(skb, IFLA_TUN_GROUP,
2329                        from_kgid_munged(current_user_ns(), tun->group)))
2330                goto nla_put_failure;
2331        if (nla_put_u8(skb, IFLA_TUN_PI, !(tun->flags & IFF_NO_PI)))
2332                goto nla_put_failure;
2333        if (nla_put_u8(skb, IFLA_TUN_VNET_HDR, !!(tun->flags & IFF_VNET_HDR)))
2334                goto nla_put_failure;
2335        if (nla_put_u8(skb, IFLA_TUN_PERSIST, !!(tun->flags & IFF_PERSIST)))
2336                goto nla_put_failure;
2337        if (nla_put_u8(skb, IFLA_TUN_MULTI_QUEUE,
2338                       !!(tun->flags & IFF_MULTI_QUEUE)))
2339                goto nla_put_failure;
2340        if (tun->flags & IFF_MULTI_QUEUE) {
2341                if (nla_put_u32(skb, IFLA_TUN_NUM_QUEUES, tun->numqueues))
2342                        goto nla_put_failure;
2343                if (nla_put_u32(skb, IFLA_TUN_NUM_DISABLED_QUEUES,
2344                                tun->numdisabled))
2345                        goto nla_put_failure;
2346        }
2347
2348        return 0;
2349
2350nla_put_failure:
2351        return -EMSGSIZE;
2352}
2353
2354static struct rtnl_link_ops tun_link_ops __read_mostly = {
2355        .kind           = DRV_NAME,
2356        .priv_size      = sizeof(struct tun_struct),
2357        .setup          = tun_setup,
2358        .validate       = tun_validate,
2359        .get_size       = tun_get_size,
2360        .fill_info      = tun_fill_info,
2361};
2362
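    /* sk->sk_write_space callback: wake up pollers and fasync listeners once
     * the socket becomes writeable again.
     */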
2363static void tun_sock_write_space(struct sock *sk)
2364{
2365        struct tun_file *tfile;
2366        wait_queue_head_t *wqueue;
2367
2368        if (!sock_writeable(sk))
2369                return;
2370
2371        if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags))
2372                return;
2373
2374        wqueue = sk_sleep(sk);
2375        if (wqueue && waitqueue_active(wqueue))
2376                wake_up_interruptible_sync_poll(wqueue, EPOLLOUT |
2377                                                EPOLLWRNORM | EPOLLWRBAND);
2378
2379        tfile = container_of(sk, struct tun_file, sk);
2380        kill_fasync(&tfile->fasync, SIGIO, POLL_OUT);
2381}
2382
2383static void tun_put_page(struct tun_page *tpage)
2384{
2385        if (tpage->page)
2386                __page_frag_cache_drain(tpage->page, tpage->count);
2387}
2388
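    /* Handle one XDP buffer submitted through sendmsg() with TUN_MSG_PTR
     * (used by batching producers such as vhost-net): run any attached XDP
     * program, and for XDP_PASS or GSO packets build an skb and feed it to
     * the stack with netif_receive_skb().
     */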
2389static int tun_xdp_one(struct tun_struct *tun,
2390                       struct tun_file *tfile,
2391                       struct xdp_buff *xdp, int *flush,
2392                       struct tun_page *tpage)
2393{
2394        unsigned int datasize = xdp->data_end - xdp->data;
2395        struct tun_xdp_hdr *hdr = xdp->data_hard_start;
2396        struct virtio_net_hdr *gso = &hdr->gso;
2397        struct tun_pcpu_stats *stats;
2398        struct bpf_prog *xdp_prog;
2399        struct sk_buff *skb = NULL;
2400        u32 rxhash = 0, act;
2401        int buflen = hdr->buflen;
2402        int err = 0;
2403        bool skb_xdp = false;
2404        struct page *page;
2405
2406        xdp_prog = rcu_dereference(tun->xdp_prog);
2407        if (xdp_prog) {
2408                if (gso->gso_type) {
2409                        skb_xdp = true;
2410                        goto build;
2411                }
2412                xdp_set_data_meta_invalid(xdp);
2413                xdp->rxq = &tfile->xdp_rxq;
2414
2415                act = bpf_prog_run_xdp(xdp_prog, xdp);
2416                err = tun_xdp_act(tun, xdp_prog, xdp, act);
2417                if (err < 0) {
2418                        put_page(virt_to_head_page(xdp->data));
2419                        return err;
2420                }
2421
2422                switch (err) {
2423                case XDP_REDIRECT:
2424                        *flush = true;
2425                        /* fall through */
2426                case XDP_TX:
2427                        return 0;
2428                case XDP_PASS:
2429                        break;
2430                default:
2431                        page = virt_to_head_page(xdp->data);
2432                        if (tpage->page == page) {
2433                                ++tpage->count;
2434                        } else {
2435                                tun_put_page(tpage);
2436                                tpage->page = page;
2437                                tpage->count = 1;
2438                        }
2439                        return 0;
2440                }
2441        }
2442
2443build:
2444        skb = build_skb(xdp->data_hard_start, buflen);
2445        if (!skb) {
2446                err = -ENOMEM;
2447                goto out;
2448        }
2449
2450        skb_reserve(skb, xdp->data - xdp->data_hard_start);
2451        skb_put(skb, xdp->data_end - xdp->data);
2452
2453        if (virtio_net_hdr_to_skb(skb, gso, tun_is_little_endian(tun))) {
2454                this_cpu_inc(tun->pcpu_stats->rx_frame_errors);
2455                kfree_skb(skb);
2456                err = -EINVAL;
2457                goto out;
2458        }
2459
2460        skb->protocol = eth_type_trans(skb, tun->dev);
2461        skb_reset_network_header(skb);
2462        skb_probe_transport_header(skb);
2463        skb_record_rx_queue(skb, tfile->queue_index);
2464
2465        if (skb_xdp) {
2466                err = do_xdp_generic(xdp_prog, skb);
2467                if (err != XDP_PASS)
2468                        goto out;
2469        }
2470
2471        if (!rcu_dereference(tun->steering_prog) && tun->numqueues > 1 &&
2472            !tfile->detached)
2473                rxhash = __skb_get_hash_symmetric(skb);
2474
2475        netif_receive_skb(skb);
2476
2477        /* No need for get_cpu_ptr() here since this function is
2478         * always called with bh disabled
2479         */
2480        stats = this_cpu_ptr(tun->pcpu_stats);
2481        u64_stats_update_begin(&stats->syncp);
2482        u64_stats_inc(&stats->rx_packets);
2483        u64_stats_add(&stats->rx_bytes, datasize);
2484        u64_stats_update_end(&stats->syncp);
2485
2486        if (rxhash)
2487                tun_flow_update(tun, rxhash, tfile);
2488
2489out:
2490        return err;
2491}
2492
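    /* sendmsg() on the tun socket: either a batch of pre-built XDP buffers
     * (TUN_MSG_PTR, handled by tun_xdp_one()) or a regular packet passed to
     * tun_get_user().
     */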
2493static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
2494{
2495        int ret, i;
2496        struct tun_file *tfile = container_of(sock, struct tun_file, socket);
2497        struct tun_struct *tun = tun_get(tfile);
2498        struct tun_msg_ctl *ctl = m->msg_control;
2499        struct xdp_buff *xdp;
2500
2501        if (!tun)
2502                return -EBADFD;
2503
2504        if (ctl && (ctl->type == TUN_MSG_PTR)) {
2505                struct tun_page tpage;
2506                int n = ctl->num;
2507                int flush = 0;
2508
2509                memset(&tpage, 0, sizeof(tpage));
2510
2511                local_bh_disable();
2512                rcu_read_lock();
2513
2514                for (i = 0; i < n; i++) {
2515                        xdp = &((struct xdp_buff *)ctl->ptr)[i];
2516                        tun_xdp_one(tun, tfile, xdp, &flush, &tpage);
2517                }
2518
2519                if (flush)
2520                        xdp_do_flush();
2521
2522                rcu_read_unlock();
2523                local_bh_enable();
2524
2525                tun_put_page(&tpage);
2526
2527                ret = total_len;
2528                goto out;
2529        }
2530
2531        ret = tun_get_user(tun, tfile, ctl ? ctl->ptr : NULL, &m->msg_iter,
2532                           m->msg_flags & MSG_DONTWAIT,
2533                           m->msg_flags & MSG_MORE);
2534out:
2535        tun_put(tun);
2536        return ret;
2537}
2538
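    /* recvmsg() on the tun socket: msg_control may carry an already-dequeued
     * ring pointer supplied by the caller; oversized reads are reported via
     * MSG_TRUNC like datagram sockets.
     */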
2539static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len,
2540                       int flags)
2541{
2542        struct tun_file *tfile = container_of(sock, struct tun_file, socket);
2543        struct tun_struct *tun = tun_get(tfile);
2544        void *ptr = m->msg_control;
2545        int ret;
2546
2547        if (!tun) {
2548                ret = -EBADFD;
2549                goto out_free;
2550        }
2551
2552        if (flags & ~(MSG_DONTWAIT|MSG_TRUNC|MSG_ERRQUEUE)) {
2553                ret = -EINVAL;
2554                goto out_put_tun;
2555        }
2556        if (flags & MSG_ERRQUEUE) {
2557                ret = sock_recv_errqueue(sock->sk, m, total_len,
2558                                         SOL_PACKET, TUN_TX_TIMESTAMP);
2559                goto out;
2560        }
2561        ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT, ptr);
2562        if (ret > (ssize_t)total_len) {
2563                m->msg_flags |= MSG_TRUNC;
2564                ret = flags & MSG_TRUNC ? ret : total_len;
2565        }
2566out:
2567        tun_put(tun);
2568        return ret;
2569
2570out_put_tun:
2571        tun_put(tun);
2572out_free:
2573        tun_ptr_free(ptr);
2574        return ret;
2575}
2576
2577static int tun_ptr_peek_len(void *ptr)
2578{
2579        if (likely(ptr)) {
2580                if (tun_is_xdp_frame(ptr)) {
2581                        struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
2582
2583                        return xdpf->len;
2584                }
2585                return __skb_array_len_with_tag(ptr);
2586        } else {
2587                return 0;
2588        }
2589}
2590
2591static int tun_peek_len(struct socket *sock)
2592{
2593        struct tun_file *tfile = container_of(sock, struct tun_file, socket);
2594        struct tun_struct *tun;
2595        int ret = 0;
2596
2597        tun = tun_get(tfile);
2598        if (!tun)
2599                return 0;
2600
2601        ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len);
2602        tun_put(tun);
2603
2604        return ret;
2605}
2606
2607/* Ops structure to mimic raw sockets with tun */
2608static const struct proto_ops tun_socket_ops = {
2609        .peek_len = tun_peek_len,
2610        .sendmsg = tun_sendmsg,
2611        .recvmsg = tun_recvmsg,
2612};
2613
2614static struct proto tun_proto = {
2615        .name           = "tun",
2616        .owner          = THIS_MODULE,
2617        .obj_size       = sizeof(struct tun_file),
2618};
2619
2620static int tun_flags(struct tun_struct *tun)
2621{
2622        return tun->flags & (TUN_FEATURES | IFF_PERSIST | IFF_TUN | IFF_TAP);
2623}
2624
2625static ssize_t tun_show_flags(struct device *dev, struct device_attribute *attr,
2626                              char *buf)
2627{
2628        struct tun_struct *tun = netdev_priv(to_net_dev(dev));
2629        return sprintf(buf, "0x%x\n", tun_flags(tun));
2630}
2631
2632static ssize_t tun_show_owner(struct device *dev, struct device_attribute *attr,
2633                              char *buf)
2634{
2635        struct tun_struct *tun = netdev_priv(to_net_dev(dev));
2636        return uid_valid(tun->owner)?
2637                sprintf(buf, "%u\n",
2638                        from_kuid_munged(current_user_ns(), tun->owner)):
2639                sprintf(buf, "-1\n");
2640}
2641
2642static ssize_t tun_show_group(struct device *dev, struct device_attribute *attr,
2643                              char *buf)
2644{
2645        struct tun_struct *tun = netdev_priv(to_net_dev(dev));
2646        return gid_valid(tun->group) ?
2647                sprintf(buf, "%u\n",
2648                        from_kgid_munged(current_user_ns(), tun->group)):
2649                sprintf(buf, "-1\n");
2650}
2651
2652static DEVICE_ATTR(tun_flags, 0444, tun_show_flags, NULL);
2653static DEVICE_ATTR(owner, 0444, tun_show_owner, NULL);
2654static DEVICE_ATTR(group, 0444, tun_show_group, NULL);
2655
2656static struct attribute *tun_dev_attrs[] = {
2657        &dev_attr_tun_flags.attr,
2658        &dev_attr_owner.attr,
2659        &dev_attr_group.attr,
2660        NULL
2661};
2662
2663static const struct attribute_group tun_attr_group = {
2664        .attrs = tun_dev_attrs
2665};
2666
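    /* TUNSETIFF: attach this file to an existing tun/tap device, or create a
     * new one.  Creating a device requires CAP_NET_ADMIN in the device's
     * network namespace; attaching to an existing one is gated by
     * tun_not_capable() (owner/group match or CAP_NET_ADMIN).
     *
     * Illustrative user-space sketch (names such as "tap0" are examples
     * only):
     *
     *   int fd = open("/dev/net/tun", O_RDWR);
     *   struct ifreq ifr;
     *
     *   memset(&ifr, 0, sizeof(ifr));
     *   ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
     *   strncpy(ifr.ifr_name, "tap0", IFNAMSIZ);
     *   if (ioctl(fd, TUNSETIFF, &ifr) < 0)
     *           perror("TUNSETIFF");
     */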
2667static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
2668{
2669        struct tun_struct *tun;
2670        struct tun_file *tfile = file->private_data;
2671        struct net_device *dev;
2672        int err;
2673
2674        if (tfile->detached)
2675                return -EINVAL;
2676
2677        if ((ifr->ifr_flags & IFF_NAPI_FRAGS)) {
2678                if (!capable(CAP_NET_ADMIN))
2679                        return -EPERM;
2680
2681                if (!(ifr->ifr_flags & IFF_NAPI) ||
2682                    (ifr->ifr_flags & TUN_TYPE_MASK) != IFF_TAP)
2683                        return -EINVAL;
2684        }
2685
2686        dev = __dev_get_by_name(net, ifr->ifr_name);
2687        if (dev) {
2688                if (ifr->ifr_flags & IFF_TUN_EXCL)
2689                        return -EBUSY;
2690                if ((ifr->ifr_flags & IFF_TUN) && dev->netdev_ops == &tun_netdev_ops)
2691                        tun = netdev_priv(dev);
2692                else if ((ifr->ifr_flags & IFF_TAP) && dev->netdev_ops == &tap_netdev_ops)
2693                        tun = netdev_priv(dev);
2694                else
2695                        return -EINVAL;
2696
2697                if (!!(ifr->ifr_flags & IFF_MULTI_QUEUE) !=
2698                    !!(tun->flags & IFF_MULTI_QUEUE))
2699                        return -EINVAL;
2700
2701                if (tun_not_capable(tun))
2702                        return -EPERM;
2703                err = security_tun_dev_open(tun->security);
2704                if (err < 0)
2705                        return err;
2706
2707                err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER,
2708                                 ifr->ifr_flags & IFF_NAPI,
2709                                 ifr->ifr_flags & IFF_NAPI_FRAGS, true);
2710                if (err < 0)
2711                        return err;
2712
2713                if (tun->flags & IFF_MULTI_QUEUE &&
2714                    (tun->numqueues + tun->numdisabled > 1)) {
2715                        /* One or more queues have already been attached, no need
2716                         * to initialize the device again.
2717                         */
2718                        netdev_state_change(dev);
2719                        return 0;
2720                }
2721
2722                tun->flags = (tun->flags & ~TUN_FEATURES) |
2723                              (ifr->ifr_flags & TUN_FEATURES);
2724
2725                netdev_state_change(dev);
2726        } else {
2727                char *name;
2728                unsigned long flags = 0;
2729                int queues = ifr->ifr_flags & IFF_MULTI_QUEUE ?
2730                             MAX_TAP_QUEUES : 1;
2731
2732                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2733                        return -EPERM;
2734                err = security_tun_dev_create();
2735                if (err < 0)
2736                        return err;
2737
2738                /* Set dev type */
2739                if (ifr->ifr_flags & IFF_TUN) {
2740                        /* TUN device */
2741                        flags |= IFF_TUN;
2742                        name = "tun%d";
2743                } else if (ifr->ifr_flags & IFF_TAP) {
2744                        /* TAP device */
2745                        flags |= IFF_TAP;
2746                        name = "tap%d";
2747                } else
2748                        return -EINVAL;
2749
2750                if (*ifr->ifr_name)
2751                        name = ifr->ifr_name;
2752
2753                dev = alloc_netdev_mqs(sizeof(struct tun_struct), name,
2754                                       NET_NAME_UNKNOWN, tun_setup, queues,
2755                                       queues);
2756
2757                if (!dev)
2758                        return -ENOMEM;
2759
2760                dev_net_set(dev, net);
2761                dev->rtnl_link_ops = &tun_link_ops;
2762                dev->ifindex = tfile->ifindex;
2763                dev->sysfs_groups[0] = &tun_attr_group;
2764
2765                tun = netdev_priv(dev);
2766                tun->dev = dev;
2767                tun->flags = flags;
2768                tun->txflt.count = 0;
2769                tun->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
2770
2771                tun->align = NET_SKB_PAD;
2772                tun->filter_attached = false;
2773                tun->sndbuf = tfile->socket.sk->sk_sndbuf;
2774                tun->rx_batched = 0;
2775                RCU_INIT_POINTER(tun->steering_prog, NULL);
2776
2777                tun->pcpu_stats = netdev_alloc_pcpu_stats(struct tun_pcpu_stats);
2778                if (!tun->pcpu_stats) {
2779                        err = -ENOMEM;
2780                        goto err_free_dev;
2781                }
2782
2783                spin_lock_init(&tun->lock);
2784
2785                err = security_tun_dev_alloc_security(&tun->security);
2786                if (err < 0)
2787                        goto err_free_stat;
2788
2789                tun_net_init(dev);
2790                tun_flow_init(tun);
2791
2792                dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
2793                                   TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX |
2794                                   NETIF_F_HW_VLAN_STAG_TX;
2795                dev->features = dev->hw_features | NETIF_F_LLTX;
2796                dev->vlan_features = dev->features &
2797                                     ~(NETIF_F_HW_VLAN_CTAG_TX |
2798                                       NETIF_F_HW_VLAN_STAG_TX);
2799
2800                tun->flags = (tun->flags & ~TUN_FEATURES) |
2801                              (ifr->ifr_flags & TUN_FEATURES);
2802
2803                INIT_LIST_HEAD(&tun->disabled);
2804                err = tun_attach(tun, file, false, ifr->ifr_flags & IFF_NAPI,
2805                                 ifr->ifr_flags & IFF_NAPI_FRAGS, false);
2806                if (err < 0)
2807                        goto err_free_flow;
2808
2809                err = register_netdevice(tun->dev);
2810                if (err < 0)
2811                        goto err_detach;
2812                /* free_netdev() won't check the refcnt; to avoid a race
2813                 * with dev_put() we must publish tun only after registration.
2814                 */
2815                rcu_assign_pointer(tfile->tun, tun);
2816        }
2817
2818        netif_carrier_on(tun->dev);
2819
2820        /* Make sure persistent devices do not get stuck in
2821         * xoff state.
2822         */
2823        if (netif_running(tun->dev))
2824                netif_tx_wake_all_queues(tun->dev);
2825
2826        strcpy(ifr->ifr_name, tun->dev->name);
2827        return 0;
2828
2829err_detach:
2830        tun_detach_all(dev);
2831        /* We are here because register_netdevice() has failed.
2832         * If register_netdevice() already called tun_free_netdev()
2833         * while dealing with the error, tun->pcpu_stats has been cleared.
2834         */
2835        if (!tun->pcpu_stats)
2836                goto err_free_dev;
2837
2838err_free_flow:
2839        tun_flow_uninit(tun);
2840        security_tun_dev_free_security(tun->security);
2841err_free_stat:
2842        free_percpu(tun->pcpu_stats);
2843err_free_dev:
2844        free_netdev(dev);
2845        return err;
2846}
2847
2848static void tun_get_iff(struct tun_struct *tun, struct ifreq *ifr)
2849{
2850        strcpy(ifr->ifr_name, tun->dev->name);
2851
2852        ifr->ifr_flags = tun_flags(tun);
2853
2854}
2855
2856/* This is like a cut-down set of ethtool ops, except done via the tun
2857 * fd so no privileges are required. */
2858static int set_offload(struct tun_struct *tun, unsigned long arg)
2859{
2860        netdev_features_t features = 0;
2861
2862        if (arg & TUN_F_CSUM) {
2863                features |= NETIF_F_HW_CSUM;
2864                arg &= ~TUN_F_CSUM;
2865
2866                if (arg & (TUN_F_TSO4|TUN_F_TSO6)) {
2867                        if (arg & TUN_F_TSO_ECN) {
2868                                features |= NETIF_F_TSO_ECN;
2869                                arg &= ~TUN_F_TSO_ECN;
2870                        }
2871                        if (arg & TUN_F_TSO4)
2872                                features |= NETIF_F_TSO;
2873                        if (arg & TUN_F_TSO6)
2874                                features |= NETIF_F_TSO6;
2875                        arg &= ~(TUN_F_TSO4|TUN_F_TSO6);
2876                }
2877
2878                arg &= ~TUN_F_UFO;
2879        }
2880
2881        /* This gives the user a way to test for new features in the future
2882         * by trying to set them. */
2883        if (arg)
2884                return -EINVAL;
2885
2886        tun->set_features = features;
2887        tun->dev->wanted_features &= ~TUN_USER_FEATURES;
2888        tun->dev->wanted_features |= features;
2889        netdev_update_features(tun->dev);
2890
2891        return 0;
2892}
2893
2894static void tun_detach_filter(struct tun_struct *tun, int n)
2895{
2896        int i;
2897        struct tun_file *tfile;
2898
2899        for (i = 0; i < n; i++) {
2900                tfile = rtnl_dereference(tun->tfiles[i]);
2901                lock_sock(tfile->socket.sk);
2902                sk_detach_filter(tfile->socket.sk);
2903                release_sock(tfile->socket.sk);
2904        }
2905
2906        tun->filter_attached = false;
2907}
2908
2909static int tun_attach_filter(struct tun_struct *tun)
2910{
2911        int i, ret = 0;
2912        struct tun_file *tfile;
2913
2914        for (i = 0; i < tun->numqueues; i++) {
2915                tfile = rtnl_dereference(tun->tfiles[i]);
2916                lock_sock(tfile->socket.sk);
2917                ret = sk_attach_filter(&tun->fprog, tfile->socket.sk);
2918                release_sock(tfile->socket.sk);
2919                if (ret) {
2920                        tun_detach_filter(tun, i);
2921                        return ret;
2922                }
2923        }
2924
2925        tun->filter_attached = true;
2926        return ret;
2927}
2928
2929static void tun_set_sndbuf(struct tun_struct *tun)
2930{
2931        struct tun_file *tfile;
2932        int i;
2933
2934        for (i = 0; i < tun->numqueues; i++) {
2935                tfile = rtnl_dereference(tun->tfiles[i]);
2936                tfile->socket.sk->sk_sndbuf = tun->sndbuf;
2937        }
2938}
2939
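    /* TUNSETQUEUE: re-attach (IFF_ATTACH_QUEUE) or temporarily detach
     * (IFF_DETACH_QUEUE) one queue of a multiqueue device.
     */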
2940static int tun_set_queue(struct file *file, struct ifreq *ifr)
2941{
2942        struct tun_file *tfile = file->private_data;
2943        struct tun_struct *tun;
2944        int ret = 0;
2945
2946        rtnl_lock();
2947
2948        if (ifr->ifr_flags & IFF_ATTACH_QUEUE) {
2949                tun = tfile->detached;
2950                if (!tun) {
2951                        ret = -EINVAL;
2952                        goto unlock;
2953                }
2954                ret = security_tun_dev_attach_queue(tun->security);
2955                if (ret < 0)
2956                        goto unlock;
2957                ret = tun_attach(tun, file, false, tun->flags & IFF_NAPI,
2958                                 tun->flags & IFF_NAPI_FRAGS, true);
2959        } else if (ifr->ifr_flags & IFF_DETACH_QUEUE) {
2960                tun = rtnl_dereference(tfile->tun);
2961                if (!tun || !(tun->flags & IFF_MULTI_QUEUE) || tfile->detached)
2962                        ret = -EINVAL;
2963                else
2964                        __tun_detach(tfile, false);
2965        } else
2966                ret = -EINVAL;
2967
2968        if (ret >= 0)
2969                netdev_state_change(tun->dev);
2970
2971unlock:
2972        rtnl_unlock();
2973        return ret;
2974}
2975
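    /* Read a BPF program fd from user space (-1 clears the installed
     * program) and install it with __tun_set_ebpf().
     */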
2976static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog **prog_p,
2977                        void __user *data)
2978{
2979        struct bpf_prog *prog;
2980        int fd;
2981
2982        if (copy_from_user(&fd, data, sizeof(fd)))
2983                return -EFAULT;
2984
2985        if (fd == -1) {
2986                prog = NULL;
2987        } else {
2988                prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
2989                if (IS_ERR(prog))
2990                        return PTR_ERR(prog);
2991        }
2992
2993        return __tun_set_ebpf(tun, prog_p, prog);
2994}
2995
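    /* Main ioctl handler; ifreq_len allows the compat entry point to pass a
     * smaller struct ifreq.  TUNSETIFF, TUNSETQUEUE and the socket ioctls
     * copy in an ifreq first; most other commands require an attached tun
     * device.
     */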
2996static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
2997                            unsigned long arg, int ifreq_len)
2998{
2999        struct tun_file *tfile = file->private_data;
3000        struct net *net = sock_net(&tfile->sk);
3001        struct tun_struct *tun;
3002        void __user* argp = (void __user*)arg;
3003        unsigned int ifindex, carrier;
3004        struct ifreq ifr;
3005        kuid_t owner;
3006        kgid_t group;
3007        int sndbuf;
3008        int vnet_hdr_sz;
3009        int le;
3010        int ret;
3011        bool do_notify = false;
3012
3013        if (cmd == TUNSETIFF || cmd == TUNSETQUEUE ||
3014            (_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) {
3015                if (copy_from_user(&ifr, argp, ifreq_len))
3016                        return -EFAULT;
3017        } else {
3018                memset(&ifr, 0, sizeof(ifr));
3019        }
3020        if (cmd == TUNGETFEATURES) {
3021                /* Currently this just means: "what IFF flags are valid?".
3022                 * This is needed because we never checked for invalid flags on
3023                 * TUNSETIFF.
3024                 */
3025                return put_user(IFF_TUN | IFF_TAP | TUN_FEATURES,
3026                                (unsigned int __user*)argp);
3027        } else if (cmd == TUNSETQUEUE) {
3028                return tun_set_queue(file, &ifr);
3029        } else if (cmd == SIOCGSKNS) {
3030                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3031                        return -EPERM;
3032                return open_related_ns(&net->ns, get_net_ns);
3033        }
3034
3035        ret = 0;
3036        rtnl_lock();
3037
3038        tun = tun_get(tfile);
3039        if (cmd == TUNSETIFF) {
3040                ret = -EEXIST;
3041                if (tun)
3042                        goto unlock;
3043
3044                ifr.ifr_name[IFNAMSIZ-1] = '\0';
3045
3046                ret = tun_set_iff(net, file, &ifr);
3047
3048                if (ret)
3049                        goto unlock;
3050
3051                if (copy_to_user(argp, &ifr, ifreq_len))
3052                        ret = -EFAULT;
3053                goto unlock;
3054        }
3055        if (cmd == TUNSETIFINDEX) {
3056                ret = -EPERM;
3057                if (tun)
3058                        goto unlock;
3059
3060                ret = -EFAULT;
3061                if (copy_from_user(&ifindex, argp, sizeof(ifindex)))
3062                        goto unlock;
3063
3064                ret = 0;
3065                tfile->ifindex = ifindex;
3066                goto unlock;
3067        }
3068
3069        ret = -EBADFD;
3070        if (!tun)
3071                goto unlock;
3072
3073        netif_info(tun, drv, tun->dev, "tun_chr_ioctl cmd %u\n", cmd);
3074
3075        net = dev_net(tun->dev);
3076        ret = 0;
3077        switch (cmd) {
3078        case TUNGETIFF:
3079                tun_get_iff(tun, &ifr);
3080
3081                if (tfile->detached)
3082                        ifr.ifr_flags |= IFF_DETACH_QUEUE;
3083                if (!tfile->socket.sk->sk_filter)
3084                        ifr.ifr_flags |= IFF_NOFILTER;
3085
3086                if (copy_to_user(argp, &ifr, ifreq_len))
3087                        ret = -EFAULT;
3088                break;
3089
3090        case TUNSETNOCSUM:
3091                /* Disable/Enable checksum */
3092
3093                /* [unimplemented] */
3094                netif_info(tun, drv, tun->dev, "ignored: set checksum %s\n",
3095                           arg ? "disabled" : "enabled");
3096                break;
3097
3098        case TUNSETPERSIST:
3099                /* Disable/Enable persist mode. Keep an extra reference to the
3100                 * module to prevent it from being unloaded.
3101                 */
3102                if (arg && !(tun->flags & IFF_PERSIST)) {
3103                        tun->flags |= IFF_PERSIST;
3104                        __module_get(THIS_MODULE);
3105                        do_notify = true;
3106                }
3107                if (!arg && (tun->flags & IFF_PERSIST)) {
3108                        tun->flags &= ~IFF_PERSIST;
3109                        module_put(THIS_MODULE);
3110                        do_notify = true;
3111                }
3112
3113                netif_info(tun, drv, tun->dev, "persist %s\n",
3114                           arg ? "enabled" : "disabled");
3115                break;
3116
3117        case TUNSETOWNER:
3118                /* Set owner of the device */
3119                owner = make_kuid(current_user_ns(), arg);
3120                if (!uid_valid(owner)) {
3121                        ret = -EINVAL;
3122                        break;
3123                }
3124                tun->owner = owner;
3125                do_notify = true;
3126                netif_info(tun, drv, tun->dev, "owner set to %u\n",
3127                           from_kuid(&init_user_ns, tun->owner));
3128                break;
3129
3130        case TUNSETGROUP:
3131                /* Set group of the device */
3132                group = make_kgid(current_user_ns(), arg);
3133                if (!gid_valid(group)) {
3134                        ret = -EINVAL;
3135                        break;
3136                }
3137                tun->group = group;
3138                do_notify = true;
3139                netif_info(tun, drv, tun->dev, "group set to %u\n",
3140                           from_kgid(&init_user_ns, tun->group));
3141                break;
3142
3143        case TUNSETLINK:
3144                /* Only allow setting the type when the interface is down */
3145                if (tun->dev->flags & IFF_UP) {
3146                        netif_info(tun, drv, tun->dev,
3147                                   "Linktype set failed because interface is up\n");
3148                        ret = -EBUSY;
3149                } else {
3150                        tun->dev->type = (int) arg;
3151                        netif_info(tun, drv, tun->dev, "linktype set to %d\n",
3152                                   tun->dev->type);
3153                        ret = 0;
3154                }
3155                break;
3156
3157        case TUNSETDEBUG:
3158                tun->msg_enable = (u32)arg;
3159                break;
3160
3161        case TUNSETOFFLOAD:
3162                ret = set_offload(tun, arg);
3163                break;
3164
3165        case TUNSETTXFILTER:
3166                /* Can be set only for TAPs */
3167                ret = -EINVAL;
3168                if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
3169                        break;
3170                ret = update_filter(&tun->txflt, (void __user *)arg);
3171                break;
3172
3173        case SIOCGIFHWADDR:
3174                /* Get hw address */
3175                memcpy(ifr.ifr_hwaddr.sa_data, tun->dev->dev_addr, ETH_ALEN);
3176                ifr.ifr_hwaddr.sa_family = tun->dev->type;
3177                if (copy_to_user(argp, &ifr, ifreq_len))
3178                        ret = -EFAULT;
3179                break;
3180
3181        case SIOCSIFHWADDR:
3182                /* Set hw address */
3183                ret = dev_set_mac_address(tun->dev, &ifr.ifr_hwaddr, NULL);
3184                break;
3185
3186        case TUNGETSNDBUF:
3187                sndbuf = tfile->socket.sk->sk_sndbuf;
3188                if (copy_to_user(argp, &sndbuf, sizeof(sndbuf)))
3189                        ret = -EFAULT;
3190                break;
3191
3192        case TUNSETSNDBUF:
3193                if (copy_from_user(&sndbuf, argp, sizeof(sndbuf))) {
3194                        ret = -EFAULT;
3195                        break;
3196                }
3197                if (sndbuf <= 0) {
3198                        ret = -EINVAL;
3199                        break;
3200                }
3201
3202                tun->sndbuf = sndbuf;
3203                tun_set_sndbuf(tun);
3204                break;
3205
3206        case TUNGETVNETHDRSZ:
3207                vnet_hdr_sz = tun->vnet_hdr_sz;
3208                if (copy_to_user(argp, &vnet_hdr_sz, sizeof(vnet_hdr_sz)))
3209                        ret = -EFAULT;
3210                break;
3211
3212        case TUNSETVNETHDRSZ:
3213                if (copy_from_user(&vnet_hdr_sz, argp, sizeof(vnet_hdr_sz))) {
3214                        ret = -EFAULT;
3215                        break;
3216                }
3217                if (vnet_hdr_sz < (int)sizeof(struct virtio_net_hdr)) {
3218                        ret = -EINVAL;
3219                        break;
3220                }
3221
3222                tun->vnet_hdr_sz = vnet_hdr_sz;
3223                break;
3224
3225        case TUNGETVNETLE:
3226                le = !!(tun->flags & TUN_VNET_LE);
3227                if (put_user(le, (int __user *)argp))
3228                        ret = -EFAULT;
3229                break;
3230
3231        case TUNSETVNETLE:
3232                if (get_user(le, (int __user *)argp)) {
3233                        ret = -EFAULT;
3234                        break;
3235                }
3236                if (le)
3237                        tun->flags |= TUN_VNET_LE;
3238                else
3239                        tun->flags &= ~TUN_VNET_LE;
3240                break;
3241
3242        case TUNGETVNETBE:
3243                ret = tun_get_vnet_be(tun, argp);
3244                break;
3245
3246        case TUNSETVNETBE:
3247                ret = tun_set_vnet_be(tun, argp);
3248                break;
3249
3250        case TUNATTACHFILTER:
3251                /* Can be set only for TAPs */
3252                ret = -EINVAL;
3253                if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
3254                        break;
3255                ret = -EFAULT;
3256                if (copy_from_user(&tun->fprog, argp, sizeof(tun->fprog)))
3257                        break;
3258
3259                ret = tun_attach_filter(tun);
3260                break;
3261
3262        case TUNDETACHFILTER:
3263                /* Can be set only for TAPs */
3264                ret = -EINVAL;
3265                if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
3266                        break;
3267                ret = 0;
3268                tun_detach_filter(tun, tun->numqueues);
3269                break;
3270
3271        case TUNGETFILTER:
3272                ret = -EINVAL;
3273                if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
3274                        break;
3275                ret = -EFAULT;
3276                if (copy_to_user(argp, &tun->fprog, sizeof(tun->fprog)))
3277                        break;
3278                ret = 0;
3279                break;
3280
3281        case TUNSETSTEERINGEBPF:
3282                ret = tun_set_ebpf(tun, &tun->steering_prog, argp);
3283                break;
3284
3285        case TUNSETFILTEREBPF:
3286                ret = tun_set_ebpf(tun, &tun->filter_prog, argp);
3287                break;
3288
3289        case TUNSETCARRIER:
3290                ret = -EFAULT;
3291                if (copy_from_user(&carrier, argp, sizeof(carrier)))
3292                        goto unlock;
3293
3294                ret = tun_net_change_carrier(tun->dev, (bool)carrier);
3295                break;
3296
3297        case TUNGETDEVNETNS:
3298                ret = -EPERM;
3299                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3300                        goto unlock;
3301                ret = open_related_ns(&net->ns, get_net_ns);
3302                break;
3303
3304        default:
3305                ret = -EINVAL;
3306                break;
3307        }
3308
3309        if (do_notify)
3310                netdev_state_change(tun->dev);
3311
3312unlock:
3313        rtnl_unlock();
3314        if (tun)
3315                tun_put(tun);
3316        return ret;
3317}
3318
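/* Illustrative only, not part of the driver: a minimal userspace sketch of
 * the ioctl interface implemented by __tun_chr_ioctl() above. The device
 * name "tap0" is an arbitrary example and error handling is omitted; needs
 * <fcntl.h>, <string.h>, <unistd.h>, <sys/ioctl.h>, <linux/if.h> and
 * <linux/if_tun.h>.
 *
 *	int fd = open("/dev/net/tun", O_RDWR);
 *	struct ifreq ifr;
 *
 *	memset(&ifr, 0, sizeof(ifr));
 *	ifr.ifr_flags = IFF_TAP | IFF_NO_PI;	   // Ethernet frames, no packet info header
 *	strncpy(ifr.ifr_name, "tap0", IFNAMSIZ);
 *	ioctl(fd, TUNSETIFF, &ifr);		   // create the device and attach this queue
 *	ioctl(fd, TUNSETPERSIST, 1);		   // keep the device after the fd is closed
 *	ioctl(fd, TUNSETOWNER, getuid());	   // only this uid may attach to it later
 *	ioctl(fd, TUNGETIFF, &ifr);		   // read back the final name and flags
 */
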
3319static long tun_chr_ioctl(struct file *file,
3320                          unsigned int cmd, unsigned long arg)
3321{
3322        return __tun_chr_ioctl(file, cmd, arg, sizeof(struct ifreq));
3323}
3324
3325#ifdef CONFIG_COMPAT
3326static long tun_chr_compat_ioctl(struct file *file,
3327                         unsigned int cmd, unsigned long arg)
3328{
3329        switch (cmd) {
3330        case TUNSETIFF:
3331        case TUNGETIFF:
3332        case TUNSETTXFILTER:
3333        case TUNGETSNDBUF:
3334        case TUNSETSNDBUF:
3335        case SIOCGIFHWADDR:
3336        case SIOCSIFHWADDR:
3337                arg = (unsigned long)compat_ptr(arg);
3338                break;
3339        default:
3340                arg = (compat_ulong_t)arg;
3341                break;
3342        }
3343
3344        /*
3345         * compat_ifreq is shorter than ifreq, so we must not access beyond
3346         * the end of that structure. All fields that are used in this
3347         * driver are compatible, though, so we don't need to convert
3348         * the contents.
3349         */
3350        return __tun_chr_ioctl(file, cmd, arg, sizeof(struct compat_ifreq));
3351}
3352#endif /* CONFIG_COMPAT */
3353
3354static int tun_chr_fasync(int fd, struct file *file, int on)
3355{
3356        struct tun_file *tfile = file->private_data;
3357        int ret;
3358
3359        if ((ret = fasync_helper(fd, file, on, &tfile->fasync)) < 0)
3360                goto out;
3361
3362        if (on) {
3363                __f_setown(file, task_pid(current), PIDTYPE_TGID, 0);
3364                tfile->flags |= TUN_FASYNC;
3365        } else
3366                tfile->flags &= ~TUN_FASYNC;
3367        ret = 0;
3368out:
3369        return ret;
3370}
3371
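/* Illustrative only: how userspace arms the SIGIO notification that
 * tun_chr_fasync() above supports, assuming "fd" is an attached tun/tap
 * descriptor and a SIGIO handler has already been installed; needs
 * <fcntl.h> and <unistd.h>.
 *
 *	fcntl(fd, F_SETOWN, getpid());			   // deliver SIGIO to this process
 *	fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_ASYNC);  // invokes tun_chr_fasync(..., on)
 */
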
3372static int tun_chr_open(struct inode *inode, struct file *file)
3373{
3374        struct net *net = current->nsproxy->net_ns;
3375        struct tun_file *tfile;
3376
3377        tfile = (struct tun_file *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
3378                                            &tun_proto, 0);
3379        if (!tfile)
3380                return -ENOMEM;
3381        if (ptr_ring_init(&tfile->tx_ring, 0, GFP_KERNEL)) {
3382                sk_free(&tfile->sk);
3383                return -ENOMEM;
3384        }
3385
3386        mutex_init(&tfile->napi_mutex);
3387        RCU_INIT_POINTER(tfile->tun, NULL);
3388        tfile->flags = 0;
3389        tfile->ifindex = 0;
3390
3391        init_waitqueue_head(&tfile->socket.wq.wait);
3392
3393        tfile->socket.file = file;
3394        tfile->socket.ops = &tun_socket_ops;
3395
3396        sock_init_data(&tfile->socket, &tfile->sk);
3397
3398        tfile->sk.sk_write_space = tun_sock_write_space;
3399        tfile->sk.sk_sndbuf = INT_MAX;
3400
3401        file->private_data = tfile;
3402        INIT_LIST_HEAD(&tfile->next);
3403
3404        sock_set_flag(&tfile->sk, SOCK_ZEROCOPY);
3405
3406        return 0;
3407}
3408
3409static int tun_chr_close(struct inode *inode, struct file *file)
3410{
3411        struct tun_file *tfile = file->private_data;
3412
3413        tun_detach(tfile, true);
3414
3415        return 0;
3416}
3417
3418#ifdef CONFIG_PROC_FS
3419static void tun_chr_show_fdinfo(struct seq_file *m, struct file *file)
3420{
3421        struct tun_file *tfile = file->private_data;
3422        struct tun_struct *tun;
3423        struct ifreq ifr;
3424
3425        memset(&ifr, 0, sizeof(ifr));
3426
3427        rtnl_lock();
3428        tun = tun_get(tfile);
3429        if (tun)
3430                tun_get_iff(tun, &ifr);
3431        rtnl_unlock();
3432
3433        if (tun)
3434                tun_put(tun);
3435
3436        seq_printf(m, "iff:\t%s\n", ifr.ifr_name);
3437}
3438#endif
3439
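/* Illustrative only: with the hook above, the interface a tun descriptor is
 * attached to can be read from procfs; /proc/<pid>/fdinfo/<fd> then carries
 * an extra line of the form "iff:\ttap0" (device name is an example) next to
 * the generic pos/flags fields.
 */
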
3440static const struct file_operations tun_fops = {
3441        .owner  = THIS_MODULE,
3442        .llseek = no_llseek,
3443        .read_iter  = tun_chr_read_iter,
3444        .write_iter = tun_chr_write_iter,
3445        .poll   = tun_chr_poll,
3446        .unlocked_ioctl = tun_chr_ioctl,
3447#ifdef CONFIG_COMPAT
3448        .compat_ioctl = tun_chr_compat_ioctl,
3449#endif
3450        .open   = tun_chr_open,
3451        .release = tun_chr_close,
3452        .fasync = tun_chr_fasync,
3453#ifdef CONFIG_PROC_FS
3454        .show_fdinfo = tun_chr_show_fdinfo,
3455#endif
3456};
3457
3458static struct miscdevice tun_miscdev = {
3459        .minor = TUN_MINOR,
3460        .name = "tun",
3461        .nodename = "net/tun",
3462        .fops = &tun_fops,
3463};
3464
3465/* ethtool interface */
3466
3467static void tun_default_link_ksettings(struct net_device *dev,
3468                                       struct ethtool_link_ksettings *cmd)
3469{
3470        ethtool_link_ksettings_zero_link_mode(cmd, supported);
3471        ethtool_link_ksettings_zero_link_mode(cmd, advertising);
3472        cmd->base.speed         = SPEED_10;
3473        cmd->base.duplex        = DUPLEX_FULL;
3474        cmd->base.port          = PORT_TP;
3475        cmd->base.phy_address   = 0;
3476        cmd->base.autoneg       = AUTONEG_DISABLE;
3477}
3478
3479static int tun_get_link_ksettings(struct net_device *dev,
3480                                  struct ethtool_link_ksettings *cmd)
3481{
3482        struct tun_struct *tun = netdev_priv(dev);
3483
3484        memcpy(cmd, &tun->link_ksettings, sizeof(*cmd));
3485        return 0;
3486}
3487
3488static int tun_set_link_ksettings(struct net_device *dev,
3489                                  const struct ethtool_link_ksettings *cmd)
3490{
3491        struct tun_struct *tun = netdev_priv(dev);
3492
3493        memcpy(&tun->link_ksettings, cmd, sizeof(*cmd));
3494        return 0;
3495}
3496
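/* Illustrative only: the link settings above are purely synthetic; they can
 * be read back and overridden from userspace, e.g. with the ethtool utility
 * (device name is an example):
 *
 *	$ ethtool tap0				# reports Speed: 10Mb/s, Duplex: Full
 *	$ ethtool -s tap0 speed 1000 duplex full
 */
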
3497static void tun_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
3498{
3499        struct tun_struct *tun = netdev_priv(dev);
3500
3501        strlcpy(info->driver, DRV_NAME, sizeof(info->driver));
3502        strlcpy(info->version, DRV_VERSION, sizeof(info->version));
3503
3504        switch (tun->flags & TUN_TYPE_MASK) {
3505        case IFF_TUN:
3506                strlcpy(info->bus_info, "tun", sizeof(info->bus_info));
3507                break;
3508        case IFF_TAP:
3509                strlcpy(info->bus_info, "tap", sizeof(info->bus_info));
3510                break;
3511        }
3512}
3513
3514static u32 tun_get_msglevel(struct net_device *dev)
3515{
3516        struct tun_struct *tun = netdev_priv(dev);
3517
3518        return tun->msg_enable;
3519}
3520
3521static void tun_set_msglevel(struct net_device *dev, u32 value)
3522{
3523        struct tun_struct *tun = netdev_priv(dev);
3524
3525        tun->msg_enable = value;
3526}
3527
3528static int tun_get_coalesce(struct net_device *dev,
3529                            struct ethtool_coalesce *ec)
3530{
3531        struct tun_struct *tun = netdev_priv(dev);
3532
3533        ec->rx_max_coalesced_frames = tun->rx_batched;
3534
3535        return 0;
3536}
3537
3538static int tun_set_coalesce(struct net_device *dev,
3539                            struct ethtool_coalesce *ec)
3540{
3541        struct tun_struct *tun = netdev_priv(dev);
3542
3543        if (ec->rx_max_coalesced_frames > NAPI_POLL_WEIGHT)
3544                tun->rx_batched = NAPI_POLL_WEIGHT;
3545        else
3546                tun->rx_batched = ec->rx_max_coalesced_frames;
3547
3548        return 0;
3549}
3550
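/* Illustrative only: rx_batched bounds how many received packets are queued
 * in tun_rx_batched() before being flushed to the stack; it is exposed as the
 * standard rx-frames coalescing parameter, e.g. (device name is an example):
 *
 *	$ ethtool -C tap0 rx-frames 32
 *	$ ethtool -c tap0			# shows rx-frames: 32
 */
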
3551static const struct ethtool_ops tun_ethtool_ops = {
3552        .supported_coalesce_params = ETHTOOL_COALESCE_RX_MAX_FRAMES,
3553        .get_drvinfo    = tun_get_drvinfo,
3554        .get_msglevel   = tun_get_msglevel,
3555        .set_msglevel   = tun_set_msglevel,
3556        .get_link       = ethtool_op_get_link,
3557        .get_ts_info    = ethtool_op_get_ts_info,
3558        .get_coalesce   = tun_get_coalesce,
3559        .set_coalesce   = tun_set_coalesce,
3560        .get_link_ksettings = tun_get_link_ksettings,
3561        .set_link_ksettings = tun_set_link_ksettings,
3562};
3563
3564static int tun_queue_resize(struct tun_struct *tun)
3565{
3566        struct net_device *dev = tun->dev;
3567        struct tun_file *tfile;
3568        struct ptr_ring **rings;
3569        int n = tun->numqueues + tun->numdisabled;
3570        int ret, i;
3571
3572        rings = kmalloc_array(n, sizeof(*rings), GFP_KERNEL);
3573        if (!rings)
3574                return -ENOMEM;
3575
3576        for (i = 0; i < tun->numqueues; i++) {
3577                tfile = rtnl_dereference(tun->tfiles[i]);
3578                rings[i] = &tfile->tx_ring;
3579        }
3580        list_for_each_entry(tfile, &tun->disabled, next)
3581                rings[i++] = &tfile->tx_ring;
3582
3583        ret = ptr_ring_resize_multiple(rings, n,
3584                                       dev->tx_queue_len, GFP_KERNEL,
3585                                       tun_ptr_free);
3586
3587        kfree(rings);
3588        return ret;
3589}
3590
3591static int tun_device_event(struct notifier_block *unused,
3592                            unsigned long event, void *ptr)
3593{
3594        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3595        struct tun_struct *tun = netdev_priv(dev);
3596        int i;
3597
3598        if (dev->rtnl_link_ops != &tun_link_ops)
3599                return NOTIFY_DONE;
3600
3601        switch (event) {
3602        case NETDEV_CHANGE_TX_QUEUE_LEN:
3603                if (tun_queue_resize(tun))
3604                        return NOTIFY_BAD;
3605                break;
3606        case NETDEV_UP:
3607                for (i = 0; i < tun->numqueues; i++) {
3608                        struct tun_file *tfile;
3609
3610                        tfile = rtnl_dereference(tun->tfiles[i]);
3611                        tfile->socket.sk->sk_write_space(tfile->socket.sk);
3612                }
3613                break;
3614        default:
3615                break;
3616        }
3617
3618        return NOTIFY_DONE;
3619}
3620
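/* Illustrative only: the NETDEV_CHANGE_TX_QUEUE_LEN case above is what keeps
 * the per-queue ptr_rings in sync with a txqueuelen change made from
 * userspace, e.g. (device name is an example):
 *
 *	$ ip link set dev tap0 txqueuelen 5000
 */
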
3621static struct notifier_block tun_notifier_block __read_mostly = {
3622        .notifier_call  = tun_device_event,
3623};
3624
3625static int __init tun_init(void)
3626{
3627        int ret = 0;
3628
3629        pr_info("%s, %s\n", DRV_DESCRIPTION, DRV_VERSION);
3630
3631        ret = rtnl_link_register(&tun_link_ops);
3632        if (ret) {
3633                pr_err("Can't register link_ops\n");
3634                goto err_linkops;
3635        }
3636
3637        ret = misc_register(&tun_miscdev);
3638        if (ret) {
3639                pr_err("Can't register misc device %d\n", TUN_MINOR);
3640                goto err_misc;
3641        }
3642
3643        ret = register_netdevice_notifier(&tun_notifier_block);
3644        if (ret) {
3645                pr_err("Can't register netdevice notifier\n");
3646                goto err_notifier;
3647        }
3648
3649        return  0;
3650
3651err_notifier:
3652        misc_deregister(&tun_miscdev);
3653err_misc:
3654        rtnl_link_unregister(&tun_link_ops);
3655err_linkops:
3656        return ret;
3657}
3658
3659static void tun_cleanup(void)
3660{
3661        misc_deregister(&tun_miscdev);
3662        rtnl_link_unregister(&tun_link_ops);
3663        unregister_netdevice_notifier(&tun_notifier_block);
3664}
3665
3666/* Get an underlying socket object from tun file.  Returns error unless file is
3667 * attached to a device.  The returned object works like a packet socket; it
3668 * can be used for sock_sendmsg/sock_recvmsg.  The caller is responsible for
3669 * holding a reference to the file for as long as the socket is in use. */
3670struct socket *tun_get_socket(struct file *file)
3671{
3672        struct tun_file *tfile;
3673        if (file->f_op != &tun_fops)
3674                return ERR_PTR(-EINVAL);
3675        tfile = file->private_data;
3676        if (!tfile)
3677                return ERR_PTR(-EBADFD);
3678        return &tfile->socket;
3679}
3680EXPORT_SYMBOL_GPL(tun_get_socket);
3681
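/* Illustrative only: a rough sketch of how an in-kernel consumer (vhost-net
 * is the in-tree user) turns a tun fd handed over from userspace into the
 * socket exported above; the helper name is hypothetical.
 *
 *	static struct socket *example_get_tun_socket(int fd)
 *	{
 *		struct file *file = fget(fd);
 *		struct socket *sock;
 *
 *		if (!file)
 *			return ERR_PTR(-EBADF);
 *		sock = tun_get_socket(file);
 *		if (IS_ERR(sock))
 *			fput(file);	// not a tun file, drop our reference
 *		return sock;		// on success the caller keeps the file ref
 *	}
 */
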
3682struct ptr_ring *tun_get_tx_ring(struct file *file)
3683{
3684        struct tun_file *tfile;
3685
3686        if (file->f_op != &tun_fops)
3687                return ERR_PTR(-EINVAL);
3688        tfile = file->private_data;
3689        if (!tfile)
3690                return ERR_PTR(-EBADFD);
3691        return &tfile->tx_ring;
3692}
3693EXPORT_SYMBOL_GPL(tun_get_tx_ring);
3694
3695module_init(tun_init);
3696module_exit(tun_cleanup);
3697MODULE_DESCRIPTION(DRV_DESCRIPTION);
3698MODULE_AUTHOR(DRV_COPYRIGHT);
3699MODULE_LICENSE("GPL");
3700MODULE_ALIAS_MISCDEV(TUN_MINOR);
3701MODULE_ALIAS("devname:net/tun");
3702