linux/drivers/net/tun.c
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 *  TUN - Universal TUN/TAP device driver.
   4 *  Copyright (C) 1999-2002 Maxim Krasnyansky <maxk@qualcomm.com>
   5 *
   6 *  $Id: tun.c,v 1.15 2002/03/01 02:44:24 maxk Exp $
   7 */
   8
   9/*
  10 *  Changes:
  11 *
  12 *  Mike Kershaw <dragorn@kismetwireless.net> 2005/08/14
  13 *    Add TUNSETLINK ioctl to set the link encapsulation
  14 *
  15 *  Mark Smith <markzzzsmith@yahoo.com.au>
  16 *    Use eth_random_addr() for tap MAC address.
  17 *
  18 *  Harald Roelle <harald.roelle@ifi.lmu.de>  2004/04/20
  19 *    Fixes in packet dropping, queue length setting and queue wakeup.
  20 *    Increased default tx queue length.
  21 *    Added ethtool API.
  22 *    Minor cleanups
  23 *
  24 *  Daniel Podlejski <underley@underley.eu.org>
  25 *    Modifications for 2.3.99-pre5 kernel.
  26 */
  27
  28#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  29
  30#define DRV_NAME        "tun"
  31#define DRV_VERSION     "1.6"
  32#define DRV_DESCRIPTION "Universal TUN/TAP device driver"
  33#define DRV_COPYRIGHT   "(C) 1999-2004 Max Krasnyansky <maxk@qualcomm.com>"
  34
  35#include <linux/module.h>
  36#include <linux/errno.h>
  37#include <linux/kernel.h>
  38#include <linux/sched/signal.h>
  39#include <linux/major.h>
  40#include <linux/slab.h>
  41#include <linux/poll.h>
  42#include <linux/fcntl.h>
  43#include <linux/init.h>
  44#include <linux/skbuff.h>
  45#include <linux/netdevice.h>
  46#include <linux/etherdevice.h>
  47#include <linux/miscdevice.h>
  48#include <linux/ethtool.h>
  49#include <linux/rtnetlink.h>
  50#include <linux/compat.h>
  51#include <linux/if.h>
  52#include <linux/if_arp.h>
  53#include <linux/if_ether.h>
  54#include <linux/if_tun.h>
  55#include <linux/if_vlan.h>
  56#include <linux/crc32.h>
  57#include <linux/nsproxy.h>
  58#include <linux/virtio_net.h>
  59#include <linux/rcupdate.h>
  60#include <net/net_namespace.h>
  61#include <net/netns/generic.h>
  62#include <net/rtnetlink.h>
  63#include <net/sock.h>
  64#include <net/xdp.h>
  65#include <net/ip_tunnels.h>
  66#include <linux/seq_file.h>
  67#include <linux/uio.h>
  68#include <linux/skb_array.h>
  69#include <linux/bpf.h>
  70#include <linux/bpf_trace.h>
  71#include <linux/mutex.h>
  72
  73#include <linux/uaccess.h>
  74#include <linux/proc_fs.h>
  75
  76static void tun_default_link_ksettings(struct net_device *dev,
  77                                       struct ethtool_link_ksettings *cmd);
  78
  79#define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
  80
  81/* TUN device flags */
  82
  83/* IFF_ATTACH_QUEUE is never stored in device flags,
  84 * overload it to mean fasync when stored there.
  85 */
  86#define TUN_FASYNC      IFF_ATTACH_QUEUE
  87/* High bits in flags field are unused. */
  88#define TUN_VNET_LE     0x80000000
  89#define TUN_VNET_BE     0x40000000
  90
  91#define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \
  92                      IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS)
  93
  94#define GOODCOPY_LEN 128
  95
  96#define FLT_EXACT_COUNT 8
  97struct tap_filter {
  98        unsigned int    count;    /* Number of addrs. Zero means disabled */
  99        u32             mask[2];  /* Mask of the hashed addrs */
 100        unsigned char   addr[FLT_EXACT_COUNT][ETH_ALEN];
 101};
 102
 103/* MAX_TAP_QUEUES 256 is chosen to allow rx/tx queues to be equal
  104 * to the max number of VCPUs in a guest. */
 105#define MAX_TAP_QUEUES 256
 106#define MAX_TAP_FLOWS  4096
 107
 108#define TUN_FLOW_EXPIRE (3 * HZ)
 109
 110struct tun_pcpu_stats {
 111        u64_stats_t rx_packets;
 112        u64_stats_t rx_bytes;
 113        u64_stats_t tx_packets;
 114        u64_stats_t tx_bytes;
 115        struct u64_stats_sync syncp;
 116        u32 rx_dropped;
 117        u32 tx_dropped;
 118        u32 rx_frame_errors;
 119};
 120
 121/* A tun_file connects an open character device to a tuntap netdevice. It
 122 * also contains all socket related structures (except sock_fprog and tap_filter)
  123 * to serve as one transmit queue for the tuntap device. The sock_fprog and
  124 * tap_filter are kept in tun_struct since they are used for filtering on the
  125 * netdevice, not on a specific queue (at least I didn't see the requirement for
 126 * this).
 127 *
 128 * RCU usage:
 129 * The tun_file and tun_struct are loosely coupled, the pointer from one to the
 130 * other can only be read while rcu_read_lock or rtnl_lock is held.
 131 */
 132struct tun_file {
 133        struct sock sk;
 134        struct socket socket;
 135        struct tun_struct __rcu *tun;
 136        struct fasync_struct *fasync;
  137        /* only used for fasync */
 138        unsigned int flags;
 139        union {
 140                u16 queue_index;
 141                unsigned int ifindex;
 142        };
 143        struct napi_struct napi;
 144        bool napi_enabled;
 145        bool napi_frags_enabled;
 146        struct mutex napi_mutex;        /* Protects access to the above napi */
 147        struct list_head next;
 148        struct tun_struct *detached;
 149        struct ptr_ring tx_ring;
 150        struct xdp_rxq_info xdp_rxq;
 151};
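/* Entries in tx_ring are either sk_buff or xdp_frame pointers; the low bit of
 * the pointer is used as a tag so the consumer can tell the two apart (see
 * tun_ptr_free() and the encoding note in tun_xdp_xmit()).
 */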
 152
 153struct tun_page {
 154        struct page *page;
 155        int count;
 156};
 157
 158struct tun_flow_entry {
 159        struct hlist_node hash_link;
 160        struct rcu_head rcu;
 161        struct tun_struct *tun;
 162
 163        u32 rxhash;
 164        u32 rps_rxhash;
 165        int queue_index;
 166        unsigned long updated ____cacheline_aligned_in_smp;
 167};
 168
 169#define TUN_NUM_FLOW_ENTRIES 1024
 170#define TUN_MASK_FLOW_ENTRIES (TUN_NUM_FLOW_ENTRIES - 1)
 171
 172struct tun_prog {
 173        struct rcu_head rcu;
 174        struct bpf_prog *prog;
 175};
 176
  177/* Since the socket was moved to tun_file, the socket filter, sndbuf and vnet
  178 * header size are restored when a file is attached to a persistent device, to
  179 * preserve the behavior of a persistent device.
 180 */
 181struct tun_struct {
 182        struct tun_file __rcu   *tfiles[MAX_TAP_QUEUES];
 183        unsigned int            numqueues;
 184        unsigned int            flags;
 185        kuid_t                  owner;
 186        kgid_t                  group;
 187
 188        struct net_device       *dev;
 189        netdev_features_t       set_features;
 190#define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \
 191                          NETIF_F_TSO6)
 192
 193        int                     align;
 194        int                     vnet_hdr_sz;
 195        int                     sndbuf;
 196        struct tap_filter       txflt;
 197        struct sock_fprog       fprog;
 198        /* protected by rtnl lock */
 199        bool                    filter_attached;
 200        u32                     msg_enable;
 201        spinlock_t lock;
 202        struct hlist_head flows[TUN_NUM_FLOW_ENTRIES];
 203        struct timer_list flow_gc_timer;
 204        unsigned long ageing_time;
 205        unsigned int numdisabled;
 206        struct list_head disabled;
 207        void *security;
 208        u32 flow_count;
 209        u32 rx_batched;
 210        struct tun_pcpu_stats __percpu *pcpu_stats;
 211        struct bpf_prog __rcu *xdp_prog;
 212        struct tun_prog __rcu *steering_prog;
 213        struct tun_prog __rcu *filter_prog;
 214        struct ethtool_link_ksettings link_ksettings;
 215};
 216
 217struct veth {
 218        __be16 h_vlan_proto;
 219        __be16 h_vlan_TCI;
 220};
 221
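/* NAPI receive path (IFF_NAPI): drain the skbs that were queued on
 * sk_write_queue and feed them to GRO. Anything beyond the budget is
 * spliced back onto the queue for the next poll.
 */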
 222static int tun_napi_receive(struct napi_struct *napi, int budget)
 223{
 224        struct tun_file *tfile = container_of(napi, struct tun_file, napi);
 225        struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
 226        struct sk_buff_head process_queue;
 227        struct sk_buff *skb;
 228        int received = 0;
 229
 230        __skb_queue_head_init(&process_queue);
 231
 232        spin_lock(&queue->lock);
 233        skb_queue_splice_tail_init(queue, &process_queue);
 234        spin_unlock(&queue->lock);
 235
 236        while (received < budget && (skb = __skb_dequeue(&process_queue))) {
 237                napi_gro_receive(napi, skb);
 238                ++received;
 239        }
 240
 241        if (!skb_queue_empty(&process_queue)) {
 242                spin_lock(&queue->lock);
 243                skb_queue_splice(&process_queue, queue);
 244                spin_unlock(&queue->lock);
 245        }
 246
 247        return received;
 248}
 249
 250static int tun_napi_poll(struct napi_struct *napi, int budget)
 251{
 252        unsigned int received;
 253
 254        received = tun_napi_receive(napi, budget);
 255
 256        if (received < budget)
 257                napi_complete_done(napi, received);
 258
 259        return received;
 260}
 261
 262static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile,
 263                          bool napi_en, bool napi_frags)
 264{
 265        tfile->napi_enabled = napi_en;
 266        tfile->napi_frags_enabled = napi_en && napi_frags;
 267        if (napi_en) {
 268                netif_tx_napi_add(tun->dev, &tfile->napi, tun_napi_poll,
 269                                  NAPI_POLL_WEIGHT);
 270                napi_enable(&tfile->napi);
 271        }
 272}
 273
 274static void tun_napi_disable(struct tun_file *tfile)
 275{
 276        if (tfile->napi_enabled)
 277                napi_disable(&tfile->napi);
 278}
 279
 280static void tun_napi_del(struct tun_file *tfile)
 281{
 282        if (tfile->napi_enabled)
 283                netif_napi_del(&tfile->napi);
 284}
 285
 286static bool tun_napi_frags_enabled(const struct tun_file *tfile)
 287{
 288        return tfile->napi_frags_enabled;
 289}
 290
 291#ifdef CONFIG_TUN_VNET_CROSS_LE
 292static inline bool tun_legacy_is_little_endian(struct tun_struct *tun)
 293{
 294        return tun->flags & TUN_VNET_BE ? false :
 295                virtio_legacy_is_little_endian();
 296}
 297
 298static long tun_get_vnet_be(struct tun_struct *tun, int __user *argp)
 299{
 300        int be = !!(tun->flags & TUN_VNET_BE);
 301
 302        if (put_user(be, argp))
 303                return -EFAULT;
 304
 305        return 0;
 306}
 307
 308static long tun_set_vnet_be(struct tun_struct *tun, int __user *argp)
 309{
 310        int be;
 311
 312        if (get_user(be, argp))
 313                return -EFAULT;
 314
 315        if (be)
 316                tun->flags |= TUN_VNET_BE;
 317        else
 318                tun->flags &= ~TUN_VNET_BE;
 319
 320        return 0;
 321}
 322#else
 323static inline bool tun_legacy_is_little_endian(struct tun_struct *tun)
 324{
 325        return virtio_legacy_is_little_endian();
 326}
 327
 328static long tun_get_vnet_be(struct tun_struct *tun, int __user *argp)
 329{
 330        return -EINVAL;
 331}
 332
 333static long tun_set_vnet_be(struct tun_struct *tun, int __user *argp)
 334{
 335        return -EINVAL;
 336}
 337#endif /* CONFIG_TUN_VNET_CROSS_LE */
 338
 339static inline bool tun_is_little_endian(struct tun_struct *tun)
 340{
 341        return tun->flags & TUN_VNET_LE ||
 342                tun_legacy_is_little_endian(tun);
 343}
 344
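/* tun16_to_cpu()/cpu_to_tun16() convert the __virtio16 fields of the vnet
 * header according to the endianness negotiated for this device: TUN_VNET_LE
 * forces little-endian, otherwise the legacy (native) byte order is used,
 * optionally overridden by TUN_VNET_BE under CONFIG_TUN_VNET_CROSS_LE.
 */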
 345static inline u16 tun16_to_cpu(struct tun_struct *tun, __virtio16 val)
 346{
 347        return __virtio16_to_cpu(tun_is_little_endian(tun), val);
 348}
 349
 350static inline __virtio16 cpu_to_tun16(struct tun_struct *tun, u16 val)
 351{
 352        return __cpu_to_virtio16(tun_is_little_endian(tun), val);
 353}
 354
 355static inline u32 tun_hashfn(u32 rxhash)
 356{
 357        return rxhash & TUN_MASK_FLOW_ENTRIES;
 358}
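/* The flow table has TUN_NUM_FLOW_ENTRIES (a power of two) buckets, so the
 * bucket is simply the low bits of the skb hash; e.g. an rxhash of 0x12345678
 * masked with 0x3ff selects bucket 0x278.
 */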
 359
 360static struct tun_flow_entry *tun_flow_find(struct hlist_head *head, u32 rxhash)
 361{
 362        struct tun_flow_entry *e;
 363
 364        hlist_for_each_entry_rcu(e, head, hash_link) {
 365                if (e->rxhash == rxhash)
 366                        return e;
 367        }
 368        return NULL;
 369}
 370
 371static struct tun_flow_entry *tun_flow_create(struct tun_struct *tun,
 372                                              struct hlist_head *head,
 373                                              u32 rxhash, u16 queue_index)
 374{
 375        struct tun_flow_entry *e = kmalloc(sizeof(*e), GFP_ATOMIC);
 376
 377        if (e) {
 378                netif_info(tun, tx_queued, tun->dev,
 379                           "create flow: hash %u index %u\n",
 380                           rxhash, queue_index);
 381                e->updated = jiffies;
 382                e->rxhash = rxhash;
 383                e->rps_rxhash = 0;
 384                e->queue_index = queue_index;
 385                e->tun = tun;
 386                hlist_add_head_rcu(&e->hash_link, head);
 387                ++tun->flow_count;
 388        }
 389        return e;
 390}
 391
 392static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e)
 393{
 394        netif_info(tun, tx_queued, tun->dev, "delete flow: hash %u index %u\n",
 395                   e->rxhash, e->queue_index);
 396        hlist_del_rcu(&e->hash_link);
 397        kfree_rcu(e, rcu);
 398        --tun->flow_count;
 399}
 400
 401static void tun_flow_flush(struct tun_struct *tun)
 402{
 403        int i;
 404
 405        spin_lock_bh(&tun->lock);
 406        for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
 407                struct tun_flow_entry *e;
 408                struct hlist_node *n;
 409
 410                hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link)
 411                        tun_flow_delete(tun, e);
 412        }
 413        spin_unlock_bh(&tun->lock);
 414}
 415
 416static void tun_flow_delete_by_queue(struct tun_struct *tun, u16 queue_index)
 417{
 418        int i;
 419
 420        spin_lock_bh(&tun->lock);
 421        for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
 422                struct tun_flow_entry *e;
 423                struct hlist_node *n;
 424
 425                hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) {
 426                        if (e->queue_index == queue_index)
 427                                tun_flow_delete(tun, e);
 428                }
 429        }
 430        spin_unlock_bh(&tun->lock);
 431}
 432
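/* Flow table garbage collection: drop entries that have not been updated
 * within ageing_time and, if anything is left, re-arm the timer for the
 * earliest remaining expiry.
 */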
 433static void tun_flow_cleanup(struct timer_list *t)
 434{
 435        struct tun_struct *tun = from_timer(tun, t, flow_gc_timer);
 436        unsigned long delay = tun->ageing_time;
 437        unsigned long next_timer = jiffies + delay;
 438        unsigned long count = 0;
 439        int i;
 440
 441        spin_lock(&tun->lock);
 442        for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
 443                struct tun_flow_entry *e;
 444                struct hlist_node *n;
 445
 446                hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) {
 447                        unsigned long this_timer;
 448
 449                        this_timer = e->updated + delay;
 450                        if (time_before_eq(this_timer, jiffies)) {
 451                                tun_flow_delete(tun, e);
 452                                continue;
 453                        }
 454                        count++;
 455                        if (time_before(this_timer, next_timer))
 456                                next_timer = this_timer;
 457                }
 458        }
 459
 460        if (count)
 461                mod_timer(&tun->flow_gc_timer, round_jiffies_up(next_timer));
 462        spin_unlock(&tun->lock);
 463}
 464
 465static void tun_flow_update(struct tun_struct *tun, u32 rxhash,
 466                            struct tun_file *tfile)
 467{
 468        struct hlist_head *head;
 469        struct tun_flow_entry *e;
 470        unsigned long delay = tun->ageing_time;
 471        u16 queue_index = tfile->queue_index;
 472
 473        head = &tun->flows[tun_hashfn(rxhash)];
 474
 475        rcu_read_lock();
 476
 477        e = tun_flow_find(head, rxhash);
 478        if (likely(e)) {
 479                /* TODO: keep queueing to old queue until it's empty? */
 480                if (READ_ONCE(e->queue_index) != queue_index)
 481                        WRITE_ONCE(e->queue_index, queue_index);
 482                if (e->updated != jiffies)
 483                        e->updated = jiffies;
 484                sock_rps_record_flow_hash(e->rps_rxhash);
 485        } else {
 486                spin_lock_bh(&tun->lock);
 487                if (!tun_flow_find(head, rxhash) &&
 488                    tun->flow_count < MAX_TAP_FLOWS)
 489                        tun_flow_create(tun, head, rxhash, queue_index);
 490
 491                if (!timer_pending(&tun->flow_gc_timer))
 492                        mod_timer(&tun->flow_gc_timer,
 493                                  round_jiffies_up(jiffies + delay));
 494                spin_unlock_bh(&tun->lock);
 495        }
 496
 497        rcu_read_unlock();
 498}
 499
 500/* Save the hash received in the stack receive path and update the
 501 * flow_hash table accordingly.
 502 */
 503static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash)
 504{
 505        if (unlikely(e->rps_rxhash != hash))
 506                e->rps_rxhash = hash;
 507}
 508
 509/* We try to identify a flow through its rxhash. The reason that
  510 * we do not check the rxq no. is that some cards (e.g. the 82599) choose
  511 * the rxq based on the txq where the last packet of the flow was sent. As
  512 * the userspace application moves between processors, we may get a
 513 * different rxq no. here.
 514 */
 515static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
 516{
 517        struct tun_flow_entry *e;
 518        u32 txq = 0;
 519        u32 numqueues = 0;
 520
 521        numqueues = READ_ONCE(tun->numqueues);
 522
 523        txq = __skb_get_hash_symmetric(skb);
 524        e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq);
 525        if (e) {
 526                tun_flow_save_rps_rxhash(e, txq);
 527                txq = e->queue_index;
 528        } else {
 529                /* use multiply and shift instead of expensive divide */
 530                txq = ((u64)txq * numqueues) >> 32;
 531        }
 532
 533        return txq;
 534}
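/* The multiply-and-shift above maps the 32-bit hash roughly uniformly onto
 * [0, numqueues): e.g. with numqueues == 4, a hash of 0x80000000 gives
 * (0x80000000ULL * 4) >> 32 == 2.
 */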
 535
 536static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb)
 537{
 538        struct tun_prog *prog;
 539        u32 numqueues;
 540        u16 ret = 0;
 541
 542        numqueues = READ_ONCE(tun->numqueues);
 543        if (!numqueues)
 544                return 0;
 545
 546        prog = rcu_dereference(tun->steering_prog);
 547        if (prog)
 548                ret = bpf_prog_run_clear_cb(prog->prog, skb);
 549
 550        return ret % numqueues;
 551}
 552
 553static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
 554                            struct net_device *sb_dev)
 555{
 556        struct tun_struct *tun = netdev_priv(dev);
 557        u16 ret;
 558
 559        rcu_read_lock();
 560        if (rcu_dereference(tun->steering_prog))
 561                ret = tun_ebpf_select_queue(tun, skb);
 562        else
 563                ret = tun_automq_select_queue(tun, skb);
 564        rcu_read_unlock();
 565
 566        return ret;
 567}
 568
 569static inline bool tun_not_capable(struct tun_struct *tun)
 570{
 571        const struct cred *cred = current_cred();
 572        struct net *net = dev_net(tun->dev);
 573
 574        return ((uid_valid(tun->owner) && !uid_eq(cred->euid, tun->owner)) ||
 575                  (gid_valid(tun->group) && !in_egroup_p(tun->group))) &&
 576                !ns_capable(net->user_ns, CAP_NET_ADMIN);
 577}
 578
 579static void tun_set_real_num_queues(struct tun_struct *tun)
 580{
 581        netif_set_real_num_tx_queues(tun->dev, tun->numqueues);
 582        netif_set_real_num_rx_queues(tun->dev, tun->numqueues);
 583}
 584
 585static void tun_disable_queue(struct tun_struct *tun, struct tun_file *tfile)
 586{
 587        tfile->detached = tun;
 588        list_add_tail(&tfile->next, &tun->disabled);
 589        ++tun->numdisabled;
 590}
 591
 592static struct tun_struct *tun_enable_queue(struct tun_file *tfile)
 593{
 594        struct tun_struct *tun = tfile->detached;
 595
 596        tfile->detached = NULL;
 597        list_del_init(&tfile->next);
 598        --tun->numdisabled;
 599        return tun;
 600}
 601
 602void tun_ptr_free(void *ptr)
 603{
 604        if (!ptr)
 605                return;
 606        if (tun_is_xdp_frame(ptr)) {
 607                struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
 608
 609                xdp_return_frame(xdpf);
 610        } else {
 611                __skb_array_destroy_skb(ptr);
 612        }
 613}
 614EXPORT_SYMBOL_GPL(tun_ptr_free);
 615
 616static void tun_queue_purge(struct tun_file *tfile)
 617{
 618        void *ptr;
 619
 620        while ((ptr = ptr_ring_consume(&tfile->tx_ring)) != NULL)
 621                tun_ptr_free(ptr);
 622
 623        skb_queue_purge(&tfile->sk.sk_write_queue);
 624        skb_queue_purge(&tfile->sk.sk_error_queue);
 625}
 626
 627static void __tun_detach(struct tun_file *tfile, bool clean)
 628{
 629        struct tun_file *ntfile;
 630        struct tun_struct *tun;
 631
 632        tun = rtnl_dereference(tfile->tun);
 633
 634        if (tun && clean) {
 635                tun_napi_disable(tfile);
 636                tun_napi_del(tfile);
 637        }
 638
 639        if (tun && !tfile->detached) {
 640                u16 index = tfile->queue_index;
 641                BUG_ON(index >= tun->numqueues);
 642
 643                rcu_assign_pointer(tun->tfiles[index],
 644                                   tun->tfiles[tun->numqueues - 1]);
 645                ntfile = rtnl_dereference(tun->tfiles[index]);
 646                ntfile->queue_index = index;
 647                rcu_assign_pointer(tun->tfiles[tun->numqueues - 1],
 648                                   NULL);
 649
 650                --tun->numqueues;
 651                if (clean) {
 652                        RCU_INIT_POINTER(tfile->tun, NULL);
 653                        sock_put(&tfile->sk);
 654                } else
 655                        tun_disable_queue(tun, tfile);
 656
 657                synchronize_net();
 658                tun_flow_delete_by_queue(tun, tun->numqueues + 1);
 659                /* Drop read queue */
 660                tun_queue_purge(tfile);
 661                tun_set_real_num_queues(tun);
 662        } else if (tfile->detached && clean) {
 663                tun = tun_enable_queue(tfile);
 664                sock_put(&tfile->sk);
 665        }
 666
 667        if (clean) {
 668                if (tun && tun->numqueues == 0 && tun->numdisabled == 0) {
 669                        netif_carrier_off(tun->dev);
 670
 671                        if (!(tun->flags & IFF_PERSIST) &&
 672                            tun->dev->reg_state == NETREG_REGISTERED)
 673                                unregister_netdevice(tun->dev);
 674                }
 675                if (tun)
 676                        xdp_rxq_info_unreg(&tfile->xdp_rxq);
 677                ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free);
 678                sock_put(&tfile->sk);
 679        }
 680}
 681
 682static void tun_detach(struct tun_file *tfile, bool clean)
 683{
 684        struct tun_struct *tun;
 685        struct net_device *dev;
 686
 687        rtnl_lock();
 688        tun = rtnl_dereference(tfile->tun);
 689        dev = tun ? tun->dev : NULL;
 690        __tun_detach(tfile, clean);
 691        if (dev)
 692                netdev_state_change(dev);
 693        rtnl_unlock();
 694}
 695
 696static void tun_detach_all(struct net_device *dev)
 697{
 698        struct tun_struct *tun = netdev_priv(dev);
 699        struct tun_file *tfile, *tmp;
 700        int i, n = tun->numqueues;
 701
 702        for (i = 0; i < n; i++) {
 703                tfile = rtnl_dereference(tun->tfiles[i]);
 704                BUG_ON(!tfile);
 705                tun_napi_disable(tfile);
 706                tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN;
 707                tfile->socket.sk->sk_data_ready(tfile->socket.sk);
 708                RCU_INIT_POINTER(tfile->tun, NULL);
 709                --tun->numqueues;
 710        }
 711        list_for_each_entry(tfile, &tun->disabled, next) {
 712                tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN;
 713                tfile->socket.sk->sk_data_ready(tfile->socket.sk);
 714                RCU_INIT_POINTER(tfile->tun, NULL);
 715        }
 716        BUG_ON(tun->numqueues != 0);
 717
 718        synchronize_net();
 719        for (i = 0; i < n; i++) {
 720                tfile = rtnl_dereference(tun->tfiles[i]);
 721                tun_napi_del(tfile);
 722                /* Drop read queue */
 723                tun_queue_purge(tfile);
 724                xdp_rxq_info_unreg(&tfile->xdp_rxq);
 725                sock_put(&tfile->sk);
 726        }
 727        list_for_each_entry_safe(tfile, tmp, &tun->disabled, next) {
 728                tun_enable_queue(tfile);
 729                tun_queue_purge(tfile);
 730                xdp_rxq_info_unreg(&tfile->xdp_rxq);
 731                sock_put(&tfile->sk);
 732        }
 733        BUG_ON(tun->numdisabled != 0);
 734
 735        if (tun->flags & IFF_PERSIST)
 736                module_put(THIS_MODULE);
 737}
 738
 739static int tun_attach(struct tun_struct *tun, struct file *file,
 740                      bool skip_filter, bool napi, bool napi_frags,
 741                      bool publish_tun)
 742{
 743        struct tun_file *tfile = file->private_data;
 744        struct net_device *dev = tun->dev;
 745        int err;
 746
 747        err = security_tun_dev_attach(tfile->socket.sk, tun->security);
 748        if (err < 0)
 749                goto out;
 750
 751        err = -EINVAL;
 752        if (rtnl_dereference(tfile->tun) && !tfile->detached)
 753                goto out;
 754
 755        err = -EBUSY;
 756        if (!(tun->flags & IFF_MULTI_QUEUE) && tun->numqueues == 1)
 757                goto out;
 758
 759        err = -E2BIG;
 760        if (!tfile->detached &&
 761            tun->numqueues + tun->numdisabled == MAX_TAP_QUEUES)
 762                goto out;
 763
 764        err = 0;
 765
  766        /* Re-attach the filter to a persistent device */
 767        if (!skip_filter && (tun->filter_attached == true)) {
 768                lock_sock(tfile->socket.sk);
 769                err = sk_attach_filter(&tun->fprog, tfile->socket.sk);
 770                release_sock(tfile->socket.sk);
  771                if (err < 0)
 772                        goto out;
 773        }
 774
 775        if (!tfile->detached &&
 776            ptr_ring_resize(&tfile->tx_ring, dev->tx_queue_len,
 777                            GFP_KERNEL, tun_ptr_free)) {
 778                err = -ENOMEM;
 779                goto out;
 780        }
 781
 782        tfile->queue_index = tun->numqueues;
 783        tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN;
 784
 785        if (tfile->detached) {
 786                /* Re-attach detached tfile, updating XDP queue_index */
 787                WARN_ON(!xdp_rxq_info_is_reg(&tfile->xdp_rxq));
 788
 789                if (tfile->xdp_rxq.queue_index    != tfile->queue_index)
 790                        tfile->xdp_rxq.queue_index = tfile->queue_index;
 791        } else {
 792                /* Setup XDP RX-queue info, for new tfile getting attached */
 793                err = xdp_rxq_info_reg(&tfile->xdp_rxq,
 794                                       tun->dev, tfile->queue_index);
 795                if (err < 0)
 796                        goto out;
 797                err = xdp_rxq_info_reg_mem_model(&tfile->xdp_rxq,
 798                                                 MEM_TYPE_PAGE_SHARED, NULL);
 799                if (err < 0) {
 800                        xdp_rxq_info_unreg(&tfile->xdp_rxq);
 801                        goto out;
 802                }
 803                err = 0;
 804        }
 805
 806        if (tfile->detached) {
 807                tun_enable_queue(tfile);
 808        } else {
 809                sock_hold(&tfile->sk);
 810                tun_napi_init(tun, tfile, napi, napi_frags);
 811        }
 812
 813        if (rtnl_dereference(tun->xdp_prog))
 814                sock_set_flag(&tfile->sk, SOCK_XDP);
 815
 816        /* device is allowed to go away first, so no need to hold extra
 817         * refcnt.
 818         */
 819
 820        /* Publish tfile->tun and tun->tfiles only after we've fully
 821         * initialized tfile; otherwise we risk using half-initialized
 822         * object.
 823         */
 824        if (publish_tun)
 825                rcu_assign_pointer(tfile->tun, tun);
 826        rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
 827        tun->numqueues++;
 828        tun_set_real_num_queues(tun);
 829out:
 830        return err;
 831}
 832
 833static struct tun_struct *tun_get(struct tun_file *tfile)
 834{
 835        struct tun_struct *tun;
 836
 837        rcu_read_lock();
 838        tun = rcu_dereference(tfile->tun);
 839        if (tun)
 840                dev_hold(tun->dev);
 841        rcu_read_unlock();
 842
 843        return tun;
 844}
 845
 846static void tun_put(struct tun_struct *tun)
 847{
 848        dev_put(tun->dev);
 849}
 850
 851/* TAP filtering */
 852static void addr_hash_set(u32 *mask, const u8 *addr)
 853{
 854        int n = ether_crc(ETH_ALEN, addr) >> 26;
 855        mask[n >> 5] |= (1 << (n & 31));
 856}
 857
 858static unsigned int addr_hash_test(const u32 *mask, const u8 *addr)
 859{
 860        int n = ether_crc(ETH_ALEN, addr) >> 26;
 861        return mask[n >> 5] & (1 << (n & 31));
 862}
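/* The hashed filter is a 64-bit bitmap spread over mask[2]: the top 6 bits of
 * the CRC select one of 64 bit positions, e.g. n == 37 sets bit 5 of mask[1].
 */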
 863
 864static int update_filter(struct tap_filter *filter, void __user *arg)
 865{
 866        struct { u8 u[ETH_ALEN]; } *addr;
 867        struct tun_filter uf;
 868        int err, alen, n, nexact;
 869
 870        if (copy_from_user(&uf, arg, sizeof(uf)))
 871                return -EFAULT;
 872
 873        if (!uf.count) {
 874                /* Disabled */
 875                filter->count = 0;
 876                return 0;
 877        }
 878
 879        alen = ETH_ALEN * uf.count;
 880        addr = memdup_user(arg + sizeof(uf), alen);
 881        if (IS_ERR(addr))
 882                return PTR_ERR(addr);
 883
  884        /* The filter is updated without holding any locks, which is
  885         * perfectly safe: we disable it first, and in the worst
 886         * case we'll accept a few undesired packets. */
 887        filter->count = 0;
 888        wmb();
 889
 890        /* Use first set of addresses as an exact filter */
 891        for (n = 0; n < uf.count && n < FLT_EXACT_COUNT; n++)
 892                memcpy(filter->addr[n], addr[n].u, ETH_ALEN);
 893
 894        nexact = n;
 895
  896        /* Remaining multicast addresses are hashed; a
  897         * unicast address will leave the filter disabled. */
 898        memset(filter->mask, 0, sizeof(filter->mask));
 899        for (; n < uf.count; n++) {
 900                if (!is_multicast_ether_addr(addr[n].u)) {
 901                        err = 0; /* no filter */
 902                        goto free_addr;
 903                }
 904                addr_hash_set(filter->mask, addr[n].u);
 905        }
 906
 907        /* For ALLMULTI just set the mask to all ones.
 908         * This overrides the mask populated above. */
 909        if ((uf.flags & TUN_FLT_ALLMULTI))
 910                memset(filter->mask, ~0, sizeof(filter->mask));
 911
 912        /* Now enable the filter */
 913        wmb();
 914        filter->count = nexact;
 915
 916        /* Return the number of exact filters */
 917        err = nexact;
 918free_addr:
 919        kfree(addr);
 920        return err;
 921}
 922
 923/* Returns: 0 - drop, !=0 - accept */
 924static int run_filter(struct tap_filter *filter, const struct sk_buff *skb)
 925{
  926        /* Cannot use eth_hdr(skb) here because skb_mac_header() is incorrect
 927         * at this point. */
 928        struct ethhdr *eh = (struct ethhdr *) skb->data;
 929        int i;
 930
 931        /* Exact match */
 932        for (i = 0; i < filter->count; i++)
 933                if (ether_addr_equal(eh->h_dest, filter->addr[i]))
 934                        return 1;
 935
 936        /* Inexact match (multicast only) */
 937        if (is_multicast_ether_addr(eh->h_dest))
 938                return addr_hash_test(filter->mask, eh->h_dest);
 939
 940        return 0;
 941}
 942
 943/*
 944 * Checks whether the packet is accepted or not.
 945 * Returns: 0 - drop, !=0 - accept
 946 */
 947static int check_filter(struct tap_filter *filter, const struct sk_buff *skb)
 948{
 949        if (!filter->count)
 950                return 1;
 951
 952        return run_filter(filter, skb);
 953}
 954
 955/* Network device part of the driver */
 956
 957static const struct ethtool_ops tun_ethtool_ops;
 958
 959/* Net device detach from fd. */
 960static void tun_net_uninit(struct net_device *dev)
 961{
 962        tun_detach_all(dev);
 963}
 964
 965/* Net device open. */
 966static int tun_net_open(struct net_device *dev)
 967{
 968        netif_tx_start_all_queues(dev);
 969
 970        return 0;
 971}
 972
 973/* Net device close. */
 974static int tun_net_close(struct net_device *dev)
 975{
 976        netif_tx_stop_all_queues(dev);
 977        return 0;
 978}
 979
 980/* Net device start xmit */
 981static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb)
 982{
 983#ifdef CONFIG_RPS
 984        if (tun->numqueues == 1 && static_branch_unlikely(&rps_needed)) {
 985                /* Select queue was not called for the skbuff, so we extract the
 986                 * RPS hash and save it into the flow_table here.
 987                 */
 988                struct tun_flow_entry *e;
 989                __u32 rxhash;
 990
 991                rxhash = __skb_get_hash_symmetric(skb);
 992                e = tun_flow_find(&tun->flows[tun_hashfn(rxhash)], rxhash);
 993                if (e)
 994                        tun_flow_save_rps_rxhash(e, rxhash);
 995        }
 996#endif
 997}
 998
 999static unsigned int run_ebpf_filter(struct tun_struct *tun,
1000                                    struct sk_buff *skb,
1001                                    int len)
1002{
1003        struct tun_prog *prog = rcu_dereference(tun->filter_prog);
1004
1005        if (prog)
1006                len = bpf_prog_run_clear_cb(prog->prog, skb);
1007
1008        return len;
1009}
1010
1011/* Net device start xmit */
1012static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
1013{
1014        struct tun_struct *tun = netdev_priv(dev);
1015        int txq = skb->queue_mapping;
1016        struct tun_file *tfile;
1017        int len = skb->len;
1018
1019        rcu_read_lock();
1020        tfile = rcu_dereference(tun->tfiles[txq]);
1021
1022        /* Drop packet if interface is not attached */
1023        if (!tfile)
1024                goto drop;
1025
1026        if (!rcu_dereference(tun->steering_prog))
1027                tun_automq_xmit(tun, skb);
1028
1029        netif_info(tun, tx_queued, tun->dev, "%s %d\n", __func__, skb->len);
1030
1031        /* Drop if the filter does not like it.
1032         * This is a noop if the filter is disabled.
 1033         * The filter can be enabled only for TAP devices. */
1034        if (!check_filter(&tun->txflt, skb))
1035                goto drop;
1036
1037        if (tfile->socket.sk->sk_filter &&
1038            sk_filter(tfile->socket.sk, skb))
1039                goto drop;
1040
1041        len = run_ebpf_filter(tun, skb, len);
1042        if (len == 0 || pskb_trim(skb, len))
1043                goto drop;
1044
1045        if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
1046                goto drop;
1047
1048        skb_tx_timestamp(skb);
1049
1050        /* Orphan the skb - required as we might hang on to it
 1051         * for an indefinite time.
1052         */
1053        skb_orphan(skb);
1054
1055        nf_reset_ct(skb);
1056
1057        if (ptr_ring_produce(&tfile->tx_ring, skb))
1058                goto drop;
1059
1060        /* Notify and wake up reader process */
1061        if (tfile->flags & TUN_FASYNC)
1062                kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
1063        tfile->socket.sk->sk_data_ready(tfile->socket.sk);
1064
1065        rcu_read_unlock();
1066        return NETDEV_TX_OK;
1067
1068drop:
1069        this_cpu_inc(tun->pcpu_stats->tx_dropped);
1070        skb_tx_error(skb);
1071        kfree_skb(skb);
1072        rcu_read_unlock();
1073        return NET_XMIT_DROP;
1074}
1075
1076static void tun_net_mclist(struct net_device *dev)
1077{
1078        /*
1079         * This callback is supposed to deal with mc filter in
1080         * _rx_ path and has nothing to do with the _tx_ path.
1081         * In rx path we always accept everything userspace gives us.
1082         */
1083}
1084
1085static netdev_features_t tun_net_fix_features(struct net_device *dev,
1086        netdev_features_t features)
1087{
1088        struct tun_struct *tun = netdev_priv(dev);
1089
1090        return (features & tun->set_features) | (features & ~TUN_USER_FEATURES);
1091}
1092
1093static void tun_set_headroom(struct net_device *dev, int new_hr)
1094{
1095        struct tun_struct *tun = netdev_priv(dev);
1096
1097        if (new_hr < NET_SKB_PAD)
1098                new_hr = NET_SKB_PAD;
1099
1100        tun->align = new_hr;
1101}
1102
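/* Aggregate the per-cpu counters into the rtnl_link_stats64 the core asks for.
 * The 64-bit packet/byte counters are read under the u64_stats seqcount retry
 * loop; the u32 drop/error counters are summed without synchronization.
 */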
1103static void
1104tun_net_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
1105{
1106        u32 rx_dropped = 0, tx_dropped = 0, rx_frame_errors = 0;
1107        struct tun_struct *tun = netdev_priv(dev);
1108        struct tun_pcpu_stats *p;
1109        int i;
1110
1111        for_each_possible_cpu(i) {
1112                u64 rxpackets, rxbytes, txpackets, txbytes;
1113                unsigned int start;
1114
1115                p = per_cpu_ptr(tun->pcpu_stats, i);
1116                do {
1117                        start = u64_stats_fetch_begin(&p->syncp);
1118                        rxpackets       = u64_stats_read(&p->rx_packets);
1119                        rxbytes         = u64_stats_read(&p->rx_bytes);
1120                        txpackets       = u64_stats_read(&p->tx_packets);
1121                        txbytes         = u64_stats_read(&p->tx_bytes);
1122                } while (u64_stats_fetch_retry(&p->syncp, start));
1123
1124                stats->rx_packets       += rxpackets;
1125                stats->rx_bytes         += rxbytes;
1126                stats->tx_packets       += txpackets;
1127                stats->tx_bytes         += txbytes;
1128
1129                /* u32 counters */
1130                rx_dropped      += p->rx_dropped;
1131                rx_frame_errors += p->rx_frame_errors;
1132                tx_dropped      += p->tx_dropped;
1133        }
1134        stats->rx_dropped  = rx_dropped;
1135        stats->rx_frame_errors = rx_frame_errors;
1136        stats->tx_dropped = tx_dropped;
1137}
1138
1139static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog,
1140                       struct netlink_ext_ack *extack)
1141{
1142        struct tun_struct *tun = netdev_priv(dev);
1143        struct tun_file *tfile;
1144        struct bpf_prog *old_prog;
1145        int i;
1146
1147        old_prog = rtnl_dereference(tun->xdp_prog);
1148        rcu_assign_pointer(tun->xdp_prog, prog);
1149        if (old_prog)
1150                bpf_prog_put(old_prog);
1151
1152        for (i = 0; i < tun->numqueues; i++) {
1153                tfile = rtnl_dereference(tun->tfiles[i]);
1154                if (prog)
1155                        sock_set_flag(&tfile->sk, SOCK_XDP);
1156                else
1157                        sock_reset_flag(&tfile->sk, SOCK_XDP);
1158        }
1159        list_for_each_entry(tfile, &tun->disabled, next) {
1160                if (prog)
1161                        sock_set_flag(&tfile->sk, SOCK_XDP);
1162                else
1163                        sock_reset_flag(&tfile->sk, SOCK_XDP);
1164        }
1165
1166        return 0;
1167}
1168
1169static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp)
1170{
1171        switch (xdp->command) {
1172        case XDP_SETUP_PROG:
1173                return tun_xdp_set(dev, xdp->prog, xdp->extack);
1174        default:
1175                return -EINVAL;
1176        }
1177}
1178
1179static int tun_net_change_carrier(struct net_device *dev, bool new_carrier)
1180{
1181        if (new_carrier) {
1182                struct tun_struct *tun = netdev_priv(dev);
1183
1184                if (!tun->numqueues)
1185                        return -EPERM;
1186
1187                netif_carrier_on(dev);
1188        } else {
1189                netif_carrier_off(dev);
1190        }
1191        return 0;
1192}
1193
1194static const struct net_device_ops tun_netdev_ops = {
1195        .ndo_uninit             = tun_net_uninit,
1196        .ndo_open               = tun_net_open,
1197        .ndo_stop               = tun_net_close,
1198        .ndo_start_xmit         = tun_net_xmit,
1199        .ndo_fix_features       = tun_net_fix_features,
1200        .ndo_select_queue       = tun_select_queue,
1201        .ndo_set_rx_headroom    = tun_set_headroom,
1202        .ndo_get_stats64        = tun_net_get_stats64,
1203        .ndo_change_carrier     = tun_net_change_carrier,
1204};
1205
1206static void __tun_xdp_flush_tfile(struct tun_file *tfile)
1207{
1208        /* Notify and wake up reader process */
1209        if (tfile->flags & TUN_FASYNC)
1210                kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
1211        tfile->socket.sk->sk_data_ready(tfile->socket.sk);
1212}
1213
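/* ndo_xdp_xmit: batch-queue XDP frames onto one queue's tx_ring (picked by
 * smp_processor_id() % numqueues) under the ring's producer lock. Frames that
 * do not fit are returned with xdp_return_frame_rx_napi() and counted as
 * tx_dropped; the function returns how many frames were actually queued.
 */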
1214static int tun_xdp_xmit(struct net_device *dev, int n,
1215                        struct xdp_frame **frames, u32 flags)
1216{
1217        struct tun_struct *tun = netdev_priv(dev);
1218        struct tun_file *tfile;
1219        u32 numqueues;
1220        int drops = 0;
1221        int cnt = n;
1222        int i;
1223
1224        if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
1225                return -EINVAL;
1226
1227        rcu_read_lock();
1228
1229resample:
1230        numqueues = READ_ONCE(tun->numqueues);
1231        if (!numqueues) {
1232                rcu_read_unlock();
1233                return -ENXIO; /* Caller will free/return all frames */
1234        }
1235
1236        tfile = rcu_dereference(tun->tfiles[smp_processor_id() %
1237                                            numqueues]);
1238        if (unlikely(!tfile))
1239                goto resample;
1240
1241        spin_lock(&tfile->tx_ring.producer_lock);
1242        for (i = 0; i < n; i++) {
1243                struct xdp_frame *xdp = frames[i];
 1244                /* Encode the XDP flag into the lowest bit so the consumer can
 1245                 * tell an XDP frame from an sk_buff.
1246                 */
1247                void *frame = tun_xdp_to_ptr(xdp);
1248
1249                if (__ptr_ring_produce(&tfile->tx_ring, frame)) {
1250                        this_cpu_inc(tun->pcpu_stats->tx_dropped);
1251                        xdp_return_frame_rx_napi(xdp);
1252                        drops++;
1253                }
1254        }
1255        spin_unlock(&tfile->tx_ring.producer_lock);
1256
1257        if (flags & XDP_XMIT_FLUSH)
1258                __tun_xdp_flush_tfile(tfile);
1259
1260        rcu_read_unlock();
1261        return cnt - drops;
1262}
1263
1264static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
1265{
1266        struct xdp_frame *frame = xdp_convert_buff_to_frame(xdp);
1267
1268        if (unlikely(!frame))
1269                return -EOVERFLOW;
1270
1271        return tun_xdp_xmit(dev, 1, &frame, XDP_XMIT_FLUSH);
1272}
1273
1274static const struct net_device_ops tap_netdev_ops = {
1275        .ndo_uninit             = tun_net_uninit,
1276        .ndo_open               = tun_net_open,
1277        .ndo_stop               = tun_net_close,
1278        .ndo_start_xmit         = tun_net_xmit,
1279        .ndo_fix_features       = tun_net_fix_features,
1280        .ndo_set_rx_mode        = tun_net_mclist,
1281        .ndo_set_mac_address    = eth_mac_addr,
1282        .ndo_validate_addr      = eth_validate_addr,
1283        .ndo_select_queue       = tun_select_queue,
1284        .ndo_features_check     = passthru_features_check,
1285        .ndo_set_rx_headroom    = tun_set_headroom,
1286        .ndo_get_stats64        = tun_net_get_stats64,
1287        .ndo_bpf                = tun_xdp,
1288        .ndo_xdp_xmit           = tun_xdp_xmit,
1289        .ndo_change_carrier     = tun_net_change_carrier,
1290};
1291
1292static void tun_flow_init(struct tun_struct *tun)
1293{
1294        int i;
1295
1296        for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++)
1297                INIT_HLIST_HEAD(&tun->flows[i]);
1298
1299        tun->ageing_time = TUN_FLOW_EXPIRE;
1300        timer_setup(&tun->flow_gc_timer, tun_flow_cleanup, 0);
1301        mod_timer(&tun->flow_gc_timer,
1302                  round_jiffies_up(jiffies + tun->ageing_time));
1303}
1304
1305static void tun_flow_uninit(struct tun_struct *tun)
1306{
1307        del_timer_sync(&tun->flow_gc_timer);
1308        tun_flow_flush(tun);
1309}
1310
1311#define MIN_MTU 68
1312#define MAX_MTU 65535
1313
1314/* Initialize net device. */
1315static void tun_net_init(struct net_device *dev)
1316{
1317        struct tun_struct *tun = netdev_priv(dev);
1318
1319        switch (tun->flags & TUN_TYPE_MASK) {
1320        case IFF_TUN:
1321                dev->netdev_ops = &tun_netdev_ops;
1322                dev->header_ops = &ip_tunnel_header_ops;
1323
1324                /* Point-to-Point TUN Device */
1325                dev->hard_header_len = 0;
1326                dev->addr_len = 0;
1327                dev->mtu = 1500;
1328
1329                /* Zero header length */
1330                dev->type = ARPHRD_NONE;
1331                dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
1332                break;
1333
1334        case IFF_TAP:
1335                dev->netdev_ops = &tap_netdev_ops;
1336                /* Ethernet TAP Device */
1337                ether_setup(dev);
1338                dev->priv_flags &= ~IFF_TX_SKB_SHARING;
1339                dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
1340
1341                eth_hw_addr_random(dev);
1342
1343                break;
1344        }
1345
1346        dev->min_mtu = MIN_MTU;
1347        dev->max_mtu = MAX_MTU - dev->hard_header_len;
1348}
1349
1350static bool tun_sock_writeable(struct tun_struct *tun, struct tun_file *tfile)
1351{
1352        struct sock *sk = tfile->socket.sk;
1353
1354        return (tun->dev->flags & IFF_UP) && sock_writeable(sk);
1355}
1356
1357/* Character device part */
1358
1359/* Poll */
1360static __poll_t tun_chr_poll(struct file *file, poll_table *wait)
1361{
1362        struct tun_file *tfile = file->private_data;
1363        struct tun_struct *tun = tun_get(tfile);
1364        struct sock *sk;
1365        __poll_t mask = 0;
1366
1367        if (!tun)
1368                return EPOLLERR;
1369
1370        sk = tfile->socket.sk;
1371
1372        poll_wait(file, sk_sleep(sk), wait);
1373
1374        if (!ptr_ring_empty(&tfile->tx_ring))
1375                mask |= EPOLLIN | EPOLLRDNORM;
1376
1377        /* Make sure SOCKWQ_ASYNC_NOSPACE is set if not writable to
 1378         * guarantee that EPOLLOUT will be raised either here or by
 1379         * tun_sock_write_space(). Then the process can get a notification
 1380         * after it writes to a down device and meets -EIO.
1381         */
1382        if (tun_sock_writeable(tun, tfile) ||
1383            (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags) &&
1384             tun_sock_writeable(tun, tfile)))
1385                mask |= EPOLLOUT | EPOLLWRNORM;
1386
1387        if (tun->dev->reg_state != NETREG_REGISTERED)
1388                mask = EPOLLERR;
1389
1390        tun_put(tun);
1391        return mask;
1392}
1393
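/* Build an skb for IFF_NAPI_FRAGS mode: the skb comes from napi_get_frags(),
 * its linear part is grown to the size of the first iov segment, and each
 * remaining segment gets a page fragment of matching size. The payload itself
 * is copied in later by the caller.
 */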
1394static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile,
1395                                            size_t len,
1396                                            const struct iov_iter *it)
1397{
1398        struct sk_buff *skb;
1399        size_t linear;
1400        int err;
1401        int i;
1402
1403        if (it->nr_segs > MAX_SKB_FRAGS + 1)
1404                return ERR_PTR(-ENOMEM);
1405
1406        local_bh_disable();
1407        skb = napi_get_frags(&tfile->napi);
1408        local_bh_enable();
1409        if (!skb)
1410                return ERR_PTR(-ENOMEM);
1411
1412        linear = iov_iter_single_seg_count(it);
1413        err = __skb_grow(skb, linear);
1414        if (err)
1415                goto free;
1416
1417        skb->len = len;
1418        skb->data_len = len - linear;
1419        skb->truesize += skb->data_len;
1420
1421        for (i = 1; i < it->nr_segs; i++) {
1422                size_t fragsz = it->iov[i].iov_len;
1423                struct page *page;
1424                void *frag;
1425
1426                if (fragsz == 0 || fragsz > PAGE_SIZE) {
1427                        err = -EINVAL;
1428                        goto free;
1429                }
1430                frag = netdev_alloc_frag(fragsz);
1431                if (!frag) {
1432                        err = -ENOMEM;
1433                        goto free;
1434                }
1435                page = virt_to_head_page(frag);
1436                skb_fill_page_desc(skb, i - 1, page,
1437                                   frag - page_address(page), fragsz);
1438        }
1439
1440        return skb;
1441free:
1442        /* frees skb and all frags allocated with napi_alloc_frag() */
1443        napi_free_frags(&tfile->napi);
1444        return ERR_PTR(err);
1445}
1446
1447/* prepad is the amount to reserve at front.  len is length after that.
1448 * linear is a hint as to how much to copy (usually headers). */
1449static struct sk_buff *tun_alloc_skb(struct tun_file *tfile,
1450                                     size_t prepad, size_t len,
1451                                     size_t linear, int noblock)
1452{
1453        struct sock *sk = tfile->socket.sk;
1454        struct sk_buff *skb;
1455        int err;
1456
1457        /* Under a page?  Don't bother with paged skb. */
1458        if (prepad + len < PAGE_SIZE || !linear)
1459                linear = len;
1460
1461        skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
1462                                   &err, 0);
1463        if (!skb)
1464                return ERR_PTR(err);
1465
1466        skb_reserve(skb, prepad);
1467        skb_put(skb, linear);
1468        skb->data_len = len - linear;
1469        skb->len += len - linear;
1470
1471        return skb;
1472}
1473
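/* Receive-side batching: when rx_batched is enabled, skbs are parked on
 * sk_write_queue and only pushed to netif_receive_skb() once the batch is
 * full or the sender signals that no more data is coming ('more' is false).
 */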
1474static void tun_rx_batched(struct tun_struct *tun, struct tun_file *tfile,
1475                           struct sk_buff *skb, int more)
1476{
1477        struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
1478        struct sk_buff_head process_queue;
1479        u32 rx_batched = tun->rx_batched;
1480        bool rcv = false;
1481
1482        if (!rx_batched || (!more && skb_queue_empty(queue))) {
1483                local_bh_disable();
1484                skb_record_rx_queue(skb, tfile->queue_index);
1485                netif_receive_skb(skb);
1486                local_bh_enable();
1487                return;
1488        }
1489
1490        spin_lock(&queue->lock);
1491        if (!more || skb_queue_len(queue) == rx_batched) {
1492                __skb_queue_head_init(&process_queue);
1493                skb_queue_splice_tail_init(queue, &process_queue);
1494                rcv = true;
1495        } else {
1496                __skb_queue_tail(queue, skb);
1497        }
1498        spin_unlock(&queue->lock);
1499
1500        if (rcv) {
1501                struct sk_buff *nskb;
1502
1503                local_bh_disable();
1504                while ((nskb = __skb_dequeue(&process_queue))) {
1505                        skb_record_rx_queue(nskb, tfile->queue_index);
1506                        netif_receive_skb(nskb);
1507                }
1508                skb_record_rx_queue(skb, tfile->queue_index);
1509                netif_receive_skb(skb);
1510                local_bh_enable();
1511        }
1512}
1513
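/* The build_skb() fast path is only used for TAP devices when the write is
 * non-blocking, not zerocopy, the default sndbuf (INT_MAX) is in effect and
 * the frame plus padding and skb_shared_info still fits into a single page.
 */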
1514static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile,
1515                              int len, int noblock, bool zerocopy)
1516{
1517        if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
1518                return false;
1519
1520        if (tfile->socket.sk->sk_sndbuf != INT_MAX)
1521                return false;
1522
1523        if (!noblock)
1524                return false;
1525
1526        if (zerocopy)
1527                return false;
1528
1529        if (SKB_DATA_ALIGN(len + TUN_RX_PAD) +
1530            SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE)
1531                return false;
1532
1533        return true;
1534}
1535
1536static struct sk_buff *__tun_build_skb(struct tun_file *tfile,
1537                                       struct page_frag *alloc_frag, char *buf,
1538                                       int buflen, int len, int pad)
1539{
1540        struct sk_buff *skb = build_skb(buf, buflen);
1541
1542        if (!skb)
1543                return ERR_PTR(-ENOMEM);
1544
1545        skb_reserve(skb, pad);
1546        skb_put(skb, len);
1547        skb_set_owner_w(skb, tfile->socket.sk);
1548
1549        get_page(alloc_frag->page);
1550        alloc_frag->offset += buflen;
1551
1552        return skb;
1553}
1554
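/* Common handling of an XDP verdict: XDP_REDIRECT and XDP_TX forward the
 * buffer, XDP_PASS falls through to the normal skb path, and anything else is
 * dropped and accounted. Returns the action taken, or a negative errno if
 * forwarding failed.
 */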
1555static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog,
1556                       struct xdp_buff *xdp, u32 act)
1557{
1558        int err;
1559
1560        switch (act) {
1561        case XDP_REDIRECT:
1562                err = xdp_do_redirect(tun->dev, xdp, xdp_prog);
1563                if (err)
1564                        return err;
1565                break;
1566        case XDP_TX:
1567                err = tun_xdp_tx(tun->dev, xdp);
1568                if (err < 0)
1569                        return err;
1570                break;
1571        case XDP_PASS:
1572                break;
1573        default:
1574                bpf_warn_invalid_xdp_action(act);
1575                fallthrough;
1576        case XDP_ABORTED:
1577                trace_xdp_exception(tun->dev, xdp_prog, act);
1578                fallthrough;
1579        case XDP_DROP:
1580                this_cpu_inc(tun->pcpu_stats->rx_dropped);
1581                break;
1582        }
1583
1584        return act;
1585}
1586
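/* Fast receive path: copy the user buffer into a page fragment taken from
 * current->task_frag, optionally run the XDP program on it in place, and wrap
 * the result with build_skb(). *skb_xdp tells the caller whether XDP still
 * needs to be run on the resulting skb.
 */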
1587static struct sk_buff *tun_build_skb(struct tun_struct *tun,
1588                                     struct tun_file *tfile,
1589                                     struct iov_iter *from,
1590                                     struct virtio_net_hdr *hdr,
1591                                     int len, int *skb_xdp)
1592{
1593        struct page_frag *alloc_frag = &current->task_frag;
1594        struct bpf_prog *xdp_prog;
1595        int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
1596        char *buf;
1597        size_t copied;
1598        int pad = TUN_RX_PAD;
1599        int err = 0;
1600
1601        rcu_read_lock();
1602        xdp_prog = rcu_dereference(tun->xdp_prog);
1603        if (xdp_prog)
1604                pad += XDP_PACKET_HEADROOM;
1605        buflen += SKB_DATA_ALIGN(len + pad);
1606        rcu_read_unlock();
1607
1608        alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
1609        if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL)))
1610                return ERR_PTR(-ENOMEM);
1611
1612        buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
1613        copied = copy_page_from_iter(alloc_frag->page,
1614                                     alloc_frag->offset + pad,
1615                                     len, from);
1616        if (copied != len)
1617                return ERR_PTR(-EFAULT);
1618
 1619        /* There's a small window in which an XDP program may be attached after
 1620         * the check of xdp_prog above; this should be rare, and for simplicity
 1621         * we do XDP on the skb in case the headroom is not enough.
1622         */
1623        if (hdr->gso_type || !xdp_prog) {
1624                *skb_xdp = 1;
1625                return __tun_build_skb(tfile, alloc_frag, buf, buflen, len,
1626                                       pad);
1627        }
1628
1629        *skb_xdp = 0;
1630
1631        local_bh_disable();
1632        rcu_read_lock();
1633        xdp_prog = rcu_dereference(tun->xdp_prog);
1634        if (xdp_prog) {
1635                struct xdp_buff xdp;
1636                u32 act;
1637
1638                xdp.data_hard_start = buf;
1639                xdp.data = buf + pad;
1640                xdp_set_data_meta_invalid(&xdp);
1641                xdp.data_end = xdp.data + len;
1642                xdp.rxq = &tfile->xdp_rxq;
1643                xdp.frame_sz = buflen;
1644
1645                act = bpf_prog_run_xdp(xdp_prog, &xdp);
1646                if (act == XDP_REDIRECT || act == XDP_TX) {
1647                        get_page(alloc_frag->page);
1648                        alloc_frag->offset += buflen;
1649                }
1650                err = tun_xdp_act(tun, xdp_prog, &xdp, act);
1651                if (err < 0) {
1652                        if (act == XDP_REDIRECT || act == XDP_TX)
1653                                put_page(alloc_frag->page);
1654                        goto out;
1655                }
1656
1657                if (err == XDP_REDIRECT)
1658                        xdp_do_flush();
1659                if (err != XDP_PASS)
1660                        goto out;
1661
1662                pad = xdp.data - xdp.data_hard_start;
1663                len = xdp.data_end - xdp.data;
1664        }
1665        rcu_read_unlock();
1666        local_bh_enable();
1667
1668        return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, pad);
1669
1670out:
1671        rcu_read_unlock();
1672        local_bh_enable();
1673        return NULL;
1674}
1675
1676/* Get packet from user space buffer */
1677static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
1678                            void *msg_control, struct iov_iter *from,
1679                            int noblock, bool more)
1680{
1681        struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) };
1682        struct sk_buff *skb;
1683        size_t total_len = iov_iter_count(from);
1684        size_t len = total_len, align = tun->align, linear;
1685        struct virtio_net_hdr gso = { 0 };
1686        struct tun_pcpu_stats *stats;
1687        int good_linear;
1688        int copylen;
1689        bool zerocopy = false;
1690        int err;
1691        u32 rxhash = 0;
1692        int skb_xdp = 1;
1693        bool frags = tun_napi_frags_enabled(tfile);
1694
1695        if (!(tun->flags & IFF_NO_PI)) {
1696                if (len < sizeof(pi))
1697                        return -EINVAL;
1698                len -= sizeof(pi);
1699
1700                if (!copy_from_iter_full(&pi, sizeof(pi), from))
1701                        return -EFAULT;
1702        }
1703
1704        if (tun->flags & IFF_VNET_HDR) {
1705                int vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
1706
1707                if (len < vnet_hdr_sz)
1708                        return -EINVAL;
1709                len -= vnet_hdr_sz;
1710
1711                if (!copy_from_iter_full(&gso, sizeof(gso), from))
1712                        return -EFAULT;
1713
1714                if ((gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
1715                    tun16_to_cpu(tun, gso.csum_start) + tun16_to_cpu(tun, gso.csum_offset) + 2 > tun16_to_cpu(tun, gso.hdr_len))
1716                        gso.hdr_len = cpu_to_tun16(tun, tun16_to_cpu(tun, gso.csum_start) + tun16_to_cpu(tun, gso.csum_offset) + 2);
1717
1718                if (tun16_to_cpu(tun, gso.hdr_len) > len)
1719                        return -EINVAL;
1720                iov_iter_advance(from, vnet_hdr_sz - sizeof(gso));
1721        }
1722
1723        if ((tun->flags & TUN_TYPE_MASK) == IFF_TAP) {
1724                align += NET_IP_ALIGN;
1725                if (unlikely(len < ETH_HLEN ||
1726                             (gso.hdr_len && tun16_to_cpu(tun, gso.hdr_len) < ETH_HLEN)))
1727                        return -EINVAL;
1728        }
1729
1730        good_linear = SKB_MAX_HEAD(align);
1731
1732        if (msg_control) {
1733                struct iov_iter i = *from;
1734
1735                /* There are 256 bytes to be copied into the skb, so there is
1736                 * enough room to expand the skb head in case that is needed.
1737                 * The rest of the buffer is mapped from userspace.
1738                 */
1739                copylen = gso.hdr_len ? tun16_to_cpu(tun, gso.hdr_len) : GOODCOPY_LEN;
1740                if (copylen > good_linear)
1741                        copylen = good_linear;
1742                linear = copylen;
1743                iov_iter_advance(&i, copylen);
1744                if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS)
1745                        zerocopy = true;
1746        }
1747
1748        if (!frags && tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
1749                /* For packets that are not easy to process here
1750                 * (e.g. GSO or jumbo packets), we do it after the skb
1751                 * has been created, using the generic XDP routine.
1752                 */
1753                skb = tun_build_skb(tun, tfile, from, &gso, len, &skb_xdp);
1754                if (IS_ERR(skb)) {
1755                        this_cpu_inc(tun->pcpu_stats->rx_dropped);
1756                        return PTR_ERR(skb);
1757                }
1758                if (!skb)
1759                        return total_len;
1760        } else {
1761                if (!zerocopy) {
1762                        copylen = len;
1763                        if (tun16_to_cpu(tun, gso.hdr_len) > good_linear)
1764                                linear = good_linear;
1765                        else
1766                                linear = tun16_to_cpu(tun, gso.hdr_len);
1767                }
1768
1769                if (frags) {
1770                        mutex_lock(&tfile->napi_mutex);
1771                        skb = tun_napi_alloc_frags(tfile, copylen, from);
1772                        /* tun_napi_alloc_frags() enforces a layout for the skb.
1773                         * If zerocopy is enabled, then this layout will be
1774                         * overwritten by zerocopy_sg_from_iter().
1775                         */
1776                        zerocopy = false;
1777                } else {
1778                        skb = tun_alloc_skb(tfile, align, copylen, linear,
1779                                            noblock);
1780                }
1781
1782                if (IS_ERR(skb)) {
1783                        if (PTR_ERR(skb) != -EAGAIN)
1784                                this_cpu_inc(tun->pcpu_stats->rx_dropped);
1785                        if (frags)
1786                                mutex_unlock(&tfile->napi_mutex);
1787                        return PTR_ERR(skb);
1788                }
1789
1790                if (zerocopy)
1791                        err = zerocopy_sg_from_iter(skb, from);
1792                else
1793                        err = skb_copy_datagram_from_iter(skb, 0, from, len);
1794
1795                if (err) {
1796                        err = -EFAULT;
1797drop:
1798                        this_cpu_inc(tun->pcpu_stats->rx_dropped);
1799                        kfree_skb(skb);
1800                        if (frags) {
1801                                tfile->napi.skb = NULL;
1802                                mutex_unlock(&tfile->napi_mutex);
1803                        }
1804
1805                        return err;
1806                }
1807        }
1808
1809        if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun))) {
1810                this_cpu_inc(tun->pcpu_stats->rx_frame_errors);
1811                kfree_skb(skb);
1812                if (frags) {
1813                        tfile->napi.skb = NULL;
1814                        mutex_unlock(&tfile->napi_mutex);
1815                }
1816
1817                return -EINVAL;
1818        }
1819
1820        switch (tun->flags & TUN_TYPE_MASK) {
1821        case IFF_TUN:
1822                if (tun->flags & IFF_NO_PI) {
1823                        u8 ip_version = skb->len ? (skb->data[0] >> 4) : 0;
1824
1825                        switch (ip_version) {
1826                        case 4:
1827                                pi.proto = htons(ETH_P_IP);
1828                                break;
1829                        case 6:
1830                                pi.proto = htons(ETH_P_IPV6);
1831                                break;
1832                        default:
1833                                this_cpu_inc(tun->pcpu_stats->rx_dropped);
1834                                kfree_skb(skb);
1835                                return -EINVAL;
1836                        }
1837                }
1838
1839                skb_reset_mac_header(skb);
1840                skb->protocol = pi.proto;
1841                skb->dev = tun->dev;
1842                break;
1843        case IFF_TAP:
1844                if (frags && !pskb_may_pull(skb, ETH_HLEN)) {
1845                        err = -ENOMEM;
1846                        goto drop;
1847                }
1848                skb->protocol = eth_type_trans(skb, tun->dev);
1849                break;
1850        }
1851
1852        /* copy skb_ubuf_info for callback when skb has no error */
1853        if (zerocopy) {
1854                skb_shinfo(skb)->destructor_arg = msg_control;
1855                skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
1856                skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
1857        } else if (msg_control) {
1858                struct ubuf_info *uarg = msg_control;
1859                uarg->callback(uarg, false);
1860        }
1861
1862        skb_reset_network_header(skb);
1863        skb_probe_transport_header(skb);
1864        skb_record_rx_queue(skb, tfile->queue_index);
1865
1866        if (skb_xdp) {
1867                struct bpf_prog *xdp_prog;
1868                int ret;
1869
1870                local_bh_disable();
1871                rcu_read_lock();
1872                xdp_prog = rcu_dereference(tun->xdp_prog);
1873                if (xdp_prog) {
1874                        ret = do_xdp_generic(xdp_prog, skb);
1875                        if (ret != XDP_PASS) {
1876                                rcu_read_unlock();
1877                                local_bh_enable();
1878                                if (frags) {
1879                                        tfile->napi.skb = NULL;
1880                                        mutex_unlock(&tfile->napi_mutex);
1881                                }
1882                                return total_len;
1883                        }
1884                }
1885                rcu_read_unlock();
1886                local_bh_enable();
1887        }
1888
1889        /* Compute the costly rx hash only if needed for flow updates.
1890         * There is a very small possibility of out-of-order delivery while
1891         * switching, which is not worth optimizing for.
1892         */
1893        if (!rcu_access_pointer(tun->steering_prog) && tun->numqueues > 1 &&
1894            !tfile->detached)
1895                rxhash = __skb_get_hash_symmetric(skb);
1896
1897        rcu_read_lock();
1898        if (unlikely(!(tun->dev->flags & IFF_UP))) {
1899                err = -EIO;
1900                rcu_read_unlock();
1901                goto drop;
1902        }
1903
1904        if (frags) {
1905                u32 headlen;
1906
1907                /* Exercise flow dissector code path. */
1908                skb_push(skb, ETH_HLEN);
1909                headlen = eth_get_headlen(tun->dev, skb->data,
1910                                          skb_headlen(skb));
1911
1912                if (unlikely(headlen > skb_headlen(skb))) {
1913                        this_cpu_inc(tun->pcpu_stats->rx_dropped);
1914                        napi_free_frags(&tfile->napi);
1915                        rcu_read_unlock();
1916                        mutex_unlock(&tfile->napi_mutex);
1917                        WARN_ON(1);
1918                        return -ENOMEM;
1919                }
1920
1921                local_bh_disable();
1922                napi_gro_frags(&tfile->napi);
1923                local_bh_enable();
1924                mutex_unlock(&tfile->napi_mutex);
1925        } else if (tfile->napi_enabled) {
1926                struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
1927                int queue_len;
1928
1929                spin_lock_bh(&queue->lock);
1930                __skb_queue_tail(queue, skb);
1931                queue_len = skb_queue_len(queue);
1932                spin_unlock(&queue->lock);
1933
1934                if (!more || queue_len > NAPI_POLL_WEIGHT)
1935                        napi_schedule(&tfile->napi);
1936
1937                local_bh_enable();
1938        } else if (!IS_ENABLED(CONFIG_4KSTACKS)) {
1939                tun_rx_batched(tun, tfile, skb, more);
1940        } else {
1941                netif_rx_ni(skb);
1942        }
1943        rcu_read_unlock();
1944
1945        stats = get_cpu_ptr(tun->pcpu_stats);
1946        u64_stats_update_begin(&stats->syncp);
1947        u64_stats_inc(&stats->rx_packets);
1948        u64_stats_add(&stats->rx_bytes, len);
1949        u64_stats_update_end(&stats->syncp);
1950        put_cpu_ptr(stats);
1951
1952        if (rxhash)
1953                tun_flow_update(tun, rxhash, tfile);
1954
1955        return total_len;
1956}
1957
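/* write(2)/writev(2) on the tun fd lands here: the iov is handed to
 * tun_get_user(), which injects the packet into the kernel network stack as
 * if it had been received on the tun/tap interface.  Both O_NONBLOCK on the
 * file and IOCB_NOWAIT on the iocb select the non-blocking path.
 */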
1958static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from)
1959{
1960        struct file *file = iocb->ki_filp;
1961        struct tun_file *tfile = file->private_data;
1962        struct tun_struct *tun = tun_get(tfile);
1963        ssize_t result;
1964        int noblock = 0;
1965
1966        if (!tun)
1967                return -EBADFD;
1968
1969        if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT))
1970                noblock = 1;
1971
1972        result = tun_get_user(tun, tfile, NULL, from, noblock, false);
1973
1974        tun_put(tun);
1975        return result;
1976}
1977
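/* Copy one XDP frame to the user-supplied iov.  If IFF_VNET_HDR is set, a
 * zeroed struct virtio_net_hdr is written first and the iterator is
 * advanced to the configured vnet_hdr_sz; tx stats are updated with the
 * number of bytes copied.
 */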
1978static ssize_t tun_put_user_xdp(struct tun_struct *tun,
1979                                struct tun_file *tfile,
1980                                struct xdp_frame *xdp_frame,
1981                                struct iov_iter *iter)
1982{
1983        int vnet_hdr_sz = 0;
1984        size_t size = xdp_frame->len;
1985        struct tun_pcpu_stats *stats;
1986        size_t ret;
1987
1988        if (tun->flags & IFF_VNET_HDR) {
1989                struct virtio_net_hdr gso = { 0 };
1990
1991                vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
1992                if (unlikely(iov_iter_count(iter) < vnet_hdr_sz))
1993                        return -EINVAL;
1994                if (unlikely(copy_to_iter(&gso, sizeof(gso), iter) !=
1995                             sizeof(gso)))
1996                        return -EFAULT;
1997                iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
1998        }
1999
2000        ret = copy_to_iter(xdp_frame->data, size, iter) + vnet_hdr_sz;
2001
2002        stats = get_cpu_ptr(tun->pcpu_stats);
2003        u64_stats_update_begin(&stats->syncp);
2004        u64_stats_inc(&stats->tx_packets);
2005        u64_stats_add(&stats->tx_bytes, ret);
2006        u64_stats_update_end(&stats->syncp);
2007        put_cpu_ptr(tun->pcpu_stats);
2008
2009        return ret;
2010}
2011
2012/* Put packet to the user space buffer */
2013static ssize_t tun_put_user(struct tun_struct *tun,
2014                            struct tun_file *tfile,
2015                            struct sk_buff *skb,
2016                            struct iov_iter *iter)
2017{
2018        struct tun_pi pi = { 0, skb->protocol };
2019        struct tun_pcpu_stats *stats;
2020        ssize_t total;
2021        int vlan_offset = 0;
2022        int vlan_hlen = 0;
2023        int vnet_hdr_sz = 0;
2024
2025        if (skb_vlan_tag_present(skb))
2026                vlan_hlen = VLAN_HLEN;
2027
2028        if (tun->flags & IFF_VNET_HDR)
2029                vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
2030
2031        total = skb->len + vlan_hlen + vnet_hdr_sz;
2032
2033        if (!(tun->flags & IFF_NO_PI)) {
2034                if (iov_iter_count(iter) < sizeof(pi))
2035                        return -EINVAL;
2036
2037                total += sizeof(pi);
2038                if (iov_iter_count(iter) < total) {
2039                        /* Packet will be striped */
2040                        pi.flags |= TUN_PKT_STRIP;
2041                }
2042
2043                if (copy_to_iter(&pi, sizeof(pi), iter) != sizeof(pi))
2044                        return -EFAULT;
2045        }
2046
2047        if (vnet_hdr_sz) {
2048                struct virtio_net_hdr gso;
2049
2050                if (iov_iter_count(iter) < vnet_hdr_sz)
2051                        return -EINVAL;
2052
2053                if (virtio_net_hdr_from_skb(skb, &gso,
2054                                            tun_is_little_endian(tun), true,
2055                                            vlan_hlen)) {
2056                        struct skb_shared_info *sinfo = skb_shinfo(skb);
2057
2058                        pr_err("unexpected GSO type: 0x%x, gso_size %d, hdr_len %d\n",
2059                               sinfo->gso_type, tun16_to_cpu(tun, gso.gso_size),
2060                               tun16_to_cpu(tun, gso.hdr_len));
2061                        print_hex_dump(KERN_ERR, "tun: ",
2062                                       DUMP_PREFIX_NONE,
2063                                       16, 1, skb->head,
2064                                       min((int)tun16_to_cpu(tun, gso.hdr_len), 64), true);
2065                        WARN_ON_ONCE(1);
2066                        return -EINVAL;
2067                }
2068
2069                if (copy_to_iter(&gso, sizeof(gso), iter) != sizeof(gso))
2070                        return -EFAULT;
2071
2072                iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
2073        }
2074
2075        if (vlan_hlen) {
2076                int ret;
2077                struct veth veth;
2078
2079                veth.h_vlan_proto = skb->vlan_proto;
2080                veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb));
2081
2082                vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
2083
2084                ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset);
2085                if (ret || !iov_iter_count(iter))
2086                        goto done;
2087
2088                ret = copy_to_iter(&veth, sizeof(veth), iter);
2089                if (ret != sizeof(veth) || !iov_iter_count(iter))
2090                        goto done;
2091        }
2092
2093        skb_copy_datagram_iter(skb, vlan_offset, iter, skb->len - vlan_offset);
2094
2095done:
2096        /* caller is in process context */
2097        stats = get_cpu_ptr(tun->pcpu_stats);
2098        u64_stats_update_begin(&stats->syncp);
2099        u64_stats_inc(&stats->tx_packets);
2100        u64_stats_add(&stats->tx_bytes, skb->len + vlan_hlen);
2101        u64_stats_update_end(&stats->syncp);
2102        put_cpu_ptr(tun->pcpu_stats);
2103
2104        return total;
2105}
2106
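/* Pull one entry off the per-queue tx ptr_ring.  In blocking mode the
 * caller sleeps on the socket wait queue until an entry shows up, a signal
 * arrives (-ERESTARTSYS) or the socket is shut down (-EFAULT); with noblock
 * set, -EAGAIN is returned immediately when the ring is empty.
 */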
2107static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
2108{
2109        DECLARE_WAITQUEUE(wait, current);
2110        void *ptr = NULL;
2111        int error = 0;
2112
2113        ptr = ptr_ring_consume(&tfile->tx_ring);
2114        if (ptr)
2115                goto out;
2116        if (noblock) {
2117                error = -EAGAIN;
2118                goto out;
2119        }
2120
2121        add_wait_queue(&tfile->socket.wq.wait, &wait);
2122
2123        while (1) {
2124                set_current_state(TASK_INTERRUPTIBLE);
2125                ptr = ptr_ring_consume(&tfile->tx_ring);
2126                if (ptr)
2127                        break;
2128                if (signal_pending(current)) {
2129                        error = -ERESTARTSYS;
2130                        break;
2131                }
2132                if (tfile->socket.sk->sk_shutdown & RCV_SHUTDOWN) {
2133                        error = -EFAULT;
2134                        break;
2135                }
2136
2137                schedule();
2138        }
2139
2140        __set_current_state(TASK_RUNNING);
2141        remove_wait_queue(&tfile->socket.wq.wait, &wait);
2142
2143out:
2144        *err = error;
2145        return ptr;
2146}
2147
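/* Entries in the tx ring are tagged pointers: either sk_buffs or xdp_frames,
 * told apart with tun_is_xdp_frame().  tun_do_read() fetches one (from the
 * ring, or a pointer handed in by the caller) and copies it to userspace via
 * tun_put_user_xdp() or tun_put_user() accordingly.
 */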
2148static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
2149                           struct iov_iter *to,
2150                           int noblock, void *ptr)
2151{
2152        ssize_t ret;
2153        int err;
2154
2155        if (!iov_iter_count(to)) {
2156                tun_ptr_free(ptr);
2157                return 0;
2158        }
2159
2160        if (!ptr) {
2161                /* Read frames from ring */
2162                ptr = tun_ring_recv(tfile, noblock, &err);
2163                if (!ptr)
2164                        return err;
2165        }
2166
2167        if (tun_is_xdp_frame(ptr)) {
2168                struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
2169
2170                ret = tun_put_user_xdp(tun, tfile, xdpf, to);
2171                xdp_return_frame(xdpf);
2172        } else {
2173                struct sk_buff *skb = ptr;
2174
2175                ret = tun_put_user(tun, tfile, skb, to);
2176                if (unlikely(ret < 0))
2177                        kfree_skb(skb);
2178                else
2179                        consume_skb(skb);
2180        }
2181
2182        return ret;
2183}
2184
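/* read(2)/readv(2) on the tun fd: one packet (or XDP frame) is dequeued
 * from this queue's tx ring and copied into the iov; the return value is
 * clamped to the space the caller provided.
 */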
2185static ssize_t tun_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
2186{
2187        struct file *file = iocb->ki_filp;
2188        struct tun_file *tfile = file->private_data;
2189        struct tun_struct *tun = tun_get(tfile);
2190        ssize_t len = iov_iter_count(to), ret;
2191        int noblock = 0;
2192
2193        if (!tun)
2194                return -EBADFD;
2195
2196        if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT))
2197                noblock = 1;
2198
2199        ret = tun_do_read(tun, tfile, to, noblock, NULL);
2200        ret = min_t(ssize_t, ret, len);
2201        if (ret > 0)
2202                iocb->ki_pos = ret;
2203        tun_put(tun);
2204        return ret;
2205}
2206
2207static void tun_prog_free(struct rcu_head *rcu)
2208{
2209        struct tun_prog *prog = container_of(rcu, struct tun_prog, rcu);
2210
2211        bpf_prog_destroy(prog->prog);
2212        kfree(prog);
2213}
2214
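/* Swap the steering or filter program pointer under tun->lock.  The old
 * program, if there was one, is released through call_rcu() so concurrent
 * readers holding rcu_read_lock() never see it freed under them.
 */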
2215static int __tun_set_ebpf(struct tun_struct *tun,
2216                          struct tun_prog __rcu **prog_p,
2217                          struct bpf_prog *prog)
2218{
2219        struct tun_prog *old, *new = NULL;
2220
2221        if (prog) {
2222                new = kmalloc(sizeof(*new), GFP_KERNEL);
2223                if (!new)
2224                        return -ENOMEM;
2225                new->prog = prog;
2226        }
2227
2228        spin_lock_bh(&tun->lock);
2229        old = rcu_dereference_protected(*prog_p,
2230                                        lockdep_is_held(&tun->lock));
2231        rcu_assign_pointer(*prog_p, new);
2232        spin_unlock_bh(&tun->lock);
2233
2234        if (old)
2235                call_rcu(&old->rcu, tun_prog_free);
2236
2237        return 0;
2238}
2239
2240static void tun_free_netdev(struct net_device *dev)
2241{
2242        struct tun_struct *tun = netdev_priv(dev);
2243
2244        BUG_ON(!list_empty(&tun->disabled));
2245
2246        free_percpu(tun->pcpu_stats);
2247        /* We clear pcpu_stats so that tun_set_iff() can tell if
2248         * tun_free_netdev() has been called from register_netdevice().
2249         */
2250        tun->pcpu_stats = NULL;
2251
2252        tun_flow_uninit(tun);
2253        security_tun_dev_free_security(tun->security);
2254        __tun_set_ebpf(tun, &tun->steering_prog, NULL);
2255        __tun_set_ebpf(tun, &tun->filter_prog, NULL);
2256}
2257
2258static void tun_setup(struct net_device *dev)
2259{
2260        struct tun_struct *tun = netdev_priv(dev);
2261
2262        tun->owner = INVALID_UID;
2263        tun->group = INVALID_GID;
2264        tun_default_link_ksettings(dev, &tun->link_ksettings);
2265
2266        dev->ethtool_ops = &tun_ethtool_ops;
2267        dev->needs_free_netdev = true;
2268        dev->priv_destructor = tun_free_netdev;
2269        /* We prefer our own queue length */
2270        dev->tx_queue_len = TUN_READQ_SIZE;
2271}
2272
2273/* Trivial set of netlink ops to allow deleting a tun or tap
2274 * device with netlink.
2275 */
2276static int tun_validate(struct nlattr *tb[], struct nlattr *data[],
2277                        struct netlink_ext_ack *extack)
2278{
2279        NL_SET_ERR_MSG(extack,
2280                       "tun/tap creation via rtnetlink is not supported.");
2281        return -EOPNOTSUPP;
2282}
2283
2284static size_t tun_get_size(const struct net_device *dev)
2285{
2286        BUILD_BUG_ON(sizeof(u32) != sizeof(uid_t));
2287        BUILD_BUG_ON(sizeof(u32) != sizeof(gid_t));
2288
2289        return nla_total_size(sizeof(uid_t)) + /* OWNER */
2290               nla_total_size(sizeof(gid_t)) + /* GROUP */
2291               nla_total_size(sizeof(u8)) + /* TYPE */
2292               nla_total_size(sizeof(u8)) + /* PI */
2293               nla_total_size(sizeof(u8)) + /* VNET_HDR */
2294               nla_total_size(sizeof(u8)) + /* PERSIST */
2295               nla_total_size(sizeof(u8)) + /* MULTI_QUEUE */
2296               nla_total_size(sizeof(u32)) + /* NUM_QUEUES */
2297               nla_total_size(sizeof(u32)) + /* NUM_DISABLED_QUEUES */
2298               0;
2299}
2300
2301static int tun_fill_info(struct sk_buff *skb, const struct net_device *dev)
2302{
2303        struct tun_struct *tun = netdev_priv(dev);
2304
2305        if (nla_put_u8(skb, IFLA_TUN_TYPE, tun->flags & TUN_TYPE_MASK))
2306                goto nla_put_failure;
2307        if (uid_valid(tun->owner) &&
2308            nla_put_u32(skb, IFLA_TUN_OWNER,
2309                        from_kuid_munged(current_user_ns(), tun->owner)))
2310                goto nla_put_failure;
2311        if (gid_valid(tun->group) &&
2312            nla_put_u32(skb, IFLA_TUN_GROUP,
2313                        from_kgid_munged(current_user_ns(), tun->group)))
2314                goto nla_put_failure;
2315        if (nla_put_u8(skb, IFLA_TUN_PI, !(tun->flags & IFF_NO_PI)))
2316                goto nla_put_failure;
2317        if (nla_put_u8(skb, IFLA_TUN_VNET_HDR, !!(tun->flags & IFF_VNET_HDR)))
2318                goto nla_put_failure;
2319        if (nla_put_u8(skb, IFLA_TUN_PERSIST, !!(tun->flags & IFF_PERSIST)))
2320                goto nla_put_failure;
2321        if (nla_put_u8(skb, IFLA_TUN_MULTI_QUEUE,
2322                       !!(tun->flags & IFF_MULTI_QUEUE)))
2323                goto nla_put_failure;
2324        if (tun->flags & IFF_MULTI_QUEUE) {
2325                if (nla_put_u32(skb, IFLA_TUN_NUM_QUEUES, tun->numqueues))
2326                        goto nla_put_failure;
2327                if (nla_put_u32(skb, IFLA_TUN_NUM_DISABLED_QUEUES,
2328                                tun->numdisabled))
2329                        goto nla_put_failure;
2330        }
2331
2332        return 0;
2333
2334nla_put_failure:
2335        return -EMSGSIZE;
2336}
2337
2338static struct rtnl_link_ops tun_link_ops __read_mostly = {
2339        .kind           = DRV_NAME,
2340        .priv_size      = sizeof(struct tun_struct),
2341        .setup          = tun_setup,
2342        .validate       = tun_validate,
2343        .get_size       = tun_get_size,
2344        .fill_info      = tun_fill_info,
2345};
2346
2347static void tun_sock_write_space(struct sock *sk)
2348{
2349        struct tun_file *tfile;
2350        wait_queue_head_t *wqueue;
2351
2352        if (!sock_writeable(sk))
2353                return;
2354
2355        if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags))
2356                return;
2357
2358        wqueue = sk_sleep(sk);
2359        if (wqueue && waitqueue_active(wqueue))
2360                wake_up_interruptible_sync_poll(wqueue, EPOLLOUT |
2361                                                EPOLLWRNORM | EPOLLWRBAND);
2362
2363        tfile = container_of(sk, struct tun_file, sk);
2364        kill_fasync(&tfile->fasync, SIGIO, POLL_OUT);
2365}
2366
2367static void tun_put_page(struct tun_page *tpage)
2368{
2369        if (tpage->page)
2370                __page_frag_cache_drain(tpage->page, tpage->count);
2371}
2372
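/* Handle one xdp_buff from the batched sendmsg (TUN_MSG_PTR) path.  The
 * attached XDP program, if any, runs first; on XDP_PASS, or when the buffer
 * carries a GSO header that XDP cannot deal with, the buffer is converted
 * into an skb and fed to netif_receive_skb().  Pages of dropped buffers are
 * accumulated in *tpage so the caller can release them in one batch.
 */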
2373static int tun_xdp_one(struct tun_struct *tun,
2374                       struct tun_file *tfile,
2375                       struct xdp_buff *xdp, int *flush,
2376                       struct tun_page *tpage)
2377{
2378        unsigned int datasize = xdp->data_end - xdp->data;
2379        struct tun_xdp_hdr *hdr = xdp->data_hard_start;
2380        struct virtio_net_hdr *gso = &hdr->gso;
2381        struct tun_pcpu_stats *stats;
2382        struct bpf_prog *xdp_prog;
2383        struct sk_buff *skb = NULL;
2384        u32 rxhash = 0, act;
2385        int buflen = hdr->buflen;
2386        int err = 0;
2387        bool skb_xdp = false;
2388        struct page *page;
2389
2390        xdp_prog = rcu_dereference(tun->xdp_prog);
2391        if (xdp_prog) {
2392                if (gso->gso_type) {
2393                        skb_xdp = true;
2394                        goto build;
2395                }
2396                xdp_set_data_meta_invalid(xdp);
2397                xdp->rxq = &tfile->xdp_rxq;
2398                xdp->frame_sz = buflen;
2399
2400                act = bpf_prog_run_xdp(xdp_prog, xdp);
2401                err = tun_xdp_act(tun, xdp_prog, xdp, act);
2402                if (err < 0) {
2403                        put_page(virt_to_head_page(xdp->data));
2404                        return err;
2405                }
2406
2407                switch (err) {
2408                case XDP_REDIRECT:
2409                        *flush = true;
2410                        fallthrough;
2411                case XDP_TX:
2412                        return 0;
2413                case XDP_PASS:
2414                        break;
2415                default:
2416                        page = virt_to_head_page(xdp->data);
2417                        if (tpage->page == page) {
2418                                ++tpage->count;
2419                        } else {
2420                                tun_put_page(tpage);
2421                                tpage->page = page;
2422                                tpage->count = 1;
2423                        }
2424                        return 0;
2425                }
2426        }
2427
2428build:
2429        skb = build_skb(xdp->data_hard_start, buflen);
2430        if (!skb) {
2431                err = -ENOMEM;
2432                goto out;
2433        }
2434
2435        skb_reserve(skb, xdp->data - xdp->data_hard_start);
2436        skb_put(skb, xdp->data_end - xdp->data);
2437
2438        if (virtio_net_hdr_to_skb(skb, gso, tun_is_little_endian(tun))) {
2439                this_cpu_inc(tun->pcpu_stats->rx_frame_errors);
2440                kfree_skb(skb);
2441                err = -EINVAL;
2442                goto out;
2443        }
2444
2445        skb->protocol = eth_type_trans(skb, tun->dev);
2446        skb_reset_network_header(skb);
2447        skb_probe_transport_header(skb);
2448        skb_record_rx_queue(skb, tfile->queue_index);
2449
2450        if (skb_xdp) {
2451                err = do_xdp_generic(xdp_prog, skb);
2452                if (err != XDP_PASS)
2453                        goto out;
2454        }
2455
2456        if (!rcu_dereference(tun->steering_prog) && tun->numqueues > 1 &&
2457            !tfile->detached)
2458                rxhash = __skb_get_hash_symmetric(skb);
2459
2460        netif_receive_skb(skb);
2461
2462        /* No need for get_cpu_ptr() here since this function is
2463         * always called with bh disabled
2464         */
2465        stats = this_cpu_ptr(tun->pcpu_stats);
2466        u64_stats_update_begin(&stats->syncp);
2467        u64_stats_inc(&stats->rx_packets);
2468        u64_stats_add(&stats->rx_bytes, datasize);
2469        u64_stats_update_end(&stats->syncp);
2470
2471        if (rxhash)
2472                tun_flow_update(tun, rxhash, tfile);
2473
2474out:
2475        return err;
2476}
2477
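/* sendmsg() on the internal tun socket, used by in-kernel callers such as
 * vhost-net.  A TUN_MSG_PTR control block carries an array of xdp_buffs
 * that is processed as one batch with BHs disabled (redirects are flushed
 * once at the end); everything else goes through the regular
 * tun_get_user() copy path.
 */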
2478static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
2479{
2480        int ret, i;
2481        struct tun_file *tfile = container_of(sock, struct tun_file, socket);
2482        struct tun_struct *tun = tun_get(tfile);
2483        struct tun_msg_ctl *ctl = m->msg_control;
2484        struct xdp_buff *xdp;
2485
2486        if (!tun)
2487                return -EBADFD;
2488
2489        if (ctl && (ctl->type == TUN_MSG_PTR)) {
2490                struct tun_page tpage;
2491                int n = ctl->num;
2492                int flush = 0;
2493
2494                memset(&tpage, 0, sizeof(tpage));
2495
2496                local_bh_disable();
2497                rcu_read_lock();
2498
2499                for (i = 0; i < n; i++) {
2500                        xdp = &((struct xdp_buff *)ctl->ptr)[i];
2501                        tun_xdp_one(tun, tfile, xdp, &flush, &tpage);
2502                }
2503
2504                if (flush)
2505                        xdp_do_flush();
2506
2507                rcu_read_unlock();
2508                local_bh_enable();
2509
2510                tun_put_page(&tpage);
2511
2512                ret = total_len;
2513                goto out;
2514        }
2515
2516        ret = tun_get_user(tun, tfile, ctl ? ctl->ptr : NULL, &m->msg_iter,
2517                           m->msg_flags & MSG_DONTWAIT,
2518                           m->msg_flags & MSG_MORE);
2519out:
2520        tun_put(tun);
2521        return ret;
2522}
2523
2524static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len,
2525                       int flags)
2526{
2527        struct tun_file *tfile = container_of(sock, struct tun_file, socket);
2528        struct tun_struct *tun = tun_get(tfile);
2529        void *ptr = m->msg_control;
2530        int ret;
2531
2532        if (!tun) {
2533                ret = -EBADFD;
2534                goto out_free;
2535        }
2536
2537        if (flags & ~(MSG_DONTWAIT|MSG_TRUNC|MSG_ERRQUEUE)) {
2538                ret = -EINVAL;
2539                goto out_put_tun;
2540        }
2541        if (flags & MSG_ERRQUEUE) {
2542                ret = sock_recv_errqueue(sock->sk, m, total_len,
2543                                         SOL_PACKET, TUN_TX_TIMESTAMP);
2544                goto out;
2545        }
2546        ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT, ptr);
2547        if (ret > (ssize_t)total_len) {
2548                m->msg_flags |= MSG_TRUNC;
2549                ret = flags & MSG_TRUNC ? ret : total_len;
2550        }
2551out:
2552        tun_put(tun);
2553        return ret;
2554
2555out_put_tun:
2556        tun_put(tun);
2557out_free:
2558        tun_ptr_free(ptr);
2559        return ret;
2560}
2561
2562static int tun_ptr_peek_len(void *ptr)
2563{
2564        if (likely(ptr)) {
2565                if (tun_is_xdp_frame(ptr)) {
2566                        struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
2567
2568                        return xdpf->len;
2569                }
2570                return __skb_array_len_with_tag(ptr);
2571        } else {
2572                return 0;
2573        }
2574}
2575
2576static int tun_peek_len(struct socket *sock)
2577{
2578        struct tun_file *tfile = container_of(sock, struct tun_file, socket);
2579        struct tun_struct *tun;
2580        int ret = 0;
2581
2582        tun = tun_get(tfile);
2583        if (!tun)
2584                return 0;
2585
2586        ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len);
2587        tun_put(tun);
2588
2589        return ret;
2590}
2591
2592/* Ops structure to mimic raw sockets with tun */
2593static const struct proto_ops tun_socket_ops = {
2594        .peek_len = tun_peek_len,
2595        .sendmsg = tun_sendmsg,
2596        .recvmsg = tun_recvmsg,
2597};
2598
2599static struct proto tun_proto = {
2600        .name           = "tun",
2601        .owner          = THIS_MODULE,
2602        .obj_size       = sizeof(struct tun_file),
2603};
2604
2605static int tun_flags(struct tun_struct *tun)
2606{
2607        return tun->flags & (TUN_FEATURES | IFF_PERSIST | IFF_TUN | IFF_TAP);
2608}
2609
2610static ssize_t tun_show_flags(struct device *dev, struct device_attribute *attr,
2611                              char *buf)
2612{
2613        struct tun_struct *tun = netdev_priv(to_net_dev(dev));
2614        return sprintf(buf, "0x%x\n", tun_flags(tun));
2615}
2616
2617static ssize_t tun_show_owner(struct device *dev, struct device_attribute *attr,
2618                              char *buf)
2619{
2620        struct tun_struct *tun = netdev_priv(to_net_dev(dev));
2621        return uid_valid(tun->owner) ?
2622                sprintf(buf, "%u\n",
2623                        from_kuid_munged(current_user_ns(), tun->owner)) :
2624                sprintf(buf, "-1\n");
2625}
2626
2627static ssize_t tun_show_group(struct device *dev, struct device_attribute *attr,
2628                              char *buf)
2629{
2630        struct tun_struct *tun = netdev_priv(to_net_dev(dev));
2631        return gid_valid(tun->group) ?
2632                sprintf(buf, "%u\n",
2633                        from_kgid_munged(current_user_ns(), tun->group)) :
2634                sprintf(buf, "-1\n");
2635}
2636
2637static DEVICE_ATTR(tun_flags, 0444, tun_show_flags, NULL);
2638static DEVICE_ATTR(owner, 0444, tun_show_owner, NULL);
2639static DEVICE_ATTR(group, 0444, tun_show_group, NULL);
2640
2641static struct attribute *tun_dev_attrs[] = {
2642        &dev_attr_tun_flags.attr,
2643        &dev_attr_owner.attr,
2644        &dev_attr_group.attr,
2645        NULL
2646};
2647
2648static const struct attribute_group tun_attr_group = {
2649        .attrs = tun_dev_attrs
2650};
2651
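/* Back end of the TUNSETIFF ioctl: attach the caller to an existing tun/tap
 * device, or allocate and register a new one.  A minimal userspace sketch
 * of that ioctl (illustrative only, not part of this driver; error handling
 * omitted):
 *
 *      int fd = open("/dev/net/tun", O_RDWR);
 *      struct ifreq ifr = { 0 };
 *
 *      ifr.ifr_flags = IFF_TUN | IFF_NO_PI;    // leave ifr_name empty to
 *      ioctl(fd, TUNSETIFF, &ifr);             // get the default "tun%d"
 *      // ifr.ifr_name now holds the interface name; read()/write() on fd
 *      // carry raw IP packets for that interface.
 */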
2652static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
2653{
2654        struct tun_struct *tun;
2655        struct tun_file *tfile = file->private_data;
2656        struct net_device *dev;
2657        int err;
2658
2659        if (tfile->detached)
2660                return -EINVAL;
2661
2662        if ((ifr->ifr_flags & IFF_NAPI_FRAGS)) {
2663                if (!capable(CAP_NET_ADMIN))
2664                        return -EPERM;
2665
2666                if (!(ifr->ifr_flags & IFF_NAPI) ||
2667                    (ifr->ifr_flags & TUN_TYPE_MASK) != IFF_TAP)
2668                        return -EINVAL;
2669        }
2670
2671        dev = __dev_get_by_name(net, ifr->ifr_name);
2672        if (dev) {
2673                if (ifr->ifr_flags & IFF_TUN_EXCL)
2674                        return -EBUSY;
2675                if ((ifr->ifr_flags & IFF_TUN) && dev->netdev_ops == &tun_netdev_ops)
2676                        tun = netdev_priv(dev);
2677                else if ((ifr->ifr_flags & IFF_TAP) && dev->netdev_ops == &tap_netdev_ops)
2678                        tun = netdev_priv(dev);
2679                else
2680                        return -EINVAL;
2681
2682                if (!!(ifr->ifr_flags & IFF_MULTI_QUEUE) !=
2683                    !!(tun->flags & IFF_MULTI_QUEUE))
2684                        return -EINVAL;
2685
2686                if (tun_not_capable(tun))
2687                        return -EPERM;
2688                err = security_tun_dev_open(tun->security);
2689                if (err < 0)
2690                        return err;
2691
2692                err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER,
2693                                 ifr->ifr_flags & IFF_NAPI,
2694                                 ifr->ifr_flags & IFF_NAPI_FRAGS, true);
2695                if (err < 0)
2696                        return err;
2697
2698                if (tun->flags & IFF_MULTI_QUEUE &&
2699                    (tun->numqueues + tun->numdisabled > 1)) {
2700                        /* One or more queues have already been attached, no need
2701                         * to initialize the device again.
2702                         */
2703                        netdev_state_change(dev);
2704                        return 0;
2705                }
2706
2707                tun->flags = (tun->flags & ~TUN_FEATURES) |
2708                              (ifr->ifr_flags & TUN_FEATURES);
2709
2710                netdev_state_change(dev);
2711        } else {
2712                char *name;
2713                unsigned long flags = 0;
2714                int queues = ifr->ifr_flags & IFF_MULTI_QUEUE ?
2715                             MAX_TAP_QUEUES : 1;
2716
2717                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2718                        return -EPERM;
2719                err = security_tun_dev_create();
2720                if (err < 0)
2721                        return err;
2722
2723                /* Set dev type */
2724                if (ifr->ifr_flags & IFF_TUN) {
2725                        /* TUN device */
2726                        flags |= IFF_TUN;
2727                        name = "tun%d";
2728                } else if (ifr->ifr_flags & IFF_TAP) {
2729                        /* TAP device */
2730                        flags |= IFF_TAP;
2731                        name = "tap%d";
2732                } else
2733                        return -EINVAL;
2734
2735                if (*ifr->ifr_name)
2736                        name = ifr->ifr_name;
2737
2738                dev = alloc_netdev_mqs(sizeof(struct tun_struct), name,
2739                                       NET_NAME_UNKNOWN, tun_setup, queues,
2740                                       queues);
2741
2742                if (!dev)
2743                        return -ENOMEM;
2744
2745                dev_net_set(dev, net);
2746                dev->rtnl_link_ops = &tun_link_ops;
2747                dev->ifindex = tfile->ifindex;
2748                dev->sysfs_groups[0] = &tun_attr_group;
2749
2750                tun = netdev_priv(dev);
2751                tun->dev = dev;
2752                tun->flags = flags;
2753                tun->txflt.count = 0;
2754                tun->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
2755
2756                tun->align = NET_SKB_PAD;
2757                tun->filter_attached = false;
2758                tun->sndbuf = tfile->socket.sk->sk_sndbuf;
2759                tun->rx_batched = 0;
2760                RCU_INIT_POINTER(tun->steering_prog, NULL);
2761
2762                tun->pcpu_stats = netdev_alloc_pcpu_stats(struct tun_pcpu_stats);
2763                if (!tun->pcpu_stats) {
2764                        err = -ENOMEM;
2765                        goto err_free_dev;
2766                }
2767
2768                spin_lock_init(&tun->lock);
2769
2770                err = security_tun_dev_alloc_security(&tun->security);
2771                if (err < 0)
2772                        goto err_free_stat;
2773
2774                tun_net_init(dev);
2775                tun_flow_init(tun);
2776
2777                dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
2778                                   TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX |
2779                                   NETIF_F_HW_VLAN_STAG_TX;
2780                dev->features = dev->hw_features | NETIF_F_LLTX;
2781                dev->vlan_features = dev->features &
2782                                     ~(NETIF_F_HW_VLAN_CTAG_TX |
2783                                       NETIF_F_HW_VLAN_STAG_TX);
2784
2785                tun->flags = (tun->flags & ~TUN_FEATURES) |
2786                              (ifr->ifr_flags & TUN_FEATURES);
2787
2788                INIT_LIST_HEAD(&tun->disabled);
2789                err = tun_attach(tun, file, false, ifr->ifr_flags & IFF_NAPI,
2790                                 ifr->ifr_flags & IFF_NAPI_FRAGS, false);
2791                if (err < 0)
2792                        goto err_free_flow;
2793
2794                err = register_netdevice(tun->dev);
2795                if (err < 0)
2796                        goto err_detach;
2797                /* free_netdev() won't check the refcnt; to avoid a race
2798                 * with dev_put() we need to publish tun after registration.
2799                 */
2800                rcu_assign_pointer(tfile->tun, tun);
2801        }
2802
2803        netif_carrier_on(tun->dev);
2804
2805        /* Make sure persistent devices do not get stuck in
2806         * xoff state.
2807         */
2808        if (netif_running(tun->dev))
2809                netif_tx_wake_all_queues(tun->dev);
2810
2811        strcpy(ifr->ifr_name, tun->dev->name);
2812        return 0;
2813
2814err_detach:
2815        tun_detach_all(dev);
2816        /* We are here because register_netdevice() has failed.
2817         * If register_netdevice() already called tun_free_netdev()
2818         * while dealing with the error, tun->pcpu_stats has been cleared.
2819         */
2820        if (!tun->pcpu_stats)
2821                goto err_free_dev;
2822
2823err_free_flow:
2824        tun_flow_uninit(tun);
2825        security_tun_dev_free_security(tun->security);
2826err_free_stat:
2827        free_percpu(tun->pcpu_stats);
2828err_free_dev:
2829        free_netdev(dev);
2830        return err;
2831}
2832
2833static void tun_get_iff(struct tun_struct *tun, struct ifreq *ifr)
2834{
2835        strcpy(ifr->ifr_name, tun->dev->name);
2836
2837        ifr->ifr_flags = tun_flags(tun);
2838
2839}
2840
2841/* This is like a cut-down set of ethtool ops, except it is done via the
2842 * tun fd, so no privileges are required. */
2843static int set_offload(struct tun_struct *tun, unsigned long arg)
2844{
2845        netdev_features_t features = 0;
2846
2847        if (arg & TUN_F_CSUM) {
2848                features |= NETIF_F_HW_CSUM;
2849                arg &= ~TUN_F_CSUM;
2850
2851                if (arg & (TUN_F_TSO4|TUN_F_TSO6)) {
2852                        if (arg & TUN_F_TSO_ECN) {
2853                                features |= NETIF_F_TSO_ECN;
2854                                arg &= ~TUN_F_TSO_ECN;
2855                        }
2856                        if (arg & TUN_F_TSO4)
2857                                features |= NETIF_F_TSO;
2858                        if (arg & TUN_F_TSO6)
2859                                features |= NETIF_F_TSO6;
2860                        arg &= ~(TUN_F_TSO4|TUN_F_TSO6);
2861                }
2862
2863                arg &= ~TUN_F_UFO;
2864        }
2865
2866        /* This gives the user a way to test for new features in the future
2867         * by trying to set them. */
2868        if (arg)
2869                return -EINVAL;
2870
2871        tun->set_features = features;
2872        tun->dev->wanted_features &= ~TUN_USER_FEATURES;
2873        tun->dev->wanted_features |= features;
2874        netdev_update_features(tun->dev);
2875
2876        return 0;
2877}
2878
2879static void tun_detach_filter(struct tun_struct *tun, int n)
2880{
2881        int i;
2882        struct tun_file *tfile;
2883
2884        for (i = 0; i < n; i++) {
2885                tfile = rtnl_dereference(tun->tfiles[i]);
2886                lock_sock(tfile->socket.sk);
2887                sk_detach_filter(tfile->socket.sk);
2888                release_sock(tfile->socket.sk);
2889        }
2890
2891        tun->filter_attached = false;
2892}
2893
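/* Attach the classic BPF filter stored in tun->fprog to the socket of every
 * active queue.  If attaching fails part way through, the filter is removed
 * again from the queues that were already processed.
 */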
2894static int tun_attach_filter(struct tun_struct *tun)
2895{
2896        int i, ret = 0;
2897        struct tun_file *tfile;
2898
2899        for (i = 0; i < tun->numqueues; i++) {
2900                tfile = rtnl_dereference(tun->tfiles[i]);
2901                lock_sock(tfile->socket.sk);
2902                ret = sk_attach_filter(&tun->fprog, tfile->socket.sk);
2903                release_sock(tfile->socket.sk);
2904                if (ret) {
2905                        tun_detach_filter(tun, i);
2906                        return ret;
2907                }
2908        }
2909
2910        tun->filter_attached = true;
2911        return ret;
2912}
2913
2914static void tun_set_sndbuf(struct tun_struct *tun)
2915{
2916        struct tun_file *tfile;
2917        int i;
2918
2919        for (i = 0; i < tun->numqueues; i++) {
2920                tfile = rtnl_dereference(tun->tfiles[i]);
2921                tfile->socket.sk->sk_sndbuf = tun->sndbuf;
2922        }
2923}
2924
2925static int tun_set_queue(struct file *file, struct ifreq *ifr)
2926{
2927        struct tun_file *tfile = file->private_data;
2928        struct tun_struct *tun;
2929        int ret = 0;
2930
2931        rtnl_lock();
2932
2933        if (ifr->ifr_flags & IFF_ATTACH_QUEUE) {
2934                tun = tfile->detached;
2935                if (!tun) {
2936                        ret = -EINVAL;
2937                        goto unlock;
2938                }
2939                ret = security_tun_dev_attach_queue(tun->security);
2940                if (ret < 0)
2941                        goto unlock;
2942                ret = tun_attach(tun, file, false, tun->flags & IFF_NAPI,
2943                                 tun->flags & IFF_NAPI_FRAGS, true);
2944        } else if (ifr->ifr_flags & IFF_DETACH_QUEUE) {
2945                tun = rtnl_dereference(tfile->tun);
2946                if (!tun || !(tun->flags & IFF_MULTI_QUEUE) || tfile->detached)
2947                        ret = -EINVAL;
2948                else
2949                        __tun_detach(tfile, false);
2950        } else
2951                ret = -EINVAL;
2952
2953        if (ret >= 0)
2954                netdev_state_change(tun->dev);
2955
2956unlock:
2957        rtnl_unlock();
2958        return ret;
2959}
2960
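/* Read an eBPF program fd from userspace and install it as the steering or
 * filter program; an fd of -1 removes the currently attached program.  The
 * program must be of type BPF_PROG_TYPE_SOCKET_FILTER.
 */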
2961static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p,
2962                        void __user *data)
2963{
2964        struct bpf_prog *prog;
2965        int fd;
2966
2967        if (copy_from_user(&fd, data, sizeof(fd)))
2968                return -EFAULT;
2969
2970        if (fd == -1) {
2971                prog = NULL;
2972        } else {
2973                prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
2974                if (IS_ERR(prog))
2975                        return PTR_ERR(prog);
2976        }
2977
2978        return __tun_set_ebpf(tun, prog_p, prog);
2979}
2980
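/* Common ioctl backend; ifreq_len lets the native and compat entry points
 * share this code while copying the right amount of ifreq data.
 * TUNGETFEATURES, TUNSETQUEUE, SIOCGSKNS and TUNSETIFF are handled before a
 * device is required; everything else operates on the attached tun device
 * under rtnl_lock.
 */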
2981static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
2982                            unsigned long arg, int ifreq_len)
2983{
2984        struct tun_file *tfile = file->private_data;
2985        struct net *net = sock_net(&tfile->sk);
2986        struct tun_struct *tun;
2987        void __user *argp = (void __user *)arg;
2988        unsigned int ifindex, carrier;
2989        struct ifreq ifr;
2990        kuid_t owner;
2991        kgid_t group;
2992        int sndbuf;
2993        int vnet_hdr_sz;
2994        int le;
2995        int ret;
2996        bool do_notify = false;
2997
2998        if (cmd == TUNSETIFF || cmd == TUNSETQUEUE ||
2999            (_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) {
3000                if (copy_from_user(&ifr, argp, ifreq_len))
3001                        return -EFAULT;
3002        } else {
3003                memset(&ifr, 0, sizeof(ifr));
3004        }
3005        if (cmd == TUNGETFEATURES) {
3006                /* Currently this just means: "what IFF flags are valid?".
3007                 * This is needed because we never checked for invalid flags on
3008                 * TUNSETIFF.
3009                 */
3010                return put_user(IFF_TUN | IFF_TAP | TUN_FEATURES,
3011                                (unsigned int __user *)argp);
3012        } else if (cmd == TUNSETQUEUE) {
3013                return tun_set_queue(file, &ifr);
3014        } else if (cmd == SIOCGSKNS) {
3015                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3016                        return -EPERM;
3017                return open_related_ns(&net->ns, get_net_ns);
3018        }
3019
3020        ret = 0;
3021        rtnl_lock();
3022
3023        tun = tun_get(tfile);
3024        if (cmd == TUNSETIFF) {
3025                ret = -EEXIST;
3026                if (tun)
3027                        goto unlock;
3028
3029                ifr.ifr_name[IFNAMSIZ-1] = '\0';
3030
3031                ret = tun_set_iff(net, file, &ifr);
3032
3033                if (ret)
3034                        goto unlock;
3035
3036                if (copy_to_user(argp, &ifr, ifreq_len))
3037                        ret = -EFAULT;
3038                goto unlock;
3039        }
3040        if (cmd == TUNSETIFINDEX) {
3041                ret = -EPERM;
3042                if (tun)
3043                        goto unlock;
3044
3045                ret = -EFAULT;
3046                if (copy_from_user(&ifindex, argp, sizeof(ifindex)))
3047                        goto unlock;
3048
3049                ret = 0;
3050                tfile->ifindex = ifindex;
3051                goto unlock;
3052        }
3053
3054        ret = -EBADFD;
3055        if (!tun)
3056                goto unlock;
3057
3058        netif_info(tun, drv, tun->dev, "tun_chr_ioctl cmd %u\n", cmd);
3059
3060        net = dev_net(tun->dev);
3061        ret = 0;
3062        switch (cmd) {
3063        case TUNGETIFF:
3064                tun_get_iff(tun, &ifr);
3065
3066                if (tfile->detached)
3067                        ifr.ifr_flags |= IFF_DETACH_QUEUE;
3068                if (!tfile->socket.sk->sk_filter)
3069                        ifr.ifr_flags |= IFF_NOFILTER;
3070
3071                if (copy_to_user(argp, &ifr, ifreq_len))
3072                        ret = -EFAULT;
3073                break;
3074
3075        case TUNSETNOCSUM:
3076                /* Disable/Enable checksum */
3077
3078                /* [unimplemented] */
3079                netif_info(tun, drv, tun->dev, "ignored: set checksum %s\n",
3080                           arg ? "disabled" : "enabled");
3081                break;
3082
3083        case TUNSETPERSIST:
3084                /* Disable/Enable persist mode. Keep an extra reference to the
3085                 * module to prevent it from being unloaded while persistent.
3086                 */
3087                if (arg && !(tun->flags & IFF_PERSIST)) {
3088                        tun->flags |= IFF_PERSIST;
3089                        __module_get(THIS_MODULE);
3090                        do_notify = true;
3091                }
3092                if (!arg && (tun->flags & IFF_PERSIST)) {
3093                        tun->flags &= ~IFF_PERSIST;
3094                        module_put(THIS_MODULE);
3095                        do_notify = true;
3096                }
3097
3098                netif_info(tun, drv, tun->dev, "persist %s\n",
3099                           arg ? "enabled" : "disabled");
3100                break;
3101
3102        case TUNSETOWNER:
3103                /* Set owner of the device */
3104                owner = make_kuid(current_user_ns(), arg);
3105                if (!uid_valid(owner)) {
3106                        ret = -EINVAL;
3107                        break;
3108                }
3109                tun->owner = owner;
3110                do_notify = true;
3111                netif_info(tun, drv, tun->dev, "owner set to %u\n",
3112                           from_kuid(&init_user_ns, tun->owner));
3113                break;
3114
3115        case TUNSETGROUP:
3116                /* Set group of the device */
3117                group = make_kgid(current_user_ns(), arg);
3118                if (!gid_valid(group)) {
3119                        ret = -EINVAL;
3120                        break;
3121                }
3122                tun->group = group;
3123                do_notify = true;
3124                netif_info(tun, drv, tun->dev, "group set to %u\n",
3125                           from_kgid(&init_user_ns, tun->group));
3126                break;
3127
3128        case TUNSETLINK:
3129                /* Only allow setting the type when the interface is down */
3130                if (tun->dev->flags & IFF_UP) {
3131                        netif_info(tun, drv, tun->dev,
3132                                   "Linktype set failed because interface is up\n");
3133                        ret = -EBUSY;
3134                } else {
3135                        tun->dev->type = (int) arg;
3136                        netif_info(tun, drv, tun->dev, "linktype set to %d\n",
3137                                   tun->dev->type);
3138                        ret = 0;
3139                }
3140                break;
3141
3142        case TUNSETDEBUG:
3143                tun->msg_enable = (u32)arg;
3144                break;
3145
3146        case TUNSETOFFLOAD:
3147                ret = set_offload(tun, arg);
3148                break;
3149
3150        case TUNSETTXFILTER:
3151                /* Can be set only for TAPs */
3152                ret = -EINVAL;
3153                if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
3154                        break;
3155                ret = update_filter(&tun->txflt, (void __user *)arg);
3156                break;
3157
3158        case SIOCGIFHWADDR:
3159                /* Get hw address */
3160                memcpy(ifr.ifr_hwaddr.sa_data, tun->dev->dev_addr, ETH_ALEN);
3161                ifr.ifr_hwaddr.sa_family = tun->dev->type;
3162                if (copy_to_user(argp, &ifr, ifreq_len))
3163                        ret = -EFAULT;
3164                break;
3165
3166        case SIOCSIFHWADDR:
3167                /* Set hw address */
3168                ret = dev_set_mac_address(tun->dev, &ifr.ifr_hwaddr, NULL);
3169                break;
3170
3171        case TUNGETSNDBUF:
3172                sndbuf = tfile->socket.sk->sk_sndbuf;
3173                if (copy_to_user(argp, &sndbuf, sizeof(sndbuf)))
3174                        ret = -EFAULT;
3175                break;
3176
3177        case TUNSETSNDBUF:
3178                if (copy_from_user(&sndbuf, argp, sizeof(sndbuf))) {
3179                        ret = -EFAULT;
3180                        break;
3181                }
3182                if (sndbuf <= 0) {
3183                        ret = -EINVAL;
3184                        break;
3185                }
3186
3187                tun->sndbuf = sndbuf;
3188                tun_set_sndbuf(tun);
3189                break;
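
                /* Illustrative userspace sketch (assumption, not part of this
                 * driver): the send buffer limit is an int passed by pointer,
                 * e.g. one megabyte:
                 *
                 *        int sndbuf = 1 << 20;
                 *
                 *        if (ioctl(tun_fd, TUNSETSNDBUF, &sndbuf) < 0)
                 *                perror("TUNSETSNDBUF");
                 */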
3190
3191        case TUNGETVNETHDRSZ:
3192                vnet_hdr_sz = tun->vnet_hdr_sz;
3193                if (copy_to_user(argp, &vnet_hdr_sz, sizeof(vnet_hdr_sz)))
3194                        ret = -EFAULT;
3195                break;
3196
3197        case TUNSETVNETHDRSZ:
3198                if (copy_from_user(&vnet_hdr_sz, argp, sizeof(vnet_hdr_sz))) {
3199                        ret = -EFAULT;
3200                        break;
3201                }
3202                if (vnet_hdr_sz < (int)sizeof(struct virtio_net_hdr)) {
3203                        ret = -EINVAL;
3204                        break;
3205                }
3206
3207                tun->vnet_hdr_sz = vnet_hdr_sz;
3208                break;
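
                /* Illustrative userspace sketch (assumption, not part of this
                 * driver): with IFF_VNET_HDR enabled, a virtio-style backend
                 * typically grows the header to the merged-rx-buffer layout:
                 *
                 *        int hdr_sz = sizeof(struct virtio_net_hdr_mrg_rxbuf);
                 *
                 *        if (ioctl(tun_fd, TUNSETVNETHDRSZ, &hdr_sz) < 0)
                 *                perror("TUNSETVNETHDRSZ");
                 */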
3209
3210        case TUNGETVNETLE:
3211                le = !!(tun->flags & TUN_VNET_LE);
3212                if (put_user(le, (int __user *)argp))
3213                        ret = -EFAULT;
3214                break;
3215
3216        case TUNSETVNETLE:
3217                if (get_user(le, (int __user *)argp)) {
3218                        ret = -EFAULT;
3219                        break;
3220                }
3221                if (le)
3222                        tun->flags |= TUN_VNET_LE;
3223                else
3224                        tun->flags &= ~TUN_VNET_LE;
3225                break;
3226
3227        case TUNGETVNETBE:
3228                ret = tun_get_vnet_be(tun, argp);
3229                break;
3230
3231        case TUNSETVNETBE:
3232                ret = tun_set_vnet_be(tun, argp);
3233                break;
3234
3235        case TUNATTACHFILTER:
3236                /* Can be set only for TAPs */
3237                ret = -EINVAL;
3238                if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
3239                        break;
3240                ret = -EFAULT;
3241                if (copy_from_user(&tun->fprog, argp, sizeof(tun->fprog)))
3242                        break;
3243
3244                ret = tun_attach_filter(tun);
3245                break;
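
                /* Illustrative userspace sketch (assumption, not part of this
                 * driver): a classic BPF program is attached to a TAP device
                 * by passing a struct sock_fprog, e.g. an accept-all filter:
                 *
                 *        struct sock_filter insns[] = {
                 *                { BPF_RET | BPF_K, 0, 0, 0xffffffff },
                 *        };
                 *        struct sock_fprog prog = { .len = 1, .filter = insns };
                 *
                 *        if (ioctl(tap_fd, TUNATTACHFILTER, &prog) < 0)
                 *                perror("TUNATTACHFILTER");
                 */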
3246
3247        case TUNDETACHFILTER:
3248                /* Can be set only for TAPs */
3249                ret = -EINVAL;
3250                if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
3251                        break;
3252                ret = 0;
3253                tun_detach_filter(tun, tun->numqueues);
3254                break;
3255
3256        case TUNGETFILTER:
3257                ret = -EINVAL;
3258                if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
3259                        break;
3260                ret = -EFAULT;
3261                if (copy_to_user(argp, &tun->fprog, sizeof(tun->fprog)))
3262                        break;
3263                ret = 0;
3264                break;
3265
3266        case TUNSETSTEERINGEBPF:
3267                ret = tun_set_ebpf(tun, &tun->steering_prog, argp);
3268                break;
3269
3270        case TUNSETFILTEREBPF:
3271                ret = tun_set_ebpf(tun, &tun->filter_prog, argp);
3272                break;
3273
3274        case TUNSETCARRIER:
3275                ret = -EFAULT;
3276                if (copy_from_user(&carrier, argp, sizeof(carrier)))
3277                        goto unlock;
3278
3279                ret = tun_net_change_carrier(tun->dev, (bool)carrier);
3280                break;
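
                /* Illustrative userspace sketch (assumption, not part of this
                 * driver): the carrier state is an int passed by pointer, e.g.
                 * to force the link down:
                 *
                 *        int carrier = 0;
                 *
                 *        if (ioctl(tun_fd, TUNSETCARRIER, &carrier) < 0)
                 *                perror("TUNSETCARRIER");
                 */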
3281
3282        case TUNGETDEVNETNS:
3283                ret = -EPERM;
3284                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3285                        goto unlock;
3286                ret = open_related_ns(&net->ns, get_net_ns);
3287                break;
3288
3289        default:
3290                ret = -EINVAL;
3291                break;
3292        }
3293
3294        if (do_notify)
3295                netdev_state_change(tun->dev);
3296
3297unlock:
3298        rtnl_unlock();
3299        if (tun)
3300                tun_put(tun);
3301        return ret;
3302}
3303
3304static long tun_chr_ioctl(struct file *file,
3305                          unsigned int cmd, unsigned long arg)
3306{
3307        return __tun_chr_ioctl(file, cmd, arg, sizeof(struct ifreq));
3308}
3309
3310#ifdef CONFIG_COMPAT
3311static long tun_chr_compat_ioctl(struct file *file,
3312                         unsigned int cmd, unsigned long arg)
3313{
3314        switch (cmd) {
3315        case TUNSETIFF:
3316        case TUNGETIFF:
3317        case TUNSETTXFILTER:
3318        case TUNGETSNDBUF:
3319        case TUNSETSNDBUF:
3320        case SIOCGIFHWADDR:
3321        case SIOCSIFHWADDR:
3322                arg = (unsigned long)compat_ptr(arg);
3323                break;
3324        default:
3325                arg = (compat_ulong_t)arg;
3326                break;
3327        }
3328
3329        /*
3330         * compat_ifreq is shorter than ifreq, so we must not access beyond
3331         * the end of that structure. All of the fields used by this
3332         * driver are compatible, though, so we do not need to convert
3333         * the contents.
3334         */
3335        return __tun_chr_ioctl(file, cmd, arg, sizeof(struct compat_ifreq));
3336}
3337#endif /* CONFIG_COMPAT */
3338
3339static int tun_chr_fasync(int fd, struct file *file, int on)
3340{
3341        struct tun_file *tfile = file->private_data;
3342        int ret;
3343
3344        if ((ret = fasync_helper(fd, file, on, &tfile->fasync)) < 0)
3345                goto out;
3346
3347        if (on) {
3348                __f_setown(file, task_pid(current), PIDTYPE_TGID, 0);
3349                tfile->flags |= TUN_FASYNC;
3350        } else
3351                tfile->flags &= ~TUN_FASYNC;
3352        ret = 0;
3353out:
3354        return ret;
3355}
3356
3357static int tun_chr_open(struct inode *inode, struct file *file)
3358{
3359        struct net *net = current->nsproxy->net_ns;
3360        struct tun_file *tfile;
3361
3362        tfile = (struct tun_file *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
3363                                            &tun_proto, 0);
3364        if (!tfile)
3365                return -ENOMEM;
3366        if (ptr_ring_init(&tfile->tx_ring, 0, GFP_KERNEL)) {
3367                sk_free(&tfile->sk);
3368                return -ENOMEM;
3369        }
3370
3371        mutex_init(&tfile->napi_mutex);
3372        RCU_INIT_POINTER(tfile->tun, NULL);
3373        tfile->flags = 0;
3374        tfile->ifindex = 0;
3375
3376        init_waitqueue_head(&tfile->socket.wq.wait);
3377
3378        tfile->socket.file = file;
3379        tfile->socket.ops = &tun_socket_ops;
3380
3381        sock_init_data(&tfile->socket, &tfile->sk);
3382
3383        tfile->sk.sk_write_space = tun_sock_write_space;
3384        tfile->sk.sk_sndbuf = INT_MAX;
3385
3386        file->private_data = tfile;
3387        INIT_LIST_HEAD(&tfile->next);
3388
3389        sock_set_flag(&tfile->sk, SOCK_ZEROCOPY);
3390
3391        return 0;
3392}
3393
3394static int tun_chr_close(struct inode *inode, struct file *file)
3395{
3396        struct tun_file *tfile = file->private_data;
3397
3398        tun_detach(tfile, true);
3399
3400        return 0;
3401}
3402
3403#ifdef CONFIG_PROC_FS
3404static void tun_chr_show_fdinfo(struct seq_file *m, struct file *file)
3405{
3406        struct tun_file *tfile = file->private_data;
3407        struct tun_struct *tun;
3408        struct ifreq ifr;
3409
3410        memset(&ifr, 0, sizeof(ifr));
3411
3412        rtnl_lock();
3413        tun = tun_get(tfile);
3414        if (tun)
3415                tun_get_iff(tun, &ifr);
3416        rtnl_unlock();
3417
3418        if (tun)
3419                tun_put(tun);
3420
3421        seq_printf(m, "iff:\t%s\n", ifr.ifr_name);
3422}
3423#endif
3424
3425static const struct file_operations tun_fops = {
3426        .owner  = THIS_MODULE,
3427        .llseek = no_llseek,
3428        .read_iter  = tun_chr_read_iter,
3429        .write_iter = tun_chr_write_iter,
3430        .poll   = tun_chr_poll,
3431        .unlocked_ioctl = tun_chr_ioctl,
3432#ifdef CONFIG_COMPAT
3433        .compat_ioctl = tun_chr_compat_ioctl,
3434#endif
3435        .open   = tun_chr_open,
3436        .release = tun_chr_close,
3437        .fasync = tun_chr_fasync,
3438#ifdef CONFIG_PROC_FS
3439        .show_fdinfo = tun_chr_show_fdinfo,
3440#endif
3441};
3442
3443static struct miscdevice tun_miscdev = {
3444        .minor = TUN_MINOR,
3445        .name = "tun",
3446        .nodename = "net/tun",
3447        .fops = &tun_fops,
3448};
3449
3450/* ethtool interface */
3451
3452static void tun_default_link_ksettings(struct net_device *dev,
3453                                       struct ethtool_link_ksettings *cmd)
3454{
3455        ethtool_link_ksettings_zero_link_mode(cmd, supported);
3456        ethtool_link_ksettings_zero_link_mode(cmd, advertising);
3457        cmd->base.speed         = SPEED_10;
3458        cmd->base.duplex        = DUPLEX_FULL;
3459        cmd->base.port          = PORT_TP;
3460        cmd->base.phy_address   = 0;
3461        cmd->base.autoneg       = AUTONEG_DISABLE;
3462}
3463
3464static int tun_get_link_ksettings(struct net_device *dev,
3465                                  struct ethtool_link_ksettings *cmd)
3466{
3467        struct tun_struct *tun = netdev_priv(dev);
3468
3469        memcpy(cmd, &tun->link_ksettings, sizeof(*cmd));
3470        return 0;
3471}
3472
3473static int tun_set_link_ksettings(struct net_device *dev,
3474                                  const struct ethtool_link_ksettings *cmd)
3475{
3476        struct tun_struct *tun = netdev_priv(dev);
3477
3478        memcpy(&tun->link_ksettings, cmd, sizeof(*cmd));
3479        return 0;
3480}
3481
3482static void tun_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
3483{
3484        struct tun_struct *tun = netdev_priv(dev);
3485
3486        strlcpy(info->driver, DRV_NAME, sizeof(info->driver));
3487        strlcpy(info->version, DRV_VERSION, sizeof(info->version));
3488
3489        switch (tun->flags & TUN_TYPE_MASK) {
3490        case IFF_TUN:
3491                strlcpy(info->bus_info, "tun", sizeof(info->bus_info));
3492                break;
3493        case IFF_TAP:
3494                strlcpy(info->bus_info, "tap", sizeof(info->bus_info));
3495                break;
3496        }
3497}
3498
3499static u32 tun_get_msglevel(struct net_device *dev)
3500{
3501        struct tun_struct *tun = netdev_priv(dev);
3502
3503        return tun->msg_enable;
3504}
3505
3506static void tun_set_msglevel(struct net_device *dev, u32 value)
3507{
3508        struct tun_struct *tun = netdev_priv(dev);
3509
3510        tun->msg_enable = value;
3511}
3512
3513static int tun_get_coalesce(struct net_device *dev,
3514                            struct ethtool_coalesce *ec)
3515{
3516        struct tun_struct *tun = netdev_priv(dev);
3517
3518        ec->rx_max_coalesced_frames = tun->rx_batched;
3519
3520        return 0;
3521}
3522
3523static int tun_set_coalesce(struct net_device *dev,
3524                            struct ethtool_coalesce *ec)
3525{
3526        struct tun_struct *tun = netdev_priv(dev);
3527
3528        if (ec->rx_max_coalesced_frames > NAPI_POLL_WEIGHT)
3529                tun->rx_batched = NAPI_POLL_WEIGHT;
3530        else
3531                tun->rx_batched = ec->rx_max_coalesced_frames;
3532
3533        return 0;
3534}
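
/* Illustrative usage (assumption, not part of this file): the
 * rx_max_coalesced_frames knob handled above maps to the generic ethtool
 * coalescing parameter, so from userspace it can be tuned with e.g.:
 *
 *        ethtool -C tap0 rx-frames 64
 *
 * Values above NAPI_POLL_WEIGHT are clamped by tun_set_coalesce().
 */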
3535
3536static const struct ethtool_ops tun_ethtool_ops = {
3537        .supported_coalesce_params = ETHTOOL_COALESCE_RX_MAX_FRAMES,
3538        .get_drvinfo    = tun_get_drvinfo,
3539        .get_msglevel   = tun_get_msglevel,
3540        .set_msglevel   = tun_set_msglevel,
3541        .get_link       = ethtool_op_get_link,
3542        .get_ts_info    = ethtool_op_get_ts_info,
3543        .get_coalesce   = tun_get_coalesce,
3544        .set_coalesce   = tun_set_coalesce,
3545        .get_link_ksettings = tun_get_link_ksettings,
3546        .set_link_ksettings = tun_set_link_ksettings,
3547};
3548
3549static int tun_queue_resize(struct tun_struct *tun)
3550{
3551        struct net_device *dev = tun->dev;
3552        struct tun_file *tfile;
3553        struct ptr_ring **rings;
3554        int n = tun->numqueues + tun->numdisabled;
3555        int ret, i;
3556
3557        rings = kmalloc_array(n, sizeof(*rings), GFP_KERNEL);
3558        if (!rings)
3559                return -ENOMEM;
3560
3561        for (i = 0; i < tun->numqueues; i++) {
3562                tfile = rtnl_dereference(tun->tfiles[i]);
3563                rings[i] = &tfile->tx_ring;
3564        }
3565        list_for_each_entry(tfile, &tun->disabled, next)
3566                rings[i++] = &tfile->tx_ring;
3567
3568        ret = ptr_ring_resize_multiple(rings, n,
3569                                       dev->tx_queue_len, GFP_KERNEL,
3570                                       tun_ptr_free);
3571
3572        kfree(rings);
3573        return ret;
3574}
3575
3576static int tun_device_event(struct notifier_block *unused,
3577                            unsigned long event, void *ptr)
3578{
3579        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3580        struct tun_struct *tun = netdev_priv(dev);
3581        int i;
3582
3583        if (dev->rtnl_link_ops != &tun_link_ops)
3584                return NOTIFY_DONE;
3585
3586        switch (event) {
3587        case NETDEV_CHANGE_TX_QUEUE_LEN:
3588                if (tun_queue_resize(tun))
3589                        return NOTIFY_BAD;
3590                break;
3591        case NETDEV_UP:
3592                for (i = 0; i < tun->numqueues; i++) {
3593                        struct tun_file *tfile;
3594
3595                        tfile = rtnl_dereference(tun->tfiles[i]);
3596                        tfile->socket.sk->sk_write_space(tfile->socket.sk);
3597                }
3598                break;
3599        default:
3600                break;
3601        }
3602
3603        return NOTIFY_DONE;
3604}
3605
3606static struct notifier_block tun_notifier_block __read_mostly = {
3607        .notifier_call  = tun_device_event,
3608};
3609
3610static int __init tun_init(void)
3611{
3612        int ret = 0;
3613
3614        pr_info("%s, %s\n", DRV_DESCRIPTION, DRV_VERSION);
3615
3616        ret = rtnl_link_register(&tun_link_ops);
3617        if (ret) {
3618                pr_err("Can't register link_ops\n");
3619                goto err_linkops;
3620        }
3621
3622        ret = misc_register(&tun_miscdev);
3623        if (ret) {
3624                pr_err("Can't register misc device %d\n", TUN_MINOR);
3625                goto err_misc;
3626        }
3627
3628        ret = register_netdevice_notifier(&tun_notifier_block);
3629        if (ret) {
3630                pr_err("Can't register netdevice notifier\n");
3631                goto err_notifier;
3632        }
3633
3634        return  0;
3635
3636err_notifier:
3637        misc_deregister(&tun_miscdev);
3638err_misc:
3639        rtnl_link_unregister(&tun_link_ops);
3640err_linkops:
3641        return ret;
3642}
3643
3644static void tun_cleanup(void)
3645{
3646        misc_deregister(&tun_miscdev);
3647        rtnl_link_unregister(&tun_link_ops);
3648        unregister_netdevice_notifier(&tun_notifier_block);
3649}
3650
3651/* Get the underlying socket object from a tun file.  Returns an error unless
3652 * the file is attached to a device.  The returned object works like a packet
3653 * socket; it can be used for sock_sendmsg/sock_recvmsg.  The caller must hold
3654 * a reference to the file for as long as the socket is in use. */
3655struct socket *tun_get_socket(struct file *file)
3656{
3657        struct tun_file *tfile;
3658        if (file->f_op != &tun_fops)
3659                return ERR_PTR(-EINVAL);
3660        tfile = file->private_data;
3661        if (!tfile)
3662                return ERR_PTR(-EBADFD);
3663        return &tfile->socket;
3664}
3665EXPORT_SYMBOL_GPL(tun_get_socket);
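
/* Illustrative in-kernel caller sketch (assumption, not part of this file):
 * an export user such as vhost-net typically does
 *
 *        struct socket *sock = tun_get_socket(file);
 *
 *        if (IS_ERR(sock))
 *                return sock;
 *
 * and then drives the queue with sock_sendmsg()/sock_recvmsg() while holding
 * its own reference on the file.
 */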
3666
3667struct ptr_ring *tun_get_tx_ring(struct file *file)
3668{
3669        struct tun_file *tfile;
3670
3671        if (file->f_op != &tun_fops)
3672                return ERR_PTR(-EINVAL);
3673        tfile = file->private_data;
3674        if (!tfile)
3675                return ERR_PTR(-EBADFD);
3676        return &tfile->tx_ring;
3677}
3678EXPORT_SYMBOL_GPL(tun_get_tx_ring);
3679
3680module_init(tun_init);
3681module_exit(tun_cleanup);
3682MODULE_DESCRIPTION(DRV_DESCRIPTION);
3683MODULE_AUTHOR(DRV_COPYRIGHT);
3684MODULE_LICENSE("GPL");
3685MODULE_ALIAS_MISCDEV(TUN_MINOR);
3686MODULE_ALIAS("devname:net/tun");
3687