linux/drivers/net/virtio_net.c
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/* A network driver using virtio.
   3 *
   4 * Copyright 2007 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
   5 */
   6//#define DEBUG
   7#include <linux/netdevice.h>
   8#include <linux/etherdevice.h>
   9#include <linux/ethtool.h>
  10#include <linux/module.h>
  11#include <linux/virtio.h>
  12#include <linux/virtio_net.h>
  13#include <linux/bpf.h>
  14#include <linux/bpf_trace.h>
  15#include <linux/scatterlist.h>
  16#include <linux/if_vlan.h>
  17#include <linux/slab.h>
  18#include <linux/cpu.h>
  19#include <linux/average.h>
  20#include <linux/filter.h>
  21#include <linux/kernel.h>
  22#include <net/route.h>
  23#include <net/xdp.h>
  24#include <net/net_failover.h>
  25
  26static int napi_weight = NAPI_POLL_WEIGHT;
  27module_param(napi_weight, int, 0444);
  28
  29static bool csum = true, gso = true, napi_tx = true;
  30module_param(csum, bool, 0444);
  31module_param(gso, bool, 0444);
  32module_param(napi_tx, bool, 0644);
  33
  34/* FIXME: MTU in config. */
  35#define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
  36#define GOOD_COPY_LEN   128
  37
  38#define VIRTNET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
  39
  40/* Amount of XDP headroom to prepend to packets for use by xdp_adjust_head */
  41#define VIRTIO_XDP_HEADROOM 256
  42
  43/* Separating two types of XDP xmit */
  44#define VIRTIO_XDP_TX           BIT(0)
  45#define VIRTIO_XDP_REDIR        BIT(1)
  46
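/* The low bit of the virtqueue token pointer is used to tell xdp_frame
 * pointers apart from sk_buff pointers, see is_xdp_frame(), xdp_to_ptr()
 * and ptr_to_xdp().
 */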
  47#define VIRTIO_XDP_FLAG BIT(0)
  48
  49/* RX packet size EWMA. The average packet size is used to determine the packet
  50 * buffer size when refilling RX rings. As the entire RX ring may be refilled
  51 * at once, the weight is chosen so that the EWMA will be insensitive to short-
  52 * term, transient changes in packet size.
  53 */
  54DECLARE_EWMA(pkt_len, 0, 64)
  55
  56#define VIRTNET_DRIVER_VERSION "1.0.0"
  57
  58static const unsigned long guest_offloads[] = {
  59        VIRTIO_NET_F_GUEST_TSO4,
  60        VIRTIO_NET_F_GUEST_TSO6,
  61        VIRTIO_NET_F_GUEST_ECN,
  62        VIRTIO_NET_F_GUEST_UFO,
  63        VIRTIO_NET_F_GUEST_CSUM
  64};
  65
  66#define GUEST_OFFLOAD_GRO_HW_MASK ((1ULL << VIRTIO_NET_F_GUEST_TSO4) | \
  67                                (1ULL << VIRTIO_NET_F_GUEST_TSO6) | \
  68                                (1ULL << VIRTIO_NET_F_GUEST_ECN)  | \
  69                                (1ULL << VIRTIO_NET_F_GUEST_UFO))
  70
  71struct virtnet_stat_desc {
  72        char desc[ETH_GSTRING_LEN];
  73        size_t offset;
  74};
  75
  76struct virtnet_sq_stats {
  77        struct u64_stats_sync syncp;
  78        u64 packets;
  79        u64 bytes;
  80        u64 xdp_tx;
  81        u64 xdp_tx_drops;
  82        u64 kicks;
  83};
  84
  85struct virtnet_rq_stats {
  86        struct u64_stats_sync syncp;
  87        u64 packets;
  88        u64 bytes;
  89        u64 drops;
  90        u64 xdp_packets;
  91        u64 xdp_tx;
  92        u64 xdp_redirects;
  93        u64 xdp_drops;
  94        u64 kicks;
  95};
  96
  97#define VIRTNET_SQ_STAT(m)      offsetof(struct virtnet_sq_stats, m)
  98#define VIRTNET_RQ_STAT(m)      offsetof(struct virtnet_rq_stats, m)
  99
 100static const struct virtnet_stat_desc virtnet_sq_stats_desc[] = {
 101        { "packets",            VIRTNET_SQ_STAT(packets) },
 102        { "bytes",              VIRTNET_SQ_STAT(bytes) },
 103        { "xdp_tx",             VIRTNET_SQ_STAT(xdp_tx) },
 104        { "xdp_tx_drops",       VIRTNET_SQ_STAT(xdp_tx_drops) },
 105        { "kicks",              VIRTNET_SQ_STAT(kicks) },
 106};
 107
 108static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
 109        { "packets",            VIRTNET_RQ_STAT(packets) },
 110        { "bytes",              VIRTNET_RQ_STAT(bytes) },
 111        { "drops",              VIRTNET_RQ_STAT(drops) },
 112        { "xdp_packets",        VIRTNET_RQ_STAT(xdp_packets) },
 113        { "xdp_tx",             VIRTNET_RQ_STAT(xdp_tx) },
 114        { "xdp_redirects",      VIRTNET_RQ_STAT(xdp_redirects) },
 115        { "xdp_drops",          VIRTNET_RQ_STAT(xdp_drops) },
 116        { "kicks",              VIRTNET_RQ_STAT(kicks) },
 117};
 118
 119#define VIRTNET_SQ_STATS_LEN    ARRAY_SIZE(virtnet_sq_stats_desc)
 120#define VIRTNET_RQ_STATS_LEN    ARRAY_SIZE(virtnet_rq_stats_desc)
 121
 122/* Internal representation of a send virtqueue */
 123struct send_queue {
        /* Virtqueue associated with this send queue */
 125        struct virtqueue *vq;
 126
 127        /* TX: fragments + linear part + virtio header */
 128        struct scatterlist sg[MAX_SKB_FRAGS + 2];
 129
 130        /* Name of the send queue: output.$index */
 131        char name[40];
 132
 133        struct virtnet_sq_stats stats;
 134
 135        struct napi_struct napi;
 136};
 137
 138/* Internal representation of a receive virtqueue */
 139struct receive_queue {
 140        /* Virtqueue associated with this receive_queue */
 141        struct virtqueue *vq;
 142
 143        struct napi_struct napi;
 144
 145        struct bpf_prog __rcu *xdp_prog;
 146
 147        struct virtnet_rq_stats stats;
 148
 149        /* Chain pages by the private ptr. */
 150        struct page *pages;
 151
 152        /* Average packet length for mergeable receive buffers. */
 153        struct ewma_pkt_len mrg_avg_pkt_len;
 154
 155        /* Page frag for packet buffer allocation. */
 156        struct page_frag alloc_frag;
 157
 158        /* RX: fragments + linear part + virtio header */
 159        struct scatterlist sg[MAX_SKB_FRAGS + 2];
 160
 161        /* Min single buffer size for mergeable buffers case. */
 162        unsigned int min_buf_len;
 163
 164        /* Name of this receive queue: input.$index */
 165        char name[40];
 166
 167        struct xdp_rxq_info xdp_rxq;
 168};
 169
 170/* Control VQ buffers: protected by the rtnl lock */
 171struct control_buf {
 172        struct virtio_net_ctrl_hdr hdr;
 173        virtio_net_ctrl_ack status;
 174        struct virtio_net_ctrl_mq mq;
 175        u8 promisc;
 176        u8 allmulti;
 177        __virtio16 vid;
 178        __virtio64 offloads;
 179};
 180
 181struct virtnet_info {
 182        struct virtio_device *vdev;
 183        struct virtqueue *cvq;
 184        struct net_device *dev;
 185        struct send_queue *sq;
 186        struct receive_queue *rq;
 187        unsigned int status;
 188
 189        /* Max # of queue pairs supported by the device */
 190        u16 max_queue_pairs;
 191
 192        /* # of queue pairs currently used by the driver */
 193        u16 curr_queue_pairs;
 194
 195        /* # of XDP queue pairs currently used by the driver */
 196        u16 xdp_queue_pairs;
 197
        /* xdp_queue_pairs may be 0 even when XDP is loaded, so track
         * XDP state separately. */
 199        bool xdp_enabled;
 200
 201        /* I like... big packets and I cannot lie! */
 202        bool big_packets;
 203
 204        /* Host will merge rx buffers for big packets (shake it! shake it!) */
 205        bool mergeable_rx_bufs;
 206
 207        /* Has control virtqueue */
 208        bool has_cvq;
 209
 210        /* Host can handle any s/g split between our header and packet data */
 211        bool any_header_sg;
 212
 213        /* Packet virtio header size */
 214        u8 hdr_len;
 215
 216        /* Work struct for refilling if we run low on memory. */
 217        struct delayed_work refill;
 218
 219        /* Work struct for config space updates */
 220        struct work_struct config_work;
 221
        /* Is affinity hint set for virtqueues? */
 223        bool affinity_hint_set;
 224
 225        /* CPU hotplug instances for online & dead */
 226        struct hlist_node node;
 227        struct hlist_node node_dead;
 228
 229        struct control_buf *ctrl;
 230
 231        /* Ethtool settings */
 232        u8 duplex;
 233        u32 speed;
 234
 235        unsigned long guest_offloads;
 236        unsigned long guest_offloads_capable;
 237
 238        /* failover when STANDBY feature enabled */
 239        struct failover *failover;
 240};
 241
 242struct padded_vnet_hdr {
 243        struct virtio_net_hdr_mrg_rxbuf hdr;
 244        /*
 245         * hdr is in a separate sg buffer, and data sg buffer shares same page
 246         * with this header sg. This padding makes next sg 16 byte aligned
 247         * after the header.
 248         */
 249        char padding[4];
 250};
 251
 252static bool is_xdp_frame(void *ptr)
 253{
 254        return (unsigned long)ptr & VIRTIO_XDP_FLAG;
 255}
 256
 257static void *xdp_to_ptr(struct xdp_frame *ptr)
 258{
 259        return (void *)((unsigned long)ptr | VIRTIO_XDP_FLAG);
 260}
 261
 262static struct xdp_frame *ptr_to_xdp(void *ptr)
 263{
 264        return (struct xdp_frame *)((unsigned long)ptr & ~VIRTIO_XDP_FLAG);
 265}
 266
 267/* Converting between virtqueue no. and kernel tx/rx queue no.
 268 * 0:rx0 1:tx0 2:rx1 3:tx1 ... 2N:rxN 2N+1:txN 2N+2:cvq
 269 */
 270static int vq2txq(struct virtqueue *vq)
 271{
 272        return (vq->index - 1) / 2;
 273}
 274
 275static int txq2vq(int txq)
 276{
 277        return txq * 2 + 1;
 278}
 279
 280static int vq2rxq(struct virtqueue *vq)
 281{
 282        return vq->index / 2;
 283}
 284
 285static int rxq2vq(int rxq)
 286{
 287        return rxq * 2;
 288}
 289
 290static inline struct virtio_net_hdr_mrg_rxbuf *skb_vnet_hdr(struct sk_buff *skb)
 291{
 292        return (struct virtio_net_hdr_mrg_rxbuf *)skb->cb;
 293}
 294
/*
 * page->private is used to chain pages for big packets; put the whole
 * most recently used list at the beginning for reuse.
 */
 299static void give_pages(struct receive_queue *rq, struct page *page)
 300{
 301        struct page *end;
 302
 303        /* Find end of list, sew whole thing into vi->rq.pages. */
 304        for (end = page; end->private; end = (struct page *)end->private);
 305        end->private = (unsigned long)rq->pages;
 306        rq->pages = page;
 307}
 308
 309static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
 310{
 311        struct page *p = rq->pages;
 312
 313        if (p) {
 314                rq->pages = (struct page *)p->private;
 315                /* clear private here, it is used to chain pages */
 316                p->private = 0;
 317        } else
 318                p = alloc_page(gfp_mask);
 319        return p;
 320}
 321
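/* Disable further virtqueue callbacks before scheduling NAPI, so the device
 * does not keep raising interrupts while the queue is being polled.
 */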
 322static void virtqueue_napi_schedule(struct napi_struct *napi,
 323                                    struct virtqueue *vq)
 324{
 325        if (napi_schedule_prep(napi)) {
 326                virtqueue_disable_cb(vq);
 327                __napi_schedule(napi);
 328        }
 329}
 330
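/* Re-enable virtqueue callbacks once NAPI is done. virtqueue_poll() on the
 * opaque value from virtqueue_enable_cb_prepare() catches buffers that were
 * added in the race window and reschedules NAPI instead of missing them.
 */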
 331static void virtqueue_napi_complete(struct napi_struct *napi,
 332                                    struct virtqueue *vq, int processed)
 333{
 334        int opaque;
 335
 336        opaque = virtqueue_enable_cb_prepare(vq);
 337        if (napi_complete_done(napi, processed)) {
 338                if (unlikely(virtqueue_poll(vq, opaque)))
 339                        virtqueue_napi_schedule(napi, vq);
 340        } else {
 341                virtqueue_disable_cb(vq);
 342        }
 343}
 344
 345static void skb_xmit_done(struct virtqueue *vq)
 346{
 347        struct virtnet_info *vi = vq->vdev->priv;
 348        struct napi_struct *napi = &vi->sq[vq2txq(vq)].napi;
 349
 350        /* Suppress further interrupts. */
 351        virtqueue_disable_cb(vq);
 352
 353        if (napi->weight)
 354                virtqueue_napi_schedule(napi, vq);
 355        else
 356                /* We were probably waiting for more output buffers. */
 357                netif_wake_subqueue(vi->dev, vq2txq(vq));
 358}
 359
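/* For mergeable buffers the per-buffer context is not a pointer but a packed
 * value: the headroom in the bits at and above MRG_CTX_HEADER_SHIFT, and the
 * buffer truesize in the bits below it.
 */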
 360#define MRG_CTX_HEADER_SHIFT 22
 361static void *mergeable_len_to_ctx(unsigned int truesize,
 362                                  unsigned int headroom)
 363{
 364        return (void *)(unsigned long)((headroom << MRG_CTX_HEADER_SHIFT) | truesize);
 365}
 366
 367static unsigned int mergeable_ctx_to_headroom(void *mrg_ctx)
 368{
 369        return (unsigned long)mrg_ctx >> MRG_CTX_HEADER_SHIFT;
 370}
 371
 372static unsigned int mergeable_ctx_to_truesize(void *mrg_ctx)
 373{
 374        return (unsigned long)mrg_ctx & ((1 << MRG_CTX_HEADER_SHIFT) - 1);
 375}
 376
 377/* Called from bottom half context */
 378static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 379                                   struct receive_queue *rq,
 380                                   struct page *page, unsigned int offset,
 381                                   unsigned int len, unsigned int truesize,
 382                                   bool hdr_valid, unsigned int metasize,
 383                                   bool whole_page)
 384{
 385        struct sk_buff *skb;
 386        struct virtio_net_hdr_mrg_rxbuf *hdr;
 387        unsigned int copy, hdr_len, hdr_padded_len;
 388        struct page *page_to_free = NULL;
 389        int tailroom, shinfo_size;
 390        char *p, *hdr_p, *buf;
 391
 392        p = page_address(page) + offset;
 393        hdr_p = p;
 394
 395        hdr_len = vi->hdr_len;
 396        if (vi->mergeable_rx_bufs)
 397                hdr_padded_len = sizeof(*hdr);
 398        else
 399                hdr_padded_len = sizeof(struct padded_vnet_hdr);
 400
        /* If whole_page, there is an offset between the beginning of the
         * data and the allocated space; otherwise the data and the
         * allocated space are aligned.
         *
         * Buffers with whole_page use PAGE_SIZE as alloc size, see
         * add_recvbuf_mergeable() + get_mergeable_buf_len()
         */
        if (whole_page) {
 412                truesize = PAGE_SIZE;
 413
                /* page may be a head page, so get the buffer via p, not
                 * via the page pointer
                 */
 417                tailroom = truesize - len - offset_in_page(p);
 418                buf = (char *)((unsigned long)p & PAGE_MASK);
 419        } else {
 420                tailroom = truesize - len;
 421                buf = p;
 422        }
 423
 424        len -= hdr_len;
 425        offset += hdr_padded_len;
 426        p += hdr_padded_len;
 427
 428        shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 429
        /* Large packet with enough tailroom: build the skb around the
         * existing buffer instead of copying */
 431        if (!NET_IP_ALIGN && len > GOOD_COPY_LEN && tailroom >= shinfo_size) {
 432                skb = build_skb(buf, truesize);
 433                if (unlikely(!skb))
 434                        return NULL;
 435
 436                skb_reserve(skb, p - buf);
 437                skb_put(skb, len);
 438                goto ok;
 439        }
 440
 441        /* copy small packet so we can reuse these pages for small data */
 442        skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);
 443        if (unlikely(!skb))
 444                return NULL;
 445
        /* Copy the whole frame if it fits in skb->head; otherwise
         * we let virtio_net_hdr_to_skb() and GRO pull headers as needed.
         */
 449        if (len <= skb_tailroom(skb))
 450                copy = len;
 451        else
 452                copy = ETH_HLEN + metasize;
 453        skb_put_data(skb, p, copy);
 454
 455        len -= copy;
 456        offset += copy;
 457
 458        if (vi->mergeable_rx_bufs) {
 459                if (len)
 460                        skb_add_rx_frag(skb, 0, page, offset, len, truesize);
 461                else
 462                        page_to_free = page;
 463                goto ok;
 464        }
 465
        /*
         * Verify that we can indeed put this data into an skb.
         * This is here to handle cases when the device erroneously
         * tries to receive more than is possible. This usually
         * indicates a broken device.
         */
 472        if (unlikely(len > MAX_SKB_FRAGS * PAGE_SIZE)) {
 473                net_dbg_ratelimited("%s: too much data\n", skb->dev->name);
 474                dev_kfree_skb(skb);
 475                return NULL;
 476        }
 477        BUG_ON(offset >= PAGE_SIZE);
 478        while (len) {
 479                unsigned int frag_size = min((unsigned)PAGE_SIZE - offset, len);
 480                skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset,
 481                                frag_size, truesize);
 482                len -= frag_size;
 483                page = (struct page *)page->private;
 484                offset = 0;
 485        }
 486
 487        if (page)
 488                give_pages(rq, page);
 489
 490ok:
 491        /* hdr_valid means no XDP, so we can copy the vnet header */
 492        if (hdr_valid) {
 493                hdr = skb_vnet_hdr(skb);
 494                memcpy(hdr, hdr_p, hdr_len);
 495        }
 496        if (page_to_free)
 497                put_page(page_to_free);
 498
 499        if (metasize) {
 500                __skb_pull(skb, metasize);
 501                skb_metadata_set(skb, metasize);
 502        }
 503
 504        return skb;
 505}
 506
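/* Queue a single xdp_frame for transmission: prepend a zeroed vnet header in
 * the frame's headroom and post the frame as one output buffer, tagged so the
 * completion path can tell it apart from an skb.
 */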
 507static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
 508                                   struct send_queue *sq,
 509                                   struct xdp_frame *xdpf)
 510{
 511        struct virtio_net_hdr_mrg_rxbuf *hdr;
 512        int err;
 513
 514        if (unlikely(xdpf->headroom < vi->hdr_len))
 515                return -EOVERFLOW;
 516
 517        /* Make room for virtqueue hdr (also change xdpf->headroom?) */
 518        xdpf->data -= vi->hdr_len;
 519        /* Zero header and leave csum up to XDP layers */
 520        hdr = xdpf->data;
 521        memset(hdr, 0, vi->hdr_len);
 522        xdpf->len   += vi->hdr_len;
 523
 524        sg_init_one(sq->sg, xdpf->data, xdpf->len);
 525
 526        err = virtqueue_add_outbuf(sq->vq, sq->sg, 1, xdp_to_ptr(xdpf),
 527                                   GFP_ATOMIC);
 528        if (unlikely(err))
 529                return -ENOSPC; /* Caller handle free/refcnt */
 530
 531        return 0;
 532}
 533
/* When vi->curr_queue_pairs > nr_cpu_ids, the txq/sq is only used for xdp tx on
 * the current cpu, so it does not need to be locked.
 *
 * Here we use a macro instead of inline functions because we have to deal with
 * three issues at the same time: 1. the choice of sq, 2. deciding whether to
 * lock/unlock the txq, and 3. keeping sparse happy. It is difficult for a pair
 * of inline functions to solve all three problems at the same time.
 */
 542#define virtnet_xdp_get_sq(vi) ({                                       \
 543        struct netdev_queue *txq;                                       \
 544        typeof(vi) v = (vi);                                            \
 545        unsigned int qp;                                                \
 546                                                                        \
 547        if (v->curr_queue_pairs > nr_cpu_ids) {                         \
 548                qp = v->curr_queue_pairs - v->xdp_queue_pairs;          \
 549                qp += smp_processor_id();                               \
 550                txq = netdev_get_tx_queue(v->dev, qp);                  \
 551                __netif_tx_acquire(txq);                                \
 552        } else {                                                        \
 553                qp = smp_processor_id() % v->curr_queue_pairs;          \
 554                txq = netdev_get_tx_queue(v->dev, qp);                  \
 555                __netif_tx_lock(txq, raw_smp_processor_id());           \
 556        }                                                               \
 557        v->sq + qp;                                                     \
 558})
 559
 560#define virtnet_xdp_put_sq(vi, q) {                                     \
 561        struct netdev_queue *txq;                                       \
 562        typeof(vi) v = (vi);                                            \
 563                                                                        \
 564        txq = netdev_get_tx_queue(v->dev, (q) - v->sq);                 \
 565        if (v->curr_queue_pairs > nr_cpu_ids)                           \
 566                __netif_tx_release(txq);                                \
 567        else                                                            \
 568                __netif_tx_unlock(txq);                                 \
 569}
 570
 571static int virtnet_xdp_xmit(struct net_device *dev,
 572                            int n, struct xdp_frame **frames, u32 flags)
 573{
 574        struct virtnet_info *vi = netdev_priv(dev);
 575        struct receive_queue *rq = vi->rq;
 576        struct bpf_prog *xdp_prog;
 577        struct send_queue *sq;
 578        unsigned int len;
 579        int packets = 0;
 580        int bytes = 0;
 581        int nxmit = 0;
 582        int kicks = 0;
 583        void *ptr;
 584        int ret;
 585        int i;
 586
        /* Only allow ndo_xdp_xmit if XDP is loaded on dev, as this
         * indicates XDP resources have been successfully allocated.
         */
 590        xdp_prog = rcu_access_pointer(rq->xdp_prog);
 591        if (!xdp_prog)
 592                return -ENXIO;
 593
 594        sq = virtnet_xdp_get_sq(vi);
 595
 596        if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) {
 597                ret = -EINVAL;
 598                goto out;
 599        }
 600
 601        /* Free up any pending old buffers before queueing new ones. */
 602        while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
 603                if (likely(is_xdp_frame(ptr))) {
 604                        struct xdp_frame *frame = ptr_to_xdp(ptr);
 605
 606                        bytes += frame->len;
 607                        xdp_return_frame(frame);
 608                } else {
 609                        struct sk_buff *skb = ptr;
 610
 611                        bytes += skb->len;
 612                        napi_consume_skb(skb, false);
 613                }
 614                packets++;
 615        }
 616
 617        for (i = 0; i < n; i++) {
 618                struct xdp_frame *xdpf = frames[i];
 619
 620                if (__virtnet_xdp_xmit_one(vi, sq, xdpf))
 621                        break;
 622                nxmit++;
 623        }
 624        ret = nxmit;
 625
 626        if (flags & XDP_XMIT_FLUSH) {
 627                if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq))
 628                        kicks = 1;
 629        }
 630out:
 631        u64_stats_update_begin(&sq->stats.syncp);
 632        sq->stats.bytes += bytes;
 633        sq->stats.packets += packets;
 634        sq->stats.xdp_tx += n;
 635        sq->stats.xdp_tx_drops += n - nxmit;
 636        sq->stats.kicks += kicks;
 637        u64_stats_update_end(&sq->stats.syncp);
 638
 639        virtnet_xdp_put_sq(vi, sq);
 640        return ret;
 641}
 642
 643static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
 644{
 645        return vi->xdp_enabled ? VIRTIO_XDP_HEADROOM : 0;
 646}
 647
 648/* We copy the packet for XDP in the following cases:
 649 *
 650 * 1) Packet is scattered across multiple rx buffers.
 651 * 2) Headroom space is insufficient.
 652 *
 * This is inefficient, but it's a temporary condition that we hit right
 * after XDP is enabled and until the queue is refilled with large
 * buffers with sufficient headroom - so it should affect at most
 * queue-size packets.
 * Afterwards, the conditions to enable XDP should preclude the
 * underlying device from sending packets across multiple buffers
 * (num_buf > 1), and we make sure buffers have enough headroom.
 */
 662static struct page *xdp_linearize_page(struct receive_queue *rq,
 663                                       u16 *num_buf,
 664                                       struct page *p,
 665                                       int offset,
 666                                       int page_off,
 667                                       unsigned int *len)
 668{
 669        struct page *page = alloc_page(GFP_ATOMIC);
 670
 671        if (!page)
 672                return NULL;
 673
 674        memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
 675        page_off += *len;
 676
 677        while (--*num_buf) {
 678                int tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 679                unsigned int buflen;
 680                void *buf;
 681                int off;
 682
 683                buf = virtqueue_get_buf(rq->vq, &buflen);
 684                if (unlikely(!buf))
 685                        goto err_buf;
 686
 687                p = virt_to_head_page(buf);
 688                off = buf - page_address(p);
 689
                /* guard against a misconfigured or uncooperative backend that
                 * is sending packets larger than the MTU.
                 */
 693                if ((page_off + buflen + tailroom) > PAGE_SIZE) {
 694                        put_page(p);
 695                        goto err_buf;
 696                }
 697
 698                memcpy(page_address(page) + page_off,
 699                       page_address(p) + off, buflen);
 700                page_off += buflen;
 701                put_page(p);
 702        }
 703
 704        /* Headroom does not contribute to packet length */
 705        *len = page_off - VIRTIO_XDP_HEADROOM;
 706        return page;
 707err_buf:
 708        __free_pages(page, 0);
 709        return NULL;
 710}
 711
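/* Receive path for small (non-mergeable, non-big) buffers. The buffer layout
 * matches add_recvbuf_small(): VIRTNET_RX_PAD, the XDP headroom (if any), the
 * vnet header, then up to GOOD_PACKET_LEN of packet data, with room left for
 * struct skb_shared_info so that build_skb() can be used.
 */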
 712static struct sk_buff *receive_small(struct net_device *dev,
 713                                     struct virtnet_info *vi,
 714                                     struct receive_queue *rq,
 715                                     void *buf, void *ctx,
 716                                     unsigned int len,
 717                                     unsigned int *xdp_xmit,
 718                                     struct virtnet_rq_stats *stats)
 719{
 720        struct sk_buff *skb;
 721        struct bpf_prog *xdp_prog;
 722        unsigned int xdp_headroom = (unsigned long)ctx;
 723        unsigned int header_offset = VIRTNET_RX_PAD + xdp_headroom;
 724        unsigned int headroom = vi->hdr_len + header_offset;
 725        unsigned int buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
 726                              SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 727        struct page *page = virt_to_head_page(buf);
 728        unsigned int delta = 0;
 729        struct page *xdp_page;
 730        int err;
 731        unsigned int metasize = 0;
 732
 733        len -= vi->hdr_len;
 734        stats->bytes += len;
 735
 736        if (unlikely(len > GOOD_PACKET_LEN)) {
 737                pr_debug("%s: rx error: len %u exceeds max size %d\n",
 738                         dev->name, len, GOOD_PACKET_LEN);
 739                dev->stats.rx_length_errors++;
 740                goto err_len;
 741        }
 742        rcu_read_lock();
 743        xdp_prog = rcu_dereference(rq->xdp_prog);
 744        if (xdp_prog) {
 745                struct virtio_net_hdr_mrg_rxbuf *hdr = buf + header_offset;
 746                struct xdp_frame *xdpf;
 747                struct xdp_buff xdp;
 748                void *orig_data;
 749                u32 act;
 750
 751                if (unlikely(hdr->hdr.gso_type))
 752                        goto err_xdp;
 753
 754                if (unlikely(xdp_headroom < virtnet_get_headroom(vi))) {
 755                        int offset = buf - page_address(page) + header_offset;
 756                        unsigned int tlen = len + vi->hdr_len;
 757                        u16 num_buf = 1;
 758
 759                        xdp_headroom = virtnet_get_headroom(vi);
 760                        header_offset = VIRTNET_RX_PAD + xdp_headroom;
 761                        headroom = vi->hdr_len + header_offset;
 762                        buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
 763                                 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 764                        xdp_page = xdp_linearize_page(rq, &num_buf, page,
 765                                                      offset, header_offset,
 766                                                      &tlen);
 767                        if (!xdp_page)
 768                                goto err_xdp;
 769
 770                        buf = page_address(xdp_page);
 771                        put_page(page);
 772                        page = xdp_page;
 773                }
 774
 775                xdp_init_buff(&xdp, buflen, &rq->xdp_rxq);
 776                xdp_prepare_buff(&xdp, buf + VIRTNET_RX_PAD + vi->hdr_len,
 777                                 xdp_headroom, len, true);
 778                orig_data = xdp.data;
 779                act = bpf_prog_run_xdp(xdp_prog, &xdp);
 780                stats->xdp_packets++;
 781
 782                switch (act) {
 783                case XDP_PASS:
 784                        /* Recalculate length in case bpf program changed it */
 785                        delta = orig_data - xdp.data;
 786                        len = xdp.data_end - xdp.data;
 787                        metasize = xdp.data - xdp.data_meta;
 788                        break;
 789                case XDP_TX:
 790                        stats->xdp_tx++;
 791                        xdpf = xdp_convert_buff_to_frame(&xdp);
 792                        if (unlikely(!xdpf))
 793                                goto err_xdp;
 794                        err = virtnet_xdp_xmit(dev, 1, &xdpf, 0);
 795                        if (unlikely(!err)) {
 796                                xdp_return_frame_rx_napi(xdpf);
 797                        } else if (unlikely(err < 0)) {
 798                                trace_xdp_exception(vi->dev, xdp_prog, act);
 799                                goto err_xdp;
 800                        }
 801                        *xdp_xmit |= VIRTIO_XDP_TX;
 802                        rcu_read_unlock();
 803                        goto xdp_xmit;
 804                case XDP_REDIRECT:
 805                        stats->xdp_redirects++;
 806                        err = xdp_do_redirect(dev, &xdp, xdp_prog);
 807                        if (err)
 808                                goto err_xdp;
 809                        *xdp_xmit |= VIRTIO_XDP_REDIR;
 810                        rcu_read_unlock();
 811                        goto xdp_xmit;
 812                default:
 813                        bpf_warn_invalid_xdp_action(act);
 814                        fallthrough;
 815                case XDP_ABORTED:
 816                        trace_xdp_exception(vi->dev, xdp_prog, act);
 817                        goto err_xdp;
 818                case XDP_DROP:
 819                        goto err_xdp;
 820                }
 821        }
 822        rcu_read_unlock();
 823
 824        skb = build_skb(buf, buflen);
 825        if (!skb) {
 826                put_page(page);
 827                goto err;
 828        }
 829        skb_reserve(skb, headroom - delta);
 830        skb_put(skb, len);
 831        if (!xdp_prog) {
 832                buf += header_offset;
 833                memcpy(skb_vnet_hdr(skb), buf, vi->hdr_len);
 834        } /* keep zeroed vnet hdr since XDP is loaded */
 835
 836        if (metasize)
 837                skb_metadata_set(skb, metasize);
 838
 839err:
 840        return skb;
 841
 842err_xdp:
 843        rcu_read_unlock();
 844        stats->xdp_drops++;
 845err_len:
 846        stats->drops++;
 847        put_page(page);
 848xdp_xmit:
 849        return NULL;
 850}
 851
 852static struct sk_buff *receive_big(struct net_device *dev,
 853                                   struct virtnet_info *vi,
 854                                   struct receive_queue *rq,
 855                                   void *buf,
 856                                   unsigned int len,
 857                                   struct virtnet_rq_stats *stats)
 858{
 859        struct page *page = buf;
 860        struct sk_buff *skb =
 861                page_to_skb(vi, rq, page, 0, len, PAGE_SIZE, true, 0, 0);
 862
 863        stats->bytes += len - vi->hdr_len;
 864        if (unlikely(!skb))
 865                goto err;
 866
 867        return skb;
 868
 869err:
 870        stats->drops++;
 871        give_pages(rq, page);
 872        return NULL;
 873}
 874
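/* Receive path for mergeable buffers. The first buffer starts with the vnet
 * header, whose num_buffers field says how many descriptors the packet spans;
 * the remaining buffers are pulled from the virtqueue and attached to the skb
 * as fragments. The per-buffer ctx packs the buffer truesize and headroom.
 */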
 875static struct sk_buff *receive_mergeable(struct net_device *dev,
 876                                         struct virtnet_info *vi,
 877                                         struct receive_queue *rq,
 878                                         void *buf,
 879                                         void *ctx,
 880                                         unsigned int len,
 881                                         unsigned int *xdp_xmit,
 882                                         struct virtnet_rq_stats *stats)
 883{
 884        struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
 885        u16 num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
 886        struct page *page = virt_to_head_page(buf);
 887        int offset = buf - page_address(page);
 888        struct sk_buff *head_skb, *curr_skb;
 889        struct bpf_prog *xdp_prog;
 890        unsigned int truesize = mergeable_ctx_to_truesize(ctx);
 891        unsigned int headroom = mergeable_ctx_to_headroom(ctx);
 892        unsigned int metasize = 0;
 893        unsigned int frame_sz;
 894        int err;
 895
 896        head_skb = NULL;
 897        stats->bytes += len - vi->hdr_len;
 898
 899        if (unlikely(len > truesize)) {
 900                pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
 901                         dev->name, len, (unsigned long)ctx);
 902                dev->stats.rx_length_errors++;
 903                goto err_skb;
 904        }
 905        rcu_read_lock();
 906        xdp_prog = rcu_dereference(rq->xdp_prog);
 907        if (xdp_prog) {
 908                struct xdp_frame *xdpf;
 909                struct page *xdp_page;
 910                struct xdp_buff xdp;
 911                void *data;
 912                u32 act;
 913
 914                /* Transient failure which in theory could occur if
 915                 * in-flight packets from before XDP was enabled reach
 916                 * the receive path after XDP is loaded.
 917                 */
 918                if (unlikely(hdr->hdr.gso_type))
 919                        goto err_xdp;
 920
 921                /* Buffers with headroom use PAGE_SIZE as alloc size,
 922                 * see add_recvbuf_mergeable() + get_mergeable_buf_len()
 923                 */
 924                frame_sz = headroom ? PAGE_SIZE : truesize;
 925
                /* This happens when the rx buffer size is underestimated
                 * or headroom is not enough because the buffer was
                 * refilled before XDP was set. This should only happen
                 * for the first several packets, so we don't care much
                 * about its performance.
                 */
 932                if (unlikely(num_buf > 1 ||
 933                             headroom < virtnet_get_headroom(vi))) {
 934                        /* linearize data for XDP */
 935                        xdp_page = xdp_linearize_page(rq, &num_buf,
 936                                                      page, offset,
 937                                                      VIRTIO_XDP_HEADROOM,
 938                                                      &len);
 939                        frame_sz = PAGE_SIZE;
 940
 941                        if (!xdp_page)
 942                                goto err_xdp;
 943                        offset = VIRTIO_XDP_HEADROOM;
 944                } else {
 945                        xdp_page = page;
 946                }
 947
 948                /* Allow consuming headroom but reserve enough space to push
 949                 * the descriptor on if we get an XDP_TX return code.
 950                 */
 951                data = page_address(xdp_page) + offset;
 952                xdp_init_buff(&xdp, frame_sz - vi->hdr_len, &rq->xdp_rxq);
 953                xdp_prepare_buff(&xdp, data - VIRTIO_XDP_HEADROOM + vi->hdr_len,
 954                                 VIRTIO_XDP_HEADROOM, len - vi->hdr_len, true);
 955
 956                act = bpf_prog_run_xdp(xdp_prog, &xdp);
 957                stats->xdp_packets++;
 958
 959                switch (act) {
 960                case XDP_PASS:
 961                        metasize = xdp.data - xdp.data_meta;
 962
                        /* recalculate offset to account for any header
                         * adjustments and subtract the metasize so that
                         * page_to_skb() copies the metadata. Note the other
                         * cases do not build an skb and do not use offset.
                         */
 968                        offset = xdp.data - page_address(xdp_page) -
 969                                 vi->hdr_len - metasize;
 970
 971                        /* recalculate len if xdp.data, xdp.data_end or
 972                         * xdp.data_meta were adjusted
 973                         */
 974                        len = xdp.data_end - xdp.data + vi->hdr_len + metasize;
 975                        /* We can only create skb based on xdp_page. */
 976                        if (unlikely(xdp_page != page)) {
 977                                rcu_read_unlock();
 978                                put_page(page);
 979                                head_skb = page_to_skb(vi, rq, xdp_page, offset,
 980                                                       len, PAGE_SIZE, false,
 981                                                       metasize, true);
 982                                return head_skb;
 983                        }
 984                        break;
 985                case XDP_TX:
 986                        stats->xdp_tx++;
 987                        xdpf = xdp_convert_buff_to_frame(&xdp);
 988                        if (unlikely(!xdpf))
 989                                goto err_xdp;
 990                        err = virtnet_xdp_xmit(dev, 1, &xdpf, 0);
 991                        if (unlikely(!err)) {
 992                                xdp_return_frame_rx_napi(xdpf);
 993                        } else if (unlikely(err < 0)) {
 994                                trace_xdp_exception(vi->dev, xdp_prog, act);
 995                                if (unlikely(xdp_page != page))
 996                                        put_page(xdp_page);
 997                                goto err_xdp;
 998                        }
 999                        *xdp_xmit |= VIRTIO_XDP_TX;
1000                        if (unlikely(xdp_page != page))
1001                                put_page(page);
1002                        rcu_read_unlock();
1003                        goto xdp_xmit;
1004                case XDP_REDIRECT:
1005                        stats->xdp_redirects++;
1006                        err = xdp_do_redirect(dev, &xdp, xdp_prog);
1007                        if (err) {
1008                                if (unlikely(xdp_page != page))
1009                                        put_page(xdp_page);
1010                                goto err_xdp;
1011                        }
1012                        *xdp_xmit |= VIRTIO_XDP_REDIR;
1013                        if (unlikely(xdp_page != page))
1014                                put_page(page);
1015                        rcu_read_unlock();
1016                        goto xdp_xmit;
1017                default:
1018                        bpf_warn_invalid_xdp_action(act);
1019                        fallthrough;
1020                case XDP_ABORTED:
1021                        trace_xdp_exception(vi->dev, xdp_prog, act);
1022                        fallthrough;
1023                case XDP_DROP:
1024                        if (unlikely(xdp_page != page))
1025                                __free_pages(xdp_page, 0);
1026                        goto err_xdp;
1027                }
1028        }
1029        rcu_read_unlock();
1030
1031        head_skb = page_to_skb(vi, rq, page, offset, len, truesize, !xdp_prog,
1032                               metasize, !!headroom);
1033        curr_skb = head_skb;
1034
1035        if (unlikely(!curr_skb))
1036                goto err_skb;
1037        while (--num_buf) {
1038                int num_skb_frags;
1039
1040                buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
1041                if (unlikely(!buf)) {
1042                        pr_debug("%s: rx error: %d buffers out of %d missing\n",
1043                                 dev->name, num_buf,
1044                                 virtio16_to_cpu(vi->vdev,
1045                                                 hdr->num_buffers));
1046                        dev->stats.rx_length_errors++;
1047                        goto err_buf;
1048                }
1049
1050                stats->bytes += len;
1051                page = virt_to_head_page(buf);
1052
1053                truesize = mergeable_ctx_to_truesize(ctx);
1054                if (unlikely(len > truesize)) {
1055                        pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
1056                                 dev->name, len, (unsigned long)ctx);
1057                        dev->stats.rx_length_errors++;
1058                        goto err_skb;
1059                }
1060
1061                num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
1062                if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
1063                        struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);
1064
1065                        if (unlikely(!nskb))
1066                                goto err_skb;
1067                        if (curr_skb == head_skb)
1068                                skb_shinfo(curr_skb)->frag_list = nskb;
1069                        else
1070                                curr_skb->next = nskb;
1071                        curr_skb = nskb;
1072                        head_skb->truesize += nskb->truesize;
1073                        num_skb_frags = 0;
1074                }
1075                if (curr_skb != head_skb) {
1076                        head_skb->data_len += len;
1077                        head_skb->len += len;
1078                        head_skb->truesize += truesize;
1079                }
1080                offset = buf - page_address(page);
1081                if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
1082                        put_page(page);
1083                        skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
1084                                             len, truesize);
1085                } else {
1086                        skb_add_rx_frag(curr_skb, num_skb_frags, page,
1087                                        offset, len, truesize);
1088                }
1089        }
1090
1091        ewma_pkt_len_add(&rq->mrg_avg_pkt_len, head_skb->len);
1092        return head_skb;
1093
1094err_xdp:
1095        rcu_read_unlock();
1096        stats->xdp_drops++;
1097err_skb:
1098        put_page(page);
1099        while (num_buf-- > 1) {
1100                buf = virtqueue_get_buf(rq->vq, &len);
1101                if (unlikely(!buf)) {
1102                        pr_debug("%s: rx error: %d buffers missing\n",
1103                                 dev->name, num_buf);
1104                        dev->stats.rx_length_errors++;
1105                        break;
1106                }
1107                stats->bytes += len;
1108                page = virt_to_head_page(buf);
1109                put_page(page);
1110        }
1111err_buf:
1112        stats->drops++;
1113        dev_kfree_skb(head_skb);
1114xdp_xmit:
1115        return NULL;
1116}
1117
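/* Dispatch one received buffer to the matching receive path, then translate
 * the vnet header into skb checksum/GSO state and hand the skb to GRO.
 */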
1118static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
1119                        void *buf, unsigned int len, void **ctx,
1120                        unsigned int *xdp_xmit,
1121                        struct virtnet_rq_stats *stats)
1122{
1123        struct net_device *dev = vi->dev;
1124        struct sk_buff *skb;
1125        struct virtio_net_hdr_mrg_rxbuf *hdr;
1126
1127        if (unlikely(len < vi->hdr_len + ETH_HLEN)) {
1128                pr_debug("%s: short packet %i\n", dev->name, len);
1129                dev->stats.rx_length_errors++;
1130                if (vi->mergeable_rx_bufs) {
1131                        put_page(virt_to_head_page(buf));
1132                } else if (vi->big_packets) {
1133                        give_pages(rq, buf);
1134                } else {
1135                        put_page(virt_to_head_page(buf));
1136                }
1137                return;
1138        }
1139
1140        if (vi->mergeable_rx_bufs)
1141                skb = receive_mergeable(dev, vi, rq, buf, ctx, len, xdp_xmit,
1142                                        stats);
1143        else if (vi->big_packets)
1144                skb = receive_big(dev, vi, rq, buf, len, stats);
1145        else
1146                skb = receive_small(dev, vi, rq, buf, ctx, len, xdp_xmit, stats);
1147
1148        if (unlikely(!skb))
1149                return;
1150
1151        hdr = skb_vnet_hdr(skb);
1152
1153        if (hdr->hdr.flags & VIRTIO_NET_HDR_F_DATA_VALID)
1154                skb->ip_summed = CHECKSUM_UNNECESSARY;
1155
1156        if (virtio_net_hdr_to_skb(skb, &hdr->hdr,
1157                                  virtio_is_little_endian(vi->vdev))) {
1158                net_warn_ratelimited("%s: bad gso: type: %u, size: %u\n",
1159                                     dev->name, hdr->hdr.gso_type,
1160                                     hdr->hdr.gso_size);
1161                goto frame_err;
1162        }
1163
1164        skb_record_rx_queue(skb, vq2rxq(rq->vq));
1165        skb->protocol = eth_type_trans(skb, dev);
1166        pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
1167                 ntohs(skb->protocol), skb->len, skb->pkt_type);
1168
1169        napi_gro_receive(&rq->napi, skb);
1170        return;
1171
1172frame_err:
1173        dev->stats.rx_frame_errors++;
1174        dev_kfree_skb(skb);
1175}
1176
/* Unlike mergeable buffers, all buffers are allocated to the
 * same size, except for the headroom. For this reason we do
 * not need to use mergeable_len_to_ctx here - it is enough
 * to store the headroom as the context, ignoring the truesize.
 */
1182static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
1183                             gfp_t gfp)
1184{
1185        struct page_frag *alloc_frag = &rq->alloc_frag;
1186        char *buf;
1187        unsigned int xdp_headroom = virtnet_get_headroom(vi);
1188        void *ctx = (void *)(unsigned long)xdp_headroom;
1189        int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
1190        int err;
1191
1192        len = SKB_DATA_ALIGN(len) +
1193              SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
1194        if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
1195                return -ENOMEM;
1196
1197        buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
1198        get_page(alloc_frag->page);
1199        alloc_frag->offset += len;
1200        sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
1201                    vi->hdr_len + GOOD_PACKET_LEN);
1202        err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
1203        if (err < 0)
1204                put_page(virt_to_head_page(buf));
1205        return err;
1206}
1207
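/* Post one "big packets" receive buffer: a chain of MAX_SKB_FRAGS + 1 pages
 * linked through page->private. sg[0] covers the vnet header and sg[1] the
 * rest of the first page; each remaining sg entry covers one whole page.
 */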
1208static int add_recvbuf_big(struct virtnet_info *vi, struct receive_queue *rq,
1209                           gfp_t gfp)
1210{
1211        struct page *first, *list = NULL;
1212        char *p;
1213        int i, err, offset;
1214
1215        sg_init_table(rq->sg, MAX_SKB_FRAGS + 2);
1216
1217        /* page in rq->sg[MAX_SKB_FRAGS + 1] is list tail */
1218        for (i = MAX_SKB_FRAGS + 1; i > 1; --i) {
1219                first = get_a_page(rq, gfp);
1220                if (!first) {
1221                        if (list)
1222                                give_pages(rq, list);
1223                        return -ENOMEM;
1224                }
1225                sg_set_buf(&rq->sg[i], page_address(first), PAGE_SIZE);
1226
1227                /* chain new page in list head to match sg */
1228                first->private = (unsigned long)list;
1229                list = first;
1230        }
1231
1232        first = get_a_page(rq, gfp);
1233        if (!first) {
1234                give_pages(rq, list);
1235                return -ENOMEM;
1236        }
1237        p = page_address(first);
1238
1239        /* rq->sg[0], rq->sg[1] share the same page */
        /* a separate rq->sg[0] for the header - required in case !any_header_sg */
1241        sg_set_buf(&rq->sg[0], p, vi->hdr_len);
1242
1243        /* rq->sg[1] for data packet, from offset */
1244        offset = sizeof(struct padded_vnet_hdr);
1245        sg_set_buf(&rq->sg[1], p + offset, PAGE_SIZE - offset);
1246
1247        /* chain first in list head */
1248        first->private = (unsigned long)list;
1249        err = virtqueue_add_inbuf(rq->vq, rq->sg, MAX_SKB_FRAGS + 2,
1250                                  first, gfp);
1251        if (err < 0)
1252                give_pages(rq, first);
1253
1254        return err;
1255}
1256
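/* Pick the buffer length for a mergeable receive buffer. With XDP headroom
 * reserved (room != 0) the whole page minus that room is used; otherwise it
 * is hdr_len plus the EWMA of recent packet sizes (clamped to
 * [min_buf_len, PAGE_SIZE - hdr_len]), rounded up to the cache line size.
 */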
1257static unsigned int get_mergeable_buf_len(struct receive_queue *rq,
1258                                          struct ewma_pkt_len *avg_pkt_len,
1259                                          unsigned int room)
1260{
1261        const size_t hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
1262        unsigned int len;
1263
1264        if (room)
1265                return PAGE_SIZE - room;
1266
1267        len = hdr_len + clamp_t(unsigned int, ewma_pkt_len_read(avg_pkt_len),
1268                                rq->min_buf_len, PAGE_SIZE - hdr_len);
1269
1270        return ALIGN(len, L1_CACHE_BYTES);
1271}
1272
1273static int add_recvbuf_mergeable(struct virtnet_info *vi,
1274                                 struct receive_queue *rq, gfp_t gfp)
1275{
1276        struct page_frag *alloc_frag = &rq->alloc_frag;
1277        unsigned int headroom = virtnet_get_headroom(vi);
1278        unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
1279        unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
1280        char *buf;
1281        void *ctx;
1282        int err;
1283        unsigned int len, hole;
1284
        /* Extra tailroom is needed to satisfy XDP's assumption. This
         * means rx frag coalescing won't work, but since we've disabled
         * GSO for XDP it won't be a big issue.
         */
1289        len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
1290        if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
1291                return -ENOMEM;
1292
1293        buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
1294        buf += headroom; /* advance address leaving hole at front of pkt */
1295        get_page(alloc_frag->page);
1296        alloc_frag->offset += len + room;
1297        hole = alloc_frag->size - alloc_frag->offset;
1298        if (hole < len + room) {
1299                /* To avoid internal fragmentation, if there is very likely not
1300                 * enough space for another buffer, add the remaining space to
1301                 * the current buffer.
1302                 */
1303                len += hole;
1304                alloc_frag->offset += hole;
1305        }
1306
1307        sg_init_one(rq->sg, buf, len);
1308        ctx = mergeable_len_to_ctx(len, headroom);
1309        err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
1310        if (err < 0)
1311                put_page(virt_to_head_page(buf));
1312
1313        return err;
1314}
1315
1316/*
1317 * Returns false if we couldn't fill entirely (OOM).
1318 *
1319 * Normally run in the receive path, but can also be run from ndo_open
1320 * before we're receiving packets, or from refill_work which is
1321 * careful to disable receiving (using napi_disable).
1322 */
1323static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
1324                          gfp_t gfp)
1325{
1326        int err;
1327        bool oom;
1328
1329        do {
1330                if (vi->mergeable_rx_bufs)
1331                        err = add_recvbuf_mergeable(vi, rq, gfp);
1332                else if (vi->big_packets)
1333                        err = add_recvbuf_big(vi, rq, gfp);
1334                else
1335                        err = add_recvbuf_small(vi, rq, gfp);
1336
1337                oom = err == -ENOMEM;
1338                if (err)
1339                        break;
1340        } while (rq->vq->num_free);
1341        if (virtqueue_kick_prepare(rq->vq) && virtqueue_notify(rq->vq)) {
1342                unsigned long flags;
1343
1344                flags = u64_stats_update_begin_irqsave(&rq->stats.syncp);
1345                rq->stats.kicks++;
1346                u64_stats_update_end_irqrestore(&rq->stats.syncp, flags);
1347        }
1348
1349        return !oom;
1350}
1351
1352static void skb_recv_done(struct virtqueue *rvq)
1353{
1354        struct virtnet_info *vi = rvq->vdev->priv;
1355        struct receive_queue *rq = &vi->rq[vq2rxq(rvq)];
1356
1357        virtqueue_napi_schedule(&rq->napi, rvq);
1358}
1359
1360static void virtnet_napi_enable(struct virtqueue *vq, struct napi_struct *napi)
1361{
1362        napi_enable(napi);
1363
        /* If all buffers were filled by the other side before we enabled NAPI,
         * we won't get another interrupt, so process any outstanding packets
         * now. Call local_bh_enable after this to trigger softIRQ processing.
         */
1368        local_bh_disable();
1369        virtqueue_napi_schedule(napi, vq);
1370        local_bh_enable();
1371}
1372
1373static void virtnet_napi_tx_enable(struct virtnet_info *vi,
1374                                   struct virtqueue *vq,
1375                                   struct napi_struct *napi)
1376{
1377        if (!napi->weight)
1378                return;
1379
1380        /* Tx napi touches cachelines on the cpu handling tx interrupts. Only
1381         * enable the feature if this is likely affine with the transmit path.
1382         */
1383        if (!vi->affinity_hint_set) {
1384                napi->weight = 0;
1385                return;
1386        }
1387
1388        return virtnet_napi_enable(vq, napi);
1389}
1390
1391static void virtnet_napi_tx_disable(struct napi_struct *napi)
1392{
1393        if (napi->weight)
1394                napi_disable(napi);
1395}
1396
1397static void refill_work(struct work_struct *work)
1398{
1399        struct virtnet_info *vi =
1400                container_of(work, struct virtnet_info, refill.work);
1401        bool still_empty;
1402        int i;
1403
1404        for (i = 0; i < vi->curr_queue_pairs; i++) {
1405                struct receive_queue *rq = &vi->rq[i];
1406
1407                napi_disable(&rq->napi);
1408                still_empty = !try_fill_recv(vi, rq, GFP_KERNEL);
1409                virtnet_napi_enable(rq->vq, &rq->napi);
1410
                /* In theory, this can happen: if we don't get any buffers in,
                 * we will *never* try to fill again.
                 */
1414                if (still_empty)
1415                        schedule_delayed_work(&vi->refill, HZ/2);
1416        }
1417}
1418
1419static int virtnet_receive(struct receive_queue *rq, int budget,
1420                           unsigned int *xdp_xmit)
1421{
1422        struct virtnet_info *vi = rq->vq->vdev->priv;
1423        struct virtnet_rq_stats stats = {};
1424        unsigned int len;
1425        void *buf;
1426        int i;
1427
1428        if (!vi->big_packets || vi->mergeable_rx_bufs) {
1429                void *ctx;
1430
1431                while (stats.packets < budget &&
1432                       (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
1433                        receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats);
1434                        stats.packets++;
1435                }
1436        } else {
1437                while (stats.packets < budget &&
1438                       (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
1439                        receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats);
1440                        stats.packets++;
1441                }
1442        }
1443
1444        if (rq->vq->num_free > min((unsigned int)budget, virtqueue_get_vring_size(rq->vq)) / 2) {
1445                if (!try_fill_recv(vi, rq, GFP_ATOMIC))
1446                        schedule_delayed_work(&vi->refill, 0);
1447        }
1448
1449        u64_stats_update_begin(&rq->stats.syncp);
1450        for (i = 0; i < VIRTNET_RQ_STATS_LEN; i++) {
1451                size_t offset = virtnet_rq_stats_desc[i].offset;
1452                u64 *item;
1453
1454                item = (u64 *)((u8 *)&rq->stats + offset);
1455                *item += *(u64 *)((u8 *)&stats + offset);
1456        }
1457        u64_stats_update_end(&rq->stats.syncp);
1458
1459        return stats.packets;
1460}
1461
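    /* Reclaim completed TX buffers from the send virtqueue.  Entries are
     * either skbs or XDP frames, distinguished by the pointer tag checked
     * in is_xdp_frame(); byte and packet totals are folded into sq->stats.
     */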
1462static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi)
1463{
1464        unsigned int len;
1465        unsigned int packets = 0;
1466        unsigned int bytes = 0;
1467        void *ptr;
1468
1469        while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
1470                if (likely(!is_xdp_frame(ptr))) {
1471                        struct sk_buff *skb = ptr;
1472
1473                        pr_debug("Sent skb %p\n", skb);
1474
1475                        bytes += skb->len;
1476                        napi_consume_skb(skb, in_napi);
1477                } else {
1478                        struct xdp_frame *frame = ptr_to_xdp(ptr);
1479
1480                        bytes += frame->len;
1481                        xdp_return_frame(frame);
1482                }
1483                packets++;
1484        }
1485
1486        /* Avoid overhead when no packets have been processed; this
1487         * happens when called speculatively from start_xmit.
1488         */
1489        if (!packets)
1490                return;
1491
1492        u64_stats_update_begin(&sq->stats.syncp);
1493        sq->stats.bytes += bytes;
1494        sq->stats.packets += packets;
1495        u64_stats_update_end(&sq->stats.syncp);
1496}
1497
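    /* The last xdp_queue_pairs of the currently active queues are dedicated
     * to XDP_TX/XDP_REDIRECT and carry raw XDP frames rather than skbs;
     * callers use this to skip skb-oriented TX processing on those queues.
     */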
1498static bool is_xdp_raw_buffer_queue(struct virtnet_info *vi, int q)
1499{
1500        if (q < (vi->curr_queue_pairs - vi->xdp_queue_pairs))
1501                return false;
1502        else if (q < vi->curr_queue_pairs)
1503                return true;
1504        else
1505                return false;
1506}
1507
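    /* Opportunistically reclaim completed TX buffers for the send queue
     * paired with this receive queue while we are already running RX NAPI,
     * waking the netdev TX queue if enough descriptors were freed.
     */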
1508static void virtnet_poll_cleantx(struct receive_queue *rq)
1509{
1510        struct virtnet_info *vi = rq->vq->vdev->priv;
1511        unsigned int index = vq2rxq(rq->vq);
1512        struct send_queue *sq = &vi->sq[index];
1513        struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, index);
1514
1515        if (!sq->napi.weight || is_xdp_raw_buffer_queue(vi, index))
1516                return;
1517
1518        if (__netif_tx_trylock(txq)) {
1519                do {
1520                        virtqueue_disable_cb(sq->vq);
1521                        free_old_xmit_skbs(sq, true);
1522                } while (unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
1523
1524                if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
1525                        netif_tx_wake_queue(txq);
1526
1527                __netif_tx_unlock(txq);
1528        }
1529}
1530
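    /* RX NAPI poll: clean the paired TX queue, receive up to budget packets,
     * complete NAPI when under budget, and flush any XDP transmit or
     * redirect work triggered while processing the ring.
     */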
1531static int virtnet_poll(struct napi_struct *napi, int budget)
1532{
1533        struct receive_queue *rq =
1534                container_of(napi, struct receive_queue, napi);
1535        struct virtnet_info *vi = rq->vq->vdev->priv;
1536        struct send_queue *sq;
1537        unsigned int received;
1538        unsigned int xdp_xmit = 0;
1539
1540        virtnet_poll_cleantx(rq);
1541
1542        received = virtnet_receive(rq, budget, &xdp_xmit);
1543
1544        /* Out of packets? */
1545        if (received < budget)
1546                virtqueue_napi_complete(napi, rq->vq, received);
1547
1548        if (xdp_xmit & VIRTIO_XDP_REDIR)
1549                xdp_do_flush();
1550
1551        if (xdp_xmit & VIRTIO_XDP_TX) {
1552                sq = virtnet_xdp_get_sq(vi);
1553                if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) {
1554                        u64_stats_update_begin(&sq->stats.syncp);
1555                        sq->stats.kicks++;
1556                        u64_stats_update_end(&sq->stats.syncp);
1557                }
1558                virtnet_xdp_put_sq(vi, sq);
1559        }
1560
1561        return received;
1562}
1563
1564static int virtnet_open(struct net_device *dev)
1565{
1566        struct virtnet_info *vi = netdev_priv(dev);
1567        int i, err;
1568
1569        for (i = 0; i < vi->max_queue_pairs; i++) {
1570                if (i < vi->curr_queue_pairs)
1571                        /* Make sure we have some buffers: if oom use wq. */
1572                        if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
1573                                schedule_delayed_work(&vi->refill, 0);
1574
1575                err = xdp_rxq_info_reg(&vi->rq[i].xdp_rxq, dev, i, vi->rq[i].napi.napi_id);
1576                if (err < 0)
1577                        return err;
1578
1579                err = xdp_rxq_info_reg_mem_model(&vi->rq[i].xdp_rxq,
1580                                                 MEM_TYPE_PAGE_SHARED, NULL);
1581                if (err < 0) {
1582                        xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq);
1583                        return err;
1584                }
1585
1586                virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
1587                virtnet_napi_tx_enable(vi, vi->sq[i].vq, &vi->sq[i].napi);
1588        }
1589
1590        return 0;
1591}
1592
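    /* TX NAPI poll: reclaim completed buffers under the netdev TX queue
     * lock, wake the queue if space freed up, then re-enable virtqueue
     * callbacks and reschedule if completions raced in before NAPI was
     * marked done.
     */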
1593static int virtnet_poll_tx(struct napi_struct *napi, int budget)
1594{
1595        struct send_queue *sq = container_of(napi, struct send_queue, napi);
1596        struct virtnet_info *vi = sq->vq->vdev->priv;
1597        unsigned int index = vq2txq(sq->vq);
1598        struct netdev_queue *txq;
1599        int opaque;
1600        bool done;
1601
1602        if (unlikely(is_xdp_raw_buffer_queue(vi, index))) {
1603                /* We don't need to enable cb for XDP */
1604                napi_complete_done(napi, 0);
1605                return 0;
1606        }
1607
1608        txq = netdev_get_tx_queue(vi->dev, index);
1609        __netif_tx_lock(txq, raw_smp_processor_id());
1610        virtqueue_disable_cb(sq->vq);
1611        free_old_xmit_skbs(sq, true);
1612
1613        if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
1614                netif_tx_wake_queue(txq);
1615
1616        opaque = virtqueue_enable_cb_prepare(sq->vq);
1617
1618        done = napi_complete_done(napi, 0);
1619
1620        if (!done)
1621                virtqueue_disable_cb(sq->vq);
1622
1623        __netif_tx_unlock(txq);
1624
1625        if (done) {
1626                if (unlikely(virtqueue_poll(sq->vq, opaque))) {
1627                        if (napi_schedule_prep(napi)) {
1628                                __netif_tx_lock(txq, raw_smp_processor_id());
1629                                virtqueue_disable_cb(sq->vq);
1630                                __netif_tx_unlock(txq);
1631                                __napi_schedule(napi);
1632                        }
1633                }
1634        }
1635
1636        return 0;
1637}
1638
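    /* Build the virtio-net header for the skb and add it to the send
     * virtqueue.  When the device accepts an arbitrary header/data layout
     * (any_header_sg) and the skb has suitably aligned headroom, the header
     * is pushed into that headroom so it does not need its own sg entry.
     */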
1639static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
1640{
1641        struct virtio_net_hdr_mrg_rxbuf *hdr;
1642        const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
1643        struct virtnet_info *vi = sq->vq->vdev->priv;
1644        int num_sg;
1645        unsigned hdr_len = vi->hdr_len;
1646        bool can_push;
1647
1648        pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest);
1649
1650        can_push = vi->any_header_sg &&
1651                !((unsigned long)skb->data & (__alignof__(*hdr) - 1)) &&
1652                !skb_header_cloned(skb) && skb_headroom(skb) >= hdr_len;
1653        /* Even if we can, don't push here yet as this would skew
1654         * csum_start offset below. */
1655        if (can_push)
1656                hdr = (struct virtio_net_hdr_mrg_rxbuf *)(skb->data - hdr_len);
1657        else
1658                hdr = skb_vnet_hdr(skb);
1659
1660        if (virtio_net_hdr_from_skb(skb, &hdr->hdr,
1661                                    virtio_is_little_endian(vi->vdev), false,
1662                                    0))
1663                return -EPROTO;
1664
1665        if (vi->mergeable_rx_bufs)
1666                hdr->num_buffers = 0;
1667
1668        sg_init_table(sq->sg, skb_shinfo(skb)->nr_frags + (can_push ? 1 : 2));
1669        if (can_push) {
1670                __skb_push(skb, hdr_len);
1671                num_sg = skb_to_sgvec(skb, sq->sg, 0, skb->len);
1672                if (unlikely(num_sg < 0))
1673                        return num_sg;
1674                /* Pull header back to avoid skew in tx bytes calculations. */
1675                __skb_pull(skb, hdr_len);
1676        } else {
1677                sg_set_buf(sq->sg, hdr, hdr_len);
1678                num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len);
1679                if (unlikely(num_sg < 0))
1680                        return num_sg;
1681                num_sg++;
1682        }
1683        return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
1684}
1685
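    /* Main transmit path: reclaim completed buffers, queue the skb via
     * xmit_skb(), stop the subqueue when descriptors run low, and kick the
     * device unless the stack has indicated more packets are coming
     * (netdev_xmit_more()).
     */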
1686static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
1687{
1688        struct virtnet_info *vi = netdev_priv(dev);
1689        int qnum = skb_get_queue_mapping(skb);
1690        struct send_queue *sq = &vi->sq[qnum];
1691        int err;
1692        struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
1693        bool kick = !netdev_xmit_more();
1694        bool use_napi = sq->napi.weight;
1695
1696        /* Free up any pending old buffers before queueing new ones. */
1697        do {
1698                if (use_napi)
1699                        virtqueue_disable_cb(sq->vq);
1700
1701                free_old_xmit_skbs(sq, false);
1702
1703        } while (use_napi && kick &&
1704               unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
1705
1706        /* timestamp packet in software */
1707        skb_tx_timestamp(skb);
1708
1709        /* Try to transmit */
1710        err = xmit_skb(sq, skb);
1711
1712        /* This should not happen! */
1713        if (unlikely(err)) {
1714                dev->stats.tx_fifo_errors++;
1715                if (net_ratelimit())
1716                        dev_warn(&dev->dev,
1717                                 "Unexpected TXQ (%d) queue failure: %d\n",
1718                                 qnum, err);
1719                dev->stats.tx_dropped++;
1720                dev_kfree_skb_any(skb);
1721                return NETDEV_TX_OK;
1722        }
1723
1724        /* Don't wait up for transmitted skbs to be freed. */
1725        if (!use_napi) {
1726                skb_orphan(skb);
1727                nf_reset_ct(skb);
1728        }
1729
1730        /* If running out of space, stop queue to avoid getting packets that we
1731         * are then unable to transmit.
1732         * An alternative would be to force queuing layer to requeue the skb by
1733         * returning NETDEV_TX_BUSY. However, NETDEV_TX_BUSY should not be
1734         * returned in a normal path of operation: it means that the driver is
1735         * not maintaining the TX queue stop/start state properly, which causes
1736         * the stack to do a non-trivial amount of useless work.
1737         * Since most packets only take 1 or 2 ring slots, stopping the queue
1738         * early means 16 slots are typically wasted.
1739         */
1740        if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
1741                netif_stop_subqueue(dev, qnum);
1742                if (!use_napi &&
1743                    unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
1744                        /* More just got used, free them then recheck. */
1745                        free_old_xmit_skbs(sq, false);
1746                        if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
1747                                netif_start_subqueue(dev, qnum);
1748                                virtqueue_disable_cb(sq->vq);
1749                        }
1750                }
1751        }
1752
1753        if (kick || netif_xmit_stopped(txq)) {
1754                if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) {
1755                        u64_stats_update_begin(&sq->stats.syncp);
1756                        sq->stats.kicks++;
1757                        u64_stats_update_end(&sq->stats.syncp);
1758                }
1759        }
1760
1761        return NETDEV_TX_OK;
1762}
1763
1764/*
1765 * Send command via the control virtqueue and check status.  Commands
1766 * supported by the hypervisor, as indicated by feature bits, should
1767 * never fail unless improperly formatted.
1768 */
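    /* The request below is laid out as up to three scatterlist entries:
     * the command header, optional command-specific data (out), and a
     * final device-writable entry for the status byte.
     */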
1769static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
1770                                 struct scatterlist *out)
1771{
1772        struct scatterlist *sgs[4], hdr, stat;
1773        unsigned out_num = 0, tmp;
1774        int ret;
1775
1776        /* Caller should know better */
1777        BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ));
1778
1779        vi->ctrl->status = ~0;
1780        vi->ctrl->hdr.class = class;
1781        vi->ctrl->hdr.cmd = cmd;
1782        /* Add header */
1783        sg_init_one(&hdr, &vi->ctrl->hdr, sizeof(vi->ctrl->hdr));
1784        sgs[out_num++] = &hdr;
1785
1786        if (out)
1787                sgs[out_num++] = out;
1788
1789        /* Add return status. */
1790        sg_init_one(&stat, &vi->ctrl->status, sizeof(vi->ctrl->status));
1791        sgs[out_num] = &stat;
1792
1793        BUG_ON(out_num + 1 > ARRAY_SIZE(sgs));
1794        ret = virtqueue_add_sgs(vi->cvq, sgs, out_num, 1, vi, GFP_ATOMIC);
1795        if (ret < 0) {
1796                dev_warn(&vi->vdev->dev,
1797                         "Failed to add sgs for command vq: %d\n", ret);
1798                return false;
1799        }
1800
1801        if (unlikely(!virtqueue_kick(vi->cvq)))
1802                return vi->ctrl->status == VIRTIO_NET_OK;
1803
1804        /* Spin for a response; the kick causes an ioport write, trapping
1805         * into the hypervisor, so the request should be handled immediately.
1806         */
1807        while (!virtqueue_get_buf(vi->cvq, &tmp) &&
1808               !virtqueue_is_broken(vi->cvq))
1809                cpu_relax();
1810
1811        return vi->ctrl->status == VIRTIO_NET_OK;
1812}
1813
1814static int virtnet_set_mac_address(struct net_device *dev, void *p)
1815{
1816        struct virtnet_info *vi = netdev_priv(dev);
1817        struct virtio_device *vdev = vi->vdev;
1818        int ret;
1819        struct sockaddr *addr;
1820        struct scatterlist sg;
1821
1822        if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STANDBY))
1823                return -EOPNOTSUPP;
1824
1825        addr = kmemdup(p, sizeof(*addr), GFP_KERNEL);
1826        if (!addr)
1827                return -ENOMEM;
1828
1829        ret = eth_prepare_mac_addr_change(dev, addr);
1830        if (ret)
1831                goto out;
1832
1833        if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
1834                sg_init_one(&sg, addr->sa_data, dev->addr_len);
1835                if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
1836                                          VIRTIO_NET_CTRL_MAC_ADDR_SET, &sg)) {
1837                        dev_warn(&vdev->dev,
1838                                 "Failed to set mac address by vq command.\n");
1839                        ret = -EINVAL;
1840                        goto out;
1841                }
1842        } else if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC) &&
1843                   !virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
1844                unsigned int i;
1845
1846                /* Naturally, this has an atomicity problem. */
1847                for (i = 0; i < dev->addr_len; i++)
1848                        virtio_cwrite8(vdev,
1849                                       offsetof(struct virtio_net_config, mac) +
1850                                       i, addr->sa_data[i]);
1851        }
1852
1853        eth_commit_mac_addr_change(dev, p);
1854        ret = 0;
1855
1856out:
1857        kfree(addr);
1858        return ret;
1859}
1860
1861static void virtnet_stats(struct net_device *dev,
1862                          struct rtnl_link_stats64 *tot)
1863{
1864        struct virtnet_info *vi = netdev_priv(dev);
1865        unsigned int start;
1866        int i;
1867
1868        for (i = 0; i < vi->max_queue_pairs; i++) {
1869                u64 tpackets, tbytes, rpackets, rbytes, rdrops;
1870                struct receive_queue *rq = &vi->rq[i];
1871                struct send_queue *sq = &vi->sq[i];
1872
1873                do {
1874                        start = u64_stats_fetch_begin_irq(&sq->stats.syncp);
1875                        tpackets = sq->stats.packets;
1876                        tbytes   = sq->stats.bytes;
1877                } while (u64_stats_fetch_retry_irq(&sq->stats.syncp, start));
1878
1879                do {
1880                        start = u64_stats_fetch_begin_irq(&rq->stats.syncp);
1881                        rpackets = rq->stats.packets;
1882                        rbytes   = rq->stats.bytes;
1883                        rdrops   = rq->stats.drops;
1884                } while (u64_stats_fetch_retry_irq(&rq->stats.syncp, start));
1885
1886                tot->rx_packets += rpackets;
1887                tot->tx_packets += tpackets;
1888                tot->rx_bytes   += rbytes;
1889                tot->tx_bytes   += tbytes;
1890                tot->rx_dropped += rdrops;
1891        }
1892
1893        tot->tx_dropped = dev->stats.tx_dropped;
1894        tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
1895        tot->rx_length_errors = dev->stats.rx_length_errors;
1896        tot->rx_frame_errors = dev->stats.rx_frame_errors;
1897}
1898
1899static void virtnet_ack_link_announce(struct virtnet_info *vi)
1900{
1901        rtnl_lock();
1902        if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE,
1903                                  VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL))
1904                dev_warn(&vi->dev->dev, "Failed to ack link announce.\n");
1905        rtnl_unlock();
1906}
1907
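    /* Tell the device how many queue pairs to use via the VIRTIO_NET_CTRL_MQ
     * command; this is a no-op when the device lacks a control vq or the
     * VIRTIO_NET_F_MQ feature.
     */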
1908static int _virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
1909{
1910        struct scatterlist sg;
1911        struct net_device *dev = vi->dev;
1912
1913        if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
1914                return 0;
1915
1916        vi->ctrl->mq.virtqueue_pairs = cpu_to_virtio16(vi->vdev, queue_pairs);
1917        sg_init_one(&sg, &vi->ctrl->mq, sizeof(vi->ctrl->mq));
1918
1919        if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
1920                                  VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg)) {
1921                dev_warn(&dev->dev, "Failed to set the number of queue pairs to %d\n",
1922                         queue_pairs);
1923                return -EINVAL;
1924        } else {
1925                vi->curr_queue_pairs = queue_pairs;
1926                /* virtnet_open() will refill when the device is brought up. */
1927                if (dev->flags & IFF_UP)
1928                        schedule_delayed_work(&vi->refill, 0);
1929        }
1930
1931        return 0;
1932}
1933
1934static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
1935{
1936        int err;
1937
1938        rtnl_lock();
1939        err = _virtnet_set_queues(vi, queue_pairs);
1940        rtnl_unlock();
1941        return err;
1942}
1943
1944static int virtnet_close(struct net_device *dev)
1945{
1946        struct virtnet_info *vi = netdev_priv(dev);
1947        int i;
1948
1949        /* Make sure refill_work doesn't re-enable napi! */
1950        cancel_delayed_work_sync(&vi->refill);
1951
1952        for (i = 0; i < vi->max_queue_pairs; i++) {
1953                xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq);
1954                napi_disable(&vi->rq[i].napi);
1955                virtnet_napi_tx_disable(&vi->sq[i].napi);
1956        }
1957
1958        return 0;
1959}
1960
1961static void virtnet_set_rx_mode(struct net_device *dev)
1962{
1963        struct virtnet_info *vi = netdev_priv(dev);
1964        struct scatterlist sg[2];
1965        struct virtio_net_ctrl_mac *mac_data;
1966        struct netdev_hw_addr *ha;
1967        int uc_count;
1968        int mc_count;
1969        void *buf;
1970        int i;
1971
1972        /* We can't dynamically set ndo_set_rx_mode, so return gracefully */
1973        if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX))
1974                return;
1975
1976        vi->ctrl->promisc = ((dev->flags & IFF_PROMISC) != 0);
1977        vi->ctrl->allmulti = ((dev->flags & IFF_ALLMULTI) != 0);
1978
1979        sg_init_one(sg, &vi->ctrl->promisc, sizeof(vi->ctrl->promisc));
1980
1981        if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
1982                                  VIRTIO_NET_CTRL_RX_PROMISC, sg))
1983                dev_warn(&dev->dev, "Failed to %sable promisc mode.\n",
1984                         vi->ctrl->promisc ? "en" : "dis");
1985
1986        sg_init_one(sg, &vi->ctrl->allmulti, sizeof(vi->ctrl->allmulti));
1987
1988        if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
1989                                  VIRTIO_NET_CTRL_RX_ALLMULTI, sg))
1990                dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n",
1991                         vi->ctrl->allmulti ? "en" : "dis");
1992
1993        uc_count = netdev_uc_count(dev);
1994        mc_count = netdev_mc_count(dev);
1995        /* MAC filter - use one buffer for both lists */
1996        buf = kzalloc(((uc_count + mc_count) * ETH_ALEN) +
1997                      (2 * sizeof(mac_data->entries)), GFP_ATOMIC);
1998        mac_data = buf;
1999        if (!buf)
2000                return;
2001
2002        sg_init_table(sg, 2);
2003
2004        /* Store the unicast list and count in the front of the buffer */
2005        mac_data->entries = cpu_to_virtio32(vi->vdev, uc_count);
2006        i = 0;
2007        netdev_for_each_uc_addr(ha, dev)
2008                memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);
2009
2010        sg_set_buf(&sg[0], mac_data,
2011                   sizeof(mac_data->entries) + (uc_count * ETH_ALEN));
2012
2013        /* multicast list and count fill the end */
2014        mac_data = (void *)&mac_data->macs[uc_count][0];
2015
2016        mac_data->entries = cpu_to_virtio32(vi->vdev, mc_count);
2017        i = 0;
2018        netdev_for_each_mc_addr(ha, dev)
2019                memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);
2020
2021        sg_set_buf(&sg[1], mac_data,
2022                   sizeof(mac_data->entries) + (mc_count * ETH_ALEN));
2023
2024        if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
2025                                  VIRTIO_NET_CTRL_MAC_TABLE_SET, sg))
2026                dev_warn(&dev->dev, "Failed to set MAC filter table.\n");
2027
2028        kfree(buf);
2029}
2030
2031static int virtnet_vlan_rx_add_vid(struct net_device *dev,
2032                                   __be16 proto, u16 vid)
2033{
2034        struct virtnet_info *vi = netdev_priv(dev);
2035        struct scatterlist sg;
2036
2037        vi->ctrl->vid = cpu_to_virtio16(vi->vdev, vid);
2038        sg_init_one(&sg, &vi->ctrl->vid, sizeof(vi->ctrl->vid));
2039
2040        if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
2041                                  VIRTIO_NET_CTRL_VLAN_ADD, &sg))
2042                dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid);
2043        return 0;
2044}
2045
2046static int virtnet_vlan_rx_kill_vid(struct net_device *dev,
2047                                    __be16 proto, u16 vid)
2048{
2049        struct virtnet_info *vi = netdev_priv(dev);
2050        struct scatterlist sg;
2051
2052        vi->ctrl->vid = cpu_to_virtio16(vi->vdev, vid);
2053        sg_init_one(&sg, &vi->ctrl->vid, sizeof(vi->ctrl->vid));
2054
2055        if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
2056                                  VIRTIO_NET_CTRL_VLAN_DEL, &sg))
2057                dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid);
2058        return 0;
2059}
2060
2061static void virtnet_clean_affinity(struct virtnet_info *vi)
2062{
2063        int i;
2064
2065        if (vi->affinity_hint_set) {
2066                for (i = 0; i < vi->max_queue_pairs; i++) {
2067                        virtqueue_set_affinity(vi->rq[i].vq, NULL);
2068                        virtqueue_set_affinity(vi->sq[i].vq, NULL);
2069                }
2070
2071                vi->affinity_hint_set = false;
2072        }
2073}
2074
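    /* Spread the online CPUs across the active queue pairs: each pair gets
     * roughly num_cpu / curr_queue_pairs CPUs, with any remainder going to
     * the first pairs.  The resulting mask is used both as the virtqueue
     * interrupt affinity hint and the XPS map for that TX queue.
     */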
2075static void virtnet_set_affinity(struct virtnet_info *vi)
2076{
2077        cpumask_var_t mask;
2078        int stragglers;
2079        int group_size;
2080        int i, j, cpu;
2081        int num_cpu;
2082        int stride;
2083
2084        if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
2085                virtnet_clean_affinity(vi);
2086                return;
2087        }
2088
2089        num_cpu = num_online_cpus();
2090        stride = max_t(int, num_cpu / vi->curr_queue_pairs, 1);
2091        stragglers = num_cpu >= vi->curr_queue_pairs ?
2092                        num_cpu % vi->curr_queue_pairs :
2093                        0;
2094        cpu = cpumask_next(-1, cpu_online_mask);
2095
2096        for (i = 0; i < vi->curr_queue_pairs; i++) {
2097                group_size = stride + (i < stragglers ? 1 : 0);
2098
2099                for (j = 0; j < group_size; j++) {
2100                        cpumask_set_cpu(cpu, mask);
2101                        cpu = cpumask_next_wrap(cpu, cpu_online_mask,
2102                                                nr_cpu_ids, false);
2103                }
2104                virtqueue_set_affinity(vi->rq[i].vq, mask);
2105                virtqueue_set_affinity(vi->sq[i].vq, mask);
2106                __netif_set_xps_queue(vi->dev, cpumask_bits(mask), i, XPS_CPUS);
2107                cpumask_clear(mask);
2108        }
2109
2110        vi->affinity_hint_set = true;
2111        free_cpumask_var(mask);
2112}
2113
2114static int virtnet_cpu_online(unsigned int cpu, struct hlist_node *node)
2115{
2116        struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
2117                                                   node);
2118        virtnet_set_affinity(vi);
2119        return 0;
2120}
2121
2122static int virtnet_cpu_dead(unsigned int cpu, struct hlist_node *node)
2123{
2124        struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
2125                                                   node_dead);
2126        virtnet_set_affinity(vi);
2127        return 0;
2128}
2129
2130static int virtnet_cpu_down_prep(unsigned int cpu, struct hlist_node *node)
2131{
2132        struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
2133                                                   node);
2134
2135        virtnet_clean_affinity(vi);
2136        return 0;
2137}
2138
2139static enum cpuhp_state virtionet_online;
2140
2141static int virtnet_cpu_notif_add(struct virtnet_info *vi)
2142{
2143        int ret;
2144
2145        ret = cpuhp_state_add_instance_nocalls(virtionet_online, &vi->node);
2146        if (ret)
2147                return ret;
2148        ret = cpuhp_state_add_instance_nocalls(CPUHP_VIRT_NET_DEAD,
2149                                               &vi->node_dead);
2150        if (!ret)
2151                return ret;
2152        cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
2153        return ret;
2154}
2155
2156static void virtnet_cpu_notif_remove(struct virtnet_info *vi)
2157{
2158        cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
2159        cpuhp_state_remove_instance_nocalls(CPUHP_VIRT_NET_DEAD,
2160                                            &vi->node_dead);
2161}
2162
2163static void virtnet_get_ringparam(struct net_device *dev,
2164                                struct ethtool_ringparam *ring)
2165{
2166        struct virtnet_info *vi = netdev_priv(dev);
2167
2168        ring->rx_max_pending = virtqueue_get_vring_size(vi->rq[0].vq);
2169        ring->tx_max_pending = virtqueue_get_vring_size(vi->sq[0].vq);
2170        ring->rx_pending = ring->rx_max_pending;
2171        ring->tx_pending = ring->tx_max_pending;
2172}
2173
2174
2175static void virtnet_get_drvinfo(struct net_device *dev,
2176                                struct ethtool_drvinfo *info)
2177{
2178        struct virtnet_info *vi = netdev_priv(dev);
2179        struct virtio_device *vdev = vi->vdev;
2180
2181        strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver));
2182        strlcpy(info->version, VIRTNET_DRIVER_VERSION, sizeof(info->version));
2183        strlcpy(info->bus_info, virtio_bus_name(vdev), sizeof(info->bus_info));
2184
2185}
2186
2187/* TODO: Eliminate OOO packets during switching */
2188static int virtnet_set_channels(struct net_device *dev,
2189                                struct ethtool_channels *channels)
2190{
2191        struct virtnet_info *vi = netdev_priv(dev);
2192        u16 queue_pairs = channels->combined_count;
2193        int err;
2194
2195        /* We don't support separate rx/tx channels.
2196         * We don't allow setting 'other' channels.
2197         */
2198        if (channels->rx_count || channels->tx_count || channels->other_count)
2199                return -EINVAL;
2200
2201        if (queue_pairs > vi->max_queue_pairs || queue_pairs == 0)
2202                return -EINVAL;
2203
2204        /* For now we don't support modifying channels while XDP is loaded.
2205         * Also, when XDP is loaded all RX queues have XDP programs, so we only
2206         * need to check a single RX queue.
2207         */
2208        if (vi->rq[0].xdp_prog)
2209                return -EINVAL;
2210
2211        get_online_cpus();
2212        err = _virtnet_set_queues(vi, queue_pairs);
2213        if (err) {
2214                put_online_cpus();
2215                goto err;
2216        }
2217        virtnet_set_affinity(vi);
2218        put_online_cpus();
2219
2220        netif_set_real_num_tx_queues(dev, queue_pairs);
2221        netif_set_real_num_rx_queues(dev, queue_pairs);
2222 err:
2223        return err;
2224}
2225
2226static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
2227{
2228        struct virtnet_info *vi = netdev_priv(dev);
2229        unsigned int i, j;
2230        u8 *p = data;
2231
2232        switch (stringset) {
2233        case ETH_SS_STATS:
2234                for (i = 0; i < vi->curr_queue_pairs; i++) {
2235                        for (j = 0; j < VIRTNET_RQ_STATS_LEN; j++)
2236                                ethtool_sprintf(&p, "rx_queue_%u_%s", i,
2237                                                virtnet_rq_stats_desc[j].desc);
2238                }
2239
2240                for (i = 0; i < vi->curr_queue_pairs; i++) {
2241                        for (j = 0; j < VIRTNET_SQ_STATS_LEN; j++)
2242                                ethtool_sprintf(&p, "tx_queue_%u_%s", i,
2243                                                virtnet_sq_stats_desc[j].desc);
2244                }
2245                break;
2246        }
2247}
2248
2249static int virtnet_get_sset_count(struct net_device *dev, int sset)
2250{
2251        struct virtnet_info *vi = netdev_priv(dev);
2252
2253        switch (sset) {
2254        case ETH_SS_STATS:
2255                return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
2256                                               VIRTNET_SQ_STATS_LEN);
2257        default:
2258                return -EOPNOTSUPP;
2259        }
2260}
2261
2262static void virtnet_get_ethtool_stats(struct net_device *dev,
2263                                      struct ethtool_stats *stats, u64 *data)
2264{
2265        struct virtnet_info *vi = netdev_priv(dev);
2266        unsigned int idx = 0, start, i, j;
2267        const u8 *stats_base;
2268        size_t offset;
2269
2270        for (i = 0; i < vi->curr_queue_pairs; i++) {
2271                struct receive_queue *rq = &vi->rq[i];
2272
2273                stats_base = (u8 *)&rq->stats;
2274                do {
2275                        start = u64_stats_fetch_begin_irq(&rq->stats.syncp);
2276                        for (j = 0; j < VIRTNET_RQ_STATS_LEN; j++) {
2277                                offset = virtnet_rq_stats_desc[j].offset;
2278                                data[idx + j] = *(u64 *)(stats_base + offset);
2279                        }
2280                } while (u64_stats_fetch_retry_irq(&rq->stats.syncp, start));
2281                idx += VIRTNET_RQ_STATS_LEN;
2282        }
2283
2284        for (i = 0; i < vi->curr_queue_pairs; i++) {
2285                struct send_queue *sq = &vi->sq[i];
2286
2287                stats_base = (u8 *)&sq->stats;
2288                do {
2289                        start = u64_stats_fetch_begin_irq(&sq->stats.syncp);
2290                        for (j = 0; j < VIRTNET_SQ_STATS_LEN; j++) {
2291                                offset = virtnet_sq_stats_desc[j].offset;
2292                                data[idx + j] = *(u64 *)(stats_base + offset);
2293                        }
2294                } while (u64_stats_fetch_retry_irq(&sq->stats.syncp, start));
2295                idx += VIRTNET_SQ_STATS_LEN;
2296        }
2297}
2298
2299static void virtnet_get_channels(struct net_device *dev,
2300                                 struct ethtool_channels *channels)
2301{
2302        struct virtnet_info *vi = netdev_priv(dev);
2303
2304        channels->combined_count = vi->curr_queue_pairs;
2305        channels->max_combined = vi->max_queue_pairs;
2306        channels->max_other = 0;
2307        channels->rx_count = 0;
2308        channels->tx_count = 0;
2309        channels->other_count = 0;
2310}
2311
2312static int virtnet_set_link_ksettings(struct net_device *dev,
2313                                      const struct ethtool_link_ksettings *cmd)
2314{
2315        struct virtnet_info *vi = netdev_priv(dev);
2316
2317        return ethtool_virtdev_set_link_ksettings(dev, cmd,
2318                                                  &vi->speed, &vi->duplex);
2319}
2320
2321static int virtnet_get_link_ksettings(struct net_device *dev,
2322                                      struct ethtool_link_ksettings *cmd)
2323{
2324        struct virtnet_info *vi = netdev_priv(dev);
2325
2326        cmd->base.speed = vi->speed;
2327        cmd->base.duplex = vi->duplex;
2328        cmd->base.port = PORT_OTHER;
2329
2330        return 0;
2331}
2332
2333static int virtnet_set_coalesce(struct net_device *dev,
2334                                struct ethtool_coalesce *ec)
2335{
2336        struct virtnet_info *vi = netdev_priv(dev);
2337        int i, napi_weight;
2338
2339        if (ec->tx_max_coalesced_frames > 1 ||
2340            ec->rx_max_coalesced_frames != 1)
2341                return -EINVAL;
2342
2343        napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0;
2344        if (napi_weight ^ vi->sq[0].napi.weight) {
2345                if (dev->flags & IFF_UP)
2346                        return -EBUSY;
2347                for (i = 0; i < vi->max_queue_pairs; i++)
2348                        vi->sq[i].napi.weight = napi_weight;
2349        }
2350
2351        return 0;
2352}
2353
2354static int virtnet_get_coalesce(struct net_device *dev,
2355                                struct ethtool_coalesce *ec)
2356{
2357        struct ethtool_coalesce ec_default = {
2358                .cmd = ETHTOOL_GCOALESCE,
2359                .rx_max_coalesced_frames = 1,
2360        };
2361        struct virtnet_info *vi = netdev_priv(dev);
2362
2363        memcpy(ec, &ec_default, sizeof(ec_default));
2364
2365        if (vi->sq[0].napi.weight)
2366                ec->tx_max_coalesced_frames = 1;
2367
2368        return 0;
2369}
2370
2371static void virtnet_init_settings(struct net_device *dev)
2372{
2373        struct virtnet_info *vi = netdev_priv(dev);
2374
2375        vi->speed = SPEED_UNKNOWN;
2376        vi->duplex = DUPLEX_UNKNOWN;
2377}
2378
2379static void virtnet_update_settings(struct virtnet_info *vi)
2380{
2381        u32 speed;
2382        u8 duplex;
2383
2384        if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_SPEED_DUPLEX))
2385                return;
2386
2387        virtio_cread_le(vi->vdev, struct virtio_net_config, speed, &speed);
2388
2389        if (ethtool_validate_speed(speed))
2390                vi->speed = speed;
2391
2392        virtio_cread_le(vi->vdev, struct virtio_net_config, duplex, &duplex);
2393
2394        if (ethtool_validate_duplex(duplex))
2395                vi->duplex = duplex;
2396}
2397
2398static const struct ethtool_ops virtnet_ethtool_ops = {
2399        .supported_coalesce_params = ETHTOOL_COALESCE_MAX_FRAMES,
2400        .get_drvinfo = virtnet_get_drvinfo,
2401        .get_link = ethtool_op_get_link,
2402        .get_ringparam = virtnet_get_ringparam,
2403        .get_strings = virtnet_get_strings,
2404        .get_sset_count = virtnet_get_sset_count,
2405        .get_ethtool_stats = virtnet_get_ethtool_stats,
2406        .set_channels = virtnet_set_channels,
2407        .get_channels = virtnet_get_channels,
2408        .get_ts_info = ethtool_op_get_ts_info,
2409        .get_link_ksettings = virtnet_get_link_ksettings,
2410        .set_link_ksettings = virtnet_set_link_ksettings,
2411        .set_coalesce = virtnet_set_coalesce,
2412        .get_coalesce = virtnet_get_coalesce,
2413};
2414
2415static void virtnet_freeze_down(struct virtio_device *vdev)
2416{
2417        struct virtnet_info *vi = vdev->priv;
2418        int i;
2419
2420        /* Make sure no work handler is accessing the device */
2421        flush_work(&vi->config_work);
2422
2423        netif_tx_lock_bh(vi->dev);
2424        netif_device_detach(vi->dev);
2425        netif_tx_unlock_bh(vi->dev);
2426        cancel_delayed_work_sync(&vi->refill);
2427
2428        if (netif_running(vi->dev)) {
2429                for (i = 0; i < vi->max_queue_pairs; i++) {
2430                        napi_disable(&vi->rq[i].napi);
2431                        virtnet_napi_tx_disable(&vi->sq[i].napi);
2432                }
2433        }
2434}
2435
2436static int init_vqs(struct virtnet_info *vi);
2437
2438static int virtnet_restore_up(struct virtio_device *vdev)
2439{
2440        struct virtnet_info *vi = vdev->priv;
2441        int err, i;
2442
2443        err = init_vqs(vi);
2444        if (err)
2445                return err;
2446
2447        virtio_device_ready(vdev);
2448
2449        if (netif_running(vi->dev)) {
2450                for (i = 0; i < vi->curr_queue_pairs; i++)
2451                        if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
2452                                schedule_delayed_work(&vi->refill, 0);
2453
2454                for (i = 0; i < vi->max_queue_pairs; i++) {
2455                        virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
2456                        virtnet_napi_tx_enable(vi, vi->sq[i].vq,
2457                                               &vi->sq[i].napi);
2458                }
2459        }
2460
2461        netif_tx_lock_bh(vi->dev);
2462        netif_device_attach(vi->dev);
2463        netif_tx_unlock_bh(vi->dev);
2464        return err;
2465}
2466
2467static int virtnet_set_guest_offloads(struct virtnet_info *vi, u64 offloads)
2468{
2469        struct scatterlist sg;
2470        vi->ctrl->offloads = cpu_to_virtio64(vi->vdev, offloads);
2471
2472        sg_init_one(&sg, &vi->ctrl->offloads, sizeof(vi->ctrl->offloads));
2473
2474        if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_GUEST_OFFLOADS,
2475                                  VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET, &sg)) {
2476                dev_warn(&vi->dev->dev, "Failed to set guest offloads.\n");
2477                return -EINVAL;
2478        }
2479
2480        return 0;
2481}
2482
2483static int virtnet_clear_guest_offloads(struct virtnet_info *vi)
2484{
2485        u64 offloads = 0;
2486
2487        if (!vi->guest_offloads)
2488                return 0;
2489
2490        return virtnet_set_guest_offloads(vi, offloads);
2491}
2492
2493static int virtnet_restore_guest_offloads(struct virtnet_info *vi)
2494{
2495        u64 offloads = vi->guest_offloads;
2496
2497        if (!vi->guest_offloads)
2498                return 0;
2499
2500        return virtnet_set_guest_offloads(vi, offloads);
2501}
2502
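    /* Attach or detach an XDP program: refuse if guest offloads cannot be
     * dynamically disabled or the MTU does not fit in a single page, try to
     * reserve one extra TX queue per CPU for XDP_TX, quiesce NAPI, then swap
     * the per-queue program pointers and adjust guest offloads accordingly.
     */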
2503static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
2504                           struct netlink_ext_ack *extack)
2505{
2506        unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
2507        struct virtnet_info *vi = netdev_priv(dev);
2508        struct bpf_prog *old_prog;
2509        u16 xdp_qp = 0, curr_qp;
2510        int i, err;
2511
2512        if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)
2513            && (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
2514                virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) ||
2515                virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) ||
2516                virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO) ||
2517                virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_CSUM))) {
2518                NL_SET_ERR_MSG_MOD(extack, "Can't set XDP while host is implementing GRO_HW/CSUM, disable GRO_HW/CSUM first");
2519                return -EOPNOTSUPP;
2520        }
2521
2522        if (vi->mergeable_rx_bufs && !vi->any_header_sg) {
2523                NL_SET_ERR_MSG_MOD(extack, "XDP expects header/data in single page, any_header_sg required");
2524                return -EINVAL;
2525        }
2526
2527        if (dev->mtu > max_sz) {
2528                NL_SET_ERR_MSG_MOD(extack, "MTU too large to enable XDP");
2529                netdev_warn(dev, "XDP requires MTU less than %lu\n", max_sz);
2530                return -EINVAL;
2531        }
2532
2533        curr_qp = vi->curr_queue_pairs - vi->xdp_queue_pairs;
2534        if (prog)
2535                xdp_qp = nr_cpu_ids;
2536
2537        /* XDP requires extra queues for XDP_TX */
2538        if (curr_qp + xdp_qp > vi->max_queue_pairs) {
2539                netdev_warn(dev, "XDP requested %i queues but max is %i. XDP_TX and XDP_REDIRECT will operate in a slower locked tx mode.\n",
2540                            curr_qp + xdp_qp, vi->max_queue_pairs);
2541                xdp_qp = 0;
2542        }
2543
2544        old_prog = rtnl_dereference(vi->rq[0].xdp_prog);
2545        if (!prog && !old_prog)
2546                return 0;
2547
2548        if (prog)
2549                bpf_prog_add(prog, vi->max_queue_pairs - 1);
2550
2551        /* Make sure NAPI is not using any XDP TX queues for RX. */
2552        if (netif_running(dev)) {
2553                for (i = 0; i < vi->max_queue_pairs; i++) {
2554                        napi_disable(&vi->rq[i].napi);
2555                        virtnet_napi_tx_disable(&vi->sq[i].napi);
2556                }
2557        }
2558
2559        if (!prog) {
2560                for (i = 0; i < vi->max_queue_pairs; i++) {
2561                        rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
2562                        if (i == 0)
2563                                virtnet_restore_guest_offloads(vi);
2564                }
2565                synchronize_net();
2566        }
2567
2568        err = _virtnet_set_queues(vi, curr_qp + xdp_qp);
2569        if (err)
2570                goto err;
2571        netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);
2572        vi->xdp_queue_pairs = xdp_qp;
2573
2574        if (prog) {
2575                vi->xdp_enabled = true;
2576                for (i = 0; i < vi->max_queue_pairs; i++) {
2577                        rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
2578                        if (i == 0 && !old_prog)
2579                                virtnet_clear_guest_offloads(vi);
2580                }
2581        } else {
2582                vi->xdp_enabled = false;
2583        }
2584
2585        for (i = 0; i < vi->max_queue_pairs; i++) {
2586                if (old_prog)
2587                        bpf_prog_put(old_prog);
2588                if (netif_running(dev)) {
2589                        virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
2590                        virtnet_napi_tx_enable(vi, vi->sq[i].vq,
2591                                               &vi->sq[i].napi);
2592                }
2593        }
2594
2595        return 0;
2596
2597err:
2598        if (!prog) {
2599                virtnet_clear_guest_offloads(vi);
2600                for (i = 0; i < vi->max_queue_pairs; i++)
2601                        rcu_assign_pointer(vi->rq[i].xdp_prog, old_prog);
2602        }
2603
2604        if (netif_running(dev)) {
2605                for (i = 0; i < vi->max_queue_pairs; i++) {
2606                        virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
2607                        virtnet_napi_tx_enable(vi, vi->sq[i].vq,
2608                                               &vi->sq[i].napi);
2609                }
2610        }
2611        if (prog)
2612                bpf_prog_sub(prog, vi->max_queue_pairs - 1);
2613        return err;
2614}
2615
2616static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp)
2617{
2618        switch (xdp->command) {
2619        case XDP_SETUP_PROG:
2620                return virtnet_xdp_set(dev, xdp->prog, xdp->extack);
2621        default:
2622                return -EINVAL;
2623        }
2624}
2625
2626static int virtnet_get_phys_port_name(struct net_device *dev, char *buf,
2627                                      size_t len)
2628{
2629        struct virtnet_info *vi = netdev_priv(dev);
2630        int ret;
2631
2632        if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_STANDBY))
2633                return -EOPNOTSUPP;
2634
2635        ret = snprintf(buf, len, "sby");
2636        if (ret >= len)
2637                return -EOPNOTSUPP;
2638
2639        return 0;
2640}
2641
2642static int virtnet_set_features(struct net_device *dev,
2643                                netdev_features_t features)
2644{
2645        struct virtnet_info *vi = netdev_priv(dev);
2646        u64 offloads;
2647        int err;
2648
2649        if ((dev->features ^ features) & NETIF_F_GRO_HW) {
2650                if (vi->xdp_enabled)
2651                        return -EBUSY;
2652
2653                if (features & NETIF_F_GRO_HW)
2654                        offloads = vi->guest_offloads_capable;
2655                else
2656                        offloads = vi->guest_offloads_capable &
2657                                   ~GUEST_OFFLOAD_GRO_HW_MASK;
2658
2659                err = virtnet_set_guest_offloads(vi, offloads);
2660                if (err)
2661                        return err;
2662                vi->guest_offloads = offloads;
2663        }
2664
2665        return 0;
2666}
2667
2668static const struct net_device_ops virtnet_netdev = {
2669        .ndo_open            = virtnet_open,
2670        .ndo_stop            = virtnet_close,
2671        .ndo_start_xmit      = start_xmit,
2672        .ndo_validate_addr   = eth_validate_addr,
2673        .ndo_set_mac_address = virtnet_set_mac_address,
2674        .ndo_set_rx_mode     = virtnet_set_rx_mode,
2675        .ndo_get_stats64     = virtnet_stats,
2676        .ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
2677        .ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
2678        .ndo_bpf                = virtnet_xdp,
2679        .ndo_xdp_xmit           = virtnet_xdp_xmit,
2680        .ndo_features_check     = passthru_features_check,
2681        .ndo_get_phys_port_name = virtnet_get_phys_port_name,
2682        .ndo_set_features       = virtnet_set_features,
2683};
2684
2685static void virtnet_config_changed_work(struct work_struct *work)
2686{
2687        struct virtnet_info *vi =
2688                container_of(work, struct virtnet_info, config_work);
2689        u16 v;
2690
2691        if (virtio_cread_feature(vi->vdev, VIRTIO_NET_F_STATUS,
2692                                 struct virtio_net_config, status, &v) < 0)
2693                return;
2694
2695        if (v & VIRTIO_NET_S_ANNOUNCE) {
2696                netdev_notify_peers(vi->dev);
2697                virtnet_ack_link_announce(vi);
2698        }
2699
2700        /* Ignore unknown (future) status bits */
2701        v &= VIRTIO_NET_S_LINK_UP;
2702
2703        if (vi->status == v)
2704                return;
2705
2706        vi->status = v;
2707
2708        if (vi->status & VIRTIO_NET_S_LINK_UP) {
2709                virtnet_update_settings(vi);
2710                netif_carrier_on(vi->dev);
2711                netif_tx_wake_all_queues(vi->dev);
2712        } else {
2713                netif_carrier_off(vi->dev);
2714                netif_tx_stop_all_queues(vi->dev);
2715        }
2716}
2717
2718static void virtnet_config_changed(struct virtio_device *vdev)
2719{
2720        struct virtnet_info *vi = vdev->priv;
2721
2722        schedule_work(&vi->config_work);
2723}
2724
2725static void virtnet_free_queues(struct virtnet_info *vi)
2726{
2727        int i;
2728
2729        for (i = 0; i < vi->max_queue_pairs; i++) {
2730                __netif_napi_del(&vi->rq[i].napi);
2731                __netif_napi_del(&vi->sq[i].napi);
2732        }
2733
2734        /* We called __netif_napi_del(), so we need to respect an RCU grace
2735         * period before freeing vi->rq.
2736         */
2737        synchronize_net();
2738
2739        kfree(vi->rq);
2740        kfree(vi->sq);
2741        kfree(vi->ctrl);
2742}
2743
2744static void _free_receive_bufs(struct virtnet_info *vi)
2745{
2746        struct bpf_prog *old_prog;
2747        int i;
2748
2749        for (i = 0; i < vi->max_queue_pairs; i++) {
2750                while (vi->rq[i].pages)
2751                        __free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0);
2752
2753                old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
2754                RCU_INIT_POINTER(vi->rq[i].xdp_prog, NULL);
2755                if (old_prog)
2756                        bpf_prog_put(old_prog);
2757        }
2758}
2759
2760static void free_receive_bufs(struct virtnet_info *vi)
2761{
2762        rtnl_lock();
2763        _free_receive_bufs(vi);
2764        rtnl_unlock();
2765}
2766
2767static void free_receive_page_frags(struct virtnet_info *vi)
2768{
2769        int i;
2770        for (i = 0; i < vi->max_queue_pairs; i++)
2771                if (vi->rq[i].alloc_frag.page)
2772                        put_page(vi->rq[i].alloc_frag.page);
2773}
2774
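    /* Detach and free any buffers still sitting in the virtqueues at
     * teardown: TX entries may be skbs or XDP frames, RX entries are pages
     * or page fragments depending on the receive buffer mode.
     */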
2775static void free_unused_bufs(struct virtnet_info *vi)
2776{
2777        void *buf;
2778        int i;
2779
2780        for (i = 0; i < vi->max_queue_pairs; i++) {
2781                struct virtqueue *vq = vi->sq[i].vq;
2782                while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
2783                        if (!is_xdp_frame(buf))
2784                                dev_kfree_skb(buf);
2785                        else
2786                                xdp_return_frame(ptr_to_xdp(buf));
2787                }
2788        }
2789
2790        for (i = 0; i < vi->max_queue_pairs; i++) {
2791                struct virtqueue *vq = vi->rq[i].vq;
2792
2793                while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
2794                        if (vi->mergeable_rx_bufs) {
2795                                put_page(virt_to_head_page(buf));
2796                        } else if (vi->big_packets) {
2797                                give_pages(&vi->rq[i], buf);
2798                        } else {
2799                                put_page(virt_to_head_page(buf));
2800                        }
2801                }
2802        }
2803}
2804
2805static void virtnet_del_vqs(struct virtnet_info *vi)
2806{
2807        struct virtio_device *vdev = vi->vdev;
2808
2809        virtnet_clean_affinity(vi);
2810
2811        vdev->config->del_vqs(vdev);
2812
2813        virtnet_free_queues(vi);
2814}
2815
2816/* How large should a single buffer be so a queue full of these can fit at
2817 * least one full packet?
2818 * Logic below assumes the mergeable buffer header is used.
2819 */
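    /* Rough example of the arithmetic below, assuming a 12-byte mergeable
     * header: with big packets enabled, packet_len = IP_MAX_MTU = 65535, so
     * buf_len = 12 + 14 + 4 + 65535 = 65565.  A 256-entry ring gives
     * min_buf_len = DIV_ROUND_UP(65565, 256) = 257, and the result is
     * max(257 - 12, GOOD_PACKET_LEN) = GOOD_PACKET_LEN (1518 with standard
     * Ethernet sizes).
     */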
2820static unsigned int mergeable_min_buf_len(struct virtnet_info *vi, struct virtqueue *vq)
2821{
2822        const unsigned int hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
2823        unsigned int rq_size = virtqueue_get_vring_size(vq);
2824        unsigned int packet_len = vi->big_packets ? IP_MAX_MTU : vi->dev->max_mtu;
2825        unsigned int buf_len = hdr_len + ETH_HLEN + VLAN_HLEN + packet_len;
2826        unsigned int min_buf_len = DIV_ROUND_UP(buf_len, rq_size);
2827
2828        return max(max(min_buf_len, hdr_len) - hdr_len,
2829                   (unsigned int)GOOD_PACKET_LEN);
2830}
2831
2832static int virtnet_find_vqs(struct virtnet_info *vi)
2833{
2834        vq_callback_t **callbacks;
2835        struct virtqueue **vqs;
2836        int ret = -ENOMEM;
2837        int i, total_vqs;
2838        const char **names;
2839        bool *ctx;
2840
2841        /* We expect 1 RX virtqueue followed by 1 TX virtqueue, followed by
2842         * up to N-1 more RX/TX queue pairs when multiqueue is used, followed
2843         * by an optional control vq.
2844         */
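            /* For example, with 2 queue pairs plus a control vq the expected
             * order is vqs[0]=rx0, vqs[1]=tx0, vqs[2]=rx1, vqs[3]=tx1,
             * vqs[4]=ctrl, assuming the rxq2vq()/txq2vq() helpers defined
             * earlier map queue pair i to indices 2*i and 2*i + 1.
             */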
2845        total_vqs = vi->max_queue_pairs * 2 +
2846                    virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ);
2847
2848        /* Allocate space for find_vqs parameters */
2849        vqs = kcalloc(total_vqs, sizeof(*vqs), GFP_KERNEL);
2850        if (!vqs)
2851                goto err_vq;
2852        callbacks = kmalloc_array(total_vqs, sizeof(*callbacks), GFP_KERNEL);
2853        if (!callbacks)
2854                goto err_callback;
2855        names = kmalloc_array(total_vqs, sizeof(*names), GFP_KERNEL);
2856        if (!names)
2857                goto err_names;
2858        if (!vi->big_packets || vi->mergeable_rx_bufs) {
2859                ctx = kcalloc(total_vqs, sizeof(*ctx), GFP_KERNEL);
2860                if (!ctx)
2861                        goto err_ctx;
2862        } else {
2863                ctx = NULL;
2864        }
2865
2866        /* Parameters for control virtqueue, if any */
2867        if (vi->has_cvq) {
2868                callbacks[total_vqs - 1] = NULL;
2869                names[total_vqs - 1] = "control";
2870        }
2871
2872        /* Allocate/initialize parameters for send/receive virtqueues */
2873        for (i = 0; i < vi->max_queue_pairs; i++) {
2874                callbacks[rxq2vq(i)] = skb_recv_done;
2875                callbacks[txq2vq(i)] = skb_xmit_done;
2876                sprintf(vi->rq[i].name, "input.%d", i);
2877                sprintf(vi->sq[i].name, "output.%d", i);
2878                names[rxq2vq(i)] = vi->rq[i].name;
2879                names[txq2vq(i)] = vi->sq[i].name;
2880                if (ctx)
2881                        ctx[rxq2vq(i)] = true;
2882        }
2883
2884        ret = virtio_find_vqs_ctx(vi->vdev, total_vqs, vqs, callbacks,
2885                                  names, ctx, NULL);
2886        if (ret)
2887                goto err_find;
2888
2889        if (vi->has_cvq) {
2890                vi->cvq = vqs[total_vqs - 1];
2891                if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN))
2892                        vi->dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
2893        }
2894
2895        for (i = 0; i < vi->max_queue_pairs; i++) {
2896                vi->rq[i].vq = vqs[rxq2vq(i)];
2897                vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
2898                vi->sq[i].vq = vqs[txq2vq(i)];
2899        }
2900
2901        /* Success: fall through and free the temporary arrays with ret == 0. */
2902
2903
2904err_find:
2905        kfree(ctx);
2906err_ctx:
2907        kfree(names);
2908err_names:
2909        kfree(callbacks);
2910err_callback:
2911        kfree(vqs);
2912err_vq:
2913        return ret;
2914}
2915
2916static int virtnet_alloc_queues(struct virtnet_info *vi)
2917{
2918        int i;
2919
2920        if (vi->has_cvq) {
2921                vi->ctrl = kzalloc(sizeof(*vi->ctrl), GFP_KERNEL);
2922                if (!vi->ctrl)
2923                        goto err_ctrl;
2924        } else {
2925                vi->ctrl = NULL;
2926        }
2927        vi->sq = kcalloc(vi->max_queue_pairs, sizeof(*vi->sq), GFP_KERNEL);
2928        if (!vi->sq)
2929                goto err_sq;
2930        vi->rq = kcalloc(vi->max_queue_pairs, sizeof(*vi->rq), GFP_KERNEL);
2931        if (!vi->rq)
2932                goto err_rq;
2933
2934        INIT_DELAYED_WORK(&vi->refill, refill_work);
2935        for (i = 0; i < vi->max_queue_pairs; i++) {
2936                vi->rq[i].pages = NULL;
2937                netif_napi_add(vi->dev, &vi->rq[i].napi, virtnet_poll,
2938                               napi_weight);
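                /* A TX NAPI weight of 0 (napi_tx module parameter off) keeps
                 * the instance registered but effectively disabled.
                 */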
2939                netif_tx_napi_add(vi->dev, &vi->sq[i].napi, virtnet_poll_tx,
2940                                  napi_tx ? napi_weight : 0);
2941
2942                sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
2943                ewma_pkt_len_init(&vi->rq[i].mrg_avg_pkt_len);
2944                sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));
2945
2946                u64_stats_init(&vi->rq[i].stats.syncp);
2947                u64_stats_init(&vi->sq[i].stats.syncp);
2948        }
2949
2950        return 0;
2951
2952err_rq:
2953        kfree(vi->sq);
2954err_sq:
2955        kfree(vi->ctrl);
2956err_ctrl:
2957        return -ENOMEM;
2958}
2959
2960static int init_vqs(struct virtnet_info *vi)
2961{
2962        int ret;
2963
2964        /* Allocate send & receive queues */
2965        ret = virtnet_alloc_queues(vi);
2966        if (ret)
2967                goto err;
2968
2969        ret = virtnet_find_vqs(vi);
2970        if (ret)
2971                goto err_free;
2972
2973        get_online_cpus();
2974        virtnet_set_affinity(vi);
2975        put_online_cpus();
2976
2977        return 0;
2978
2979err_free:
2980        virtnet_free_queues(vi);
2981err:
2982        return ret;
2983}
2984
2985#ifdef CONFIG_SYSFS
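/* Report, per RX queue, the buffer size that would be used for the next ring
 * refill, based on the EWMA of received packet lengths.
 */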
2986static ssize_t mergeable_rx_buffer_size_show(struct netdev_rx_queue *queue,
2987                char *buf)
2988{
2989        struct virtnet_info *vi = netdev_priv(queue->dev);
2990        unsigned int queue_index = get_netdev_rx_queue_index(queue);
2991        unsigned int headroom = virtnet_get_headroom(vi);
2992        unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
2993        struct ewma_pkt_len *avg;
2994
2995        BUG_ON(queue_index >= vi->max_queue_pairs);
2996        avg = &vi->rq[queue_index].mrg_avg_pkt_len;
2997        return sprintf(buf, "%u\n",
2998                       get_mergeable_buf_len(&vi->rq[queue_index], avg,
2999                                       SKB_DATA_ALIGN(headroom + tailroom)));
3000}
3001
3002static struct rx_queue_attribute mergeable_rx_buffer_size_attribute =
3003        __ATTR_RO(mergeable_rx_buffer_size);
3004
3005static struct attribute *virtio_net_mrg_rx_attrs[] = {
3006        &mergeable_rx_buffer_size_attribute.attr,
3007        NULL
3008};
3009
3010static const struct attribute_group virtio_net_mrg_rx_group = {
3011        .name = "virtio_net",
3012        .attrs = virtio_net_mrg_rx_attrs
3013};
3014#endif
3015
3016static bool virtnet_fail_on_feature(struct virtio_device *vdev,
3017                                    unsigned int fbit,
3018                                    const char *fname, const char *dname)
3019{
3020        if (!virtio_has_feature(vdev, fbit))
3021                return false;
3022
3023        dev_err(&vdev->dev, "device advertises feature %s but not %s\n",
3024                fname, dname);
3025
3026        return true;
3027}
3028
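/* Stringify the feature bit macro name for the error message. */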
3029#define VIRTNET_FAIL_ON(vdev, fbit, dbit)                       \
3030        virtnet_fail_on_feature(vdev, fbit, #fbit, dbit)
3031
3032static bool virtnet_validate_features(struct virtio_device *vdev)
3033{
3034        if (!virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) &&
3035            (VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_RX,
3036                             "VIRTIO_NET_F_CTRL_VQ") ||
3037             VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_VLAN,
3038                             "VIRTIO_NET_F_CTRL_VQ") ||
3039             VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_GUEST_ANNOUNCE,
3040                             "VIRTIO_NET_F_CTRL_VQ") ||
3041             VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_MQ, "VIRTIO_NET_F_CTRL_VQ") ||
3042             VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR,
3043                             "VIRTIO_NET_F_CTRL_VQ"))) {
3044                return false;
3045        }
3046
3047        return true;
3048}
3049
3050#define MIN_MTU ETH_MIN_MTU
3051#define MAX_MTU ETH_MAX_MTU
3052
3053static int virtnet_validate(struct virtio_device *vdev)
3054{
3055        if (!vdev->config->get) {
3056                dev_err(&vdev->dev, "%s failure: config access disabled\n",
3057                        __func__);
3058                return -EINVAL;
3059        }
3060
3061        if (!virtnet_validate_features(vdev))
3062                return -EINVAL;
3063
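        /* A device MTU below the legal minimum clears the feature bit rather
         * than failing validation, so the device still comes up with the
         * default MTU.
         */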
3064        if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
3065                int mtu = virtio_cread16(vdev,
3066                                         offsetof(struct virtio_net_config,
3067                                                  mtu));
3068                if (mtu < MIN_MTU)
3069                        __virtio_clear_bit(vdev, VIRTIO_NET_F_MTU);
3070        }
3071
3072        return 0;
3073}
3074
3075static int virtnet_probe(struct virtio_device *vdev)
3076{
3077        int i, err = -ENOMEM;
3078        struct net_device *dev;
3079        struct virtnet_info *vi;
3080        u16 max_queue_pairs;
3081        int mtu;
3082
3083        /* Find out whether the host supports a multiqueue virtio_net device */
3084        err = virtio_cread_feature(vdev, VIRTIO_NET_F_MQ,
3085                                   struct virtio_net_config,
3086                                   max_virtqueue_pairs, &max_queue_pairs);
3087
3088        /* Fall back to a single queue pair if the multiqueue configuration is missing or invalid */
3089        if (err || max_queue_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
3090            max_queue_pairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX ||
3091            !virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
3092                max_queue_pairs = 1;
3093
3094        /* Allocate ourselves a network device with room for our info */
3095        dev = alloc_etherdev_mq(sizeof(struct virtnet_info), max_queue_pairs);
3096        if (!dev)
3097                return -ENOMEM;
3098
3099        /* Set up network device as normal. */
3100        dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE |
3101                           IFF_TX_SKB_NO_LINEAR;
3102        dev->netdev_ops = &virtnet_netdev;
3103        dev->features = NETIF_F_HIGHDMA;
3104
3105        dev->ethtool_ops = &virtnet_ethtool_ops;
3106        SET_NETDEV_DEV(dev, &vdev->dev);
3107
3108        /* Do we support "hardware" checksums? */
3109        if (virtio_has_feature(vdev, VIRTIO_NET_F_CSUM)) {
3110                /* This opens up the world of extra features. */
3111                dev->hw_features |= NETIF_F_HW_CSUM | NETIF_F_SG;
3112                if (csum)
3113                        dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG;
3114
3115                if (virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) {
3116                        dev->hw_features |= NETIF_F_TSO
3117                                | NETIF_F_TSO_ECN | NETIF_F_TSO6;
3118                }
3119                /* Individual feature bits: what can the host handle? */
3120                if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO4))
3121                        dev->hw_features |= NETIF_F_TSO;
3122                if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO6))
3123                        dev->hw_features |= NETIF_F_TSO6;
3124                if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_ECN))
3125                        dev->hw_features |= NETIF_F_TSO_ECN;
3126
3127                dev->features |= NETIF_F_GSO_ROBUST;
3128
3129                if (gso)
3130                        dev->features |= dev->hw_features & NETIF_F_ALL_TSO;
3131                /* (!csum && gso) case will be fixed by register_netdev() */
3132        }
3133        if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_CSUM))
3134                dev->features |= NETIF_F_RXCSUM;
3135        if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
3136            virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6))
3137                dev->features |= NETIF_F_GRO_HW;
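        /* NETIF_F_GRO_HW is user-togglable (hw_features) only when guest
         * offloads can be changed at runtime via the control vq.
         */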
3138        if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS))
3139                dev->hw_features |= NETIF_F_GRO_HW;
3140
3141        dev->vlan_features = dev->features;
3142
3143        /* MTU range: 68 - 65535 */
3144        dev->min_mtu = MIN_MTU;
3145        dev->max_mtu = MAX_MTU;
3146
3147        /* Configuration may specify what MAC to use.  Otherwise random. */
3148        if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC))
3149                virtio_cread_bytes(vdev,
3150                                   offsetof(struct virtio_net_config, mac),
3151                                   dev->dev_addr, dev->addr_len);
3152        else
3153                eth_hw_addr_random(dev);
3154
3155        /* Set up our device-specific information */
3156        vi = netdev_priv(dev);
3157        vi->dev = dev;
3158        vi->vdev = vdev;
3159        vdev->priv = vi;
3160
3161        INIT_WORK(&vi->config_work, virtnet_config_changed_work);
3162
3163        /* If we can receive ANY GSO packets, we must allocate large ones. */
3164        if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
3165            virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6) ||
3166            virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_ECN) ||
3167            virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_UFO))
3168                vi->big_packets = true;
3169
3170        if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
3171                vi->mergeable_rx_bufs = true;
3172
3173        if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF) ||
3174            virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
3175                vi->hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
3176        else
3177                vi->hdr_len = sizeof(struct virtio_net_hdr);
3178
3179        if (virtio_has_feature(vdev, VIRTIO_F_ANY_LAYOUT) ||
3180            virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
3181                vi->any_header_sg = true;
3182
3183        if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
3184                vi->has_cvq = true;
3185
3186        if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
3187                mtu = virtio_cread16(vdev,
3188                                     offsetof(struct virtio_net_config,
3189                                              mtu));
3190                if (mtu < dev->min_mtu) {
3191                        /* Should never trigger: MTU was previously validated
3192                         * in virtnet_validate.
3193                         */
3194                        dev_err(&vdev->dev,
3195                                "device MTU appears to have changed: it is now %d < %d\n",
3196                                mtu, dev->min_mtu);
3197                        err = -EINVAL;
3198                        goto free;
3199                }
3200
3201                dev->mtu = mtu;
3202                dev->max_mtu = mtu;
3203
3204                /* TODO: size buffers correctly in this case. */
3205                if (dev->mtu > ETH_DATA_LEN)
3206                        vi->big_packets = true;
3207        }
3208
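        /* With any_header_sg the virtio-net header can be pushed into the skb
         * headroom instead of being sent as a separate sg entry, so reserve
         * room for it.
         */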
3209        if (vi->any_header_sg)
3210                dev->needed_headroom = vi->hdr_len;
3211
3212        /* Enable multiqueue by default: use up to one queue pair per online CPU */
3213        if (num_online_cpus() >= max_queue_pairs)
3214                vi->curr_queue_pairs = max_queue_pairs;
3215        else
3216                vi->curr_queue_pairs = num_online_cpus();
3217        vi->max_queue_pairs = max_queue_pairs;
3218
3219        /* Allocate/initialize the rx/tx queues, and invoke find_vqs */
3220        err = init_vqs(vi);
3221        if (err)
3222                goto free;
3223
3224#ifdef CONFIG_SYSFS
3225        if (vi->mergeable_rx_bufs)
3226                dev->sysfs_rx_queue_group = &virtio_net_mrg_rx_group;
3227#endif
3228        netif_set_real_num_tx_queues(dev, vi->curr_queue_pairs);
3229        netif_set_real_num_rx_queues(dev, vi->curr_queue_pairs);
3230
3231        virtnet_init_settings(dev);
3232
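        /* With VIRTIO_NET_F_STANDBY this device acts as the standby half of a
         * net_failover pair; a passthrough device with the same MAC address
         * can then serve as the primary datapath.
         */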
3233        if (virtio_has_feature(vdev, VIRTIO_NET_F_STANDBY)) {
3234                vi->failover = net_failover_create(vi->dev);
3235                if (IS_ERR(vi->failover)) {
3236                        err = PTR_ERR(vi->failover);
3237                        goto free_vqs;
3238                }
3239        }
3240
3241        err = register_netdev(dev);
3242        if (err) {
3243                pr_debug("virtio_net: registering device failed\n");
3244                goto free_failover;
3245        }
3246
3247        virtio_device_ready(vdev);
3248
3249        err = virtnet_cpu_notif_add(vi);
3250        if (err) {
3251                pr_debug("virtio_net: registering cpu notifier failed\n");
3252                goto free_unregister_netdev;
3253        }
3254
3255        virtnet_set_queues(vi, vi->curr_queue_pairs);
3256
3257        /* Assume link up if device can't report link status,
3258         * otherwise get link status from config. */
3259        netif_carrier_off(dev);
3260        if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
3261                schedule_work(&vi->config_work);
3262        } else {
3263                vi->status = VIRTIO_NET_S_LINK_UP;
3264                virtnet_update_settings(vi);
3265                netif_carrier_on(dev);
3266        }
3267
3268        for (i = 0; i < ARRAY_SIZE(guest_offloads); i++)
3269                if (virtio_has_feature(vi->vdev, guest_offloads[i]))
3270                        set_bit(guest_offloads[i], &vi->guest_offloads);
3271        vi->guest_offloads_capable = vi->guest_offloads;
3272
3273        pr_debug("virtnet: registered device %s with %d RX and TX vqs\n",
3274                 dev->name, max_queue_pairs);
3275
3276        return 0;
3277
3278free_unregister_netdev:
3279        vi->vdev->config->reset(vdev);
3280
3281        unregister_netdev(dev);
3282free_failover:
3283        net_failover_destroy(vi->failover);
3284free_vqs:
3285        cancel_delayed_work_sync(&vi->refill);
3286        free_receive_page_frags(vi);
3287        virtnet_del_vqs(vi);
3288free:
3289        free_netdev(dev);
3290        return err;
3291}
3292
3293static void remove_vq_common(struct virtnet_info *vi)
3294{
3295        vi->vdev->config->reset(vi->vdev);
3296
3297        /* Free unused buffers in both send and recv, if any. */
3298        free_unused_bufs(vi);
3299
3300        free_receive_bufs(vi);
3301
3302        free_receive_page_frags(vi);
3303
3304        virtnet_del_vqs(vi);
3305}
3306
3307static void virtnet_remove(struct virtio_device *vdev)
3308{
3309        struct virtnet_info *vi = vdev->priv;
3310
3311        virtnet_cpu_notif_remove(vi);
3312
3313        /* Make sure no work handler is accessing the device. */
3314        flush_work(&vi->config_work);
3315
3316        unregister_netdev(vi->dev);
3317
3318        net_failover_destroy(vi->failover);
3319
3320        remove_vq_common(vi);
3321
3322        free_netdev(vi->dev);
3323}
3324
3325static __maybe_unused int virtnet_freeze(struct virtio_device *vdev)
3326{
3327        struct virtnet_info *vi = vdev->priv;
3328
3329        virtnet_cpu_notif_remove(vi);
3330        virtnet_freeze_down(vdev);
3331        remove_vq_common(vi);
3332
3333        return 0;
3334}
3335
3336static __maybe_unused int virtnet_restore(struct virtio_device *vdev)
3337{
3338        struct virtnet_info *vi = vdev->priv;
3339        int err;
3340
3341        err = virtnet_restore_up(vdev);
3342        if (err)
3343                return err;
3344        virtnet_set_queues(vi, vi->curr_queue_pairs);
3345
3346        err = virtnet_cpu_notif_add(vi);
3347        if (err) {
3348                virtnet_freeze_down(vdev);
3349                remove_vq_common(vi);
3350                return err;
3351        }
3352
3353        return 0;
3354}
3355
3356static struct virtio_device_id id_table[] = {
3357        { VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
3358        { 0 },
3359};
3360
3361#define VIRTNET_FEATURES \
3362        VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, \
3363        VIRTIO_NET_F_MAC, \
3364        VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6, \
3365        VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, \
3366        VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO, \
3367        VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, \
3368        VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, \
3369        VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \
3370        VIRTIO_NET_F_CTRL_MAC_ADDR, \
3371        VIRTIO_NET_F_MTU, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, \
3372        VIRTIO_NET_F_SPEED_DUPLEX, VIRTIO_NET_F_STANDBY
3373
3374static unsigned int features[] = {
3375        VIRTNET_FEATURES,
3376};
3377
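/* Legacy (transitional) devices are additionally offered VIRTIO_NET_F_GSO and
 * VIRTIO_F_ANY_LAYOUT; neither is negotiated with VIRTIO_F_VERSION_1 devices.
 */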
3378static unsigned int features_legacy[] = {
3379        VIRTNET_FEATURES,
3380        VIRTIO_NET_F_GSO,
3381        VIRTIO_F_ANY_LAYOUT,
3382};
3383
3384static struct virtio_driver virtio_net_driver = {
3385        .feature_table = features,
3386        .feature_table_size = ARRAY_SIZE(features),
3387        .feature_table_legacy = features_legacy,
3388        .feature_table_size_legacy = ARRAY_SIZE(features_legacy),
3389        .driver.name =  KBUILD_MODNAME,
3390        .driver.owner = THIS_MODULE,
3391        .id_table =     id_table,
3392        .validate =     virtnet_validate,
3393        .probe =        virtnet_probe,
3394        .remove =       virtnet_remove,
3395        .config_changed = virtnet_config_changed,
3396#ifdef CONFIG_PM_SLEEP
3397        .freeze =       virtnet_freeze,
3398        .restore =      virtnet_restore,
3399#endif
3400};
3401
3402static __init int virtio_net_driver_init(void)
3403{
3404        int ret;
3405
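        /* CPUHP_AP_ONLINE_DYN allocates a dynamic hotplug state; the returned
         * state id is kept in virtionet_online so it can be removed again on
         * error or at module exit.
         */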
3406        ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "virtio/net:online",
3407                                      virtnet_cpu_online,
3408                                      virtnet_cpu_down_prep);
3409        if (ret < 0)
3410                goto out;
3411        virtionet_online = ret;
3412        ret = cpuhp_setup_state_multi(CPUHP_VIRT_NET_DEAD, "virtio/net:dead",
3413                                      NULL, virtnet_cpu_dead);
3414        if (ret)
3415                goto err_dead;
3416
3417        ret = register_virtio_driver(&virtio_net_driver);
3418        if (ret)
3419                goto err_virtio;
3420        return 0;
3421err_virtio:
3422        cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
3423err_dead:
3424        cpuhp_remove_multi_state(virtionet_online);
3425out:
3426        return ret;
3427}
3428module_init(virtio_net_driver_init);
3429
3430static __exit void virtio_net_driver_exit(void)
3431{
3432        unregister_virtio_driver(&virtio_net_driver);
3433        cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
3434        cpuhp_remove_multi_state(virtionet_online);
3435}
3436module_exit(virtio_net_driver_exit);
3437
3438MODULE_DEVICE_TABLE(virtio, id_table);
3439MODULE_DESCRIPTION("Virtio network driver");
3440MODULE_LICENSE("GPL");
3441