dpdk/lib/vhost/virtio_net.c
   1/* SPDX-License-Identifier: BSD-3-Clause
   2 * Copyright(c) 2010-2016 Intel Corporation
   3 */
   4
   5#include <stdint.h>
   6#include <stdbool.h>
   7#include <linux/virtio_net.h>
   8
   9#include <rte_mbuf.h>
  10#include <rte_memcpy.h>
  11#include <rte_net.h>
  12#include <rte_ether.h>
  13#include <rte_ip.h>
  14#include <rte_dmadev.h>
  15#include <rte_vhost.h>
  16#include <rte_tcp.h>
  17#include <rte_udp.h>
  18#include <rte_sctp.h>
  19#include <rte_arp.h>
  20#include <rte_spinlock.h>
  21#include <rte_malloc.h>
  22#include <rte_vhost_async.h>
  23
  24#include "iotlb.h"
  25#include "vhost.h"
  26
  27#define MAX_BATCH_LEN 256
  28
  29static __rte_always_inline uint16_t
  30async_poll_dequeue_completed(struct virtio_net *dev, struct vhost_virtqueue *vq,
  31                struct rte_mbuf **pkts, uint16_t count, int16_t dma_id,
  32                uint16_t vchan_id, bool legacy_ol_flags);
  33
  34/* DMA device copy operation tracking array. */
  35struct async_dma_info dma_copy_track[RTE_DMADEV_DEFAULT_MAX];
  36
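     /* True when the guest has negotiated mergeable RX buffers (VIRTIO_NET_F_MRG_RXBUF). */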
   37static __rte_always_inline bool
  38rxvq_is_mergeable(struct virtio_net *dev)
  39{
  40        return dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF);
  41}
  42
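     /* True when the guest has negotiated in-order descriptor usage (VIRTIO_F_IN_ORDER). */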
   43static __rte_always_inline bool
  44virtio_net_is_inorder(struct virtio_net *dev)
  45{
  46        return dev->features & (1ULL << VIRTIO_F_IN_ORDER);
  47}
  48
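     /*
      * A ring index is valid when it is below nr_vring and its parity
      * matches the requested direction (is_tx set selects odd indexes).
      */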
  49static bool
  50is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring)
  51{
  52        return (is_tx ^ (idx & 1)) == 0 && idx < nr_vring;
  53}
  54
  55/*
  56 * This function must be called with virtqueue's access_lock taken.
  57 */
  58static inline void
  59vhost_queue_stats_update(struct virtio_net *dev, struct vhost_virtqueue *vq,
  60                struct rte_mbuf **pkts, uint16_t count)
  61{
  62        struct virtqueue_stats *stats = &vq->stats;
  63        int i;
  64
  65        if (!(dev->flags & VIRTIO_DEV_STATS_ENABLED))
  66                return;
  67
  68        for (i = 0; i < count; i++) {
  69                struct rte_ether_addr *ea;
  70                struct rte_mbuf *pkt = pkts[i];
  71                uint32_t pkt_len = rte_pktmbuf_pkt_len(pkt);
  72
  73                stats->packets++;
  74                stats->bytes += pkt_len;
  75
  76                if (pkt_len == 64) {
  77                        stats->size_bins[1]++;
  78                } else if (pkt_len > 64 && pkt_len < 1024) {
  79                        uint32_t bin;
  80
   81                        /* count leading zeros to map the length to its power-of-two bin */
  82                        bin = (sizeof(pkt_len) * 8) - __builtin_clz(pkt_len) - 5;
  83                        stats->size_bins[bin]++;
  84                } else {
  85                        if (pkt_len < 64)
  86                                stats->size_bins[0]++;
  87                        else if (pkt_len < 1519)
  88                                stats->size_bins[6]++;
  89                        else
  90                                stats->size_bins[7]++;
  91                }
  92
  93                ea = rte_pktmbuf_mtod(pkt, struct rte_ether_addr *);
  94                if (rte_is_multicast_ether_addr(ea)) {
  95                        if (rte_is_broadcast_ether_addr(ea))
  96                                stats->broadcast++;
  97                        else
  98                                stats->multicast++;
  99                }
 100        }
 101}
 102
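     /*
      * Enqueue all segments of one packet on the given DMA vChannel and
      * store the packet's completion flag address in the slot of the last
      * copy. Returns the number of segments enqueued, or -1 on failure.
      */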
 103static __rte_always_inline int64_t
 104vhost_async_dma_transfer_one(struct virtio_net *dev, struct vhost_virtqueue *vq,
 105                int16_t dma_id, uint16_t vchan_id, uint16_t flag_idx,
 106                struct vhost_iov_iter *pkt)
 107{
 108        struct async_dma_vchan_info *dma_info = &dma_copy_track[dma_id].vchans[vchan_id];
 109        uint16_t ring_mask = dma_info->ring_mask;
 110        static bool vhost_async_dma_copy_log;
 111
 112
 113        struct vhost_iovec *iov = pkt->iov;
 114        int copy_idx = 0;
 115        uint32_t nr_segs = pkt->nr_segs;
 116        uint16_t i;
 117
 118        if (rte_dma_burst_capacity(dma_id, vchan_id) < nr_segs)
 119                return -1;
 120
 121        for (i = 0; i < nr_segs; i++) {
 122                copy_idx = rte_dma_copy(dma_id, vchan_id, (rte_iova_t)iov[i].src_addr,
 123                                (rte_iova_t)iov[i].dst_addr, iov[i].len, RTE_DMA_OP_FLAG_LLC);
  124                /**
  125                 * Since all memory is pinned and the DMA vChannel
  126                 * ring has enough space, failure should be rare.
  127                 * If a failure does happen, it means the DMA device
  128                 * has encountered a serious error; in that case,
  129                 * stop the async data path and check what has
  130                 * happened to the DMA device.
  131                 */
 132                if (unlikely(copy_idx < 0)) {
 133                        if (!vhost_async_dma_copy_log) {
 134                                VHOST_LOG_DATA(dev->ifname, ERR,
 135                                        "DMA copy failed for channel %d:%u\n",
 136                                        dma_id, vchan_id);
 137                                vhost_async_dma_copy_log = true;
 138                        }
 139                        return -1;
 140                }
 141        }
 142
  143        /**
  144         * Only store the packet completion flag address in the last
  145         * copy's slot; the other slots are left as NULL.
  146         */
 147        dma_info->pkts_cmpl_flag_addr[copy_idx & ring_mask] = &vq->async->pkts_cmpl_flag[flag_idx];
 148
 149        return nr_segs;
 150}
 151
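     /*
      * Enqueue a burst of packet copies on the DMA vChannel and submit
      * them. Returns the number of packets fully enqueued.
      */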
 152static __rte_always_inline uint16_t
 153vhost_async_dma_transfer(struct virtio_net *dev, struct vhost_virtqueue *vq,
 154                int16_t dma_id, uint16_t vchan_id, uint16_t head_idx,
 155                struct vhost_iov_iter *pkts, uint16_t nr_pkts)
 156{
 157        struct async_dma_vchan_info *dma_info = &dma_copy_track[dma_id].vchans[vchan_id];
 158        int64_t ret, nr_copies = 0;
 159        uint16_t pkt_idx;
 160
 161        rte_spinlock_lock(&dma_info->dma_lock);
 162
 163        for (pkt_idx = 0; pkt_idx < nr_pkts; pkt_idx++) {
 164                ret = vhost_async_dma_transfer_one(dev, vq, dma_id, vchan_id, head_idx,
 165                                &pkts[pkt_idx]);
 166                if (unlikely(ret < 0))
 167                        break;
 168
 169                nr_copies += ret;
 170                head_idx++;
 171                if (head_idx >= vq->size)
 172                        head_idx -= vq->size;
 173        }
 174
 175        if (likely(nr_copies > 0))
 176                rte_dma_submit(dma_id, vchan_id);
 177
 178        rte_spinlock_unlock(&dma_info->dma_lock);
 179
 180        return pkt_idx;
 181}
 182
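     /*
      * Poll the DMA vChannel for finished operations and mark the
      * corresponding packets' completion flags. Returns the number of
      * completed copy operations (not packets).
      */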
 183static __rte_always_inline uint16_t
 184vhost_async_dma_check_completed(struct virtio_net *dev, int16_t dma_id, uint16_t vchan_id,
 185                uint16_t max_pkts)
 186{
 187        struct async_dma_vchan_info *dma_info = &dma_copy_track[dma_id].vchans[vchan_id];
 188        uint16_t ring_mask = dma_info->ring_mask;
 189        uint16_t last_idx = 0;
 190        uint16_t nr_copies;
 191        uint16_t copy_idx;
 192        uint16_t i;
 193        bool has_error = false;
 194        static bool vhost_async_dma_complete_log;
 195
 196        rte_spinlock_lock(&dma_info->dma_lock);
 197
  198        /**
  199         * If the DMA device reports an error during the transfer, only
  200         * log it for debugging; errors are not handled at the vhost level.
  201         */
 202        nr_copies = rte_dma_completed(dma_id, vchan_id, max_pkts, &last_idx, &has_error);
 203        if (unlikely(!vhost_async_dma_complete_log && has_error)) {
 204                VHOST_LOG_DATA(dev->ifname, ERR,
 205                        "DMA completion failure on channel %d:%u\n",
 206                        dma_id, vchan_id);
 207                vhost_async_dma_complete_log = true;
 208        } else if (nr_copies == 0) {
 209                goto out;
 210        }
 211
 212        copy_idx = last_idx - nr_copies + 1;
 213        for (i = 0; i < nr_copies; i++) {
 214                bool *flag;
 215
 216                flag = dma_info->pkts_cmpl_flag_addr[copy_idx & ring_mask];
 217                if (flag) {
  218                        /**
  219                         * Mark the packet flag as received. The flag
  220                         * could belong to another virtqueue, but the
  221                         * write is atomic.
  222                         */
 223                        *flag = true;
 224                        dma_info->pkts_cmpl_flag_addr[copy_idx & ring_mask] = NULL;
 225                }
 226                copy_idx++;
 227        }
 228
 229out:
 230        rte_spinlock_unlock(&dma_info->dma_lock);
 231        return nr_copies;
 232}
 233
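     /* Flush the batched small copies of the enqueue path, logging each write for live migration. */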
 234static inline void
 235do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
 236{
 237        struct batch_copy_elem *elem = vq->batch_copy_elems;
 238        uint16_t count = vq->batch_copy_nb_elems;
 239        int i;
 240
 241        for (i = 0; i < count; i++) {
 242                rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
 243                vhost_log_cache_write_iova(dev, vq, elem[i].log_addr,
 244                                           elem[i].len);
 245                PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
 246        }
 247
 248        vq->batch_copy_nb_elems = 0;
 249}
 250
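     /* Flush the batched small copies of the dequeue path (no dirty-page logging needed). */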
 251static inline void
 252do_data_copy_dequeue(struct vhost_virtqueue *vq)
 253{
 254        struct batch_copy_elem *elem = vq->batch_copy_elems;
 255        uint16_t count = vq->batch_copy_nb_elems;
 256        int i;
 257
 258        for (i = 0; i < count; i++)
 259                rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
 260
 261        vq->batch_copy_nb_elems = 0;
 262}
 263
 264static __rte_always_inline void
 265do_flush_shadow_used_ring_split(struct virtio_net *dev,
 266                        struct vhost_virtqueue *vq,
 267                        uint16_t to, uint16_t from, uint16_t size)
 268{
 269        rte_memcpy(&vq->used->ring[to],
 270                        &vq->shadow_used_split[from],
 271                        size * sizeof(struct vring_used_elem));
 272        vhost_log_cache_used_vring(dev, vq,
 273                        offsetof(struct vring_used, ring[to]),
 274                        size * sizeof(struct vring_used_elem));
 275}
 276
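     /*
      * Copy the shadow used entries into the split used ring, wrapping if
      * necessary, then publish the new used index with release semantics.
      */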
 277static __rte_always_inline void
 278flush_shadow_used_ring_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
 279{
 280        uint16_t used_idx = vq->last_used_idx & (vq->size - 1);
 281
 282        if (used_idx + vq->shadow_used_idx <= vq->size) {
 283                do_flush_shadow_used_ring_split(dev, vq, used_idx, 0,
 284                                          vq->shadow_used_idx);
 285        } else {
 286                uint16_t size;
 287
 288                /* update used ring interval [used_idx, vq->size] */
 289                size = vq->size - used_idx;
 290                do_flush_shadow_used_ring_split(dev, vq, used_idx, 0, size);
 291
  292                /* wrap around and update the remaining used ring interval [0, shadow_used_idx - size] */
 293                do_flush_shadow_used_ring_split(dev, vq, 0, size,
 294                                          vq->shadow_used_idx - size);
 295        }
 296        vq->last_used_idx += vq->shadow_used_idx;
 297
 298        vhost_log_cache_sync(dev, vq);
 299
 300        __atomic_add_fetch(&vq->used->idx, vq->shadow_used_idx,
 301                           __ATOMIC_RELEASE);
 302        vq->shadow_used_idx = 0;
 303        vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
 304                sizeof(vq->used->idx));
 305}
 306
 307static __rte_always_inline void
 308update_shadow_used_ring_split(struct vhost_virtqueue *vq,
 309                         uint16_t desc_idx, uint32_t len)
 310{
 311        uint16_t i = vq->shadow_used_idx++;
 312
 313        vq->shadow_used_split[i].id  = desc_idx;
 314        vq->shadow_used_split[i].len = len;
 315}
 316
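     /*
      * Write the shadow used entries back into the packed descriptor ring:
      * ids and lengths first, then the flags after a release fence, with
      * the head descriptor's flags stored last to make the whole batch
      * visible to the guest at once.
      */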
 317static __rte_always_inline void
 318vhost_flush_enqueue_shadow_packed(struct virtio_net *dev,
 319                                  struct vhost_virtqueue *vq)
 320{
 321        int i;
 322        uint16_t used_idx = vq->last_used_idx;
 323        uint16_t head_idx = vq->last_used_idx;
 324        uint16_t head_flags = 0;
 325
 326        /* Split loop in two to save memory barriers */
 327        for (i = 0; i < vq->shadow_used_idx; i++) {
 328                vq->desc_packed[used_idx].id = vq->shadow_used_packed[i].id;
 329                vq->desc_packed[used_idx].len = vq->shadow_used_packed[i].len;
 330
 331                used_idx += vq->shadow_used_packed[i].count;
 332                if (used_idx >= vq->size)
 333                        used_idx -= vq->size;
 334        }
 335
 336        /* The ordering for storing desc flags needs to be enforced. */
 337        rte_atomic_thread_fence(__ATOMIC_RELEASE);
 338
 339        for (i = 0; i < vq->shadow_used_idx; i++) {
 340                uint16_t flags;
 341
 342                if (vq->shadow_used_packed[i].len)
 343                        flags = VRING_DESC_F_WRITE;
 344                else
 345                        flags = 0;
 346
 347                if (vq->used_wrap_counter) {
 348                        flags |= VRING_DESC_F_USED;
 349                        flags |= VRING_DESC_F_AVAIL;
 350                } else {
 351                        flags &= ~VRING_DESC_F_USED;
 352                        flags &= ~VRING_DESC_F_AVAIL;
 353                }
 354
 355                if (i > 0) {
 356                        vq->desc_packed[vq->last_used_idx].flags = flags;
 357
 358                        vhost_log_cache_used_vring(dev, vq,
 359                                        vq->last_used_idx *
 360                                        sizeof(struct vring_packed_desc),
 361                                        sizeof(struct vring_packed_desc));
 362                } else {
 363                        head_idx = vq->last_used_idx;
 364                        head_flags = flags;
 365                }
 366
 367                vq_inc_last_used_packed(vq, vq->shadow_used_packed[i].count);
 368        }
 369
 370        vq->desc_packed[head_idx].flags = head_flags;
 371
 372        vhost_log_cache_used_vring(dev, vq,
 373                                head_idx *
 374                                sizeof(struct vring_packed_desc),
 375                                sizeof(struct vring_packed_desc));
 376
 377        vq->shadow_used_idx = 0;
 378        vhost_log_cache_sync(dev, vq);
 379}
 380
 381static __rte_always_inline void
 382vhost_flush_dequeue_shadow_packed(struct virtio_net *dev,
 383                                  struct vhost_virtqueue *vq)
 384{
 385        struct vring_used_elem_packed *used_elem = &vq->shadow_used_packed[0];
 386
 387        vq->desc_packed[vq->shadow_last_used_idx].id = used_elem->id;
  388        /* desc flags are the synchronization point for the virtio packed vring */
 389        __atomic_store_n(&vq->desc_packed[vq->shadow_last_used_idx].flags,
 390                         used_elem->flags, __ATOMIC_RELEASE);
 391
 392        vhost_log_cache_used_vring(dev, vq, vq->shadow_last_used_idx *
 393                                   sizeof(struct vring_packed_desc),
 394                                   sizeof(struct vring_packed_desc));
 395        vq->shadow_used_idx = 0;
 396        vhost_log_cache_sync(dev, vq);
 397}
 398
 399static __rte_always_inline void
 400vhost_flush_enqueue_batch_packed(struct virtio_net *dev,
 401                                 struct vhost_virtqueue *vq,
 402                                 uint64_t *lens,
 403                                 uint16_t *ids)
 404{
 405        uint16_t i;
 406        uint16_t flags;
 407        uint16_t last_used_idx;
 408        struct vring_packed_desc *desc_base;
 409
 410        last_used_idx = vq->last_used_idx;
 411        desc_base = &vq->desc_packed[last_used_idx];
 412
 413        flags = PACKED_DESC_ENQUEUE_USED_FLAG(vq->used_wrap_counter);
 414
 415        vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
 416                desc_base[i].id = ids[i];
 417                desc_base[i].len = lens[i];
 418        }
 419
 420        rte_atomic_thread_fence(__ATOMIC_RELEASE);
 421
 422        vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
 423                desc_base[i].flags = flags;
 424        }
 425
 426        vhost_log_cache_used_vring(dev, vq, last_used_idx *
 427                                   sizeof(struct vring_packed_desc),
 428                                   sizeof(struct vring_packed_desc) *
 429                                   PACKED_BATCH_SIZE);
 430        vhost_log_cache_sync(dev, vq);
 431
 432        vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE);
 433}
 434
 435static __rte_always_inline void
 436vhost_shadow_dequeue_batch_packed_inorder(struct vhost_virtqueue *vq,
 437                                          uint16_t id)
 438{
 439        vq->shadow_used_packed[0].id = id;
 440
 441        if (!vq->shadow_used_idx) {
 442                vq->shadow_last_used_idx = vq->last_used_idx;
 443                vq->shadow_used_packed[0].flags =
 444                        PACKED_DESC_DEQUEUE_USED_FLAG(vq->used_wrap_counter);
 445                vq->shadow_used_packed[0].len = 0;
 446                vq->shadow_used_packed[0].count = 1;
 447                vq->shadow_used_idx++;
 448        }
 449
 450        vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE);
 451}
 452
 453static __rte_always_inline void
 454vhost_shadow_dequeue_batch_packed(struct virtio_net *dev,
 455                                  struct vhost_virtqueue *vq,
 456                                  uint16_t *ids)
 457{
 458        uint16_t flags;
 459        uint16_t i;
 460        uint16_t begin;
 461
 462        flags = PACKED_DESC_DEQUEUE_USED_FLAG(vq->used_wrap_counter);
 463
 464        if (!vq->shadow_used_idx) {
 465                vq->shadow_last_used_idx = vq->last_used_idx;
 466                vq->shadow_used_packed[0].id  = ids[0];
 467                vq->shadow_used_packed[0].len = 0;
 468                vq->shadow_used_packed[0].count = 1;
 469                vq->shadow_used_packed[0].flags = flags;
 470                vq->shadow_used_idx++;
 471                begin = 1;
 472        } else
 473                begin = 0;
 474
 475        vhost_for_each_try_unroll(i, begin, PACKED_BATCH_SIZE) {
 476                vq->desc_packed[vq->last_used_idx + i].id = ids[i];
 477                vq->desc_packed[vq->last_used_idx + i].len = 0;
 478        }
 479
 480        rte_atomic_thread_fence(__ATOMIC_RELEASE);
 481        vhost_for_each_try_unroll(i, begin, PACKED_BATCH_SIZE)
 482                vq->desc_packed[vq->last_used_idx + i].flags = flags;
 483
 484        vhost_log_cache_used_vring(dev, vq, vq->last_used_idx *
 485                                   sizeof(struct vring_packed_desc),
 486                                   sizeof(struct vring_packed_desc) *
 487                                   PACKED_BATCH_SIZE);
 488        vhost_log_cache_sync(dev, vq);
 489
 490        vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE);
 491}
 492
 493static __rte_always_inline void
 494vhost_shadow_dequeue_single_packed(struct vhost_virtqueue *vq,
 495                                   uint16_t buf_id,
 496                                   uint16_t count)
 497{
 498        uint16_t flags;
 499
 500        flags = vq->desc_packed[vq->last_used_idx].flags;
 501        if (vq->used_wrap_counter) {
 502                flags |= VRING_DESC_F_USED;
 503                flags |= VRING_DESC_F_AVAIL;
 504        } else {
 505                flags &= ~VRING_DESC_F_USED;
 506                flags &= ~VRING_DESC_F_AVAIL;
 507        }
 508
 509        if (!vq->shadow_used_idx) {
 510                vq->shadow_last_used_idx = vq->last_used_idx;
 511
 512                vq->shadow_used_packed[0].id  = buf_id;
 513                vq->shadow_used_packed[0].len = 0;
 514                vq->shadow_used_packed[0].flags = flags;
 515                vq->shadow_used_idx++;
 516        } else {
 517                vq->desc_packed[vq->last_used_idx].id = buf_id;
 518                vq->desc_packed[vq->last_used_idx].len = 0;
 519                vq->desc_packed[vq->last_used_idx].flags = flags;
 520        }
 521
 522        vq_inc_last_used_packed(vq, count);
 523}
 524
 525static __rte_always_inline void
 526vhost_shadow_dequeue_single_packed_inorder(struct vhost_virtqueue *vq,
 527                                           uint16_t buf_id,
 528                                           uint16_t count)
 529{
 530        uint16_t flags;
 531
 532        vq->shadow_used_packed[0].id = buf_id;
 533
 534        flags = vq->desc_packed[vq->last_used_idx].flags;
 535        if (vq->used_wrap_counter) {
 536                flags |= VRING_DESC_F_USED;
 537                flags |= VRING_DESC_F_AVAIL;
 538        } else {
 539                flags &= ~VRING_DESC_F_USED;
 540                flags &= ~VRING_DESC_F_AVAIL;
 541        }
 542
 543        if (!vq->shadow_used_idx) {
 544                vq->shadow_last_used_idx = vq->last_used_idx;
 545                vq->shadow_used_packed[0].len = 0;
 546                vq->shadow_used_packed[0].flags = flags;
 547                vq->shadow_used_idx++;
 548        }
 549
 550        vq_inc_last_used_packed(vq, count);
 551}
 552
 553static __rte_always_inline void
 554vhost_shadow_enqueue_packed(struct vhost_virtqueue *vq,
 555                                   uint32_t *len,
 556                                   uint16_t *id,
 557                                   uint16_t *count,
 558                                   uint16_t num_buffers)
 559{
 560        uint16_t i;
 561
 562        for (i = 0; i < num_buffers; i++) {
  563                /* keep the shadow flush trigger aligned with the packed batch boundary */
 564                if (!vq->shadow_used_idx)
 565                        vq->shadow_aligned_idx = vq->last_used_idx &
 566                                PACKED_BATCH_MASK;
 567                vq->shadow_used_packed[vq->shadow_used_idx].id  = id[i];
 568                vq->shadow_used_packed[vq->shadow_used_idx].len = len[i];
 569                vq->shadow_used_packed[vq->shadow_used_idx].count = count[i];
 570                vq->shadow_aligned_idx += count[i];
 571                vq->shadow_used_idx++;
 572        }
 573}
 574
 575static __rte_always_inline void
 576vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
 577                                   struct vhost_virtqueue *vq,
 578                                   uint32_t *len,
 579                                   uint16_t *id,
 580                                   uint16_t *count,
 581                                   uint16_t num_buffers)
 582{
 583        vhost_shadow_enqueue_packed(vq, len, id, count, num_buffers);
 584
 585        if (vq->shadow_aligned_idx >= PACKED_BATCH_SIZE) {
 586                do_data_copy_enqueue(dev, vq);
 587                vhost_flush_enqueue_shadow_packed(dev, vq);
 588        }
 589}
 590
  591/* skip the write when the value is already equal, to lessen cache issues */
 592#define ASSIGN_UNLESS_EQUAL(var, val) do {      \
 593        if ((var) != (val))                     \
 594                (var) = (val);                  \
 595} while (0)
 596
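     /*
      * Translate the mbuf offload flags into the virtio-net header read by
      * the guest (L4 checksum, GSO), computing the IPv4 header checksum
      * here when it was requested as an offload.
      */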
 597static __rte_always_inline void
 598virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
 599{
 600        uint64_t csum_l4 = m_buf->ol_flags & RTE_MBUF_F_TX_L4_MASK;
 601
 602        if (m_buf->ol_flags & RTE_MBUF_F_TX_TCP_SEG)
 603                csum_l4 |= RTE_MBUF_F_TX_TCP_CKSUM;
 604
 605        if (csum_l4) {
 606                /*
 607                 * Pseudo-header checksum must be set as per Virtio spec.
 608                 *
 609                 * Note: We don't propagate rte_net_intel_cksum_prepare()
 610                 * errors, as it would have an impact on performance, and an
 611                 * error would mean the packet is dropped by the guest instead
 612                 * of being dropped here.
 613                 */
 614                rte_net_intel_cksum_prepare(m_buf);
 615
 616                net_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
 617                net_hdr->csum_start = m_buf->l2_len + m_buf->l3_len;
 618
 619                switch (csum_l4) {
 620                case RTE_MBUF_F_TX_TCP_CKSUM:
 621                        net_hdr->csum_offset = (offsetof(struct rte_tcp_hdr,
 622                                                cksum));
 623                        break;
 624                case RTE_MBUF_F_TX_UDP_CKSUM:
 625                        net_hdr->csum_offset = (offsetof(struct rte_udp_hdr,
 626                                                dgram_cksum));
 627                        break;
 628                case RTE_MBUF_F_TX_SCTP_CKSUM:
 629                        net_hdr->csum_offset = (offsetof(struct rte_sctp_hdr,
 630                                                cksum));
 631                        break;
 632                }
 633        } else {
 634                ASSIGN_UNLESS_EQUAL(net_hdr->csum_start, 0);
 635                ASSIGN_UNLESS_EQUAL(net_hdr->csum_offset, 0);
 636                ASSIGN_UNLESS_EQUAL(net_hdr->flags, 0);
 637        }
 638
  639        /* IP checksum verification by the guest cannot be bypassed, so compute it here */
 640        if (m_buf->ol_flags & RTE_MBUF_F_TX_IP_CKSUM) {
 641                struct rte_ipv4_hdr *ipv4_hdr;
 642
 643                ipv4_hdr = rte_pktmbuf_mtod_offset(m_buf, struct rte_ipv4_hdr *,
 644                                                   m_buf->l2_len);
 645                ipv4_hdr->hdr_checksum = 0;
 646                ipv4_hdr->hdr_checksum = rte_ipv4_cksum(ipv4_hdr);
 647        }
 648
 649        if (m_buf->ol_flags & RTE_MBUF_F_TX_TCP_SEG) {
 650                if (m_buf->ol_flags & RTE_MBUF_F_TX_IPV4)
 651                        net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
 652                else
 653                        net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
 654                net_hdr->gso_size = m_buf->tso_segsz;
 655                net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
 656                                        + m_buf->l4_len;
 657        } else if (m_buf->ol_flags & RTE_MBUF_F_TX_UDP_SEG) {
 658                net_hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP;
 659                net_hdr->gso_size = m_buf->tso_segsz;
 660                net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len +
 661                        m_buf->l4_len;
 662        } else {
 663                ASSIGN_UNLESS_EQUAL(net_hdr->gso_type, 0);
 664                ASSIGN_UNLESS_EQUAL(net_hdr->gso_size, 0);
 665                ASSIGN_UNLESS_EQUAL(net_hdr->hdr_len, 0);
 666        }
 667}
 668
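     /*
      * Map one guest buffer into host virtual address space, appending one
      * buf_vector entry per contiguous chunk returned by the IOVA lookup.
      */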
 669static __rte_always_inline int
 670map_one_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 671                struct buf_vector *buf_vec, uint16_t *vec_idx,
 672                uint64_t desc_iova, uint64_t desc_len, uint8_t perm)
 673{
 674        uint16_t vec_id = *vec_idx;
 675
 676        while (desc_len) {
 677                uint64_t desc_addr;
 678                uint64_t desc_chunck_len = desc_len;
 679
 680                if (unlikely(vec_id >= BUF_VECTOR_MAX))
 681                        return -1;
 682
 683                desc_addr = vhost_iova_to_vva(dev, vq,
 684                                desc_iova,
 685                                &desc_chunck_len,
 686                                perm);
 687                if (unlikely(!desc_addr))
 688                        return -1;
 689
 690                rte_prefetch0((void *)(uintptr_t)desc_addr);
 691
 692                buf_vec[vec_id].buf_iova = desc_iova;
 693                buf_vec[vec_id].buf_addr = desc_addr;
 694                buf_vec[vec_id].buf_len  = desc_chunck_len;
 695
 696                desc_len -= desc_chunck_len;
 697                desc_iova += desc_chunck_len;
 698                vec_id++;
 699        }
 700        *vec_idx = vec_id;
 701
 702        return 0;
 703}
 704
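     /*
      * Walk one split-ring descriptor chain, following an indirect table
      * if present, and collect its buffers into buf_vec along with the
      * chain head index and total length.
      */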
 705static __rte_always_inline int
 706fill_vec_buf_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
 707                         uint32_t avail_idx, uint16_t *vec_idx,
 708                         struct buf_vector *buf_vec, uint16_t *desc_chain_head,
 709                         uint32_t *desc_chain_len, uint8_t perm)
 710{
 711        uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
 712        uint16_t vec_id = *vec_idx;
 713        uint32_t len    = 0;
 714        uint64_t dlen;
 715        uint32_t nr_descs = vq->size;
 716        uint32_t cnt    = 0;
 717        struct vring_desc *descs = vq->desc;
 718        struct vring_desc *idesc = NULL;
 719
 720        if (unlikely(idx >= vq->size))
 721                return -1;
 722
 723        *desc_chain_head = idx;
 724
 725        if (vq->desc[idx].flags & VRING_DESC_F_INDIRECT) {
 726                dlen = vq->desc[idx].len;
 727                nr_descs = dlen / sizeof(struct vring_desc);
 728                if (unlikely(nr_descs > vq->size))
 729                        return -1;
 730
 731                descs = (struct vring_desc *)(uintptr_t)
 732                        vhost_iova_to_vva(dev, vq, vq->desc[idx].addr,
 733                                                &dlen,
 734                                                VHOST_ACCESS_RO);
 735                if (unlikely(!descs))
 736                        return -1;
 737
 738                if (unlikely(dlen < vq->desc[idx].len)) {
  739                        /*
  740                         * The indirect desc table is not contiguous
  741                         * in the process VA space, so we have to copy it.
  742                         */
 743                        idesc = vhost_alloc_copy_ind_table(dev, vq,
 744                                        vq->desc[idx].addr, vq->desc[idx].len);
 745                        if (unlikely(!idesc))
 746                                return -1;
 747
 748                        descs = idesc;
 749                }
 750
 751                idx = 0;
 752        }
 753
 754        while (1) {
 755                if (unlikely(idx >= nr_descs || cnt++ >= nr_descs)) {
 756                        free_ind_table(idesc);
 757                        return -1;
 758                }
 759
 760                dlen = descs[idx].len;
 761                len += dlen;
 762
 763                if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id,
 764                                                descs[idx].addr, dlen,
 765                                                perm))) {
 766                        free_ind_table(idesc);
 767                        return -1;
 768                }
 769
 770                if ((descs[idx].flags & VRING_DESC_F_NEXT) == 0)
 771                        break;
 772
 773                idx = descs[idx].next;
 774        }
 775
 776        *desc_chain_len = len;
 777        *vec_idx = vec_id;
 778
 779        if (unlikely(!!idesc))
 780                free_ind_table(idesc);
 781
 782        return 0;
 783}
 784
  785/*
  786 * Returns -1 on failure, 0 on success.
  787 */
 788static inline int
 789reserve_avail_buf_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
 790                                uint32_t size, struct buf_vector *buf_vec,
 791                                uint16_t *num_buffers, uint16_t avail_head,
 792                                uint16_t *nr_vec)
 793{
 794        uint16_t cur_idx;
 795        uint16_t vec_idx = 0;
 796        uint16_t max_tries, tries = 0;
 797
 798        uint16_t head_idx = 0;
 799        uint32_t len = 0;
 800
 801        *num_buffers = 0;
 802        cur_idx  = vq->last_avail_idx;
 803
 804        if (rxvq_is_mergeable(dev))
 805                max_tries = vq->size - 1;
 806        else
 807                max_tries = 1;
 808
 809        while (size > 0) {
 810                if (unlikely(cur_idx == avail_head))
 811                        return -1;
  812                /*
  813                 * If we have tried all available ring items and still
  814                 * cannot get enough buffers, something abnormal has
  815                 * happened.
  816                 */
 817                if (unlikely(++tries > max_tries))
 818                        return -1;
 819
 820                if (unlikely(fill_vec_buf_split(dev, vq, cur_idx,
 821                                                &vec_idx, buf_vec,
 822                                                &head_idx, &len,
 823                                                VHOST_ACCESS_RW) < 0))
 824                        return -1;
 825                len = RTE_MIN(len, size);
 826                update_shadow_used_ring_split(vq, head_idx, len);
 827                size -= len;
 828
 829                cur_idx++;
 830                *num_buffers += 1;
 831        }
 832
 833        *nr_vec = vec_idx;
 834
 835        return 0;
 836}
 837
 838static __rte_always_inline int
 839fill_vec_buf_packed_indirect(struct virtio_net *dev,
 840                        struct vhost_virtqueue *vq,
 841                        struct vring_packed_desc *desc, uint16_t *vec_idx,
 842                        struct buf_vector *buf_vec, uint32_t *len, uint8_t perm)
 843{
 844        uint16_t i;
 845        uint32_t nr_descs;
 846        uint16_t vec_id = *vec_idx;
 847        uint64_t dlen;
 848        struct vring_packed_desc *descs, *idescs = NULL;
 849
 850        dlen = desc->len;
 851        descs = (struct vring_packed_desc *)(uintptr_t)
 852                vhost_iova_to_vva(dev, vq, desc->addr, &dlen, VHOST_ACCESS_RO);
 853        if (unlikely(!descs))
 854                return -1;
 855
 856        if (unlikely(dlen < desc->len)) {
  857                /*
  858                 * The indirect desc table is not contiguous
  859                 * in the process VA space, so we have to copy it.
  860                 */
 861                idescs = vhost_alloc_copy_ind_table(dev,
 862                                vq, desc->addr, desc->len);
 863                if (unlikely(!idescs))
 864                        return -1;
 865
 866                descs = idescs;
 867        }
 868
  869        nr_descs = desc->len / sizeof(struct vring_packed_desc);
 870        if (unlikely(nr_descs >= vq->size)) {
 871                free_ind_table(idescs);
 872                return -1;
 873        }
 874
 875        for (i = 0; i < nr_descs; i++) {
 876                if (unlikely(vec_id >= BUF_VECTOR_MAX)) {
 877                        free_ind_table(idescs);
 878                        return -1;
 879                }
 880
 881                dlen = descs[i].len;
 882                *len += dlen;
 883                if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id,
 884                                                descs[i].addr, dlen,
 885                                                perm)))
 886                        return -1;
 887        }
 888        *vec_idx = vec_id;
 889
 890        if (unlikely(!!idescs))
 891                free_ind_table(idescs);
 892
 893        return 0;
 894}
 895
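     /*
      * Collect the buffers of one packed-ring descriptor chain into
      * buf_vec, returning the buffer id, total length and the number of
      * descriptors consumed.
      */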
 896static __rte_always_inline int
 897fill_vec_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 898                                uint16_t avail_idx, uint16_t *desc_count,
 899                                struct buf_vector *buf_vec, uint16_t *vec_idx,
 900                                uint16_t *buf_id, uint32_t *len, uint8_t perm)
 901{
 902        bool wrap_counter = vq->avail_wrap_counter;
 903        struct vring_packed_desc *descs = vq->desc_packed;
 904        uint16_t vec_id = *vec_idx;
 905        uint64_t dlen;
 906
 907        if (avail_idx < vq->last_avail_idx)
 908                wrap_counter ^= 1;
 909
 910        /*
 911         * Perform a load-acquire barrier in desc_is_avail to
 912         * enforce the ordering between desc flags and desc
 913         * content.
 914         */
 915        if (unlikely(!desc_is_avail(&descs[avail_idx], wrap_counter)))
 916                return -1;
 917
 918        *desc_count = 0;
 919        *len = 0;
 920
 921        while (1) {
 922                if (unlikely(vec_id >= BUF_VECTOR_MAX))
 923                        return -1;
 924
 925                if (unlikely(*desc_count >= vq->size))
 926                        return -1;
 927
 928                *desc_count += 1;
 929                *buf_id = descs[avail_idx].id;
 930
 931                if (descs[avail_idx].flags & VRING_DESC_F_INDIRECT) {
 932                        if (unlikely(fill_vec_buf_packed_indirect(dev, vq,
 933                                                        &descs[avail_idx],
 934                                                        &vec_id, buf_vec,
 935                                                        len, perm) < 0))
 936                                return -1;
 937                } else {
 938                        dlen = descs[avail_idx].len;
 939                        *len += dlen;
 940
 941                        if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id,
 942                                                        descs[avail_idx].addr,
 943                                                        dlen,
 944                                                        perm)))
 945                                return -1;
 946                }
 947
 948                if ((descs[avail_idx].flags & VRING_DESC_F_NEXT) == 0)
 949                        break;
 950
 951                if (++avail_idx >= vq->size) {
 952                        avail_idx -= vq->size;
 953                        wrap_counter ^= 1;
 954                }
 955        }
 956
 957        *vec_idx = vec_id;
 958
 959        return 0;
 960}
 961
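     /*
      * Copy the virtio-net header into a descriptor buffer that is split
      * across several buf_vec entries, logging each written chunk.
      */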
 962static __rte_noinline void
 963copy_vnet_hdr_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 964                struct buf_vector *buf_vec,
 965                struct virtio_net_hdr_mrg_rxbuf *hdr)
 966{
 967        uint64_t len;
 968        uint64_t remain = dev->vhost_hlen;
 969        uint64_t src = (uint64_t)(uintptr_t)hdr, dst;
 970        uint64_t iova = buf_vec->buf_iova;
 971
 972        while (remain) {
 973                len = RTE_MIN(remain,
 974                                buf_vec->buf_len);
 975                dst = buf_vec->buf_addr;
 976                rte_memcpy((void *)(uintptr_t)dst,
 977                                (void *)(uintptr_t)src,
 978                                len);
 979
 980                PRINT_PACKET(dev, (uintptr_t)dst,
 981                                (uint32_t)len, 0);
 982                vhost_log_cache_write_iova(dev, vq,
 983                                iova, len);
 984
 985                remain -= len;
 986                iova += len;
 987                src += len;
 988                buf_vec++;
 989        }
 990}
 991
 992static __rte_always_inline int
 993async_iter_initialize(struct virtio_net *dev, struct vhost_async *async)
 994{
 995        struct vhost_iov_iter *iter;
 996
 997        if (unlikely(async->iovec_idx >= VHOST_MAX_ASYNC_VEC)) {
 998                VHOST_LOG_DATA(dev->ifname, ERR, "no more async iovec available\n");
 999                return -1;
1000        }
1001
1002        iter = async->iov_iter + async->iter_idx;
1003        iter->iov = async->iovec + async->iovec_idx;
1004        iter->nr_segs = 0;
1005
1006        return 0;
1007}
1008
1009static __rte_always_inline int
1010async_iter_add_iovec(struct virtio_net *dev, struct vhost_async *async,
1011                void *src, void *dst, size_t len)
1012{
1013        struct vhost_iov_iter *iter;
1014        struct vhost_iovec *iovec;
1015
1016        if (unlikely(async->iovec_idx >= VHOST_MAX_ASYNC_VEC)) {
1017                static bool vhost_max_async_vec_log;
1018
1019                if (!vhost_max_async_vec_log) {
1020                        VHOST_LOG_DATA(dev->ifname, ERR, "no more async iovec available\n");
1021                        vhost_max_async_vec_log = true;
1022                }
1023
1024                return -1;
1025        }
1026
1027        iter = async->iov_iter + async->iter_idx;
1028        iovec = async->iovec + async->iovec_idx;
1029
1030        iovec->src_addr = src;
1031        iovec->dst_addr = dst;
1032        iovec->len = len;
1033
1034        iter->nr_segs++;
1035        async->iovec_idx++;
1036
1037        return 0;
1038}
1039
1040static __rte_always_inline void
1041async_iter_finalize(struct vhost_async *async)
1042{
1043        async->iter_idx++;
1044}
1045
1046static __rte_always_inline void
1047async_iter_cancel(struct vhost_async *async)
1048{
1049        struct vhost_iov_iter *iter;
1050
1051        iter = async->iov_iter + async->iter_idx;
1052        async->iovec_idx -= iter->nr_segs;
1053        iter->nr_segs = 0;
1054        iter->iov = NULL;
1055}
1056
1057static __rte_always_inline void
1058async_iter_reset(struct vhost_async *async)
1059{
1060        async->iter_idx = 0;
1061        async->iovec_idx = 0;
1062}
1063
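     /*
      * Build the async scatter-gather entries for one copy segment,
      * splitting it wherever the guest buffer is not contiguous in host
      * physical memory; the copy direction depends on to_desc.
      */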
1064static __rte_always_inline int
1065async_fill_seg(struct virtio_net *dev, struct vhost_virtqueue *vq,
1066                struct rte_mbuf *m, uint32_t mbuf_offset,
1067                uint64_t buf_iova, uint32_t cpy_len, bool to_desc)
1068{
1069        struct vhost_async *async = vq->async;
1070        uint64_t mapped_len;
1071        uint32_t buf_offset = 0;
1072        void *src, *dst;
1073        void *host_iova;
1074
1075        while (cpy_len) {
1076                host_iova = (void *)(uintptr_t)gpa_to_first_hpa(dev,
1077                                buf_iova + buf_offset, cpy_len, &mapped_len);
1078                if (unlikely(!host_iova)) {
1079                        VHOST_LOG_DATA(dev->ifname, ERR,
1080                                "%s: failed to get host iova.\n",
1081                                __func__);
1082                        return -1;
1083                }
1084
1085                if (to_desc) {
1086                        src = (void *)(uintptr_t)rte_pktmbuf_iova_offset(m, mbuf_offset);
1087                        dst = host_iova;
1088                } else {
1089                        src = host_iova;
1090                        dst = (void *)(uintptr_t)rte_pktmbuf_iova_offset(m, mbuf_offset);
1091                }
1092
1093                if (unlikely(async_iter_add_iovec(dev, async, src, dst, (size_t)mapped_len)))
1094                        return -1;
1095
1096                cpy_len -= (uint32_t)mapped_len;
1097                mbuf_offset += (uint32_t)mapped_len;
1098                buf_offset += (uint32_t)mapped_len;
1099        }
1100
1101        return 0;
1102}
1103
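     /*
      * CPU copy of one segment between mbuf and descriptor buffer: large
      * copies are performed immediately, small ones are batched for a
      * later flush.
      */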
1104static __rte_always_inline void
1105sync_fill_seg(struct virtio_net *dev, struct vhost_virtqueue *vq,
1106                struct rte_mbuf *m, uint32_t mbuf_offset,
1107                uint64_t buf_addr, uint64_t buf_iova, uint32_t cpy_len, bool to_desc)
1108{
1109        struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
1110
1111        if (likely(cpy_len > MAX_BATCH_LEN || vq->batch_copy_nb_elems >= vq->size)) {
1112                if (to_desc) {
1113                        rte_memcpy((void *)((uintptr_t)(buf_addr)),
1114                                rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
1115                                cpy_len);
1116                        vhost_log_cache_write_iova(dev, vq, buf_iova, cpy_len);
1117                        PRINT_PACKET(dev, (uintptr_t)(buf_addr), cpy_len, 0);
1118                } else {
1119                        rte_memcpy(rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
1120                                (void *)((uintptr_t)(buf_addr)),
1121                                cpy_len);
1122                }
1123        } else {
1124                if (to_desc) {
1125                        batch_copy[vq->batch_copy_nb_elems].dst =
1126                                (void *)((uintptr_t)(buf_addr));
1127                        batch_copy[vq->batch_copy_nb_elems].src =
1128                                rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
1129                        batch_copy[vq->batch_copy_nb_elems].log_addr = buf_iova;
1130                } else {
1131                        batch_copy[vq->batch_copy_nb_elems].dst =
1132                                rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
1133                        batch_copy[vq->batch_copy_nb_elems].src =
1134                                (void *)((uintptr_t)(buf_addr));
1135                }
1136                batch_copy[vq->batch_copy_nb_elems].len = cpy_len;
1137                vq->batch_copy_nb_elems++;
1138        }
1139}
1140
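     /*
      * Copy one mbuf chain into the guest buffers described by buf_vec,
      * filling in the virtio-net header, either synchronously or by
      * preparing async copy descriptors.
      */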
1141static __rte_always_inline int
1142mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
1143                struct rte_mbuf *m, struct buf_vector *buf_vec,
1144                uint16_t nr_vec, uint16_t num_buffers, bool is_async)
1145{
1146        uint32_t vec_idx = 0;
1147        uint32_t mbuf_offset, mbuf_avail;
1148        uint32_t buf_offset, buf_avail;
1149        uint64_t buf_addr, buf_iova, buf_len;
1150        uint32_t cpy_len;
1151        uint64_t hdr_addr;
1152        struct rte_mbuf *hdr_mbuf;
1153        struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL;
1154        struct vhost_async *async = vq->async;
1155
1156        if (unlikely(m == NULL))
1157                return -1;
1158
1159        buf_addr = buf_vec[vec_idx].buf_addr;
1160        buf_iova = buf_vec[vec_idx].buf_iova;
1161        buf_len = buf_vec[vec_idx].buf_len;
1162
1163        if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1))
1164                return -1;
1165
1166        hdr_mbuf = m;
1167        hdr_addr = buf_addr;
1168        if (unlikely(buf_len < dev->vhost_hlen)) {
1169                memset(&tmp_hdr, 0, sizeof(struct virtio_net_hdr_mrg_rxbuf));
1170                hdr = &tmp_hdr;
1171        } else
1172                hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)hdr_addr;
1173
1174        VHOST_LOG_DATA(dev->ifname, DEBUG, "RX: num merge buffers %d\n", num_buffers);
1175
1176        if (unlikely(buf_len < dev->vhost_hlen)) {
1177                buf_offset = dev->vhost_hlen - buf_len;
1178                vec_idx++;
1179                buf_addr = buf_vec[vec_idx].buf_addr;
1180                buf_iova = buf_vec[vec_idx].buf_iova;
1181                buf_len = buf_vec[vec_idx].buf_len;
1182                buf_avail = buf_len - buf_offset;
1183        } else {
1184                buf_offset = dev->vhost_hlen;
1185                buf_avail = buf_len - dev->vhost_hlen;
1186        }
1187
1188        mbuf_avail  = rte_pktmbuf_data_len(m);
1189        mbuf_offset = 0;
1190
1191        if (is_async) {
1192                if (async_iter_initialize(dev, async))
1193                        return -1;
1194        }
1195
1196        while (mbuf_avail != 0 || m->next != NULL) {
1197                /* done with current buf, get the next one */
1198                if (buf_avail == 0) {
1199                        vec_idx++;
1200                        if (unlikely(vec_idx >= nr_vec))
1201                                goto error;
1202
1203                        buf_addr = buf_vec[vec_idx].buf_addr;
1204                        buf_iova = buf_vec[vec_idx].buf_iova;
1205                        buf_len = buf_vec[vec_idx].buf_len;
1206
1207                        buf_offset = 0;
1208                        buf_avail  = buf_len;
1209                }
1210
1211                /* done with current mbuf, get the next one */
1212                if (mbuf_avail == 0) {
1213                        m = m->next;
1214
1215                        mbuf_offset = 0;
1216                        mbuf_avail  = rte_pktmbuf_data_len(m);
1217                }
1218
1219                if (hdr_addr) {
1220                        virtio_enqueue_offload(hdr_mbuf, &hdr->hdr);
1221                        if (rxvq_is_mergeable(dev))
1222                                ASSIGN_UNLESS_EQUAL(hdr->num_buffers,
1223                                                num_buffers);
1224
1225                        if (unlikely(hdr == &tmp_hdr)) {
1226                                copy_vnet_hdr_to_desc(dev, vq, buf_vec, hdr);
1227                        } else {
1228                                PRINT_PACKET(dev, (uintptr_t)hdr_addr,
1229                                                dev->vhost_hlen, 0);
1230                                vhost_log_cache_write_iova(dev, vq,
1231                                                buf_vec[0].buf_iova,
1232                                                dev->vhost_hlen);
1233                        }
1234
1235                        hdr_addr = 0;
1236                }
1237
1238                cpy_len = RTE_MIN(buf_avail, mbuf_avail);
1239
1240                if (is_async) {
1241                        if (async_fill_seg(dev, vq, m, mbuf_offset,
1242                                           buf_iova + buf_offset, cpy_len, true) < 0)
1243                                goto error;
1244                } else {
1245                        sync_fill_seg(dev, vq, m, mbuf_offset,
1246                                      buf_addr + buf_offset,
1247                                      buf_iova + buf_offset, cpy_len, true);
1248                }
1249
1250                mbuf_avail  -= cpy_len;
1251                mbuf_offset += cpy_len;
1252                buf_avail  -= cpy_len;
1253                buf_offset += cpy_len;
1254        }
1255
1256        if (is_async)
1257                async_iter_finalize(async);
1258
1259        return 0;
1260error:
1261        if (is_async)
1262                async_iter_cancel(async);
1263
1264        return -1;
1265}
1266
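     /*
      * Reserve enough packed-ring descriptors for one packet, copy it in
      * and record the used entries in the shadow ring.
      */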
1267static __rte_always_inline int
1268vhost_enqueue_single_packed(struct virtio_net *dev,
1269                            struct vhost_virtqueue *vq,
1270                            struct rte_mbuf *pkt,
1271                            struct buf_vector *buf_vec,
1272                            uint16_t *nr_descs)
1273{
1274        uint16_t nr_vec = 0;
1275        uint16_t avail_idx = vq->last_avail_idx;
1276        uint16_t max_tries, tries = 0;
1277        uint16_t buf_id = 0;
1278        uint32_t len = 0;
1279        uint16_t desc_count;
1280        uint32_t size = pkt->pkt_len + sizeof(struct virtio_net_hdr_mrg_rxbuf);
1281        uint16_t num_buffers = 0;
1282        uint32_t buffer_len[vq->size];
1283        uint16_t buffer_buf_id[vq->size];
1284        uint16_t buffer_desc_count[vq->size];
1285
1286        if (rxvq_is_mergeable(dev))
1287                max_tries = vq->size - 1;
1288        else
1289                max_tries = 1;
1290
1291        while (size > 0) {
 1292                /*
 1293                 * If we have tried all available ring items and still
 1294                 * cannot get enough buffers, something abnormal has
 1295                 * happened.
 1296                 */
1297                if (unlikely(++tries > max_tries))
1298                        return -1;
1299
1300                if (unlikely(fill_vec_buf_packed(dev, vq,
1301                                                avail_idx, &desc_count,
1302                                                buf_vec, &nr_vec,
1303                                                &buf_id, &len,
1304                                                VHOST_ACCESS_RW) < 0))
1305                        return -1;
1306
1307                len = RTE_MIN(len, size);
1308                size -= len;
1309
1310                buffer_len[num_buffers] = len;
1311                buffer_buf_id[num_buffers] = buf_id;
1312                buffer_desc_count[num_buffers] = desc_count;
1313                num_buffers += 1;
1314
1315                *nr_descs += desc_count;
1316                avail_idx += desc_count;
1317                if (avail_idx >= vq->size)
1318                        avail_idx -= vq->size;
1319        }
1320
1321        if (mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, num_buffers, false) < 0)
1322                return -1;
1323
1324        vhost_shadow_enqueue_single_packed(dev, vq, buffer_len, buffer_buf_id,
1325                                           buffer_desc_count, num_buffers);
1326
1327        return 0;
1328}
1329
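     /*
      * Enqueue a burst of packets on a split virtqueue using the
      * synchronous (CPU copy) path. Returns the number of packets
      * enqueued.
      */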
1330static __rte_noinline uint32_t
1331virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
1332        struct rte_mbuf **pkts, uint32_t count)
1333{
1334        uint32_t pkt_idx = 0;
1335        uint16_t num_buffers;
1336        struct buf_vector buf_vec[BUF_VECTOR_MAX];
1337        uint16_t avail_head;
1338
1339        /*
1340         * The ordering between avail index and
1341         * desc reads needs to be enforced.
1342         */
1343        avail_head = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE);
1344
1345        rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
1346
1347        for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
1348                uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
1349                uint16_t nr_vec = 0;
1350
1351                if (unlikely(reserve_avail_buf_split(dev, vq,
1352                                                pkt_len, buf_vec, &num_buffers,
1353                                                avail_head, &nr_vec) < 0)) {
1354                        VHOST_LOG_DATA(dev->ifname, DEBUG,
1355                                "failed to get enough desc from vring\n");
1356                        vq->shadow_used_idx -= num_buffers;
1357                        break;
1358                }
1359
1360                VHOST_LOG_DATA(dev->ifname, DEBUG,
1361                        "current index %d | end index %d\n",
1362                        vq->last_avail_idx, vq->last_avail_idx + num_buffers);
1363
1364                if (mbuf_to_desc(dev, vq, pkts[pkt_idx], buf_vec, nr_vec,
1365                                        num_buffers, false) < 0) {
1366                        vq->shadow_used_idx -= num_buffers;
1367                        break;
1368                }
1369
1370                vq->last_avail_idx += num_buffers;
1371        }
1372
1373        do_data_copy_enqueue(dev, vq);
1374
1375        if (likely(vq->shadow_used_idx)) {
1376                flush_shadow_used_ring_split(dev, vq);
1377                vhost_vring_call_split(dev, vq);
1378        }
1379
1380        return pkt_idx;
1381}
1382
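     /*
      * Check whether a full batch of single-segment packets fits into the
      * available descriptors at the current position of the packed ring,
      * translating the descriptor addresses on success.
      */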
1383static __rte_always_inline int
1384virtio_dev_rx_sync_batch_check(struct virtio_net *dev,
1385                           struct vhost_virtqueue *vq,
1386                           struct rte_mbuf **pkts,
1387                           uint64_t *desc_addrs,
1388                           uint64_t *lens)
1389{
1390        bool wrap_counter = vq->avail_wrap_counter;
1391        struct vring_packed_desc *descs = vq->desc_packed;
1392        uint16_t avail_idx = vq->last_avail_idx;
1393        uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
1394        uint16_t i;
1395
1396        if (unlikely(avail_idx & PACKED_BATCH_MASK))
1397                return -1;
1398
1399        if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size))
1400                return -1;
1401
1402        vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1403                if (unlikely(pkts[i]->next != NULL))
1404                        return -1;
1405                if (unlikely(!desc_is_avail(&descs[avail_idx + i],
1406                                            wrap_counter)))
1407                        return -1;
1408        }
1409
1410        vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1411                lens[i] = descs[avail_idx + i].len;
1412
1413        vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1414                if (unlikely(pkts[i]->pkt_len > (lens[i] - buf_offset)))
1415                        return -1;
1416        }
1417
1418        vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1419                desc_addrs[i] = vhost_iova_to_vva(dev, vq,
1420                                                  descs[avail_idx + i].addr,
1421                                                  &lens[i],
1422                                                  VHOST_ACCESS_RW);
1423
1424        vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1425                if (unlikely(!desc_addrs[i]))
1426                        return -1;
1427                if (unlikely(lens[i] != descs[avail_idx + i].len))
1428                        return -1;
1429        }
1430
1431        return 0;
1432}
1433
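     /*
      * Copy a full batch of packets into the already validated
      * descriptors, fill their virtio-net headers and flush the used
      * entries to the ring.
      */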
1434static __rte_always_inline void
1435virtio_dev_rx_batch_packed_copy(struct virtio_net *dev,
1436                           struct vhost_virtqueue *vq,
1437                           struct rte_mbuf **pkts,
1438                           uint64_t *desc_addrs,
1439                           uint64_t *lens)
1440{
1441        uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
1442        struct virtio_net_hdr_mrg_rxbuf *hdrs[PACKED_BATCH_SIZE];
1443        struct vring_packed_desc *descs = vq->desc_packed;
1444        uint16_t avail_idx = vq->last_avail_idx;
1445        uint16_t ids[PACKED_BATCH_SIZE];
1446        uint16_t i;
1447
1448        vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1449                rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
1450                hdrs[i] = (struct virtio_net_hdr_mrg_rxbuf *)
1451                                        (uintptr_t)desc_addrs[i];
1452                lens[i] = pkts[i]->pkt_len +
1453                        sizeof(struct virtio_net_hdr_mrg_rxbuf);
1454        }
1455
1456        vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1457                virtio_enqueue_offload(pkts[i], &hdrs[i]->hdr);
1458
1459        vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE);
1460
1461        vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1462                rte_memcpy((void *)(uintptr_t)(desc_addrs[i] + buf_offset),
1463                           rte_pktmbuf_mtod_offset(pkts[i], void *, 0),
1464                           pkts[i]->pkt_len);
1465        }
1466
1467        vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1468                vhost_log_cache_write_iova(dev, vq, descs[avail_idx + i].addr,
1469                                           lens[i]);
1470
1471        vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1472                ids[i] = descs[avail_idx + i].id;
1473
1474        vhost_flush_enqueue_batch_packed(dev, vq, lens, ids);
1475}
1476
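    /*
     * Try the synchronous batched enqueue path. Pending shadowed used
     * entries are flushed first so used ring updates stay ordered.
     * Returns -1 when the batch check fails, letting the caller fall
     * back to the single-packet path.
     */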
1477static __rte_always_inline int
1478virtio_dev_rx_sync_batch_packed(struct virtio_net *dev,
1479                           struct vhost_virtqueue *vq,
1480                           struct rte_mbuf **pkts)
1481{
1482        uint64_t desc_addrs[PACKED_BATCH_SIZE];
1483        uint64_t lens[PACKED_BATCH_SIZE];
1484
1485        if (virtio_dev_rx_sync_batch_check(dev, vq, pkts, desc_addrs, lens) == -1)
1486                return -1;
1487
1488        if (vq->shadow_used_idx) {
1489                do_data_copy_enqueue(dev, vq);
1490                vhost_flush_enqueue_shadow_packed(dev, vq);
1491        }
1492
1493        virtio_dev_rx_batch_packed_copy(dev, vq, pkts, desc_addrs, lens);
1494
1495        return 0;
1496}
1497
1498static __rte_always_inline int16_t
1499virtio_dev_rx_single_packed(struct virtio_net *dev,
1500                            struct vhost_virtqueue *vq,
1501                            struct rte_mbuf *pkt)
1502{
1503        struct buf_vector buf_vec[BUF_VECTOR_MAX];
1504        uint16_t nr_descs = 0;
1505
1506        if (unlikely(vhost_enqueue_single_packed(dev, vq, pkt, buf_vec,
1507                                                 &nr_descs) < 0)) {
1508                VHOST_LOG_DATA(dev->ifname, DEBUG, "failed to get enough desc from vring\n");
1509                return -1;
1510        }
1511
1512        VHOST_LOG_DATA(dev->ifname, DEBUG,
1513                "current index %d | end index %d\n",
1514                vq->last_avail_idx, vq->last_avail_idx + nr_descs);
1515
1516        vq_inc_last_avail_packed(vq, nr_descs);
1517
1518        return 0;
1519}
1520
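    /*
     * Packed ring enqueue: process PACKED_BATCH_SIZE packets at a time
     * when possible and fall back to the single-packet path otherwise.
     * Shadowed used entries are flushed and the guest is notified once
     * at the end of the burst.
     */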
1521static __rte_noinline uint32_t
1522virtio_dev_rx_packed(struct virtio_net *dev,
1523                     struct vhost_virtqueue *__rte_restrict vq,
1524                     struct rte_mbuf **__rte_restrict pkts,
1525                     uint32_t count)
1526{
1527        uint32_t pkt_idx = 0;
1528
1529        do {
1530                rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
1531
1532                if (count - pkt_idx >= PACKED_BATCH_SIZE) {
1533                        if (!virtio_dev_rx_sync_batch_packed(dev, vq,
1534                                                        &pkts[pkt_idx])) {
1535                                pkt_idx += PACKED_BATCH_SIZE;
1536                                continue;
1537                        }
1538                }
1539
1540                if (virtio_dev_rx_single_packed(dev, vq, pkts[pkt_idx]))
1541                        break;
1542                pkt_idx++;
1543
1544        } while (pkt_idx < count);
1545
1546        if (vq->shadow_used_idx) {
1547                do_data_copy_enqueue(dev, vq);
1548                vhost_flush_enqueue_shadow_packed(dev, vq);
1549        }
1550
1551        if (pkt_idx)
1552                vhost_vring_call_packed(dev, vq);
1553
1554        return pkt_idx;
1555}
1556
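    /*
     * Enqueue entry point: validate the virtqueue, take the access lock
     * (plus the IOTLB read lock when VIRTIO_F_IOMMU_PLATFORM is
     * negotiated), then dispatch to the packed or split implementation.
     */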
1557static __rte_always_inline uint32_t
1558virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
1559        struct rte_mbuf **pkts, uint32_t count)
1560{
1561        struct vhost_virtqueue *vq;
1562        uint32_t nb_tx = 0;
1563
1564        VHOST_LOG_DATA(dev->ifname, DEBUG, "%s\n", __func__);
1565        if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
1566                VHOST_LOG_DATA(dev->ifname, ERR,
1567                        "%s: invalid virtqueue idx %d.\n",
1568                        __func__, queue_id);
1569                return 0;
1570        }
1571
1572        vq = dev->virtqueue[queue_id];
1573
1574        rte_spinlock_lock(&vq->access_lock);
1575
1576        if (unlikely(!vq->enabled))
1577                goto out_access_unlock;
1578
1579        if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
1580                vhost_user_iotlb_rd_lock(vq);
1581
1582        if (unlikely(!vq->access_ok))
1583                if (unlikely(vring_translate(dev, vq) < 0))
1584                        goto out;
1585
1586        count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
1587        if (count == 0)
1588                goto out;
1589
1590        if (vq_is_packed(dev))
1591                nb_tx = virtio_dev_rx_packed(dev, vq, pkts, count);
1592        else
1593                nb_tx = virtio_dev_rx_split(dev, vq, pkts, count);
1594
1595        vhost_queue_stats_update(dev, vq, pkts, nb_tx);
1596
1597out:
1598        if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
1599                vhost_user_iotlb_rd_unlock(vq);
1600
1601out_access_unlock:
1602        rte_spinlock_unlock(&vq->access_lock);
1603
1604        return nb_tx;
1605}
1606
1607uint16_t
1608rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
1609        struct rte_mbuf **__rte_restrict pkts, uint16_t count)
1610{
1611        struct virtio_net *dev = get_device(vid);
1612
1613        if (!dev)
1614                return 0;
1615
1616        if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
1617                VHOST_LOG_DATA(dev->ifname, ERR,
1618                        "%s: built-in vhost net backend is disabled.\n",
1619                        __func__);
1620                return 0;
1621        }
1622
1623        return virtio_dev_rx(dev, queue_id, pkts, count);
1624}
1625
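    /*
     * Return the slot of the oldest in-flight packet in the async
     * tracking array: (pkts_idx - pkts_inflight_n) modulo the
     * virtqueue size.
     */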
1626static __rte_always_inline uint16_t
1627async_get_first_inflight_pkt_idx(struct vhost_virtqueue *vq)
1628{
1629        struct vhost_async *async = vq->async;
1630
1631        if (async->pkts_idx >= async->pkts_inflight_n)
1632                return async->pkts_idx - async->pkts_inflight_n;
1633        else
1634                return vq->size - async->pkts_inflight_n + async->pkts_idx;
1635}
1636
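    /*
     * Copy 'count' shadow used elements into the async descriptor ring
     * at d_idx, splitting the copy in two when the destination wraps.
     */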
1637static __rte_always_inline void
1638store_dma_desc_info_split(struct vring_used_elem *s_ring, struct vring_used_elem *d_ring,
1639                uint16_t ring_size, uint16_t s_idx, uint16_t d_idx, uint16_t count)
1640{
1641        size_t elem_size = sizeof(struct vring_used_elem);
1642
1643        if (d_idx + count <= ring_size) {
1644                rte_memcpy(d_ring + d_idx, s_ring + s_idx, count * elem_size);
1645        } else {
1646                uint16_t size = ring_size - d_idx;
1647
1648                rte_memcpy(d_ring + d_idx, s_ring + s_idx, size * elem_size);
1649                rte_memcpy(d_ring, s_ring + s_idx + size, (count - size) * elem_size);
1650        }
1651}
1652
1653static __rte_always_inline void
1654store_dma_desc_info_packed(struct vring_used_elem_packed *s_ring,
1655                struct vring_used_elem_packed *d_ring,
1656                uint16_t ring_size, uint16_t s_idx, uint16_t d_idx, uint16_t count)
1657{
1658        size_t elem_size = sizeof(struct vring_used_elem_packed);
1659
1660        if (d_idx + count <= ring_size) {
1661                rte_memcpy(d_ring + d_idx, s_ring + s_idx, count * elem_size);
1662        } else {
1663                uint16_t size = ring_size - d_idx;
1664
1665                rte_memcpy(d_ring + d_idx, s_ring + s_idx, size * elem_size);
1666                rte_memcpy(d_ring, s_ring + s_idx + size, (count - size) * elem_size);
1667        }
1668}
1669
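    /*
     * Asynchronous enqueue on a split ring: reserve guest buffers for
     * each packet, build the DMA copy iterators, submit the copies to
     * the DMA vChannel, and roll back the shadowed used entries and
     * avail index for packets that could not be submitted. Used ring
     * updates are deferred until the copies are reported complete.
     */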
1670static __rte_noinline uint32_t
1671virtio_dev_rx_async_submit_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
1672                uint16_t queue_id, struct rte_mbuf **pkts, uint32_t count,
1673                int16_t dma_id, uint16_t vchan_id)
1674{
1675        struct buf_vector buf_vec[BUF_VECTOR_MAX];
1676        uint32_t pkt_idx = 0;
1677        uint16_t num_buffers;
1678        uint16_t avail_head;
1679
1680        struct vhost_async *async = vq->async;
1681        struct async_inflight_info *pkts_info = async->pkts_info;
1682        uint32_t pkt_err = 0;
1683        uint16_t n_xfer;
1684        uint16_t slot_idx = 0;
1685
1686        /*
1687         * The ordering between avail index and desc reads needs to be enforced.
1688         */
1689        avail_head = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE);
1690
1691        rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
1692
1693        async_iter_reset(async);
1694
1695        for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
1696                uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
1697                uint16_t nr_vec = 0;
1698
1699                if (unlikely(reserve_avail_buf_split(dev, vq, pkt_len, buf_vec,
1700                                                &num_buffers, avail_head, &nr_vec) < 0)) {
1701                        VHOST_LOG_DATA(dev->ifname, DEBUG,
1702                                "failed to get enough desc from vring\n");
1703                        vq->shadow_used_idx -= num_buffers;
1704                        break;
1705                }
1706
1707                VHOST_LOG_DATA(dev->ifname, DEBUG,
1708                        "current index %d | end index %d\n",
1709                        vq->last_avail_idx, vq->last_avail_idx + num_buffers);
1710
1711                if (mbuf_to_desc(dev, vq, pkts[pkt_idx], buf_vec, nr_vec, num_buffers, true) < 0) {
1712                        vq->shadow_used_idx -= num_buffers;
1713                        break;
1714                }
1715
1716                slot_idx = (async->pkts_idx + pkt_idx) & (vq->size - 1);
1717                pkts_info[slot_idx].descs = num_buffers;
1718                pkts_info[slot_idx].mbuf = pkts[pkt_idx];
1719
1720                vq->last_avail_idx += num_buffers;
1721        }
1722
1723        if (unlikely(pkt_idx == 0))
1724                return 0;
1725
1726        n_xfer = vhost_async_dma_transfer(dev, vq, dma_id, vchan_id, async->pkts_idx,
1727                        async->iov_iter, pkt_idx);
1728
1729        pkt_err = pkt_idx - n_xfer;
1730        if (unlikely(pkt_err)) {
1731                uint16_t num_descs = 0;
1732
1733                VHOST_LOG_DATA(dev->ifname, DEBUG,
1734                        "%s: failed to transfer %u packets for queue %u.\n",
1735                        __func__, pkt_err, queue_id);
1736
1737                /* update number of completed packets */
1738                pkt_idx = n_xfer;
1739
1740                /* calculate the sum of descriptors to revert */
1741                while (pkt_err-- > 0) {
1742                        num_descs += pkts_info[slot_idx & (vq->size - 1)].descs;
1743                        slot_idx--;
1744                }
1745
1746                /* recover shadow used ring and available ring */
1747                vq->shadow_used_idx -= num_descs;
1748                vq->last_avail_idx -= num_descs;
1749        }
1750
1751        /* keep used descriptors */
1752        if (likely(vq->shadow_used_idx)) {
1753                uint16_t to = async->desc_idx_split & (vq->size - 1);
1754
1755                store_dma_desc_info_split(vq->shadow_used_split,
1756                                async->descs_split, vq->size, 0, to,
1757                                vq->shadow_used_idx);
1758
1759                async->desc_idx_split += vq->shadow_used_idx;
1760
1761                async->pkts_idx += pkt_idx;
1762                if (async->pkts_idx >= vq->size)
1763                        async->pkts_idx -= vq->size;
1764
1765                async->pkts_inflight_n += pkt_idx;
1766                vq->shadow_used_idx = 0;
1767        }
1768
1769        return pkt_idx;
1770}
1771
1772
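    /*
     * Reserve enough packed ring buffers for one packet (several
     * buffers may be needed when mergeable Rx buffers are negotiated),
     * build the async copy iterators and shadow the used entries.
     * Returns -1 when not enough descriptors are available.
     */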
1773static __rte_always_inline int
1774vhost_enqueue_async_packed(struct virtio_net *dev,
1775                            struct vhost_virtqueue *vq,
1776                            struct rte_mbuf *pkt,
1777                            struct buf_vector *buf_vec,
1778                            uint16_t *nr_descs,
1779                            uint16_t *nr_buffers)
1780{
1781        uint16_t nr_vec = 0;
1782        uint16_t avail_idx = vq->last_avail_idx;
1783        uint16_t max_tries, tries = 0;
1784        uint16_t buf_id = 0;
1785        uint32_t len = 0;
1786        uint16_t desc_count = 0;
1787        uint32_t size = pkt->pkt_len + sizeof(struct virtio_net_hdr_mrg_rxbuf);
1788        uint32_t buffer_len[vq->size];
1789        uint16_t buffer_buf_id[vq->size];
1790        uint16_t buffer_desc_count[vq->size];
1791
1792        if (rxvq_is_mergeable(dev))
1793                max_tries = vq->size - 1;
1794        else
1795                max_tries = 1;
1796
1797        while (size > 0) {
1798                /*
1799                 * If we have tried all available ring items and still
1800                 * cannot get enough buffers, something abnormal has
1801                 * happened.
1802                 */
1803                if (unlikely(++tries > max_tries))
1804                        return -1;
1805
1806                if (unlikely(fill_vec_buf_packed(dev, vq,
1807                                                avail_idx, &desc_count,
1808                                                buf_vec, &nr_vec,
1809                                                &buf_id, &len,
1810                                                VHOST_ACCESS_RW) < 0))
1811                        return -1;
1812
1813                len = RTE_MIN(len, size);
1814                size -= len;
1815
1816                buffer_len[*nr_buffers] = len;
1817                buffer_buf_id[*nr_buffers] = buf_id;
1818                buffer_desc_count[*nr_buffers] = desc_count;
1819                *nr_buffers += 1;
1820                *nr_descs += desc_count;
1821                avail_idx += desc_count;
1822                if (avail_idx >= vq->size)
1823                        avail_idx -= vq->size;
1824        }
1825
1826        if (unlikely(mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, *nr_buffers, true) < 0))
1827                return -1;
1828
1829        vhost_shadow_enqueue_packed(vq, buffer_len, buffer_buf_id, buffer_desc_count, *nr_buffers);
1830
1831        return 0;
1832}
1833
1834static __rte_always_inline int16_t
1835virtio_dev_rx_async_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
1836                            struct rte_mbuf *pkt, uint16_t *nr_descs, uint16_t *nr_buffers)
1837{
1838        struct buf_vector buf_vec[BUF_VECTOR_MAX];
1839
1840        if (unlikely(vhost_enqueue_async_packed(dev, vq, pkt, buf_vec,
1841                                        nr_descs, nr_buffers) < 0)) {
1842                VHOST_LOG_DATA(dev->ifname, DEBUG, "failed to get enough desc from vring\n");
1843                return -1;
1844        }
1845
1846        VHOST_LOG_DATA(dev->ifname, DEBUG,
1847                "current index %d | end index %d\n",
1848                vq->last_avail_idx, vq->last_avail_idx + *nr_descs);
1849
1850        return 0;
1851}
1852
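    /*
     * Roll back the state of packets whose copies could not be
     * submitted to the DMA device: rewind pkt_idx, the packed avail
     * index (handling wrap-around and the wrap counter) and the
     * shadowed used entries.
     */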
1853static __rte_always_inline void
1854dma_error_handler_packed(struct vhost_virtqueue *vq, uint16_t slot_idx,
1855                        uint32_t nr_err, uint32_t *pkt_idx)
1856{
1857        uint16_t descs_err = 0;
1858        uint16_t buffers_err = 0;
1859        struct async_inflight_info *pkts_info = vq->async->pkts_info;
1860
1861        *pkt_idx -= nr_err;
1862        /* calculate the sum of buffers and descs of DMA-error packets. */
1863        while (nr_err-- > 0) {
1864                descs_err += pkts_info[slot_idx % vq->size].descs;
1865                buffers_err += pkts_info[slot_idx % vq->size].nr_buffers;
1866                slot_idx--;
1867        }
1868
1869        if (vq->last_avail_idx >= descs_err) {
1870                vq->last_avail_idx -= descs_err;
1871        } else {
1872                vq->last_avail_idx = vq->last_avail_idx + vq->size - descs_err;
1873                vq->avail_wrap_counter ^= 1;
1874        }
1875
1876        vq->shadow_used_idx -= buffers_err;
1877}
1878
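    /*
     * Asynchronous enqueue on a packed ring: mirrors the split version,
     * but tracks both descriptor and buffer counts per packet so that
     * completion and error handling can rewind the ring correctly.
     */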
1879static __rte_noinline uint32_t
1880virtio_dev_rx_async_submit_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
1881                uint16_t queue_id, struct rte_mbuf **pkts, uint32_t count,
1882                int16_t dma_id, uint16_t vchan_id)
1883{
1884        uint32_t pkt_idx = 0;
1885        uint32_t remained = count;
1886        uint16_t n_xfer;
1887        uint16_t num_buffers;
1888        uint16_t num_descs;
1889
1890        struct vhost_async *async = vq->async;
1891        struct async_inflight_info *pkts_info = async->pkts_info;
1892        uint32_t pkt_err = 0;
1893        uint16_t slot_idx = 0;
1894
1895        do {
1896                rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
1897
1898                num_buffers = 0;
1899                num_descs = 0;
1900                if (unlikely(virtio_dev_rx_async_packed(dev, vq, pkts[pkt_idx],
1901                                                &num_descs, &num_buffers) < 0))
1902                        break;
1903
1904                slot_idx = (async->pkts_idx + pkt_idx) % vq->size;
1905
1906                pkts_info[slot_idx].descs = num_descs;
1907                pkts_info[slot_idx].nr_buffers = num_buffers;
1908                pkts_info[slot_idx].mbuf = pkts[pkt_idx];
1909
1910                pkt_idx++;
1911                remained--;
1912                vq_inc_last_avail_packed(vq, num_descs);
1913        } while (pkt_idx < count);
1914
1915        if (unlikely(pkt_idx == 0))
1916                return 0;
1917
1918        n_xfer = vhost_async_dma_transfer(dev, vq, dma_id, vchan_id, async->pkts_idx,
1919                        async->iov_iter, pkt_idx);
1920
1921        async_iter_reset(async);
1922
1923        pkt_err = pkt_idx - n_xfer;
1924        if (unlikely(pkt_err)) {
1925                VHOST_LOG_DATA(dev->ifname, DEBUG,
1926                        "%s: failed to transfer %u packets for queue %u.\n",
1927                        __func__, pkt_err, queue_id);
1928                dma_error_handler_packed(vq, slot_idx, pkt_err, &pkt_idx);
1929        }
1930
1931        if (likely(vq->shadow_used_idx)) {
1932                /* keep used descriptors. */
1933                store_dma_desc_info_packed(vq->shadow_used_packed, async->buffers_packed,
1934                                        vq->size, 0, async->buffer_idx_packed,
1935                                        vq->shadow_used_idx);
1936
1937                async->buffer_idx_packed += vq->shadow_used_idx;
1938                if (async->buffer_idx_packed >= vq->size)
1939                        async->buffer_idx_packed -= vq->size;
1940
1941                async->pkts_idx += pkt_idx;
1942                if (async->pkts_idx >= vq->size)
1943                        async->pkts_idx -= vq->size;
1944
1945                vq->shadow_used_idx = 0;
1946                async->pkts_inflight_n += pkt_idx;
1947        }
1948
1949        return pkt_idx;
1950}
1951
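    /*
     * Copy completed descriptors from the async ring into the guest
     * used ring, handling wrap-around on both sides. Updating used->idx
     * and kicking the guest are left to the caller.
     */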
1952static __rte_always_inline void
1953write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t n_descs)
1954{
1955        struct vhost_async *async = vq->async;
1956        uint16_t nr_left = n_descs;
1957        uint16_t nr_copy;
1958        uint16_t to, from;
1959
1960        do {
1961                from = async->last_desc_idx_split & (vq->size - 1);
1962                nr_copy = nr_left + from <= vq->size ? nr_left : vq->size - from;
1963                to = vq->last_used_idx & (vq->size - 1);
1964
1965                if (to + nr_copy <= vq->size) {
1966                        rte_memcpy(&vq->used->ring[to], &async->descs_split[from],
1967                                        nr_copy * sizeof(struct vring_used_elem));
1968                } else {
1969                        uint16_t size = vq->size - to;
1970
1971                        rte_memcpy(&vq->used->ring[to], &async->descs_split[from],
1972                                        size * sizeof(struct vring_used_elem));
1973                        rte_memcpy(&vq->used->ring[0], &async->descs_split[from + size],
1974                                        (nr_copy - size) * sizeof(struct vring_used_elem));
1975                }
1976
1977                async->last_desc_idx_split += nr_copy;
1978                vq->last_used_idx += nr_copy;
1979                nr_left -= nr_copy;
1980        } while (nr_left > 0);
1981}
1982
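    /*
     * Write completed buffers back to the packed used ring: id/len
     * fields first, then, after a release fence, the flags, with the
     * head descriptor's flags written last so the guest never sees a
     * partially updated chain.
     */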
1983static __rte_always_inline void
1984write_back_completed_descs_packed(struct vhost_virtqueue *vq,
1985                                uint16_t n_buffers)
1986{
1987        struct vhost_async *async = vq->async;
1988        uint16_t from = async->last_buffer_idx_packed;
1989        uint16_t used_idx = vq->last_used_idx;
1990        uint16_t head_idx = vq->last_used_idx;
1991        uint16_t head_flags = 0;
1992        uint16_t i;
1993
1994        /* Split the loop in two to save memory barriers */
1995        for (i = 0; i < n_buffers; i++) {
1996                vq->desc_packed[used_idx].id = async->buffers_packed[from].id;
1997                vq->desc_packed[used_idx].len = async->buffers_packed[from].len;
1998
1999                used_idx += async->buffers_packed[from].count;
2000                if (used_idx >= vq->size)
2001                        used_idx -= vq->size;
2002
2003                from++;
2004                if (from >= vq->size)
2005                        from = 0;
2006        }
2007
2008        /* The ordering for storing desc flags needs to be enforced. */
2009        rte_atomic_thread_fence(__ATOMIC_RELEASE);
2010
2011        from = async->last_buffer_idx_packed;
2012
2013        for (i = 0; i < n_buffers; i++) {
2014                uint16_t flags;
2015
2016                if (async->buffers_packed[from].len)
2017                        flags = VRING_DESC_F_WRITE;
2018                else
2019                        flags = 0;
2020
2021                if (vq->used_wrap_counter) {
2022                        flags |= VRING_DESC_F_USED;
2023                        flags |= VRING_DESC_F_AVAIL;
2024                } else {
2025                        flags &= ~VRING_DESC_F_USED;
2026                        flags &= ~VRING_DESC_F_AVAIL;
2027                }
2028
2029                if (i > 0) {
2030                        vq->desc_packed[vq->last_used_idx].flags = flags;
2031                } else {
2032                        head_idx = vq->last_used_idx;
2033                        head_flags = flags;
2034                }
2035
2036                vq_inc_last_used_packed(vq, async->buffers_packed[from].count);
2037
2038                from++;
2039                if (from == vq->size)
2040                        from = 0;
2041        }
2042
2043        vq->desc_packed[head_idx].flags = head_flags;
2044        async->last_buffer_idx_packed = from;
2045}
2046
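    /*
     * Poll the DMA vChannel for completed copies, gather the packets
     * whose transfers have fully completed and write their descriptors
     * back to the used ring; when the ring is not accessible, only the
     * async bookkeeping indexes are advanced.
     */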
2047static __rte_always_inline uint16_t
2048vhost_poll_enqueue_completed(struct virtio_net *dev, uint16_t queue_id,
2049                struct rte_mbuf **pkts, uint16_t count, int16_t dma_id,
2050                uint16_t vchan_id)
2051{
2052        struct vhost_virtqueue *vq = dev->virtqueue[queue_id];
2053        struct vhost_async *async = vq->async;
2054        struct async_inflight_info *pkts_info = async->pkts_info;
2055        uint16_t nr_cpl_pkts = 0;
2056        uint16_t n_descs = 0, n_buffers = 0;
2057        uint16_t start_idx, from, i;
2058
2059        /* Check completed copies for the given DMA vChannel */
2060        vhost_async_dma_check_completed(dev, dma_id, vchan_id, VHOST_DMA_MAX_COPY_COMPLETE);
2061
2062        start_idx = async_get_first_inflight_pkt_idx(vq);
2063        /**
2064         * Calculate the number of packets whose copies have completed.
2065         * Note that there may be completed packets even if
2066         * no copies are reported done by the given DMA vChannel,
2067         * as it's possible that a virtqueue uses multiple DMA
2068         * vChannels.
2069         */
2070        from = start_idx;
2071        while (vq->async->pkts_cmpl_flag[from] && count--) {
2072                vq->async->pkts_cmpl_flag[from] = false;
2073                from++;
2074                if (from >= vq->size)
2075                        from -= vq->size;
2076                nr_cpl_pkts++;
2077        }
2078
2079        if (nr_cpl_pkts == 0)
2080                return 0;
2081
2082        for (i = 0; i < nr_cpl_pkts; i++) {
2083                from = (start_idx + i) % vq->size;
2084                /* Only used with packed ring */
2085                n_buffers += pkts_info[from].nr_buffers;
2086                /* Only used with split ring */
2087                n_descs += pkts_info[from].descs;
2088                pkts[i] = pkts_info[from].mbuf;
2089        }
2090
2091        async->pkts_inflight_n -= nr_cpl_pkts;
2092
2093        if (likely(vq->enabled && vq->access_ok)) {
2094                if (vq_is_packed(dev)) {
2095                        write_back_completed_descs_packed(vq, n_buffers);
2096                        vhost_vring_call_packed(dev, vq);
2097                } else {
2098                        write_back_completed_descs_split(vq, n_descs);
2099                        __atomic_add_fetch(&vq->used->idx, n_descs, __ATOMIC_RELEASE);
2100                        vhost_vring_call_split(dev, vq);
2101                }
2102        } else {
2103                if (vq_is_packed(dev)) {
2104                        async->last_buffer_idx_packed += n_buffers;
2105                        if (async->last_buffer_idx_packed >= vq->size)
2106                                async->last_buffer_idx_packed -= vq->size;
2107                } else {
2108                        async->last_desc_idx_split += n_descs;
2109                }
2110        }
2111
2112        return nr_cpl_pkts;
2113}
2114
2115uint16_t
2116rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
2117                struct rte_mbuf **pkts, uint16_t count, int16_t dma_id,
2118                uint16_t vchan_id)
2119{
2120        struct virtio_net *dev = get_device(vid);
2121        struct vhost_virtqueue *vq;
2122        uint16_t n_pkts_cpl = 0;
2123
2124        if (unlikely(!dev))
2125                return 0;
2126
2127        VHOST_LOG_DATA(dev->ifname, DEBUG, "%s\n", __func__);
2128        if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
2129                VHOST_LOG_DATA(dev->ifname, ERR,
2130                        "%s: invalid virtqueue idx %d.\n",
2131                        __func__, queue_id);
2132                return 0;
2133        }
2134
2135        if (unlikely(!dma_copy_track[dma_id].vchans ||
2136                                !dma_copy_track[dma_id].vchans[vchan_id].pkts_cmpl_flag_addr)) {
2137                VHOST_LOG_DATA(dev->ifname, ERR,
2138                        "%s: invalid channel %d:%u.\n",
2139                        __func__, dma_id, vchan_id);
2140                return 0;
2141        }
2142
2143        vq = dev->virtqueue[queue_id];
2144
2145        if (!rte_spinlock_trylock(&vq->access_lock)) {
2146                VHOST_LOG_DATA(dev->ifname, DEBUG,
2147                        "%s: virtqueue %u is busy.\n",
2148                        __func__, queue_id);
2149                return 0;
2150        }
2151
2152        if (unlikely(!vq->async)) {
2153                VHOST_LOG_DATA(dev->ifname, ERR,
2154                        "%s: async not registered for virtqueue %d.\n",
2155                        __func__, queue_id);
2156                goto out;
2157        }
2158
2159        n_pkts_cpl = vhost_poll_enqueue_completed(dev, queue_id, pkts, count, dma_id, vchan_id);
2160
2161        vhost_queue_stats_update(dev, vq, pkts, n_pkts_cpl);
2162        vq->stats.inflight_completed += n_pkts_cpl;
2163
2164out:
2165        rte_spinlock_unlock(&vq->access_lock);
2166
2167        return n_pkts_cpl;
2168}
2169
2170uint16_t
2171rte_vhost_clear_queue_thread_unsafe(int vid, uint16_t queue_id,
2172                struct rte_mbuf **pkts, uint16_t count, int16_t dma_id,
2173                uint16_t vchan_id)
2174{
2175        struct virtio_net *dev = get_device(vid);
2176        struct vhost_virtqueue *vq;
2177        uint16_t n_pkts_cpl = 0;
2178
2179        if (!dev)
2180                return 0;
2181
2182        VHOST_LOG_DATA(dev->ifname, DEBUG, "%s\n", __func__);
2183        if (unlikely(queue_id >= dev->nr_vring)) {
2184                VHOST_LOG_DATA(dev->ifname, ERR, "%s: invalid virtqueue idx %d.\n",
2185                        __func__, queue_id);
2186                return 0;
2187        }
2188
2189        if (unlikely(dma_id < 0 || dma_id >= RTE_DMADEV_DEFAULT_MAX)) {
2190                VHOST_LOG_DATA(dev->ifname, ERR, "%s: invalid dma id %d.\n",
2191                        __func__, dma_id);
2192                return 0;
2193        }
2194
2195        vq = dev->virtqueue[queue_id];
2196
2197        if (unlikely(!rte_spinlock_is_locked(&vq->access_lock))) {
2198                VHOST_LOG_DATA(dev->ifname, ERR, "%s() called without access lock taken.\n",
2199                        __func__);
2200                return -1;
2201        }
2202
2203        if (unlikely(!vq->async)) {
2204                VHOST_LOG_DATA(dev->ifname, ERR,
2205                        "%s: async not registered for virtqueue %d.\n",
2206                        __func__, queue_id);
2207                return 0;
2208        }
2209
2210        if (unlikely(!dma_copy_track[dma_id].vchans ||
2211                                !dma_copy_track[dma_id].vchans[vchan_id].pkts_cmpl_flag_addr)) {
2212                VHOST_LOG_DATA(dev->ifname, ERR,
2213                        "%s: invalid channel %d:%u.\n",
2214                        __func__, dma_id, vchan_id);
2215                return 0;
2216        }
2217
2218        if ((queue_id & 1) == 0) {
2219                n_pkts_cpl = vhost_poll_enqueue_completed(dev, queue_id,
2220                                        pkts, count, dma_id, vchan_id);
2221        } else {
2222                n_pkts_cpl = async_poll_dequeue_completed(dev, vq, pkts, count,
2223                                        dma_id, vchan_id, dev->flags & VIRTIO_DEV_LEGACY_OL_FLAGS);
2224        }
2225
2226        vhost_queue_stats_update(dev, vq, pkts, n_pkts_cpl);
2227        vq->stats.inflight_completed += n_pkts_cpl;
2228
2229        return n_pkts_cpl;
2230}
2231
2232uint16_t
2233rte_vhost_clear_queue(int vid, uint16_t queue_id, struct rte_mbuf **pkts,
2234                uint16_t count, int16_t dma_id, uint16_t vchan_id)
2235{
2236        struct virtio_net *dev = get_device(vid);
2237        struct vhost_virtqueue *vq;
2238        uint16_t n_pkts_cpl = 0;
2239
2240        if (!dev)
2241                return 0;
2242
2243        VHOST_LOG_DATA(dev->ifname, DEBUG, "%s\n", __func__);
2244        if (unlikely(queue_id >= dev->nr_vring)) {
2245                VHOST_LOG_DATA(dev->ifname, ERR, "%s: invalid virtqueue idx %u.\n",
2246                        __func__, queue_id);
2247                return 0;
2248        }
2249
2250        if (unlikely(dma_id < 0 || dma_id >= RTE_DMADEV_DEFAULT_MAX)) {
2251                VHOST_LOG_DATA(dev->ifname, ERR, "%s: invalid dma id %d.\n",
2252                        __func__, dma_id);
2253                return 0;
2254        }
2255
2256        vq = dev->virtqueue[queue_id];
2257
2258        if (!rte_spinlock_trylock(&vq->access_lock)) {
2259                VHOST_LOG_DATA(dev->ifname, DEBUG, "%s: virtqueue %u is busy.\n",
2260                        __func__, queue_id);
2261                return 0;
2262        }
2263
2264        if (unlikely(!vq->async)) {
2265                VHOST_LOG_DATA(dev->ifname, ERR, "%s: async not registered for queue id %u.\n",
2266                        __func__, queue_id);
2267                goto out_access_unlock;
2268        }
2269
2270        if (unlikely(!dma_copy_track[dma_id].vchans ||
2271                                !dma_copy_track[dma_id].vchans[vchan_id].pkts_cmpl_flag_addr)) {
2272                VHOST_LOG_DATA(dev->ifname, ERR, "%s: invalid channel %d:%u.\n",
2273                        __func__, dma_id, vchan_id);
2274                goto out_access_unlock;
2275        }
2276
2277        if ((queue_id & 1) == 0) {
2278                n_pkts_cpl = vhost_poll_enqueue_completed(dev, queue_id,
2279                                pkts, count, dma_id, vchan_id);
2280        } else {
2281                n_pkts_cpl = async_poll_dequeue_completed(dev, vq, pkts, count,
2282                                        dma_id, vchan_id, dev->flags & VIRTIO_DEV_LEGACY_OL_FLAGS);
2283        }
2284
2285        vhost_queue_stats_update(dev, vq, pkts, n_pkts_cpl);
2286        vq->stats.inflight_completed += n_pkts_cpl;
2287
2288out_access_unlock:
2289        rte_spinlock_unlock(&vq->access_lock);
2290
2291        return n_pkts_cpl;
2292}
2293
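    /*
     * Async enqueue entry point: same locking scheme as virtio_dev_rx(),
     * plus validation of the DMA vChannel, before dispatching to the
     * packed or split async submit path.
     */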
2294static __rte_always_inline uint32_t
2295virtio_dev_rx_async_submit(struct virtio_net *dev, uint16_t queue_id,
2296        struct rte_mbuf **pkts, uint32_t count, int16_t dma_id, uint16_t vchan_id)
2297{
2298        struct vhost_virtqueue *vq;
2299        uint32_t nb_tx = 0;
2300
2301        VHOST_LOG_DATA(dev->ifname, DEBUG, "%s\n", __func__);
2302        if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
2303                VHOST_LOG_DATA(dev->ifname, ERR,
2304                        "%s: invalid virtqueue idx %d.\n",
2305                        __func__, queue_id);
2306                return 0;
2307        }
2308
2309        if (unlikely(!dma_copy_track[dma_id].vchans ||
2310                                !dma_copy_track[dma_id].vchans[vchan_id].pkts_cmpl_flag_addr)) {
2311                VHOST_LOG_DATA(dev->ifname, ERR,
2312                        "%s: invalid channel %d:%u.\n",
2313                         __func__, dma_id, vchan_id);
2314                return 0;
2315        }
2316
2317        vq = dev->virtqueue[queue_id];
2318
2319        rte_spinlock_lock(&vq->access_lock);
2320
2321        if (unlikely(!vq->enabled || !vq->async))
2322                goto out_access_unlock;
2323
2324        if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
2325                vhost_user_iotlb_rd_lock(vq);
2326
2327        if (unlikely(!vq->access_ok))
2328                if (unlikely(vring_translate(dev, vq) < 0))
2329                        goto out;
2330
2331        count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
2332        if (count == 0)
2333                goto out;
2334
2335        if (vq_is_packed(dev))
2336                nb_tx = virtio_dev_rx_async_submit_packed(dev, vq, queue_id,
2337                                pkts, count, dma_id, vchan_id);
2338        else
2339                nb_tx = virtio_dev_rx_async_submit_split(dev, vq, queue_id,
2340                                pkts, count, dma_id, vchan_id);
2341
2342        vq->stats.inflight_submitted += nb_tx;
2343
2344out:
2345        if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
2346                vhost_user_iotlb_rd_unlock(vq);
2347
2348out_access_unlock:
2349        rte_spinlock_unlock(&vq->access_lock);
2350
2351        return nb_tx;
2352}
2353
2354uint16_t
2355rte_vhost_submit_enqueue_burst(int vid, uint16_t queue_id,
2356                struct rte_mbuf **pkts, uint16_t count, int16_t dma_id,
2357                uint16_t vchan_id)
2358{
2359        struct virtio_net *dev = get_device(vid);
2360
2361        if (!dev)
2362                return 0;
2363
2364        if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
2365                VHOST_LOG_DATA(dev->ifname, ERR,
2366                        "%s: built-in vhost net backend is disabled.\n",
2367                        __func__);
2368                return 0;
2369        }
2370
2371        return virtio_dev_rx_async_submit(dev, queue_id, pkts, count, dma_id, vchan_id);
2372}
2373
2374static inline bool
2375virtio_net_with_host_offload(struct virtio_net *dev)
2376{
2377        if (dev->features &
2378                        ((1ULL << VIRTIO_NET_F_CSUM) |
2379                         (1ULL << VIRTIO_NET_F_HOST_ECN) |
2380                         (1ULL << VIRTIO_NET_F_HOST_TSO4) |
2381                         (1ULL << VIRTIO_NET_F_HOST_TSO6) |
2382                         (1ULL << VIRTIO_NET_F_HOST_UFO)))
2383                return true;
2384
2385        return false;
2386}
2387
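    /*
     * Parse the Ethernet/VLAN, IPv4/IPv6 and L4 headers of an mbuf,
     * setting l2_len, l3_len and the IPv4/IPv6 Tx offload flags, and
     * report the L4 protocol. On truncated or unsupported headers,
     * clear the offload state and return -EINVAL.
     */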
2388static int
2389parse_headers(struct rte_mbuf *m, uint8_t *l4_proto)
2390{
2391        struct rte_ipv4_hdr *ipv4_hdr;
2392        struct rte_ipv6_hdr *ipv6_hdr;
2393        struct rte_ether_hdr *eth_hdr;
2394        uint16_t ethertype;
2395        uint16_t data_len = rte_pktmbuf_data_len(m);
2396
2397        if (data_len < sizeof(struct rte_ether_hdr))
2398                return -EINVAL;
2399
2400        eth_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
2401
2402        m->l2_len = sizeof(struct rte_ether_hdr);
2403        ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);
2404
2405        if (ethertype == RTE_ETHER_TYPE_VLAN) {
2406                if (data_len < sizeof(struct rte_ether_hdr) +
2407                                sizeof(struct rte_vlan_hdr))
2408                        goto error;
2409
2410                struct rte_vlan_hdr *vlan_hdr =
2411                        (struct rte_vlan_hdr *)(eth_hdr + 1);
2412
2413                m->l2_len += sizeof(struct rte_vlan_hdr);
2414                ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
2415        }
2416
2417        switch (ethertype) {
2418        case RTE_ETHER_TYPE_IPV4:
2419                if (data_len < m->l2_len + sizeof(struct rte_ipv4_hdr))
2420                        goto error;
2421                ipv4_hdr = rte_pktmbuf_mtod_offset(m, struct rte_ipv4_hdr *,
2422                                m->l2_len);
2423                m->l3_len = rte_ipv4_hdr_len(ipv4_hdr);
2424                if (data_len < m->l2_len + m->l3_len)
2425                        goto error;
2426                m->ol_flags |= RTE_MBUF_F_TX_IPV4;
2427                *l4_proto = ipv4_hdr->next_proto_id;
2428                break;
2429        case RTE_ETHER_TYPE_IPV6:
2430                if (data_len < m->l2_len + sizeof(struct rte_ipv6_hdr))
2431                        goto error;
2432                ipv6_hdr = rte_pktmbuf_mtod_offset(m, struct rte_ipv6_hdr *,
2433                                m->l2_len);
2434                m->l3_len = sizeof(struct rte_ipv6_hdr);
2435                m->ol_flags |= RTE_MBUF_F_TX_IPV6;
2436                *l4_proto = ipv6_hdr->proto;
2437                break;
2438        default:
2439                /* a valid L3 header is needed for further L4 parsing */
2440                goto error;
2441        }
2442
2443        /* both CSUM and GSO need a valid L4 header */
2444        switch (*l4_proto) {
2445        case IPPROTO_TCP:
2446                if (data_len < m->l2_len + m->l3_len +
2447                                sizeof(struct rte_tcp_hdr))
2448                        goto error;
2449                break;
2450        case IPPROTO_UDP:
2451                if (data_len < m->l2_len + m->l3_len +
2452                                sizeof(struct rte_udp_hdr))
2453                        goto error;
2454                break;
2455        case IPPROTO_SCTP:
2456                if (data_len < m->l2_len + m->l3_len +
2457                                sizeof(struct rte_sctp_hdr))
2458                        goto error;
2459                break;
2460        default:
2461                goto error;
2462        }
2463
2464        return 0;
2465
2466error:
2467        m->l2_len = 0;
2468        m->l3_len = 0;
2469        m->ol_flags = 0;
2470        return -EINVAL;
2471}
2472
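    /*
     * Legacy dequeue offload handling: convert the virtio-net header
     * checksum and GSO requests into Tx offload flags on the mbuf,
     * clearing all offload state when the header is inconsistent with
     * the parsed packet.
     */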
2473static __rte_always_inline void
2474vhost_dequeue_offload_legacy(struct virtio_net *dev, struct virtio_net_hdr *hdr,
2475                struct rte_mbuf *m)
2476{
2477        uint8_t l4_proto = 0;
2478        struct rte_tcp_hdr *tcp_hdr = NULL;
2479        uint16_t tcp_len;
2480        uint16_t data_len = rte_pktmbuf_data_len(m);
2481
2482        if (parse_headers(m, &l4_proto) < 0)
2483                return;
2484
2485        if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2486                if (hdr->csum_start == (m->l2_len + m->l3_len)) {
2487                        switch (hdr->csum_offset) {
2488                        case (offsetof(struct rte_tcp_hdr, cksum)):
2489                                if (l4_proto != IPPROTO_TCP)
2490                                        goto error;
2491                                m->ol_flags |= RTE_MBUF_F_TX_TCP_CKSUM;
2492                                break;
2493                        case (offsetof(struct rte_udp_hdr, dgram_cksum)):
2494                                if (l4_proto != IPPROTO_UDP)
2495                                        goto error;
2496                                m->ol_flags |= RTE_MBUF_F_TX_UDP_CKSUM;
2497                                break;
2498                        case (offsetof(struct rte_sctp_hdr, cksum)):
2499                                if (l4_proto != IPPROTO_SCTP)
2500                                        goto error;
2501                                m->ol_flags |= RTE_MBUF_F_TX_SCTP_CKSUM;
2502                                break;
2503                        default:
2504                                goto error;
2505                        }
2506                } else {
2507                        goto error;
2508                }
2509        }
2510
2511        if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2512                switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2513                case VIRTIO_NET_HDR_GSO_TCPV4:
2514                case VIRTIO_NET_HDR_GSO_TCPV6:
2515                        if (l4_proto != IPPROTO_TCP)
2516                                goto error;
2517                        tcp_hdr = rte_pktmbuf_mtod_offset(m,
2518                                        struct rte_tcp_hdr *,
2519                                        m->l2_len + m->l3_len);
2520                        tcp_len = (tcp_hdr->data_off & 0xf0) >> 2;
2521                        if (data_len < m->l2_len + m->l3_len + tcp_len)
2522                                goto error;
2523                        m->ol_flags |= RTE_MBUF_F_TX_TCP_SEG;
2524                        m->tso_segsz = hdr->gso_size;
2525                        m->l4_len = tcp_len;
2526                        break;
2527                case VIRTIO_NET_HDR_GSO_UDP:
2528                        if (l4_proto != IPPROTO_UDP)
2529                                goto error;
2530                        m->ol_flags |= RTE_MBUF_F_TX_UDP_SEG;
2531                        m->tso_segsz = hdr->gso_size;
2532                        m->l4_len = sizeof(struct rte_udp_hdr);
2533                        break;
2534                default:
2535                        VHOST_LOG_DATA(dev->ifname, WARNING,
2536                                "unsupported gso type %u.\n",
2537                                hdr->gso_type);
2538                        goto error;
2539                }
2540        }
2541        return;
2542
2543error:
2544        m->l2_len = 0;
2545        m->l3_len = 0;
2546        m->ol_flags = 0;
2547}
2548
2549static __rte_always_inline void
2550vhost_dequeue_offload(struct virtio_net *dev, struct virtio_net_hdr *hdr,
2551                struct rte_mbuf *m, bool legacy_ol_flags)
2552{
2553        struct rte_net_hdr_lens hdr_lens;
2554        int l4_supported = 0;
2555        uint32_t ptype;
2556
2557        if (hdr->flags == 0 && hdr->gso_type == VIRTIO_NET_HDR_GSO_NONE)
2558                return;
2559
2560        if (legacy_ol_flags) {
2561                vhost_dequeue_offload_legacy(dev, hdr, m);
2562                return;
2563        }
2564
2565        m->ol_flags |= RTE_MBUF_F_RX_IP_CKSUM_UNKNOWN;
2566
2567        ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
2568        m->packet_type = ptype;
2569        if ((ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP ||
2570            (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_UDP ||
2571            (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_SCTP)
2572                l4_supported = 1;
2573
2574        /* According to Virtio 1.1 spec, the device only needs to look at
2575         * VIRTIO_NET_HDR_F_NEEDS_CSUM in the packet transmission path.
2576         * This differs from the incoming packet processing path, where the
2577         * driver could rely on VIRTIO_NET_HDR_F_DATA_VALID flag set by the
2578         * device.
2579         *
2580         * 5.1.6.2.1 Driver Requirements: Packet Transmission
2581         * The driver MUST NOT set the VIRTIO_NET_HDR_F_DATA_VALID and
2582         * VIRTIO_NET_HDR_F_RSC_INFO bits in flags.
2583         *
2584         * 5.1.6.2.2 Device Requirements: Packet Transmission
2585         * The device MUST ignore flag bits that it does not recognize.
2586         */
2587        if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2588                uint32_t hdrlen;
2589
2590                hdrlen = hdr_lens.l2_len + hdr_lens.l3_len + hdr_lens.l4_len;
2591                if (hdr->csum_start <= hdrlen && l4_supported != 0) {
2592                        m->ol_flags |= RTE_MBUF_F_RX_L4_CKSUM_NONE;
2593                } else {
2594                        /* Unknown proto or tunnel, do sw cksum. We can assume
2595                         * the cksum field is in the first segment since the
2596                         * buffers we provided to the host are large enough.
2597                         * In case of SCTP, this will be wrong since it's a CRC
2598                         * but there's nothing we can do.
2599                         */
2600                        uint16_t csum = 0, off;
2601
2602                        if (rte_raw_cksum_mbuf(m, hdr->csum_start,
2603                                        rte_pktmbuf_pkt_len(m) - hdr->csum_start, &csum) < 0)
2604                                return;
2605                        if (likely(csum != 0xffff))
2606                                csum = ~csum;
2607                        off = hdr->csum_offset + hdr->csum_start;
2608                        if (rte_pktmbuf_data_len(m) >= off + 1)
2609                                *rte_pktmbuf_mtod_offset(m, uint16_t *, off) = csum;
2610                }
2611        }
2612
2613        if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2614                if (hdr->gso_size == 0)
2615                        return;
2616
2617                switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2618                case VIRTIO_NET_HDR_GSO_TCPV4:
2619                case VIRTIO_NET_HDR_GSO_TCPV6:
2620                        if ((ptype & RTE_PTYPE_L4_MASK) != RTE_PTYPE_L4_TCP)
2621                                break;
2622                        m->ol_flags |= RTE_MBUF_F_RX_LRO | RTE_MBUF_F_RX_L4_CKSUM_NONE;
2623                        m->tso_segsz = hdr->gso_size;
2624                        break;
2625                case VIRTIO_NET_HDR_GSO_UDP:
2626                        if ((ptype & RTE_PTYPE_L4_MASK) != RTE_PTYPE_L4_UDP)
2627                                break;
2628                        m->ol_flags |= RTE_MBUF_F_RX_LRO | RTE_MBUF_F_RX_L4_CKSUM_NONE;
2629                        m->tso_segsz = hdr->gso_size;
2630                        break;
2631                default:
2632                        break;
2633                }
2634        }
2635}
2636
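    /*
     * Gather a virtio-net header scattered across several descriptors
     * into the caller-provided contiguous copy.
     */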
2637static __rte_noinline void
2638copy_vnet_hdr_from_desc(struct virtio_net_hdr *hdr,
2639                struct buf_vector *buf_vec)
2640{
2641        uint64_t len;
2642        uint64_t remain = sizeof(struct virtio_net_hdr);
2643        uint64_t src;
2644        uint64_t dst = (uint64_t)(uintptr_t)hdr;
2645
2646        while (remain) {
2647                len = RTE_MIN(remain, buf_vec->buf_len);
2648                src = buf_vec->buf_addr;
2649                rte_memcpy((void *)(uintptr_t)dst,
2650                                (void *)(uintptr_t)src, len);
2651
2652                remain -= len;
2653                dst += len;
2654                buf_vec++;
2655        }
2656}
2657
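    /*
     * Copy one guest buffer chain into an mbuf, chaining extra mbufs
     * from mbuf_pool as needed. In async mode the segments are appended
     * to the DMA iterator instead of being copied immediately, and the
     * virtio-net header is saved in pkts_info for offload processing
     * once the copies complete.
     */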
2658static __rte_always_inline int
2659desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
2660                  struct buf_vector *buf_vec, uint16_t nr_vec,
2661                  struct rte_mbuf *m, struct rte_mempool *mbuf_pool,
2662                  bool legacy_ol_flags, uint16_t slot_idx, bool is_async)
2663{
2664        uint32_t buf_avail, buf_offset, buf_len;
2665        uint64_t buf_addr, buf_iova;
2666        uint32_t mbuf_avail, mbuf_offset;
2667        uint32_t cpy_len;
2668        struct rte_mbuf *cur = m, *prev = m;
2669        struct virtio_net_hdr tmp_hdr;
2670        struct virtio_net_hdr *hdr = NULL;
2671        /* Index into buf_vec, bounded by nr_vec to avoid looping on a broken desc chain */
2672        uint16_t vec_idx = 0;
2673        struct vhost_async *async = vq->async;
2674        struct async_inflight_info *pkts_info;
2675
2676        buf_addr = buf_vec[vec_idx].buf_addr;
2677        buf_iova = buf_vec[vec_idx].buf_iova;
2678        buf_len = buf_vec[vec_idx].buf_len;
2679
2680        if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1))
2681                return -1;
2682
2683        if (virtio_net_with_host_offload(dev)) {
2684                if (unlikely(buf_len < sizeof(struct virtio_net_hdr))) {
2685                        /*
2686                         * No luck, the virtio-net header doesn't fit
2687                         * in a contiguous virtual area.
2688                         */
2689                        copy_vnet_hdr_from_desc(&tmp_hdr, buf_vec);
2690                        hdr = &tmp_hdr;
2691                } else {
2692                        hdr = (struct virtio_net_hdr *)((uintptr_t)buf_addr);
2693                }
2694        }
2695
2696        /*
2697         * A virtio driver normally uses at least 2 desc buffers
2698         * for Tx: the first for storing the header, and the others
2699         * for storing the data.
2700         */
2701        if (unlikely(buf_len < dev->vhost_hlen)) {
2702                buf_offset = dev->vhost_hlen - buf_len;
2703                vec_idx++;
2704                buf_addr = buf_vec[vec_idx].buf_addr;
2705                buf_iova = buf_vec[vec_idx].buf_iova;
2706                buf_len = buf_vec[vec_idx].buf_len;
2707                buf_avail  = buf_len - buf_offset;
2708        } else if (buf_len == dev->vhost_hlen) {
2709                if (unlikely(++vec_idx >= nr_vec))
2710                        goto error;
2711                buf_addr = buf_vec[vec_idx].buf_addr;
2712                buf_iova = buf_vec[vec_idx].buf_iova;
2713                buf_len = buf_vec[vec_idx].buf_len;
2714
2715                buf_offset = 0;
2716                buf_avail = buf_len;
2717        } else {
2718                buf_offset = dev->vhost_hlen;
2719                buf_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
2720        }
2721
2722        PRINT_PACKET(dev,
2723                        (uintptr_t)(buf_addr + buf_offset),
2724                        (uint32_t)buf_avail, 0);
2725
2726        mbuf_offset = 0;
2727        mbuf_avail  = m->buf_len - RTE_PKTMBUF_HEADROOM;
2728
2729        if (is_async) {
2730                pkts_info = async->pkts_info;
2731                if (async_iter_initialize(dev, async))
2732                        return -1;
2733        }
2734
2735        while (1) {
2736                cpy_len = RTE_MIN(buf_avail, mbuf_avail);
2737
2738                if (is_async) {
2739                        if (async_fill_seg(dev, vq, cur, mbuf_offset,
2740                                           buf_iova + buf_offset, cpy_len, false) < 0)
2741                                goto error;
2742                } else if (likely(hdr && cur == m)) {
2743                        rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *, mbuf_offset),
2744                                (void *)((uintptr_t)(buf_addr + buf_offset)),
2745                                cpy_len);
2746                } else {
2747                        sync_fill_seg(dev, vq, cur, mbuf_offset,
2748                                      buf_addr + buf_offset,
2749                                      buf_iova + buf_offset, cpy_len, false);
2750                }
2751
2752                mbuf_avail  -= cpy_len;
2753                mbuf_offset += cpy_len;
2754                buf_avail -= cpy_len;
2755                buf_offset += cpy_len;
2756
2757                /* This buffer has reached its end, get the next one */
2758                if (buf_avail == 0) {
2759                        if (++vec_idx >= nr_vec)
2760                                break;
2761
2762                        buf_addr = buf_vec[vec_idx].buf_addr;
2763                        buf_iova = buf_vec[vec_idx].buf_iova;
2764                        buf_len = buf_vec[vec_idx].buf_len;
2765
2766                        buf_offset = 0;
2767                        buf_avail  = buf_len;
2768
2769                        PRINT_PACKET(dev, (uintptr_t)buf_addr,
2770                                        (uint32_t)buf_avail, 0);
2771                }
2772
2773                /*
2774                 * This mbuf has reached its end, allocate a new one
2775                 * to hold more data.
2776                 */
2777                if (mbuf_avail == 0) {
2778                        cur = rte_pktmbuf_alloc(mbuf_pool);
2779                        if (unlikely(cur == NULL)) {
2780                                VHOST_LOG_DATA(dev->ifname, ERR,
2781                                        "failed to allocate memory for mbuf.\n");
2782                                goto error;
2783                        }
2784
2785                        prev->next = cur;
2786                        prev->data_len = mbuf_offset;
2787                        m->nb_segs += 1;
2788                        m->pkt_len += mbuf_offset;
2789                        prev = cur;
2790
2791                        mbuf_offset = 0;
2792                        mbuf_avail  = cur->buf_len - RTE_PKTMBUF_HEADROOM;
2793                }
2794        }
2795
2796        prev->data_len = mbuf_offset;
2797        m->pkt_len    += mbuf_offset;
2798
2799        if (is_async) {
2800                async_iter_finalize(async);
2801                if (hdr)
2802                        pkts_info[slot_idx].nethdr = *hdr;
2803        } else if (hdr) {
2804                vhost_dequeue_offload(dev, hdr, m, legacy_ol_flags);
2805        }
2806
2807        return 0;
2808error:
2809        if (is_async)
2810                async_iter_cancel(async);
2811
2812        return -1;
2813}
2814
2815static void
2816virtio_dev_extbuf_free(void *addr __rte_unused, void *opaque)
2817{
2818        rte_free(opaque);
2819}
2820
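    /*
     * Allocate an external buffer large enough for 'size' bytes plus
     * headroom and shared info, and attach it to the mbuf so oversized
     * packets can be received without chaining.
     */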
2821static int
2822virtio_dev_extbuf_alloc(struct virtio_net *dev, struct rte_mbuf *pkt, uint32_t size)
2823{
2824        struct rte_mbuf_ext_shared_info *shinfo = NULL;
2825        uint32_t total_len = RTE_PKTMBUF_HEADROOM + size;
2826        uint16_t buf_len;
2827        rte_iova_t iova;
2828        void *buf;
2829
2830        total_len += sizeof(*shinfo) + sizeof(uintptr_t);
2831        total_len = RTE_ALIGN_CEIL(total_len, sizeof(uintptr_t));
2832
2833        if (unlikely(total_len > UINT16_MAX))
2834                return -ENOSPC;
2835
2836        buf_len = total_len;
2837        buf = rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE);
2838        if (unlikely(buf == NULL))
2839                return -ENOMEM;
2840
2841        /* Initialize shinfo */
2842        shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len,
2843                                                virtio_dev_extbuf_free, buf);
2844        if (unlikely(shinfo == NULL)) {
2845                rte_free(buf);
2846                VHOST_LOG_DATA(dev->ifname, ERR, "failed to init shinfo\n");
2847                return -1;
2848        }
2849
2850        iova = rte_malloc_virt2iova(buf);
2851        rte_pktmbuf_attach_extbuf(pkt, buf, iova, buf_len, shinfo);
2852        rte_pktmbuf_reset_headroom(pkt);
2853
2854        return 0;
2855}
2856
2857/*
2858 * Prepare a pktmbuf that can hold data_len bytes of guest packet data.
2859 */
2860static __rte_always_inline int
2861virtio_dev_pktmbuf_prep(struct virtio_net *dev, struct rte_mbuf *pkt,
2862                         uint32_t data_len)
2863{
2864        if (rte_pktmbuf_tailroom(pkt) >= data_len)
2865                return 0;
2866
2867        /* attach an external buffer if supported */
2868        if (dev->extbuf && !virtio_dev_extbuf_alloc(dev, pkt, data_len))
2869                return 0;
2870
2871        /* check if chained buffers are allowed */
2872        if (!dev->linearbuf)
2873                return 0;
2874
2875        return -1;
2876}
2877
2878__rte_always_inline
2879static uint16_t
2880virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
2881        struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count,
2882        bool legacy_ol_flags)
2883{
2884        uint16_t i;
2885        uint16_t avail_entries;
2886        uint16_t dropped = 0;
2887        static bool allocerr_warned;
2888
2889        /*
2890         * The ordering between avail index and
2891         * desc reads needs to be enforced.
2892         */
2893        avail_entries = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE) -
2894                        vq->last_avail_idx;
2895        if (avail_entries == 0)
2896                return 0;
2897
2898        rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
2899
2900        VHOST_LOG_DATA(dev->ifname, DEBUG, "%s\n", __func__);
2901
2902        count = RTE_MIN(count, MAX_PKT_BURST);
2903        count = RTE_MIN(count, avail_entries);
2904        VHOST_LOG_DATA(dev->ifname, DEBUG, "about to dequeue %u buffers\n", count);
2905
2906        if (rte_pktmbuf_alloc_bulk(mbuf_pool, pkts, count))
2907                return 0;
2908
2909        for (i = 0; i < count; i++) {
2910                struct buf_vector buf_vec[BUF_VECTOR_MAX];
2911                uint16_t head_idx;
2912                uint32_t buf_len;
2913                uint16_t nr_vec = 0;
2914                int err;
2915
2916                if (unlikely(fill_vec_buf_split(dev, vq,
2917                                                vq->last_avail_idx + i,
2918                                                &nr_vec, buf_vec,
2919                                                &head_idx, &buf_len,
2920                                                VHOST_ACCESS_RO) < 0))
2921                        break;
2922
2923                update_shadow_used_ring_split(vq, head_idx, 0);
2924
2925                err = virtio_dev_pktmbuf_prep(dev, pkts[i], buf_len);
2926                if (unlikely(err)) {
2927                        /*
2928                         * mbuf allocation fails for jumbo packets when external
2929                         * buffer allocation is not allowed and a linear buffer
2930                         * is required. Drop this packet.
2931                         */
2932                        if (!allocerr_warned) {
2933                                VHOST_LOG_DATA(dev->ifname, ERR,
2934                                        "failed mbuf alloc of size %d from %s.\n",
2935                                        buf_len, mbuf_pool->name);
2936                                allocerr_warned = true;
2937                        }
2938                        dropped += 1;
2939                        i++;
2940                        break;
2941                }
2942
2943                err = desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i],
2944                                   mbuf_pool, legacy_ol_flags, 0, false);
2945                if (unlikely(err)) {
2946                        if (!allocerr_warned) {
2947                                VHOST_LOG_DATA(dev->ifname, ERR, "failed to copy desc to mbuf.\n");
2948                                allocerr_warned = true;
2949                        }
2950                        dropped += 1;
2951                        i++;
2952                        break;
2953                }
2954
2955        }
2956
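            /*
             * On a drop, "i" has already been advanced past the failing slot,
             * so the failing mbuf and all remaining preallocated mbufs are
             * freed together here.
             */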
2957        if (dropped)
2958                rte_pktmbuf_free_bulk(&pkts[i - 1], count - i + 1);
2959
2960        vq->last_avail_idx += i;
2961
2962        do_data_copy_dequeue(vq);
2963        if (unlikely(i < count))
2964                vq->shadow_used_idx = i;
2965        if (likely(vq->shadow_used_idx)) {
2966                flush_shadow_used_ring_split(dev, vq);
2967                vhost_vring_call_split(dev, vq);
2968        }
2969
2970        return (i - dropped);
2971}
2972
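    /*
     * The legacy/compliant wrappers below pass legacy_ol_flags as a
     * compile-time constant, so the always-inlined virtio_dev_tx_split()
     * is specialized for each offload-flag convention without a runtime
     * check on the datapath.
     */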
2973__rte_noinline
2974static uint16_t
2975virtio_dev_tx_split_legacy(struct virtio_net *dev,
2976        struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool,
2977        struct rte_mbuf **pkts, uint16_t count)
2978{
2979        return virtio_dev_tx_split(dev, vq, mbuf_pool, pkts, count, true);
2980}
2981
2982__rte_noinline
2983static uint16_t
2984virtio_dev_tx_split_compliant(struct virtio_net *dev,
2985        struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool,
2986        struct rte_mbuf **pkts, uint16_t count)
2987{
2988        return virtio_dev_tx_split(dev, vq, mbuf_pool, pkts, count, false);
2989}
2990
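    /*
     * Check whether the next PACKED_BATCH_SIZE descriptors can be dequeued
     * as one batch: the index must be batch-aligned and stay within the
     * ring, every descriptor must be available, unchained and fully mapped,
     * and each mbuf must have room for the payload that follows the
     * virtio-net header.
     */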
2991static __rte_always_inline int
2992vhost_reserve_avail_batch_packed(struct virtio_net *dev,
2993                                 struct vhost_virtqueue *vq,
2994                                 struct rte_mbuf **pkts,
2995                                 uint16_t avail_idx,
2996                                 uintptr_t *desc_addrs,
2997                                 uint16_t *ids)
2998{
2999        bool wrap = vq->avail_wrap_counter;
3000        struct vring_packed_desc *descs = vq->desc_packed;
3001        uint64_t lens[PACKED_BATCH_SIZE];
3002        uint64_t buf_lens[PACKED_BATCH_SIZE];
3003        uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
3004        uint16_t flags, i;
3005
3006        if (unlikely(avail_idx & PACKED_BATCH_MASK))
3007                return -1;
3008        if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size))
3009                return -1;
3010
3011        vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
3012                flags = descs[avail_idx + i].flags;
3013                if (unlikely((wrap != !!(flags & VRING_DESC_F_AVAIL)) ||
3014                             (wrap == !!(flags & VRING_DESC_F_USED))  ||
3015                             (flags & PACKED_DESC_SINGLE_DEQUEUE_FLAG)))
3016                        return -1;
3017        }
3018
3019        rte_atomic_thread_fence(__ATOMIC_ACQUIRE);
3020
3021        vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
3022                lens[i] = descs[avail_idx + i].len;
3023
3024        vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
3025                desc_addrs[i] = vhost_iova_to_vva(dev, vq,
3026                                                  descs[avail_idx + i].addr,
3027                                                  &lens[i], VHOST_ACCESS_RW);
3028        }
3029
3030        vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
3031                if (unlikely(!desc_addrs[i]))
3032                        return -1;
3033                if (unlikely((lens[i] != descs[avail_idx + i].len)))
3034                        return -1;
3035        }
3036
3037        vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
3038                if (virtio_dev_pktmbuf_prep(dev, pkts[i], lens[i]))
3039                        goto err;
3040        }
3041
3042        vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
3043                buf_lens[i] = pkts[i]->buf_len - pkts[i]->data_off;
3044
3045        vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
3046                if (unlikely(buf_lens[i] < (lens[i] - buf_offset)))
3047                        goto err;
3048        }
3049
3050        vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
3051                pkts[i]->pkt_len = lens[i] - buf_offset;
3052                pkts[i]->data_len = pkts[i]->pkt_len;
3053                ids[i] = descs[avail_idx + i].id;
3054        }
3055
3056        return 0;
3057
3058err:
3059        return -1;
3060}
3061
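    /*
     * Dequeue PACKED_BATCH_SIZE packets in one pass: copy the payloads,
     * apply offloads when negotiated, record the used descriptors in the
     * shadow ring and advance the available index by a full batch.
     */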
3062static __rte_always_inline int
3063virtio_dev_tx_batch_packed(struct virtio_net *dev,
3064                           struct vhost_virtqueue *vq,
3065                           struct rte_mbuf **pkts,
3066                           bool legacy_ol_flags)
3067{
3068        uint16_t avail_idx = vq->last_avail_idx;
3069        uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
3070        struct virtio_net_hdr *hdr;
3071        uintptr_t desc_addrs[PACKED_BATCH_SIZE];
3072        uint16_t ids[PACKED_BATCH_SIZE];
3073        uint16_t i;
3074
3075        if (vhost_reserve_avail_batch_packed(dev, vq, pkts, avail_idx,
3076                                             desc_addrs, ids))
3077                return -1;
3078
3079        vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
3080                rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
3081
3082        vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
3083                rte_memcpy(rte_pktmbuf_mtod_offset(pkts[i], void *, 0),
3084                           (void *)(uintptr_t)(desc_addrs[i] + buf_offset),
3085                           pkts[i]->pkt_len);
3086
3087        if (virtio_net_with_host_offload(dev)) {
3088                vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
3089                        hdr = (struct virtio_net_hdr *)(desc_addrs[i]);
3090                        vhost_dequeue_offload(dev, hdr, pkts[i], legacy_ol_flags);
3091                }
3092        }
3093
3094        if (virtio_net_is_inorder(dev))
3095                vhost_shadow_dequeue_batch_packed_inorder(vq,
3096                        ids[PACKED_BATCH_SIZE - 1]);
3097        else
3098                vhost_shadow_dequeue_batch_packed(dev, vq, ids);
3099
3100        vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE);
3101
3102        return 0;
3103}
3104
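    /*
     * Dequeue a single descriptor chain from the packed ring into one mbuf,
     * reporting the buffer id and the number of descriptors consumed.
     */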
3105static __rte_always_inline int
3106vhost_dequeue_single_packed(struct virtio_net *dev,
3107                            struct vhost_virtqueue *vq,
3108                            struct rte_mempool *mbuf_pool,
3109                            struct rte_mbuf *pkts,
3110                            uint16_t *buf_id,
3111                            uint16_t *desc_count,
3112                            bool legacy_ol_flags)
3113{
3114        struct buf_vector buf_vec[BUF_VECTOR_MAX];
3115        uint32_t buf_len;
3116        uint16_t nr_vec = 0;
3117        int err;
3118        static bool allocerr_warned;
3119
3120        if (unlikely(fill_vec_buf_packed(dev, vq,
3121                                         vq->last_avail_idx, desc_count,
3122                                         buf_vec, &nr_vec,
3123                                         buf_id, &buf_len,
3124                                         VHOST_ACCESS_RO) < 0))
3125                return -1;
3126
3127        if (unlikely(virtio_dev_pktmbuf_prep(dev, pkts, buf_len))) {
3128                if (!allocerr_warned) {
3129                        VHOST_LOG_DATA(dev->ifname, ERR,
3130                                "failed mbuf alloc of size %d from %s.\n",
3131                                buf_len, mbuf_pool->name);
3132                        allocerr_warned = true;
3133                }
3134                return -1;
3135        }
3136
3137        err = desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts,
3138                           mbuf_pool, legacy_ol_flags, 0, false);
3139        if (unlikely(err)) {
3140                if (!allocerr_warned) {
3141                        VHOST_LOG_DATA(dev->ifname, ERR, "failed to copy desc to mbuf.\n");
3142                        allocerr_warned = true;
3143                }
3144                return -1;
3145        }
3146
3147        return 0;
3148}
3149
3150static __rte_always_inline int
3151virtio_dev_tx_single_packed(struct virtio_net *dev,
3152                            struct vhost_virtqueue *vq,
3153                            struct rte_mempool *mbuf_pool,
3154                            struct rte_mbuf *pkts,
3155                            bool legacy_ol_flags)
3156{
3157
3158        uint16_t buf_id, desc_count = 0;
3159        int ret;
3160
3161        ret = vhost_dequeue_single_packed(dev, vq, mbuf_pool, pkts, &buf_id,
3162                                        &desc_count, legacy_ol_flags);
3163
3164        if (likely(desc_count > 0)) {
3165                if (virtio_net_is_inorder(dev))
3166                        vhost_shadow_dequeue_single_packed_inorder(vq, buf_id,
3167                                                                   desc_count);
3168                else
3169                        vhost_shadow_dequeue_single_packed(vq, buf_id,
3170                                        desc_count);
3171
3172                vq_inc_last_avail_packed(vq, desc_count);
3173        }
3174
3175        return ret;
3176}
3177
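    /*
     * Packed-ring dequeue loop: take the batched path whenever at least
     * PACKED_BATCH_SIZE packets are still requested and the descriptors
     * qualify, otherwise fall back to the single-descriptor path.
     * Preallocated mbufs left unused are freed before returning.
     */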
3178__rte_always_inline
3179static uint16_t
3180virtio_dev_tx_packed(struct virtio_net *dev,
3181                     struct vhost_virtqueue *__rte_restrict vq,
3182                     struct rte_mempool *mbuf_pool,
3183                     struct rte_mbuf **__rte_restrict pkts,
3184                     uint32_t count,
3185                     bool legacy_ol_flags)
3186{
3187        uint32_t pkt_idx = 0;
3188
3189        if (rte_pktmbuf_alloc_bulk(mbuf_pool, pkts, count))
3190                return 0;
3191
3192        do {
3193                rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
3194
3195                if (count - pkt_idx >= PACKED_BATCH_SIZE) {
3196                        if (!virtio_dev_tx_batch_packed(dev, vq,
3197                                                        &pkts[pkt_idx],
3198                                                        legacy_ol_flags)) {
3199                                pkt_idx += PACKED_BATCH_SIZE;
3200                                continue;
3201                        }
3202                }
3203
3204                if (virtio_dev_tx_single_packed(dev, vq, mbuf_pool,
3205                                                pkts[pkt_idx],
3206                                                legacy_ol_flags))
3207                        break;
3208                pkt_idx++;
3209        } while (pkt_idx < count);
3210
3211        if (pkt_idx != count)
3212                rte_pktmbuf_free_bulk(&pkts[pkt_idx], count - pkt_idx);
3213
3214        if (vq->shadow_used_idx) {
3215                do_data_copy_dequeue(vq);
3216
3217                vhost_flush_dequeue_shadow_packed(dev, vq);
3218                vhost_vring_call_packed(dev, vq);
3219        }
3220
3221        return pkt_idx;
3222}
3223
3224__rte_noinline
3225static uint16_t
3226virtio_dev_tx_packed_legacy(struct virtio_net *dev,
3227        struct vhost_virtqueue *__rte_restrict vq, struct rte_mempool *mbuf_pool,
3228        struct rte_mbuf **__rte_restrict pkts, uint32_t count)
3229{
3230        return virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, count, true);
3231}
3232
3233__rte_noinline
3234static uint16_t
3235virtio_dev_tx_packed_compliant(struct virtio_net *dev,
3236        struct vhost_virtqueue *__rte_restrict vq, struct rte_mempool *mbuf_pool,
3237        struct rte_mbuf **__rte_restrict pkts, uint32_t count)
3238{
3239        return virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, count, false);
3240}
3241
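    /*
     * Illustrative usage sketch (not part of this file; application-side
     * names such as VIRTIO_TXQ and forward_or_free() are assumptions):
     * a switching application typically polls the guest TX virtqueue
     * (an odd index, e.g. 1) from its datapath loop:
     *
     *   struct rte_mbuf *pkts[MAX_PKT_BURST];
     *   uint16_t nb_rx = rte_vhost_dequeue_burst(vid, VIRTIO_TXQ, mbuf_pool,
     *                                            pkts, MAX_PKT_BURST);
     *   uint16_t i;
     *
     *   for (i = 0; i < nb_rx; i++)
     *       forward_or_free(pkts[i]);
     *
     * The caller owns the returned mbufs and must free them once processed.
     */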
3242uint16_t
3243rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
3244        struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
3245{
3246        struct virtio_net *dev;
3247        struct rte_mbuf *rarp_mbuf = NULL;
3248        struct vhost_virtqueue *vq;
3249        int16_t success = 1;
3250
3251        dev = get_device(vid);
3252        if (!dev)
3253                return 0;
3254
3255        if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
3256                VHOST_LOG_DATA(dev->ifname, ERR,
3257                        "%s: built-in vhost net backend is disabled.\n",
3258                        __func__);
3259                return 0;
3260        }
3261
3262        if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->nr_vring))) {
3263                VHOST_LOG_DATA(dev->ifname, ERR,
3264                        "%s: invalid virtqueue idx %d.\n",
3265                        __func__, queue_id);
3266                return 0;
3267        }
3268
3269        vq = dev->virtqueue[queue_id];
3270
3271        if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0))
3272                return 0;
3273
3274        if (unlikely(!vq->enabled)) {
3275                count = 0;
3276                goto out_access_unlock;
3277        }
3278
3279        if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
3280                vhost_user_iotlb_rd_lock(vq);
3281
3282        if (unlikely(!vq->access_ok))
3283                if (unlikely(vring_translate(dev, vq) < 0)) {
3284                        count = 0;
3285                        goto out;
3286                }
3287
3288        /*
3289         * Construct a RARP broadcast packet and inject it into the "pkts"
3290         * array, so it looks like the guest actually sent such a packet.
3291         *
3292         * Check user_send_rarp() for more information.
3293         *
3294         * broadcast_rarp shares a cacheline in the virtio_net structure
3295         * with some fields that are accessed during enqueue, and
3296         * __atomic_compare_exchange_n writes to it when the compare and
3297         * exchange is performed. This could result in false sharing between
3298         * enqueue and dequeue.
3299         *
3300         * Prevent unnecessary false sharing by reading broadcast_rarp first
3301         * and only performing compare and exchange if the read indicates it
3302         * is likely to be set.
3303         */
3304        if (unlikely(__atomic_load_n(&dev->broadcast_rarp, __ATOMIC_ACQUIRE) &&
3305                        __atomic_compare_exchange_n(&dev->broadcast_rarp,
3306                        &success, 0, 0, __ATOMIC_RELEASE, __ATOMIC_RELAXED))) {
3307
3308                rarp_mbuf = rte_net_make_rarp_packet(mbuf_pool, &dev->mac);
3309                if (rarp_mbuf == NULL) {
3310                        VHOST_LOG_DATA(dev->ifname, ERR, "failed to make RARP packet.\n");
3311                        count = 0;
3312                        goto out;
3313                }
3314                /*
3315                 * Inject it at the head of the "pkts" array, so that the
3316                 * switch's MAC learning table gets updated first.
3317                 */
3318                pkts[0] = rarp_mbuf;
3319                vhost_queue_stats_update(dev, vq, pkts, 1);
3320                pkts++;
3321                count -= 1;
3322        }
3323
3324        if (vq_is_packed(dev)) {
3325                if (dev->flags & VIRTIO_DEV_LEGACY_OL_FLAGS)
3326                        count = virtio_dev_tx_packed_legacy(dev, vq, mbuf_pool, pkts, count);
3327                else
3328                        count = virtio_dev_tx_packed_compliant(dev, vq, mbuf_pool, pkts, count);
3329        } else {
3330                if (dev->flags & VIRTIO_DEV_LEGACY_OL_FLAGS)
3331                        count = virtio_dev_tx_split_legacy(dev, vq, mbuf_pool, pkts, count);
3332                else
3333                        count = virtio_dev_tx_split_compliant(dev, vq, mbuf_pool, pkts, count);
3334        }
3335
3336        vhost_queue_stats_update(dev, vq, pkts, count);
3337
3338out:
3339        if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
3340                vhost_user_iotlb_rd_unlock(vq);
3341
3342out_access_unlock:
3343        rte_spinlock_unlock(&vq->access_lock);
3344
3345        if (unlikely(rarp_mbuf != NULL))
3346                count += 1;
3347
3348        return count;
3349}
3350
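    /*
     * Harvest DMA copies that have already completed for in-flight async
     * dequeues: clear their completion flags, apply offloads, hand the
     * mbufs back to the caller and write the descriptors back to the
     * used ring.
     */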
3351static __rte_always_inline uint16_t
3352async_poll_dequeue_completed(struct virtio_net *dev, struct vhost_virtqueue *vq,
3353                struct rte_mbuf **pkts, uint16_t count, int16_t dma_id,
3354                uint16_t vchan_id, bool legacy_ol_flags)
3355{
3356        uint16_t start_idx, from, i;
3357        uint16_t nr_cpl_pkts = 0;
3358        struct async_inflight_info *pkts_info = vq->async->pkts_info;
3359
3360        vhost_async_dma_check_completed(dev, dma_id, vchan_id, VHOST_DMA_MAX_COPY_COMPLETE);
3361
3362        start_idx = async_get_first_inflight_pkt_idx(vq);
3363
3364        from = start_idx;
3365        while (vq->async->pkts_cmpl_flag[from] && count--) {
3366                vq->async->pkts_cmpl_flag[from] = false;
3367                from = (from + 1) % vq->size;
3368                nr_cpl_pkts++;
3369        }
3370
3371        if (nr_cpl_pkts == 0)
3372                return 0;
3373
3374        for (i = 0; i < nr_cpl_pkts; i++) {
3375                from = (start_idx + i) % vq->size;
3376                pkts[i] = pkts_info[from].mbuf;
3377
3378                if (virtio_net_with_host_offload(dev))
3379                        vhost_dequeue_offload(dev, &pkts_info[from].nethdr, pkts[i],
3380                                              legacy_ol_flags);
3381        }
3382
3383        /* write back completed descs to used ring and update used idx */
3384        if (vq_is_packed(dev)) {
3385                write_back_completed_descs_packed(vq, nr_cpl_pkts);
3386                vhost_vring_call_packed(dev, vq);
3387        } else {
3388                write_back_completed_descs_split(vq, nr_cpl_pkts);
3389                __atomic_add_fetch(&vq->used->idx, nr_cpl_pkts, __ATOMIC_RELEASE);
3390                vhost_vring_call_split(dev, vq);
3391        }
3392        vq->async->pkts_inflight_n -= nr_cpl_pkts;
3393
3394        return nr_cpl_pkts;
3395}
3396
3397static __rte_always_inline uint16_t
3398virtio_dev_tx_async_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
3399                struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count,
3400                int16_t dma_id, uint16_t vchan_id, bool legacy_ol_flags)
3401{
3402        static bool allocerr_warned;
3403        bool dropped = false;
3404        uint16_t avail_entries;
3405        uint16_t pkt_idx, slot_idx = 0;
3406        uint16_t nr_done_pkts = 0;
3407        uint16_t pkt_err = 0;
3408        uint16_t n_xfer;
3409        struct vhost_async *async = vq->async;
3410        struct async_inflight_info *pkts_info = async->pkts_info;
3411        struct rte_mbuf *pkts_prealloc[MAX_PKT_BURST];
3412        uint16_t pkts_size = count;
3413
3414        /**
3415         * The ordering between avail index and
3416         * desc reads needs to be enforced.
3417         */
3418        avail_entries = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE) -
3419                        vq->last_avail_idx;
3420        if (avail_entries == 0)
3421                goto out;
3422
3423        rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
3424
3425        async_iter_reset(async);
3426
3427        count = RTE_MIN(count, MAX_PKT_BURST);
3428        count = RTE_MIN(count, avail_entries);
3429        VHOST_LOG_DATA(dev->ifname, DEBUG, "about to dequeue %u buffers\n", count);
3430
3431        if (rte_pktmbuf_alloc_bulk(mbuf_pool, pkts_prealloc, count))
3432                goto out;
3433
3434        for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
3435                uint16_t head_idx = 0;
3436                uint16_t nr_vec = 0;
3437                uint16_t to;
3438                uint32_t buf_len;
3439                int err;
3440                struct buf_vector buf_vec[BUF_VECTOR_MAX];
3441                struct rte_mbuf *pkt = pkts_prealloc[pkt_idx];
3442
3443                if (unlikely(fill_vec_buf_split(dev, vq, vq->last_avail_idx,
3444                                                &nr_vec, buf_vec,
3445                                                &head_idx, &buf_len,
3446                                                VHOST_ACCESS_RO) < 0)) {
3447                        dropped = true;
3448                        break;
3449                }
3450
3451                err = virtio_dev_pktmbuf_prep(dev, pkt, buf_len);
3452                if (unlikely(err)) {
3453                        /**
3454                         * mbuf allocation fails for jumbo packets when external
3455                         * buffer allocation is not allowed and a linear buffer
3456                         * is required. Drop this packet.
3457                         */
3458                        if (!allocerr_warned) {
3459                                VHOST_LOG_DATA(dev->ifname, ERR,
3460                                        "%s: Failed mbuf alloc of size %d from %s\n",
3461                                        __func__, buf_len, mbuf_pool->name);
3462                                allocerr_warned = true;
3463                        }
3464                        dropped = true;
3465                        break;
3466                }
3467
3468                slot_idx = (async->pkts_idx + pkt_idx) & (vq->size - 1);
3469                err = desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkt, mbuf_pool,
3470                                        legacy_ol_flags, slot_idx, true);
3471                if (unlikely(err)) {
3472                        if (!allocerr_warned) {
3473                                VHOST_LOG_DATA(dev->ifname, ERR,
3474                                        "%s: Failed to offload copies to async channel.\n",
3475                                        __func__);
3476                                allocerr_warned = true;
3477                        }
3478                        dropped = true;
3479                        break;
3480                }
3481
3482                pkts_info[slot_idx].mbuf = pkt;
3483
3484                /* store used descs */
3485                to = async->desc_idx_split & (vq->size - 1);
3486                async->descs_split[to].id = head_idx;
3487                async->descs_split[to].len = 0;
3488                async->desc_idx_split++;
3489
3490                vq->last_avail_idx++;
3491        }
3492
3493        if (unlikely(dropped))
3494                rte_pktmbuf_free_bulk(&pkts_prealloc[pkt_idx], count - pkt_idx);
3495
3496        n_xfer = vhost_async_dma_transfer(dev, vq, dma_id, vchan_id, async->pkts_idx,
3497                                          async->iov_iter, pkt_idx);
3498
3499        async->pkts_inflight_n += n_xfer;
3500
3501        pkt_err = pkt_idx - n_xfer;
3502        if (unlikely(pkt_err)) {
3503                VHOST_LOG_DATA(dev->ifname, DEBUG, "%s: failed to transfer data.\n",
3504                        __func__);
3505
3506                pkt_idx = n_xfer;
3507                /* recover available ring */
3508                vq->last_avail_idx -= pkt_err;
3509
3510                /**
3511                 * recover async channel copy related structures and free pktmbufs
3512                 * for error pkts.
3513                 */
3514                async->desc_idx_split -= pkt_err;
3515                while (pkt_err-- > 0) {
3516                        rte_pktmbuf_free(pkts_info[slot_idx & (vq->size - 1)].mbuf);
3517                        slot_idx--;
3518                }
3519        }
3520
3521        async->pkts_idx += pkt_idx;
3522        if (async->pkts_idx >= vq->size)
3523                async->pkts_idx -= vq->size;
3524
3525out:
3526        /* The DMA device may serve other queues, so always poll for completed copies. */
3527        nr_done_pkts = async_poll_dequeue_completed(dev, vq, pkts, pkts_size,
3528                                                        dma_id, vchan_id, legacy_ol_flags);
3529
3530        return nr_done_pkts;
3531}
3532
3533__rte_noinline
3534static uint16_t
3535virtio_dev_tx_async_split_legacy(struct virtio_net *dev,
3536                struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool,
3537                struct rte_mbuf **pkts, uint16_t count,
3538                int16_t dma_id, uint16_t vchan_id)
3539{
3540        return virtio_dev_tx_async_split(dev, vq, mbuf_pool,
3541                                pkts, count, dma_id, vchan_id, true);
3542}
3543
3544__rte_noinline
3545static uint16_t
3546virtio_dev_tx_async_split_compliant(struct virtio_net *dev,
3547                struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool,
3548                struct rte_mbuf **pkts, uint16_t count,
3549                int16_t dma_id, uint16_t vchan_id)
3550{
3551        return virtio_dev_tx_async_split(dev, vq, mbuf_pool,
3552                                pkts, count, dma_id, vchan_id, false);
3553}
3554
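    /*
     * Queue one dequeued buffer id in the async shadow ring for the packed
     * layout; the entry is written back to the used ring once its DMA copy
     * completes.
     */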
3555static __rte_always_inline void
3556vhost_async_shadow_dequeue_single_packed(struct vhost_virtqueue *vq, uint16_t buf_id)
3557{
3558        struct vhost_async *async = vq->async;
3559        uint16_t idx = async->buffer_idx_packed;
3560
3561        async->buffers_packed[idx].id = buf_id;
3562        async->buffers_packed[idx].len = 0;
3563        async->buffers_packed[idx].count = 1;
3564
3565        async->buffer_idx_packed++;
3566        if (async->buffer_idx_packed >= vq->size)
3567                async->buffer_idx_packed -= vq->size;
3568
3569}
3570
3571static __rte_always_inline int
3572virtio_dev_tx_async_single_packed(struct virtio_net *dev,
3573                        struct vhost_virtqueue *vq,
3574                        struct rte_mempool *mbuf_pool,
3575                        struct rte_mbuf *pkts,
3576                        uint16_t slot_idx,
3577                        bool legacy_ol_flags)
3578{
3579        int err;
3580        uint16_t buf_id, desc_count = 0;
3581        uint16_t nr_vec = 0;
3582        uint32_t buf_len;
3583        struct buf_vector buf_vec[BUF_VECTOR_MAX];
3584        static bool allocerr_warned;
3585
3586        if (unlikely(fill_vec_buf_packed(dev, vq, vq->last_avail_idx, &desc_count,
3587                                         buf_vec, &nr_vec, &buf_id, &buf_len,
3588                                         VHOST_ACCESS_RO) < 0))
3589                return -1;
3590
3591        if (unlikely(virtio_dev_pktmbuf_prep(dev, pkts, buf_len))) {
3592                if (!allocerr_warned) {
3593                        VHOST_LOG_DATA(dev->ifname, ERR, "Failed mbuf alloc of size %d from %s.\n",
3594                                buf_len, mbuf_pool->name);
3595
3596                        allocerr_warned = true;
3597                }
3598                return -1;
3599        }
3600
3601        err = desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts, mbuf_pool,
3602                legacy_ol_flags, slot_idx, true);
3603        if (unlikely(err)) {
3604                rte_pktmbuf_free(pkts);
3605                if (!allocerr_warned) {
3606                        VHOST_LOG_DATA(dev->ifname, ERR, "Failed to copy desc to mbuf.\n");
3607                        allocerr_warned = true;
3608                }
3609                return -1;
3610        }
3611
3612        /* update async shadow packed ring */
3613        vhost_async_shadow_dequeue_single_packed(vq, buf_id);
3614
3615        return err;
3616}
3617
3618static __rte_always_inline uint16_t
3619virtio_dev_tx_async_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
3620                struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts,
3621                uint16_t count, uint16_t dma_id, uint16_t vchan_id, bool legacy_ol_flags)
3622{
3623        uint16_t pkt_idx;
3624        uint16_t slot_idx = 0;
3625        uint16_t nr_done_pkts = 0;
3626        uint16_t pkt_err = 0;
3627        uint32_t n_xfer;
3628        struct vhost_async *async = vq->async;
3629        struct async_inflight_info *pkts_info = async->pkts_info;
3630        struct rte_mbuf *pkts_prealloc[MAX_PKT_BURST];
3631
3632        VHOST_LOG_DATA(dev->ifname, DEBUG, "(%d) about to dequeue %u buffers\n", dev->vid, count);
3633
3634        async_iter_reset(async);
3635
3636        if (rte_pktmbuf_alloc_bulk(mbuf_pool, pkts_prealloc, count))
3637                goto out;
3638
3639        for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
3640                struct rte_mbuf *pkt = pkts_prealloc[pkt_idx];
3641
3642                rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
3643
3644                slot_idx = (async->pkts_idx + pkt_idx) % vq->size;
3645                if (unlikely(virtio_dev_tx_async_single_packed(dev, vq, mbuf_pool, pkt,
3646                                slot_idx, legacy_ol_flags))) {
3647                        rte_pktmbuf_free_bulk(&pkts_prealloc[pkt_idx], count - pkt_idx);
3648                        break;
3649                }
3650
3651                pkts_info[slot_idx].mbuf = pkt;
3652
3653                vq_inc_last_avail_packed(vq, 1);
3654
3655        }
3656
3657        n_xfer = vhost_async_dma_transfer(dev, vq, dma_id, vchan_id, async->pkts_idx,
3658                                        async->iov_iter, pkt_idx);
3659
3660        async->pkts_inflight_n += n_xfer;
3661
3662        pkt_err = pkt_idx - n_xfer;
3663
3664        if (unlikely(pkt_err)) {
3665                pkt_idx -= pkt_err;
3666
3667                /**
3668                 * recover DMA-copy related structures and free pktmbuf for DMA-error pkts.
3669                 */
3670                if (async->buffer_idx_packed >= pkt_err)
3671                        async->buffer_idx_packed -= pkt_err;
3672                else
3673                        async->buffer_idx_packed += vq->size - pkt_err;
3674
3675                while (pkt_err-- > 0) {
3676                        rte_pktmbuf_free(pkts_info[slot_idx % vq->size].mbuf);
3677                        slot_idx--;
3678                }
3679
3680                /* recover available ring */
3681                if (vq->last_avail_idx >= pkt_err) {
3682                        vq->last_avail_idx -= pkt_err;
3683                } else {
3684                        vq->last_avail_idx += vq->size - pkt_err;
3685                        vq->avail_wrap_counter ^= 1;
3686                }
3687        }
3688
3689        async->pkts_idx += pkt_idx;
3690        if (async->pkts_idx >= vq->size)
3691                async->pkts_idx -= vq->size;
3692
3693out:
3694        nr_done_pkts = async_poll_dequeue_completed(dev, vq, pkts, count,
3695                                        dma_id, vchan_id, legacy_ol_flags);
3696
3697        return nr_done_pkts;
3698}
3699
3700__rte_noinline
3701static uint16_t
3702virtio_dev_tx_async_packed_legacy(struct virtio_net *dev, struct vhost_virtqueue *vq,
3703                struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts,
3704                uint16_t count, uint16_t dma_id, uint16_t vchan_id)
3705{
3706        return virtio_dev_tx_async_packed(dev, vq, mbuf_pool,
3707                                pkts, count, dma_id, vchan_id, true);
3708}
3709
3710__rte_noinline
3711static uint16_t
3712virtio_dev_tx_async_packed_compliant(struct virtio_net *dev, struct vhost_virtqueue *vq,
3713                struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts,
3714                uint16_t count, uint16_t dma_id, uint16_t vchan_id)
3715{
3716        return virtio_dev_tx_async_packed(dev, vq, mbuf_pool,
3717                                pkts, count, dma_id, vchan_id, false);
3718}
3719
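    /*
     * Illustrative usage sketch (not part of this file; VIRTIO_TXQ and
     * dma_id are application-side assumptions): the async dequeue path
     * expects a DMA vchan configured with rte_vhost_async_dma_configure()
     * and the virtqueue registered with rte_vhost_async_channel_register()
     * beforehand, e.g.:
     *
     *   int nr_inflight;
     *   struct rte_mbuf *pkts[MAX_PKT_BURST];
     *   uint16_t n = rte_vhost_async_try_dequeue_burst(vid, VIRTIO_TXQ,
     *                   mbuf_pool, pkts, MAX_PKT_BURST, &nr_inflight,
     *                   dma_id, 0);
     *
     * Because completions are polled here as well, "n" may include packets
     * whose DMA copies were submitted by earlier bursts.
     */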
3720uint16_t
3721rte_vhost_async_try_dequeue_burst(int vid, uint16_t queue_id,
3722        struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count,
3723        int *nr_inflight, int16_t dma_id, uint16_t vchan_id)
3724{
3725        struct virtio_net *dev;
3726        struct rte_mbuf *rarp_mbuf = NULL;
3727        struct vhost_virtqueue *vq;
3728        int16_t success = 1;
3729
3730        dev = get_device(vid);
3731        if (!dev || !nr_inflight)
3732                return 0;
3733
3734        *nr_inflight = -1;
3735
3736        if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
3737                VHOST_LOG_DATA(dev->ifname, ERR, "%s: built-in vhost net backend is disabled.\n",
3738                        __func__);
3739                return 0;
3740        }
3741
3742        if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->nr_vring))) {
3743                VHOST_LOG_DATA(dev->ifname, ERR, "%s: invalid virtqueue idx %d.\n",
3744                        __func__, queue_id);
3745                return 0;
3746        }
3747
3748        if (unlikely(dma_id < 0 || dma_id >= RTE_DMADEV_DEFAULT_MAX)) {
3749                VHOST_LOG_DATA(dev->ifname, ERR, "%s: invalid dma id %d.\n",
3750                        __func__, dma_id);
3751                return 0;
3752        }
3753
3754        if (unlikely(!dma_copy_track[dma_id].vchans ||
3755                                !dma_copy_track[dma_id].vchans[vchan_id].pkts_cmpl_flag_addr)) {
3756                VHOST_LOG_DATA(dev->ifname, ERR, "%s: invalid channel %d:%u.\n",
3757                        __func__, dma_id, vchan_id);
3758                return 0;
3759        }
3760
3761        vq = dev->virtqueue[queue_id];
3762
3763        if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0))
3764                return 0;
3765
3766        if (unlikely(vq->enabled == 0)) {
3767                count = 0;
3768                goto out_access_unlock;
3769        }
3770
3771        if (unlikely(!vq->async)) {
3772                VHOST_LOG_DATA(dev->ifname, ERR, "%s: async not registered for queue id %d.\n",
3773                        __func__, queue_id);
3774                count = 0;
3775                goto out_access_unlock;
3776        }
3777
3778        if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
3779                vhost_user_iotlb_rd_lock(vq);
3780
3781        if (unlikely(vq->access_ok == 0))
3782                if (unlikely(vring_translate(dev, vq) < 0)) {
3783                        count = 0;
3784                        goto out;
3785                }
3786
3787        /*
3788         * Construct a RARP broadcast packet, and inject it to the "pkts"
3789         * Construct a RARP broadcast packet and inject it into the "pkts"
3790         * array, so it looks like the guest actually sent such a packet.
3791         * Check user_send_rarp() for more information.
3792         *
3793         * broadcast_rarp shares a cacheline in the virtio_net structure
3794         * with some fields that are accessed during enqueue, and
3795         * __atomic_compare_exchange_n writes to it when the compare and
3796         * exchange is performed. This could result in false sharing between
3797         * enqueue and dequeue.
3798         *
3799         * Prevent unnecessary false sharing by reading broadcast_rarp first
3800         * and only performing compare and exchange if the read indicates it
3801         * is likely to be set.
3802         */
3803        if (unlikely(__atomic_load_n(&dev->broadcast_rarp, __ATOMIC_ACQUIRE) &&
3804                        __atomic_compare_exchange_n(&dev->broadcast_rarp,
3805                        &success, 0, 0, __ATOMIC_RELEASE, __ATOMIC_RELAXED))) {
3806
3807                rarp_mbuf = rte_net_make_rarp_packet(mbuf_pool, &dev->mac);
3808                if (rarp_mbuf == NULL) {
3809                        VHOST_LOG_DATA(dev->ifname, ERR, "failed to make RARP packet.\n");
3810                        count = 0;
3811                        goto out;
3812                }
3813                /*
3814                 * Inject it at the head of the "pkts" array, so that the
3815                 * switch's MAC learning table gets updated first.
3816                 */
3817                pkts[0] = rarp_mbuf;
3818                vhost_queue_stats_update(dev, vq, pkts, 1);
3819                pkts++;
3820                count -= 1;
3821        }
3822
3823        if (vq_is_packed(dev)) {
3824                if (dev->flags & VIRTIO_DEV_LEGACY_OL_FLAGS)
3825                        count = virtio_dev_tx_async_packed_legacy(dev, vq, mbuf_pool,
3826                                        pkts, count, dma_id, vchan_id);
3827                else
3828                        count = virtio_dev_tx_async_packed_compliant(dev, vq, mbuf_pool,
3829                                        pkts, count, dma_id, vchan_id);
3830        } else {
3831                if (dev->flags & VIRTIO_DEV_LEGACY_OL_FLAGS)
3832                        count = virtio_dev_tx_async_split_legacy(dev, vq, mbuf_pool,
3833                                        pkts, count, dma_id, vchan_id);
3834                else
3835                        count = virtio_dev_tx_async_split_compliant(dev, vq, mbuf_pool,
3836                                        pkts, count, dma_id, vchan_id);
3837        }
3838
3839        *nr_inflight = vq->async->pkts_inflight_n;
3840        vhost_queue_stats_update(dev, vq, pkts, count);
3841
3842out:
3843        if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
3844                vhost_user_iotlb_rd_unlock(vq);
3845
3846out_access_unlock:
3847        rte_spinlock_unlock(&vq->access_lock);
3848
3849        if (unlikely(rarp_mbuf != NULL))
3850                count += 1;
3851
3852        return count;
3853}
3854