qemu/subprojects/libvduse/libvduse.c
<<
>>
Prefs
   1/*
   2 * VDUSE (vDPA Device in Userspace) library
   3 *
   4 * Copyright (C) 2022 Bytedance Inc. and/or its affiliates. All rights reserved.
   5 *   Portions of codes and concepts borrowed from libvhost-user.c, so:
   6 *     Copyright IBM, Corp. 2007
   7 *     Copyright (c) 2016 Red Hat, Inc.
   8 *
   9 * Author:
  10 *   Xie Yongji <xieyongji@bytedance.com>
  11 *   Anthony Liguori <aliguori@us.ibm.com>
  12 *   Marc-André Lureau <mlureau@redhat.com>
  13 *   Victor Kaplansky <victork@redhat.com>
  14 *
  15 * This work is licensed under the terms of the GNU GPL, version 2 or
  16 * later.  See the COPYING file in the top-level directory.
  17 */
  18
  19#ifndef _GNU_SOURCE
  20#define _GNU_SOURCE
  21#endif
  22
  23#include <stdlib.h>
  24#include <stdio.h>
  25#include <stdbool.h>
  26#include <stddef.h>
  27#include <errno.h>
  28#include <string.h>
  29#include <assert.h>
  30#include <endian.h>
  31#include <unistd.h>
  32#include <limits.h>
  33#include <fcntl.h>
  34#include <inttypes.h>
  35
  36#include <sys/ioctl.h>
  37#include <sys/eventfd.h>
  38#include <sys/mman.h>
  39
  40#include "include/atomic.h"
  41#include "linux-headers/linux/virtio_ring.h"
  42#include "linux-headers/linux/virtio_config.h"
  43#include "linux-headers/linux/vduse.h"
  44#include "libvduse.h"
  45
/* NOTE(review): not referenced in the visible part of this file. */
#define VDUSE_VQ_ALIGN 4096
/* Number of slots in VduseDev.regions, the table of mmap'ed IOVA regions. */
#define MAX_IOVA_REGIONS 256

/* vduse_vq_log_size() rounds each per-queue log up to a multiple of this. */
#define LOG_ALIGNMENT 64

/* Round number down to multiple */
#define ALIGN_DOWN(n, m) ((n) / (m) * (m))

/* Round number up to multiple */
#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))

/* Branch-prediction hint; only defined if no earlier header provided one. */
#ifndef unlikely
#define unlikely(x)   __builtin_expect(!!(x), 0)
#endif
  60
/*
 * Per-descriptor record in the inflight log. The record lives in the
 * mmap'ed log file, so the field order and sizes are part of the on-disk
 * layout.
 */
typedef struct VduseDescStateSplit {
    /* Set to 1 by vduse_queue_inflight_get(), cleared once the push is done */
    uint8_t inflight;
    uint8_t padding[5];
    /* NOTE(review): not used in the visible part of this file */
    uint16_t next;
    /* Value of VduseVirtq.counter when the descriptor was popped */
    uint64_t counter;
} VduseDescStateSplit;
  67
/*
 * Header of one virtqueue's inflight log, followed by one
 * VduseDescStateSplit per descriptor.
 */
typedef struct VduseVirtqLogInflight {
    /* NOTE(review): features and version are not used in the visible code */
    uint64_t features;
    uint16_t version;
    /* Number of entries in desc[]; set to VIRTQUEUE_MAX_SIZE when the log is attached */
    uint16_t desc_num;
    /* Head index recorded by vduse_queue_inflight_pre_put() before the used index is published */
    uint16_t last_batch_head;
    /* Copy of the queue's used_idx, stored after a push has completed */
    uint16_t used_idx;
    VduseDescStateSplit desc[];
} VduseVirtqLogInflight;
  76
/* Per-virtqueue log; currently it only wraps the inflight descriptor log. */
typedef struct VduseVirtqLog {
    VduseVirtqLogInflight inflight;
} VduseVirtqLog;
  80
/*
 * Entry of VduseVirtq.resubmit_list: a descriptor that was still marked
 * inflight in the log when the queue was enabled.
 */
typedef struct VduseVirtqInflightDesc {
    /* Position of the descriptor in the log's desc[] array */
    uint16_t index;
    /* Logged counter value; used as the sort key for the resubmit list */
    uint64_t counter;
} VduseVirtqInflightDesc;
  85
/* Split-ring description of one virtqueue. */
typedef struct VduseRing {
    /* Ring size reported by VDUSE_VQ_GET_INFO */
    unsigned int num;
    /* IOVAs of the three ring areas, as reported by VDUSE_VQ_GET_INFO */
    uint64_t desc_addr;
    uint64_t avail_addr;
    uint64_t used_addr;
    /* Local mappings of the areas above, resolved through iova_to_va() */
    struct vring_desc *desc;
    struct vring_avail *avail;
    struct vring_used *used;
} VduseRing;
  95
/* Runtime state of one virtqueue of a VDUSE device. */
struct VduseVirtq {
    VduseRing vring;
    /* Next avail ring position to consume */
    uint16_t last_avail_idx;
    /* Last value read from the avail ring's idx field */
    uint16_t shadow_avail_idx;
    /* Local copy of the used ring's idx field */
    uint16_t used_idx;
    /* used_idx at the time of the last notification decision */
    uint16_t signalled_used;
    /* False when signalled_used cannot be trusted for event-idx suppression */
    bool signalled_used_valid;
    /* Position of this queue in VduseDev.vqs */
    int index;
    /* Number of elements popped but not yet pushed back */
    unsigned int inuse;
    /* True once vduse_queue_enable() has set the queue up */
    bool ready;
    /* Kick eventfd, or -1 when none is assigned */
    int fd;
    VduseDev *dev;
    /* Descriptors found inflight in the log, handed out again by vduse_queue_pop() */
    VduseVirtqInflightDesc *resubmit_list;
    uint16_t resubmit_num;
    /* Sequence number stamped on each popped descriptor in the log */
    uint64_t counter;
    /* This queue's slice of the device-wide log mapping */
    VduseVirtqLog *log;
};
 113
/* One mmap'ed IOVA range. A slot with mmap_addr == 0 is treated as free. */
typedef struct VduseIovaRegion {
    /* First IOVA covered by the region */
    uint64_t iova;
    /* Length of the region in bytes */
    uint64_t size;
    /* Offset of the region's data inside the mapping */
    uint64_t mmap_offset;
    /* Address returned by mmap(), stored as an integer */
    uint64_t mmap_addr;
} VduseIovaRegion;
 120
/* State of one VDUSE device instance. */
struct VduseDev {
    /* Array of num_queues virtqueues */
    VduseVirtq *vqs;
    /* Cache of mmap'ed IOVA regions, filled on demand by iova_to_va() */
    VduseIovaRegion regions[MAX_IOVA_REGIONS];
    /* Number of occupied slots in regions[] */
    int num_regions;
    char *name;
    uint32_t device_id;
    uint32_t vendor_id;
    uint16_t num_queues;
    uint16_t queue_size;
    /* Features read with VDUSE_DEV_GET_FEATURES; reset to 0 when the dataplane stops */
    uint64_t features;
    /* Callbacks invoked when a queue is enabled or disabled */
    const VduseOps *ops;
    /* File descriptor used for the request channel and the VDUSE ioctls in this file */
    int fd;
    /* NOTE(review): ctrl_fd is not used in the visible part of this file */
    int ctrl_fd;
    /* Opaque pointer returned by vduse_dev_get_priv() */
    void *priv;
    /* Mapping of the reconnect log file, or NULL when none is attached */
    void *log;
};
 137
 138static inline size_t vduse_vq_log_size(uint16_t queue_size)
 139{
 140    return ALIGN_UP(sizeof(VduseDescStateSplit) * queue_size +
 141                    sizeof(VduseVirtqLogInflight), LOG_ALIGNMENT);
 142}
 143
 144static void *vduse_log_get(const char *filename, size_t size)
 145{
 146    void *ptr = MAP_FAILED;
 147    int fd;
 148
 149    fd = open(filename, O_RDWR | O_CREAT, 0600);
 150    if (fd == -1) {
 151        return MAP_FAILED;
 152    }
 153
 154    if (ftruncate(fd, size) == -1) {
 155        goto out;
 156    }
 157
 158    ptr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 159
 160out:
 161    close(fd);
 162    return ptr;
 163}
 164
 165static inline bool has_feature(uint64_t features, unsigned int fbit)
 166{
 167    assert(fbit < 64);
 168    return !!(features & (1ULL << fbit));
 169}
 170
 171static inline bool vduse_dev_has_feature(VduseDev *dev, unsigned int fbit)
 172{
 173    return has_feature(dev->features, fbit);
 174}
 175
 176uint64_t vduse_get_virtio_features(void)
 177{
 178    return (1ULL << VIRTIO_F_IOMMU_PLATFORM) |
 179           (1ULL << VIRTIO_F_VERSION_1) |
 180           (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) |
 181           (1ULL << VIRTIO_RING_F_EVENT_IDX) |
 182           (1ULL << VIRTIO_RING_F_INDIRECT_DESC);
 183}
 184
 185VduseDev *vduse_queue_get_dev(VduseVirtq *vq)
 186{
 187    return vq->dev;
 188}
 189
 190int vduse_queue_get_fd(VduseVirtq *vq)
 191{
 192    return vq->fd;
 193}
 194
 195void *vduse_dev_get_priv(VduseDev *dev)
 196{
 197    return dev->priv;
 198}
 199
 200VduseVirtq *vduse_dev_get_queue(VduseDev *dev, int index)
 201{
 202    return &dev->vqs[index];
 203}
 204
 205int vduse_dev_get_fd(VduseDev *dev)
 206{
 207    return dev->fd;
 208}
 209
 210static int vduse_inject_irq(VduseDev *dev, int index)
 211{
 212    return ioctl(dev->fd, VDUSE_VQ_INJECT_IRQ, &index);
 213}
 214
 215static int inflight_desc_compare(const void *a, const void *b)
 216{
 217    VduseVirtqInflightDesc *desc0 = (VduseVirtqInflightDesc *)a,
 218                           *desc1 = (VduseVirtqInflightDesc *)b;
 219
 220    if (desc1->counter > desc0->counter &&
 221        (desc1->counter - desc0->counter) < VIRTQUEUE_MAX_SIZE * 2) {
 222        return 1;
 223    }
 224
 225    return -1;
 226}
 227
 228static int vduse_queue_check_inflights(VduseVirtq *vq)
 229{
 230    int i = 0;
 231    VduseDev *dev = vq->dev;
 232
 233    vq->used_idx = le16toh(vq->vring.used->idx);
 234    vq->resubmit_num = 0;
 235    vq->resubmit_list = NULL;
 236    vq->counter = 0;
 237
 238    if (unlikely(vq->log->inflight.used_idx != vq->used_idx)) {
 239        if (vq->log->inflight.last_batch_head > VIRTQUEUE_MAX_SIZE) {
 240            return -1;
 241        }
 242
 243        vq->log->inflight.desc[vq->log->inflight.last_batch_head].inflight = 0;
 244
 245        barrier();
 246
 247        vq->log->inflight.used_idx = vq->used_idx;
 248    }
 249
 250    for (i = 0; i < vq->log->inflight.desc_num; i++) {
 251        if (vq->log->inflight.desc[i].inflight == 1) {
 252            vq->inuse++;
 253        }
 254    }
 255
 256    vq->shadow_avail_idx = vq->last_avail_idx = vq->inuse + vq->used_idx;
 257
 258    if (vq->inuse) {
 259        vq->resubmit_list = calloc(vq->inuse, sizeof(VduseVirtqInflightDesc));
 260        if (!vq->resubmit_list) {
 261            return -1;
 262        }
 263
 264        for (i = 0; i < vq->log->inflight.desc_num; i++) {
 265            if (vq->log->inflight.desc[i].inflight) {
 266                vq->resubmit_list[vq->resubmit_num].index = i;
 267                vq->resubmit_list[vq->resubmit_num].counter =
 268                                        vq->log->inflight.desc[i].counter;
 269                vq->resubmit_num++;
 270            }
 271        }
 272
 273        if (vq->resubmit_num > 1) {
 274            qsort(vq->resubmit_list, vq->resubmit_num,
 275                  sizeof(VduseVirtqInflightDesc), inflight_desc_compare);
 276        }
 277        vq->counter = vq->resubmit_list[0].counter + 1;
 278    }
 279
 280    vduse_inject_irq(dev, vq->index);
 281
 282    return 0;
 283}
 284
 285static int vduse_queue_inflight_get(VduseVirtq *vq, int desc_idx)
 286{
 287    vq->log->inflight.desc[desc_idx].counter = vq->counter++;
 288
 289    barrier();
 290
 291    vq->log->inflight.desc[desc_idx].inflight = 1;
 292
 293    return 0;
 294}
 295
 296static int vduse_queue_inflight_pre_put(VduseVirtq *vq, int desc_idx)
 297{
 298    vq->log->inflight.last_batch_head = desc_idx;
 299
 300    return 0;
 301}
 302
 303static int vduse_queue_inflight_post_put(VduseVirtq *vq, int desc_idx)
 304{
 305    vq->log->inflight.desc[desc_idx].inflight = 0;
 306
 307    barrier();
 308
 309    vq->log->inflight.used_idx = vq->used_idx;
 310
 311    return 0;
 312}
 313
 314static void vduse_iova_remove_region(VduseDev *dev, uint64_t start,
 315                                     uint64_t last)
 316{
 317    int i;
 318
 319    if (last == start) {
 320        return;
 321    }
 322
 323    for (i = 0; i < MAX_IOVA_REGIONS; i++) {
 324        if (!dev->regions[i].mmap_addr) {
 325            continue;
 326        }
 327
 328        if (start <= dev->regions[i].iova &&
 329            last >= (dev->regions[i].iova + dev->regions[i].size - 1)) {
 330            munmap((void *)(uintptr_t)dev->regions[i].mmap_addr,
 331                   dev->regions[i].mmap_offset + dev->regions[i].size);
 332            dev->regions[i].mmap_addr = 0;
 333            dev->num_regions--;
 334        }
 335    }
 336}
 337
 338static int vduse_iova_add_region(VduseDev *dev, int fd,
 339                                 uint64_t offset, uint64_t start,
 340                                 uint64_t last, int prot)
 341{
 342    int i;
 343    uint64_t size = last - start + 1;
 344    void *mmap_addr = mmap(0, size + offset, prot, MAP_SHARED, fd, 0);
 345
 346    if (mmap_addr == MAP_FAILED) {
 347        close(fd);
 348        return -EINVAL;
 349    }
 350
 351    for (i = 0; i < MAX_IOVA_REGIONS; i++) {
 352        if (!dev->regions[i].mmap_addr) {
 353            dev->regions[i].mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
 354            dev->regions[i].mmap_offset = offset;
 355            dev->regions[i].iova = start;
 356            dev->regions[i].size = size;
 357            dev->num_regions++;
 358            break;
 359        }
 360    }
 361    assert(i < MAX_IOVA_REGIONS);
 362    close(fd);
 363
 364    return 0;
 365}
 366
 367static int perm_to_prot(uint8_t perm)
 368{
 369    int prot = 0;
 370
 371    switch (perm) {
 372    case VDUSE_ACCESS_WO:
 373        prot |= PROT_WRITE;
 374        break;
 375    case VDUSE_ACCESS_RO:
 376        prot |= PROT_READ;
 377        break;
 378    case VDUSE_ACCESS_RW:
 379        prot |= PROT_READ | PROT_WRITE;
 380        break;
 381    default:
 382        break;
 383    }
 384
 385    return prot;
 386}
 387
 388static inline void *iova_to_va(VduseDev *dev, uint64_t *plen, uint64_t iova)
 389{
 390    int i, ret;
 391    struct vduse_iotlb_entry entry;
 392
 393    for (i = 0; i < MAX_IOVA_REGIONS; i++) {
 394        VduseIovaRegion *r = &dev->regions[i];
 395
 396        if (!r->mmap_addr) {
 397            continue;
 398        }
 399
 400        if ((iova >= r->iova) && (iova < (r->iova + r->size))) {
 401            if ((iova + *plen) > (r->iova + r->size)) {
 402                *plen = r->iova + r->size - iova;
 403            }
 404            return (void *)(uintptr_t)(iova - r->iova +
 405                   r->mmap_addr + r->mmap_offset);
 406        }
 407    }
 408
 409    entry.start = iova;
 410    entry.last = iova + 1;
 411    ret = ioctl(dev->fd, VDUSE_IOTLB_GET_FD, &entry);
 412    if (ret < 0) {
 413        return NULL;
 414    }
 415
 416    if (!vduse_iova_add_region(dev, ret, entry.offset, entry.start,
 417                               entry.last, perm_to_prot(entry.perm))) {
 418        return iova_to_va(dev, plen, iova);
 419    }
 420
 421    return NULL;
 422}
 423
 424static inline uint16_t vring_avail_flags(VduseVirtq *vq)
 425{
 426    return le16toh(vq->vring.avail->flags);
 427}
 428
 429static inline uint16_t vring_avail_idx(VduseVirtq *vq)
 430{
 431    vq->shadow_avail_idx = le16toh(vq->vring.avail->idx);
 432
 433    return vq->shadow_avail_idx;
 434}
 435
 436static inline uint16_t vring_avail_ring(VduseVirtq *vq, int i)
 437{
 438    return le16toh(vq->vring.avail->ring[i]);
 439}
 440
 441static inline uint16_t vring_get_used_event(VduseVirtq *vq)
 442{
 443    return vring_avail_ring(vq, vq->vring.num);
 444}
 445
 446static bool vduse_queue_get_head(VduseVirtq *vq, unsigned int idx,
 447                                 unsigned int *head)
 448{
 449    /*
 450     * Grab the next descriptor number they're advertising, and increment
 451     * the index we've seen.
 452     */
 453    *head = vring_avail_ring(vq, idx % vq->vring.num);
 454
 455    /* If their number is silly, that's a fatal mistake. */
 456    if (*head >= vq->vring.num) {
 457        fprintf(stderr, "Guest says index %u is available\n", *head);
 458        return false;
 459    }
 460
 461    return true;
 462}
 463
 464static int
 465vduse_queue_read_indirect_desc(VduseDev *dev, struct vring_desc *desc,
 466                               uint64_t addr, size_t len)
 467{
 468    struct vring_desc *ori_desc;
 469    uint64_t read_len;
 470
 471    if (len > (VIRTQUEUE_MAX_SIZE * sizeof(struct vring_desc))) {
 472        return -1;
 473    }
 474
 475    if (len == 0) {
 476        return -1;
 477    }
 478
 479    while (len) {
 480        read_len = len;
 481        ori_desc = iova_to_va(dev, &read_len, addr);
 482        if (!ori_desc) {
 483            return -1;
 484        }
 485
 486        memcpy(desc, ori_desc, read_len);
 487        len -= read_len;
 488        addr += read_len;
 489        desc += read_len;
 490    }
 491
 492    return 0;
 493}
 494
/* Result codes returned by vduse_queue_read_next_desc(). */
enum {
    VIRTQUEUE_READ_DESC_ERROR = -1,
    VIRTQUEUE_READ_DESC_DONE = 0,   /* end of chain */
    VIRTQUEUE_READ_DESC_MORE = 1,   /* more buffers in chain */
};
 500
 501static int vduse_queue_read_next_desc(struct vring_desc *desc, int i,
 502                                      unsigned int max, unsigned int *next)
 503{
 504    /* If this descriptor says it doesn't chain, we're done. */
 505    if (!(le16toh(desc[i].flags) & VRING_DESC_F_NEXT)) {
 506        return VIRTQUEUE_READ_DESC_DONE;
 507    }
 508
 509    /* Check they're not leading us off end of descriptors. */
 510    *next = desc[i].next;
 511    /* Make sure compiler knows to grab that: we don't want it changing! */
 512    smp_wmb();
 513
 514    if (*next >= max) {
 515        fprintf(stderr, "Desc next is %u\n", *next);
 516        return VIRTQUEUE_READ_DESC_ERROR;
 517    }
 518
 519    return VIRTQUEUE_READ_DESC_MORE;
 520}
 521
 522/*
 523 * Fetch avail_idx from VQ memory only when we really need to know if
 524 * guest has added some buffers.
 525 */
 526static bool vduse_queue_empty(VduseVirtq *vq)
 527{
 528    if (unlikely(!vq->vring.avail)) {
 529        return true;
 530    }
 531
 532    if (vq->shadow_avail_idx != vq->last_avail_idx) {
 533        return false;
 534    }
 535
 536    return vring_avail_idx(vq) == vq->last_avail_idx;
 537}
 538
 539static bool vduse_queue_should_notify(VduseVirtq *vq)
 540{
 541    VduseDev *dev = vq->dev;
 542    uint16_t old, new;
 543    bool v;
 544
 545    /* We need to expose used array entries before checking used event. */
 546    smp_mb();
 547
 548    /* Always notify when queue is empty (when feature acknowledge) */
 549    if (vduse_dev_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) &&
 550        !vq->inuse && vduse_queue_empty(vq)) {
 551        return true;
 552    }
 553
 554    if (!vduse_dev_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
 555        return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT);
 556    }
 557
 558    v = vq->signalled_used_valid;
 559    vq->signalled_used_valid = true;
 560    old = vq->signalled_used;
 561    new = vq->signalled_used = vq->used_idx;
 562    return !v || vring_need_event(vring_get_used_event(vq), new, old);
 563}
 564
 565void vduse_queue_notify(VduseVirtq *vq)
 566{
 567    VduseDev *dev = vq->dev;
 568
 569    if (unlikely(!vq->vring.avail)) {
 570        return;
 571    }
 572
 573    if (!vduse_queue_should_notify(vq)) {
 574        return;
 575    }
 576
 577    if (vduse_inject_irq(dev, vq->index) < 0) {
 578        fprintf(stderr, "Error inject irq for vq %d: %s\n",
 579                vq->index, strerror(errno));
 580    }
 581}
 582
 583static inline void vring_set_avail_event(VduseVirtq *vq, uint16_t val)
 584{
 585    uint16_t val_le = htole16(val);
 586    memcpy(&vq->vring.used->ring[vq->vring.num], &val_le, sizeof(uint16_t));
 587}
 588
 589static bool vduse_queue_map_single_desc(VduseVirtq *vq, unsigned int *p_num_sg,
 590                                   struct iovec *iov, unsigned int max_num_sg,
 591                                   bool is_write, uint64_t pa, size_t sz)
 592{
 593    unsigned num_sg = *p_num_sg;
 594    VduseDev *dev = vq->dev;
 595
 596    assert(num_sg <= max_num_sg);
 597
 598    if (!sz) {
 599        fprintf(stderr, "virtio: zero sized buffers are not allowed\n");
 600        return false;
 601    }
 602
 603    while (sz) {
 604        uint64_t len = sz;
 605
 606        if (num_sg == max_num_sg) {
 607            fprintf(stderr,
 608                    "virtio: too many descriptors in indirect table\n");
 609            return false;
 610        }
 611
 612        iov[num_sg].iov_base = iova_to_va(dev, &len, pa);
 613        if (iov[num_sg].iov_base == NULL) {
 614            fprintf(stderr, "virtio: invalid address for buffers\n");
 615            return false;
 616        }
 617        iov[num_sg++].iov_len = len;
 618        sz -= len;
 619        pa += len;
 620    }
 621
 622    *p_num_sg = num_sg;
 623    return true;
 624}
 625
 626static void *vduse_queue_alloc_element(size_t sz, unsigned out_num,
 627                                       unsigned in_num)
 628{
 629    VduseVirtqElement *elem;
 630    size_t in_sg_ofs = ALIGN_UP(sz, __alignof__(elem->in_sg[0]));
 631    size_t out_sg_ofs = in_sg_ofs + in_num * sizeof(elem->in_sg[0]);
 632    size_t out_sg_end = out_sg_ofs + out_num * sizeof(elem->out_sg[0]);
 633
 634    assert(sz >= sizeof(VduseVirtqElement));
 635    elem = malloc(out_sg_end);
 636    if (!elem) {
 637        return NULL;
 638    }
 639    elem->out_num = out_num;
 640    elem->in_num = in_num;
 641    elem->in_sg = (void *)elem + in_sg_ofs;
 642    elem->out_sg = (void *)elem + out_sg_ofs;
 643    return elem;
 644}
 645
/*
 * Build a VduseVirtqElement of @sz bytes for the descriptor chain that
 * starts at head @idx of @vq.
 *
 * If the head descriptor is an indirect one, the chain is read from the
 * indirect table instead of the ring's descriptor table. Every buffer in
 * the chain is translated with iova_to_va(); device-readable buffers end up
 * in out_sg and device-writable ones in in_sg.
 *
 * Returns the malloc()ed element, or NULL (after logging to stderr) on any
 * validation, translation or allocation failure.
 */
static void *vduse_queue_map_desc(VduseVirtq *vq, unsigned int idx, size_t sz)
{
    struct vring_desc *desc = vq->vring.desc;
    VduseDev *dev = vq->dev;
    uint64_t desc_addr, read_len;
    unsigned int desc_len;
    unsigned int max = vq->vring.num;
    unsigned int i = idx;
    VduseVirtqElement *elem;
    /* Scratch array: out entries first, in entries right after them */
    struct iovec iov[VIRTQUEUE_MAX_SIZE];
    /* Bounce buffer for an indirect table that is not contiguously mapped */
    struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
    unsigned int out_num = 0, in_num = 0;
    int rc;

    if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) {
        if (le32toh(desc[i].len) % sizeof(struct vring_desc)) {
            fprintf(stderr, "Invalid size for indirect buffer table\n");
            return NULL;
        }

        /* loop over the indirect descriptor table */
        desc_addr = le64toh(desc[i].addr);
        desc_len = le32toh(desc[i].len);
        /* From here on, chain indices are bounded by the table's entry count */
        max = desc_len / sizeof(struct vring_desc);
        read_len = desc_len;
        desc = iova_to_va(dev, &read_len, desc_addr);
        if (unlikely(desc && read_len != desc_len)) {
            /* Failed to use zero copy */
            desc = NULL;
            if (!vduse_queue_read_indirect_desc(dev, desc_buf,
                                                desc_addr,
                                                desc_len)) {
                desc = desc_buf;
            }
        }
        if (!desc) {
            fprintf(stderr, "Invalid indirect buffer table\n");
            return NULL;
        }
        /* The chain starts at the first entry of the indirect table */
        i = 0;
    }

    /* Collect all the descriptors */
    do {
        if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) {
            /* Writable buffers are appended after the out entries */
            if (!vduse_queue_map_single_desc(vq, &in_num, iov + out_num,
                                             VIRTQUEUE_MAX_SIZE - out_num,
                                             true, le64toh(desc[i].addr),
                                             le32toh(desc[i].len))) {
                return NULL;
            }
        } else {
            /* A readable buffer after a writable one is rejected */
            if (in_num) {
                fprintf(stderr, "Incorrect order for descriptors\n");
                return NULL;
            }
            if (!vduse_queue_map_single_desc(vq, &out_num, iov,
                                             VIRTQUEUE_MAX_SIZE, false,
                                             le64toh(desc[i].addr),
                                             le32toh(desc[i].len))) {
                return NULL;
            }
        }

        /* If we've got too many, that implies a descriptor loop. */
        if ((in_num + out_num) > max) {
            fprintf(stderr, "Looped descriptor\n");
            return NULL;
        }
        rc = vduse_queue_read_next_desc(desc, i, max, &i);
    } while (rc == VIRTQUEUE_READ_DESC_MORE);

    if (rc == VIRTQUEUE_READ_DESC_ERROR) {
        fprintf(stderr, "read descriptor error\n");
        return NULL;
    }

    /* Now copy what we have collected and mapped */
    elem = vduse_queue_alloc_element(sz, out_num, in_num);
    if (!elem) {
        fprintf(stderr, "read descriptor error\n");
        return NULL;
    }
    /* The element keeps the ring head index, even for an indirect chain */
    elem->index = idx;
    for (i = 0; i < out_num; i++) {
        elem->out_sg[i] = iov[i];
    }
    for (i = 0; i < in_num; i++) {
        elem->in_sg[i] = iov[out_num + i];
    }

    return elem;
}
 739
 740void *vduse_queue_pop(VduseVirtq *vq, size_t sz)
 741{
 742    unsigned int head;
 743    VduseVirtqElement *elem;
 744    VduseDev *dev = vq->dev;
 745    int i;
 746
 747    if (unlikely(!vq->vring.avail)) {
 748        return NULL;
 749    }
 750
 751    if (unlikely(vq->resubmit_list && vq->resubmit_num > 0)) {
 752        i = (--vq->resubmit_num);
 753        elem = vduse_queue_map_desc(vq, vq->resubmit_list[i].index, sz);
 754
 755        if (!vq->resubmit_num) {
 756            free(vq->resubmit_list);
 757            vq->resubmit_list = NULL;
 758        }
 759
 760        return elem;
 761    }
 762
 763    if (vduse_queue_empty(vq)) {
 764        return NULL;
 765    }
 766    /* Needed after virtio_queue_empty() */
 767    smp_rmb();
 768
 769    if (vq->inuse >= vq->vring.num) {
 770        fprintf(stderr, "Virtqueue size exceeded: %d\n", vq->inuse);
 771        return NULL;
 772    }
 773
 774    if (!vduse_queue_get_head(vq, vq->last_avail_idx++, &head)) {
 775        return NULL;
 776    }
 777
 778    if (vduse_dev_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
 779        vring_set_avail_event(vq, vq->last_avail_idx);
 780    }
 781
 782    elem = vduse_queue_map_desc(vq, head, sz);
 783
 784    if (!elem) {
 785        return NULL;
 786    }
 787
 788    vq->inuse++;
 789
 790    vduse_queue_inflight_get(vq, head);
 791
 792    return elem;
 793}
 794
 795static inline void vring_used_write(VduseVirtq *vq,
 796                                    struct vring_used_elem *uelem, int i)
 797{
 798    struct vring_used *used = vq->vring.used;
 799
 800    used->ring[i] = *uelem;
 801}
 802
 803static void vduse_queue_fill(VduseVirtq *vq, const VduseVirtqElement *elem,
 804                             unsigned int len, unsigned int idx)
 805{
 806    struct vring_used_elem uelem;
 807
 808    if (unlikely(!vq->vring.used)) {
 809        return;
 810    }
 811
 812    idx = (idx + vq->used_idx) % vq->vring.num;
 813
 814    uelem.id = htole32(elem->index);
 815    uelem.len = htole32(len);
 816    vring_used_write(vq, &uelem, idx);
 817}
 818
 819static inline void vring_used_idx_set(VduseVirtq *vq, uint16_t val)
 820{
 821    vq->vring.used->idx = htole16(val);
 822    vq->used_idx = val;
 823}
 824
 825static void vduse_queue_flush(VduseVirtq *vq, unsigned int count)
 826{
 827    uint16_t old, new;
 828
 829    if (unlikely(!vq->vring.used)) {
 830        return;
 831    }
 832
 833    /* Make sure buffer is written before we update index. */
 834    smp_wmb();
 835
 836    old = vq->used_idx;
 837    new = old + count;
 838    vring_used_idx_set(vq, new);
 839    vq->inuse -= count;
 840    if (unlikely((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old))) {
 841        vq->signalled_used_valid = false;
 842    }
 843}
 844
 845void vduse_queue_push(VduseVirtq *vq, const VduseVirtqElement *elem,
 846                      unsigned int len)
 847{
 848    vduse_queue_fill(vq, elem, len, 0);
 849    vduse_queue_inflight_pre_put(vq, elem->index);
 850    vduse_queue_flush(vq, 1);
 851    vduse_queue_inflight_post_put(vq, elem->index);
 852}
 853
 854static int vduse_queue_update_vring(VduseVirtq *vq, uint64_t desc_addr,
 855                                    uint64_t avail_addr, uint64_t used_addr)
 856{
 857    struct VduseDev *dev = vq->dev;
 858    uint64_t len;
 859
 860    len = sizeof(struct vring_desc);
 861    vq->vring.desc = iova_to_va(dev, &len, desc_addr);
 862    if (len != sizeof(struct vring_desc)) {
 863        return -EINVAL;
 864    }
 865
 866    len = sizeof(struct vring_avail);
 867    vq->vring.avail = iova_to_va(dev, &len, avail_addr);
 868    if (len != sizeof(struct vring_avail)) {
 869        return -EINVAL;
 870    }
 871
 872    len = sizeof(struct vring_used);
 873    vq->vring.used = iova_to_va(dev, &len, used_addr);
 874    if (len != sizeof(struct vring_used)) {
 875        return -EINVAL;
 876    }
 877
 878    if (!vq->vring.desc || !vq->vring.avail || !vq->vring.used) {
 879        fprintf(stderr, "Failed to get vq[%d] iova mapping\n", vq->index);
 880        return -EINVAL;
 881    }
 882
 883    return 0;
 884}
 885
 886static void vduse_queue_enable(VduseVirtq *vq)
 887{
 888    struct VduseDev *dev = vq->dev;
 889    struct vduse_vq_info vq_info;
 890    struct vduse_vq_eventfd vq_eventfd;
 891    int fd;
 892
 893    vq_info.index = vq->index;
 894    if (ioctl(dev->fd, VDUSE_VQ_GET_INFO, &vq_info)) {
 895        fprintf(stderr, "Failed to get vq[%d] info: %s\n",
 896                vq->index, strerror(errno));
 897        return;
 898    }
 899
 900    if (!vq_info.ready) {
 901        return;
 902    }
 903
 904    vq->vring.num = vq_info.num;
 905    vq->vring.desc_addr = vq_info.desc_addr;
 906    vq->vring.avail_addr = vq_info.driver_addr;
 907    vq->vring.used_addr = vq_info.device_addr;
 908
 909    if (vduse_queue_update_vring(vq, vq_info.desc_addr,
 910                                 vq_info.driver_addr, vq_info.device_addr)) {
 911        fprintf(stderr, "Failed to update vring for vq[%d]\n", vq->index);
 912        return;
 913    }
 914
 915    fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
 916    if (fd < 0) {
 917        fprintf(stderr, "Failed to init eventfd for vq[%d]\n", vq->index);
 918        return;
 919    }
 920
 921    vq_eventfd.index = vq->index;
 922    vq_eventfd.fd = fd;
 923    if (ioctl(dev->fd, VDUSE_VQ_SETUP_KICKFD, &vq_eventfd)) {
 924        fprintf(stderr, "Failed to setup kick fd for vq[%d]\n", vq->index);
 925        close(fd);
 926        return;
 927    }
 928
 929    vq->fd = fd;
 930    vq->signalled_used_valid = false;
 931    vq->ready = true;
 932
 933    if (vduse_queue_check_inflights(vq)) {
 934        fprintf(stderr, "Failed to check inflights for vq[%d]\n", vq->index);
 935        close(fd);
 936        return;
 937    }
 938
 939    dev->ops->enable_queue(dev, vq);
 940}
 941
 942static void vduse_queue_disable(VduseVirtq *vq)
 943{
 944    struct VduseDev *dev = vq->dev;
 945    struct vduse_vq_eventfd eventfd;
 946
 947    if (!vq->ready) {
 948        return;
 949    }
 950
 951    dev->ops->disable_queue(dev, vq);
 952
 953    eventfd.index = vq->index;
 954    eventfd.fd = VDUSE_EVENTFD_DEASSIGN;
 955    ioctl(dev->fd, VDUSE_VQ_SETUP_KICKFD, &eventfd);
 956    close(vq->fd);
 957
 958    assert(vq->inuse == 0);
 959
 960    vq->vring.num = 0;
 961    vq->vring.desc_addr = 0;
 962    vq->vring.avail_addr = 0;
 963    vq->vring.used_addr = 0;
 964    vq->vring.desc = 0;
 965    vq->vring.avail = 0;
 966    vq->vring.used = 0;
 967    vq->ready = false;
 968    vq->fd = -1;
 969}
 970
 971static void vduse_dev_start_dataplane(VduseDev *dev)
 972{
 973    int i;
 974
 975    if (ioctl(dev->fd, VDUSE_DEV_GET_FEATURES, &dev->features)) {
 976        fprintf(stderr, "Failed to get features: %s\n", strerror(errno));
 977        return;
 978    }
 979    assert(vduse_dev_has_feature(dev, VIRTIO_F_VERSION_1));
 980
 981    for (i = 0; i < dev->num_queues; i++) {
 982        vduse_queue_enable(&dev->vqs[i]);
 983    }
 984}
 985
 986static void vduse_dev_stop_dataplane(VduseDev *dev)
 987{
 988    size_t log_size = dev->num_queues * vduse_vq_log_size(VIRTQUEUE_MAX_SIZE);
 989    int i;
 990
 991    for (i = 0; i < dev->num_queues; i++) {
 992        vduse_queue_disable(&dev->vqs[i]);
 993    }
 994    if (dev->log) {
 995        memset(dev->log, 0, log_size);
 996    }
 997    dev->features = 0;
 998    vduse_iova_remove_region(dev, 0, ULONG_MAX);
 999}
1000
1001int vduse_dev_handler(VduseDev *dev)
1002{
1003    struct vduse_dev_request req;
1004    struct vduse_dev_response resp = { 0 };
1005    VduseVirtq *vq;
1006    int i, ret;
1007
1008    ret = read(dev->fd, &req, sizeof(req));
1009    if (ret != sizeof(req)) {
1010        fprintf(stderr, "Read request error [%d]: %s\n",
1011                ret, strerror(errno));
1012        return -errno;
1013    }
1014    resp.request_id = req.request_id;
1015
1016    switch (req.type) {
1017    case VDUSE_GET_VQ_STATE:
1018        vq = &dev->vqs[req.vq_state.index];
1019        resp.vq_state.split.avail_index = vq->last_avail_idx;
1020        resp.result = VDUSE_REQ_RESULT_OK;
1021        break;
1022    case VDUSE_SET_STATUS:
1023        if (req.s.status & VIRTIO_CONFIG_S_DRIVER_OK) {
1024            vduse_dev_start_dataplane(dev);
1025        } else if (req.s.status == 0) {
1026            vduse_dev_stop_dataplane(dev);
1027        }
1028        resp.result = VDUSE_REQ_RESULT_OK;
1029        break;
1030    case VDUSE_UPDATE_IOTLB:
1031        /* The iova will be updated by iova_to_va() later, so just remove it */
1032        vduse_iova_remove_region(dev, req.iova.start, req.iova.last);
1033        for (i = 0; i < dev->num_queues; i++) {
1034            VduseVirtq *vq = &dev->vqs[i];
1035            if (vq->ready) {
1036                if (vduse_queue_update_vring(vq, vq->vring.desc_addr,
1037                                             vq->vring.avail_addr,
1038                                             vq->vring.used_addr)) {
1039                    fprintf(stderr, "Failed to update vring for vq[%d]\n",
1040                            vq->index);
1041                }
1042            }
1043        }
1044        resp.result = VDUSE_REQ_RESULT_OK;
1045        break;
1046    default:
1047        resp.result = VDUSE_REQ_RESULT_FAILED;
1048        break;
1049    }
1050
1051    ret = write(dev->fd, &resp, sizeof(resp));
1052    if (ret != sizeof(resp)) {
1053        fprintf(stderr, "Write request %d error [%d]: %s\n",
1054                req.type, ret, strerror(errno));
1055        return -errno;
1056    }
1057    return 0;
1058}
1059
1060int vduse_dev_update_config(VduseDev *dev, uint32_t size,
1061                            uint32_t offset, char *buffer)
1062{
1063    int ret;
1064    struct vduse_config_data *data;
1065
1066    data = malloc(offsetof(struct vduse_config_data, buffer) + size);
1067    if (!data) {
1068        return -ENOMEM;
1069    }
1070
1071    data->offset = offset;
1072    data->length = size;
1073    memcpy(data->buffer, buffer, size);
1074
1075    ret = ioctl(dev->fd, VDUSE_DEV_SET_CONFIG, data);
1076    free(data);
1077
1078    if (ret) {
1079        return -errno;
1080    }
1081
1082    if (ioctl(dev->fd, VDUSE_DEV_INJECT_CONFIG_IRQ)) {
1083        return -errno;
1084    }
1085
1086    return 0;
1087}
1088
1089int vduse_dev_setup_queue(VduseDev *dev, int index, int max_size)
1090{
1091    VduseVirtq *vq = &dev->vqs[index];
1092    struct vduse_vq_config vq_config = { 0 };
1093
1094    if (max_size > VIRTQUEUE_MAX_SIZE) {
1095        return -EINVAL;
1096    }
1097
1098    vq_config.index = vq->index;
1099    vq_config.max_size = max_size;
1100
1101    if (ioctl(dev->fd, VDUSE_VQ_SETUP, &vq_config)) {
1102        return -errno;
1103    }
1104
1105    vduse_queue_enable(vq);
1106
1107    return 0;
1108}
1109
1110int vduse_set_reconnect_log_file(VduseDev *dev, const char *filename)
1111{
1112
1113    size_t log_size = dev->num_queues * vduse_vq_log_size(VIRTQUEUE_MAX_SIZE);
1114    void *log;
1115    int i;
1116
1117    dev->log = log = vduse_log_get(filename, log_size);
1118    if (log == MAP_FAILED) {
1119        fprintf(stderr, "Failed to get vduse log\n");
1120        return -EINVAL;
1121    }
1122
1123    for (i = 0; i < dev->num_queues; i++) {
1124        dev->vqs[i].log = log;
1125        dev->vqs[i].log->inflight.desc_num = VIRTQUEUE_MAX_SIZE;
1126        log = (void *)((char *)log + vduse_vq_log_size(VIRTQUEUE_MAX_SIZE));
1127    }
1128
1129    return 0;
1130}
1131
1132static int vduse_dev_init_vqs(VduseDev *dev, uint16_t num_queues)
1133{
1134    VduseVirtq *vqs;
1135    int i;
1136
1137    vqs = calloc(sizeof(VduseVirtq), num_queues);
1138    if (!vqs) {
1139        return -ENOMEM;
1140    }
1141
1142    for (i = 0; i < num_queues; i++) {
1143        vqs[i].index = i;
1144        vqs[i].dev = dev;
1145        vqs[i].fd = -1;
1146    }
1147    dev->vqs = vqs;
1148
1149    return 0;
1150}
1151
1152static int vduse_dev_init(VduseDev *dev, const char *name,
1153                          uint16_t num_queues, const VduseOps *ops,
1154                          void *priv)
1155{
1156    char *dev_path, *dev_name;
1157    int ret, fd;
1158
1159    dev_path = malloc(strlen(name) + strlen("/dev/vduse/") + 1);
1160    if (!dev_path) {
1161        return -ENOMEM;
1162    }
1163    sprintf(dev_path, "/dev/vduse/%s", name);
1164
1165    fd = open(dev_path, O_RDWR);
1166    free(dev_path);
1167    if (fd < 0) {
1168        fprintf(stderr, "Failed to open vduse dev %s: %s\n",
1169                name, strerror(errno));
1170        return -errno;
1171    }
1172
1173    if (ioctl(fd, VDUSE_DEV_GET_FEATURES, &dev->features)) {
1174        fprintf(stderr, "Failed to get features: %s\n", strerror(errno));
1175        close(fd);
1176        return -errno;
1177    }
1178
1179    dev_name = strdup(name);
1180    if (!dev_name) {
1181        close(fd);
1182        return -ENOMEM;
1183    }
1184
1185    ret = vduse_dev_init_vqs(dev, num_queues);
1186    if (ret) {
1187        free(dev_name);
1188        close(fd);
1189        return ret;
1190    }
1191
1192    dev->name = dev_name;
1193    dev->num_queues = num_queues;
1194    dev->fd = fd;
1195    dev->ops = ops;
1196    dev->priv = priv;
1197
1198    return 0;
1199}
1200
1201static inline bool vduse_name_is_invalid(const char *name)
1202{
1203    return strlen(name) >= VDUSE_NAME_MAX || strstr(name, "..");
1204}
1205
1206VduseDev *vduse_dev_create_by_fd(int fd, uint16_t num_queues,
1207                                 const VduseOps *ops, void *priv)
1208{
1209    VduseDev *dev;
1210    int ret;
1211
1212    if (!ops || !ops->enable_queue || !ops->disable_queue) {
1213        fprintf(stderr, "Invalid parameter for vduse\n");
1214        return NULL;
1215    }
1216
1217    dev = calloc(sizeof(VduseDev), 1);
1218    if (!dev) {
1219        fprintf(stderr, "Failed to allocate vduse device\n");
1220        return NULL;
1221    }
1222
1223    if (ioctl(fd, VDUSE_DEV_GET_FEATURES, &dev->features)) {
1224        fprintf(stderr, "Failed to get features: %s\n", strerror(errno));
1225        free(dev);
1226        return NULL;
1227    }
1228
1229    ret = vduse_dev_init_vqs(dev, num_queues);
1230    if (ret) {
1231        fprintf(stderr, "Failed to init vqs\n");
1232        free(dev);
1233        return NULL;
1234    }
1235
1236    dev->num_queues = num_queues;
1237    dev->fd = fd;
1238    dev->ops = ops;
1239    dev->priv = priv;
1240
1241    return dev;
1242}
1243
1244VduseDev *vduse_dev_create_by_name(const char *name, uint16_t num_queues,
1245                                   const VduseOps *ops, void *priv)
1246{
1247    VduseDev *dev;
1248    int ret;
1249
1250    if (!name || vduse_name_is_invalid(name) || !ops ||
1251        !ops->enable_queue || !ops->disable_queue) {
1252        fprintf(stderr, "Invalid parameter for vduse\n");
1253        return NULL;
1254    }
1255
1256    dev = calloc(sizeof(VduseDev), 1);
1257    if (!dev) {
1258        fprintf(stderr, "Failed to allocate vduse device\n");
1259        return NULL;
1260    }
1261
1262    ret = vduse_dev_init(dev, name, num_queues, ops, priv);
1263    if (ret < 0) {
1264        fprintf(stderr, "Failed to init vduse device %s: %s\n",
1265                name, strerror(-ret));
1266        free(dev);
1267        return NULL;
1268    }
1269
1270    return dev;
1271}
1272
1273VduseDev *vduse_dev_create(const char *name, uint32_t device_id,
1274                           uint32_t vendor_id, uint64_t features,
1275                           uint16_t num_queues, uint32_t config_size,
1276                           char *config, const VduseOps *ops, void *priv)
1277{
1278    VduseDev *dev;
1279    int ret, ctrl_fd;
1280    uint64_t version;
1281    struct vduse_dev_config *dev_config;
1282    size_t size = offsetof(struct vduse_dev_config, config);
1283
1284    if (!name || vduse_name_is_invalid(name) ||
1285        !has_feature(features,  VIRTIO_F_VERSION_1) || !config ||
1286        !config_size || !ops || !ops->enable_queue || !ops->disable_queue) {
1287        fprintf(stderr, "Invalid parameter for vduse\n");
1288        return NULL;
1289    }
1290
1291    dev = calloc(sizeof(VduseDev), 1);
1292    if (!dev) {
1293        fprintf(stderr, "Failed to allocate vduse device\n");
1294        return NULL;
1295    }
1296
1297    ctrl_fd = open("/dev/vduse/control", O_RDWR);
1298    if (ctrl_fd < 0) {
1299        fprintf(stderr, "Failed to open /dev/vduse/control: %s\n",
1300                strerror(errno));
1301        goto err_ctrl;
1302    }
1303
1304    version = VDUSE_API_VERSION;
1305    if (ioctl(ctrl_fd, VDUSE_SET_API_VERSION, &version)) {
1306        fprintf(stderr, "Failed to set api version %" PRIu64 ": %s\n",
1307                version, strerror(errno));
1308        goto err_dev;
1309    }
1310
1311    dev_config = calloc(size + config_size, 1);
1312    if (!dev_config) {
1313        fprintf(stderr, "Failed to allocate config space\n");
1314        goto err_dev;
1315    }
1316
1317    assert(!vduse_name_is_invalid(name));
1318    strcpy(dev_config->name, name);
1319    dev_config->device_id = device_id;
1320    dev_config->vendor_id = vendor_id;
1321    dev_config->features = features;
1322    dev_config->vq_num = num_queues;
1323    dev_config->vq_align = VDUSE_VQ_ALIGN;
1324    dev_config->config_size = config_size;
1325    memcpy(dev_config->config, config, config_size);
1326
1327    ret = ioctl(ctrl_fd, VDUSE_CREATE_DEV, dev_config);
1328    free(dev_config);
1329    if (ret && errno != EEXIST) {
1330        fprintf(stderr, "Failed to create vduse device %s: %s\n",
1331                name, strerror(errno));
1332        goto err_dev;
1333    }
1334    dev->ctrl_fd = ctrl_fd;
1335
1336    ret = vduse_dev_init(dev, name, num_queues, ops, priv);
1337    if (ret < 0) {
1338        fprintf(stderr, "Failed to init vduse device %s: %s\n",
1339                name, strerror(-ret));
1340        goto err;
1341    }
1342
1343    return dev;
1344err:
1345    ioctl(ctrl_fd, VDUSE_DESTROY_DEV, name);
1346err_dev:
1347    close(ctrl_fd);
1348err_ctrl:
1349    free(dev);
1350
1351    return NULL;
1352}
1353
1354int vduse_dev_destroy(VduseDev *dev)
1355{
1356    size_t log_size = dev->num_queues * vduse_vq_log_size(VIRTQUEUE_MAX_SIZE);
1357    int i, ret = 0;
1358
1359    if (dev->log) {
1360        munmap(dev->log, log_size);
1361    }
1362    for (i = 0; i < dev->num_queues; i++) {
1363        free(dev->vqs[i].resubmit_list);
1364    }
1365    free(dev->vqs);
1366    if (dev->fd >= 0) {
1367        close(dev->fd);
1368        dev->fd = -1;
1369    }
1370    if (dev->ctrl_fd >= 0) {
1371        if (ioctl(dev->ctrl_fd, VDUSE_DESTROY_DEV, dev->name)) {
1372            ret = -errno;
1373        }
1374        close(dev->ctrl_fd);
1375        dev->ctrl_fd = -1;
1376    }
1377    free(dev->name);
1378    free(dev);
1379
1380    return ret;
1381}
1382