qemu/contrib/libvhost-user/libvhost-user.c
   1/*
   2 * Vhost User library
   3 *
   4 * Copyright IBM, Corp. 2007
   5 * Copyright (c) 2016 Red Hat, Inc.
   6 *
   7 * Authors:
   8 *  Anthony Liguori <aliguori@us.ibm.com>
   9 *  Marc-André Lureau <mlureau@redhat.com>
  10 *  Victor Kaplansky <victork@redhat.com>
  11 *
  12 * This work is licensed under the terms of the GNU GPL, version 2 or
  13 * later.  See the COPYING file in the top-level directory.
  14 */
  15
   16/* This code avoids a GLib dependency. */
  17#include <stdlib.h>
  18#include <stdio.h>
  19#include <unistd.h>
  20#include <stdarg.h>
  21#include <errno.h>
  22#include <string.h>
  23#include <assert.h>
  24#include <inttypes.h>
  25#include <sys/types.h>
  26#include <sys/socket.h>
  27#include <sys/eventfd.h>
  28#include <sys/mman.h>
  29#include "qemu/compiler.h"
  30
  31#if defined(__linux__)
  32#include <sys/syscall.h>
  33#include <fcntl.h>
  34#include <sys/ioctl.h>
  35#include <linux/vhost.h>
  36
  37#ifdef __NR_userfaultfd
  38#include <linux/userfaultfd.h>
  39#endif
  40
  41#endif
  42
  43#include "qemu/atomic.h"
  44
  45#include "libvhost-user.h"
  46
  47/* usually provided by GLib */
  48#ifndef MIN
  49#define MIN(x, y) ({                            \
  50            typeof(x) _min1 = (x);              \
  51            typeof(y) _min2 = (y);              \
  52            (void) (&_min1 == &_min2);          \
  53            _min1 < _min2 ? _min1 : _min2; })
  54#endif
  55
  56#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
  57
  58/* The version of the protocol we support */
  59#define VHOST_USER_VERSION 1
  60#define LIBVHOST_USER_DEBUG 0
  61
  62#define DPRINT(...)                             \
  63    do {                                        \
  64        if (LIBVHOST_USER_DEBUG) {              \
  65            fprintf(stderr, __VA_ARGS__);        \
  66        }                                       \
  67    } while (0)
  68
  69static const char *
  70vu_request_to_string(unsigned int req)
  71{
  72#define REQ(req) [req] = #req
  73    static const char *vu_request_str[] = {
  74        REQ(VHOST_USER_NONE),
  75        REQ(VHOST_USER_GET_FEATURES),
  76        REQ(VHOST_USER_SET_FEATURES),
  77        REQ(VHOST_USER_SET_OWNER),
  78        REQ(VHOST_USER_RESET_OWNER),
  79        REQ(VHOST_USER_SET_MEM_TABLE),
  80        REQ(VHOST_USER_SET_LOG_BASE),
  81        REQ(VHOST_USER_SET_LOG_FD),
  82        REQ(VHOST_USER_SET_VRING_NUM),
  83        REQ(VHOST_USER_SET_VRING_ADDR),
  84        REQ(VHOST_USER_SET_VRING_BASE),
  85        REQ(VHOST_USER_GET_VRING_BASE),
  86        REQ(VHOST_USER_SET_VRING_KICK),
  87        REQ(VHOST_USER_SET_VRING_CALL),
  88        REQ(VHOST_USER_SET_VRING_ERR),
  89        REQ(VHOST_USER_GET_PROTOCOL_FEATURES),
  90        REQ(VHOST_USER_SET_PROTOCOL_FEATURES),
  91        REQ(VHOST_USER_GET_QUEUE_NUM),
  92        REQ(VHOST_USER_SET_VRING_ENABLE),
  93        REQ(VHOST_USER_SEND_RARP),
  94        REQ(VHOST_USER_NET_SET_MTU),
  95        REQ(VHOST_USER_SET_SLAVE_REQ_FD),
  96        REQ(VHOST_USER_IOTLB_MSG),
  97        REQ(VHOST_USER_SET_VRING_ENDIAN),
  98        REQ(VHOST_USER_GET_CONFIG),
  99        REQ(VHOST_USER_SET_CONFIG),
 100        REQ(VHOST_USER_POSTCOPY_ADVISE),
 101        REQ(VHOST_USER_POSTCOPY_LISTEN),
 102        REQ(VHOST_USER_POSTCOPY_END),
 103        REQ(VHOST_USER_MAX),
 104    };
 105#undef REQ
 106
 107    if (req < VHOST_USER_MAX) {
 108        return vu_request_str[req];
 109    } else {
 110        return "unknown";
 111    }
 112}
 113
 114static void
 115vu_panic(VuDev *dev, const char *msg, ...)
 116{
 117    char *buf = NULL;
 118    va_list ap;
 119
 120    va_start(ap, msg);
 121    if (vasprintf(&buf, msg, ap) < 0) {
 122        buf = NULL;
 123    }
 124    va_end(ap);
 125
 126    dev->broken = true;
 127    dev->panic(dev, buf);
 128    free(buf);
 129
 130    /* FIXME: find a way to call virtio_error? */
 131}
 132
 133/* Translate guest physical address to our virtual address.  */
 134void *
 135vu_gpa_to_va(VuDev *dev, uint64_t *plen, uint64_t guest_addr)
 136{
 137    int i;
 138
 139    if (*plen == 0) {
 140        return NULL;
 141    }
 142
 143    /* Find matching memory region.  */
 144    for (i = 0; i < dev->nregions; i++) {
 145        VuDevRegion *r = &dev->regions[i];
 146
 147        if ((guest_addr >= r->gpa) && (guest_addr < (r->gpa + r->size))) {
 148            if ((guest_addr + *plen) > (r->gpa + r->size)) {
 149                *plen = r->gpa + r->size - guest_addr;
 150            }
 151            return (void *)(uintptr_t)
 152                guest_addr - r->gpa + r->mmap_addr + r->mmap_offset;
 153        }
 154    }
 155
 156    return NULL;
 157}
 158
 159/* Translate qemu virtual address to our virtual address.  */
 160static void *
 161qva_to_va(VuDev *dev, uint64_t qemu_addr)
 162{
 163    int i;
 164
 165    /* Find matching memory region.  */
 166    for (i = 0; i < dev->nregions; i++) {
 167        VuDevRegion *r = &dev->regions[i];
 168
 169        if ((qemu_addr >= r->qva) && (qemu_addr < (r->qva + r->size))) {
 170            return (void *)(uintptr_t)
 171                qemu_addr - r->qva + r->mmap_addr + r->mmap_offset;
 172        }
 173    }
 174
 175    return NULL;
 176}
 177
 178static void
 179vmsg_close_fds(VhostUserMsg *vmsg)
 180{
 181    int i;
 182
 183    for (i = 0; i < vmsg->fd_num; i++) {
 184        close(vmsg->fds[i]);
 185    }
 186}
 187
 188/* A test to see if we have userfault available */
 189static bool
 190have_userfault(void)
 191{
 192#if defined(__linux__) && defined(__NR_userfaultfd) &&\
 193        defined(UFFD_FEATURE_MISSING_SHMEM) &&\
 194        defined(UFFD_FEATURE_MISSING_HUGETLBFS)
  195    /* Now test that the kernel we're running on really has the features */
 196    int ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
 197    struct uffdio_api api_struct;
 198    if (ufd < 0) {
 199        return false;
 200    }
 201
 202    api_struct.api = UFFD_API;
 203    api_struct.features = UFFD_FEATURE_MISSING_SHMEM |
 204                          UFFD_FEATURE_MISSING_HUGETLBFS;
 205    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
 206        close(ufd);
 207        return false;
 208    }
 209    close(ufd);
 210    return true;
 211
 212#else
 213    return false;
 214#endif
 215}
 216
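     /* Read one vhost-user message from conn_fd into vmsg, collecting any
      * file descriptors passed as SCM_RIGHTS ancillary data.  Returns false
      * (and marks the device broken via vu_panic) on error. */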
 217static bool
 218vu_message_read(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
 219{
 220    char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))] = { };
 221    struct iovec iov = {
 222        .iov_base = (char *)vmsg,
 223        .iov_len = VHOST_USER_HDR_SIZE,
 224    };
 225    struct msghdr msg = {
 226        .msg_iov = &iov,
 227        .msg_iovlen = 1,
 228        .msg_control = control,
 229        .msg_controllen = sizeof(control),
 230    };
 231    size_t fd_size;
 232    struct cmsghdr *cmsg;
 233    int rc;
 234
 235    do {
 236        rc = recvmsg(conn_fd, &msg, 0);
 237    } while (rc < 0 && (errno == EINTR || errno == EAGAIN));
 238
 239    if (rc < 0) {
 240        vu_panic(dev, "Error while recvmsg: %s", strerror(errno));
 241        return false;
 242    }
 243
 244    vmsg->fd_num = 0;
 245    for (cmsg = CMSG_FIRSTHDR(&msg);
 246         cmsg != NULL;
 247         cmsg = CMSG_NXTHDR(&msg, cmsg))
 248    {
 249        if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
 250            fd_size = cmsg->cmsg_len - CMSG_LEN(0);
 251            vmsg->fd_num = fd_size / sizeof(int);
 252            memcpy(vmsg->fds, CMSG_DATA(cmsg), fd_size);
 253            break;
 254        }
 255    }
 256
 257    if (vmsg->size > sizeof(vmsg->payload)) {
 258        vu_panic(dev,
 259                 "Error: too big message request: %d, size: vmsg->size: %u, "
 260                 "while sizeof(vmsg->payload) = %zu\n",
 261                 vmsg->request, vmsg->size, sizeof(vmsg->payload));
 262        goto fail;
 263    }
 264
 265    if (vmsg->size) {
 266        do {
 267            rc = read(conn_fd, &vmsg->payload, vmsg->size);
 268        } while (rc < 0 && (errno == EINTR || errno == EAGAIN));
 269
 270        if (rc <= 0) {
 271            vu_panic(dev, "Error while reading: %s", strerror(errno));
 272            goto fail;
 273        }
 274
 275        assert(rc == vmsg->size);
 276    }
 277
 278    return true;
 279
 280fail:
 281    vmsg_close_fds(vmsg);
 282
 283    return false;
 284}
 285
 286static bool
 287vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
 288{
 289    int rc;
 290    uint8_t *p = (uint8_t *)vmsg;
 291    char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))] = { };
 292    struct iovec iov = {
 293        .iov_base = (char *)vmsg,
 294        .iov_len = VHOST_USER_HDR_SIZE,
 295    };
 296    struct msghdr msg = {
 297        .msg_iov = &iov,
 298        .msg_iovlen = 1,
 299        .msg_control = control,
 300    };
 301    struct cmsghdr *cmsg;
 302
 303    memset(control, 0, sizeof(control));
 304    assert(vmsg->fd_num <= VHOST_MEMORY_MAX_NREGIONS);
 305    if (vmsg->fd_num > 0) {
 306        size_t fdsize = vmsg->fd_num * sizeof(int);
 307        msg.msg_controllen = CMSG_SPACE(fdsize);
 308        cmsg = CMSG_FIRSTHDR(&msg);
 309        cmsg->cmsg_len = CMSG_LEN(fdsize);
 310        cmsg->cmsg_level = SOL_SOCKET;
 311        cmsg->cmsg_type = SCM_RIGHTS;
 312        memcpy(CMSG_DATA(cmsg), vmsg->fds, fdsize);
 313    } else {
 314        msg.msg_controllen = 0;
 315    }
 316
 317    do {
 318        rc = sendmsg(conn_fd, &msg, 0);
 319    } while (rc < 0 && (errno == EINTR || errno == EAGAIN));
 320
 321    if (vmsg->size) {
 322        do {
 323            if (vmsg->data) {
 324                rc = write(conn_fd, vmsg->data, vmsg->size);
 325            } else {
 326                rc = write(conn_fd, p + VHOST_USER_HDR_SIZE, vmsg->size);
 327            }
 328        } while (rc < 0 && (errno == EINTR || errno == EAGAIN));
 329    }
 330
 331    if (rc <= 0) {
 332        vu_panic(dev, "Error while writing: %s", strerror(errno));
 333        return false;
 334    }
 335
 336    return true;
 337}
 338
 339static bool
 340vu_send_reply(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
 341{
 342    /* Set the version in the flags when sending the reply */
 343    vmsg->flags &= ~VHOST_USER_VERSION_MASK;
 344    vmsg->flags |= VHOST_USER_VERSION;
 345    vmsg->flags |= VHOST_USER_REPLY_MASK;
 346
 347    return vu_message_write(dev, conn_fd, vmsg);
 348}
 349
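     /* If the message was sent with VHOST_USER_NEED_REPLY_MASK, read the
      * reply from the slave channel and check that it acknowledges success
      * (payload.u64 == 0). */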
 350static bool
 351vu_process_message_reply(VuDev *dev, const VhostUserMsg *vmsg)
 352{
 353    VhostUserMsg msg_reply;
 354
 355    if ((vmsg->flags & VHOST_USER_NEED_REPLY_MASK) == 0) {
 356        return true;
 357    }
 358
 359    if (!vu_message_read(dev, dev->slave_fd, &msg_reply)) {
 360        return false;
 361    }
 362
 363    if (msg_reply.request != vmsg->request) {
 364        DPRINT("Received unexpected msg type. Expected %d received %d",
 365               vmsg->request, msg_reply.request);
 366        return false;
 367    }
 368
 369    return msg_reply.payload.u64 == 0;
 370}
 371
 372/* Kick the log_call_fd if required. */
 373static void
 374vu_log_kick(VuDev *dev)
 375{
 376    if (dev->log_call_fd != -1) {
 377        DPRINT("Kicking the QEMU's log...\n");
 378        if (eventfd_write(dev->log_call_fd, 1) < 0) {
 379            vu_panic(dev, "Error writing eventfd: %s", strerror(errno));
 380        }
 381    }
 382}
 383
 384static void
 385vu_log_page(uint8_t *log_table, uint64_t page)
 386{
 387    DPRINT("Logged dirty guest page: %"PRId64"\n", page);
 388    atomic_or(&log_table[page / 8], 1 << (page % 8));
 389}
 390
 391static void
 392vu_log_write(VuDev *dev, uint64_t address, uint64_t length)
 393{
 394    uint64_t page;
 395
 396    if (!(dev->features & (1ULL << VHOST_F_LOG_ALL)) ||
 397        !dev->log_table || !length) {
 398        return;
 399    }
 400
 401    assert(dev->log_size > ((address + length - 1) / VHOST_LOG_PAGE / 8));
 402
 403    page = address / VHOST_LOG_PAGE;
 404    while (page * VHOST_LOG_PAGE < address + length) {
 405        vu_log_page(dev->log_table, page);
  406        page += 1;
 407    }
 408
 409    vu_log_kick(dev);
 410}
 411
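     /* Watch callback for a virtqueue's kick eventfd: drain the eventfd and
      * invoke the queue's handler, if one is set. */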
 412static void
 413vu_kick_cb(VuDev *dev, int condition, void *data)
 414{
 415    int index = (intptr_t)data;
 416    VuVirtq *vq = &dev->vq[index];
 417    int sock = vq->kick_fd;
 418    eventfd_t kick_data;
 419    ssize_t rc;
 420
 421    rc = eventfd_read(sock, &kick_data);
 422    if (rc == -1) {
 423        vu_panic(dev, "kick eventfd_read(): %s", strerror(errno));
 424        dev->remove_watch(dev, dev->vq[index].kick_fd);
 425    } else {
 426        DPRINT("Got kick_data: %016"PRIx64" handler:%p idx:%d\n",
 427               kick_data, vq->handler, index);
 428        if (vq->handler) {
 429            vq->handler(dev, index);
 430        }
 431    }
 432}
 433
 434static bool
 435vu_get_features_exec(VuDev *dev, VhostUserMsg *vmsg)
 436{
 437    vmsg->payload.u64 =
 438        1ULL << VHOST_F_LOG_ALL |
 439        1ULL << VHOST_USER_F_PROTOCOL_FEATURES;
 440
 441    if (dev->iface->get_features) {
 442        vmsg->payload.u64 |= dev->iface->get_features(dev);
 443    }
 444
 445    vmsg->size = sizeof(vmsg->payload.u64);
 446    vmsg->fd_num = 0;
 447
 448    DPRINT("Sending back to guest u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
 449
 450    return true;
 451}
 452
 453static void
 454vu_set_enable_all_rings(VuDev *dev, bool enabled)
 455{
 456    int i;
 457
 458    for (i = 0; i < VHOST_MAX_NR_VIRTQUEUE; i++) {
 459        dev->vq[i].enable = enabled;
 460    }
 461}
 462
 463static bool
 464vu_set_features_exec(VuDev *dev, VhostUserMsg *vmsg)
 465{
 466    DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
 467
 468    dev->features = vmsg->payload.u64;
 469
 470    if (!(dev->features & VHOST_USER_F_PROTOCOL_FEATURES)) {
 471        vu_set_enable_all_rings(dev, true);
 472    }
 473
 474    if (dev->iface->set_features) {
 475        dev->iface->set_features(dev, dev->features);
 476    }
 477
 478    return false;
 479}
 480
 481static bool
 482vu_set_owner_exec(VuDev *dev, VhostUserMsg *vmsg)
 483{
 484    return false;
 485}
 486
 487static void
 488vu_close_log(VuDev *dev)
 489{
 490    if (dev->log_table) {
 491        if (munmap(dev->log_table, dev->log_size) != 0) {
 492            perror("close log munmap() error");
 493        }
 494
 495        dev->log_table = NULL;
 496    }
 497    if (dev->log_call_fd != -1) {
 498        close(dev->log_call_fd);
 499        dev->log_call_fd = -1;
 500    }
 501}
 502
 503static bool
 504vu_reset_device_exec(VuDev *dev, VhostUserMsg *vmsg)
 505{
 506    vu_set_enable_all_rings(dev, false);
 507
 508    return false;
 509}
 510
 511static bool
 512vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg)
 513{
 514    int i;
 515    VhostUserMemory *memory = &vmsg->payload.memory;
 516    dev->nregions = memory->nregions;
 517
 518    DPRINT("Nregions: %d\n", memory->nregions);
 519    for (i = 0; i < dev->nregions; i++) {
 520        void *mmap_addr;
 521        VhostUserMemoryRegion *msg_region = &memory->regions[i];
 522        VuDevRegion *dev_region = &dev->regions[i];
 523
 524        DPRINT("Region %d\n", i);
 525        DPRINT("    guest_phys_addr: 0x%016"PRIx64"\n",
 526               msg_region->guest_phys_addr);
 527        DPRINT("    memory_size:     0x%016"PRIx64"\n",
 528               msg_region->memory_size);
 529        DPRINT("    userspace_addr   0x%016"PRIx64"\n",
 530               msg_region->userspace_addr);
 531        DPRINT("    mmap_offset      0x%016"PRIx64"\n",
 532               msg_region->mmap_offset);
 533
 534        dev_region->gpa = msg_region->guest_phys_addr;
 535        dev_region->size = msg_region->memory_size;
 536        dev_region->qva = msg_region->userspace_addr;
 537        dev_region->mmap_offset = msg_region->mmap_offset;
 538
  539        /* We don't use the offset argument of mmap() since the
 540         * mapped address has to be page aligned, and we use huge
 541         * pages.
 542         * In postcopy we're using PROT_NONE here to catch anyone
  543         * accessing it before we userfault it.
 544         */
 545        mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
 546                         PROT_NONE, MAP_SHARED,
 547                         vmsg->fds[i], 0);
 548
 549        if (mmap_addr == MAP_FAILED) {
 550            vu_panic(dev, "region mmap error: %s", strerror(errno));
 551        } else {
 552            dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
 553            DPRINT("    mmap_addr:       0x%016"PRIx64"\n",
 554                   dev_region->mmap_addr);
 555        }
 556
 557        /* Return the address to QEMU so that it can translate the ufd
 558         * fault addresses back.
 559         */
 560        msg_region->userspace_addr = (uintptr_t)(mmap_addr +
 561                                                 dev_region->mmap_offset);
 562        close(vmsg->fds[i]);
 563    }
 564
 565    /* Send the message back to qemu with the addresses filled in */
 566    vmsg->fd_num = 0;
 567    if (!vu_send_reply(dev, dev->sock, vmsg)) {
 568        vu_panic(dev, "failed to respond to set-mem-table for postcopy");
 569        return false;
 570    }
 571
 572    /* Wait for QEMU to confirm that it's registered the handler for the
 573     * faults.
 574     */
 575    if (!vu_message_read(dev, dev->sock, vmsg) ||
 576        vmsg->size != sizeof(vmsg->payload.u64) ||
 577        vmsg->payload.u64 != 0) {
 578        vu_panic(dev, "failed to receive valid ack for postcopy set-mem-table");
 579        return false;
 580    }
 581
 582    /* OK, now we can go and register the memory and generate faults */
 583    for (i = 0; i < dev->nregions; i++) {
 584        VuDevRegion *dev_region = &dev->regions[i];
 585        int ret;
 586#ifdef UFFDIO_REGISTER
  587        /* We should already have an open ufd.  Register each memory
  588         * range with it.
  589         * Discard any mapping we have here; note we can't use MADV_REMOVE
  590         * or fallocate to punch the hole since we don't want to lose
  591         * data that has already arrived in the shared process.
  592         * TODO: How to handle hugepages?
  593         */
 594        ret = madvise((void *)dev_region->mmap_addr,
 595                      dev_region->size + dev_region->mmap_offset,
 596                      MADV_DONTNEED);
 597        if (ret) {
 598            fprintf(stderr,
 599                    "%s: Failed to madvise(DONTNEED) region %d: %s\n",
 600                    __func__, i, strerror(errno));
 601        }
  602        /* Turn off transparent hugepages so we don't lose wakeups
  603         * in neighbouring pages.
  604         * TODO: Turn this back on later.
  605         */
 606        ret = madvise((void *)dev_region->mmap_addr,
 607                      dev_region->size + dev_region->mmap_offset,
 608                      MADV_NOHUGEPAGE);
 609        if (ret) {
 610            /* Note: This can happen legally on kernels that are configured
 611             * without madvise'able hugepages
 612             */
 613            fprintf(stderr,
 614                    "%s: Failed to madvise(NOHUGEPAGE) region %d: %s\n",
 615                    __func__, i, strerror(errno));
 616        }
 617        struct uffdio_register reg_struct;
 618        reg_struct.range.start = (uintptr_t)dev_region->mmap_addr;
 619        reg_struct.range.len = dev_region->size + dev_region->mmap_offset;
 620        reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
 621
 622        if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER, &reg_struct)) {
 623            vu_panic(dev, "%s: Failed to userfault region %d "
 624                          "@%p + size:%zx offset: %zx: (ufd=%d)%s\n",
 625                     __func__, i,
 626                     dev_region->mmap_addr,
 627                     dev_region->size, dev_region->mmap_offset,
 628                     dev->postcopy_ufd, strerror(errno));
 629            return false;
 630        }
 631        if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
 632            vu_panic(dev, "%s Region (%d) doesn't support COPY",
 633                     __func__, i);
 634            return false;
 635        }
 636        DPRINT("%s: region %d: Registered userfault for %llx + %llx\n",
 637                __func__, i, reg_struct.range.start, reg_struct.range.len);
  638        /* Now that it's registered we can let the client at it */
 639        if (mprotect((void *)dev_region->mmap_addr,
 640                     dev_region->size + dev_region->mmap_offset,
 641                     PROT_READ | PROT_WRITE)) {
 642            vu_panic(dev, "failed to mprotect region %d for postcopy (%s)",
 643                     i, strerror(errno));
 644            return false;
 645        }
 646        /* TODO: Stash 'zero' support flags somewhere */
 647#endif
 648    }
 649
 650    return false;
 651}
 652
 653static bool
 654vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg)
 655{
 656    int i;
 657    VhostUserMemory *memory = &vmsg->payload.memory;
 658
 659    for (i = 0; i < dev->nregions; i++) {
 660        VuDevRegion *r = &dev->regions[i];
 661        void *m = (void *) (uintptr_t) r->mmap_addr;
 662
 663        if (m) {
 664            munmap(m, r->size + r->mmap_offset);
 665        }
 666    }
 667    dev->nregions = memory->nregions;
 668
 669    if (dev->postcopy_listening) {
 670        return vu_set_mem_table_exec_postcopy(dev, vmsg);
 671    }
 672
 673    DPRINT("Nregions: %d\n", memory->nregions);
 674    for (i = 0; i < dev->nregions; i++) {
 675        void *mmap_addr;
 676        VhostUserMemoryRegion *msg_region = &memory->regions[i];
 677        VuDevRegion *dev_region = &dev->regions[i];
 678
 679        DPRINT("Region %d\n", i);
 680        DPRINT("    guest_phys_addr: 0x%016"PRIx64"\n",
 681               msg_region->guest_phys_addr);
 682        DPRINT("    memory_size:     0x%016"PRIx64"\n",
 683               msg_region->memory_size);
 684        DPRINT("    userspace_addr   0x%016"PRIx64"\n",
 685               msg_region->userspace_addr);
 686        DPRINT("    mmap_offset      0x%016"PRIx64"\n",
 687               msg_region->mmap_offset);
 688
 689        dev_region->gpa = msg_region->guest_phys_addr;
 690        dev_region->size = msg_region->memory_size;
 691        dev_region->qva = msg_region->userspace_addr;
 692        dev_region->mmap_offset = msg_region->mmap_offset;
 693
  694        /* We don't use the offset argument of mmap() since the
 695         * mapped address has to be page aligned, and we use huge
 696         * pages.  */
 697        mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
 698                         PROT_READ | PROT_WRITE, MAP_SHARED,
 699                         vmsg->fds[i], 0);
 700
 701        if (mmap_addr == MAP_FAILED) {
 702            vu_panic(dev, "region mmap error: %s", strerror(errno));
 703        } else {
 704            dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
 705            DPRINT("    mmap_addr:       0x%016"PRIx64"\n",
 706                   dev_region->mmap_addr);
 707        }
 708
 709        close(vmsg->fds[i]);
 710    }
 711
 712    return false;
 713}
 714
 715static bool
 716vu_set_log_base_exec(VuDev *dev, VhostUserMsg *vmsg)
 717{
 718    int fd;
 719    uint64_t log_mmap_size, log_mmap_offset;
 720    void *rc;
 721
 722    if (vmsg->fd_num != 1 ||
 723        vmsg->size != sizeof(vmsg->payload.log)) {
 724        vu_panic(dev, "Invalid log_base message");
 725        return true;
 726    }
 727
 728    fd = vmsg->fds[0];
 729    log_mmap_offset = vmsg->payload.log.mmap_offset;
 730    log_mmap_size = vmsg->payload.log.mmap_size;
 731    DPRINT("Log mmap_offset: %"PRId64"\n", log_mmap_offset);
 732    DPRINT("Log mmap_size:   %"PRId64"\n", log_mmap_size);
 733
 734    rc = mmap(0, log_mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd,
 735              log_mmap_offset);
 736    close(fd);
 737    if (rc == MAP_FAILED) {
 738        perror("log mmap error");
 739    }
 740
 741    if (dev->log_table) {
 742        munmap(dev->log_table, dev->log_size);
 743    }
 744    dev->log_table = rc;
 745    dev->log_size = log_mmap_size;
 746
 747    vmsg->size = sizeof(vmsg->payload.u64);
 748    vmsg->fd_num = 0;
 749
 750    return true;
 751}
 752
 753static bool
 754vu_set_log_fd_exec(VuDev *dev, VhostUserMsg *vmsg)
 755{
 756    if (vmsg->fd_num != 1) {
 757        vu_panic(dev, "Invalid log_fd message");
 758        return false;
 759    }
 760
 761    if (dev->log_call_fd != -1) {
 762        close(dev->log_call_fd);
 763    }
 764    dev->log_call_fd = vmsg->fds[0];
 765    DPRINT("Got log_call_fd: %d\n", vmsg->fds[0]);
 766
 767    return false;
 768}
 769
 770static bool
 771vu_set_vring_num_exec(VuDev *dev, VhostUserMsg *vmsg)
 772{
 773    unsigned int index = vmsg->payload.state.index;
 774    unsigned int num = vmsg->payload.state.num;
 775
 776    DPRINT("State.index: %d\n", index);
 777    DPRINT("State.num:   %d\n", num);
 778    dev->vq[index].vring.num = num;
 779
 780    return false;
 781}
 782
 783static bool
 784vu_set_vring_addr_exec(VuDev *dev, VhostUserMsg *vmsg)
 785{
 786    struct vhost_vring_addr *vra = &vmsg->payload.addr;
 787    unsigned int index = vra->index;
 788    VuVirtq *vq = &dev->vq[index];
 789
 790    DPRINT("vhost_vring_addr:\n");
 791    DPRINT("    index:  %d\n", vra->index);
 792    DPRINT("    flags:  %d\n", vra->flags);
 793    DPRINT("    desc_user_addr:   0x%016llx\n", vra->desc_user_addr);
 794    DPRINT("    used_user_addr:   0x%016llx\n", vra->used_user_addr);
 795    DPRINT("    avail_user_addr:  0x%016llx\n", vra->avail_user_addr);
 796    DPRINT("    log_guest_addr:   0x%016llx\n", vra->log_guest_addr);
 797
 798    vq->vring.flags = vra->flags;
 799    vq->vring.desc = qva_to_va(dev, vra->desc_user_addr);
 800    vq->vring.used = qva_to_va(dev, vra->used_user_addr);
 801    vq->vring.avail = qva_to_va(dev, vra->avail_user_addr);
 802    vq->vring.log_guest_addr = vra->log_guest_addr;
 803
 804    DPRINT("Setting virtq addresses:\n");
 805    DPRINT("    vring_desc  at %p\n", vq->vring.desc);
 806    DPRINT("    vring_used  at %p\n", vq->vring.used);
 807    DPRINT("    vring_avail at %p\n", vq->vring.avail);
 808
 809    if (!(vq->vring.desc && vq->vring.used && vq->vring.avail)) {
 810        vu_panic(dev, "Invalid vring_addr message");
 811        return false;
 812    }
 813
 814    vq->used_idx = vq->vring.used->idx;
 815
 816    if (vq->last_avail_idx != vq->used_idx) {
 817        bool resume = dev->iface->queue_is_processed_in_order &&
 818            dev->iface->queue_is_processed_in_order(dev, index);
 819
 820        DPRINT("Last avail index != used index: %u != %u%s\n",
 821               vq->last_avail_idx, vq->used_idx,
 822               resume ? ", resuming" : "");
 823
 824        if (resume) {
 825            vq->shadow_avail_idx = vq->last_avail_idx = vq->used_idx;
 826        }
 827    }
 828
 829    return false;
 830}
 831
 832static bool
 833vu_set_vring_base_exec(VuDev *dev, VhostUserMsg *vmsg)
 834{
 835    unsigned int index = vmsg->payload.state.index;
 836    unsigned int num = vmsg->payload.state.num;
 837
 838    DPRINT("State.index: %d\n", index);
 839    DPRINT("State.num:   %d\n", num);
 840    dev->vq[index].shadow_avail_idx = dev->vq[index].last_avail_idx = num;
 841
 842    return false;
 843}
 844
 845static bool
 846vu_get_vring_base_exec(VuDev *dev, VhostUserMsg *vmsg)
 847{
 848    unsigned int index = vmsg->payload.state.index;
 849
 850    DPRINT("State.index: %d\n", index);
 851    vmsg->payload.state.num = dev->vq[index].last_avail_idx;
 852    vmsg->size = sizeof(vmsg->payload.state);
 853
 854    dev->vq[index].started = false;
 855    if (dev->iface->queue_set_started) {
 856        dev->iface->queue_set_started(dev, index, false);
 857    }
 858
 859    if (dev->vq[index].call_fd != -1) {
 860        close(dev->vq[index].call_fd);
 861        dev->vq[index].call_fd = -1;
 862    }
 863    if (dev->vq[index].kick_fd != -1) {
 864        dev->remove_watch(dev, dev->vq[index].kick_fd);
 865        close(dev->vq[index].kick_fd);
 866        dev->vq[index].kick_fd = -1;
 867    }
 868
 869    return true;
 870}
 871
 872static bool
 873vu_check_queue_msg_file(VuDev *dev, VhostUserMsg *vmsg)
 874{
 875    int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
 876
 877    if (index >= VHOST_MAX_NR_VIRTQUEUE) {
 878        vmsg_close_fds(vmsg);
 879        vu_panic(dev, "Invalid queue index: %u", index);
 880        return false;
 881    }
 882
 883    if (vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK ||
 884        vmsg->fd_num != 1) {
 885        vmsg_close_fds(vmsg);
 886        vu_panic(dev, "Invalid fds in request: %d", vmsg->request);
 887        return false;
 888    }
 889
 890    return true;
 891}
 892
 893static bool
 894vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg)
 895{
 896    int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
 897
 898    DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
 899
 900    if (!vu_check_queue_msg_file(dev, vmsg)) {
 901        return false;
 902    }
 903
 904    if (dev->vq[index].kick_fd != -1) {
 905        dev->remove_watch(dev, dev->vq[index].kick_fd);
 906        close(dev->vq[index].kick_fd);
 907        dev->vq[index].kick_fd = -1;
 908    }
 909
 910    if (!(vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)) {
 911        dev->vq[index].kick_fd = vmsg->fds[0];
 912        DPRINT("Got kick_fd: %d for vq: %d\n", vmsg->fds[0], index);
 913    }
 914
 915    dev->vq[index].started = true;
 916    if (dev->iface->queue_set_started) {
 917        dev->iface->queue_set_started(dev, index, true);
 918    }
 919
 920    if (dev->vq[index].kick_fd != -1 && dev->vq[index].handler) {
 921        dev->set_watch(dev, dev->vq[index].kick_fd, VU_WATCH_IN,
 922                       vu_kick_cb, (void *)(long)index);
 923
 924        DPRINT("Waiting for kicks on fd: %d for vq: %d\n",
 925               dev->vq[index].kick_fd, index);
 926    }
 927
 928    return false;
 929}
 930
 931void vu_set_queue_handler(VuDev *dev, VuVirtq *vq,
 932                          vu_queue_handler_cb handler)
 933{
 934    int qidx = vq - dev->vq;
 935
 936    vq->handler = handler;
 937    if (vq->kick_fd >= 0) {
 938        if (handler) {
 939            dev->set_watch(dev, vq->kick_fd, VU_WATCH_IN,
 940                           vu_kick_cb, (void *)(long)qidx);
 941        } else {
 942            dev->remove_watch(dev, vq->kick_fd);
 943        }
 944    }
 945}
 946
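     /* Ask QEMU, over the slave channel, to map the given fd/size/offset as
      * the host notifier for this queue (fd == -1 removes it).  Requires the
      * SLAVE_SEND_FD protocol feature to have been negotiated. */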
 947bool vu_set_queue_host_notifier(VuDev *dev, VuVirtq *vq, int fd,
 948                                int size, int offset)
 949{
 950    int qidx = vq - dev->vq;
 951    int fd_num = 0;
 952    VhostUserMsg vmsg = {
 953        .request = VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG,
 954        .flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY_MASK,
 955        .size = sizeof(vmsg.payload.area),
 956        .payload.area = {
 957            .u64 = qidx & VHOST_USER_VRING_IDX_MASK,
 958            .size = size,
 959            .offset = offset,
 960        },
 961    };
 962
 963    if (fd == -1) {
 964        vmsg.payload.area.u64 |= VHOST_USER_VRING_NOFD_MASK;
 965    } else {
 966        vmsg.fds[fd_num++] = fd;
 967    }
 968
 969    vmsg.fd_num = fd_num;
 970
 971    if ((dev->protocol_features & VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD) == 0) {
 972        return false;
 973    }
 974
 975    if (!vu_message_write(dev, dev->slave_fd, &vmsg)) {
 976        return false;
 977    }
 978
 979    return vu_process_message_reply(dev, &vmsg);
 980}
 981
 982static bool
 983vu_set_vring_call_exec(VuDev *dev, VhostUserMsg *vmsg)
 984{
 985    int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
 986
 987    DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
 988
 989    if (!vu_check_queue_msg_file(dev, vmsg)) {
 990        return false;
 991    }
 992
 993    if (dev->vq[index].call_fd != -1) {
 994        close(dev->vq[index].call_fd);
 995        dev->vq[index].call_fd = -1;
 996    }
 997
 998    if (!(vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)) {
 999        dev->vq[index].call_fd = vmsg->fds[0];
1000    }
1001
1002    DPRINT("Got call_fd: %d for vq: %d\n", vmsg->fds[0], index);
1003
1004    return false;
1005}
1006
1007static bool
1008vu_set_vring_err_exec(VuDev *dev, VhostUserMsg *vmsg)
1009{
1010    int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
1011
1012    DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
1013
1014    if (!vu_check_queue_msg_file(dev, vmsg)) {
1015        return false;
1016    }
1017
1018    if (dev->vq[index].err_fd != -1) {
1019        close(dev->vq[index].err_fd);
1020        dev->vq[index].err_fd = -1;
1021    }
1022
1023    if (!(vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)) {
1024        dev->vq[index].err_fd = vmsg->fds[0];
1025    }
1026
1027    return false;
1028}
1029
1030static bool
1031vu_get_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg)
1032{
1033    uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD |
1034                        1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ |
1035                        1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER |
1036                        1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD;
1037
1038    if (have_userfault()) {
1039        features |= 1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT;
1040    }
1041
1042    if (dev->iface->get_protocol_features) {
1043        features |= dev->iface->get_protocol_features(dev);
1044    }
1045
1046    vmsg->payload.u64 = features;
1047    vmsg->size = sizeof(vmsg->payload.u64);
1048    vmsg->fd_num = 0;
1049
1050    return true;
1051}
1052
1053static bool
1054vu_set_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg)
1055{
1056    uint64_t features = vmsg->payload.u64;
1057
1058    DPRINT("u64: 0x%016"PRIx64"\n", features);
1059
1060    dev->protocol_features = vmsg->payload.u64;
1061
1062    if (dev->iface->set_protocol_features) {
1063        dev->iface->set_protocol_features(dev, features);
1064    }
1065
1066    return false;
1067}
1068
1069static bool
1070vu_get_queue_num_exec(VuDev *dev, VhostUserMsg *vmsg)
1071{
1072    DPRINT("Function %s() not implemented yet.\n", __func__);
1073    return false;
1074}
1075
1076static bool
1077vu_set_vring_enable_exec(VuDev *dev, VhostUserMsg *vmsg)
1078{
1079    unsigned int index = vmsg->payload.state.index;
1080    unsigned int enable = vmsg->payload.state.num;
1081
1082    DPRINT("State.index: %d\n", index);
1083    DPRINT("State.enable:   %d\n", enable);
1084
1085    if (index >= VHOST_MAX_NR_VIRTQUEUE) {
1086        vu_panic(dev, "Invalid vring_enable index: %u", index);
1087        return false;
1088    }
1089
1090    dev->vq[index].enable = enable;
1091    return false;
1092}
1093
1094static bool
1095vu_set_slave_req_fd(VuDev *dev, VhostUserMsg *vmsg)
1096{
1097    if (vmsg->fd_num != 1) {
1098        vu_panic(dev, "Invalid slave_req_fd message (%d fd's)", vmsg->fd_num);
1099        return false;
1100    }
1101
1102    if (dev->slave_fd != -1) {
1103        close(dev->slave_fd);
1104    }
1105    dev->slave_fd = vmsg->fds[0];
1106    DPRINT("Got slave_fd: %d\n", vmsg->fds[0]);
1107
1108    return false;
1109}
1110
1111static bool
1112vu_get_config(VuDev *dev, VhostUserMsg *vmsg)
1113{
1114    int ret = -1;
1115
1116    if (dev->iface->get_config) {
1117        ret = dev->iface->get_config(dev, vmsg->payload.config.region,
1118                                     vmsg->payload.config.size);
1119    }
1120
1121    if (ret) {
1122        /* resize to zero to indicate an error to master */
1123        vmsg->size = 0;
1124    }
1125
1126    return true;
1127}
1128
1129static bool
1130vu_set_config(VuDev *dev, VhostUserMsg *vmsg)
1131{
1132    int ret = -1;
1133
1134    if (dev->iface->set_config) {
1135        ret = dev->iface->set_config(dev, vmsg->payload.config.region,
1136                                     vmsg->payload.config.offset,
1137                                     vmsg->payload.config.size,
1138                                     vmsg->payload.config.flags);
1139        if (ret) {
1140            vu_panic(dev, "Set virtio configuration space failed");
1141        }
1142    }
1143
1144    return false;
1145}
1146
1147static bool
1148vu_set_postcopy_advise(VuDev *dev, VhostUserMsg *vmsg)
1149{
1150    dev->postcopy_ufd = -1;
1151#ifdef UFFDIO_API
1152    struct uffdio_api api_struct;
1153
1154    dev->postcopy_ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
1155    vmsg->size = 0;
1156#endif
1157
1158    if (dev->postcopy_ufd == -1) {
1159        vu_panic(dev, "Userfaultfd not available: %s", strerror(errno));
1160        goto out;
1161    }
1162
1163#ifdef UFFDIO_API
1164    api_struct.api = UFFD_API;
1165    api_struct.features = 0;
1166    if (ioctl(dev->postcopy_ufd, UFFDIO_API, &api_struct)) {
1167        vu_panic(dev, "Failed UFFDIO_API: %s", strerror(errno));
1168        close(dev->postcopy_ufd);
1169        dev->postcopy_ufd = -1;
1170        goto out;
1171    }
1172    /* TODO: Stash feature flags somewhere */
1173#endif
1174
1175out:
 1176    /* Return a ufd to QEMU */
1177    vmsg->fd_num = 1;
1178    vmsg->fds[0] = dev->postcopy_ufd;
1179    return true; /* = send a reply */
1180}
1181
1182static bool
1183vu_set_postcopy_listen(VuDev *dev, VhostUserMsg *vmsg)
1184{
1185    vmsg->payload.u64 = -1;
1186    vmsg->size = sizeof(vmsg->payload.u64);
1187
1188    if (dev->nregions) {
1189        vu_panic(dev, "Regions already registered at postcopy-listen");
1190        return true;
1191    }
1192    dev->postcopy_listening = true;
1193
1194    vmsg->flags = VHOST_USER_VERSION |  VHOST_USER_REPLY_MASK;
1195    vmsg->payload.u64 = 0; /* Success */
1196    return true;
1197}
1198
1199static bool
1200vu_set_postcopy_end(VuDev *dev, VhostUserMsg *vmsg)
1201{
1202    DPRINT("%s: Entry\n", __func__);
1203    dev->postcopy_listening = false;
1204    if (dev->postcopy_ufd > 0) {
1205        close(dev->postcopy_ufd);
1206        dev->postcopy_ufd = -1;
1207        DPRINT("%s: Done close\n", __func__);
1208    }
1209
1210    vmsg->fd_num = 0;
1211    vmsg->payload.u64 = 0;
1212    vmsg->size = sizeof(vmsg->payload.u64);
1213    vmsg->flags = VHOST_USER_VERSION |  VHOST_USER_REPLY_MASK;
1214    DPRINT("%s: exit\n", __func__);
1215    return true;
1216}
1217
1218static bool
1219vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
1220{
1221    int do_reply = 0;
1222
1223    /* Print out generic part of the request. */
1224    DPRINT("================ Vhost user message ================\n");
1225    DPRINT("Request: %s (%d)\n", vu_request_to_string(vmsg->request),
1226           vmsg->request);
1227    DPRINT("Flags:   0x%x\n", vmsg->flags);
1228    DPRINT("Size:    %d\n", vmsg->size);
1229
1230    if (vmsg->fd_num) {
1231        int i;
1232        DPRINT("Fds:");
1233        for (i = 0; i < vmsg->fd_num; i++) {
1234            DPRINT(" %d", vmsg->fds[i]);
1235        }
1236        DPRINT("\n");
1237    }
1238
1239    if (dev->iface->process_msg &&
1240        dev->iface->process_msg(dev, vmsg, &do_reply)) {
1241        return do_reply;
1242    }
1243
1244    switch (vmsg->request) {
1245    case VHOST_USER_GET_FEATURES:
1246        return vu_get_features_exec(dev, vmsg);
1247    case VHOST_USER_SET_FEATURES:
1248        return vu_set_features_exec(dev, vmsg);
1249    case VHOST_USER_GET_PROTOCOL_FEATURES:
1250        return vu_get_protocol_features_exec(dev, vmsg);
1251    case VHOST_USER_SET_PROTOCOL_FEATURES:
1252        return vu_set_protocol_features_exec(dev, vmsg);
1253    case VHOST_USER_SET_OWNER:
1254        return vu_set_owner_exec(dev, vmsg);
1255    case VHOST_USER_RESET_OWNER:
1256        return vu_reset_device_exec(dev, vmsg);
1257    case VHOST_USER_SET_MEM_TABLE:
1258        return vu_set_mem_table_exec(dev, vmsg);
1259    case VHOST_USER_SET_LOG_BASE:
1260        return vu_set_log_base_exec(dev, vmsg);
1261    case VHOST_USER_SET_LOG_FD:
1262        return vu_set_log_fd_exec(dev, vmsg);
1263    case VHOST_USER_SET_VRING_NUM:
1264        return vu_set_vring_num_exec(dev, vmsg);
1265    case VHOST_USER_SET_VRING_ADDR:
1266        return vu_set_vring_addr_exec(dev, vmsg);
1267    case VHOST_USER_SET_VRING_BASE:
1268        return vu_set_vring_base_exec(dev, vmsg);
1269    case VHOST_USER_GET_VRING_BASE:
1270        return vu_get_vring_base_exec(dev, vmsg);
1271    case VHOST_USER_SET_VRING_KICK:
1272        return vu_set_vring_kick_exec(dev, vmsg);
1273    case VHOST_USER_SET_VRING_CALL:
1274        return vu_set_vring_call_exec(dev, vmsg);
1275    case VHOST_USER_SET_VRING_ERR:
1276        return vu_set_vring_err_exec(dev, vmsg);
1277    case VHOST_USER_GET_QUEUE_NUM:
1278        return vu_get_queue_num_exec(dev, vmsg);
1279    case VHOST_USER_SET_VRING_ENABLE:
1280        return vu_set_vring_enable_exec(dev, vmsg);
1281    case VHOST_USER_SET_SLAVE_REQ_FD:
1282        return vu_set_slave_req_fd(dev, vmsg);
1283    case VHOST_USER_GET_CONFIG:
1284        return vu_get_config(dev, vmsg);
1285    case VHOST_USER_SET_CONFIG:
1286        return vu_set_config(dev, vmsg);
1287    case VHOST_USER_NONE:
1288        break;
1289    case VHOST_USER_POSTCOPY_ADVISE:
1290        return vu_set_postcopy_advise(dev, vmsg);
1291    case VHOST_USER_POSTCOPY_LISTEN:
1292        return vu_set_postcopy_listen(dev, vmsg);
1293    case VHOST_USER_POSTCOPY_END:
1294        return vu_set_postcopy_end(dev, vmsg);
1295    default:
1296        vmsg_close_fds(vmsg);
1297        vu_panic(dev, "Unhandled request: %d", vmsg->request);
1298    }
1299
1300    return false;
1301}
1302
1303bool
1304vu_dispatch(VuDev *dev)
1305{
1306    VhostUserMsg vmsg = { 0, };
1307    int reply_requested;
1308    bool success = false;
1309
1310    if (!vu_message_read(dev, dev->sock, &vmsg)) {
1311        goto end;
1312    }
1313
1314    reply_requested = vu_process_message(dev, &vmsg);
1315    if (!reply_requested) {
1316        success = true;
1317        goto end;
1318    }
1319
1320    if (!vu_send_reply(dev, dev->sock, &vmsg)) {
1321        goto end;
1322    }
1323
1324    success = true;
1325
1326end:
1327    free(vmsg.data);
1328    return success;
1329}
1330
1331void
1332vu_deinit(VuDev *dev)
1333{
1334    int i;
1335
1336    for (i = 0; i < dev->nregions; i++) {
1337        VuDevRegion *r = &dev->regions[i];
1338        void *m = (void *) (uintptr_t) r->mmap_addr;
1339        if (m != MAP_FAILED) {
1340            munmap(m, r->size + r->mmap_offset);
1341        }
1342    }
1343    dev->nregions = 0;
1344
1345    for (i = 0; i < VHOST_MAX_NR_VIRTQUEUE; i++) {
1346        VuVirtq *vq = &dev->vq[i];
1347
1348        if (vq->call_fd != -1) {
1349            close(vq->call_fd);
1350            vq->call_fd = -1;
1351        }
1352
1353        if (vq->kick_fd != -1) {
1354            close(vq->kick_fd);
1355            vq->kick_fd = -1;
1356        }
1357
1358        if (vq->err_fd != -1) {
1359            close(vq->err_fd);
1360            vq->err_fd = -1;
1361        }
1362    }
1363
1364
1365    vu_close_log(dev);
1366    if (dev->slave_fd != -1) {
1367        close(dev->slave_fd);
1368        dev->slave_fd = -1;
1369    }
1370
1371    if (dev->sock != -1) {
1372        close(dev->sock);
1373    }
1374}
1375
1376void
1377vu_init(VuDev *dev,
1378        int socket,
1379        vu_panic_cb panic,
1380        vu_set_watch_cb set_watch,
1381        vu_remove_watch_cb remove_watch,
1382        const VuDevIface *iface)
1383{
1384    int i;
1385
1386    assert(socket >= 0);
1387    assert(set_watch);
1388    assert(remove_watch);
1389    assert(iface);
1390    assert(panic);
1391
1392    memset(dev, 0, sizeof(*dev));
1393
1394    dev->sock = socket;
1395    dev->panic = panic;
1396    dev->set_watch = set_watch;
1397    dev->remove_watch = remove_watch;
1398    dev->iface = iface;
1399    dev->log_call_fd = -1;
1400    dev->slave_fd = -1;
1401    for (i = 0; i < VHOST_MAX_NR_VIRTQUEUE; i++) {
1402        dev->vq[i] = (VuVirtq) {
1403            .call_fd = -1, .kick_fd = -1, .err_fd = -1,
1404            .notification = true,
1405        };
1406    }
1407}
1408
1409VuVirtq *
1410vu_get_queue(VuDev *dev, int qidx)
1411{
1412    assert(qidx < VHOST_MAX_NR_VIRTQUEUE);
1413    return &dev->vq[qidx];
1414}
1415
1416bool
1417vu_queue_enabled(VuDev *dev, VuVirtq *vq)
1418{
1419    return vq->enable;
1420}
1421
1422bool
1423vu_queue_started(const VuDev *dev, const VuVirtq *vq)
1424{
1425    return vq->started;
1426}
1427
1428static inline uint16_t
1429vring_avail_flags(VuVirtq *vq)
1430{
1431    return vq->vring.avail->flags;
1432}
1433
1434static inline uint16_t
1435vring_avail_idx(VuVirtq *vq)
1436{
1437    vq->shadow_avail_idx = vq->vring.avail->idx;
1438
1439    return vq->shadow_avail_idx;
1440}
1441
1442static inline uint16_t
1443vring_avail_ring(VuVirtq *vq, int i)
1444{
1445    return vq->vring.avail->ring[i];
1446}
1447
1448static inline uint16_t
1449vring_get_used_event(VuVirtq *vq)
1450{
1451    return vring_avail_ring(vq, vq->vring.num);
1452}
1453
1454static int
1455virtqueue_num_heads(VuDev *dev, VuVirtq *vq, unsigned int idx)
1456{
1457    uint16_t num_heads = vring_avail_idx(vq) - idx;
1458
1459    /* Check it isn't doing very strange things with descriptor numbers. */
1460    if (num_heads > vq->vring.num) {
1461        vu_panic(dev, "Guest moved used index from %u to %u",
1462                 idx, vq->shadow_avail_idx);
1463        return -1;
1464    }
1465    if (num_heads) {
1466        /* On success, callers read a descriptor at vq->last_avail_idx.
1467         * Make sure descriptor read does not bypass avail index read. */
1468        smp_rmb();
1469    }
1470
1471    return num_heads;
1472}
1473
1474static bool
1475virtqueue_get_head(VuDev *dev, VuVirtq *vq,
1476                   unsigned int idx, unsigned int *head)
1477{
1478    /* Grab the next descriptor number they're advertising, and increment
1479     * the index we've seen. */
1480    *head = vring_avail_ring(vq, idx % vq->vring.num);
1481
1482    /* If their number is silly, that's a fatal mistake. */
1483    if (*head >= vq->vring.num) {
1484        vu_panic(dev, "Guest says index %u is available", head);
1485        return false;
1486    }
1487
1488    return true;
1489}
1490
1491static int
1492virtqueue_read_indirect_desc(VuDev *dev, struct vring_desc *desc,
1493                             uint64_t addr, size_t len)
1494{
1495    struct vring_desc *ori_desc;
1496    uint64_t read_len;
1497
1498    if (len > (VIRTQUEUE_MAX_SIZE * sizeof(struct vring_desc))) {
1499        return -1;
1500    }
1501
1502    if (len == 0) {
1503        return -1;
1504    }
1505
1506    while (len) {
1507        read_len = len;
1508        ori_desc = vu_gpa_to_va(dev, &read_len, addr);
1509        if (!ori_desc) {
1510            return -1;
1511        }
1512
1513        memcpy(desc, ori_desc, read_len);
1514        len -= read_len;
1515        addr += read_len;
1516        desc += read_len;
1517    }
1518
1519    return 0;
1520}
1521
1522enum {
1523    VIRTQUEUE_READ_DESC_ERROR = -1,
1524    VIRTQUEUE_READ_DESC_DONE = 0,   /* end of chain */
1525    VIRTQUEUE_READ_DESC_MORE = 1,   /* more buffers in chain */
1526};
1527
1528static int
1529virtqueue_read_next_desc(VuDev *dev, struct vring_desc *desc,
1530                         int i, unsigned int max, unsigned int *next)
1531{
1532    /* If this descriptor says it doesn't chain, we're done. */
1533    if (!(desc[i].flags & VRING_DESC_F_NEXT)) {
1534        return VIRTQUEUE_READ_DESC_DONE;
1535    }
1536
1537    /* Check they're not leading us off end of descriptors. */
1538    *next = desc[i].next;
1539    /* Make sure compiler knows to grab that: we don't want it changing! */
1540    smp_wmb();
1541
1542    if (*next >= max) {
1543        vu_panic(dev, "Desc next is %u", next);
1544        return VIRTQUEUE_READ_DESC_ERROR;
1545    }
1546
1547    return VIRTQUEUE_READ_DESC_MORE;
1548}
1549
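     /* Walk the available ring (following indirect tables) and report how
      * many device-writable (in) and device-readable (out) bytes are queued,
      * stopping once both max_in_bytes and max_out_bytes are reached. */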
1550void
1551vu_queue_get_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int *in_bytes,
1552                         unsigned int *out_bytes,
1553                         unsigned max_in_bytes, unsigned max_out_bytes)
1554{
1555    unsigned int idx;
1556    unsigned int total_bufs, in_total, out_total;
1557    int rc;
1558
1559    idx = vq->last_avail_idx;
1560
1561    total_bufs = in_total = out_total = 0;
1562    if (unlikely(dev->broken) ||
1563        unlikely(!vq->vring.avail)) {
1564        goto done;
1565    }
1566
1567    while ((rc = virtqueue_num_heads(dev, vq, idx)) > 0) {
1568        unsigned int max, desc_len, num_bufs, indirect = 0;
1569        uint64_t desc_addr, read_len;
1570        struct vring_desc *desc;
1571        struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
1572        unsigned int i;
1573
1574        max = vq->vring.num;
1575        num_bufs = total_bufs;
1576        if (!virtqueue_get_head(dev, vq, idx++, &i)) {
1577            goto err;
1578        }
1579        desc = vq->vring.desc;
1580
1581        if (desc[i].flags & VRING_DESC_F_INDIRECT) {
1582            if (desc[i].len % sizeof(struct vring_desc)) {
1583                vu_panic(dev, "Invalid size for indirect buffer table");
1584                goto err;
1585            }
1586
1587            /* If we've got too many, that implies a descriptor loop. */
1588            if (num_bufs >= max) {
1589                vu_panic(dev, "Looped descriptor");
1590                goto err;
1591            }
1592
1593            /* loop over the indirect descriptor table */
1594            indirect = 1;
1595            desc_addr = desc[i].addr;
1596            desc_len = desc[i].len;
1597            max = desc_len / sizeof(struct vring_desc);
1598            read_len = desc_len;
1599            desc = vu_gpa_to_va(dev, &read_len, desc_addr);
1600            if (unlikely(desc && read_len != desc_len)) {
1601                /* Failed to use zero copy */
1602                desc = NULL;
1603                if (!virtqueue_read_indirect_desc(dev, desc_buf,
1604                                                  desc_addr,
1605                                                  desc_len)) {
1606                    desc = desc_buf;
1607                }
1608            }
1609            if (!desc) {
1610                vu_panic(dev, "Invalid indirect buffer table");
1611                goto err;
1612            }
1613            num_bufs = i = 0;
1614        }
1615
1616        do {
1617            /* If we've got too many, that implies a descriptor loop. */
1618            if (++num_bufs > max) {
1619                vu_panic(dev, "Looped descriptor");
1620                goto err;
1621            }
1622
1623            if (desc[i].flags & VRING_DESC_F_WRITE) {
1624                in_total += desc[i].len;
1625            } else {
1626                out_total += desc[i].len;
1627            }
1628            if (in_total >= max_in_bytes && out_total >= max_out_bytes) {
1629                goto done;
1630            }
1631            rc = virtqueue_read_next_desc(dev, desc, i, max, &i);
1632        } while (rc == VIRTQUEUE_READ_DESC_MORE);
1633
1634        if (rc == VIRTQUEUE_READ_DESC_ERROR) {
1635            goto err;
1636        }
1637
1638        if (!indirect) {
1639            total_bufs = num_bufs;
1640        } else {
1641            total_bufs++;
1642        }
1643    }
1644    if (rc < 0) {
1645        goto err;
1646    }
1647done:
1648    if (in_bytes) {
1649        *in_bytes = in_total;
1650    }
1651    if (out_bytes) {
1652        *out_bytes = out_total;
1653    }
1654    return;
1655
1656err:
1657    in_total = out_total = 0;
1658    goto done;
1659}
1660
1661bool
1662vu_queue_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int in_bytes,
1663                     unsigned int out_bytes)
1664{
1665    unsigned int in_total, out_total;
1666
1667    vu_queue_get_avail_bytes(dev, vq, &in_total, &out_total,
1668                             in_bytes, out_bytes);
1669
1670    return in_bytes <= in_total && out_bytes <= out_total;
1671}
1672
1673/* Fetch avail_idx from VQ memory only when we really need to know if
 1674 * the guest has added some buffers. */
1675bool
1676vu_queue_empty(VuDev *dev, VuVirtq *vq)
1677{
1678    if (unlikely(dev->broken) ||
1679        unlikely(!vq->vring.avail)) {
1680        return true;
1681    }
1682
1683    if (vq->shadow_avail_idx != vq->last_avail_idx) {
1684        return false;
1685    }
1686
1687    return vring_avail_idx(vq) == vq->last_avail_idx;
1688}
1689
1690static inline
1691bool has_feature(uint64_t features, unsigned int fbit)
1692{
1693    assert(fbit < 64);
1694    return !!(features & (1ULL << fbit));
1695}
1696
1697static inline
1698bool vu_has_feature(VuDev *dev,
1699                    unsigned int fbit)
1700{
1701    return has_feature(dev->features, fbit);
1702}
1703
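     /* Decide whether the guest needs a used-buffer notification, honouring
      * VIRTIO_F_NOTIFY_ON_EMPTY, VIRTIO_RING_F_EVENT_IDX and the
      * VRING_AVAIL_F_NO_INTERRUPT suppression flag. */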
1704static bool
1705vring_notify(VuDev *dev, VuVirtq *vq)
1706{
1707    uint16_t old, new;
1708    bool v;
1709
1710    /* We need to expose used array entries before checking used event. */
1711    smp_mb();
1712
 1713    /* Always notify when the queue is empty (if VIRTIO_F_NOTIFY_ON_EMPTY was acknowledged) */
1714    if (vu_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) &&
1715        !vq->inuse && vu_queue_empty(dev, vq)) {
1716        return true;
1717    }
1718
1719    if (!vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
1720        return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT);
1721    }
1722
1723    v = vq->signalled_used_valid;
1724    vq->signalled_used_valid = true;
1725    old = vq->signalled_used;
1726    new = vq->signalled_used = vq->used_idx;
1727    return !v || vring_need_event(vring_get_used_event(vq), new, old);
1728}
1729
1730void
1731vu_queue_notify(VuDev *dev, VuVirtq *vq)
1732{
1733    if (unlikely(dev->broken) ||
1734        unlikely(!vq->vring.avail)) {
1735        return;
1736    }
1737
1738    if (!vring_notify(dev, vq)) {
1739        DPRINT("skipped notify...\n");
1740        return;
1741    }
1742
1743    if (eventfd_write(vq->call_fd, 1) < 0) {
1744        vu_panic(dev, "Error writing eventfd: %s", strerror(errno));
1745    }
1746}
1747
1748static inline void
1749vring_used_flags_set_bit(VuVirtq *vq, int mask)
1750{
1751    uint16_t *flags;
1752
1753    flags = (uint16_t *)((char*)vq->vring.used +
1754                         offsetof(struct vring_used, flags));
1755    *flags |= mask;
1756}
1757
1758static inline void
1759vring_used_flags_unset_bit(VuVirtq *vq, int mask)
1760{
1761    uint16_t *flags;
1762
1763    flags = (uint16_t *)((char*)vq->vring.used +
1764                         offsetof(struct vring_used, flags));
1765    *flags &= ~mask;
1766}
1767
1768static inline void
1769vring_set_avail_event(VuVirtq *vq, uint16_t val)
1770{
1771    if (!vq->notification) {
1772        return;
1773    }
1774
1775    *((uint16_t *) &vq->vring.used->ring[vq->vring.num]) = val;
1776}
1777
1778void
1779vu_queue_set_notification(VuDev *dev, VuVirtq *vq, int enable)
1780{
1781    vq->notification = enable;
1782    if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
1783        vring_set_avail_event(vq, vring_avail_idx(vq));
1784    } else if (enable) {
1785        vring_used_flags_unset_bit(vq, VRING_USED_F_NO_NOTIFY);
1786    } else {
1787        vring_used_flags_set_bit(vq, VRING_USED_F_NO_NOTIFY);
1788    }
1789    if (enable) {
1790        /* Expose avail event/used flags before caller checks the avail idx. */
1791        smp_mb();
1792    }
1793}
1794
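     /* Map a guest-physical buffer [pa, pa + sz) into iovec entries starting
      * at iov[*p_num_sg], splitting it across memory regions as needed. */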
1795static void
1796virtqueue_map_desc(VuDev *dev,
1797                   unsigned int *p_num_sg, struct iovec *iov,
1798                   unsigned int max_num_sg, bool is_write,
1799                   uint64_t pa, size_t sz)
1800{
1801    unsigned num_sg = *p_num_sg;
1802
1803    assert(num_sg <= max_num_sg);
1804
1805    if (!sz) {
1806        vu_panic(dev, "virtio: zero sized buffers are not allowed");
1807        return;
1808    }
1809
1810    while (sz) {
1811        uint64_t len = sz;
1812
1813        if (num_sg == max_num_sg) {
1814            vu_panic(dev, "virtio: too many descriptors in indirect table");
1815            return;
1816        }
1817
1818        iov[num_sg].iov_base = vu_gpa_to_va(dev, &len, pa);
1819        if (iov[num_sg].iov_base == NULL) {
1820            vu_panic(dev, "virtio: invalid address for buffers");
1821            return;
1822        }
1823        iov[num_sg].iov_len = len;
1824        num_sg++;
1825        sz -= len;
1826        pa += len;
1827    }
1828
1829    *p_num_sg = num_sg;
1830}
1831
1832/* Round number down to multiple */
1833#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
1834
1835/* Round number up to multiple */
1836#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
1837
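     /* Allocate a VuVirtqElement of at least sz bytes with the in_sg and
      * out_sg arrays placed in the same allocation, directly after the
      * (aligned) element header. */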
1838static void *
1839virtqueue_alloc_element(size_t sz,
1840                        unsigned out_num, unsigned in_num)
1841{
1842    VuVirtqElement *elem;
1843    size_t in_sg_ofs = ALIGN_UP(sz, __alignof__(elem->in_sg[0]));
1844    size_t out_sg_ofs = in_sg_ofs + in_num * sizeof(elem->in_sg[0]);
1845    size_t out_sg_end = out_sg_ofs + out_num * sizeof(elem->out_sg[0]);
1846
1847    assert(sz >= sizeof(VuVirtqElement));
1848    elem = malloc(out_sg_end);
        if (!elem) {
            return NULL;
        }
1849    elem->out_num = out_num;
1850    elem->in_num = in_num;
1851    elem->in_sg = (void *)elem + in_sg_ofs;
1852    elem->out_sg = (void *)elem + out_sg_ofs;
1853    return elem;
1854}
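
/*
 * Illustrative layout of the single allocation made above: with
 * sz == sizeof(VuVirtqElement), in_num == 2 and out_num == 1 the buffer is
 *
 *     [ VuVirtqElement | padding to __alignof__(struct iovec) |
 *       in_sg[0], in_sg[1] | out_sg[0] ]
 *
 * i.e. in_sg starts at ALIGN_UP(sz, __alignof__(struct iovec)) and out_sg
 * directly follows the in_sg array.
 */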
1855
1856void *
1857vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
1858{
1859    unsigned int i, head, max, desc_len;
1860    uint64_t desc_addr, read_len;
1861    VuVirtqElement *elem;
1862    unsigned out_num, in_num;
1863    struct iovec iov[VIRTQUEUE_MAX_SIZE];
1864    struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
1865    struct vring_desc *desc;
1866    int rc;
1867
1868    if (unlikely(dev->broken) ||
1869        unlikely(!vq->vring.avail)) {
1870        return NULL;
1871    }
1872
1873    if (vu_queue_empty(dev, vq)) {
1874        return NULL;
1875    }
1876    /* Needed after vu_queue_empty(): make sure the descriptor reads below
1877     * do not bypass the avail index read (cf. virtqueue_num_heads()). */
1878    smp_rmb();
1879
1880    /* When we start there are no input or output buffers yet. */
1881    out_num = in_num = 0;
1882
1883    max = vq->vring.num;
1884    if (vq->inuse >= vq->vring.num) {
1885        vu_panic(dev, "Virtqueue size exceeded");
1886        return NULL;
1887    }
1888
1889    if (!virtqueue_get_head(dev, vq, vq->last_avail_idx++, &head)) {
1890        return NULL;
1891    }
1892
1893    if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
1894        vring_set_avail_event(vq, vq->last_avail_idx);
1895    }
1896
1897    i = head;
1898    desc = vq->vring.desc;
1899    if (desc[i].flags & VRING_DESC_F_INDIRECT) {
1900        if (desc[i].len % sizeof(struct vring_desc)) {
1901            vu_panic(dev, "Invalid size for indirect buffer table");
1902        }
1903
1904        /* loop over the indirect descriptor table */
1905        desc_addr = desc[i].addr;
1906        desc_len = desc[i].len;
1907        max = desc_len / sizeof(struct vring_desc);
1908        read_len = desc_len;
1909        desc = vu_gpa_to_va(dev, &read_len, desc_addr);
1910        if (unlikely(desc && read_len != desc_len)) {
1911            /* Failed to use zero copy */
1912            desc = NULL;
1913            if (!virtqueue_read_indirect_desc(dev, desc_buf,
1914                                              desc_addr,
1915                                              desc_len)) {
1916                desc = desc_buf;
1917            }
1918        }
1919        if (!desc) {
1920            vu_panic(dev, "Invalid indirect buffer table");
1921            return NULL;
1922        }
1923        i = 0;
1924    }
1925
1926    /* Collect all the descriptors */
1927    do {
1928        if (desc[i].flags & VRING_DESC_F_WRITE) {
1929            virtqueue_map_desc(dev, &in_num, iov + out_num,
1930                               VIRTQUEUE_MAX_SIZE - out_num, true,
1931                               desc[i].addr, desc[i].len);
1932        } else {
1933            if (in_num) {
1934                vu_panic(dev, "Incorrect order for descriptors");
1935                return NULL;
1936            }
1937            virtqueue_map_desc(dev, &out_num, iov,
1938                               VIRTQUEUE_MAX_SIZE, false,
1939                               desc[i].addr, desc[i].len);
1940        }
1941
1942        /* If we've got too many, that implies a descriptor loop. */
1943        if ((in_num + out_num) > max) {
1944            vu_panic(dev, "Looped descriptor");
1945        }
1946        rc = virtqueue_read_next_desc(dev, desc, i, max, &i);
1947    } while (rc == VIRTQUEUE_READ_DESC_MORE);
1948
1949    if (rc == VIRTQUEUE_READ_DESC_ERROR) {
1950        return NULL;
1951    }
1952
1953    /* Now copy what we have collected and mapped */
1954    elem = virtqueue_alloc_element(sz, out_num, in_num);
        if (!elem) {
            return NULL;
        }
1955    elem->index = head;
1956    for (i = 0; i < out_num; i++) {
1957        elem->out_sg[i] = iov[i];
1958    }
1959    for (i = 0; i < in_num; i++) {
1960        elem->in_sg[i] = iov[out_num + i];
1961    }
1962
1963    vq->inuse++;
1964
1965    return elem;
1966}
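
/*
 * Illustrative sketch (not part of the library): a minimal request/response
 * cycle built on vu_queue_pop().  handle_request() is a hypothetical
 * placeholder; a real backend reads the request from out_sg[], writes the
 * reply into in_sg[] and reports the number of bytes written.
 *
 *     VuVirtqElement *elem = vu_queue_pop(dev, vq, sizeof(*elem));
 *     if (elem) {
 *         unsigned int reply_len;
 *
 *         reply_len = handle_request(elem->out_sg, elem->out_num,
 *                                    elem->in_sg, elem->in_num);
 *         vu_queue_push(dev, vq, elem, reply_len);
 *         vu_queue_notify(dev, vq);
 *         free(elem);
 *     }
 */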
1967
1968bool
1969vu_queue_rewind(VuDev *dev, VuVirtq *vq, unsigned int num)
1970{
1971    if (num > vq->inuse) {
1972        return false;
1973    }
1974    vq->last_avail_idx -= num;
1975    vq->inuse -= num;
1976    return true;
1977}
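
/*
 * Illustrative sketch (not part of the library): vu_queue_rewind() hands the
 * most recently popped element(s) back to the ring, e.g. when the backend
 * temporarily cannot make progress; the element memory itself still belongs
 * to the caller and must be freed.  backend_would_block() is a hypothetical
 * placeholder.
 *
 *     VuVirtqElement *elem = vu_queue_pop(dev, vq, sizeof(*elem));
 *     if (elem && backend_would_block()) {
 *         vu_queue_rewind(dev, vq, 1);    (it will be popped again later)
 *         free(elem);
 *     }
 */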
1978
1979static inline
1980void vring_used_write(VuDev *dev, VuVirtq *vq,
1981                      struct vring_used_elem *uelem, int i)
1982{
1983    struct vring_used *used = vq->vring.used;
1984
1985    used->ring[i] = *uelem;
1986    vu_log_write(dev, vq->vring.log_guest_addr +
1987                 offsetof(struct vring_used, ring[i]),
1988                 sizeof(used->ring[i]));
1989}
1990
1991
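/*
 * Mark the guest memory written through @elem's write descriptors (up to
 * @len bytes) as dirty in the migration log, so that a migrating front-end
 * also transfers the completed request data.
 */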
1992static void
1993vu_log_queue_fill(VuDev *dev, VuVirtq *vq,
1994                  const VuVirtqElement *elem,
1995                  unsigned int len)
1996{
1997    struct vring_desc *desc = vq->vring.desc;
1998    unsigned int i, max, min, desc_len;
1999    uint64_t desc_addr, read_len;
2000    struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
2001    unsigned num_bufs = 0;
2002
2003    max = vq->vring.num;
2004    i = elem->index;
2005
2006    if (desc[i].flags & VRING_DESC_F_INDIRECT) {
2007        if (desc[i].len % sizeof(struct vring_desc)) {
2008            vu_panic(dev, "Invalid size for indirect buffer table");
2009        }
2010
2011        /* loop over the indirect descriptor table */
2012        desc_addr = desc[i].addr;
2013        desc_len = desc[i].len;
2014        max = desc_len / sizeof(struct vring_desc);
2015        read_len = desc_len;
2016        desc = vu_gpa_to_va(dev, &read_len, desc_addr);
2017        if (unlikely(desc && read_len != desc_len)) {
2018            /* Failed to use zero copy */
2019            desc = NULL;
2020            if (!virtqueue_read_indirect_desc(dev, desc_buf,
2021                                              desc_addr,
2022                                              desc_len)) {
2023                desc = desc_buf;
2024            }
2025        }
2026        if (!desc) {
2027            vu_panic(dev, "Invalid indirect buffer table");
2028            return;
2029        }
2030        i = 0;
2031    }
2032
2033    do {
2034        if (++num_bufs > max) {
2035            vu_panic(dev, "Looped descriptor");
2036            return;
2037        }
2038
2039        if (desc[i].flags & VRING_DESC_F_WRITE) {
2040            min = MIN(desc[i].len, len);
2041            vu_log_write(dev, desc[i].addr, min);
2042            len -= min;
2043        }
2044
2045    } while (len > 0 &&
2046             (virtqueue_read_next_desc(dev, desc, i, max, &i)
2047              == VIRTQUEUE_READ_DESC_MORE));
2048}
2049
2050void
2051vu_queue_fill(VuDev *dev, VuVirtq *vq,
2052              const VuVirtqElement *elem,
2053              unsigned int len, unsigned int idx)
2054{
2055    struct vring_used_elem uelem;
2056
2057    if (unlikely(dev->broken) ||
2058        unlikely(!vq->vring.avail)) {
2059        return;
2060    }
2061
2062    vu_log_queue_fill(dev, vq, elem, len);
2063
2064    idx = (idx + vq->used_idx) % vq->vring.num;
2065
2066    uelem.id = elem->index;
2067    uelem.len = len;
2068    vring_used_write(dev, vq, &uelem, idx);
2069}
2070
2071static inline
2072void vring_used_idx_set(VuDev *dev, VuVirtq *vq, uint16_t val)
2073{
2074    vq->vring.used->idx = val;
2075    vu_log_write(dev,
2076                 vq->vring.log_guest_addr + offsetof(struct vring_used, idx),
2077                 sizeof(vq->vring.used->idx));
2078
2079    vq->used_idx = val;
2080}
2081
2082void
2083vu_queue_flush(VuDev *dev, VuVirtq *vq, unsigned int count)
2084{
2085    uint16_t old, new;
2086
2087    if (unlikely(dev->broken) ||
2088        unlikely(!vq->vring.avail)) {
2089        return;
2090    }
2091
2092    /* Make sure buffer is written before we update index. */
2093    smp_wmb();
2094
2095    old = vq->used_idx;
2096    new = old + count;
2097    vring_used_idx_set(dev, vq, new);
2098    vq->inuse -= count;
2099    if (unlikely((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old))) {
2100        vq->signalled_used_valid = false;
2101    }
2102}
2103
2104void
2105vu_queue_push(VuDev *dev, VuVirtq *vq,
2106              const VuVirtqElement *elem, unsigned int len)
2107{
2108    vu_queue_fill(dev, vq, elem, len, 0);
2109    vu_queue_flush(dev, vq, 1);
2110}
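
/*
 * Illustrative sketch (not part of the library): vu_queue_push() is the
 * single-element shorthand for vu_queue_fill() followed by vu_queue_flush().
 * A batch can be completed in one go by filling each element at its own
 * offset and flushing once; "done", "elems[]" and "used_len[]" are
 * hypothetical.
 *
 *     for (i = 0; i < done; i++) {
 *         vu_queue_fill(dev, vq, elems[i], used_len[i], i);
 *     }
 *     vu_queue_flush(dev, vq, done);
 *     vu_queue_notify(dev, vq);
 */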
2111