qemu/contrib/libvhost-user/libvhost-user.c
   1/*
   2 * Vhost User library
   3 *
   4 * Copyright IBM, Corp. 2007
   5 * Copyright (c) 2016 Red Hat, Inc.
   6 *
   7 * Authors:
   8 *  Anthony Liguori <aliguori@us.ibm.com>
   9 *  Marc-André Lureau <mlureau@redhat.com>
  10 *  Victor Kaplansky <victork@redhat.com>
  11 *
  12 * This work is licensed under the terms of the GNU GPL, version 2 or
  13 * later.  See the COPYING file in the top-level directory.
  14 */
  15
  16#include <qemu/osdep.h>
  17#include <sys/eventfd.h>
  18#include <linux/vhost.h>
  19
  20#include "qemu/atomic.h"
  21
  22#include "libvhost-user.h"
  23
  24#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
  25
  26/* The version of the protocol we support */
  27#define VHOST_USER_VERSION 1
  28#define LIBVHOST_USER_DEBUG 0
  29
  30#define DPRINT(...)                             \
  31    do {                                        \
  32        if (LIBVHOST_USER_DEBUG) {              \
  33            fprintf(stderr, __VA_ARGS__);        \
  34        }                                       \
  35    } while (0)
  36
  37static const char *
  38vu_request_to_string(int req)
  39{
  40#define REQ(req) [req] = #req
  41    static const char *vu_request_str[] = {
  42        REQ(VHOST_USER_NONE),
  43        REQ(VHOST_USER_GET_FEATURES),
  44        REQ(VHOST_USER_SET_FEATURES),
  48        REQ(VHOST_USER_SET_OWNER),
  49        REQ(VHOST_USER_RESET_OWNER),
  50        REQ(VHOST_USER_SET_MEM_TABLE),
  51        REQ(VHOST_USER_SET_LOG_BASE),
  52        REQ(VHOST_USER_SET_LOG_FD),
  53        REQ(VHOST_USER_SET_VRING_NUM),
  54        REQ(VHOST_USER_SET_VRING_ADDR),
  55        REQ(VHOST_USER_SET_VRING_BASE),
  56        REQ(VHOST_USER_GET_VRING_BASE),
  57        REQ(VHOST_USER_SET_VRING_KICK),
  58        REQ(VHOST_USER_SET_VRING_CALL),
  59        REQ(VHOST_USER_SET_VRING_ERR),
  60        REQ(VHOST_USER_GET_PROTOCOL_FEATURES),
  61        REQ(VHOST_USER_SET_PROTOCOL_FEATURES),
  62        REQ(VHOST_USER_GET_QUEUE_NUM),
  63        REQ(VHOST_USER_SET_VRING_ENABLE),
  64        REQ(VHOST_USER_SEND_RARP),
  65        REQ(VHOST_USER_INPUT_GET_CONFIG),
  66        REQ(VHOST_USER_MAX),
  67    };
  68#undef REQ
  69
  70    if (req < VHOST_USER_MAX) {
  71        return vu_request_str[req];
  72    } else {
  73        return "unknown";
  74    }
  75}
  76
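     /* Mark the device broken and report the formatted error message
      * through the application-supplied panic callback. */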
  77static void
  78vu_panic(VuDev *dev, const char *msg, ...)
  79{
  80    char *buf = NULL;
  81    va_list ap;
  82
  83    va_start(ap, msg);
  84    buf = g_strdup_vprintf(msg, ap);
  85    va_end(ap);
  86
  87    dev->broken = true;
  88    dev->panic(dev, buf);
   90    g_free(buf);
  90
  91    /* FIXME: find a way to call virtio_error? */
  92}
  93
  94/* Translate guest physical address to our virtual address.  */
  95void *
  96vu_gpa_to_va(VuDev *dev, uint64_t guest_addr)
  97{
  98    int i;
  99
 100    /* Find matching memory region.  */
 101    for (i = 0; i < dev->nregions; i++) {
 102        VuDevRegion *r = &dev->regions[i];
 103
 104        if ((guest_addr >= r->gpa) && (guest_addr < (r->gpa + r->size))) {
 105            return (void *)(uintptr_t)
 106                guest_addr - r->gpa + r->mmap_addr + r->mmap_offset;
 107        }
 108    }
 109
 110    return NULL;
 111}
 112
 113/* Translate qemu virtual address to our virtual address.  */
 114static void *
 115qva_to_va(VuDev *dev, uint64_t qemu_addr)
 116{
 117    int i;
 118
 119    /* Find matching memory region.  */
 120    for (i = 0; i < dev->nregions; i++) {
 121        VuDevRegion *r = &dev->regions[i];
 122
 123        if ((qemu_addr >= r->qva) && (qemu_addr < (r->qva + r->size))) {
 124            return (void *)(uintptr_t)
 125                qemu_addr - r->qva + r->mmap_addr + r->mmap_offset;
 126        }
 127    }
 128
 129    return NULL;
 130}
 131
 132static void
 133vmsg_close_fds(VhostUserMsg *vmsg)
 134{
 135    int i;
 136
 137    for (i = 0; i < vmsg->fd_num; i++) {
 138        close(vmsg->fds[i]);
 139    }
 140}
 141
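     /* Read one vhost-user message from the socket: the fixed-size header
      * first (collecting any SCM_RIGHTS file descriptors from the ancillary
      * data), then the payload if the header announces one. */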
 142static bool
 143vu_message_read(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
 144{
 145    char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))] = { };
 146    struct iovec iov = {
 147        .iov_base = (char *)vmsg,
 148        .iov_len = VHOST_USER_HDR_SIZE,
 149    };
 150    struct msghdr msg = {
 151        .msg_iov = &iov,
 152        .msg_iovlen = 1,
 153        .msg_control = control,
 154        .msg_controllen = sizeof(control),
 155    };
 156    size_t fd_size;
 157    struct cmsghdr *cmsg;
 158    int rc;
 159
 160    do {
 161        rc = recvmsg(conn_fd, &msg, 0);
 162    } while (rc < 0 && (errno == EINTR || errno == EAGAIN));
 163
 164    if (rc < 0) {
 165        vu_panic(dev, "Error while recvmsg: %s", strerror(errno));
 166        return false;
 167    }
 168
 169    vmsg->fd_num = 0;
 170    for (cmsg = CMSG_FIRSTHDR(&msg);
 171         cmsg != NULL;
 172         cmsg = CMSG_NXTHDR(&msg, cmsg))
 173    {
 174        if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
 175            fd_size = cmsg->cmsg_len - CMSG_LEN(0);
 176            vmsg->fd_num = fd_size / sizeof(int);
 177            memcpy(vmsg->fds, CMSG_DATA(cmsg), fd_size);
 178            break;
 179        }
 180    }
 181
 182    if (vmsg->size > sizeof(vmsg->payload)) {
 183        vu_panic(dev,
 184                 "Error: too big message request: %d, size: vmsg->size: %u, "
 185                 "while sizeof(vmsg->payload) = %zu\n",
 186                 vmsg->request, vmsg->size, sizeof(vmsg->payload));
 187        goto fail;
 188    }
 189
 190    if (vmsg->size) {
 191        do {
 192            rc = read(conn_fd, &vmsg->payload, vmsg->size);
 193        } while (rc < 0 && (errno == EINTR || errno == EAGAIN));
 194
 195        if (rc <= 0) {
 196            vu_panic(dev, "Error while reading: %s", strerror(errno));
 197            goto fail;
 198        }
 199
 200        assert(rc == vmsg->size);
 201    }
 202
 203    return true;
 204
 205fail:
 206    vmsg_close_fds(vmsg);
 207
 208    return false;
 209}
 210
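     /* Send a reply back to the master: stamp the version and REPLY flag,
      * then write the header followed by either the out-of-band data buffer
      * (vmsg->data) or the inline payload. */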
 211static bool
 212vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
 213{
 214    int rc;
 215    uint8_t *p = (uint8_t *)vmsg;
 216
 217    /* Set the version in the flags when sending the reply */
 218    vmsg->flags &= ~VHOST_USER_VERSION_MASK;
 219    vmsg->flags |= VHOST_USER_VERSION;
 220    vmsg->flags |= VHOST_USER_REPLY_MASK;
 221
 222    do {
 223        rc = write(conn_fd, p, VHOST_USER_HDR_SIZE);
 224    } while (rc < 0 && (errno == EINTR || errno == EAGAIN));
 225
 226    do {
 227        if (vmsg->data) {
 228            rc = write(conn_fd, vmsg->data, vmsg->size);
 229        } else {
 230            rc = write(conn_fd, p + VHOST_USER_HDR_SIZE, vmsg->size);
 231        }
 232    } while (rc < 0 && (errno == EINTR || errno == EAGAIN));
 233
 234    if (rc <= 0) {
 235        vu_panic(dev, "Error while writing: %s", strerror(errno));
 236        return false;
 237    }
 238
 239    return true;
 240}
 241
 242/* Kick the log_call_fd if required. */
 243static void
 244vu_log_kick(VuDev *dev)
 245{
 246    if (dev->log_call_fd != -1) {
 247        DPRINT("Kicking the QEMU's log...\n");
 248        if (eventfd_write(dev->log_call_fd, 1) < 0) {
 249            vu_panic(dev, "Error writing eventfd: %s", strerror(errno));
 250        }
 251    }
 252}
 253
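     /* Set the dirty bit for one page in the shared log bitmap; each bit
      * covers one VHOST_LOG_PAGE-sized page of guest memory. */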
 254static void
 255vu_log_page(uint8_t *log_table, uint64_t page)
 256{
 257    DPRINT("Logged dirty guest page: %"PRId64"\n", page);
 258    atomic_or(&log_table[page / 8], 1 << (page % 8));
 259}
 260
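     /* Mark the range [address, address + length) dirty in the log bitmap
      * when VHOST_F_LOG_ALL has been negotiated, then notify QEMU. */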
 261static void
 262vu_log_write(VuDev *dev, uint64_t address, uint64_t length)
 263{
 264    uint64_t page;
 265
 266    if (!(dev->features & (1ULL << VHOST_F_LOG_ALL)) ||
 267        !dev->log_table || !length) {
 268        return;
 269    }
 270
 271    assert(dev->log_size > ((address + length - 1) / VHOST_LOG_PAGE / 8));
 272
 273    page = address / VHOST_LOG_PAGE;
 274    while (page * VHOST_LOG_PAGE < address + length) {
 275        vu_log_page(dev->log_table, page);
  276        page += 1;
 277    }
 278
 279    vu_log_kick(dev);
 280}
 281
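     /* Watch callback for a virtqueue kick eventfd: drain the eventfd and
      * invoke the queue handler, if any, for the corresponding vring. */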
 282static void
 283vu_kick_cb(VuDev *dev, int condition, void *data)
 284{
 285    int index = (intptr_t)data;
 286    VuVirtq *vq = &dev->vq[index];
 287    int sock = vq->kick_fd;
 288    eventfd_t kick_data;
 289    ssize_t rc;
 290
 291    rc = eventfd_read(sock, &kick_data);
 292    if (rc == -1) {
 293        vu_panic(dev, "kick eventfd_read(): %s", strerror(errno));
 294        dev->remove_watch(dev, dev->vq[index].kick_fd);
 295    } else {
 296        DPRINT("Got kick_data: %016"PRIx64" handler:%p idx:%d\n",
 297               kick_data, vq->handler, index);
 298        if (vq->handler) {
 299            vq->handler(dev, index);
 300        }
 301    }
 302}
 303
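     /* The message handlers below return true when a reply (left in vmsg)
      * should be sent back to the master, false otherwise. */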
 304static bool
 305vu_get_features_exec(VuDev *dev, VhostUserMsg *vmsg)
 306{
 307    vmsg->payload.u64 =
 308        1ULL << VHOST_F_LOG_ALL |
 309        1ULL << VHOST_USER_F_PROTOCOL_FEATURES;
 310
 311    if (dev->iface->get_features) {
 312        vmsg->payload.u64 |= dev->iface->get_features(dev);
 313    }
 314
 315    vmsg->size = sizeof(vmsg->payload.u64);
 316
 317    DPRINT("Sending back to guest u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
 318
 319    return true;
 320}
 321
 322static void
 323vu_set_enable_all_rings(VuDev *dev, bool enabled)
 324{
 325    int i;
 326
 327    for (i = 0; i < VHOST_MAX_NR_VIRTQUEUE; i++) {
 328        dev->vq[i].enable = enabled;
 329    }
 330}
 331
 332static bool
 333vu_set_features_exec(VuDev *dev, VhostUserMsg *vmsg)
 334{
 335    DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
 336
 337    dev->features = vmsg->payload.u64;
 338
 339    if (!(dev->features & VHOST_USER_F_PROTOCOL_FEATURES)) {
 340        vu_set_enable_all_rings(dev, true);
 341    }
 342
 343    if (dev->iface->set_features) {
 344        dev->iface->set_features(dev, dev->features);
 345    }
 346
 347    return false;
 348}
 349
 350static bool
 351vu_set_owner_exec(VuDev *dev, VhostUserMsg *vmsg)
 352{
 353    return false;
 354}
 355
 356static void
 357vu_close_log(VuDev *dev)
 358{
 359    if (dev->log_table) {
 360        if (munmap(dev->log_table, dev->log_size) != 0) {
 361            perror("close log munmap() error");
 362        }
 363
 364        dev->log_table = NULL;
 365    }
 366    if (dev->log_call_fd != -1) {
 367        close(dev->log_call_fd);
 368        dev->log_call_fd = -1;
 369    }
 370}
 371
 372static bool
 373vu_reset_device_exec(VuDev *dev, VhostUserMsg *vmsg)
 374{
 375    vu_set_enable_all_rings(dev, false);
 376
 377    return false;
 378}
 379
 380static bool
 381vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg)
 382{
 383    int i;
 384    VhostUserMemory *memory = &vmsg->payload.memory;
 385    dev->nregions = memory->nregions;
 386
 387    DPRINT("Nregions: %d\n", memory->nregions);
 388    for (i = 0; i < dev->nregions; i++) {
 389        void *mmap_addr;
 390        VhostUserMemoryRegion *msg_region = &memory->regions[i];
 391        VuDevRegion *dev_region = &dev->regions[i];
 392
 393        DPRINT("Region %d\n", i);
 394        DPRINT("    guest_phys_addr: 0x%016"PRIx64"\n",
 395               msg_region->guest_phys_addr);
 396        DPRINT("    memory_size:     0x%016"PRIx64"\n",
 397               msg_region->memory_size);
 398        DPRINT("    userspace_addr   0x%016"PRIx64"\n",
 399               msg_region->userspace_addr);
 400        DPRINT("    mmap_offset      0x%016"PRIx64"\n",
 401               msg_region->mmap_offset);
 402
 403        dev_region->gpa = msg_region->guest_phys_addr;
 404        dev_region->size = msg_region->memory_size;
 405        dev_region->qva = msg_region->userspace_addr;
 406        dev_region->mmap_offset = msg_region->mmap_offset;
 407
 408        /* We don't use offset argument of mmap() since the
 409         * mapped address has to be page aligned, and we use huge
 410         * pages.  */
 411        mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
 412                         PROT_READ | PROT_WRITE, MAP_SHARED,
 413                         vmsg->fds[i], 0);
 414
 415        if (mmap_addr == MAP_FAILED) {
 416            vu_panic(dev, "region mmap error: %s", strerror(errno));
 417        } else {
 418            dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
 419            DPRINT("    mmap_addr:       0x%016"PRIx64"\n",
 420                   dev_region->mmap_addr);
 421        }
 422
 423        close(vmsg->fds[i]);
 424    }
 425
 426    return false;
 427}
 428
 429static bool
 430vu_set_log_base_exec(VuDev *dev, VhostUserMsg *vmsg)
 431{
 432    int fd;
 433    uint64_t log_mmap_size, log_mmap_offset;
 434    void *rc;
 435
 436    if (vmsg->fd_num != 1 ||
 437        vmsg->size != sizeof(vmsg->payload.log)) {
 438        vu_panic(dev, "Invalid log_base message");
 439        return true;
 440    }
 441
 442    fd = vmsg->fds[0];
 443    log_mmap_offset = vmsg->payload.log.mmap_offset;
 444    log_mmap_size = vmsg->payload.log.mmap_size;
 445    DPRINT("Log mmap_offset: %"PRId64"\n", log_mmap_offset);
 446    DPRINT("Log mmap_size:   %"PRId64"\n", log_mmap_size);
 447
 448    rc = mmap(0, log_mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd,
 449              log_mmap_offset);
 450    if (rc == MAP_FAILED) {
 451        perror("log mmap error");
 452    }
 453    dev->log_table = rc;
 454    dev->log_size = log_mmap_size;
 455
 456    vmsg->size = sizeof(vmsg->payload.u64);
 457
 458    return true;
 459}
 460
 461static bool
 462vu_set_log_fd_exec(VuDev *dev, VhostUserMsg *vmsg)
 463{
 464    if (vmsg->fd_num != 1) {
 465        vu_panic(dev, "Invalid log_fd message");
 466        return false;
 467    }
 468
 469    if (dev->log_call_fd != -1) {
 470        close(dev->log_call_fd);
 471    }
 472    dev->log_call_fd = vmsg->fds[0];
 473    DPRINT("Got log_call_fd: %d\n", vmsg->fds[0]);
 474
 475    return false;
 476}
 477
 478static bool
 479vu_set_vring_num_exec(VuDev *dev, VhostUserMsg *vmsg)
 480{
 481    unsigned int index = vmsg->payload.state.index;
 482    unsigned int num = vmsg->payload.state.num;
 483
 484    DPRINT("State.index: %d\n", index);
 485    DPRINT("State.num:   %d\n", num);
 486    dev->vq[index].vring.num = num;
 487
 488    return false;
 489}
 490
 491static bool
 492vu_set_vring_addr_exec(VuDev *dev, VhostUserMsg *vmsg)
 493{
 494    struct vhost_vring_addr *vra = &vmsg->payload.addr;
 495    unsigned int index = vra->index;
 496    VuVirtq *vq = &dev->vq[index];
 497
 498    DPRINT("vhost_vring_addr:\n");
 499    DPRINT("    index:  %d\n", vra->index);
 500    DPRINT("    flags:  %d\n", vra->flags);
 501    DPRINT("    desc_user_addr:   0x%016llx\n", vra->desc_user_addr);
 502    DPRINT("    used_user_addr:   0x%016llx\n", vra->used_user_addr);
 503    DPRINT("    avail_user_addr:  0x%016llx\n", vra->avail_user_addr);
 504    DPRINT("    log_guest_addr:   0x%016llx\n", vra->log_guest_addr);
 505
 506    vq->vring.flags = vra->flags;
 507    vq->vring.desc = qva_to_va(dev, vra->desc_user_addr);
 508    vq->vring.used = qva_to_va(dev, vra->used_user_addr);
 509    vq->vring.avail = qva_to_va(dev, vra->avail_user_addr);
 510    vq->vring.log_guest_addr = vra->log_guest_addr;
 511
 512    DPRINT("Setting virtq addresses:\n");
 513    DPRINT("    vring_desc  at %p\n", vq->vring.desc);
 514    DPRINT("    vring_used  at %p\n", vq->vring.used);
 515    DPRINT("    vring_avail at %p\n", vq->vring.avail);
 516
 517    if (!(vq->vring.desc && vq->vring.used && vq->vring.avail)) {
 518        vu_panic(dev, "Invalid vring_addr message");
 519        return false;
 520    }
 521
 522    vq->used_idx = vq->vring.used->idx;
 523
 524    if (vq->last_avail_idx != vq->used_idx) {
 525        bool resume = dev->iface->queue_is_processed_in_order &&
 526            dev->iface->queue_is_processed_in_order(dev, index);
 527
 528        DPRINT("Last avail index != used index: %u != %u%s\n",
 529               vq->last_avail_idx, vq->used_idx,
 530               resume ? ", resuming" : "");
 531
 532        if (resume) {
 533            vq->shadow_avail_idx = vq->last_avail_idx = vq->used_idx;
 534        }
 535    }
 536
 537    return false;
 538}
 539
 540static bool
 541vu_set_vring_base_exec(VuDev *dev, VhostUserMsg *vmsg)
 542{
 543    unsigned int index = vmsg->payload.state.index;
 544    unsigned int num = vmsg->payload.state.num;
 545
 546    DPRINT("State.index: %d\n", index);
 547    DPRINT("State.num:   %d\n", num);
 548    dev->vq[index].shadow_avail_idx = dev->vq[index].last_avail_idx = num;
 549
 550    return false;
 551}
 552
 553static bool
 554vu_get_vring_base_exec(VuDev *dev, VhostUserMsg *vmsg)
 555{
 556    unsigned int index = vmsg->payload.state.index;
 557
 558    DPRINT("State.index: %d\n", index);
 559    vmsg->payload.state.num = dev->vq[index].last_avail_idx;
 560    vmsg->size = sizeof(vmsg->payload.state);
 561
 562    dev->vq[index].started = false;
 563    if (dev->iface->queue_set_started) {
 564        dev->iface->queue_set_started(dev, index, false);
 565    }
 566
 567    if (dev->vq[index].call_fd != -1) {
 568        close(dev->vq[index].call_fd);
 569        dev->vq[index].call_fd = -1;
 570    }
 571    if (dev->vq[index].kick_fd != -1) {
 572        dev->remove_watch(dev, dev->vq[index].kick_fd);
 573        close(dev->vq[index].kick_fd);
 574        dev->vq[index].kick_fd = -1;
 575    }
 576
 577    return true;
 578}
 579
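     /* Validate a SET_VRING_KICK/CALL/ERR message: the vring index must be
      * in range, exactly one file descriptor must be attached, and the
      * NOFD flag must not be set. */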
 580static bool
 581vu_check_queue_msg_file(VuDev *dev, VhostUserMsg *vmsg)
 582{
 583    int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
 584
 585    if (index >= VHOST_MAX_NR_VIRTQUEUE) {
 586        vmsg_close_fds(vmsg);
 587        vu_panic(dev, "Invalid queue index: %u", index);
 588        return false;
 589    }
 590
 591    if (vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK ||
 592        vmsg->fd_num != 1) {
 593        vmsg_close_fds(vmsg);
 594        vu_panic(dev, "Invalid fds in request: %d", vmsg->request);
 595        return false;
 596    }
 597
 598    return true;
 599}
 600
 601static bool
 602vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg)
 603{
 604    int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
 605
 606    DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
 607
 608    if (!vu_check_queue_msg_file(dev, vmsg)) {
 609        return false;
 610    }
 611
 612    if (dev->vq[index].kick_fd != -1) {
 613        dev->remove_watch(dev, dev->vq[index].kick_fd);
 614        close(dev->vq[index].kick_fd);
 615        dev->vq[index].kick_fd = -1;
 616    }
 617
 618    if (!(vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)) {
 619        dev->vq[index].kick_fd = vmsg->fds[0];
 620        DPRINT("Got kick_fd: %d for vq: %d\n", vmsg->fds[0], index);
 621    }
 622
 623    dev->vq[index].started = true;
 624    if (dev->iface->queue_set_started) {
 625        dev->iface->queue_set_started(dev, index, true);
 626    }
 627
 628    if (dev->vq[index].kick_fd != -1 && dev->vq[index].handler) {
 629        dev->set_watch(dev, dev->vq[index].kick_fd, VU_WATCH_IN,
 630                       vu_kick_cb, (void *)(long)index);
 631
 632        DPRINT("Waiting for kicks on fd: %d for vq: %d\n",
 633               dev->vq[index].kick_fd, index);
 634    }
 635
 636    return false;
 637}
 638
 639void vu_set_queue_handler(VuDev *dev, VuVirtq *vq,
 640                          vu_queue_handler_cb handler)
 641{
 642    int qidx = vq - dev->vq;
 643
 644    vq->handler = handler;
 645    if (vq->kick_fd >= 0) {
 646        if (handler) {
 647            dev->set_watch(dev, vq->kick_fd, VU_WATCH_IN,
 648                           vu_kick_cb, (void *)(long)qidx);
 649        } else {
 650            dev->remove_watch(dev, vq->kick_fd);
 651        }
 652    }
 653}
 654
 655static bool
 656vu_set_vring_call_exec(VuDev *dev, VhostUserMsg *vmsg)
 657{
 658    int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
 659
 660    DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
 661
 662    if (!vu_check_queue_msg_file(dev, vmsg)) {
 663        return false;
 664    }
 665
 666    if (dev->vq[index].call_fd != -1) {
 667        close(dev->vq[index].call_fd);
 668        dev->vq[index].call_fd = -1;
 669    }
 670
 671    if (!(vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)) {
 672        dev->vq[index].call_fd = vmsg->fds[0];
 673    }
 674
 675    DPRINT("Got call_fd: %d for vq: %d\n", vmsg->fds[0], index);
 676
 677    return false;
 678}
 679
 680static bool
 681vu_set_vring_err_exec(VuDev *dev, VhostUserMsg *vmsg)
 682{
 683    int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
 684
 685    DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
 686
 687    if (!vu_check_queue_msg_file(dev, vmsg)) {
 688        return false;
 689    }
 690
 691    if (dev->vq[index].err_fd != -1) {
 692        close(dev->vq[index].err_fd);
 693        dev->vq[index].err_fd = -1;
 694    }
 695
 696    if (!(vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)) {
 697        dev->vq[index].err_fd = vmsg->fds[0];
 698    }
 699
 700    return false;
 701}
 702
 703static bool
 704vu_get_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg)
 705{
 706    uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD;
 707
 708    if (dev->iface->get_protocol_features) {
 709        features |= dev->iface->get_protocol_features(dev);
 710    }
 711
 712    vmsg->payload.u64 = features;
 713    vmsg->size = sizeof(vmsg->payload.u64);
 714
 715    return true;
 716}
 717
 718static bool
 719vu_set_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg)
 720{
 721    uint64_t features = vmsg->payload.u64;
 722
 723    DPRINT("u64: 0x%016"PRIx64"\n", features);
 724
 725    dev->protocol_features = vmsg->payload.u64;
 726
 727    if (dev->iface->set_protocol_features) {
 728        dev->iface->set_protocol_features(dev, features);
 729    }
 730
 731    return false;
 732}
 733
 734static bool
 735vu_get_queue_num_exec(VuDev *dev, VhostUserMsg *vmsg)
 736{
 737    DPRINT("Function %s() not implemented yet.\n", __func__);
 738    return false;
 739}
 740
 741static bool
 742vu_set_vring_enable_exec(VuDev *dev, VhostUserMsg *vmsg)
 743{
 744    unsigned int index = vmsg->payload.state.index;
 745    unsigned int enable = vmsg->payload.state.num;
 746
 747    DPRINT("State.index: %d\n", index);
 748    DPRINT("State.enable:   %d\n", enable);
 749
 750    if (index >= VHOST_MAX_NR_VIRTQUEUE) {
 751        vu_panic(dev, "Invalid vring_enable index: %u", index);
 752        return false;
 753    }
 754
 755    dev->vq[index].enable = enable;
 756    return false;
 757}
 758
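     /* Dispatch one decoded message, giving the device implementation's
      * process_msg callback first refusal.  Returns true when vmsg should
      * be written back as a reply. */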
 759static bool
 760vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
 761{
 762    int do_reply = 0;
 763
 764    /* Print out generic part of the request. */
 765    DPRINT("================ Vhost user message ================\n");
 766    DPRINT("Request: %s (%d)\n", vu_request_to_string(vmsg->request),
 767           vmsg->request);
 768    DPRINT("Flags:   0x%x\n", vmsg->flags);
 769    DPRINT("Size:    %d\n", vmsg->size);
 770
 771    if (vmsg->fd_num) {
 772        int i;
 773        DPRINT("Fds:");
 774        for (i = 0; i < vmsg->fd_num; i++) {
 775            DPRINT(" %d", vmsg->fds[i]);
 776        }
 777        DPRINT("\n");
 778    }
 779
 780    if (dev->iface->process_msg &&
 781        dev->iface->process_msg(dev, vmsg, &do_reply)) {
 782        return do_reply;
 783    }
 784
 785    switch (vmsg->request) {
 786    case VHOST_USER_GET_FEATURES:
 787        return vu_get_features_exec(dev, vmsg);
 788    case VHOST_USER_SET_FEATURES:
 789        return vu_set_features_exec(dev, vmsg);
 790    case VHOST_USER_GET_PROTOCOL_FEATURES:
 791        return vu_get_protocol_features_exec(dev, vmsg);
 792    case VHOST_USER_SET_PROTOCOL_FEATURES:
 793        return vu_set_protocol_features_exec(dev, vmsg);
 794    case VHOST_USER_SET_OWNER:
 795        return vu_set_owner_exec(dev, vmsg);
 796    case VHOST_USER_RESET_OWNER:
 797        return vu_reset_device_exec(dev, vmsg);
 798    case VHOST_USER_SET_MEM_TABLE:
 799        return vu_set_mem_table_exec(dev, vmsg);
 800    case VHOST_USER_SET_LOG_BASE:
 801        return vu_set_log_base_exec(dev, vmsg);
 802    case VHOST_USER_SET_LOG_FD:
 803        return vu_set_log_fd_exec(dev, vmsg);
 804    case VHOST_USER_SET_VRING_NUM:
 805        return vu_set_vring_num_exec(dev, vmsg);
 806    case VHOST_USER_SET_VRING_ADDR:
 807        return vu_set_vring_addr_exec(dev, vmsg);
 808    case VHOST_USER_SET_VRING_BASE:
 809        return vu_set_vring_base_exec(dev, vmsg);
 810    case VHOST_USER_GET_VRING_BASE:
 811        return vu_get_vring_base_exec(dev, vmsg);
 812    case VHOST_USER_SET_VRING_KICK:
 813        return vu_set_vring_kick_exec(dev, vmsg);
 814    case VHOST_USER_SET_VRING_CALL:
 815        return vu_set_vring_call_exec(dev, vmsg);
 816    case VHOST_USER_SET_VRING_ERR:
 817        return vu_set_vring_err_exec(dev, vmsg);
 818    case VHOST_USER_GET_QUEUE_NUM:
 819        return vu_get_queue_num_exec(dev, vmsg);
 820    case VHOST_USER_SET_VRING_ENABLE:
 821        return vu_set_vring_enable_exec(dev, vmsg);
 822    case VHOST_USER_NONE:
 823        break;
 824    default:
 825        vmsg_close_fds(vmsg);
 826        vu_panic(dev, "Unhandled request: %d", vmsg->request);
 827    }
 828
 829    return false;
 830}
 831
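     /* Read, process and (when requested) reply to one message on the
      * vhost-user socket.  Returns false on a socket read or write error. */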
 832bool
 833vu_dispatch(VuDev *dev)
 834{
 835    VhostUserMsg vmsg = { 0, };
 836    int reply_requested;
 837    bool success = false;
 838
 839    if (!vu_message_read(dev, dev->sock, &vmsg)) {
 840        goto end;
 841    }
 842
 843    reply_requested = vu_process_message(dev, &vmsg);
 844    if (!reply_requested) {
 845        success = true;
 846        goto end;
 847    }
 848
 849    if (!vu_message_write(dev, dev->sock, &vmsg)) {
 850        goto end;
 851    }
 852
 853    success = true;
 854
 855end:
 856    g_free(vmsg.data);
 857    return success;
 858}
 859
 860void
 861vu_deinit(VuDev *dev)
 862{
 863    int i;
 864
 865    for (i = 0; i < dev->nregions; i++) {
 866        VuDevRegion *r = &dev->regions[i];
 867        void *m = (void *) (uintptr_t) r->mmap_addr;
 868        if (m != MAP_FAILED) {
 869            munmap(m, r->size + r->mmap_offset);
 870        }
 871    }
 872    dev->nregions = 0;
 873
 874    for (i = 0; i < VHOST_MAX_NR_VIRTQUEUE; i++) {
 875        VuVirtq *vq = &dev->vq[i];
 876
 877        if (vq->call_fd != -1) {
 878            close(vq->call_fd);
 879            vq->call_fd = -1;
 880        }
 881
 882        if (vq->kick_fd != -1) {
 883            close(vq->kick_fd);
 884            vq->kick_fd = -1;
 885        }
 886
 887        if (vq->err_fd != -1) {
 888            close(vq->err_fd);
 889            vq->err_fd = -1;
 890        }
 891    }
 892
 893
 894    vu_close_log(dev);
 895
 896    if (dev->sock != -1) {
 897        close(dev->sock);
 898    }
 899}
 900
 901void
 902vu_init(VuDev *dev,
 903        int socket,
 904        vu_panic_cb panic,
 905        vu_set_watch_cb set_watch,
 906        vu_remove_watch_cb remove_watch,
 907        const VuDevIface *iface)
 908{
 909    int i;
 910
 911    assert(socket >= 0);
 912    assert(set_watch);
 913    assert(remove_watch);
 914    assert(iface);
 915    assert(panic);
 916
 917    memset(dev, 0, sizeof(*dev));
 918
 919    dev->sock = socket;
 920    dev->panic = panic;
 921    dev->set_watch = set_watch;
 922    dev->remove_watch = remove_watch;
 923    dev->iface = iface;
 924    dev->log_call_fd = -1;
 925    for (i = 0; i < VHOST_MAX_NR_VIRTQUEUE; i++) {
 926        dev->vq[i] = (VuVirtq) {
 927            .call_fd = -1, .kick_fd = -1, .err_fd = -1,
 928            .notification = true,
 929        };
 930    }
 931}
 932
 933VuVirtq *
 934vu_get_queue(VuDev *dev, int qidx)
 935{
 936    assert(qidx < VHOST_MAX_NR_VIRTQUEUE);
 937    return &dev->vq[qidx];
 938}
 939
 940bool
 941vu_queue_enabled(VuDev *dev, VuVirtq *vq)
 942{
 943    return vq->enable;
 944}
 945
 946static inline uint16_t
 947vring_avail_flags(VuVirtq *vq)
 948{
 949    return vq->vring.avail->flags;
 950}
 951
 952static inline uint16_t
 953vring_avail_idx(VuVirtq *vq)
 954{
 955    vq->shadow_avail_idx = vq->vring.avail->idx;
 956
 957    return vq->shadow_avail_idx;
 958}
 959
 960static inline uint16_t
 961vring_avail_ring(VuVirtq *vq, int i)
 962{
 963    return vq->vring.avail->ring[i];
 964}
 965
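     /* With VIRTIO_RING_F_EVENT_IDX, the used_event field is stored right
      * after the end of the avail ring, at avail->ring[num]. */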
 966static inline uint16_t
 967vring_get_used_event(VuVirtq *vq)
 968{
 969    return vring_avail_ring(vq, vq->vring.num);
 970}
 971
 972static int
 973virtqueue_num_heads(VuDev *dev, VuVirtq *vq, unsigned int idx)
 974{
 975    uint16_t num_heads = vring_avail_idx(vq) - idx;
 976
 977    /* Check it isn't doing very strange things with descriptor numbers. */
 978    if (num_heads > vq->vring.num) {
 979        vu_panic(dev, "Guest moved used index from %u to %u",
 980                 idx, vq->shadow_avail_idx);
 981        return -1;
 982    }
 983    if (num_heads) {
 984        /* On success, callers read a descriptor at vq->last_avail_idx.
 985         * Make sure descriptor read does not bypass avail index read. */
 986        smp_rmb();
 987    }
 988
 989    return num_heads;
 990}
 991
 992static bool
 993virtqueue_get_head(VuDev *dev, VuVirtq *vq,
 994                   unsigned int idx, unsigned int *head)
 995{
 996    /* Grab the next descriptor number they're advertising, and increment
 997     * the index we've seen. */
 998    *head = vring_avail_ring(vq, idx % vq->vring.num);
 999
1000    /* If their number is silly, that's a fatal mistake. */
1001    if (*head >= vq->vring.num) {
 1002        vu_panic(dev, "Guest says index %u is available", *head);
1003        return false;
1004    }
1005
1006    return true;
1007}
1008
1009enum {
1010    VIRTQUEUE_READ_DESC_ERROR = -1,
1011    VIRTQUEUE_READ_DESC_DONE = 0,   /* end of chain */
1012    VIRTQUEUE_READ_DESC_MORE = 1,   /* more buffers in chain */
1013};
1014
1015static int
1016virtqueue_read_next_desc(VuDev *dev, struct vring_desc *desc,
1017                         int i, unsigned int max, unsigned int *next)
1018{
1019    /* If this descriptor says it doesn't chain, we're done. */
1020    if (!(desc[i].flags & VRING_DESC_F_NEXT)) {
1021        return VIRTQUEUE_READ_DESC_DONE;
1022    }
1023
1024    /* Check they're not leading us off end of descriptors. */
1025    *next = desc[i].next;
1026    /* Make sure compiler knows to grab that: we don't want it changing! */
1027    smp_wmb();
1028
1029    if (*next >= max) {
 1030        vu_panic(dev, "Desc next is %u", *next);
1031        return VIRTQUEUE_READ_DESC_ERROR;
1032    }
1033
1034    return VIRTQUEUE_READ_DESC_MORE;
1035}
1036
1037void
1038vu_queue_get_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int *in_bytes,
1039                         unsigned int *out_bytes,
1040                         unsigned max_in_bytes, unsigned max_out_bytes)
1041{
1042    unsigned int idx;
1043    unsigned int total_bufs, in_total, out_total;
1044    int rc;
1045
1046    idx = vq->last_avail_idx;
1047
1048    total_bufs = in_total = out_total = 0;
1049    if (unlikely(dev->broken) ||
1050        unlikely(!vq->vring.avail)) {
1051        goto done;
1052    }
1053
1054    while ((rc = virtqueue_num_heads(dev, vq, idx)) > 0) {
1055        unsigned int max, num_bufs, indirect = 0;
1056        struct vring_desc *desc;
1057        unsigned int i;
1058
1059        max = vq->vring.num;
1060        num_bufs = total_bufs;
1061        if (!virtqueue_get_head(dev, vq, idx++, &i)) {
1062            goto err;
1063        }
1064        desc = vq->vring.desc;
1065
1066        if (desc[i].flags & VRING_DESC_F_INDIRECT) {
1067            if (desc[i].len % sizeof(struct vring_desc)) {
1068                vu_panic(dev, "Invalid size for indirect buffer table");
1069                goto err;
1070            }
1071
1072            /* If we've got too many, that implies a descriptor loop. */
1073            if (num_bufs >= max) {
1074                vu_panic(dev, "Looped descriptor");
1075                goto err;
1076            }
1077
1078            /* loop over the indirect descriptor table */
1079            indirect = 1;
1080            max = desc[i].len / sizeof(struct vring_desc);
1081            desc = vu_gpa_to_va(dev, desc[i].addr);
1082            num_bufs = i = 0;
1083        }
1084
1085        do {
1086            /* If we've got too many, that implies a descriptor loop. */
1087            if (++num_bufs > max) {
1088                vu_panic(dev, "Looped descriptor");
1089                goto err;
1090            }
1091
1092            if (desc[i].flags & VRING_DESC_F_WRITE) {
1093                in_total += desc[i].len;
1094            } else {
1095                out_total += desc[i].len;
1096            }
1097            if (in_total >= max_in_bytes && out_total >= max_out_bytes) {
1098                goto done;
1099            }
1100            rc = virtqueue_read_next_desc(dev, desc, i, max, &i);
1101        } while (rc == VIRTQUEUE_READ_DESC_MORE);
1102
1103        if (rc == VIRTQUEUE_READ_DESC_ERROR) {
1104            goto err;
1105        }
1106
1107        if (!indirect) {
1108            total_bufs = num_bufs;
1109        } else {
1110            total_bufs++;
1111        }
1112    }
1113    if (rc < 0) {
1114        goto err;
1115    }
1116done:
1117    if (in_bytes) {
1118        *in_bytes = in_total;
1119    }
1120    if (out_bytes) {
1121        *out_bytes = out_total;
1122    }
1123    return;
1124
1125err:
1126    in_total = out_total = 0;
1127    goto done;
1128}
1129
1130bool
1131vu_queue_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int in_bytes,
1132                     unsigned int out_bytes)
1133{
1134    unsigned int in_total, out_total;
1135
1136    vu_queue_get_avail_bytes(dev, vq, &in_total, &out_total,
1137                             in_bytes, out_bytes);
1138
1139    return in_bytes <= in_total && out_bytes <= out_total;
1140}
1141
1142/* Fetch avail_idx from VQ memory only when we really need to know if
1143 * guest has added some buffers. */
1144bool
1145vu_queue_empty(VuDev *dev, VuVirtq *vq)
1146{
1147    if (unlikely(dev->broken) ||
1148        unlikely(!vq->vring.avail)) {
1149        return true;
1150    }
1151
1152    if (vq->shadow_avail_idx != vq->last_avail_idx) {
1153        return false;
1154    }
1155
1156    return vring_avail_idx(vq) == vq->last_avail_idx;
1157}
1158
1159static inline
1160bool has_feature(uint64_t features, unsigned int fbit)
1161{
1162    assert(fbit < 64);
1163    return !!(features & (1ULL << fbit));
1164}
1165
1166static inline
1167bool vu_has_feature(VuDev *dev,
1168                    unsigned int fbit)
1169{
1170    return has_feature(dev->features, fbit);
1171}
1172
1173static bool
1174vring_notify(VuDev *dev, VuVirtq *vq)
1175{
1176    uint16_t old, new;
1177    bool v;
1178
1179    /* We need to expose used array entries before checking used event. */
1180    smp_mb();
1181
1182    /* Always notify when queue is empty (when feature acknowledge) */
1183    if (vu_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) &&
1184        !vq->inuse && vu_queue_empty(dev, vq)) {
1185        return true;
1186    }
1187
1188    if (!vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
1189        return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT);
1190    }
1191
1192    v = vq->signalled_used_valid;
1193    vq->signalled_used_valid = true;
1194    old = vq->signalled_used;
1195    new = vq->signalled_used = vq->used_idx;
1196    return !v || vring_need_event(vring_get_used_event(vq), new, old);
1197}
1198
1199void
1200vu_queue_notify(VuDev *dev, VuVirtq *vq)
1201{
1202    if (unlikely(dev->broken) ||
1203        unlikely(!vq->vring.avail)) {
1204        return;
1205    }
1206
1207    if (!vring_notify(dev, vq)) {
1208        DPRINT("skipped notify...\n");
1209        return;
1210    }
1211
1212    if (eventfd_write(vq->call_fd, 1) < 0) {
1213        vu_panic(dev, "Error writing eventfd: %s", strerror(errno));
1214    }
1215}
1216
1217static inline void
1218vring_used_flags_set_bit(VuVirtq *vq, int mask)
1219{
1220    uint16_t *flags;
1221
1222    flags = (uint16_t *)((char*)vq->vring.used +
1223                         offsetof(struct vring_used, flags));
1224    *flags |= mask;
1225}
1226
1227static inline void
1228vring_used_flags_unset_bit(VuVirtq *vq, int mask)
1229{
1230    uint16_t *flags;
1231
1232    flags = (uint16_t *)((char*)vq->vring.used +
1233                         offsetof(struct vring_used, flags));
1234    *flags &= ~mask;
1235}
1236
1237static inline void
1238vring_set_avail_event(VuVirtq *vq, uint16_t val)
1239{
1240    if (!vq->notification) {
1241        return;
1242    }
1243
1244    *((uint16_t *) &vq->vring.used->ring[vq->vring.num]) = val;
1245}
1246
1247void
1248vu_queue_set_notification(VuDev *dev, VuVirtq *vq, int enable)
1249{
1250    vq->notification = enable;
1251    if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
1252        vring_set_avail_event(vq, vring_avail_idx(vq));
1253    } else if (enable) {
1254        vring_used_flags_unset_bit(vq, VRING_USED_F_NO_NOTIFY);
1255    } else {
1256        vring_used_flags_set_bit(vq, VRING_USED_F_NO_NOTIFY);
1257    }
1258    if (enable) {
1259        /* Expose avail event/used flags before caller checks the avail idx. */
1260        smp_mb();
1261    }
1262}
1263
1264static void
1265virtqueue_map_desc(VuDev *dev,
1266                   unsigned int *p_num_sg, struct iovec *iov,
1267                   unsigned int max_num_sg, bool is_write,
1268                   uint64_t pa, size_t sz)
1269{
1270    unsigned num_sg = *p_num_sg;
1271
1272    assert(num_sg <= max_num_sg);
1273
1274    if (!sz) {
1275        vu_panic(dev, "virtio: zero sized buffers are not allowed");
1276        return;
1277    }
1278
1279    iov[num_sg].iov_base = vu_gpa_to_va(dev, pa);
1280    iov[num_sg].iov_len = sz;
1281    num_sg++;
1282
1283    *p_num_sg = num_sg;
1284}
1285
1286/* Round number down to multiple */
1287#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
1288
1289/* Round number up to multiple */
1290#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
1291
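     /* Allocate a VuVirtqElement followed, in the same allocation, by its
      * in_sg[] and out_sg[] arrays, suitably aligned. */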
1292static void *
1293virtqueue_alloc_element(size_t sz,
 1294                        unsigned out_num, unsigned in_num)
1295{
1296    VuVirtqElement *elem;
1297    size_t in_sg_ofs = ALIGN_UP(sz, __alignof__(elem->in_sg[0]));
1298    size_t out_sg_ofs = in_sg_ofs + in_num * sizeof(elem->in_sg[0]);
1299    size_t out_sg_end = out_sg_ofs + out_num * sizeof(elem->out_sg[0]);
1300
1301    assert(sz >= sizeof(VuVirtqElement));
1302    elem = malloc(out_sg_end);
1303    elem->out_num = out_num;
1304    elem->in_num = in_num;
1305    elem->in_sg = (void *)elem + in_sg_ofs;
1306    elem->out_sg = (void *)elem + out_sg_ofs;
1307    return elem;
1308}
1309
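     /* Pop the next available descriptor chain and map it into a newly
      * allocated element of at least sz bytes (sz must be at least
      * sizeof(VuVirtqElement)).  Returns NULL when the queue is empty or
      * the device is broken; the caller releases the element with free(). */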
1310void *
1311vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
1312{
1313    unsigned int i, head, max;
1314    VuVirtqElement *elem;
1315    unsigned out_num, in_num;
1316    struct iovec iov[VIRTQUEUE_MAX_SIZE];
1317    struct vring_desc *desc;
1318    int rc;
1319
1320    if (unlikely(dev->broken) ||
1321        unlikely(!vq->vring.avail)) {
1322        return NULL;
1323    }
1324
1325    if (vu_queue_empty(dev, vq)) {
1326        return NULL;
1327    }
1328    /* Needed after virtio_queue_empty(), see comment in
1329     * virtqueue_num_heads(). */
1330    smp_rmb();
1331
1332    /* When we start there are none of either input nor output. */
1333    out_num = in_num = 0;
1334
1335    max = vq->vring.num;
1336    if (vq->inuse >= vq->vring.num) {
1337        vu_panic(dev, "Virtqueue size exceeded");
1338        return NULL;
1339    }
1340
1341    if (!virtqueue_get_head(dev, vq, vq->last_avail_idx++, &head)) {
1342        return NULL;
1343    }
1344
1345    if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
1346        vring_set_avail_event(vq, vq->last_avail_idx);
1347    }
1348
1349    i = head;
1350    desc = vq->vring.desc;
1351    if (desc[i].flags & VRING_DESC_F_INDIRECT) {
1352        if (desc[i].len % sizeof(struct vring_desc)) {
1353            vu_panic(dev, "Invalid size for indirect buffer table");
1354        }
1355
1356        /* loop over the indirect descriptor table */
1357        max = desc[i].len / sizeof(struct vring_desc);
1358        desc = vu_gpa_to_va(dev, desc[i].addr);
1359        i = 0;
1360    }
1361
1362    /* Collect all the descriptors */
1363    do {
1364        if (desc[i].flags & VRING_DESC_F_WRITE) {
1365            virtqueue_map_desc(dev, &in_num, iov + out_num,
1366                               VIRTQUEUE_MAX_SIZE - out_num, true,
1367                               desc[i].addr, desc[i].len);
1368        } else {
1369            if (in_num) {
1370                vu_panic(dev, "Incorrect order for descriptors");
1371                return NULL;
1372            }
1373            virtqueue_map_desc(dev, &out_num, iov,
1374                               VIRTQUEUE_MAX_SIZE, false,
1375                               desc[i].addr, desc[i].len);
1376        }
1377
1378        /* If we've got too many, that implies a descriptor loop. */
1379        if ((in_num + out_num) > max) {
1380            vu_panic(dev, "Looped descriptor");
1381        }
1382        rc = virtqueue_read_next_desc(dev, desc, i, max, &i);
1383    } while (rc == VIRTQUEUE_READ_DESC_MORE);
1384
1385    if (rc == VIRTQUEUE_READ_DESC_ERROR) {
1386        return NULL;
1387    }
1388
1389    /* Now copy what we have collected and mapped */
1390    elem = virtqueue_alloc_element(sz, out_num, in_num);
1391    elem->index = head;
1392    for (i = 0; i < out_num; i++) {
1393        elem->out_sg[i] = iov[i];
1394    }
1395    for (i = 0; i < in_num; i++) {
1396        elem->in_sg[i] = iov[out_num + i];
1397    }
1398
1399    vq->inuse++;
1400
1401    return elem;
1402}
1403
1404bool
1405vu_queue_rewind(VuDev *dev, VuVirtq *vq, unsigned int num)
1406{
1407    if (num > vq->inuse) {
1408        return false;
1409    }
1410    vq->last_avail_idx -= num;
1411    vq->inuse -= num;
1412    return true;
1413}
1414
1415static inline
1416void vring_used_write(VuDev *dev, VuVirtq *vq,
1417                      struct vring_used_elem *uelem, int i)
1418{
1419    struct vring_used *used = vq->vring.used;
1420
1421    used->ring[i] = *uelem;
1422    vu_log_write(dev, vq->vring.log_guest_addr +
1423                 offsetof(struct vring_used, ring[i]),
1424                 sizeof(used->ring[i]));
1425}
1426
1427
1428static void
1429vu_log_queue_fill(VuDev *dev, VuVirtq *vq,
1430                  const VuVirtqElement *elem,
1431                  unsigned int len)
1432{
1433    struct vring_desc *desc = vq->vring.desc;
1434    unsigned int i, max, min;
1435    unsigned num_bufs = 0;
1436
1437    max = vq->vring.num;
1438    i = elem->index;
1439
1440    if (desc[i].flags & VRING_DESC_F_INDIRECT) {
1441        if (desc[i].len % sizeof(struct vring_desc)) {
1442            vu_panic(dev, "Invalid size for indirect buffer table");
1443        }
1444
1445        /* loop over the indirect descriptor table */
1446        max = desc[i].len / sizeof(struct vring_desc);
1447        desc = vu_gpa_to_va(dev, desc[i].addr);
1448        i = 0;
1449    }
1450
1451    do {
1452        if (++num_bufs > max) {
1453            vu_panic(dev, "Looped descriptor");
1454            return;
1455        }
1456
1457        if (desc[i].flags & VRING_DESC_F_WRITE) {
1458            min = MIN(desc[i].len, len);
1459            vu_log_write(dev, desc[i].addr, min);
1460            len -= min;
1461        }
1462
1463    } while (len > 0 &&
1464             (virtqueue_read_next_desc(dev, desc, i, max, &i)
1465              == VIRTQUEUE_READ_DESC_MORE));
1466}
1467
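     /* Publish a completed element into the used ring at offset idx past
      * the current used index; used->idx itself is only advanced by
      * vu_queue_flush(). */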
1468void
1469vu_queue_fill(VuDev *dev, VuVirtq *vq,
1470              const VuVirtqElement *elem,
1471              unsigned int len, unsigned int idx)
1472{
1473    struct vring_used_elem uelem;
1474
1475    if (unlikely(dev->broken) ||
1476        unlikely(!vq->vring.avail)) {
1477        return;
1478    }
1479
1480    vu_log_queue_fill(dev, vq, elem, len);
1481
1482    idx = (idx + vq->used_idx) % vq->vring.num;
1483
1484    uelem.id = elem->index;
1485    uelem.len = len;
1486    vring_used_write(dev, vq, &uelem, idx);
1487}
1488
1489static inline
1490void vring_used_idx_set(VuDev *dev, VuVirtq *vq, uint16_t val)
1491{
1492    vq->vring.used->idx = val;
1493    vu_log_write(dev,
1494                 vq->vring.log_guest_addr + offsetof(struct vring_used, idx),
1495                 sizeof(vq->vring.used->idx));
1496
1497    vq->used_idx = val;
1498}
1499
1500void
1501vu_queue_flush(VuDev *dev, VuVirtq *vq, unsigned int count)
1502{
1503    uint16_t old, new;
1504
1505    if (unlikely(dev->broken) ||
1506        unlikely(!vq->vring.avail)) {
1507        return;
1508    }
1509
1510    /* Make sure buffer is written before we update index. */
1511    smp_wmb();
1512
1513    old = vq->used_idx;
1514    new = old + count;
1515    vring_used_idx_set(dev, vq, new);
1516    vq->inuse -= count;
1517    if (unlikely((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old))) {
1518        vq->signalled_used_valid = false;
1519    }
1520}
1521
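     /* Convenience helper: fill one element and flush it immediately. */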
1522void
1523vu_queue_push(VuDev *dev, VuVirtq *vq,
1524              const VuVirtqElement *elem, unsigned int len)
1525{
1526    vu_queue_fill(dev, vq, elem, len, 0);
1527    vu_queue_flush(dev, vq, 1);
1528}
1529