qemu/hw/virtio/vhost.c
/*
 * vhost support
 *
 * Copyright Red Hat, Inc. 2010
 *
 * Authors:
 *  Michael S. Tsirkin <mst@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "hw/virtio/vhost.h"
#include "qemu/atomic.h"
#include "qemu/range.h"
#include "qemu/error-report.h"
#include "qemu/memfd.h"
#include "standard-headers/linux/vhost_types.h"
#include "hw/virtio/virtio-bus.h"
#include "hw/virtio/virtio-access.h"
#include "migration/blocker.h"
#include "migration/qemu-file-types.h"
#include "sysemu/dma.h"
#include "sysemu/tcg.h"
#include "trace.h"

/* enabled until disconnected backend stabilizes */
#define _VHOST_DEBUG 1

#ifdef _VHOST_DEBUG
#define VHOST_OPS_DEBUG(fmt, ...) \
    do { error_report(fmt ": %s (%d)", ## __VA_ARGS__, \
                      strerror(errno), errno); } while (0)
#else
#define VHOST_OPS_DEBUG(fmt, ...) \
    do { } while (0)
#endif

static struct vhost_log *vhost_log;
static struct vhost_log *vhost_log_shm;

static unsigned int used_memslots;
static QLIST_HEAD(, vhost_dev) vhost_devices =
    QLIST_HEAD_INITIALIZER(vhost_devices);

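/* Return true if at least one more vhost memory slot is available, i.e.
 * the number of memslots currently in use is below the smallest limit
 * advertised by any registered vhost backend. */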
bool vhost_has_free_slot(void)
{
    unsigned int slots_limit = ~0U;
    struct vhost_dev *hdev;

    QLIST_FOREACH(hdev, &vhost_devices, entry) {
        unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev);
        slots_limit = MIN(slots_limit, r);
    }
    return slots_limit > used_memslots;
}

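/* Sync the dirty log for the overlap of the memory section range
 * [mfirst, mlast] and the region/ring range [rfirst, rlast]: each log
 * chunk is fetched and cleared atomically, and every set bit marks one
 * VHOST_LOG_PAGE as dirty in the section's MemoryRegion. */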
static void vhost_dev_sync_region(struct vhost_dev *dev,
                                  MemoryRegionSection *section,
                                  uint64_t mfirst, uint64_t mlast,
                                  uint64_t rfirst, uint64_t rlast)
{
    vhost_log_chunk_t *log = dev->log->log;

    uint64_t start = MAX(mfirst, rfirst);
    uint64_t end = MIN(mlast, rlast);
    vhost_log_chunk_t *from = log + start / VHOST_LOG_CHUNK;
    vhost_log_chunk_t *to = log + end / VHOST_LOG_CHUNK + 1;
    uint64_t addr = QEMU_ALIGN_DOWN(start, VHOST_LOG_CHUNK);

    if (end < start) {
        return;
    }
    assert(end / VHOST_LOG_CHUNK < dev->log_size);
    assert(start / VHOST_LOG_CHUNK < dev->log_size);

    for (;from < to; ++from) {
        vhost_log_chunk_t log;
        /* We first check with non-atomic: much cheaper,
         * and we expect non-dirty to be the common case. */
        if (!*from) {
            addr += VHOST_LOG_CHUNK;
            continue;
        }
        /* Data must be read atomically. We don't really need barrier semantics
         * but it's easier to use atomic_* than roll our own. */
        log = qatomic_xchg(from, 0);
        while (log) {
            int bit = ctzl(log);
            hwaddr page_addr;
            hwaddr section_offset;
            hwaddr mr_offset;
            page_addr = addr + bit * VHOST_LOG_PAGE;
            section_offset = page_addr - section->offset_within_address_space;
            mr_offset = section_offset + section->offset_within_region;
            memory_region_set_dirty(section->mr, mr_offset, VHOST_LOG_PAGE);
            log &= ~(0x1ull << bit);
        }
        addr += VHOST_LOG_CHUNK;
    }
}

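/* Sync dirty pages logged by the backend into QEMU's dirty bitmap for
 * the part of @section that falls within [first, last].  Both the
 * guest memory regions and the used rings (which the backend may log
 * by physical address) are covered.  No-op unless logging is enabled
 * and the device has been started. */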
static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
                                   MemoryRegionSection *section,
                                   hwaddr first,
                                   hwaddr last)
{
    int i;
    hwaddr start_addr;
    hwaddr end_addr;

    if (!dev->log_enabled || !dev->started) {
        return 0;
    }
    start_addr = section->offset_within_address_space;
    end_addr = range_get_last(start_addr, int128_get64(section->size));
    start_addr = MAX(first, start_addr);
    end_addr = MIN(last, end_addr);

    for (i = 0; i < dev->mem->nregions; ++i) {
        struct vhost_memory_region *reg = dev->mem->regions + i;
        vhost_dev_sync_region(dev, section, start_addr, end_addr,
                              reg->guest_phys_addr,
                              range_get_last(reg->guest_phys_addr,
                                             reg->memory_size));
    }
    for (i = 0; i < dev->nvqs; ++i) {
        struct vhost_virtqueue *vq = dev->vqs + i;

        if (!vq->used_phys && !vq->used_size) {
            continue;
        }

        vhost_dev_sync_region(dev, section, start_addr, end_addr, vq->used_phys,
                              range_get_last(vq->used_phys, vq->used_size));
    }
    return 0;
}

static void vhost_log_sync(MemoryListener *listener,
                          MemoryRegionSection *section)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL);
}

static void vhost_log_sync_range(struct vhost_dev *dev,
                                 hwaddr first, hwaddr last)
{
    int i;
    /* FIXME: this is N^2 in number of sections */
    for (i = 0; i < dev->n_mem_sections; ++i) {
        MemoryRegionSection *section = &dev->mem_sections[i];
        vhost_sync_dirty_bitmap(dev, section, first, last);
    }
}

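/* Compute the log size (in chunks) needed to cover the highest guest
 * physical address of any memory region currently in the table. */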
static uint64_t vhost_get_log_size(struct vhost_dev *dev)
{
    uint64_t log_size = 0;
    int i;
    for (i = 0; i < dev->mem->nregions; ++i) {
        struct vhost_memory_region *reg = dev->mem->regions + i;
        uint64_t last = range_get_last(reg->guest_phys_addr,
                                       reg->memory_size);
        log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
    }
    return log_size;
}

static int vhost_set_backend_type(struct vhost_dev *dev,
                                  VhostBackendType backend_type)
{
    int r = 0;

    switch (backend_type) {
#ifdef CONFIG_VHOST_KERNEL
    case VHOST_BACKEND_TYPE_KERNEL:
        dev->vhost_ops = &kernel_ops;
        break;
#endif
#ifdef CONFIG_VHOST_USER
    case VHOST_BACKEND_TYPE_USER:
        dev->vhost_ops = &user_ops;
        break;
#endif
#ifdef CONFIG_VHOST_VDPA
    case VHOST_BACKEND_TYPE_VDPA:
        dev->vhost_ops = &vdpa_ops;
        break;
#endif
    default:
        error_report("Unknown vhost backend type");
        r = -1;
    }

    return r;
}

static struct vhost_log *vhost_log_alloc(uint64_t size, bool share)
{
    Error *err = NULL;
    struct vhost_log *log;
    uint64_t logsize = size * sizeof(*(log->log));
    int fd = -1;

    log = g_new0(struct vhost_log, 1);
    if (share) {
        log->log = qemu_memfd_alloc("vhost-log", logsize,
                                    F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
                                    &fd, &err);
        if (err) {
            error_report_err(err);
            g_free(log);
            return NULL;
        }
        memset(log->log, 0, logsize);
    } else {
        log->log = g_malloc0(logsize);
    }

    log->size = size;
    log->refcnt = 1;
    log->fd = fd;

    return log;
}

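/* Return a reference to the global (shared or private) log, allocating
 * a new one if none exists yet or if the requested size differs from
 * the current one.  References are dropped with vhost_log_put(). */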
static struct vhost_log *vhost_log_get(uint64_t size, bool share)
{
    struct vhost_log *log = share ? vhost_log_shm : vhost_log;

    if (!log || log->size != size) {
        log = vhost_log_alloc(size, share);
        if (share) {
            vhost_log_shm = log;
        } else {
            vhost_log = log;
        }
    } else {
        ++log->refcnt;
    }

    return log;
}

static void vhost_log_put(struct vhost_dev *dev, bool sync)
{
    struct vhost_log *log = dev->log;

    if (!log) {
        return;
    }

    --log->refcnt;
    if (log->refcnt == 0) {
        /* Sync only the range covered by the old log */
        if (dev->log_size && sync) {
            vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1);
        }

        if (vhost_log == log) {
            g_free(log->log);
            vhost_log = NULL;
        } else if (vhost_log_shm == log) {
            qemu_memfd_free(log->log, log->size * sizeof(*(log->log)),
                            log->fd);
            vhost_log_shm = NULL;
        }

        g_free(log);
    }

    dev->log = NULL;
    dev->log_size = 0;
}

static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
{
    return dev->vhost_ops->vhost_requires_shm_log &&
           dev->vhost_ops->vhost_requires_shm_log(dev);
}

static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
{
    struct vhost_log *log = vhost_log_get(size, vhost_dev_log_is_shared(dev));
    uint64_t log_base = (uintptr_t)log->log;
    int r;

    /* Inform the backend of the log switch; this must be done before
       releasing the current log, to ensure no logging is lost. */
    r = dev->vhost_ops->vhost_set_log_base(dev, log_base, log);
    if (r < 0) {
        VHOST_OPS_DEBUG("vhost_set_log_base failed");
    }

    vhost_log_put(dev, true);
    dev->log = log;
    dev->log_size = size;
}

static int vhost_dev_has_iommu(struct vhost_dev *dev)
{
    VirtIODevice *vdev = dev->vdev;

    /*
     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend supports the
     * incremental memory mapping API via the IOTLB API. For a platform
     * that does not have an IOMMU, there is no need to enable this
     * feature, which may cause unnecessary IOTLB miss/update transactions.
     */
    return virtio_bus_device_iommu_enabled(vdev) &&
           virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
}

static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,
                              hwaddr *plen, bool is_write)
{
    if (!vhost_dev_has_iommu(dev)) {
        return cpu_physical_memory_map(addr, plen, is_write);
    } else {
        return (void *)(uintptr_t)addr;
    }
}

static void vhost_memory_unmap(struct vhost_dev *dev, void *buffer,
                               hwaddr len, int is_write,
                               hwaddr access_len)
{
    if (!vhost_dev_has_iommu(dev)) {
        cpu_physical_memory_unmap(buffer, len, is_write, access_len);
    }
}

static int vhost_verify_ring_part_mapping(void *ring_hva,
                                          uint64_t ring_gpa,
                                          uint64_t ring_size,
                                          void *reg_hva,
                                          uint64_t reg_gpa,
                                          uint64_t reg_size)
{
    uint64_t hva_ring_offset;
    uint64_t ring_last = range_get_last(ring_gpa, ring_size);
    uint64_t reg_last = range_get_last(reg_gpa, reg_size);

    if (ring_last < reg_gpa || ring_gpa > reg_last) {
        return 0;
    }
    /* check that the whole ring is mapped */
    if (ring_last > reg_last) {
        return -ENOMEM;
    }
    /* check that the ring's MemoryRegion wasn't replaced */
    hva_ring_offset = ring_gpa - reg_gpa;
    if (ring_hva != reg_hva + hva_ring_offset) {
        return -EBUSY;
    }

    return 0;
}

static int vhost_verify_ring_mappings(struct vhost_dev *dev,
                                      void *reg_hva,
                                      uint64_t reg_gpa,
                                      uint64_t reg_size)
{
    int i, j;
    int r = 0;
    const char *part_name[] = {
        "descriptor table",
        "available ring",
        "used ring"
    };

    if (vhost_dev_has_iommu(dev)) {
        return 0;
    }

    for (i = 0; i < dev->nvqs; ++i) {
        struct vhost_virtqueue *vq = dev->vqs + i;

        if (vq->desc_phys == 0) {
            continue;
        }

        j = 0;
        r = vhost_verify_ring_part_mapping(
                vq->desc, vq->desc_phys, vq->desc_size,
                reg_hva, reg_gpa, reg_size);
        if (r) {
            break;
        }

        j++;
        r = vhost_verify_ring_part_mapping(
                vq->avail, vq->avail_phys, vq->avail_size,
                reg_hva, reg_gpa, reg_size);
        if (r) {
            break;
        }

        j++;
        r = vhost_verify_ring_part_mapping(
                vq->used, vq->used_phys, vq->used_size,
                reg_hva, reg_gpa, reg_size);
        if (r) {
            break;
        }
    }

    if (r == -ENOMEM) {
        error_report("Unable to map %s for ring %d", part_name[j], i);
    } else if (r == -EBUSY) {
        error_report("%s relocated for ring %d", part_name[j], i);
    }
    return r;
}

/*
 * vhost_section: identify sections needed for vhost access
 *
 * We only care about RAM sections here (where virtqueue and guest
 * internals accessed by virtio might live). If we find one we still
 * allow the backend to potentially filter it out of our list.
 */
static bool vhost_section(struct vhost_dev *dev, MemoryRegionSection *section)
{
    MemoryRegion *mr = section->mr;

    if (memory_region_is_ram(mr) && !memory_region_is_rom(mr)) {
        uint8_t dirty_mask = memory_region_get_dirty_log_mask(mr);
        uint8_t handled_dirty;

        /*
         * Kernel-based vhost doesn't handle any block which is doing
         * dirty tracking other than migration, for which it has
         * specific logging support. For TCG the kernel never gets
         * involved anyway, so we can also ignore its self-modifying
         * code detection flags. However, a vhost-user client could
         * still confuse a TCG guest if it re-writes executable memory
         * that has already been translated.
         */
        handled_dirty = (1 << DIRTY_MEMORY_MIGRATION) |
            (1 << DIRTY_MEMORY_CODE);

        if (dirty_mask & ~handled_dirty) {
            trace_vhost_reject_section(mr->name, 1);
            return false;
        }

        if (dev->vhost_ops->vhost_backend_mem_section_filter &&
            !dev->vhost_ops->vhost_backend_mem_section_filter(dev, section)) {
            trace_vhost_reject_section(mr->name, 2);
            return false;
        }

        trace_vhost_section(mr->name);
        return true;
    } else {
        trace_vhost_reject_section(mr->name, 3);
        return false;
    }
}

static void vhost_begin(MemoryListener *listener)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    dev->tmp_sections = NULL;
    dev->n_tmp_sections = 0;
}

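/* Called by the memory listener once a batch of section updates is
 * complete: swap in the section list built up by vhost_region_addnop(),
 * and, if it actually changed, rebuild the vhost memory region table,
 * resize the dirty log if needed and push the new table to the backend
 * with vhost_set_mem_table(). */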
static void vhost_commit(MemoryListener *listener)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    MemoryRegionSection *old_sections;
    int n_old_sections;
    uint64_t log_size;
    size_t regions_size;
    int r;
    int i;
    bool changed = false;

    /* Note we can be called before the device is started, but then
     * starting the device calls set_mem_table, so we need to have
     * built the data structures.
     */
    old_sections = dev->mem_sections;
    n_old_sections = dev->n_mem_sections;
    dev->mem_sections = dev->tmp_sections;
    dev->n_mem_sections = dev->n_tmp_sections;

    if (dev->n_mem_sections != n_old_sections) {
        changed = true;
    } else {
        /* Same size, let's check the contents */
        for (int i = 0; i < n_old_sections; i++) {
            if (!MemoryRegionSection_eq(&old_sections[i],
                                        &dev->mem_sections[i])) {
                changed = true;
                break;
            }
        }
    }

    trace_vhost_commit(dev->started, changed);
    if (!changed) {
        goto out;
    }

    /* Rebuild the regions list from the new sections list */
    regions_size = offsetof(struct vhost_memory, regions) +
                       dev->n_mem_sections * sizeof dev->mem->regions[0];
    dev->mem = g_realloc(dev->mem, regions_size);
    dev->mem->nregions = dev->n_mem_sections;
    used_memslots = dev->mem->nregions;
    for (i = 0; i < dev->n_mem_sections; i++) {
        struct vhost_memory_region *cur_vmr = dev->mem->regions + i;
        struct MemoryRegionSection *mrs = dev->mem_sections + i;

        cur_vmr->guest_phys_addr = mrs->offset_within_address_space;
        cur_vmr->memory_size     = int128_get64(mrs->size);
        cur_vmr->userspace_addr  =
            (uintptr_t)memory_region_get_ram_ptr(mrs->mr) +
            mrs->offset_within_region;
        cur_vmr->flags_padding   = 0;
    }

    if (!dev->started) {
        goto out;
    }

    for (i = 0; i < dev->mem->nregions; i++) {
        if (vhost_verify_ring_mappings(dev,
                       (void *)(uintptr_t)dev->mem->regions[i].userspace_addr,
                       dev->mem->regions[i].guest_phys_addr,
                       dev->mem->regions[i].memory_size)) {
            error_report("Verify ring failure on region %d", i);
            abort();
        }
    }

    if (!dev->log_enabled) {
        r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
        if (r < 0) {
            VHOST_OPS_DEBUG("vhost_set_mem_table failed");
        }
        goto out;
    }
    log_size = vhost_get_log_size(dev);
    /* We allocate an extra 4K bytes of log
     * to reduce the number of reallocations. */
#define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log)
    /* To log more, must increase log size before table update. */
    if (dev->log_size < log_size) {
        vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER);
    }
    r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
    if (r < 0) {
        VHOST_OPS_DEBUG("vhost_set_mem_table failed");
    }
    /* To log less, can only decrease log size after table update. */
    if (dev->log_size > log_size + VHOST_LOG_BUFFER) {
        vhost_dev_log_resize(dev, log_size);
    }

out:
    /* Deref the old list of sections, this must happen _after_ the
     * vhost_set_mem_table to ensure the client isn't still using the
     * section we're about to unref.
     */
    while (n_old_sections--) {
        memory_region_unref(old_sections[n_old_sections].mr);
    }
    g_free(old_sections);
    return;
}

/* Adds the section data to the tmp_sections structure.
 * It relies on the listener calling us in memory address order,
 * and on being called for each region (via the _add and _nop
 * methods), so that neighbouring sections can be joined.
 */
static void vhost_region_add_section(struct vhost_dev *dev,
                                     MemoryRegionSection *section)
{
    bool need_add = true;
    uint64_t mrs_size = int128_get64(section->size);
    uint64_t mrs_gpa = section->offset_within_address_space;
    uintptr_t mrs_host = (uintptr_t)memory_region_get_ram_ptr(section->mr) +
                         section->offset_within_region;
    RAMBlock *mrs_rb = section->mr->ram_block;

    trace_vhost_region_add_section(section->mr->name, mrs_gpa, mrs_size,
                                   mrs_host);

    if (dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER) {
        /* Round the section to its page size */
        /* First align the start down to a page boundary */
        size_t mrs_page = qemu_ram_pagesize(mrs_rb);
        uint64_t alignage = mrs_host & (mrs_page - 1);
        if (alignage) {
            mrs_host -= alignage;
            mrs_size += alignage;
            mrs_gpa  -= alignage;
        }
        /* Now align the size up to a page boundary */
        alignage = mrs_size & (mrs_page - 1);
        if (alignage) {
            mrs_size += mrs_page - alignage;
        }
        trace_vhost_region_add_section_aligned(section->mr->name, mrs_gpa,
                                               mrs_size, mrs_host);
    }

    if (dev->n_tmp_sections) {
        /* Since we already have at least one section, let's see if
         * this extends it; since we're scanning in order, we only
         * have to look at the last one, and the FlatView that calls
         * us shouldn't have overlaps.
         */
        MemoryRegionSection *prev_sec = dev->tmp_sections +
                                               (dev->n_tmp_sections - 1);
        uint64_t prev_gpa_start = prev_sec->offset_within_address_space;
        uint64_t prev_size = int128_get64(prev_sec->size);
        uint64_t prev_gpa_end   = range_get_last(prev_gpa_start, prev_size);
        uint64_t prev_host_start =
                        (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr) +
                        prev_sec->offset_within_region;
        uint64_t prev_host_end   = range_get_last(prev_host_start, prev_size);

        if (mrs_gpa <= (prev_gpa_end + 1)) {
            /* OK, looks like overlapping/intersecting - it's possible that
             * the rounding to page sizes has made them overlap, but they should
             * match up in the same RAMBlock if they do.
             */
            if (mrs_gpa < prev_gpa_start) {
                error_report("%s: Section '%s' rounded to %"PRIx64
                             " prior to previous '%s' %"PRIx64,
                             __func__, section->mr->name, mrs_gpa,
                             prev_sec->mr->name, prev_gpa_start);
                /* A way to cleanly fail here would be better */
                return;
            }
            /* Offset from the start of the previous GPA to this GPA */
            size_t offset = mrs_gpa - prev_gpa_start;

            if (prev_host_start + offset == mrs_host &&
                section->mr == prev_sec->mr &&
                (!dev->vhost_ops->vhost_backend_can_merge ||
                 dev->vhost_ops->vhost_backend_can_merge(dev,
                    mrs_host, mrs_size,
                    prev_host_start, prev_size))) {
                uint64_t max_end = MAX(prev_host_end, mrs_host + mrs_size);
                need_add = false;
                prev_sec->offset_within_address_space =
                    MIN(prev_gpa_start, mrs_gpa);
                prev_sec->offset_within_region =
                    MIN(prev_host_start, mrs_host) -
                    (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr);
                prev_sec->size = int128_make64(max_end - MIN(prev_host_start,
                                               mrs_host));
                trace_vhost_region_add_section_merge(section->mr->name,
                                        int128_get64(prev_sec->size),
                                        prev_sec->offset_within_address_space,
                                        prev_sec->offset_within_region);
            } else {
                /* adjoining regions are fine, but overlapping ones with
                 * different blocks/offsets shouldn't happen
                 */
                if (mrs_gpa != prev_gpa_end + 1) {
                    error_report("%s: Overlapping but not coherent sections "
                                 "at %"PRIx64,
                                 __func__, mrs_gpa);
                    return;
                }
            }
        }
    }

    if (need_add) {
        ++dev->n_tmp_sections;
        dev->tmp_sections = g_renew(MemoryRegionSection, dev->tmp_sections,
                                    dev->n_tmp_sections);
        dev->tmp_sections[dev->n_tmp_sections - 1] = *section;
        /* The flatview isn't stable and we don't use it; making it NULL
         * means we can memcmp the list.
         */
        dev->tmp_sections[dev->n_tmp_sections - 1].fv = NULL;
        memory_region_ref(section->mr);
    }
}

/* Used for both add and nop callbacks */
static void vhost_region_addnop(MemoryListener *listener,
                                MemoryRegionSection *section)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);

    if (!vhost_section(dev, section)) {
        return;
    }
    vhost_region_add_section(dev, section);
}

static void vhost_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
{
    struct vhost_iommu *iommu = container_of(n, struct vhost_iommu, n);
    struct vhost_dev *hdev = iommu->hdev;
    hwaddr iova = iotlb->iova + iommu->iommu_offset;

    if (vhost_backend_invalidate_device_iotlb(hdev, iova,
                                              iotlb->addr_mask + 1)) {
        error_report("Failed to invalidate device iotlb");
    }
}

static void vhost_iommu_region_add(MemoryListener *listener,
                                   MemoryRegionSection *section)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         iommu_listener);
    struct vhost_iommu *iommu;
    Int128 end;
    int iommu_idx;
    IOMMUMemoryRegion *iommu_mr;
    int ret;

    if (!memory_region_is_iommu(section->mr)) {
        return;
    }

    iommu_mr = IOMMU_MEMORY_REGION(section->mr);

    iommu = g_malloc0(sizeof(*iommu));
    end = int128_add(int128_make64(section->offset_within_region),
                     section->size);
    end = int128_sub(end, int128_one());
    iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
                                                   MEMTXATTRS_UNSPECIFIED);
    iommu_notifier_init(&iommu->n, vhost_iommu_unmap_notify,
                        IOMMU_NOTIFIER_DEVIOTLB_UNMAP,
                        section->offset_within_region,
                        int128_get64(end),
                        iommu_idx);
    iommu->mr = section->mr;
    iommu->iommu_offset = section->offset_within_address_space -
                          section->offset_within_region;
    iommu->hdev = dev;
    ret = memory_region_register_iommu_notifier(section->mr, &iommu->n, NULL);
    if (ret) {
        /*
         * Some vIOMMUs do not support dev-iotlb yet.  If so, try to use the
         * UNMAP legacy message
         */
        iommu->n.notifier_flags = IOMMU_NOTIFIER_UNMAP;
        memory_region_register_iommu_notifier(section->mr, &iommu->n,
                                              &error_fatal);
    }
    QLIST_INSERT_HEAD(&dev->iommu_list, iommu, iommu_next);
    /* TODO: can replay help performance here? */
}

static void vhost_iommu_region_del(MemoryListener *listener,
                                   MemoryRegionSection *section)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         iommu_listener);
    struct vhost_iommu *iommu;

    if (!memory_region_is_iommu(section->mr)) {
        return;
    }

    QLIST_FOREACH(iommu, &dev->iommu_list, iommu_next) {
        if (iommu->mr == section->mr &&
            iommu->n.start == section->offset_within_region) {
            memory_region_unregister_iommu_notifier(iommu->mr,
                                                    &iommu->n);
            QLIST_REMOVE(iommu, iommu_next);
            g_free(iommu);
            break;
        }
    }
}

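/* Tell the backend where the descriptor table, available ring and used
 * ring of virtqueue @idx live in user (or IOVA) space, and whether
 * dirty logging of the used ring should be enabled. */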
static int vhost_virtqueue_set_addr(struct vhost_dev *dev,
                                    struct vhost_virtqueue *vq,
                                    unsigned idx, bool enable_log)
{
    struct vhost_vring_addr addr;
    int r;
    memset(&addr, 0, sizeof(struct vhost_vring_addr));

    if (dev->vhost_ops->vhost_vq_get_addr) {
        r = dev->vhost_ops->vhost_vq_get_addr(dev, &addr, vq);
        if (r < 0) {
            VHOST_OPS_DEBUG("vhost_vq_get_addr failed");
            return -errno;
        }
    } else {
        addr.desc_user_addr = (uint64_t)(unsigned long)vq->desc;
        addr.avail_user_addr = (uint64_t)(unsigned long)vq->avail;
        addr.used_user_addr = (uint64_t)(unsigned long)vq->used;
    }
    addr.index = idx;
    addr.log_guest_addr = vq->used_phys;
    addr.flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0;
    r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);
    if (r < 0) {
        VHOST_OPS_DEBUG("vhost_set_vring_addr failed");
        return -errno;
    }
    return 0;
}

static int vhost_dev_set_features(struct vhost_dev *dev,
                                  bool enable_log)
{
    uint64_t features = dev->acked_features;
    int r;
    if (enable_log) {
        features |= 0x1ULL << VHOST_F_LOG_ALL;
    }
    if (!vhost_dev_has_iommu(dev)) {
        features &= ~(0x1ULL << VIRTIO_F_IOMMU_PLATFORM);
    }
    if (dev->vhost_ops->vhost_force_iommu) {
        if (dev->vhost_ops->vhost_force_iommu(dev) == true) {
            features |= 0x1ULL << VIRTIO_F_IOMMU_PLATFORM;
        }
    }
    r = dev->vhost_ops->vhost_set_features(dev, features);
    if (r < 0) {
        VHOST_OPS_DEBUG("vhost_set_features failed");
        goto out;
    }
    if (dev->vhost_ops->vhost_set_backend_cap) {
        r = dev->vhost_ops->vhost_set_backend_cap(dev);
        if (r < 0) {
            VHOST_OPS_DEBUG("vhost_set_backend_cap failed");
            goto out;
        }
    }

out:
    return r < 0 ? -errno : 0;
}

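/* Toggle VHOST_F_LOG_ALL in the backend and re-program the ring
 * addresses of every started virtqueue with the new logging flag.  On
 * failure the previous logging state is restored. */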
static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
{
    int r, i, idx;
    hwaddr addr;

    r = vhost_dev_set_features(dev, enable_log);
    if (r < 0) {
        goto err_features;
    }
    for (i = 0; i < dev->nvqs; ++i) {
        idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
        addr = virtio_queue_get_desc_addr(dev->vdev, idx);
        if (!addr) {
            /*
             * The queue might not be ready for start. If this
             * is the case there is no reason to continue the process.
             * Similar logic is used by the vhost_virtqueue_start()
             * routine.
             */
            continue;
        }
        r = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
                                     enable_log);
        if (r < 0) {
            goto err_vq;
        }
    }
    return 0;
err_vq:
    for (; i >= 0; --i) {
        idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
        vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
                                 dev->log_enabled);
    }
    vhost_dev_set_features(dev, dev->log_enabled);
err_features:
    return r;
}

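/* Enable or disable dirty logging for migration.  When enabling, a log
 * large enough for the current memory map is allocated first; when
 * disabling, the log is dropped after the backend has been told to stop
 * logging.  Takes care of vhost-user backends disconnecting midway. */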
static int vhost_migration_log(MemoryListener *listener, bool enable)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    int r;
    if (enable == dev->log_enabled) {
        return 0;
    }
    if (!dev->started) {
        dev->log_enabled = enable;
        return 0;
    }

    r = 0;
    if (!enable) {
        r = vhost_dev_set_log(dev, false);
        if (r < 0) {
            goto check_dev_state;
        }
        vhost_log_put(dev, false);
    } else {
        vhost_dev_log_resize(dev, vhost_get_log_size(dev));
        r = vhost_dev_set_log(dev, true);
        if (r < 0) {
            goto check_dev_state;
        }
    }

check_dev_state:
    dev->log_enabled = enable;
    /*
     * vhost-user-* devices could change their state during log
     * initialization due to disconnect. So check dev state after
     * vhost communication.
     */
    if (!dev->started) {
        /*
         * Since the device is in the stopped state, it is okay for
         * migration. Return success.
         */
        r = 0;
    }
    if (r) {
        /* An error occurred. */
        dev->log_enabled = false;
    }

    return r;
}

static void vhost_log_global_start(MemoryListener *listener)
{
    int r;

    r = vhost_migration_log(listener, true);
    if (r < 0) {
        abort();
    }
}

static void vhost_log_global_stop(MemoryListener *listener)
{
    int r;

    r = vhost_migration_log(listener, false);
    if (r < 0) {
        abort();
    }
}

static void vhost_log_start(MemoryListener *listener,
                            MemoryRegionSection *section,
                            int old, int new)
{
    /* FIXME: implement */
}

static void vhost_log_stop(MemoryListener *listener,
                           MemoryRegionSection *section,
                           int old, int new)
{
    /* FIXME: implement */
}

/* The vhost driver natively knows how to handle the vrings of non
 * cross-endian legacy devices and modern devices. Only legacy devices
 * exposed to a bi-endian guest may require the vhost driver to use a
 * specific endianness.
 */
static inline bool vhost_needs_vring_endian(VirtIODevice *vdev)
{
    if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
        return false;
    }
#ifdef HOST_WORDS_BIGENDIAN
    return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_LITTLE;
#else
    return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_BIG;
#endif
}

static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev,
                                                   bool is_big_endian,
                                                   int vhost_vq_index)
{
    struct vhost_vring_state s = {
        .index = vhost_vq_index,
        .num = is_big_endian
    };

    if (!dev->vhost_ops->vhost_set_vring_endian(dev, &s)) {
        return 0;
    }

    VHOST_OPS_DEBUG("vhost_set_vring_endian failed");
    if (errno == ENOTTY) {
        error_report("vhost does not support cross-endian");
        return -ENOSYS;
    }

    return -errno;
}

static int vhost_memory_region_lookup(struct vhost_dev *hdev,
                                      uint64_t gpa, uint64_t *uaddr,
                                      uint64_t *len)
{
    int i;

    for (i = 0; i < hdev->mem->nregions; i++) {
        struct vhost_memory_region *reg = hdev->mem->regions + i;

        if (gpa >= reg->guest_phys_addr &&
            reg->guest_phys_addr + reg->memory_size > gpa) {
            *uaddr = reg->userspace_addr + gpa - reg->guest_phys_addr;
            *len = reg->guest_phys_addr + reg->memory_size - gpa;
            return 0;
        }
    }

    return -EFAULT;
}

int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write)
{
    IOMMUTLBEntry iotlb;
    uint64_t uaddr, len;
    int ret = -EFAULT;

    RCU_READ_LOCK_GUARD();

    trace_vhost_iotlb_miss(dev, 1);

    iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
                                          iova, write,
                                          MEMTXATTRS_UNSPECIFIED);
    if (iotlb.target_as != NULL) {
        ret = vhost_memory_region_lookup(dev, iotlb.translated_addr,
                                         &uaddr, &len);
        if (ret) {
            trace_vhost_iotlb_miss(dev, 3);
            error_report("Failed to look up the translated address "
                         "%"PRIx64, iotlb.translated_addr);
            goto out;
        }

        len = MIN(iotlb.addr_mask + 1, len);
        iova = iova & ~iotlb.addr_mask;

        ret = vhost_backend_update_device_iotlb(dev, iova, uaddr,
                                                len, iotlb.perm);
        if (ret) {
            trace_vhost_iotlb_miss(dev, 4);
            error_report("Failed to update device iotlb");
            goto out;
        }
    }

    trace_vhost_iotlb_miss(dev, 2);

out:
    return ret;
}

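/* Start virtqueue @idx in the backend: program the ring size, the last
 * avail index and (for legacy cross-endian guests) the ring endianness,
 * map the descriptor/avail/used rings, hand their addresses to the
 * backend and wire up the kick (host notifier) eventfd. */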
static int vhost_virtqueue_start(struct vhost_dev *dev,
                                struct VirtIODevice *vdev,
                                struct vhost_virtqueue *vq,
                                unsigned idx)
{
    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
    VirtioBusState *vbus = VIRTIO_BUS(qbus);
    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
    hwaddr s, l, a;
    int r;
    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
    struct vhost_vring_file file = {
        .index = vhost_vq_index
    };
    struct vhost_vring_state state = {
        .index = vhost_vq_index
    };
    struct VirtQueue *vvq = virtio_get_queue(vdev, idx);

    a = virtio_queue_get_desc_addr(vdev, idx);
    if (a == 0) {
        /* Queue might not be ready for start */
        return 0;
    }

    vq->num = state.num = virtio_queue_get_num(vdev, idx);
    r = dev->vhost_ops->vhost_set_vring_num(dev, &state);
    if (r) {
        VHOST_OPS_DEBUG("vhost_set_vring_num failed");
        return -errno;
    }

    state.num = virtio_queue_get_last_avail_idx(vdev, idx);
    r = dev->vhost_ops->vhost_set_vring_base(dev, &state);
    if (r) {
        VHOST_OPS_DEBUG("vhost_set_vring_base failed");
        return -errno;
    }

    if (vhost_needs_vring_endian(vdev)) {
        r = vhost_virtqueue_set_vring_endian_legacy(dev,
                                                    virtio_is_big_endian(vdev),
                                                    vhost_vq_index);
        if (r) {
            return -errno;
        }
    }

    vq->desc_size = s = l = virtio_queue_get_desc_size(vdev, idx);
    vq->desc_phys = a;
    vq->desc = vhost_memory_map(dev, a, &l, false);
    if (!vq->desc || l != s) {
        r = -ENOMEM;
        goto fail_alloc_desc;
    }
    vq->avail_size = s = l = virtio_queue_get_avail_size(vdev, idx);
    vq->avail_phys = a = virtio_queue_get_avail_addr(vdev, idx);
    vq->avail = vhost_memory_map(dev, a, &l, false);
    if (!vq->avail || l != s) {
        r = -ENOMEM;
        goto fail_alloc_avail;
    }
    vq->used_size = s = l = virtio_queue_get_used_size(vdev, idx);
    vq->used_phys = a = virtio_queue_get_used_addr(vdev, idx);
    vq->used = vhost_memory_map(dev, a, &l, true);
    if (!vq->used || l != s) {
        r = -ENOMEM;
        goto fail_alloc_used;
    }

    r = vhost_virtqueue_set_addr(dev, vq, vhost_vq_index, dev->log_enabled);
    if (r < 0) {
        r = -errno;
        goto fail_alloc;
    }

    file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq));
    r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
    if (r) {
        VHOST_OPS_DEBUG("vhost_set_vring_kick failed");
        r = -errno;
        goto fail_kick;
    }

    /* Clear and discard previous events if any. */
    event_notifier_test_and_clear(&vq->masked_notifier);

    /* Init vring in unmasked state, unless guest_notifier_mask
     * will do it later.
     */
    if (!vdev->use_guest_notifier_mask) {
        /* TODO: check and handle errors. */
        vhost_virtqueue_mask(dev, vdev, idx, false);
    }

    if (k->query_guest_notifiers &&
        k->query_guest_notifiers(qbus->parent) &&
        virtio_queue_vector(vdev, idx) == VIRTIO_NO_VECTOR) {
        file.fd = -1;
        r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
        if (r) {
            goto fail_vector;
        }
    }

    return 0;

fail_vector:
fail_kick:
fail_alloc:
    vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
                       0, 0);
fail_alloc_used:
    vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
                       0, 0);
fail_alloc_avail:
    vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
                       0, 0);
fail_alloc_desc:
    return r;
}

static void vhost_virtqueue_stop(struct vhost_dev *dev,
                                    struct VirtIODevice *vdev,
                                    struct vhost_virtqueue *vq,
                                    unsigned idx)
{
    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
    struct vhost_vring_state state = {
        .index = vhost_vq_index,
    };
    int r;

    if (virtio_queue_get_desc_addr(vdev, idx) == 0) {
        /* Don't stop the virtqueue which might not have been started */
        return;
    }

    r = dev->vhost_ops->vhost_get_vring_base(dev, &state);
    if (r < 0) {
        VHOST_OPS_DEBUG("vhost VQ %u ring restore failed: %d", idx, r);
        /* Connection to the backend is broken, so let's sync internal
         * last avail idx to the device used idx.
         */
        virtio_queue_restore_last_avail_idx(vdev, idx);
    } else {
        virtio_queue_set_last_avail_idx(vdev, idx, state.num);
    }
    virtio_queue_invalidate_signalled_used(vdev, idx);
    virtio_queue_update_used_idx(vdev, idx);

    /* In the cross-endian case, we need to reset the vring endianness to
     * native, as legacy devices expect it to be that way by default.
     */
    if (vhost_needs_vring_endian(vdev)) {
        vhost_virtqueue_set_vring_endian_legacy(dev,
                                                !virtio_is_big_endian(vdev),
                                                vhost_vq_index);
    }

    vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
                       1, virtio_queue_get_used_size(vdev, idx));
    vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
                       0, virtio_queue_get_avail_size(vdev, idx));
    vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
                       0, virtio_queue_get_desc_size(vdev, idx));
}

static void vhost_eventfd_add(MemoryListener *listener,
                              MemoryRegionSection *section,
                              bool match_data, uint64_t data, EventNotifier *e)
{
}

static void vhost_eventfd_del(MemoryListener *listener,
                              MemoryRegionSection *section,
                              bool match_data, uint64_t data, EventNotifier *e)
{
}

static int vhost_virtqueue_set_busyloop_timeout(struct vhost_dev *dev,
                                                int n, uint32_t timeout)
{
    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
    struct vhost_vring_state state = {
        .index = vhost_vq_index,
        .num = timeout,
    };
    int r;

    if (!dev->vhost_ops->vhost_set_vring_busyloop_timeout) {
        return -EINVAL;
    }

    r = dev->vhost_ops->vhost_set_vring_busyloop_timeout(dev, &state);
    if (r) {
        VHOST_OPS_DEBUG("vhost_set_vring_busyloop_timeout failed");
        return r;
    }

    return 0;
}

static int vhost_virtqueue_init(struct vhost_dev *dev,
                                struct vhost_virtqueue *vq, int n)
{
    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
    struct vhost_vring_file file = {
        .index = vhost_vq_index,
    };
    int r = event_notifier_init(&vq->masked_notifier, 0);
    if (r < 0) {
        return r;
    }

    file.fd = event_notifier_get_fd(&vq->masked_notifier);
    r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
    if (r) {
        VHOST_OPS_DEBUG("vhost_set_vring_call failed");
        r = -errno;
        goto fail_call;
    }

    vq->dev = dev;

    return 0;
fail_call:
    event_notifier_cleanup(&vq->masked_notifier);
    return r;
}

static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq)
{
    event_notifier_cleanup(&vq->masked_notifier);
}

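/* One-time initialisation of a vhost device: pick the backend ops for
 * @backend_type, initialise the backend with @opaque (e.g. a /dev/vhost-*
 * fd or a vhost-user chardev), take ownership, query the backend feature
 * set, set up the per-virtqueue masked notifiers and register the memory
 * listeners.  On failure everything set up so far is undone via
 * vhost_dev_cleanup().
 *
 * Device frontends call this from their realize/init path.  A rough,
 * illustrative sketch only (field values depend on the frontend):
 *
 *     hdev->nvqs = 2;
 *     hdev->vqs = g_new0(struct vhost_virtqueue, hdev->nvqs);
 *     if (vhost_dev_init(hdev, opaque, VHOST_BACKEND_TYPE_KERNEL, 0,
 *                        errp) < 0) {
 *         // hdev has already been cleaned up here
 *     }
 */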
int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
                   VhostBackendType backend_type, uint32_t busyloop_timeout,
                   Error **errp)
{
    uint64_t features;
    int i, r, n_initialized_vqs = 0;

    hdev->vdev = NULL;
    hdev->migration_blocker = NULL;

    r = vhost_set_backend_type(hdev, backend_type);
    assert(r >= 0);

    r = hdev->vhost_ops->vhost_backend_init(hdev, opaque, errp);
    if (r < 0) {
        goto fail;
    }

    r = hdev->vhost_ops->vhost_set_owner(hdev);
    if (r < 0) {
        error_setg_errno(errp, -r, "vhost_set_owner failed");
        goto fail;
    }

    r = hdev->vhost_ops->vhost_get_features(hdev, &features);
    if (r < 0) {
        error_setg_errno(errp, -r, "vhost_get_features failed");
        goto fail;
    }

    for (i = 0; i < hdev->nvqs; ++i, ++n_initialized_vqs) {
        r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i);
        if (r < 0) {
            error_setg_errno(errp, -r, "Failed to initialize virtqueue %d", i);
            goto fail;
        }
    }

    if (busyloop_timeout) {
        for (i = 0; i < hdev->nvqs; ++i) {
            r = vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i,
                                                     busyloop_timeout);
            if (r < 0) {
                error_setg_errno(errp, -r, "Failed to set busyloop timeout");
                goto fail_busyloop;
            }
        }
    }

    hdev->features = features;

    hdev->memory_listener = (MemoryListener) {
        .name = "vhost",
        .begin = vhost_begin,
        .commit = vhost_commit,
        .region_add = vhost_region_addnop,
        .region_nop = vhost_region_addnop,
        .log_start = vhost_log_start,
        .log_stop = vhost_log_stop,
        .log_sync = vhost_log_sync,
        .log_global_start = vhost_log_global_start,
        .log_global_stop = vhost_log_global_stop,
        .eventfd_add = vhost_eventfd_add,
        .eventfd_del = vhost_eventfd_del,
        .priority = 10
    };

    hdev->iommu_listener = (MemoryListener) {
        .name = "vhost-iommu",
        .region_add = vhost_iommu_region_add,
        .region_del = vhost_iommu_region_del,
    };

    if (hdev->migration_blocker == NULL) {
        if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) {
            error_setg(&hdev->migration_blocker,
                       "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature.");
        } else if (vhost_dev_log_is_shared(hdev) && !qemu_memfd_alloc_check()) {
            error_setg(&hdev->migration_blocker,
                       "Migration disabled: failed to allocate shared memory");
        }
    }

    if (hdev->migration_blocker != NULL) {
        r = migrate_add_blocker(hdev->migration_blocker, errp);
        if (r < 0) {
            error_free(hdev->migration_blocker);
            goto fail_busyloop;
        }
    }

    hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions));
    hdev->n_mem_sections = 0;
    hdev->mem_sections = NULL;
    hdev->log = NULL;
    hdev->log_size = 0;
    hdev->log_enabled = false;
    hdev->started = false;
    memory_listener_register(&hdev->memory_listener, &address_space_memory);
    QLIST_INSERT_HEAD(&vhost_devices, hdev, entry);

    if (used_memslots > hdev->vhost_ops->vhost_backend_memslots_limit(hdev)) {
        error_setg(errp, "vhost backend memory slots limit is less"
                   " than current number of present memory slots");
        r = -EINVAL;
        goto fail_busyloop;
    }

    return 0;

fail_busyloop:
    if (busyloop_timeout) {
        while (--i >= 0) {
            vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i, 0);
        }
    }
fail:
    hdev->nvqs = n_initialized_vqs;
    vhost_dev_cleanup(hdev);
    return r;
}

void vhost_dev_cleanup(struct vhost_dev *hdev)
{
    int i;

    for (i = 0; i < hdev->nvqs; ++i) {
        vhost_virtqueue_cleanup(hdev->vqs + i);
    }
    if (hdev->mem) {
        /* those are only safe after successful init */
        memory_listener_unregister(&hdev->memory_listener);
        QLIST_REMOVE(hdev, entry);
    }
    if (hdev->migration_blocker) {
        migrate_del_blocker(hdev->migration_blocker);
        error_free(hdev->migration_blocker);
    }
    g_free(hdev->mem);
    g_free(hdev->mem_sections);
    if (hdev->vhost_ops) {
        hdev->vhost_ops->vhost_backend_cleanup(hdev);
    }
    assert(!hdev->log);

    memset(hdev, 0, sizeof(struct vhost_dev));
}

/* Stop processing guest IO notifications in qemu.
 * Start processing them in vhost in the kernel.
 */
int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
{
    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
    int i, r, e;

    /* We will pass the notifiers to the kernel; make sure that QEMU
     * doesn't interfere.
     */
    r = virtio_device_grab_ioeventfd(vdev);
    if (r < 0) {
        error_report("binding does not support host notifiers");
        goto fail;
    }

    for (i = 0; i < hdev->nvqs; ++i) {
        r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
                                         true);
        if (r < 0) {
            error_report("vhost VQ %d notifier binding failed: %d", i, -r);
            goto fail_vq;
        }
    }

    return 0;
fail_vq:
    while (--i >= 0) {
        e = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
                                         false);
        if (e < 0) {
            error_report("vhost VQ %d notifier cleanup error: %d", i, -e);
        }
        assert(e >= 0);
        virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i);
    }
    virtio_device_release_ioeventfd(vdev);
fail:
    return r;
}

/* Stop processing guest IO notifications in vhost.
 * Start processing them in qemu.
 * This might actually run the qemu handlers right away,
 * so virtio in qemu must be completely set up when this is called.
 */
void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
{
    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
    int i, r;

    for (i = 0; i < hdev->nvqs; ++i) {
        r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
                                         false);
        if (r < 0) {
            error_report("vhost VQ %d notifier cleanup failed: %d", i, -r);
        }
        assert(r >= 0);
        virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i);
    }
    virtio_device_release_ioeventfd(vdev);
}

/* Test and clear event pending status.
 * Should be called after unmask to avoid losing events.
 */
bool vhost_virtqueue_pending(struct vhost_dev *hdev, int n)
{
    struct vhost_virtqueue *vq = hdev->vqs + n - hdev->vq_index;
    assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs);
    return event_notifier_test_and_clear(&vq->masked_notifier);
}

/* Mask/unmask events from this vq. */
void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n,
                         bool mask)
{
    struct VirtQueue *vvq = virtio_get_queue(vdev, n);
    int r, index = n - hdev->vq_index;
    struct vhost_vring_file file;

    /* should only be called after backend is connected */
    assert(hdev->vhost_ops);

    if (mask) {
        assert(vdev->use_guest_notifier_mask);
        file.fd = event_notifier_get_fd(&hdev->vqs[index].masked_notifier);
    } else {
        file.fd = event_notifier_get_fd(virtio_queue_get_guest_notifier(vvq));
    }

    file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n);
    r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file);
    if (r < 0) {
        VHOST_OPS_DEBUG("vhost_set_vring_call failed");
    }
}

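/* Mask out of @features every bit listed in @feature_bits that the
 * backend did not advertise, so a frontend only offers what both sides
 * support.  The companion vhost_ack_features() records the subset the
 * guest actually accepted into hdev->acked_features.
 *
 * Illustrative use by a frontend (the feature list name is hypothetical):
 *
 *     host_features = vhost_get_features(&net->dev, my_feature_bits,
 *                                        host_features);
 *     ...
 *     vhost_ack_features(&net->dev, my_feature_bits, guest_features);
 */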
1564uint64_t vhost_get_features(struct vhost_dev *hdev, const int *feature_bits,
1565                            uint64_t features)
1566{
1567    const int *bit = feature_bits;
1568    while (*bit != VHOST_INVALID_FEATURE_BIT) {
1569        uint64_t bit_mask = (1ULL << *bit);
1570        if (!(hdev->features & bit_mask)) {
1571            features &= ~bit_mask;
1572        }
1573        bit++;
1574    }
1575    return features;
1576}
1577
1578void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits,
1579                        uint64_t features)
1580{
1581    const int *bit = feature_bits;
1582    while (*bit != VHOST_INVALID_FEATURE_BIT) {
1583        uint64_t bit_mask = (1ULL << *bit);
1584        if (features & bit_mask) {
1585            hdev->acked_features |= bit_mask;
1586        }
1587        bit++;
1588    }
1589}
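
/*
 * Usage sketch (editor's illustration, not upstream code): frontends keep a
 * table of the feature bits they allow the backend to handle, terminated by
 * VHOST_INVALID_FEATURE_BIT, and pass it to vhost_get_features() when
 * offering features and to vhost_ack_features() once the guest has chosen.
 * The my_feature_bits table and my_device_* wrappers are hypothetical; see
 * the feature bit tables in hw/net/vhost_net.c for real examples.
 *
 *   static const int my_feature_bits[] = {
 *       VIRTIO_F_VERSION_1,
 *       VIRTIO_RING_F_INDIRECT_DESC,
 *       VIRTIO_RING_F_EVENT_IDX,
 *       VHOST_INVALID_FEATURE_BIT
 *   };
 *
 *   static uint64_t my_device_get_features(struct vhost_dev *hdev,
 *                                          uint64_t features)
 *   {
 *       return vhost_get_features(hdev, my_feature_bits, features);
 *   }
 *
 *   static void my_device_ack_features(struct vhost_dev *hdev,
 *                                      uint64_t features)
 *   {
 *       vhost_ack_features(hdev, my_feature_bits, features);
 *   }
 */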
1590
1591int vhost_dev_get_config(struct vhost_dev *hdev, uint8_t *config,
1592                         uint32_t config_len, Error **errp)
1593{
1594    assert(hdev->vhost_ops);
1595
1596    if (hdev->vhost_ops->vhost_get_config) {
1597        return hdev->vhost_ops->vhost_get_config(hdev, config, config_len,
1598                                                 errp);
1599    }
1600
1601    error_setg(errp, "vhost_get_config not implemented");
1602    return -ENOTSUP;
1603}
1604
1605int vhost_dev_set_config(struct vhost_dev *hdev, const uint8_t *data,
1606                         uint32_t offset, uint32_t size, uint32_t flags)
1607{
1608    assert(hdev->vhost_ops);
1609
1610    if (hdev->vhost_ops->vhost_set_config) {
1611        return hdev->vhost_ops->vhost_set_config(hdev, data, offset,
1612                                                 size, flags);
1613    }
1614
1615    return -ENOTSUP;
1616}
1617
1618void vhost_dev_set_config_notifier(struct vhost_dev *hdev,
1619                                   const VhostDevConfigOps *ops)
1620{
1621    hdev->config_ops = ops;
1622}
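
/*
 * Usage sketch (editor's illustration, not upstream code): a device whose
 * backend can change its config space registers a VhostDevConfigOps table;
 * the callback re-reads the config and notifies the guest.  MY_CONFIG_SIZE
 * and the my_device_* names are hypothetical; vhost-user-blk registers its
 * callback the same way.
 *
 *   static int my_device_config_change(struct vhost_dev *dev)
 *   {
 *       VirtIODevice *vdev = dev->vdev;
 *       uint8_t config[MY_CONFIG_SIZE];
 *       Error *local_err = NULL;
 *       int ret;
 *
 *       ret = vhost_dev_get_config(dev, config, sizeof(config), &local_err);
 *       if (ret < 0) {
 *           error_report_err(local_err);
 *           return ret;
 *       }
 *       memcpy(vdev->config, config, sizeof(config));
 *       virtio_notify_config(vdev);
 *       return 0;
 *   }
 *
 *   static const VhostDevConfigOps my_device_config_ops = {
 *       .vhost_dev_config_notifier = my_device_config_change,
 *   };
 *
 * and at realize time:
 *
 *   vhost_dev_set_config_notifier(&s->vhost_dev, &my_device_config_ops);
 */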
1623
1624void vhost_dev_free_inflight(struct vhost_inflight *inflight)
1625{
1626    if (inflight && inflight->addr) {
1627        qemu_memfd_free(inflight->addr, inflight->size, inflight->fd);
1628        inflight->addr = NULL;
1629        inflight->fd = -1;
1630    }
1631}
1632
1633static int vhost_dev_resize_inflight(struct vhost_inflight *inflight,
1634                                     uint64_t new_size)
1635{
1636    Error *err = NULL;
1637    int fd = -1;
1638    void *addr = qemu_memfd_alloc("vhost-inflight", new_size,
1639                                  F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
1640                                  &fd, &err);
1641
1642    if (err) {
1643        error_report_err(err);
1644        return -1;
1645    }
1646
1647    vhost_dev_free_inflight(inflight);
1648    inflight->offset = 0;
1649    inflight->addr = addr;
1650    inflight->fd = fd;
1651    inflight->size = new_size;
1652
1653    return 0;
1654}
1655
1656void vhost_dev_save_inflight(struct vhost_inflight *inflight, QEMUFile *f)
1657{
1658    if (inflight->addr) {
1659        qemu_put_be64(f, inflight->size);
1660        qemu_put_be16(f, inflight->queue_size);
1661        qemu_put_buffer(f, inflight->addr, inflight->size);
1662    } else {
1663        qemu_put_be64(f, 0);
1664    }
1665}
1666
1667int vhost_dev_load_inflight(struct vhost_inflight *inflight, QEMUFile *f)
1668{
1669    uint64_t size;
1670
1671    size = qemu_get_be64(f);
1672    if (!size) {
1673        return 0;
1674    }
1675
1676    if (inflight->size != size) {
1677        if (vhost_dev_resize_inflight(inflight, size)) {
1678            return -1;
1679        }
1680    }
1681    inflight->queue_size = qemu_get_be16(f);
1682
1683    qemu_get_buffer(f, inflight->addr, size);
1684
1685    return 0;
1686}
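
/*
 * Usage sketch (editor's illustration, not upstream code): a device that
 * wants its in-flight descriptor state to survive migration forwards the two
 * helpers above from its own save/load handlers.  MyDeviceState, MY_DEVICE()
 * and the s->inflight member are hypothetical.
 *
 *   static void my_device_save_inflight(VirtIODevice *vdev, QEMUFile *f)
 *   {
 *       MyDeviceState *s = MY_DEVICE(vdev);
 *
 *       vhost_dev_save_inflight(&s->inflight, f);
 *   }
 *
 *   static int my_device_load_inflight(VirtIODevice *vdev, QEMUFile *f)
 *   {
 *       MyDeviceState *s = MY_DEVICE(vdev);
 *
 *       return vhost_dev_load_inflight(&s->inflight, f);
 *   }
 */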
1687
1688int vhost_dev_prepare_inflight(struct vhost_dev *hdev, VirtIODevice *vdev)
1689{
1690    int r;
1691
1692    if (hdev->vhost_ops->vhost_get_inflight_fd == NULL ||
1693        hdev->vhost_ops->vhost_set_inflight_fd == NULL) {
1694        return 0;
1695    }
1696
1697    hdev->vdev = vdev;
1698
1699    r = vhost_dev_set_features(hdev, hdev->log_enabled);
1700    if (r < 0) {
1701        VHOST_OPS_DEBUG("vhost_dev_prepare_inflight failed");
1702        return r;
1703    }
1704
1705    return 0;
1706}
1707
1708int vhost_dev_set_inflight(struct vhost_dev *dev,
1709                           struct vhost_inflight *inflight)
1710{
1711    int r;
1712
1713    if (dev->vhost_ops->vhost_set_inflight_fd && inflight->addr) {
1714        r = dev->vhost_ops->vhost_set_inflight_fd(dev, inflight);
1715        if (r) {
1716            VHOST_OPS_DEBUG("vhost_set_inflight_fd failed");
1717            return -errno;
1718        }
1719    }
1720
1721    return 0;
1722}
1723
1724int vhost_dev_get_inflight(struct vhost_dev *dev, uint16_t queue_size,
1725                           struct vhost_inflight *inflight)
1726{
1727    int r;
1728
1729    if (dev->vhost_ops->vhost_get_inflight_fd) {
1730        r = dev->vhost_ops->vhost_get_inflight_fd(dev, queue_size, inflight);
1731        if (r) {
1732            VHOST_OPS_DEBUG("vhost_get_inflight_fd failed");
1733            return -errno;
1734        }
1735    }
1736
1737    return 0;
1738}
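
/*
 * Usage sketch (editor's illustration, not upstream code): when (re)starting
 * a backend that tracks in-flight requests, the helpers above are typically
 * called in this order, before vhost_dev_start().  MyDeviceState, its
 * inflight/queue_size members and my_device_setup_inflight() are
 * hypothetical; vhost-user-blk follows the same sequence.
 *
 *   static int my_device_setup_inflight(MyDeviceState *s, VirtIODevice *vdev)
 *   {
 *       int ret;
 *
 *       ret = vhost_dev_prepare_inflight(&s->vhost_dev, vdev);
 *       if (ret < 0) {
 *           return ret;
 *       }
 *
 *       if (!s->inflight.addr) {
 *           ret = vhost_dev_get_inflight(&s->vhost_dev, s->queue_size,
 *                                        &s->inflight);
 *           if (ret < 0) {
 *               return ret;
 *           }
 *       }
 *
 *       return vhost_dev_set_inflight(&s->vhost_dev, &s->inflight);
 *   }
 */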
1739
1740/* Host notifiers must be enabled at this point. */
1741int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
1742{
1743    int i, r;
1744
1745    /* should only be called after backend is connected */
1746    assert(hdev->vhost_ops);
1747
1748    hdev->started = true;
1749    hdev->vdev = vdev;
1750
1751    r = vhost_dev_set_features(hdev, hdev->log_enabled);
1752    if (r < 0) {
1753        goto fail_features;
1754    }
1755
1756    if (vhost_dev_has_iommu(hdev)) {
1757        memory_listener_register(&hdev->iommu_listener, vdev->dma_as);
1758    }
1759
1760    r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem);
1761    if (r < 0) {
1762        VHOST_OPS_DEBUG("vhost_set_mem_table failed");
1763        r = -errno;
1764        goto fail_mem;
1765    }
1766    for (i = 0; i < hdev->nvqs; ++i) {
1767        r = vhost_virtqueue_start(hdev,
1768                                  vdev,
1769                                  hdev->vqs + i,
1770                                  hdev->vq_index + i);
1771        if (r < 0) {
1772            goto fail_vq;
1773        }
1774    }
1775
1776    if (hdev->log_enabled) {
1777        uint64_t log_base;
1778
1779        hdev->log_size = vhost_get_log_size(hdev);
1780        hdev->log = vhost_log_get(hdev->log_size,
1781                                  vhost_dev_log_is_shared(hdev));
1782        log_base = (uintptr_t)hdev->log->log;
1783        r = hdev->vhost_ops->vhost_set_log_base(hdev,
1784                                                hdev->log_size ? log_base : 0,
1785                                                hdev->log);
1786        if (r < 0) {
1787            VHOST_OPS_DEBUG("vhost_set_log_base failed");
1788            r = -errno;
1789            goto fail_log;
1790        }
1791    }
1792    if (hdev->vhost_ops->vhost_dev_start) {
1793        r = hdev->vhost_ops->vhost_dev_start(hdev, true);
1794        if (r) {
1795            goto fail_log;
1796        }
1797    }
1798    if (vhost_dev_has_iommu(hdev) &&
1799        hdev->vhost_ops->vhost_set_iotlb_callback) {
1800        hdev->vhost_ops->vhost_set_iotlb_callback(hdev, true);
1801
1802        /* Update used ring information for IOTLB to work correctly;
1803         * the vhost-kernel code requires this. */
1804        for (i = 0; i < hdev->nvqs; ++i) {
1805            struct vhost_virtqueue *vq = hdev->vqs + i;
1806            vhost_device_iotlb_miss(hdev, vq->used_phys, true);
1807        }
1808    }
1809    return 0;
1810fail_log:
1811    vhost_log_put(hdev, false);
1812fail_vq:
1813    while (--i >= 0) {
1814        vhost_virtqueue_stop(hdev,
1815                             vdev,
1816                             hdev->vqs + i,
1817                             hdev->vq_index + i);
1818    }
1819
1820fail_mem:
1821fail_features:
1822
1823    hdev->started = false;
1824    return r;
1825}
1826
1827/* Host notifiers must be enabled at this point. */
1828void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
1829{
1830    int i;
1831
1832    /* should only be called after backend is connected */
1833    assert(hdev->vhost_ops);
1834
1835    if (hdev->vhost_ops->vhost_dev_start) {
1836        hdev->vhost_ops->vhost_dev_start(hdev, false);
1837    }
1838    for (i = 0; i < hdev->nvqs; ++i) {
1839        vhost_virtqueue_stop(hdev,
1840                             vdev,
1841                             hdev->vqs + i,
1842                             hdev->vq_index + i);
1843    }
1844
1845    if (vhost_dev_has_iommu(hdev)) {
1846        if (hdev->vhost_ops->vhost_set_iotlb_callback) {
1847            hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false);
1848        }
1849        memory_listener_unregister(&hdev->iommu_listener);
1850    }
1851    vhost_log_put(hdev, true);
1852    hdev->started = false;
1853    hdev->vdev = NULL;
1854}
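
/*
 * Usage sketch (editor's illustration, not upstream code): the usual
 * start/stop ordering as seen from a frontend.  Host notifiers are handed to
 * vhost before vhost_dev_start() and only returned to qemu after
 * vhost_dev_stop(), matching the "host notifiers must be enabled" comments
 * above; guest notifier setup via the virtio bus is omitted.  MyDeviceState
 * and the my_device_* names are hypothetical.
 *
 *   static int my_device_start(MyDeviceState *s, VirtIODevice *vdev)
 *   {
 *       int ret;
 *
 *       ret = vhost_dev_enable_notifiers(&s->vhost_dev, vdev);
 *       if (ret < 0) {
 *           return ret;
 *       }
 *
 *       ret = vhost_dev_start(&s->vhost_dev, vdev);
 *       if (ret < 0) {
 *           vhost_dev_disable_notifiers(&s->vhost_dev, vdev);
 *           return ret;
 *       }
 *       return 0;
 *   }
 *
 *   static void my_device_stop(MyDeviceState *s, VirtIODevice *vdev)
 *   {
 *       vhost_dev_stop(&s->vhost_dev, vdev);
 *       vhost_dev_disable_notifiers(&s->vhost_dev, vdev);
 *   }
 */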
1855
1856int vhost_net_set_backend(struct vhost_dev *hdev,
1857                          struct vhost_vring_file *file)
1858{
1859    if (hdev->vhost_ops->vhost_net_set_backend) {
1860        return hdev->vhost_ops->vhost_net_set_backend(hdev, file);
1861    }
1862
1863    return -1;
1864}
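
/*
 * Usage sketch (editor's illustration, not upstream code): vhost-net attaches
 * the tap fd to every ring by filling one struct vhost_vring_file per queue.
 * The hdev and backend_fd variables below are placeholders; see
 * hw/net/vhost_net.c for the real loop.
 *
 *   struct vhost_vring_file file = {
 *       .fd = backend_fd,
 *   };
 *   int r;
 *
 *   for (file.index = 0; file.index < hdev->nvqs; ++file.index) {
 *       r = vhost_net_set_backend(hdev, &file);
 *       if (r < 0) {
 *           error_report("vhost_net_set_backend failed: %d", -r);
 *           break;
 *       }
 *   }
 */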
1865