qemu/hw/virtio/vhost.c
   1/*
   2 * vhost support
   3 *
   4 * Copyright Red Hat, Inc. 2010
   5 *
   6 * Authors:
   7 *  Michael S. Tsirkin <mst@redhat.com>
   8 *
   9 * This work is licensed under the terms of the GNU GPL, version 2.  See
  10 * the COPYING file in the top-level directory.
  11 *
  12 * Contributions after 2012-01-13 are licensed under the terms of the
  13 * GNU GPL, version 2 or (at your option) any later version.
  14 */
  15
  16#include "qemu/osdep.h"
  17#include "qapi/error.h"
  18#include "hw/virtio/vhost.h"
  19#include "qemu/atomic.h"
  20#include "qemu/range.h"
  21#include "qemu/error-report.h"
  22#include "qemu/memfd.h"
  23#include "standard-headers/linux/vhost_types.h"
  24#include "hw/virtio/virtio-bus.h"
  25#include "hw/virtio/virtio-access.h"
  26#include "migration/blocker.h"
  27#include "migration/qemu-file-types.h"
  28#include "sysemu/dma.h"
  29#include "sysemu/tcg.h"
  30#include "trace.h"
  31
  32/* enabled until disconnected backend stabilizes */
  33#define _VHOST_DEBUG 1
  34
  35#ifdef _VHOST_DEBUG
  36#define VHOST_OPS_DEBUG(fmt, ...) \
  37    do { error_report(fmt ": %s (%d)", ## __VA_ARGS__, \
  38                      strerror(errno), errno); } while (0)
  39#else
  40#define VHOST_OPS_DEBUG(fmt, ...) \
  41    do { } while (0)
  42#endif
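/*
 * Usage note for VHOST_OPS_DEBUG (illustrative, not part of the original
 * source): with _VHOST_DEBUG defined, a call such as
 *     VHOST_OPS_DEBUG("vhost_set_mem_table failed");
 * expands to an error_report() that appends the current errno and its
 * strerror() text, e.g. "vhost_set_mem_table failed: Invalid argument (22)"
 * on Linux. With _VHOST_DEBUG undefined the macro compiles away to an
 * empty statement.
 */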
  43
  44static struct vhost_log *vhost_log;
  45static struct vhost_log *vhost_log_shm;
  46
  47static unsigned int used_memslots;
  48static QLIST_HEAD(, vhost_dev) vhost_devices =
  49    QLIST_HEAD_INITIALIZER(vhost_devices);
  50
  51bool vhost_has_free_slot(void)
  52{
  53    unsigned int slots_limit = ~0U;
  54    struct vhost_dev *hdev;
  55
  56    QLIST_FOREACH(hdev, &vhost_devices, entry) {
  57        unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev);
  58        slots_limit = MIN(slots_limit, r);
  59    }
  60    return slots_limit > used_memslots;
  61}
  62
  63static void vhost_dev_sync_region(struct vhost_dev *dev,
  64                                  MemoryRegionSection *section,
  65                                  uint64_t mfirst, uint64_t mlast,
  66                                  uint64_t rfirst, uint64_t rlast)
  67{
  68    vhost_log_chunk_t *log = dev->log->log;
  69
  70    uint64_t start = MAX(mfirst, rfirst);
  71    uint64_t end = MIN(mlast, rlast);
  72    vhost_log_chunk_t *from = log + start / VHOST_LOG_CHUNK;
  73    vhost_log_chunk_t *to = log + end / VHOST_LOG_CHUNK + 1;
  74    uint64_t addr = QEMU_ALIGN_DOWN(start, VHOST_LOG_CHUNK);
  75
  76    if (end < start) {
  77        return;
  78    }
  79    assert(end / VHOST_LOG_CHUNK < dev->log_size);
  80    assert(start / VHOST_LOG_CHUNK < dev->log_size);
  81
  82    for (;from < to; ++from) {
  83        vhost_log_chunk_t log;
   84        /* We first check with a non-atomic read: much cheaper,
   85         * and we expect non-dirty to be the common case. */
  86        if (!*from) {
  87            addr += VHOST_LOG_CHUNK;
  88            continue;
  89        }
  90        /* Data must be read atomically. We don't really need barrier semantics
  91         * but it's easier to use atomic_* than roll our own. */
  92        log = qatomic_xchg(from, 0);
  93        while (log) {
  94            int bit = ctzl(log);
  95            hwaddr page_addr;
  96            hwaddr section_offset;
  97            hwaddr mr_offset;
  98            page_addr = addr + bit * VHOST_LOG_PAGE;
  99            section_offset = page_addr - section->offset_within_address_space;
 100            mr_offset = section_offset + section->offset_within_region;
 101            memory_region_set_dirty(section->mr, mr_offset, VHOST_LOG_PAGE);
 102            log &= ~(0x1ull << bit);
 103        }
 104        addr += VHOST_LOG_CHUNK;
 105    }
 106}
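/*
 * Worked example for vhost_dev_sync_region() (illustrative, assuming
 * VHOST_LOG_PAGE is 4 KiB and vhost_log_chunk_t is a 64-bit word, so a
 * chunk covers 256 KiB): a dirty guest page at GPA 0x123000 lives in
 * chunk 0x123000 / 0x40000 = 4 and is flagged by bit
 * (0x123000 % 0x40000) / 0x1000 = 35 of that chunk. The loop above
 * atomically swaps each non-zero chunk with 0 and calls
 * memory_region_set_dirty() once per set bit, translating the page's GPA
 * back into an offset within the section's MemoryRegion.
 */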
 107
 108static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
 109                                   MemoryRegionSection *section,
 110                                   hwaddr first,
 111                                   hwaddr last)
 112{
 113    int i;
 114    hwaddr start_addr;
 115    hwaddr end_addr;
 116
 117    if (!dev->log_enabled || !dev->started) {
 118        return 0;
 119    }
 120    start_addr = section->offset_within_address_space;
 121    end_addr = range_get_last(start_addr, int128_get64(section->size));
 122    start_addr = MAX(first, start_addr);
 123    end_addr = MIN(last, end_addr);
 124
 125    for (i = 0; i < dev->mem->nregions; ++i) {
 126        struct vhost_memory_region *reg = dev->mem->regions + i;
 127        vhost_dev_sync_region(dev, section, start_addr, end_addr,
 128                              reg->guest_phys_addr,
 129                              range_get_last(reg->guest_phys_addr,
 130                                             reg->memory_size));
 131    }
 132    for (i = 0; i < dev->nvqs; ++i) {
 133        struct vhost_virtqueue *vq = dev->vqs + i;
 134
 135        if (!vq->used_phys && !vq->used_size) {
 136            continue;
 137        }
 138
 139        vhost_dev_sync_region(dev, section, start_addr, end_addr, vq->used_phys,
 140                              range_get_last(vq->used_phys, vq->used_size));
 141    }
 142    return 0;
 143}
 144
 145static void vhost_log_sync(MemoryListener *listener,
 146                          MemoryRegionSection *section)
 147{
 148    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 149                                         memory_listener);
 150    vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL);
 151}
 152
 153static void vhost_log_sync_range(struct vhost_dev *dev,
 154                                 hwaddr first, hwaddr last)
 155{
 156    int i;
 157    /* FIXME: this is N^2 in number of sections */
 158    for (i = 0; i < dev->n_mem_sections; ++i) {
 159        MemoryRegionSection *section = &dev->mem_sections[i];
 160        vhost_sync_dirty_bitmap(dev, section, first, last);
 161    }
 162}
 163
 164static uint64_t vhost_get_log_size(struct vhost_dev *dev)
 165{
 166    uint64_t log_size = 0;
 167    int i;
 168    for (i = 0; i < dev->mem->nregions; ++i) {
 169        struct vhost_memory_region *reg = dev->mem->regions + i;
 170        uint64_t last = range_get_last(reg->guest_phys_addr,
 171                                       reg->memory_size);
 172        log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
 173    }
 174    return log_size;
 175}
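/*
 * Illustration for vhost_get_log_size() (hypothetical layout, assuming
 * the usual 256 KiB VHOST_LOG_CHUNK): a single region covering GPAs
 * [0, 4 GiB) has last = 0xffffffff, so log_size becomes
 * 0xffffffff / 0x40000 + 1 = 16384 chunks. The size is in chunks, not
 * bytes; vhost_log_alloc() below multiplies by sizeof(*log->log), giving
 * a 128 KiB log buffer in this example.
 */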
 176
 177static struct vhost_log *vhost_log_alloc(uint64_t size, bool share)
 178{
 179    Error *err = NULL;
 180    struct vhost_log *log;
 181    uint64_t logsize = size * sizeof(*(log->log));
 182    int fd = -1;
 183
 184    log = g_new0(struct vhost_log, 1);
 185    if (share) {
 186        log->log = qemu_memfd_alloc("vhost-log", logsize,
 187                                    F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
 188                                    &fd, &err);
 189        if (err) {
 190            error_report_err(err);
 191            g_free(log);
 192            return NULL;
 193        }
 194        memset(log->log, 0, logsize);
 195    } else {
 196        log->log = g_malloc0(logsize);
 197    }
 198
 199    log->size = size;
 200    log->refcnt = 1;
 201    log->fd = fd;
 202
 203    return log;
 204}
 205
 206static struct vhost_log *vhost_log_get(uint64_t size, bool share)
 207{
 208    struct vhost_log *log = share ? vhost_log_shm : vhost_log;
 209
 210    if (!log || log->size != size) {
 211        log = vhost_log_alloc(size, share);
 212        if (share) {
 213            vhost_log_shm = log;
 214        } else {
 215            vhost_log = log;
 216        }
 217    } else {
 218        ++log->refcnt;
 219    }
 220
 221    return log;
 222}
 223
 224static void vhost_log_put(struct vhost_dev *dev, bool sync)
 225{
 226    struct vhost_log *log = dev->log;
 227
 228    if (!log) {
 229        return;
 230    }
 231
 232    --log->refcnt;
 233    if (log->refcnt == 0) {
 234        /* Sync only the range covered by the old log */
 235        if (dev->log_size && sync) {
 236            vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1);
 237        }
 238
 239        if (vhost_log == log) {
 240            g_free(log->log);
 241            vhost_log = NULL;
 242        } else if (vhost_log_shm == log) {
 243            qemu_memfd_free(log->log, log->size * sizeof(*(log->log)),
 244                            log->fd);
 245            vhost_log_shm = NULL;
 246        }
 247
 248        g_free(log);
 249    }
 250
 251    dev->log = NULL;
 252    dev->log_size = 0;
 253}
 254
 255static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
 256{
 257    return dev->vhost_ops->vhost_requires_shm_log &&
 258           dev->vhost_ops->vhost_requires_shm_log(dev);
 259}
 260
 261static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
 262{
 263    struct vhost_log *log = vhost_log_get(size, vhost_dev_log_is_shared(dev));
 264    uint64_t log_base = (uintptr_t)log->log;
 265    int r;
 266
  267    /* Inform the backend of the log switch; this must be done before
  268       releasing the current log, to ensure no logging is lost. */
 269    r = dev->vhost_ops->vhost_set_log_base(dev, log_base, log);
 270    if (r < 0) {
 271        VHOST_OPS_DEBUG("vhost_set_log_base failed");
 272    }
 273
 274    vhost_log_put(dev, true);
 275    dev->log = log;
 276    dev->log_size = size;
 277}
 278
 279static int vhost_dev_has_iommu(struct vhost_dev *dev)
 280{
 281    VirtIODevice *vdev = dev->vdev;
 282
 283    /*
  284     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend supports the
  285     * incremental memory mapping API via the IOTLB API. For platforms
  286     * that do not have an IOMMU, there is no need to enable this feature,
  287     * which would only cause unnecessary IOTLB miss/update transactions.
 288     */
 289    return vdev->dma_as != &address_space_memory &&
 290           virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
 291}
 292
 293static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,
 294                              hwaddr *plen, bool is_write)
 295{
 296    if (!vhost_dev_has_iommu(dev)) {
 297        return cpu_physical_memory_map(addr, plen, is_write);
 298    } else {
 299        return (void *)(uintptr_t)addr;
 300    }
 301}
 302
 303static void vhost_memory_unmap(struct vhost_dev *dev, void *buffer,
 304                               hwaddr len, int is_write,
 305                               hwaddr access_len)
 306{
 307    if (!vhost_dev_has_iommu(dev)) {
 308        cpu_physical_memory_unmap(buffer, len, is_write, access_len);
 309    }
 310}
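/*
 * Note (added for clarity): when an IOMMU is in use, vhost_memory_map()
 * above hands back the IOVA unchanged instead of mapping guest memory;
 * the backend resolves it later through the IOTLB machinery (see
 * vhost_device_iotlb_miss() further down), so there is nothing for
 * vhost_memory_unmap() to undo in that case.
 */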
 311
 312static int vhost_verify_ring_part_mapping(void *ring_hva,
 313                                          uint64_t ring_gpa,
 314                                          uint64_t ring_size,
 315                                          void *reg_hva,
 316                                          uint64_t reg_gpa,
 317                                          uint64_t reg_size)
 318{
 319    uint64_t hva_ring_offset;
 320    uint64_t ring_last = range_get_last(ring_gpa, ring_size);
 321    uint64_t reg_last = range_get_last(reg_gpa, reg_size);
 322
 323    if (ring_last < reg_gpa || ring_gpa > reg_last) {
 324        return 0;
 325    }
  326    /* check that the whole ring is mapped */
 327    if (ring_last > reg_last) {
 328        return -ENOMEM;
 329    }
  330    /* check that the ring's MemoryRegion wasn't replaced */
 331    hva_ring_offset = ring_gpa - reg_gpa;
 332    if (ring_hva != reg_hva + hva_ring_offset) {
 333        return -EBUSY;
 334    }
 335
 336    return 0;
 337}
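/*
 * Illustration for vhost_verify_ring_part_mapping() (hypothetical
 * numbers): for a used ring at GPA 0x5000 of size 0x1000 and a region
 * covering GPAs [0x0, 0x100000) mapped at HVA base B, the ring overlaps
 * the region and fits inside it, so it is accepted only if its HVA is
 * exactly B + 0x5000; any other HVA means the backing MemoryRegion was
 * replaced and -EBUSY is returned.
 */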
 338
 339static int vhost_verify_ring_mappings(struct vhost_dev *dev,
 340                                      void *reg_hva,
 341                                      uint64_t reg_gpa,
 342                                      uint64_t reg_size)
 343{
 344    int i, j;
 345    int r = 0;
 346    const char *part_name[] = {
 347        "descriptor table",
 348        "available ring",
 349        "used ring"
 350    };
 351
 352    if (vhost_dev_has_iommu(dev)) {
 353        return 0;
 354    }
 355
 356    for (i = 0; i < dev->nvqs; ++i) {
 357        struct vhost_virtqueue *vq = dev->vqs + i;
 358
 359        if (vq->desc_phys == 0) {
 360            continue;
 361        }
 362
 363        j = 0;
 364        r = vhost_verify_ring_part_mapping(
 365                vq->desc, vq->desc_phys, vq->desc_size,
 366                reg_hva, reg_gpa, reg_size);
 367        if (r) {
 368            break;
 369        }
 370
 371        j++;
 372        r = vhost_verify_ring_part_mapping(
 373                vq->avail, vq->avail_phys, vq->avail_size,
 374                reg_hva, reg_gpa, reg_size);
 375        if (r) {
 376            break;
 377        }
 378
 379        j++;
 380        r = vhost_verify_ring_part_mapping(
 381                vq->used, vq->used_phys, vq->used_size,
 382                reg_hva, reg_gpa, reg_size);
 383        if (r) {
 384            break;
 385        }
 386    }
 387
 388    if (r == -ENOMEM) {
 389        error_report("Unable to map %s for ring %d", part_name[j], i);
 390    } else if (r == -EBUSY) {
 391        error_report("%s relocated for ring %d", part_name[j], i);
 392    }
 393    return r;
 394}
 395
 396/*
 397 * vhost_section: identify sections needed for vhost access
 398 *
 399 * We only care about RAM sections here (where virtqueue and guest
 400 * internals accessed by virtio might live). If we find one we still
 401 * allow the backend to potentially filter it out of our list.
 402 */
 403static bool vhost_section(struct vhost_dev *dev, MemoryRegionSection *section)
 404{
 405    MemoryRegion *mr = section->mr;
 406
 407    if (memory_region_is_ram(mr) && !memory_region_is_rom(mr)) {
 408        uint8_t dirty_mask = memory_region_get_dirty_log_mask(mr);
 409        uint8_t handled_dirty;
 410
 411        /*
  412         * Kernel-based vhost doesn't handle any block that is doing
  413         * dirty tracking other than migration, for which it has
  414         * specific logging support. However, for TCG the kernel never
  415         * gets involved anyway, so we can also ignore its
  416         * self-modifying code detection flags. A vhost-user
  417         * client could still confuse a TCG guest if it rewrites
  418         * executable memory that has already been translated.
 419         */
 420        handled_dirty = (1 << DIRTY_MEMORY_MIGRATION) |
 421            (1 << DIRTY_MEMORY_CODE);
 422
 423        if (dirty_mask & ~handled_dirty) {
 424            trace_vhost_reject_section(mr->name, 1);
 425            return false;
 426        }
 427
 428        if (dev->vhost_ops->vhost_backend_mem_section_filter &&
 429            !dev->vhost_ops->vhost_backend_mem_section_filter(dev, section)) {
 430            trace_vhost_reject_section(mr->name, 2);
 431            return false;
 432        }
 433
 434        trace_vhost_section(mr->name);
 435        return true;
 436    } else {
 437        trace_vhost_reject_section(mr->name, 3);
 438        return false;
 439    }
 440}
 441
 442static void vhost_begin(MemoryListener *listener)
 443{
 444    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 445                                         memory_listener);
 446    dev->tmp_sections = NULL;
 447    dev->n_tmp_sections = 0;
 448}
 449
 450static void vhost_commit(MemoryListener *listener)
 451{
 452    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 453                                         memory_listener);
 454    MemoryRegionSection *old_sections;
 455    int n_old_sections;
 456    uint64_t log_size;
 457    size_t regions_size;
 458    int r;
 459    int i;
 460    bool changed = false;
 461
 462    /* Note we can be called before the device is started, but then
 463     * starting the device calls set_mem_table, so we need to have
 464     * built the data structures.
 465     */
 466    old_sections = dev->mem_sections;
 467    n_old_sections = dev->n_mem_sections;
 468    dev->mem_sections = dev->tmp_sections;
 469    dev->n_mem_sections = dev->n_tmp_sections;
 470
 471    if (dev->n_mem_sections != n_old_sections) {
 472        changed = true;
 473    } else {
  474        /* Same size, let's check the contents */
 475        for (int i = 0; i < n_old_sections; i++) {
 476            if (!MemoryRegionSection_eq(&old_sections[i],
 477                                        &dev->mem_sections[i])) {
 478                changed = true;
 479                break;
 480            }
 481        }
 482    }
 483
 484    trace_vhost_commit(dev->started, changed);
 485    if (!changed) {
 486        goto out;
 487    }
 488
 489    /* Rebuild the regions list from the new sections list */
 490    regions_size = offsetof(struct vhost_memory, regions) +
 491                       dev->n_mem_sections * sizeof dev->mem->regions[0];
 492    dev->mem = g_realloc(dev->mem, regions_size);
 493    dev->mem->nregions = dev->n_mem_sections;
 494    used_memslots = dev->mem->nregions;
 495    for (i = 0; i < dev->n_mem_sections; i++) {
 496        struct vhost_memory_region *cur_vmr = dev->mem->regions + i;
 497        struct MemoryRegionSection *mrs = dev->mem_sections + i;
 498
 499        cur_vmr->guest_phys_addr = mrs->offset_within_address_space;
 500        cur_vmr->memory_size     = int128_get64(mrs->size);
 501        cur_vmr->userspace_addr  =
 502            (uintptr_t)memory_region_get_ram_ptr(mrs->mr) +
 503            mrs->offset_within_region;
 504        cur_vmr->flags_padding   = 0;
 505    }
 506
 507    if (!dev->started) {
 508        goto out;
 509    }
 510
 511    for (i = 0; i < dev->mem->nregions; i++) {
 512        if (vhost_verify_ring_mappings(dev,
 513                       (void *)(uintptr_t)dev->mem->regions[i].userspace_addr,
 514                       dev->mem->regions[i].guest_phys_addr,
 515                       dev->mem->regions[i].memory_size)) {
 516            error_report("Verify ring failure on region %d", i);
 517            abort();
 518        }
 519    }
 520
 521    if (!dev->log_enabled) {
 522        r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
 523        if (r < 0) {
 524            VHOST_OPS_DEBUG("vhost_set_mem_table failed");
 525        }
 526        goto out;
 527    }
 528    log_size = vhost_get_log_size(dev);
  529    /* We allocate an extra 4K bytes of log space
  530     * to reduce the number of reallocations. */
 531#define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log)
 532    /* To log more, must increase log size before table update. */
 533    if (dev->log_size < log_size) {
 534        vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER);
 535    }
 536    r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
 537    if (r < 0) {
 538        VHOST_OPS_DEBUG("vhost_set_mem_table failed");
 539    }
 540    /* To log less, can only decrease log size after table update. */
 541    if (dev->log_size > log_size + VHOST_LOG_BUFFER) {
 542        vhost_dev_log_resize(dev, log_size);
 543    }
 544
 545out:
  546    /* Unref the old list of sections; this must happen _after_ the
  547     * vhost_set_mem_table to ensure the client isn't still using a
  548     * section we're about to unref.
 549     */
 550    while (n_old_sections--) {
 551        memory_region_unref(old_sections[n_old_sections].mr);
 552    }
 553    g_free(old_sections);
 554    return;
 555}
 556
  557/* Adds the section data to the tmp_sections list.
  558 * It relies on the listener calling us in memory address order,
  559 * and on being called for every region (via the _add and _nop
  560 * methods), so that neighbouring sections can be joined.
 561 */
 562static void vhost_region_add_section(struct vhost_dev *dev,
 563                                     MemoryRegionSection *section)
 564{
 565    bool need_add = true;
 566    uint64_t mrs_size = int128_get64(section->size);
 567    uint64_t mrs_gpa = section->offset_within_address_space;
 568    uintptr_t mrs_host = (uintptr_t)memory_region_get_ram_ptr(section->mr) +
 569                         section->offset_within_region;
 570    RAMBlock *mrs_rb = section->mr->ram_block;
 571
 572    trace_vhost_region_add_section(section->mr->name, mrs_gpa, mrs_size,
 573                                   mrs_host);
 574
 575    if (dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER) {
  576        /* Round the section to its page size */
 577        /* First align the start down to a page boundary */
 578        size_t mrs_page = qemu_ram_pagesize(mrs_rb);
 579        uint64_t alignage = mrs_host & (mrs_page - 1);
 580        if (alignage) {
 581            mrs_host -= alignage;
 582            mrs_size += alignage;
 583            mrs_gpa  -= alignage;
 584        }
 585        /* Now align the size up to a page boundary */
 586        alignage = mrs_size & (mrs_page - 1);
 587        if (alignage) {
 588            mrs_size += mrs_page - alignage;
 589        }
 590        trace_vhost_region_add_section_aligned(section->mr->name, mrs_gpa,
 591                                               mrs_size, mrs_host);
 592    }
 593
 594    if (dev->n_tmp_sections) {
  595        /* Since we already have at least one section, let's see if
 596         * this extends it; since we're scanning in order, we only
 597         * have to look at the last one, and the FlatView that calls
 598         * us shouldn't have overlaps.
 599         */
 600        MemoryRegionSection *prev_sec = dev->tmp_sections +
 601                                               (dev->n_tmp_sections - 1);
 602        uint64_t prev_gpa_start = prev_sec->offset_within_address_space;
 603        uint64_t prev_size = int128_get64(prev_sec->size);
 604        uint64_t prev_gpa_end   = range_get_last(prev_gpa_start, prev_size);
 605        uint64_t prev_host_start =
 606                        (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr) +
 607                        prev_sec->offset_within_region;
 608        uint64_t prev_host_end   = range_get_last(prev_host_start, prev_size);
 609
 610        if (mrs_gpa <= (prev_gpa_end + 1)) {
 611            /* OK, looks like overlapping/intersecting - it's possible that
 612             * the rounding to page sizes has made them overlap, but they should
 613             * match up in the same RAMBlock if they do.
 614             */
 615            if (mrs_gpa < prev_gpa_start) {
  616                error_report("%s: Section '%s' rounded to %"PRIx64
 617                             " prior to previous '%s' %"PRIx64,
 618                             __func__, section->mr->name, mrs_gpa,
 619                             prev_sec->mr->name, prev_gpa_start);
 620                /* A way to cleanly fail here would be better */
 621                return;
 622            }
 623            /* Offset from the start of the previous GPA to this GPA */
 624            size_t offset = mrs_gpa - prev_gpa_start;
 625
 626            if (prev_host_start + offset == mrs_host &&
 627                section->mr == prev_sec->mr &&
 628                (!dev->vhost_ops->vhost_backend_can_merge ||
 629                 dev->vhost_ops->vhost_backend_can_merge(dev,
 630                    mrs_host, mrs_size,
 631                    prev_host_start, prev_size))) {
 632                uint64_t max_end = MAX(prev_host_end, mrs_host + mrs_size);
 633                need_add = false;
 634                prev_sec->offset_within_address_space =
 635                    MIN(prev_gpa_start, mrs_gpa);
 636                prev_sec->offset_within_region =
 637                    MIN(prev_host_start, mrs_host) -
 638                    (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr);
 639                prev_sec->size = int128_make64(max_end - MIN(prev_host_start,
 640                                               mrs_host));
 641                trace_vhost_region_add_section_merge(section->mr->name,
 642                                        int128_get64(prev_sec->size),
 643                                        prev_sec->offset_within_address_space,
 644                                        prev_sec->offset_within_region);
 645            } else {
 646                /* adjoining regions are fine, but overlapping ones with
 647                 * different blocks/offsets shouldn't happen
 648                 */
 649                if (mrs_gpa != prev_gpa_end + 1) {
 650                    error_report("%s: Overlapping but not coherent sections "
 651                                 "at %"PRIx64,
 652                                 __func__, mrs_gpa);
 653                    return;
 654                }
 655            }
 656        }
 657    }
 658
 659    if (need_add) {
 660        ++dev->n_tmp_sections;
 661        dev->tmp_sections = g_renew(MemoryRegionSection, dev->tmp_sections,
 662                                    dev->n_tmp_sections);
 663        dev->tmp_sections[dev->n_tmp_sections - 1] = *section;
  664        /* The flatview isn't stable and we don't use it; making it NULL
  665         * means we can memcmp the list.
 666         */
 667        dev->tmp_sections[dev->n_tmp_sections - 1].fv = NULL;
 668        memory_region_ref(section->mr);
 669    }
 670}
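/*
 * Worked example for vhost_region_add_section() (hypothetical
 * addresses): if the previous tmp section covers GPAs [0x0, 0xa0000)
 * backed by HVA H, and the next section starts at GPA 0xa0000 backed by
 * H + 0xa0000 in the same MemoryRegion, the new data is folded into the
 * previous entry instead of appending a new one, so contiguous RAM ends
 * up as a single vhost memory region rather than one region per flat
 * section.
 */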
 671
 672/* Used for both add and nop callbacks */
 673static void vhost_region_addnop(MemoryListener *listener,
 674                                MemoryRegionSection *section)
 675{
 676    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 677                                         memory_listener);
 678
 679    if (!vhost_section(dev, section)) {
 680        return;
 681    }
 682    vhost_region_add_section(dev, section);
 683}
 684
 685static void vhost_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
 686{
 687    struct vhost_iommu *iommu = container_of(n, struct vhost_iommu, n);
 688    struct vhost_dev *hdev = iommu->hdev;
 689    hwaddr iova = iotlb->iova + iommu->iommu_offset;
 690
 691    if (vhost_backend_invalidate_device_iotlb(hdev, iova,
 692                                              iotlb->addr_mask + 1)) {
  693        error_report("Failed to invalidate device IOTLB");
 694    }
 695}
 696
 697static void vhost_iommu_region_add(MemoryListener *listener,
 698                                   MemoryRegionSection *section)
 699{
 700    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 701                                         iommu_listener);
 702    struct vhost_iommu *iommu;
 703    Int128 end;
 704    int iommu_idx;
 705    IOMMUMemoryRegion *iommu_mr;
 706    int ret;
 707
 708    if (!memory_region_is_iommu(section->mr)) {
 709        return;
 710    }
 711
 712    iommu_mr = IOMMU_MEMORY_REGION(section->mr);
 713
 714    iommu = g_malloc0(sizeof(*iommu));
 715    end = int128_add(int128_make64(section->offset_within_region),
 716                     section->size);
 717    end = int128_sub(end, int128_one());
 718    iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
 719                                                   MEMTXATTRS_UNSPECIFIED);
 720    iommu_notifier_init(&iommu->n, vhost_iommu_unmap_notify,
 721                        IOMMU_NOTIFIER_DEVIOTLB_UNMAP,
 722                        section->offset_within_region,
 723                        int128_get64(end),
 724                        iommu_idx);
 725    iommu->mr = section->mr;
 726    iommu->iommu_offset = section->offset_within_address_space -
 727                          section->offset_within_region;
 728    iommu->hdev = dev;
 729    ret = memory_region_register_iommu_notifier(section->mr, &iommu->n, NULL);
 730    if (ret) {
 731        /*
  732         * Some vIOMMUs do not support dev-iotlb yet. If so, fall back to
  733         * the legacy UNMAP message.
 734         */
 735        iommu->n.notifier_flags = IOMMU_NOTIFIER_UNMAP;
 736        memory_region_register_iommu_notifier(section->mr, &iommu->n,
 737                                              &error_fatal);
 738    }
 739    QLIST_INSERT_HEAD(&dev->iommu_list, iommu, iommu_next);
 740    /* TODO: can replay help performance here? */
 741}
 742
 743static void vhost_iommu_region_del(MemoryListener *listener,
 744                                   MemoryRegionSection *section)
 745{
 746    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 747                                         iommu_listener);
 748    struct vhost_iommu *iommu;
 749
 750    if (!memory_region_is_iommu(section->mr)) {
 751        return;
 752    }
 753
 754    QLIST_FOREACH(iommu, &dev->iommu_list, iommu_next) {
 755        if (iommu->mr == section->mr &&
 756            iommu->n.start == section->offset_within_region) {
 757            memory_region_unregister_iommu_notifier(iommu->mr,
 758                                                    &iommu->n);
 759            QLIST_REMOVE(iommu, iommu_next);
 760            g_free(iommu);
 761            break;
 762        }
 763    }
 764}
 765
 766static int vhost_virtqueue_set_addr(struct vhost_dev *dev,
 767                                    struct vhost_virtqueue *vq,
 768                                    unsigned idx, bool enable_log)
 769{
 770    struct vhost_vring_addr addr;
 771    int r;
 772    memset(&addr, 0, sizeof(struct vhost_vring_addr));
 773
 774    if (dev->vhost_ops->vhost_vq_get_addr) {
 775        r = dev->vhost_ops->vhost_vq_get_addr(dev, &addr, vq);
 776        if (r < 0) {
 777            VHOST_OPS_DEBUG("vhost_vq_get_addr failed");
 778            return -errno;
 779        }
 780    } else {
 781        addr.desc_user_addr = (uint64_t)(unsigned long)vq->desc;
 782        addr.avail_user_addr = (uint64_t)(unsigned long)vq->avail;
 783        addr.used_user_addr = (uint64_t)(unsigned long)vq->used;
 784    }
 785    addr.index = idx;
 786    addr.log_guest_addr = vq->used_phys;
 787    addr.flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0;
 788    r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);
 789    if (r < 0) {
 790        VHOST_OPS_DEBUG("vhost_set_vring_addr failed");
 791        return -errno;
 792    }
 793    return 0;
 794}
 795
 796static int vhost_dev_set_features(struct vhost_dev *dev,
 797                                  bool enable_log)
 798{
 799    uint64_t features = dev->acked_features;
 800    int r;
 801    if (enable_log) {
 802        features |= 0x1ULL << VHOST_F_LOG_ALL;
 803    }
 804    if (!vhost_dev_has_iommu(dev)) {
 805        features &= ~(0x1ULL << VIRTIO_F_IOMMU_PLATFORM);
 806    }
 807    if (dev->vhost_ops->vhost_force_iommu) {
 808        if (dev->vhost_ops->vhost_force_iommu(dev) == true) {
 809            features |= 0x1ULL << VIRTIO_F_IOMMU_PLATFORM;
  810        }
 811    }
 812    r = dev->vhost_ops->vhost_set_features(dev, features);
 813    if (r < 0) {
 814        VHOST_OPS_DEBUG("vhost_set_features failed");
 815        goto out;
 816    }
 817    if (dev->vhost_ops->vhost_set_backend_cap) {
 818        r = dev->vhost_ops->vhost_set_backend_cap(dev);
 819        if (r < 0) {
 820            VHOST_OPS_DEBUG("vhost_set_backend_cap failed");
 821            goto out;
 822        }
 823    }
 824
 825out:
 826    return r < 0 ? -errno : 0;
 827}
 828
 829static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
 830{
 831    int r, i, idx;
 832    hwaddr addr;
 833
 834    r = vhost_dev_set_features(dev, enable_log);
 835    if (r < 0) {
 836        goto err_features;
 837    }
 838    for (i = 0; i < dev->nvqs; ++i) {
 839        idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
 840        addr = virtio_queue_get_desc_addr(dev->vdev, idx);
 841        if (!addr) {
 842            /*
  843             * The queue might not be ready to start. If this
  844             * is the case there is no reason to continue the process.
  845             * Similar logic is used by the vhost_virtqueue_start()
  846             * routine.
 847             */
 848            continue;
 849        }
 850        r = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
 851                                     enable_log);
 852        if (r < 0) {
 853            goto err_vq;
 854        }
 855    }
 856    return 0;
 857err_vq:
 858    for (; i >= 0; --i) {
 859        idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
 860        vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
 861                                 dev->log_enabled);
 862    }
 863    vhost_dev_set_features(dev, dev->log_enabled);
 864err_features:
 865    return r;
 866}
 867
 868static int vhost_migration_log(MemoryListener *listener, bool enable)
 869{
 870    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 871                                         memory_listener);
 872    int r;
 873    if (enable == dev->log_enabled) {
 874        return 0;
 875    }
 876    if (!dev->started) {
 877        dev->log_enabled = enable;
 878        return 0;
 879    }
 880
 881    r = 0;
 882    if (!enable) {
 883        r = vhost_dev_set_log(dev, false);
 884        if (r < 0) {
 885            goto check_dev_state;
 886        }
 887        vhost_log_put(dev, false);
 888    } else {
 889        vhost_dev_log_resize(dev, vhost_get_log_size(dev));
 890        r = vhost_dev_set_log(dev, true);
 891        if (r < 0) {
 892            goto check_dev_state;
 893        }
 894    }
 895
 896check_dev_state:
 897    dev->log_enabled = enable;
 898    /*
  899     * vhost-user-* devices could change their state during log
  900     * initialization due to a disconnect, so check the device state
  901     * after the vhost communication.
 902     */
 903    if (!dev->started) {
 904        /*
  905         * Since the device is in the stopped state, it is okay for
  906         * migration. Return success.
 907         */
 908        r = 0;
 909    }
 910    if (r) {
 911        /* An error occurred. */
 912        dev->log_enabled = false;
 913    }
 914
 915    return r;
 916}
 917
 918static void vhost_log_global_start(MemoryListener *listener)
 919{
 920    int r;
 921
 922    r = vhost_migration_log(listener, true);
 923    if (r < 0) {
 924        abort();
 925    }
 926}
 927
 928static void vhost_log_global_stop(MemoryListener *listener)
 929{
 930    int r;
 931
 932    r = vhost_migration_log(listener, false);
 933    if (r < 0) {
 934        abort();
 935    }
 936}
 937
 938static void vhost_log_start(MemoryListener *listener,
 939                            MemoryRegionSection *section,
 940                            int old, int new)
 941{
 942    /* FIXME: implement */
 943}
 944
 945static void vhost_log_stop(MemoryListener *listener,
 946                           MemoryRegionSection *section,
 947                           int old, int new)
 948{
 949    /* FIXME: implement */
 950}
 951
 952/* The vhost driver natively knows how to handle the vrings of non
 953 * cross-endian legacy devices and modern devices. Only legacy devices
 954 * exposed to a bi-endian guest may require the vhost driver to use a
 955 * specific endianness.
 956 */
 957static inline bool vhost_needs_vring_endian(VirtIODevice *vdev)
 958{
 959    if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
 960        return false;
 961    }
 962#ifdef HOST_WORDS_BIGENDIAN
 963    return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_LITTLE;
 964#else
 965    return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_BIG;
 966#endif
 967}
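/*
 * Example (added for clarity): a legacy (pre-VIRTIO 1.0) device offered
 * to a big-endian guest on a little-endian host has
 * device_endian == VIRTIO_DEVICE_ENDIAN_BIG, so this returns true and
 * the ring endianness has to be communicated to the backend via
 * vhost_virtqueue_set_vring_endian_legacy() below; VIRTIO 1.0 devices
 * use little-endian vrings everywhere and never need this.
 */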
 968
 969static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev,
 970                                                   bool is_big_endian,
 971                                                   int vhost_vq_index)
 972{
 973    struct vhost_vring_state s = {
 974        .index = vhost_vq_index,
 975        .num = is_big_endian
 976    };
 977
 978    if (!dev->vhost_ops->vhost_set_vring_endian(dev, &s)) {
 979        return 0;
 980    }
 981
 982    VHOST_OPS_DEBUG("vhost_set_vring_endian failed");
 983    if (errno == ENOTTY) {
 984        error_report("vhost does not support cross-endian");
 985        return -ENOSYS;
 986    }
 987
 988    return -errno;
 989}
 990
 991static int vhost_memory_region_lookup(struct vhost_dev *hdev,
 992                                      uint64_t gpa, uint64_t *uaddr,
 993                                      uint64_t *len)
 994{
 995    int i;
 996
 997    for (i = 0; i < hdev->mem->nregions; i++) {
 998        struct vhost_memory_region *reg = hdev->mem->regions + i;
 999
1000        if (gpa >= reg->guest_phys_addr &&
1001            reg->guest_phys_addr + reg->memory_size > gpa) {
1002            *uaddr = reg->userspace_addr + gpa - reg->guest_phys_addr;
1003            *len = reg->guest_phys_addr + reg->memory_size - gpa;
1004            return 0;
1005        }
1006    }
1007
1008    return -EFAULT;
1009}
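/*
 * Illustration for vhost_memory_region_lookup() (hypothetical region):
 * with guest_phys_addr = 0x40000000, memory_size = 0x20000000 and
 * userspace_addr = U, looking up gpa 0x48000000 yields
 * *uaddr = U + 0x8000000 and *len = 0x18000000, the distance from the
 * gpa to the end of the region; a gpa not covered by any region returns
 * -EFAULT.
 */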
1010
1011int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write)
1012{
1013    IOMMUTLBEntry iotlb;
1014    uint64_t uaddr, len;
1015    int ret = -EFAULT;
1016
1017    RCU_READ_LOCK_GUARD();
1018
1019    trace_vhost_iotlb_miss(dev, 1);
1020
1021    iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
1022                                          iova, write,
1023                                          MEMTXATTRS_UNSPECIFIED);
1024    if (iotlb.target_as != NULL) {
1025        ret = vhost_memory_region_lookup(dev, iotlb.translated_addr,
1026                                         &uaddr, &len);
1027        if (ret) {
1028            trace_vhost_iotlb_miss(dev, 3);
 1029            error_report("Failed to look up the translated address "
1030                         "%"PRIx64, iotlb.translated_addr);
1031            goto out;
1032        }
1033
1034        len = MIN(iotlb.addr_mask + 1, len);
1035        iova = iova & ~iotlb.addr_mask;
1036
1037        ret = vhost_backend_update_device_iotlb(dev, iova, uaddr,
1038                                                len, iotlb.perm);
1039        if (ret) {
1040            trace_vhost_iotlb_miss(dev, 4);
 1041            error_report("Failed to update device IOTLB");
1042            goto out;
1043        }
1044    }
1045
1046    trace_vhost_iotlb_miss(dev, 2);
1047
1048out:
1049    return ret;
1050}
1051
1052static int vhost_virtqueue_start(struct vhost_dev *dev,
1053                                struct VirtIODevice *vdev,
1054                                struct vhost_virtqueue *vq,
1055                                unsigned idx)
1056{
1057    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
1058    VirtioBusState *vbus = VIRTIO_BUS(qbus);
1059    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
1060    hwaddr s, l, a;
1061    int r;
1062    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
1063    struct vhost_vring_file file = {
1064        .index = vhost_vq_index
1065    };
1066    struct vhost_vring_state state = {
1067        .index = vhost_vq_index
1068    };
1069    struct VirtQueue *vvq = virtio_get_queue(vdev, idx);
1070
1071    a = virtio_queue_get_desc_addr(vdev, idx);
1072    if (a == 0) {
1073        /* Queue might not be ready for start */
1074        return 0;
1075    }
1076
1077    vq->num = state.num = virtio_queue_get_num(vdev, idx);
1078    r = dev->vhost_ops->vhost_set_vring_num(dev, &state);
1079    if (r) {
1080        VHOST_OPS_DEBUG("vhost_set_vring_num failed");
1081        return -errno;
1082    }
1083
1084    state.num = virtio_queue_get_last_avail_idx(vdev, idx);
1085    r = dev->vhost_ops->vhost_set_vring_base(dev, &state);
1086    if (r) {
1087        VHOST_OPS_DEBUG("vhost_set_vring_base failed");
1088        return -errno;
1089    }
1090
1091    if (vhost_needs_vring_endian(vdev)) {
1092        r = vhost_virtqueue_set_vring_endian_legacy(dev,
1093                                                    virtio_is_big_endian(vdev),
1094                                                    vhost_vq_index);
1095        if (r) {
1096            return -errno;
1097        }
1098    }
1099
1100    vq->desc_size = s = l = virtio_queue_get_desc_size(vdev, idx);
1101    vq->desc_phys = a;
1102    vq->desc = vhost_memory_map(dev, a, &l, false);
1103    if (!vq->desc || l != s) {
1104        r = -ENOMEM;
1105        goto fail_alloc_desc;
1106    }
1107    vq->avail_size = s = l = virtio_queue_get_avail_size(vdev, idx);
1108    vq->avail_phys = a = virtio_queue_get_avail_addr(vdev, idx);
1109    vq->avail = vhost_memory_map(dev, a, &l, false);
1110    if (!vq->avail || l != s) {
1111        r = -ENOMEM;
1112        goto fail_alloc_avail;
1113    }
1114    vq->used_size = s = l = virtio_queue_get_used_size(vdev, idx);
1115    vq->used_phys = a = virtio_queue_get_used_addr(vdev, idx);
1116    vq->used = vhost_memory_map(dev, a, &l, true);
1117    if (!vq->used || l != s) {
1118        r = -ENOMEM;
1119        goto fail_alloc_used;
1120    }
1121
1122    r = vhost_virtqueue_set_addr(dev, vq, vhost_vq_index, dev->log_enabled);
1123    if (r < 0) {
1124        r = -errno;
1125        goto fail_alloc;
1126    }
1127
1128    file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq));
1129    r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
1130    if (r) {
1131        VHOST_OPS_DEBUG("vhost_set_vring_kick failed");
1132        r = -errno;
1133        goto fail_kick;
1134    }
1135
1136    /* Clear and discard previous events if any. */
1137    event_notifier_test_and_clear(&vq->masked_notifier);
1138
1139    /* Init vring in unmasked state, unless guest_notifier_mask
1140     * will do it later.
1141     */
1142    if (!vdev->use_guest_notifier_mask) {
1143        /* TODO: check and handle errors. */
1144        vhost_virtqueue_mask(dev, vdev, idx, false);
1145    }
1146
1147    if (k->query_guest_notifiers &&
1148        k->query_guest_notifiers(qbus->parent) &&
1149        virtio_queue_vector(vdev, idx) == VIRTIO_NO_VECTOR) {
1150        file.fd = -1;
1151        r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
1152        if (r) {
1153            goto fail_vector;
1154        }
1155    }
1156
1157    return 0;
1158
1159fail_vector:
1160fail_kick:
1161fail_alloc:
1162    vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
1163                       0, 0);
1164fail_alloc_used:
1165    vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
1166                       0, 0);
1167fail_alloc_avail:
1168    vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
1169                       0, 0);
1170fail_alloc_desc:
1171    return r;
1172}
1173
1174static void vhost_virtqueue_stop(struct vhost_dev *dev,
1175                                    struct VirtIODevice *vdev,
1176                                    struct vhost_virtqueue *vq,
1177                                    unsigned idx)
1178{
1179    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
1180    struct vhost_vring_state state = {
1181        .index = vhost_vq_index,
1182    };
1183    int r;
1184
1185    if (virtio_queue_get_desc_addr(vdev, idx) == 0) {
 1186        /* Don't stop a virtqueue that might not have been started */
1187        return;
1188    }
1189
1190    r = dev->vhost_ops->vhost_get_vring_base(dev, &state);
1191    if (r < 0) {
1192        VHOST_OPS_DEBUG("vhost VQ %u ring restore failed: %d", idx, r);
 1193        /* The connection to the backend is broken, so let's sync the
 1194         * internal last avail idx to the device's used idx.
1195         */
1196        virtio_queue_restore_last_avail_idx(vdev, idx);
1197    } else {
1198        virtio_queue_set_last_avail_idx(vdev, idx, state.num);
1199    }
1200    virtio_queue_invalidate_signalled_used(vdev, idx);
1201    virtio_queue_update_used_idx(vdev, idx);
1202
 1203    /* In the cross-endian case, we need to reset the vring endianness
 1204     * back to native, as legacy devices expect it by default.
1205     */
1206    if (vhost_needs_vring_endian(vdev)) {
1207        vhost_virtqueue_set_vring_endian_legacy(dev,
1208                                                !virtio_is_big_endian(vdev),
1209                                                vhost_vq_index);
1210    }
1211
1212    vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
1213                       1, virtio_queue_get_used_size(vdev, idx));
1214    vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
1215                       0, virtio_queue_get_avail_size(vdev, idx));
1216    vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
1217                       0, virtio_queue_get_desc_size(vdev, idx));
1218}
1219
1220static void vhost_eventfd_add(MemoryListener *listener,
1221                              MemoryRegionSection *section,
1222                              bool match_data, uint64_t data, EventNotifier *e)
1223{
1224}
1225
1226static void vhost_eventfd_del(MemoryListener *listener,
1227                              MemoryRegionSection *section,
1228                              bool match_data, uint64_t data, EventNotifier *e)
1229{
1230}
1231
1232static int vhost_virtqueue_set_busyloop_timeout(struct vhost_dev *dev,
1233                                                int n, uint32_t timeout)
1234{
1235    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
1236    struct vhost_vring_state state = {
1237        .index = vhost_vq_index,
1238        .num = timeout,
1239    };
1240    int r;
1241
1242    if (!dev->vhost_ops->vhost_set_vring_busyloop_timeout) {
1243        return -EINVAL;
1244    }
1245
1246    r = dev->vhost_ops->vhost_set_vring_busyloop_timeout(dev, &state);
1247    if (r) {
1248        VHOST_OPS_DEBUG("vhost_set_vring_busyloop_timeout failed");
1249        return r;
1250    }
1251
1252    return 0;
1253}
1254
1255static int vhost_virtqueue_init(struct vhost_dev *dev,
1256                                struct vhost_virtqueue *vq, int n)
1257{
1258    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
1259    struct vhost_vring_file file = {
1260        .index = vhost_vq_index,
1261    };
1262    int r = event_notifier_init(&vq->masked_notifier, 0);
1263    if (r < 0) {
1264        return r;
1265    }
1266
1267    file.fd = event_notifier_get_fd(&vq->masked_notifier);
1268    r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
1269    if (r) {
1270        VHOST_OPS_DEBUG("vhost_set_vring_call failed");
1271        r = -errno;
1272        goto fail_call;
1273    }
1274
1275    vq->dev = dev;
1276
1277    return 0;
1278fail_call:
1279    event_notifier_cleanup(&vq->masked_notifier);
1280    return r;
1281}
1282
1283static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq)
1284{
1285    event_notifier_cleanup(&vq->masked_notifier);
1286}
1287
1288int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
1289                   VhostBackendType backend_type, uint32_t busyloop_timeout,
1290                   Error **errp)
1291{
1292    ERRP_GUARD();
1293    uint64_t features;
1294    int i, r, n_initialized_vqs = 0;
1295
1296    hdev->vdev = NULL;
1297    hdev->migration_blocker = NULL;
1298
1299    r = vhost_set_backend_type(hdev, backend_type);
1300    assert(r >= 0);
1301
1302    r = hdev->vhost_ops->vhost_backend_init(hdev, opaque, errp);
1303    if (r < 0) {
1304        if (!*errp) {
1305            error_setg_errno(errp, -r, "vhost_backend_init failed");
1306        }
1307        goto fail;
1308    }
1309
1310    r = hdev->vhost_ops->vhost_set_owner(hdev);
1311    if (r < 0) {
1312        error_setg_errno(errp, -r, "vhost_set_owner failed");
1313        goto fail;
1314    }
1315
1316    r = hdev->vhost_ops->vhost_get_features(hdev, &features);
1317    if (r < 0) {
1318        error_setg_errno(errp, -r, "vhost_get_features failed");
1319        goto fail;
1320    }
1321
1322    for (i = 0; i < hdev->nvqs; ++i, ++n_initialized_vqs) {
1323        r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i);
1324        if (r < 0) {
1325            error_setg_errno(errp, -r, "Failed to initialize virtqueue %d", i);
1326            goto fail;
1327        }
1328    }
1329
1330    if (busyloop_timeout) {
1331        for (i = 0; i < hdev->nvqs; ++i) {
1332            r = vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i,
1333                                                     busyloop_timeout);
1334            if (r < 0) {
1335                error_setg_errno(errp, -r, "Failed to set busyloop timeout");
1336                goto fail_busyloop;
1337            }
1338        }
1339    }
1340
1341    hdev->features = features;
1342
1343    hdev->memory_listener = (MemoryListener) {
1344        .begin = vhost_begin,
1345        .commit = vhost_commit,
1346        .region_add = vhost_region_addnop,
1347        .region_nop = vhost_region_addnop,
1348        .log_start = vhost_log_start,
1349        .log_stop = vhost_log_stop,
1350        .log_sync = vhost_log_sync,
1351        .log_global_start = vhost_log_global_start,
1352        .log_global_stop = vhost_log_global_stop,
1353        .eventfd_add = vhost_eventfd_add,
1354        .eventfd_del = vhost_eventfd_del,
1355        .priority = 10
1356    };
1357
1358    hdev->iommu_listener = (MemoryListener) {
1359        .region_add = vhost_iommu_region_add,
1360        .region_del = vhost_iommu_region_del,
1361    };
1362
1363    if (hdev->migration_blocker == NULL) {
1364        if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) {
1365            error_setg(&hdev->migration_blocker,
1366                       "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature.");
1367        } else if (vhost_dev_log_is_shared(hdev) && !qemu_memfd_alloc_check()) {
1368            error_setg(&hdev->migration_blocker,
1369                       "Migration disabled: failed to allocate shared memory");
1370        }
1371    }
1372
1373    if (hdev->migration_blocker != NULL) {
1374        r = migrate_add_blocker(hdev->migration_blocker, errp);
1375        if (*errp) {
1376            error_free(hdev->migration_blocker);
1377            goto fail_busyloop;
1378        }
1379    }
1380
1381    hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions));
1382    hdev->n_mem_sections = 0;
1383    hdev->mem_sections = NULL;
1384    hdev->log = NULL;
1385    hdev->log_size = 0;
1386    hdev->log_enabled = false;
1387    hdev->started = false;
1388    memory_listener_register(&hdev->memory_listener, &address_space_memory);
1389    QLIST_INSERT_HEAD(&vhost_devices, hdev, entry);
1390
1391    if (used_memslots > hdev->vhost_ops->vhost_backend_memslots_limit(hdev)) {
1392        error_setg(errp, "vhost backend memory slots limit is less"
1393                   " than current number of present memory slots");
1394        r = -EINVAL;
1395        goto fail_busyloop;
1396    }
1397
1398    return 0;
1399
1400fail_busyloop:
1401    if (busyloop_timeout) {
1402        while (--i >= 0) {
1403            vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i, 0);
1404        }
1405    }
1406fail:
1407    hdev->nvqs = n_initialized_vqs;
1408    vhost_dev_cleanup(hdev);
1409    return r;
1410}
1411
1412void vhost_dev_cleanup(struct vhost_dev *hdev)
1413{
1414    int i;
1415
1416    for (i = 0; i < hdev->nvqs; ++i) {
1417        vhost_virtqueue_cleanup(hdev->vqs + i);
1418    }
1419    if (hdev->mem) {
1420        /* those are only safe after successful init */
1421        memory_listener_unregister(&hdev->memory_listener);
1422        QLIST_REMOVE(hdev, entry);
1423    }
1424    if (hdev->migration_blocker) {
1425        migrate_del_blocker(hdev->migration_blocker);
1426        error_free(hdev->migration_blocker);
1427    }
1428    g_free(hdev->mem);
1429    g_free(hdev->mem_sections);
1430    if (hdev->vhost_ops) {
1431        hdev->vhost_ops->vhost_backend_cleanup(hdev);
1432    }
1433    assert(!hdev->log);
1434
1435    memset(hdev, 0, sizeof(struct vhost_dev));
1436}
1437
 1438/* Stop processing guest IO notifications in qemu.
 1439 * Start processing them in the vhost backend instead.
1440 */
1441int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
1442{
1443    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
1444    int i, r, e;
1445
 1446    /* We will pass the notifiers to the backend; make sure that QEMU
 1447     * doesn't interfere.
1448     */
1449    r = virtio_device_grab_ioeventfd(vdev);
1450    if (r < 0) {
1451        error_report("binding does not support host notifiers");
1452        goto fail;
1453    }
1454
1455    for (i = 0; i < hdev->nvqs; ++i) {
1456        r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
1457                                         true);
1458        if (r < 0) {
1459            error_report("vhost VQ %d notifier binding failed: %d", i, -r);
1460            goto fail_vq;
1461        }
1462    }
1463
1464    return 0;
1465fail_vq:
1466    while (--i >= 0) {
1467        e = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
1468                                         false);
1469        if (e < 0) {
 1470            error_report("vhost VQ %d notifier cleanup error: %d", i, -e);
1471        }
1472        assert (e >= 0);
1473        virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i);
1474    }
1475    virtio_device_release_ioeventfd(vdev);
1476fail:
1477    return r;
1478}
1479
1480/* Stop processing guest IO notifications in vhost.
1481 * Start processing them in qemu.
1482 * This might actually run the qemu handlers right away,
 1483 * so virtio in qemu must be completely set up when this is called.
1484 */
1485void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
1486{
1487    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
1488    int i, r;
1489
1490    for (i = 0; i < hdev->nvqs; ++i) {
1491        r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
1492                                         false);
1493        if (r < 0) {
1494            error_report("vhost VQ %d notifier cleanup failed: %d", i, -r);
1495        }
1496        assert (r >= 0);
1497        virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i);
1498    }
1499    virtio_device_release_ioeventfd(vdev);
1500}
1501
1502/* Test and clear event pending status.
1503 * Should be called after unmask to avoid losing events.
1504 */
1505bool vhost_virtqueue_pending(struct vhost_dev *hdev, int n)
1506{
1507    struct vhost_virtqueue *vq = hdev->vqs + n - hdev->vq_index;
1508    assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs);
1509    return event_notifier_test_and_clear(&vq->masked_notifier);
1510}
1511
1512/* Mask/unmask events from this vq. */
1513void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n,
1514                         bool mask)
1515{
1516    struct VirtQueue *vvq = virtio_get_queue(vdev, n);
1517    int r, index = n - hdev->vq_index;
1518    struct vhost_vring_file file;
1519
1520    /* should only be called after backend is connected */
1521    assert(hdev->vhost_ops);
1522
1523    if (mask) {
1524        assert(vdev->use_guest_notifier_mask);
1525        file.fd = event_notifier_get_fd(&hdev->vqs[index].masked_notifier);
1526    } else {
1527        file.fd = event_notifier_get_fd(virtio_queue_get_guest_notifier(vvq));
1528    }
1529
1530    file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n);
1531    r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file);
1532    if (r < 0) {
1533        VHOST_OPS_DEBUG("vhost_set_vring_call failed");
1534    }
1535}
1536
1537uint64_t vhost_get_features(struct vhost_dev *hdev, const int *feature_bits,
1538                            uint64_t features)
1539{
1540    const int *bit = feature_bits;
1541    while (*bit != VHOST_INVALID_FEATURE_BIT) {
1542        uint64_t bit_mask = (1ULL << *bit);
1543        if (!(hdev->features & bit_mask)) {
1544            features &= ~bit_mask;
1545        }
1546        bit++;
1547    }
1548    return features;
1549}
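/*
 * Example for vhost_get_features() (hypothetical array, added for
 * clarity): given a terminated list such as
 *     static const int example_feature_bits[] = {
 *         VIRTIO_RING_F_INDIRECT_DESC,
 *         VIRTIO_RING_F_EVENT_IDX,
 *         VHOST_INVALID_FEATURE_BIT
 *     };
 * each listed bit that the backend did not advertise in hdev->features
 * is cleared from the passed-in feature set, so the guest is never
 * offered a feature the vhost backend cannot handle.
 */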
1550
1551void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits,
1552                        uint64_t features)
1553{
1554    const int *bit = feature_bits;
1555    while (*bit != VHOST_INVALID_FEATURE_BIT) {
1556        uint64_t bit_mask = (1ULL << *bit);
1557        if (features & bit_mask) {
1558            hdev->acked_features |= bit_mask;
1559        }
1560        bit++;
1561    }
1562}
1563
1564int vhost_dev_get_config(struct vhost_dev *hdev, uint8_t *config,
1565                         uint32_t config_len, Error **errp)
1566{
1567    ERRP_GUARD();
1568    int ret;
1569
1570    assert(hdev->vhost_ops);
1571
1572    if (hdev->vhost_ops->vhost_get_config) {
1573        ret = hdev->vhost_ops->vhost_get_config(hdev, config, config_len, errp);
1574        if (ret < 0 && !*errp) {
1575            error_setg_errno(errp, -ret, "vhost_get_config failed");
1576        }
1577        return ret;
1578    }
1579
1580    error_setg(errp, "vhost_get_config not implemented");
1581    return -ENOTSUP;
1582}
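/*
 * Illustrative sketch (not part of the original file): a device wrapper
 * usually forwards its VirtioDeviceClass ->get_config() to the backend
 * via vhost_dev_get_config().  "MyVhostDev", MY_VHOST_DEV() and
 * struct my_config are hypothetical.
 *
 *     static void my_get_config(VirtIODevice *vdev, uint8_t *config)
 *     {
 *         MyVhostDev *s = MY_VHOST_DEV(vdev);
 *         Error *local_err = NULL;
 *
 *         if (vhost_dev_get_config(&s->dev, config,
 *                                  sizeof(struct my_config),
 *                                  &local_err) < 0) {
 *             error_report_err(local_err);
 *         }
 *     }
 */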
1583
1584int vhost_dev_set_config(struct vhost_dev *hdev, const uint8_t *data,
1585                         uint32_t offset, uint32_t size, uint32_t flags)
1586{
1587    assert(hdev->vhost_ops);
1588
1589    if (hdev->vhost_ops->vhost_set_config) {
1590        return hdev->vhost_ops->vhost_set_config(hdev, data, offset,
1591                                                 size, flags);
1592    }
1593
1594    return -ENOTSUP;
1595}
1596
1597void vhost_dev_set_config_notifier(struct vhost_dev *hdev,
1598                                   const VhostDevConfigOps *ops)
1599{
1600    hdev->config_ops = ops;
1601}
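/*
 * Illustrative sketch (not part of the original file): VhostDevConfigOps
 * carries a single callback that is invoked when the backend reports a
 * config space change.  A device might register it right after
 * vhost_dev_init(); the callback body here is hypothetical and assumes
 * the device is started (dev->vdev is set).
 *
 *     static int my_config_change(struct vhost_dev *dev)
 *     {
 *         virtio_notify_config(dev->vdev);
 *         return 0;
 *     }
 *
 *     static const VhostDevConfigOps my_config_ops = {
 *         .vhost_dev_config_notifier = my_config_change,
 *     };
 *
 *     vhost_dev_set_config_notifier(&s->dev, &my_config_ops);
 */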
1602
1603void vhost_dev_free_inflight(struct vhost_inflight *inflight)
1604{
1605    if (inflight && inflight->addr) {
1606        qemu_memfd_free(inflight->addr, inflight->size, inflight->fd);
1607        inflight->addr = NULL;
1608        inflight->fd = -1;
1609    }
1610}
1611
1612static int vhost_dev_resize_inflight(struct vhost_inflight *inflight,
1613                                     uint64_t new_size)
1614{
1615    Error *err = NULL;
1616    int fd = -1;
1617    void *addr = qemu_memfd_alloc("vhost-inflight", new_size,
1618                                  F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
1619                                  &fd, &err);
1620
1621    if (err) {
1622        error_report_err(err);
1623        return -1;
1624    }
1625
1626    vhost_dev_free_inflight(inflight);
1627    inflight->offset = 0;
1628    inflight->addr = addr;
1629    inflight->fd = fd;
1630    inflight->size = new_size;
1631
1632    return 0;
1633}
1634
1635void vhost_dev_save_inflight(struct vhost_inflight *inflight, QEMUFile *f)
1636{
1637    if (inflight->addr) {
1638        qemu_put_be64(f, inflight->size);
1639        qemu_put_be16(f, inflight->queue_size);
1640        qemu_put_buffer(f, inflight->addr, inflight->size);
1641    } else {
1642        qemu_put_be64(f, 0);
1643    }
1644}
1645
1646int vhost_dev_load_inflight(struct vhost_inflight *inflight, QEMUFile *f)
1647{
1648    uint64_t size;
1649
1650    size = qemu_get_be64(f);
1651    if (!size) {
1652        return 0;
1653    }
1654
1655    if (inflight->size != size) {
1656        if (vhost_dev_resize_inflight(inflight, size)) {
1657            return -1;
1658        }
1659    }
1660    inflight->queue_size = qemu_get_be16(f);
1661
1662    qemu_get_buffer(f, inflight->addr, size);
1663
1664    return 0;
1665}
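/*
 * Illustrative note (not part of the original file): the two helpers
 * above define a symmetric wire format -- a be64 size (0 meaning "no
 * inflight region"), a be16 queue size, then the raw buffer -- so a
 * device's migration save path calls
 * vhost_dev_save_inflight(inflight, f) and its load path calls
 * vhost_dev_load_inflight(inflight, f) against the same QEMUFile stream.
 */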
1666
1667int vhost_dev_prepare_inflight(struct vhost_dev *hdev, VirtIODevice *vdev)
1668{
1669    int r;
1670
1671    if (hdev->vhost_ops->vhost_get_inflight_fd == NULL ||
1672        hdev->vhost_ops->vhost_set_inflight_fd == NULL) {
1673        return 0;
1674    }
1675
1676    hdev->vdev = vdev;
1677
1678    r = vhost_dev_set_features(hdev, hdev->log_enabled);
1679    if (r < 0) {
1680        VHOST_OPS_DEBUG("vhost_dev_prepare_inflight: vhost_dev_set_features failed");
1681        return r;
1682    }
1683
1684    return 0;
1685}
1686
1687int vhost_dev_set_inflight(struct vhost_dev *dev,
1688                           struct vhost_inflight *inflight)
1689{
1690    int r;
1691
1692    if (dev->vhost_ops->vhost_set_inflight_fd && inflight->addr) {
1693        r = dev->vhost_ops->vhost_set_inflight_fd(dev, inflight);
1694        if (r) {
1695            VHOST_OPS_DEBUG("vhost_set_inflight_fd failed");
1696            return -errno;
1697        }
1698    }
1699
1700    return 0;
1701}
1702
1703int vhost_dev_get_inflight(struct vhost_dev *dev, uint16_t queue_size,
1704                           struct vhost_inflight *inflight)
1705{
1706    int r;
1707
1708    if (dev->vhost_ops->vhost_get_inflight_fd) {
1709        r = dev->vhost_ops->vhost_get_inflight_fd(dev, queue_size, inflight);
1710        if (r) {
1711            VHOST_OPS_DEBUG("vhost_get_inflight_fd failed");
1712            return -errno;
1713        }
1714    }
1715
1716    return 0;
1717}
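/*
 * Illustrative sketch (not part of the original file): devices that
 * support inflight descriptor tracking (vhost-user block-style backends,
 * for instance) typically run the sequence below before
 * vhost_dev_start().  "s", its "inflight" and "queue_size" fields, and
 * the "err" label are hypothetical.
 *
 *     ret = vhost_dev_prepare_inflight(&s->dev, vdev);
 *     if (ret < 0) {
 *         goto err;
 *     }
 *
 *     if (!s->inflight->addr) {
 *         ret = vhost_dev_get_inflight(&s->dev, s->queue_size, s->inflight);
 *         if (ret < 0) {
 *             goto err;
 *         }
 *     }
 *
 *     ret = vhost_dev_set_inflight(&s->dev, s->inflight);
 *     if (ret < 0) {
 *         goto err;
 *     }
 */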
1718
1719/* Host notifiers must be enabled at this point. */
1720int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
1721{
1722    int i, r;
1723
1724    /* should only be called after backend is connected */
1725    assert(hdev->vhost_ops);
1726
1727    hdev->started = true;
1728    hdev->vdev = vdev;
1729
1730    r = vhost_dev_set_features(hdev, hdev->log_enabled);
1731    if (r < 0) {
1732        goto fail_features;
1733    }
1734
1735    if (vhost_dev_has_iommu(hdev)) {
1736        memory_listener_register(&hdev->iommu_listener, vdev->dma_as);
1737    }
1738
1739    r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem);
1740    if (r < 0) {
1741        VHOST_OPS_DEBUG("vhost_set_mem_table failed");
1742        r = -errno;
1743        goto fail_mem;
1744    }
1745    for (i = 0; i < hdev->nvqs; ++i) {
1746        r = vhost_virtqueue_start(hdev,
1747                                  vdev,
1748                                  hdev->vqs + i,
1749                                  hdev->vq_index + i);
1750        if (r < 0) {
1751            goto fail_vq;
1752        }
1753    }
1754
1755    if (hdev->log_enabled) {
1756        uint64_t log_base;
1757
1758        hdev->log_size = vhost_get_log_size(hdev);
1759        hdev->log = vhost_log_get(hdev->log_size,
1760                                  vhost_dev_log_is_shared(hdev));
1761        log_base = (uintptr_t)hdev->log->log;
1762        r = hdev->vhost_ops->vhost_set_log_base(hdev,
1763                                                hdev->log_size ? log_base : 0,
1764                                                hdev->log);
1765        if (r < 0) {
1766            VHOST_OPS_DEBUG("vhost_set_log_base failed");
1767            r = -errno;
1768            goto fail_log;
1769        }
1770    }
1771    if (hdev->vhost_ops->vhost_dev_start) {
1772        r = hdev->vhost_ops->vhost_dev_start(hdev, true);
1773        if (r) {
1774            goto fail_log;
1775        }
1776    }
1777    if (vhost_dev_has_iommu(hdev) &&
1778        hdev->vhost_ops->vhost_set_iotlb_callback) {
1779        hdev->vhost_ops->vhost_set_iotlb_callback(hdev, true);
1780
1781        /* Update used ring information for IOTLB to work correctly;
1782         * the vhost-kernel code requires this. */
1783        for (i = 0; i < hdev->nvqs; ++i) {
1784            struct vhost_virtqueue *vq = hdev->vqs + i;
1785            vhost_device_iotlb_miss(hdev, vq->used_phys, true);
1786        }
1787    }
1788    return 0;
1789fail_log:
1790    vhost_log_put(hdev, false);
1791fail_vq:
1792    while (--i >= 0) {
1793        vhost_virtqueue_stop(hdev,
1794                             vdev,
1795                             hdev->vqs + i,
1796                             hdev->vq_index + i);
1797    }
1798
1799fail_mem:
1800fail_features:
1801
1802    hdev->started = false;
1803    return r;
1804}
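/*
 * Illustrative sketch (not part of the original file): as the comment
 * above says, host notifiers must already be enabled, so a typical device
 * start path looks roughly like this.  "s" and its fields are
 * hypothetical, and guest notifiers are usually set up via the bus's
 * set_guest_notifiers() before the rings are started.
 *
 *     ret = vhost_dev_enable_notifiers(&s->dev, vdev);
 *     if (ret < 0) {
 *         return ret;
 *     }
 *
 *     ret = vhost_dev_start(&s->dev, vdev);
 *     if (ret < 0) {
 *         vhost_dev_disable_notifiers(&s->dev, vdev);
 *         return ret;
 *     }
 */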
1805
1806/* Host notifiers must be enabled at this point. */
1807void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
1808{
1809    int i;
1810
1811    /* should only be called after backend is connected */
1812    assert(hdev->vhost_ops);
1813
1814    if (hdev->vhost_ops->vhost_dev_start) {
1815        hdev->vhost_ops->vhost_dev_start(hdev, false);
1816    }
1817    for (i = 0; i < hdev->nvqs; ++i) {
1818        vhost_virtqueue_stop(hdev,
1819                             vdev,
1820                             hdev->vqs + i,
1821                             hdev->vq_index + i);
1822    }
1823
1824    if (vhost_dev_has_iommu(hdev)) {
1825        if (hdev->vhost_ops->vhost_set_iotlb_callback) {
1826            hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false);
1827        }
1828        memory_listener_unregister(&hdev->iommu_listener);
1829    }
1830    vhost_log_put(hdev, true);
1831    hdev->started = false;
1832    hdev->vdev = NULL;
1833}
1834
1835int vhost_net_set_backend(struct vhost_dev *hdev,
1836                          struct vhost_vring_file *file)
1837{
1838    if (hdev->vhost_ops->vhost_net_set_backend) {
1839        return hdev->vhost_ops->vhost_net_set_backend(hdev, file);
1840    }
1841
1842    return -ENOTSUP;
1843}
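/*
 * Illustrative sketch (not part of the original file): with the kernel
 * vhost-net backend this is how a tap file descriptor gets attached to a
 * virtqueue.  "vq_index" and "backend_fd" are hypothetical; "net" stands
 * for a structure embedding the struct vhost_dev.
 *
 *     struct vhost_vring_file file = {
 *         .index = vq_index,
 *         .fd = backend_fd,
 *     };
 *
 *     r = vhost_net_set_backend(&net->dev, &file);
 *     if (r < 0) {
 *         // detach and clean up on failure
 *     }
 */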
1844