qemu/hw/virtio/vhost.c
   1/*
   2 * vhost support
   3 *
   4 * Copyright Red Hat, Inc. 2010
   5 *
   6 * Authors:
   7 *  Michael S. Tsirkin <mst@redhat.com>
   8 *
   9 * This work is licensed under the terms of the GNU GPL, version 2.  See
  10 * the COPYING file in the top-level directory.
  11 *
  12 * Contributions after 2012-01-13 are licensed under the terms of the
  13 * GNU GPL, version 2 or (at your option) any later version.
  14 */
  15
  16#include "qemu/osdep.h"
  17#include "qapi/error.h"
  18#include "hw/virtio/vhost.h"
  19#include "hw/hw.h"
  20#include "qemu/atomic.h"
  21#include "qemu/range.h"
  22#include "qemu/error-report.h"
  23#include "qemu/memfd.h"
  24#include <linux/vhost.h>
  25#include "exec/address-spaces.h"
  26#include "hw/virtio/virtio-bus.h"
  27#include "hw/virtio/virtio-access.h"
  28#include "migration/blocker.h"
  29#include "sysemu/dma.h"
  30
  31/* enabled until disconnected backend stabilizes */
  32#define _VHOST_DEBUG 1
  33
  34#ifdef _VHOST_DEBUG
  35#define VHOST_OPS_DEBUG(fmt, ...) \
  36    do { error_report(fmt ": %s (%d)", ## __VA_ARGS__, \
  37                      strerror(errno), errno); } while (0)
  38#else
  39#define VHOST_OPS_DEBUG(fmt, ...) \
  40    do { } while (0)
  41#endif
  42
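     /*
      * The dirty log buffers are shared by all vhost devices: vhost_log is a
      * plain malloc'ed buffer, while vhost_log_shm is memfd-backed so it can
      * be handed to backends that need a shareable log (see
      * vhost_dev_log_is_shared()).  Both are reference counted by
      * vhost_log_get()/vhost_log_put().
      */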
  43static struct vhost_log *vhost_log;
  44static struct vhost_log *vhost_log_shm;
  45
  46static unsigned int used_memslots;
  47static QLIST_HEAD(, vhost_dev) vhost_devices =
  48    QLIST_HEAD_INITIALIZER(vhost_devices);
  49
  50bool vhost_has_free_slot(void)
  51{
  52    unsigned int slots_limit = ~0U;
  53    struct vhost_dev *hdev;
  54
  55    QLIST_FOREACH(hdev, &vhost_devices, entry) {
  56        unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev);
  57        slots_limit = MIN(slots_limit, r);
  58    }
  59    return slots_limit > used_memslots;
  60}
  61
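     /*
      * Forward dirty pages recorded by the backend to the memory API.  The
      * log is an array of vhost_log_chunk_t bitmaps: bit b of chunk c marks
      * the page at guest physical address
      *
      *     c * VHOST_LOG_CHUNK + b * VHOST_LOG_PAGE
      *
      * as dirty.  Only the chunks covering the intersection of the memory
      * range [mfirst, mlast] with the region range [rfirst, rlast] are
      * scanned, and each chunk is atomically cleared as it is consumed.
      */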
  62static void vhost_dev_sync_region(struct vhost_dev *dev,
  63                                  MemoryRegionSection *section,
  64                                  uint64_t mfirst, uint64_t mlast,
  65                                  uint64_t rfirst, uint64_t rlast)
  66{
  67    vhost_log_chunk_t *log = dev->log->log;
  68
  69    uint64_t start = MAX(mfirst, rfirst);
  70    uint64_t end = MIN(mlast, rlast);
  71    vhost_log_chunk_t *from = log + start / VHOST_LOG_CHUNK;
  72    vhost_log_chunk_t *to = log + end / VHOST_LOG_CHUNK + 1;
  73    uint64_t addr = QEMU_ALIGN_DOWN(start, VHOST_LOG_CHUNK);
  74
  75    if (end < start) {
  76        return;
  77    }
  78    assert(end / VHOST_LOG_CHUNK < dev->log_size);
  79    assert(start / VHOST_LOG_CHUNK < dev->log_size);
  80
  81    for (;from < to; ++from) {
  82        vhost_log_chunk_t log;
  83        /* We first check with non-atomic: much cheaper,
  84         * and we expect non-dirty to be the common case. */
  85        if (!*from) {
  86            addr += VHOST_LOG_CHUNK;
  87            continue;
  88        }
  89        /* Data must be read atomically. We don't really need barrier semantics
  90         * but it's easier to use atomic_* than roll our own. */
  91        log = atomic_xchg(from, 0);
  92        while (log) {
  93            int bit = ctzl(log);
  94            hwaddr page_addr;
  95            hwaddr section_offset;
  96            hwaddr mr_offset;
  97            page_addr = addr + bit * VHOST_LOG_PAGE;
  98            section_offset = page_addr - section->offset_within_address_space;
  99            mr_offset = section_offset + section->offset_within_region;
 100            memory_region_set_dirty(section->mr, mr_offset, VHOST_LOG_PAGE);
 101            log &= ~(0x1ull << bit);
 102        }
 103        addr += VHOST_LOG_CHUNK;
 104    }
 105}
 106
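     /*
      * Sync dirty state for every vhost memory region and every used ring
      * that overlaps the given section, clipped to [first, last].
      */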
 107static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
 108                                   MemoryRegionSection *section,
 109                                   hwaddr first,
 110                                   hwaddr last)
 111{
 112    int i;
 113    hwaddr start_addr;
 114    hwaddr end_addr;
 115
 116    if (!dev->log_enabled || !dev->started) {
 117        return 0;
 118    }
 119    start_addr = section->offset_within_address_space;
 120    end_addr = range_get_last(start_addr, int128_get64(section->size));
 121    start_addr = MAX(first, start_addr);
 122    end_addr = MIN(last, end_addr);
 123
 124    for (i = 0; i < dev->mem->nregions; ++i) {
 125        struct vhost_memory_region *reg = dev->mem->regions + i;
 126        vhost_dev_sync_region(dev, section, start_addr, end_addr,
 127                              reg->guest_phys_addr,
 128                              range_get_last(reg->guest_phys_addr,
 129                                             reg->memory_size));
 130    }
 131    for (i = 0; i < dev->nvqs; ++i) {
 132        struct vhost_virtqueue *vq = dev->vqs + i;
 133        vhost_dev_sync_region(dev, section, start_addr, end_addr, vq->used_phys,
 134                              range_get_last(vq->used_phys, vq->used_size));
 135    }
 136    return 0;
 137}
 138
 139static void vhost_log_sync(MemoryListener *listener,
 140                          MemoryRegionSection *section)
 141{
 142    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 143                                         memory_listener);
 144    vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL);
 145}
 146
 147static void vhost_log_sync_range(struct vhost_dev *dev,
 148                                 hwaddr first, hwaddr last)
 149{
 150    int i;
 151    /* FIXME: this is N^2 in number of sections */
 152    for (i = 0; i < dev->n_mem_sections; ++i) {
 153        MemoryRegionSection *section = &dev->mem_sections[i];
 154        vhost_sync_dirty_bitmap(dev, section, first, last);
 155    }
 156}
 157
 158/* Assign/unassign. Keep an unsorted array of non-overlapping
 159 * memory regions in dev->mem. */
 160static void vhost_dev_unassign_memory(struct vhost_dev *dev,
 161                                      uint64_t start_addr,
 162                                      uint64_t size)
 163{
 164    int from, to, n = dev->mem->nregions;
 165    /* Track overlapping/split regions for sanity checking. */
 166    int overlap_start = 0, overlap_end = 0, overlap_middle = 0, split = 0;
 167
 168    for (from = 0, to = 0; from < n; ++from, ++to) {
 169        struct vhost_memory_region *reg = dev->mem->regions + to;
 170        uint64_t reglast;
 171        uint64_t memlast;
 172        uint64_t change;
 173
 174        /* clone old region */
 175        if (to != from) {
 176            memcpy(reg, dev->mem->regions + from, sizeof *reg);
 177        }
 178
 179        /* No overlap is simple */
 180        if (!ranges_overlap(reg->guest_phys_addr, reg->memory_size,
 181                            start_addr, size)) {
 182            continue;
 183        }
 184
 185        /* Split only happens if supplied region
 186         * is in the middle of an existing one. Thus it can not
 187         * overlap with any other existing region. */
 188        assert(!split);
 189
 190        reglast = range_get_last(reg->guest_phys_addr, reg->memory_size);
 191        memlast = range_get_last(start_addr, size);
 192
 193        /* Remove whole region */
 194        if (start_addr <= reg->guest_phys_addr && memlast >= reglast) {
 195            --dev->mem->nregions;
 196            --to;
 197            ++overlap_middle;
 198            continue;
 199        }
 200
 201        /* Shrink region */
 202        if (memlast >= reglast) {
 203            reg->memory_size = start_addr - reg->guest_phys_addr;
 204            assert(reg->memory_size);
 205            assert(!overlap_end);
 206            ++overlap_end;
 207            continue;
 208        }
 209
 210        /* Shift region */
 211        if (start_addr <= reg->guest_phys_addr) {
 212            change = memlast + 1 - reg->guest_phys_addr;
 213            reg->memory_size -= change;
 214            reg->guest_phys_addr += change;
 215            reg->userspace_addr += change;
 216            assert(reg->memory_size);
 217            assert(!overlap_start);
 218            ++overlap_start;
 219            continue;
 220        }
 221
 222        /* This only happens if supplied region
 223         * is in the middle of an existing one. Thus it can not
 224         * overlap with any other existing region. */
 225        assert(!overlap_start);
 226        assert(!overlap_end);
 227        assert(!overlap_middle);
 228        /* Split region: shrink first part, shift second part. */
 229        memcpy(dev->mem->regions + n, reg, sizeof *reg);
 230        reg->memory_size = start_addr - reg->guest_phys_addr;
 231        assert(reg->memory_size);
 232        change = memlast + 1 - reg->guest_phys_addr;
 233        reg = dev->mem->regions + n;
 234        reg->memory_size -= change;
 235        assert(reg->memory_size);
 236        reg->guest_phys_addr += change;
 237        reg->userspace_addr += change;
 238        /* Never add more than 1 region */
 239        assert(dev->mem->nregions == n);
 240        ++dev->mem->nregions;
 241        ++split;
 242    }
 243}
 244
 245/* Called after unassign, so no regions overlap the given range. */
 246static void vhost_dev_assign_memory(struct vhost_dev *dev,
 247                                    uint64_t start_addr,
 248                                    uint64_t size,
 249                                    uint64_t uaddr)
 250{
 251    int from, to;
 252    struct vhost_memory_region *merged = NULL;
 253    for (from = 0, to = 0; from < dev->mem->nregions; ++from, ++to) {
 254        struct vhost_memory_region *reg = dev->mem->regions + to;
 255        uint64_t prlast, urlast;
 256        uint64_t pmlast, umlast;
 257        uint64_t s, e, u;
 258
 259        /* clone old region */
 260        if (to != from) {
 261            memcpy(reg, dev->mem->regions + from, sizeof *reg);
 262        }
 263        prlast = range_get_last(reg->guest_phys_addr, reg->memory_size);
 264        pmlast = range_get_last(start_addr, size);
 265        urlast = range_get_last(reg->userspace_addr, reg->memory_size);
 266        umlast = range_get_last(uaddr, size);
 267
 268        /* check for overlapping regions: should never happen. */
 269        assert(prlast < start_addr || pmlast < reg->guest_phys_addr);
 270        /* Not an adjacent or overlapping region - do not merge. */
 271        if ((prlast + 1 != start_addr || urlast + 1 != uaddr) &&
 272            (pmlast + 1 != reg->guest_phys_addr ||
 273             umlast + 1 != reg->userspace_addr)) {
 274            continue;
 275        }
 276
 277        if (dev->vhost_ops->vhost_backend_can_merge &&
 278            !dev->vhost_ops->vhost_backend_can_merge(dev, uaddr, size,
 279                                                     reg->userspace_addr,
 280                                                     reg->memory_size)) {
 281            continue;
 282        }
 283
 284        if (merged) {
 285            --to;
 286            assert(to >= 0);
 287        } else {
 288            merged = reg;
 289        }
 290        u = MIN(uaddr, reg->userspace_addr);
 291        s = MIN(start_addr, reg->guest_phys_addr);
 292        e = MAX(pmlast, prlast);
 293        uaddr = merged->userspace_addr = u;
 294        start_addr = merged->guest_phys_addr = s;
 295        size = merged->memory_size = e - s + 1;
 296        assert(merged->memory_size);
 297    }
 298
 299    if (!merged) {
 300        struct vhost_memory_region *reg = dev->mem->regions + to;
 301        memset(reg, 0, sizeof *reg);
 302        reg->memory_size = size;
 303        assert(reg->memory_size);
 304        reg->guest_phys_addr = start_addr;
 305        reg->userspace_addr = uaddr;
 306        ++to;
 307    }
 308    assert(to <= dev->mem->nregions + 1);
 309    dev->mem->nregions = to;
 310}
 311
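     /*
      * The log must cover the highest guest physical address the backend can
      * dirty: every memory region and, since vhost writes the used rings
      * directly, every used ring as well.  The returned size is in chunks,
      * not bytes.
      */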
 312static uint64_t vhost_get_log_size(struct vhost_dev *dev)
 313{
 314    uint64_t log_size = 0;
 315    int i;
 316    for (i = 0; i < dev->mem->nregions; ++i) {
 317        struct vhost_memory_region *reg = dev->mem->regions + i;
 318        uint64_t last = range_get_last(reg->guest_phys_addr,
 319                                       reg->memory_size);
 320        log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
 321    }
 322    for (i = 0; i < dev->nvqs; ++i) {
 323        struct vhost_virtqueue *vq = dev->vqs + i;
 324        uint64_t last = vq->used_phys + vq->used_size - 1;
 325        log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
 326    }
 327    return log_size;
 328}
 329
 330static struct vhost_log *vhost_log_alloc(uint64_t size, bool share)
 331{
 332    struct vhost_log *log;
 333    uint64_t logsize = size * sizeof(*(log->log));
 334    int fd = -1;
 335
 336    log = g_new0(struct vhost_log, 1);
 337    if (share) {
 338        log->log = qemu_memfd_alloc("vhost-log", logsize,
 339                                    F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
 340                                    &fd);
 341        memset(log->log, 0, logsize);
 342    } else {
 343        log->log = g_malloc0(logsize);
 344    }
 345
 346    log->size = size;
 347    log->refcnt = 1;
 348    log->fd = fd;
 349
 350    return log;
 351}
 352
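     /*
      * Return the cached log if the requested size matches, bumping its
      * refcount; otherwise allocate a fresh log and make it the new cached
      * one.  An old log that is still referenced by another device stays
      * alive until vhost_log_put() drops its last reference.
      */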
 353static struct vhost_log *vhost_log_get(uint64_t size, bool share)
 354{
 355    struct vhost_log *log = share ? vhost_log_shm : vhost_log;
 356
 357    if (!log || log->size != size) {
 358        log = vhost_log_alloc(size, share);
 359        if (share) {
 360            vhost_log_shm = log;
 361        } else {
 362            vhost_log = log;
 363        }
 364    } else {
 365        ++log->refcnt;
 366    }
 367
 368    return log;
 369}
 370
 371static void vhost_log_put(struct vhost_dev *dev, bool sync)
 372{
 373    struct vhost_log *log = dev->log;
 374
 375    if (!log) {
 376        return;
 377    }
 378
 379    --log->refcnt;
 380    if (log->refcnt == 0) {
 381        /* Sync only the range covered by the old log */
 382        if (dev->log_size && sync) {
 383            vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1);
 384        }
 385
 386        if (vhost_log == log) {
 387            g_free(log->log);
 388            vhost_log = NULL;
 389        } else if (vhost_log_shm == log) {
 390            qemu_memfd_free(log->log, log->size * sizeof(*(log->log)),
 391                            log->fd);
 392            vhost_log_shm = NULL;
 393        }
 394
 395        g_free(log);
 396    }
 397
 398    dev->log = NULL;
 399    dev->log_size = 0;
 400}
 401
 402static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
 403{
 404    return dev->vhost_ops->vhost_requires_shm_log &&
 405           dev->vhost_ops->vhost_requires_shm_log(dev);
 406}
 407
 408static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
 409{
 410    struct vhost_log *log = vhost_log_get(size, vhost_dev_log_is_shared(dev));
 411    uint64_t log_base = (uintptr_t)log->log;
 412    int r;
 413
  414    /* Inform the backend of the log switch; this must be done before
  415       releasing the current log, to ensure no logging is lost. */
 416    r = dev->vhost_ops->vhost_set_log_base(dev, log_base, log);
 417    if (r < 0) {
 418        VHOST_OPS_DEBUG("vhost_set_log_base failed");
 419    }
 420
 421    vhost_log_put(dev, true);
 422    dev->log = log;
 423    dev->log_size = size;
 424}
 425
 426static int vhost_dev_has_iommu(struct vhost_dev *dev)
 427{
 428    VirtIODevice *vdev = dev->vdev;
 429
 430    return virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
 431}
 432
 433static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,
 434                              hwaddr *plen, int is_write)
 435{
 436    if (!vhost_dev_has_iommu(dev)) {
 437        return cpu_physical_memory_map(addr, plen, is_write);
 438    } else {
 439        return (void *)(uintptr_t)addr;
 440    }
 441}
 442
 443static void vhost_memory_unmap(struct vhost_dev *dev, void *buffer,
 444                               hwaddr len, int is_write,
 445                               hwaddr access_len)
 446{
 447    if (!vhost_dev_has_iommu(dev)) {
 448        cpu_physical_memory_unmap(buffer, len, is_write, access_len);
 449    }
 450}
 451
 452static int vhost_verify_ring_part_mapping(struct vhost_dev *dev,
 453                                          void *part,
 454                                          uint64_t part_addr,
 455                                          uint64_t part_size,
 456                                          uint64_t start_addr,
 457                                          uint64_t size)
 458{
 459    hwaddr l;
 460    void *p;
 461    int r = 0;
 462
 463    if (!ranges_overlap(start_addr, size, part_addr, part_size)) {
 464        return 0;
 465    }
 466    l = part_size;
 467    p = vhost_memory_map(dev, part_addr, &l, 1);
 468    if (!p || l != part_size) {
 469        r = -ENOMEM;
 470    }
 471    if (p != part) {
 472        r = -EBUSY;
 473    }
 474    vhost_memory_unmap(dev, p, l, 0, 0);
 475    return r;
 476}
 477
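     /*
      * After a memory layout change, check that the rings the backend is
      * using still map to the same host addresses that were handed to it in
      * vhost_virtqueue_set_addr(); presumably a relocation cannot be
      * propagated safely while the device is running, so the caller treats
      * any mismatch as fatal.
      */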
 478static int vhost_verify_ring_mappings(struct vhost_dev *dev,
 479                                      uint64_t start_addr,
 480                                      uint64_t size)
 481{
 482    int i, j;
 483    int r = 0;
 484    const char *part_name[] = {
 485        "descriptor table",
 486        "available ring",
 487        "used ring"
 488    };
 489
 490    for (i = 0; i < dev->nvqs; ++i) {
 491        struct vhost_virtqueue *vq = dev->vqs + i;
 492
 493        j = 0;
 494        r = vhost_verify_ring_part_mapping(dev, vq->desc, vq->desc_phys,
 495                                           vq->desc_size, start_addr, size);
 496        if (r) {
 497            break;
 498        }
 499
 500        j++;
 501        r = vhost_verify_ring_part_mapping(dev, vq->avail, vq->avail_phys,
 502                                           vq->avail_size, start_addr, size);
 503        if (r) {
 504            break;
 505        }
 506
 507        j++;
 508        r = vhost_verify_ring_part_mapping(dev, vq->used, vq->used_phys,
 509                                           vq->used_size, start_addr, size);
 510        if (r) {
 511            break;
 512        }
 513    }
 514
 515    if (r == -ENOMEM) {
 516        error_report("Unable to map %s for ring %d", part_name[j], i);
 517    } else if (r == -EBUSY) {
 518        error_report("%s relocated for ring %d", part_name[j], i);
 519    }
 520    return r;
 521}
 522
 523static struct vhost_memory_region *vhost_dev_find_reg(struct vhost_dev *dev,
 524                                                      uint64_t start_addr,
 525                                                      uint64_t size)
 526{
 527    int i, n = dev->mem->nregions;
 528    for (i = 0; i < n; ++i) {
 529        struct vhost_memory_region *reg = dev->mem->regions + i;
 530        if (ranges_overlap(reg->guest_phys_addr, reg->memory_size,
 531                           start_addr, size)) {
 532            return reg;
 533        }
 534    }
 535    return NULL;
 536}
 537
 538static bool vhost_dev_cmp_memory(struct vhost_dev *dev,
 539                                 uint64_t start_addr,
 540                                 uint64_t size,
 541                                 uint64_t uaddr)
 542{
 543    struct vhost_memory_region *reg = vhost_dev_find_reg(dev, start_addr, size);
 544    uint64_t reglast;
 545    uint64_t memlast;
 546
 547    if (!reg) {
 548        return true;
 549    }
 550
 551    reglast = range_get_last(reg->guest_phys_addr, reg->memory_size);
 552    memlast = range_get_last(start_addr, size);
 553
 554    /* Need to extend region? */
 555    if (start_addr < reg->guest_phys_addr || memlast > reglast) {
 556        return true;
 557    }
 558    /* userspace_addr changed? */
 559    return uaddr != reg->userspace_addr + start_addr - reg->guest_phys_addr;
 560}
 561
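     /*
      * Add or remove one memory section from the table that will be passed
      * to the backend.  Sections whose dirty-log mask includes clients other
      * than migration (e.g. VGA) are treated as removals, presumably because
      * writes done by the backend would bypass that dirty tracking.  The
      * affected range is accumulated so that vhost_commit() can verify the
      * ring mappings and push the updated table.
      */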
 562static void vhost_set_memory(MemoryListener *listener,
 563                             MemoryRegionSection *section,
 564                             bool add)
 565{
 566    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 567                                         memory_listener);
 568    hwaddr start_addr = section->offset_within_address_space;
 569    ram_addr_t size = int128_get64(section->size);
 570    bool log_dirty =
 571        memory_region_get_dirty_log_mask(section->mr) & ~(1 << DIRTY_MEMORY_MIGRATION);
 572    int s = offsetof(struct vhost_memory, regions) +
 573        (dev->mem->nregions + 1) * sizeof dev->mem->regions[0];
 574    void *ram;
 575
 576    dev->mem = g_realloc(dev->mem, s);
 577
 578    if (log_dirty) {
 579        add = false;
 580    }
 581
 582    assert(size);
 583
 584    /* Optimize no-change case. At least cirrus_vga does this a lot at this time. */
 585    ram = memory_region_get_ram_ptr(section->mr) + section->offset_within_region;
 586    if (add) {
 587        if (!vhost_dev_cmp_memory(dev, start_addr, size, (uintptr_t)ram)) {
 588            /* Region exists with same address. Nothing to do. */
 589            return;
 590        }
 591    } else {
 592        if (!vhost_dev_find_reg(dev, start_addr, size)) {
 593            /* Removing region that we don't access. Nothing to do. */
 594            return;
 595        }
 596    }
 597
 598    vhost_dev_unassign_memory(dev, start_addr, size);
 599    if (add) {
 600        /* Add given mapping, merging adjacent regions if any */
 601        vhost_dev_assign_memory(dev, start_addr, size, (uintptr_t)ram);
 602    } else {
 603        /* Remove old mapping for this memory, if any. */
 604        vhost_dev_unassign_memory(dev, start_addr, size);
 605    }
 606    dev->mem_changed_start_addr = MIN(dev->mem_changed_start_addr, start_addr);
 607    dev->mem_changed_end_addr = MAX(dev->mem_changed_end_addr, start_addr + size - 1);
 608    dev->memory_changed = true;
 609    used_memslots = dev->mem->nregions;
 610}
 611
 612static bool vhost_section(MemoryRegionSection *section)
 613{
 614    return memory_region_is_ram(section->mr) &&
 615        !memory_region_is_rom(section->mr);
 616}
 617
 618static void vhost_begin(MemoryListener *listener)
 619{
 620    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 621                                         memory_listener);
 622    dev->mem_changed_end_addr = 0;
 623    dev->mem_changed_start_addr = -1;
 624}
 625
 626static void vhost_commit(MemoryListener *listener)
 627{
 628    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 629                                         memory_listener);
 630    hwaddr start_addr = 0;
 631    ram_addr_t size = 0;
 632    uint64_t log_size;
 633    int r;
 634
 635    if (!dev->memory_changed) {
 636        return;
 637    }
 638    if (!dev->started) {
 639        return;
 640    }
 641    if (dev->mem_changed_start_addr > dev->mem_changed_end_addr) {
 642        return;
 643    }
 644
 645    if (dev->started) {
 646        start_addr = dev->mem_changed_start_addr;
 647        size = dev->mem_changed_end_addr - dev->mem_changed_start_addr + 1;
 648
 649        r = vhost_verify_ring_mappings(dev, start_addr, size);
 650        assert(r >= 0);
 651    }
 652
 653    if (!dev->log_enabled) {
 654        r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
 655        if (r < 0) {
 656            VHOST_OPS_DEBUG("vhost_set_mem_table failed");
 657        }
 658        dev->memory_changed = false;
 659        return;
 660    }
 661    log_size = vhost_get_log_size(dev);
  662    /* We allocate an extra 4K bytes to the log,
  663     * to reduce the number of reallocations. */
 664#define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log)
 665    /* To log more, must increase log size before table update. */
 666    if (dev->log_size < log_size) {
 667        vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER);
 668    }
 669    r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
 670    if (r < 0) {
 671        VHOST_OPS_DEBUG("vhost_set_mem_table failed");
 672    }
 673    /* To log less, can only decrease log size after table update. */
 674    if (dev->log_size > log_size + VHOST_LOG_BUFFER) {
 675        vhost_dev_log_resize(dev, log_size);
 676    }
 677    dev->memory_changed = false;
 678}
 679
 680static void vhost_region_add(MemoryListener *listener,
 681                             MemoryRegionSection *section)
 682{
 683    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 684                                         memory_listener);
 685
 686    if (!vhost_section(section)) {
 687        return;
 688    }
 689
 690    ++dev->n_mem_sections;
 691    dev->mem_sections = g_renew(MemoryRegionSection, dev->mem_sections,
 692                                dev->n_mem_sections);
 693    dev->mem_sections[dev->n_mem_sections - 1] = *section;
 694    memory_region_ref(section->mr);
 695    vhost_set_memory(listener, section, true);
 696}
 697
 698static void vhost_region_del(MemoryListener *listener,
 699                             MemoryRegionSection *section)
 700{
 701    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 702                                         memory_listener);
 703    int i;
 704
 705    if (!vhost_section(section)) {
 706        return;
 707    }
 708
 709    vhost_set_memory(listener, section, false);
 710    memory_region_unref(section->mr);
 711    for (i = 0; i < dev->n_mem_sections; ++i) {
 712        if (dev->mem_sections[i].offset_within_address_space
 713            == section->offset_within_address_space) {
 714            --dev->n_mem_sections;
 715            memmove(&dev->mem_sections[i], &dev->mem_sections[i+1],
 716                    (dev->n_mem_sections - i) * sizeof(*dev->mem_sections));
 717            break;
 718        }
 719    }
 720}
 721
 722static void vhost_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
 723{
 724    struct vhost_iommu *iommu = container_of(n, struct vhost_iommu, n);
 725    struct vhost_dev *hdev = iommu->hdev;
 726    hwaddr iova = iotlb->iova + iommu->iommu_offset;
 727
 728    if (vhost_backend_invalidate_device_iotlb(hdev, iova,
 729                                              iotlb->addr_mask + 1)) {
  730        error_report("Failed to invalidate device iotlb");
 731    }
 732}
 733
 734static void vhost_iommu_region_add(MemoryListener *listener,
 735                                   MemoryRegionSection *section)
 736{
 737    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 738                                         iommu_listener);
 739    struct vhost_iommu *iommu;
 740    Int128 end;
 741
 742    if (!memory_region_is_iommu(section->mr)) {
 743        return;
 744    }
 745
 746    iommu = g_malloc0(sizeof(*iommu));
 747    end = int128_add(int128_make64(section->offset_within_region),
 748                     section->size);
 749    end = int128_sub(end, int128_one());
 750    iommu_notifier_init(&iommu->n, vhost_iommu_unmap_notify,
 751                        IOMMU_NOTIFIER_UNMAP,
 752                        section->offset_within_region,
 753                        int128_get64(end));
 754    iommu->mr = section->mr;
 755    iommu->iommu_offset = section->offset_within_address_space -
 756                          section->offset_within_region;
 757    iommu->hdev = dev;
 758    memory_region_register_iommu_notifier(section->mr, &iommu->n);
 759    QLIST_INSERT_HEAD(&dev->iommu_list, iommu, iommu_next);
 760    /* TODO: can replay help performance here? */
 761}
 762
 763static void vhost_iommu_region_del(MemoryListener *listener,
 764                                   MemoryRegionSection *section)
 765{
 766    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 767                                         iommu_listener);
 768    struct vhost_iommu *iommu;
 769
 770    if (!memory_region_is_iommu(section->mr)) {
 771        return;
 772    }
 773
 774    QLIST_FOREACH(iommu, &dev->iommu_list, iommu_next) {
 775        if (iommu->mr == section->mr &&
 776            iommu->n.start == section->offset_within_region) {
 777            memory_region_unregister_iommu_notifier(iommu->mr,
 778                                                    &iommu->n);
 779            QLIST_REMOVE(iommu, iommu_next);
 780            g_free(iommu);
 781            break;
 782        }
 783    }
 784}
 785
 786static void vhost_region_nop(MemoryListener *listener,
 787                             MemoryRegionSection *section)
 788{
 789}
 790
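     /*
      * Tell the backend where this virtqueue's descriptor, avail and used
      * rings live in our process address space, and whether writes to the
      * used ring must be recorded in the dirty log (log_guest_addr is the
      * guest physical address used for that logging).
      */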
 791static int vhost_virtqueue_set_addr(struct vhost_dev *dev,
 792                                    struct vhost_virtqueue *vq,
 793                                    unsigned idx, bool enable_log)
 794{
 795    struct vhost_vring_addr addr = {
 796        .index = idx,
 797        .desc_user_addr = (uint64_t)(unsigned long)vq->desc,
 798        .avail_user_addr = (uint64_t)(unsigned long)vq->avail,
 799        .used_user_addr = (uint64_t)(unsigned long)vq->used,
 800        .log_guest_addr = vq->used_phys,
 801        .flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0,
 802    };
 803    int r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);
 804    if (r < 0) {
 805        VHOST_OPS_DEBUG("vhost_set_vring_addr failed");
 806        return -errno;
 807    }
 808    return 0;
 809}
 810
 811static int vhost_dev_set_features(struct vhost_dev *dev,
 812                                  bool enable_log)
 813{
 814    uint64_t features = dev->acked_features;
 815    int r;
 816    if (enable_log) {
 817        features |= 0x1ULL << VHOST_F_LOG_ALL;
 818    }
 819    r = dev->vhost_ops->vhost_set_features(dev, features);
 820    if (r < 0) {
 821        VHOST_OPS_DEBUG("vhost_set_features failed");
 822    }
 823    return r < 0 ? -errno : 0;
 824}
 825
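     /*
      * Switch dirty logging on or off: renegotiate the feature bits with or
      * without VHOST_F_LOG_ALL, then update every vring so that used-ring
      * writes are logged accordingly.  On failure, roll the rings and the
      * features back to the previous logging state.
      */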
 826static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
 827{
 828    int r, i, idx;
 829    r = vhost_dev_set_features(dev, enable_log);
 830    if (r < 0) {
 831        goto err_features;
 832    }
 833    for (i = 0; i < dev->nvqs; ++i) {
 834        idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
 835        r = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
 836                                     enable_log);
 837        if (r < 0) {
 838            goto err_vq;
 839        }
 840    }
 841    return 0;
 842err_vq:
 843    for (; i >= 0; --i) {
 844        idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
 845        vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
 846                                 dev->log_enabled);
 847    }
 848    vhost_dev_set_features(dev, dev->log_enabled);
 849err_features:
 850    return r;
 851}
 852
 853static int vhost_migration_log(MemoryListener *listener, int enable)
 854{
 855    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 856                                         memory_listener);
 857    int r;
 858    if (!!enable == dev->log_enabled) {
 859        return 0;
 860    }
 861    if (!dev->started) {
 862        dev->log_enabled = enable;
 863        return 0;
 864    }
 865    if (!enable) {
 866        r = vhost_dev_set_log(dev, false);
 867        if (r < 0) {
 868            return r;
 869        }
 870        vhost_log_put(dev, false);
 871    } else {
 872        vhost_dev_log_resize(dev, vhost_get_log_size(dev));
 873        r = vhost_dev_set_log(dev, true);
 874        if (r < 0) {
 875            return r;
 876        }
 877    }
 878    dev->log_enabled = enable;
 879    return 0;
 880}
 881
 882static void vhost_log_global_start(MemoryListener *listener)
 883{
 884    int r;
 885
 886    r = vhost_migration_log(listener, true);
 887    if (r < 0) {
 888        abort();
 889    }
 890}
 891
 892static void vhost_log_global_stop(MemoryListener *listener)
 893{
 894    int r;
 895
 896    r = vhost_migration_log(listener, false);
 897    if (r < 0) {
 898        abort();
 899    }
 900}
 901
 902static void vhost_log_start(MemoryListener *listener,
 903                            MemoryRegionSection *section,
 904                            int old, int new)
 905{
 906    /* FIXME: implement */
 907}
 908
 909static void vhost_log_stop(MemoryListener *listener,
 910                           MemoryRegionSection *section,
 911                           int old, int new)
 912{
 913    /* FIXME: implement */
 914}
 915
 916/* The vhost driver natively knows how to handle the vrings of non
 917 * cross-endian legacy devices and modern devices. Only legacy devices
 918 * exposed to a bi-endian guest may require the vhost driver to use a
 919 * specific endianness.
 920 */
 921static inline bool vhost_needs_vring_endian(VirtIODevice *vdev)
 922{
 923    if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
 924        return false;
 925    }
 926#ifdef HOST_WORDS_BIGENDIAN
 927    return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_LITTLE;
 928#else
 929    return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_BIG;
 930#endif
 931}
 932
 933static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev,
 934                                                   bool is_big_endian,
 935                                                   int vhost_vq_index)
 936{
 937    struct vhost_vring_state s = {
 938        .index = vhost_vq_index,
 939        .num = is_big_endian
 940    };
 941
 942    if (!dev->vhost_ops->vhost_set_vring_endian(dev, &s)) {
 943        return 0;
 944    }
 945
 946    VHOST_OPS_DEBUG("vhost_set_vring_endian failed");
 947    if (errno == ENOTTY) {
 948        error_report("vhost does not support cross-endian");
 949        return -ENOSYS;
 950    }
 951
 952    return -errno;
 953}
 954
 955static int vhost_memory_region_lookup(struct vhost_dev *hdev,
 956                                      uint64_t gpa, uint64_t *uaddr,
 957                                      uint64_t *len)
 958{
 959    int i;
 960
 961    for (i = 0; i < hdev->mem->nregions; i++) {
 962        struct vhost_memory_region *reg = hdev->mem->regions + i;
 963
 964        if (gpa >= reg->guest_phys_addr &&
 965            reg->guest_phys_addr + reg->memory_size > gpa) {
 966            *uaddr = reg->userspace_addr + gpa - reg->guest_phys_addr;
 967            *len = reg->guest_phys_addr + reg->memory_size - gpa;
 968            return 0;
 969        }
 970    }
 971
 972    return -EFAULT;
 973}
 974
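     /*
      * Handle an IOTLB miss reported by the backend: translate the IOVA
      * through the device's DMA address space, clamp the result to the vhost
      * memory region that contains it, and push the mapping back to the
      * backend.
      */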
 975int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write)
 976{
 977    IOMMUTLBEntry iotlb;
 978    uint64_t uaddr, len;
 979    int ret = -EFAULT;
 980
 981    rcu_read_lock();
 982
 983    iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
 984                                          iova, write);
 985    if (iotlb.target_as != NULL) {
 986        ret = vhost_memory_region_lookup(dev, iotlb.translated_addr,
 987                                         &uaddr, &len);
 988        if (ret) {
  989            error_report("Failed to look up the translated address "
 990                         "%"PRIx64, iotlb.translated_addr);
 991            goto out;
 992        }
 993
 994        len = MIN(iotlb.addr_mask + 1, len);
 995        iova = iova & ~iotlb.addr_mask;
 996
 997        ret = vhost_backend_update_device_iotlb(dev, iova, uaddr,
 998                                                len, iotlb.perm);
 999        if (ret) {
 1000            error_report("Failed to update device iotlb");
1001            goto out;
1002        }
1003    }
1004out:
1005    rcu_read_unlock();
1006
1007    return ret;
1008}
1009
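     /*
      * Bring one virtqueue up on the backend: program the ring size and the
      * last avail index, fix up the ring endianness for cross-endian legacy
      * guests, map the descriptor/avail/used rings, hand their addresses to
      * the backend and wire up the kick eventfd.  The call eventfd is
      * disabled (fd = -1) when the transport reports that no guest notifier
      * vector is in use for this queue.
      */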
1010static int vhost_virtqueue_start(struct vhost_dev *dev,
1011                                struct VirtIODevice *vdev,
1012                                struct vhost_virtqueue *vq,
1013                                unsigned idx)
1014{
1015    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
1016    VirtioBusState *vbus = VIRTIO_BUS(qbus);
1017    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
1018    hwaddr s, l, a;
1019    int r;
1020    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
1021    struct vhost_vring_file file = {
1022        .index = vhost_vq_index
1023    };
1024    struct vhost_vring_state state = {
1025        .index = vhost_vq_index
1026    };
1027    struct VirtQueue *vvq = virtio_get_queue(vdev, idx);
1028
1029
1030    vq->num = state.num = virtio_queue_get_num(vdev, idx);
1031    r = dev->vhost_ops->vhost_set_vring_num(dev, &state);
1032    if (r) {
1033        VHOST_OPS_DEBUG("vhost_set_vring_num failed");
1034        return -errno;
1035    }
1036
1037    state.num = virtio_queue_get_last_avail_idx(vdev, idx);
1038    r = dev->vhost_ops->vhost_set_vring_base(dev, &state);
1039    if (r) {
1040        VHOST_OPS_DEBUG("vhost_set_vring_base failed");
1041        return -errno;
1042    }
1043
1044    if (vhost_needs_vring_endian(vdev)) {
1045        r = vhost_virtqueue_set_vring_endian_legacy(dev,
1046                                                    virtio_is_big_endian(vdev),
1047                                                    vhost_vq_index);
1048        if (r) {
1049            return -errno;
1050        }
1051    }
1052
1053    vq->desc_size = s = l = virtio_queue_get_desc_size(vdev, idx);
1054    vq->desc_phys = a = virtio_queue_get_desc_addr(vdev, idx);
1055    vq->desc = vhost_memory_map(dev, a, &l, 0);
1056    if (!vq->desc || l != s) {
1057        r = -ENOMEM;
1058        goto fail_alloc_desc;
1059    }
1060    vq->avail_size = s = l = virtio_queue_get_avail_size(vdev, idx);
1061    vq->avail_phys = a = virtio_queue_get_avail_addr(vdev, idx);
1062    vq->avail = vhost_memory_map(dev, a, &l, 0);
1063    if (!vq->avail || l != s) {
1064        r = -ENOMEM;
1065        goto fail_alloc_avail;
1066    }
1067    vq->used_size = s = l = virtio_queue_get_used_size(vdev, idx);
1068    vq->used_phys = a = virtio_queue_get_used_addr(vdev, idx);
1069    vq->used = vhost_memory_map(dev, a, &l, 1);
1070    if (!vq->used || l != s) {
1071        r = -ENOMEM;
1072        goto fail_alloc_used;
1073    }
1074
1075    r = vhost_virtqueue_set_addr(dev, vq, vhost_vq_index, dev->log_enabled);
1076    if (r < 0) {
1077        r = -errno;
1078        goto fail_alloc;
1079    }
1080
1081    file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq));
1082    r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
1083    if (r) {
1084        VHOST_OPS_DEBUG("vhost_set_vring_kick failed");
1085        r = -errno;
1086        goto fail_kick;
1087    }
1088
1089    /* Clear and discard previous events if any. */
1090    event_notifier_test_and_clear(&vq->masked_notifier);
1091
1092    /* Init vring in unmasked state, unless guest_notifier_mask
1093     * will do it later.
1094     */
1095    if (!vdev->use_guest_notifier_mask) {
1096        /* TODO: check and handle errors. */
1097        vhost_virtqueue_mask(dev, vdev, idx, false);
1098    }
1099
1100    if (k->query_guest_notifiers &&
1101        k->query_guest_notifiers(qbus->parent) &&
1102        virtio_queue_vector(vdev, idx) == VIRTIO_NO_VECTOR) {
1103        file.fd = -1;
1104        r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
1105        if (r) {
1106            goto fail_vector;
1107        }
1108    }
1109
1110    return 0;
1111
1112fail_vector:
1113fail_kick:
1114fail_alloc:
1115    vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
1116                       0, 0);
1117fail_alloc_used:
1118    vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
1119                       0, 0);
1120fail_alloc_avail:
1121    vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
1122                       0, 0);
1123fail_alloc_desc:
1124    return r;
1125}
1126
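     /*
      * Tear one virtqueue down: read the last avail index back from the
      * backend so virtio can resume processing in QEMU (falling back to the
      * device's used index if the backend is gone), undo any legacy
      * cross-endian setting, and unmap the rings.
      */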
1127static void vhost_virtqueue_stop(struct vhost_dev *dev,
1128                                    struct VirtIODevice *vdev,
1129                                    struct vhost_virtqueue *vq,
1130                                    unsigned idx)
1131{
1132    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
1133    struct vhost_vring_state state = {
1134        .index = vhost_vq_index,
1135    };
1136    int r;
1137
1138    r = dev->vhost_ops->vhost_get_vring_base(dev, &state);
1139    if (r < 0) {
1140        VHOST_OPS_DEBUG("vhost VQ %d ring restore failed: %d", idx, r);
1141        /* Connection to the backend is broken, so let's sync internal
1142         * last avail idx to the device used idx.
1143         */
1144        virtio_queue_restore_last_avail_idx(vdev, idx);
1145    } else {
1146        virtio_queue_set_last_avail_idx(vdev, idx, state.num);
1147    }
1148    virtio_queue_invalidate_signalled_used(vdev, idx);
1149    virtio_queue_update_used_idx(vdev, idx);
1150
1151    /* In the cross-endian case, we need to reset the vring endianness to
 1152     * native, which is what legacy devices expect by default.
1153     */
1154    if (vhost_needs_vring_endian(vdev)) {
1155        vhost_virtqueue_set_vring_endian_legacy(dev,
1156                                                !virtio_is_big_endian(vdev),
1157                                                vhost_vq_index);
1158    }
1159
1160    vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
1161                       1, virtio_queue_get_used_size(vdev, idx));
1162    vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
1163                       0, virtio_queue_get_avail_size(vdev, idx));
1164    vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
1165                       0, virtio_queue_get_desc_size(vdev, idx));
1166}
1167
1168static void vhost_eventfd_add(MemoryListener *listener,
1169                              MemoryRegionSection *section,
1170                              bool match_data, uint64_t data, EventNotifier *e)
1171{
1172}
1173
1174static void vhost_eventfd_del(MemoryListener *listener,
1175                              MemoryRegionSection *section,
1176                              bool match_data, uint64_t data, EventNotifier *e)
1177{
1178}
1179
1180static int vhost_virtqueue_set_busyloop_timeout(struct vhost_dev *dev,
1181                                                int n, uint32_t timeout)
1182{
1183    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
1184    struct vhost_vring_state state = {
1185        .index = vhost_vq_index,
1186        .num = timeout,
1187    };
1188    int r;
1189
1190    if (!dev->vhost_ops->vhost_set_vring_busyloop_timeout) {
1191        return -EINVAL;
1192    }
1193
1194    r = dev->vhost_ops->vhost_set_vring_busyloop_timeout(dev, &state);
1195    if (r) {
1196        VHOST_OPS_DEBUG("vhost_set_vring_busyloop_timeout failed");
1197        return r;
1198    }
1199
1200    return 0;
1201}
1202
1203static int vhost_virtqueue_init(struct vhost_dev *dev,
1204                                struct vhost_virtqueue *vq, int n)
1205{
1206    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
1207    struct vhost_vring_file file = {
1208        .index = vhost_vq_index,
1209    };
1210    int r = event_notifier_init(&vq->masked_notifier, 0);
1211    if (r < 0) {
1212        return r;
1213    }
1214
1215    file.fd = event_notifier_get_fd(&vq->masked_notifier);
1216    r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
1217    if (r) {
1218        VHOST_OPS_DEBUG("vhost_set_vring_call failed");
1219        r = -errno;
1220        goto fail_call;
1221    }
1222
1223    vq->dev = dev;
1224
1225    return 0;
1226fail_call:
1227    event_notifier_cleanup(&vq->masked_notifier);
1228    return r;
1229}
1230
1231static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq)
1232{
1233    event_notifier_cleanup(&vq->masked_notifier);
1234}
1235
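     /*
      * One-time backend setup: attach the backend, take ownership, query its
      * feature set, initialize every virtqueue's masked notifier and call
      * eventfd, set up the memory listeners (the RAM listener is registered
      * here, the IOMMU listener in vhost_dev_start()), and install a
      * migration blocker when dirty logging is unavailable.
      *
      * A typical caller (e.g. vhost_net) roughly does:
      *
      *     vhost_dev_init(&dev, opaque, backend_type, busyloop_timeout);
      *     ...
      *     vhost_dev_enable_notifiers(&dev, vdev);
      *     vhost_dev_start(&dev, vdev);
      *     ...
      *     vhost_dev_stop(&dev, vdev);
      *     vhost_dev_disable_notifiers(&dev, vdev);
      *     vhost_dev_cleanup(&dev);
      */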
1236int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
1237                   VhostBackendType backend_type, uint32_t busyloop_timeout)
1238{
1239    uint64_t features;
1240    int i, r, n_initialized_vqs = 0;
1241    Error *local_err = NULL;
1242
1243    hdev->vdev = NULL;
1244    hdev->migration_blocker = NULL;
1245
1246    r = vhost_set_backend_type(hdev, backend_type);
1247    assert(r >= 0);
1248
1249    r = hdev->vhost_ops->vhost_backend_init(hdev, opaque);
1250    if (r < 0) {
1251        goto fail;
1252    }
1253
1254    if (used_memslots > hdev->vhost_ops->vhost_backend_memslots_limit(hdev)) {
1255        error_report("vhost backend memory slots limit is less"
1256                " than current number of present memory slots");
1257        r = -1;
1258        goto fail;
1259    }
1260
1261    r = hdev->vhost_ops->vhost_set_owner(hdev);
1262    if (r < 0) {
1263        VHOST_OPS_DEBUG("vhost_set_owner failed");
1264        goto fail;
1265    }
1266
1267    r = hdev->vhost_ops->vhost_get_features(hdev, &features);
1268    if (r < 0) {
1269        VHOST_OPS_DEBUG("vhost_get_features failed");
1270        goto fail;
1271    }
1272
1273    for (i = 0; i < hdev->nvqs; ++i, ++n_initialized_vqs) {
1274        r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i);
1275        if (r < 0) {
1276            goto fail;
1277        }
1278    }
1279
1280    if (busyloop_timeout) {
1281        for (i = 0; i < hdev->nvqs; ++i) {
1282            r = vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i,
1283                                                     busyloop_timeout);
1284            if (r < 0) {
1285                goto fail_busyloop;
1286            }
1287        }
1288    }
1289
1290    hdev->features = features;
1291
1292    hdev->memory_listener = (MemoryListener) {
1293        .begin = vhost_begin,
1294        .commit = vhost_commit,
1295        .region_add = vhost_region_add,
1296        .region_del = vhost_region_del,
1297        .region_nop = vhost_region_nop,
1298        .log_start = vhost_log_start,
1299        .log_stop = vhost_log_stop,
1300        .log_sync = vhost_log_sync,
1301        .log_global_start = vhost_log_global_start,
1302        .log_global_stop = vhost_log_global_stop,
1303        .eventfd_add = vhost_eventfd_add,
1304        .eventfd_del = vhost_eventfd_del,
1305        .priority = 10
1306    };
1307
1308    hdev->iommu_listener = (MemoryListener) {
1309        .region_add = vhost_iommu_region_add,
1310        .region_del = vhost_iommu_region_del,
1311    };
1312
1313    if (hdev->migration_blocker == NULL) {
1314        if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) {
1315            error_setg(&hdev->migration_blocker,
1316                       "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature.");
1317        } else if (vhost_dev_log_is_shared(hdev) && !qemu_memfd_check()) {
1318            error_setg(&hdev->migration_blocker,
1319                       "Migration disabled: failed to allocate shared memory");
1320        }
1321    }
1322
1323    if (hdev->migration_blocker != NULL) {
1324        r = migrate_add_blocker(hdev->migration_blocker, &local_err);
1325        if (local_err) {
1326            error_report_err(local_err);
1327            error_free(hdev->migration_blocker);
1328            goto fail_busyloop;
1329        }
1330    }
1331
1332    hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions));
1333    hdev->n_mem_sections = 0;
1334    hdev->mem_sections = NULL;
1335    hdev->log = NULL;
1336    hdev->log_size = 0;
1337    hdev->log_enabled = false;
1338    hdev->started = false;
1339    hdev->memory_changed = false;
1340    memory_listener_register(&hdev->memory_listener, &address_space_memory);
1341    QLIST_INSERT_HEAD(&vhost_devices, hdev, entry);
1342    return 0;
1343
1344fail_busyloop:
1345    while (--i >= 0) {
1346        vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i, 0);
1347    }
1348fail:
1349    hdev->nvqs = n_initialized_vqs;
1350    vhost_dev_cleanup(hdev);
1351    return r;
1352}
1353
1354void vhost_dev_cleanup(struct vhost_dev *hdev)
1355{
1356    int i;
1357
1358    for (i = 0; i < hdev->nvqs; ++i) {
1359        vhost_virtqueue_cleanup(hdev->vqs + i);
1360    }
1361    if (hdev->mem) {
1362        /* those are only safe after successful init */
1363        memory_listener_unregister(&hdev->memory_listener);
1364        for (i = 0; i < hdev->n_mem_sections; ++i) {
1365            MemoryRegionSection *section = &hdev->mem_sections[i];
1366            memory_region_unref(section->mr);
1367        }
1368        QLIST_REMOVE(hdev, entry);
1369    }
1370    if (hdev->migration_blocker) {
1371        migrate_del_blocker(hdev->migration_blocker);
1372        error_free(hdev->migration_blocker);
1373    }
1374    g_free(hdev->mem);
1375    g_free(hdev->mem_sections);
1376    if (hdev->vhost_ops) {
1377        hdev->vhost_ops->vhost_backend_cleanup(hdev);
1378    }
1379    assert(!hdev->log);
1380
1381    memset(hdev, 0, sizeof(struct vhost_dev));
1382}
1383
1384/* Stop processing guest IO notifications in qemu.
1385 * Start processing them in vhost in kernel.
1386 */
1387int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
1388{
1389    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
1390    int i, r, e;
1391
1392    /* We will pass the notifiers to the kernel, make sure that QEMU
1393     * doesn't interfere.
1394     */
1395    r = virtio_device_grab_ioeventfd(vdev);
1396    if (r < 0) {
1397        error_report("binding does not support host notifiers");
1398        goto fail;
1399    }
1400
1401    for (i = 0; i < hdev->nvqs; ++i) {
1402        r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
1403                                         true);
1404        if (r < 0) {
1405            error_report("vhost VQ %d notifier binding failed: %d", i, -r);
1406            goto fail_vq;
1407        }
1408    }
1409
1410    return 0;
1411fail_vq:
1412    while (--i >= 0) {
1413        e = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
1414                                         false);
1415        if (e < 0) {
 1416            error_report("vhost VQ %d notifier cleanup error: %d", i, -e);
1417        }
1418        assert (e >= 0);
1419    }
1420    virtio_device_release_ioeventfd(vdev);
1421fail:
1422    return r;
1423}
1424
1425/* Stop processing guest IO notifications in vhost.
1426 * Start processing them in qemu.
1427 * This might actually run the qemu handlers right away,
1428 * so virtio in qemu must be completely setup when this is called.
1429 */
1430void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
1431{
1432    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
1433    int i, r;
1434
1435    for (i = 0; i < hdev->nvqs; ++i) {
1436        r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
1437                                         false);
1438        if (r < 0) {
1439            error_report("vhost VQ %d notifier cleanup failed: %d", i, -r);
1440        }
1441        assert (r >= 0);
1442    }
1443    virtio_device_release_ioeventfd(vdev);
1444}
1445
1446/* Test and clear event pending status.
1447 * Should be called after unmask to avoid losing events.
1448 */
1449bool vhost_virtqueue_pending(struct vhost_dev *hdev, int n)
1450{
1451    struct vhost_virtqueue *vq = hdev->vqs + n - hdev->vq_index;
1452    assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs);
1453    return event_notifier_test_and_clear(&vq->masked_notifier);
1454}
1455
1456/* Mask/unmask events from this vq. */
1457void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n,
1458                         bool mask)
1459{
1460    struct VirtQueue *vvq = virtio_get_queue(vdev, n);
1461    int r, index = n - hdev->vq_index;
1462    struct vhost_vring_file file;
1463
1464    /* should only be called after backend is connected */
1465    assert(hdev->vhost_ops);
1466
1467    if (mask) {
1468        assert(vdev->use_guest_notifier_mask);
1469        file.fd = event_notifier_get_fd(&hdev->vqs[index].masked_notifier);
1470    } else {
1471        file.fd = event_notifier_get_fd(virtio_queue_get_guest_notifier(vvq));
1472    }
1473
1474    file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n);
1475    r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file);
1476    if (r < 0) {
1477        VHOST_OPS_DEBUG("vhost_set_vring_call failed");
1478    }
1479}
1480
1481uint64_t vhost_get_features(struct vhost_dev *hdev, const int *feature_bits,
1482                            uint64_t features)
1483{
1484    const int *bit = feature_bits;
1485    while (*bit != VHOST_INVALID_FEATURE_BIT) {
1486        uint64_t bit_mask = (1ULL << *bit);
1487        if (!(hdev->features & bit_mask)) {
1488            features &= ~bit_mask;
1489        }
1490        bit++;
1491    }
1492    return features;
1493}
1494
1495void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits,
1496                        uint64_t features)
1497{
1498    const int *bit = feature_bits;
1499    while (*bit != VHOST_INVALID_FEATURE_BIT) {
1500        uint64_t bit_mask = (1ULL << *bit);
1501        if (features & bit_mask) {
1502            hdev->acked_features |= bit_mask;
1503        }
1504        bit++;
1505    }
1506}
1507
1508/* Host notifiers must be enabled at this point. */
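     /*
      * Start order matters: features are negotiated first (so the backend
      * knows whether it must log), then the memory table, then each ring,
      * and finally the log base once the required log size is known.  With a
      * vIOMMU, the iotlb callback is enabled and the used-ring translations
      * are pushed to the backend up front.
      */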
1509int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
1510{
1511    int i, r;
1512
1513    /* should only be called after backend is connected */
1514    assert(hdev->vhost_ops);
1515
1516    hdev->started = true;
1517    hdev->vdev = vdev;
1518
1519    r = vhost_dev_set_features(hdev, hdev->log_enabled);
1520    if (r < 0) {
1521        goto fail_features;
1522    }
1523
1524    if (vhost_dev_has_iommu(hdev)) {
1525        memory_listener_register(&hdev->iommu_listener, vdev->dma_as);
1526    }
1527
1528    r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem);
1529    if (r < 0) {
1530        VHOST_OPS_DEBUG("vhost_set_mem_table failed");
1531        r = -errno;
1532        goto fail_mem;
1533    }
1534    for (i = 0; i < hdev->nvqs; ++i) {
1535        r = vhost_virtqueue_start(hdev,
1536                                  vdev,
1537                                  hdev->vqs + i,
1538                                  hdev->vq_index + i);
1539        if (r < 0) {
1540            goto fail_vq;
1541        }
1542    }
1543
1544    if (hdev->log_enabled) {
1545        uint64_t log_base;
1546
1547        hdev->log_size = vhost_get_log_size(hdev);
1548        hdev->log = vhost_log_get(hdev->log_size,
1549                                  vhost_dev_log_is_shared(hdev));
1550        log_base = (uintptr_t)hdev->log->log;
1551        r = hdev->vhost_ops->vhost_set_log_base(hdev,
1552                                                hdev->log_size ? log_base : 0,
1553                                                hdev->log);
1554        if (r < 0) {
1555            VHOST_OPS_DEBUG("vhost_set_log_base failed");
1556            r = -errno;
1557            goto fail_log;
1558        }
1559    }
1560
1561    if (vhost_dev_has_iommu(hdev)) {
1562        hdev->vhost_ops->vhost_set_iotlb_callback(hdev, true);
1563
 1564        /* Update used ring information for IOTLB to work correctly;
 1565         * the vhost-kernel code requires this. */
1566        for (i = 0; i < hdev->nvqs; ++i) {
1567            struct vhost_virtqueue *vq = hdev->vqs + i;
1568            vhost_device_iotlb_miss(hdev, vq->used_phys, true);
1569        }
1570    }
1571    return 0;
1572fail_log:
1573    vhost_log_put(hdev, false);
1574fail_vq:
1575    while (--i >= 0) {
1576        vhost_virtqueue_stop(hdev,
1577                             vdev,
1578                             hdev->vqs + i,
1579                             hdev->vq_index + i);
1580    }
1581    i = hdev->nvqs;
1582
1583fail_mem:
1584fail_features:
1585
1586    hdev->started = false;
1587    return r;
1588}
1589
1590/* Host notifiers must be enabled at this point. */
1591void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
1592{
1593    int i;
1594
1595    /* should only be called after backend is connected */
1596    assert(hdev->vhost_ops);
1597
1598    for (i = 0; i < hdev->nvqs; ++i) {
1599        vhost_virtqueue_stop(hdev,
1600                             vdev,
1601                             hdev->vqs + i,
1602                             hdev->vq_index + i);
1603    }
1604
1605    if (vhost_dev_has_iommu(hdev)) {
1606        hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false);
1607        memory_listener_unregister(&hdev->iommu_listener);
1608    }
1609    vhost_log_put(hdev, true);
1610    hdev->started = false;
1611    hdev->vdev = NULL;
1612}
1613
1614int vhost_net_set_backend(struct vhost_dev *hdev,
1615                          struct vhost_vring_file *file)
1616{
1617    if (hdev->vhost_ops->vhost_net_set_backend) {
1618        return hdev->vhost_ops->vhost_net_set_backend(hdev, file);
1619    }
1620
1621    return -1;
1622}
1623