qemu/hw/virtio/vhost.c
   1/*
   2 * vhost support
   3 *
   4 * Copyright Red Hat, Inc. 2010
   5 *
   6 * Authors:
   7 *  Michael S. Tsirkin <mst@redhat.com>
   8 *
   9 * This work is licensed under the terms of the GNU GPL, version 2.  See
  10 * the COPYING file in the top-level directory.
  11 *
  12 * Contributions after 2012-01-13 are licensed under the terms of the
  13 * GNU GPL, version 2 or (at your option) any later version.
  14 */
  15
  16#include "qemu/osdep.h"
  17#include "qapi/error.h"
  18#include "hw/virtio/vhost.h"
  19#include "hw/hw.h"
  20#include "qemu/atomic.h"
  21#include "qemu/range.h"
  22#include "qemu/error-report.h"
  23#include "qemu/memfd.h"
  24#include <linux/vhost.h>
  25#include "exec/address-spaces.h"
  26#include "hw/virtio/virtio-bus.h"
  27#include "hw/virtio/virtio-access.h"
  28#include "migration/migration.h"
  29
  30static struct vhost_log *vhost_log;
  31static struct vhost_log *vhost_log_shm;
  32
  33static unsigned int used_memslots;
  34static QLIST_HEAD(, vhost_dev) vhost_devices =
  35    QLIST_HEAD_INITIALIZER(vhost_devices);
  36
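/* Return true if at least one more memory slot can be used without exceeding
 * the strictest memslot limit advertised by any registered vhost backend. */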
  37bool vhost_has_free_slot(void)
  38{
  39    unsigned int slots_limit = ~0U;
  40    struct vhost_dev *hdev;
  41
  42    QLIST_FOREACH(hdev, &vhost_devices, entry) {
  43        unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev);
  44        slots_limit = MIN(slots_limit, r);
  45    }
  46    return slots_limit > used_memslots;
  47}
  48
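/* Walk the vhost dirty log over the intersection of [mfirst, mlast] and
 * [rfirst, rlast], atomically consuming each log chunk and marking the
 * corresponding pages of @section dirty in QEMU's dirty memory bitmap. */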
  49static void vhost_dev_sync_region(struct vhost_dev *dev,
  50                                  MemoryRegionSection *section,
  51                                  uint64_t mfirst, uint64_t mlast,
  52                                  uint64_t rfirst, uint64_t rlast)
  53{
  54    vhost_log_chunk_t *log = dev->log->log;
  55
  56    uint64_t start = MAX(mfirst, rfirst);
  57    uint64_t end = MIN(mlast, rlast);
  58    vhost_log_chunk_t *from = log + start / VHOST_LOG_CHUNK;
  59    vhost_log_chunk_t *to = log + end / VHOST_LOG_CHUNK + 1;
  60    uint64_t addr = (start / VHOST_LOG_CHUNK) * VHOST_LOG_CHUNK;
  61
  62    if (end < start) {
  63        return;
  64    }
  65    assert(end / VHOST_LOG_CHUNK < dev->log_size);
  66    assert(start / VHOST_LOG_CHUNK < dev->log_size);
  67
  68    for (;from < to; ++from) {
  69        vhost_log_chunk_t log;
  70        /* We first check with non-atomic: much cheaper,
  71         * and we expect non-dirty to be the common case. */
  72        if (!*from) {
  73            addr += VHOST_LOG_CHUNK;
  74            continue;
  75        }
  76        /* Data must be read atomically. We don't really need barrier semantics
  77         * but it's easier to use atomic_* than roll our own. */
  78        log = atomic_xchg(from, 0);
  79        while (log) {
  80            int bit = ctzl(log);
  81            hwaddr page_addr;
  82            hwaddr section_offset;
  83            hwaddr mr_offset;
  84            page_addr = addr + bit * VHOST_LOG_PAGE;
  85            section_offset = page_addr - section->offset_within_address_space;
  86            mr_offset = section_offset + section->offset_within_region;
  87            memory_region_set_dirty(section->mr, mr_offset, VHOST_LOG_PAGE);
  88            log &= ~(0x1ull << bit);
  89        }
  90        addr += VHOST_LOG_CHUNK;
  91    }
  92}
  93
  94static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
  95                                   MemoryRegionSection *section,
  96                                   hwaddr first,
  97                                   hwaddr last)
  98{
  99    int i;
 100    hwaddr start_addr;
 101    hwaddr end_addr;
 102
 103    if (!dev->log_enabled || !dev->started) {
 104        return 0;
 105    }
 106    start_addr = section->offset_within_address_space;
 107    end_addr = range_get_last(start_addr, int128_get64(section->size));
 108    start_addr = MAX(first, start_addr);
 109    end_addr = MIN(last, end_addr);
 110
 111    for (i = 0; i < dev->mem->nregions; ++i) {
 112        struct vhost_memory_region *reg = dev->mem->regions + i;
 113        vhost_dev_sync_region(dev, section, start_addr, end_addr,
 114                              reg->guest_phys_addr,
 115                              range_get_last(reg->guest_phys_addr,
 116                                             reg->memory_size));
 117    }
 118    for (i = 0; i < dev->nvqs; ++i) {
 119        struct vhost_virtqueue *vq = dev->vqs + i;
 120        vhost_dev_sync_region(dev, section, start_addr, end_addr, vq->used_phys,
 121                              range_get_last(vq->used_phys, vq->used_size));
 122    }
 123    return 0;
 124}
 125
 126static void vhost_log_sync(MemoryListener *listener,
 127                          MemoryRegionSection *section)
 128{
 129    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 130                                         memory_listener);
 131    vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL);
 132}
 133
 134static void vhost_log_sync_range(struct vhost_dev *dev,
 135                                 hwaddr first, hwaddr last)
 136{
 137    int i;
 138    /* FIXME: this is N^2 in number of sections */
 139    for (i = 0; i < dev->n_mem_sections; ++i) {
 140        MemoryRegionSection *section = &dev->mem_sections[i];
 141        vhost_sync_dirty_bitmap(dev, section, first, last);
 142    }
 143}
 144
 145/* Assign/unassign. Keep an unsorted array of non-overlapping
 146 * memory regions in dev->mem. */
 147static void vhost_dev_unassign_memory(struct vhost_dev *dev,
 148                                      uint64_t start_addr,
 149                                      uint64_t size)
 150{
 151    int from, to, n = dev->mem->nregions;
 152    /* Track overlapping/split regions for sanity checking. */
 153    int overlap_start = 0, overlap_end = 0, overlap_middle = 0, split = 0;
 154
 155    for (from = 0, to = 0; from < n; ++from, ++to) {
 156        struct vhost_memory_region *reg = dev->mem->regions + to;
 157        uint64_t reglast;
 158        uint64_t memlast;
 159        uint64_t change;
 160
 161        /* clone old region */
 162        if (to != from) {
 163            memcpy(reg, dev->mem->regions + from, sizeof *reg);
 164        }
 165
 166        /* No overlap is simple */
 167        if (!ranges_overlap(reg->guest_phys_addr, reg->memory_size,
 168                            start_addr, size)) {
 169            continue;
 170        }
 171
 172        /* Split only happens if supplied region
 173         * is in the middle of an existing one. Thus it can not
 174         * overlap with any other existing region. */
 175        assert(!split);
 176
 177        reglast = range_get_last(reg->guest_phys_addr, reg->memory_size);
 178        memlast = range_get_last(start_addr, size);
 179
 180        /* Remove whole region */
 181        if (start_addr <= reg->guest_phys_addr && memlast >= reglast) {
 182            --dev->mem->nregions;
 183            --to;
 184            ++overlap_middle;
 185            continue;
 186        }
 187
 188        /* Shrink region */
 189        if (memlast >= reglast) {
 190            reg->memory_size = start_addr - reg->guest_phys_addr;
 191            assert(reg->memory_size);
 192            assert(!overlap_end);
 193            ++overlap_end;
 194            continue;
 195        }
 196
 197        /* Shift region */
 198        if (start_addr <= reg->guest_phys_addr) {
 199            change = memlast + 1 - reg->guest_phys_addr;
 200            reg->memory_size -= change;
 201            reg->guest_phys_addr += change;
 202            reg->userspace_addr += change;
 203            assert(reg->memory_size);
 204            assert(!overlap_start);
 205            ++overlap_start;
 206            continue;
 207        }
 208
 209        /* This only happens if supplied region
 210         * is in the middle of an existing one. Thus it can not
 211         * overlap with any other existing region. */
 212        assert(!overlap_start);
 213        assert(!overlap_end);
 214        assert(!overlap_middle);
 215        /* Split region: shrink first part, shift second part. */
 216        memcpy(dev->mem->regions + n, reg, sizeof *reg);
 217        reg->memory_size = start_addr - reg->guest_phys_addr;
 218        assert(reg->memory_size);
 219        change = memlast + 1 - reg->guest_phys_addr;
 220        reg = dev->mem->regions + n;
 221        reg->memory_size -= change;
 222        assert(reg->memory_size);
 223        reg->guest_phys_addr += change;
 224        reg->userspace_addr += change;
 225        /* Never add more than 1 region */
 226        assert(dev->mem->nregions == n);
 227        ++dev->mem->nregions;
 228        ++split;
 229    }
 230}
 231
 232/* Called after unassign, so no regions overlap the given range. */
 233static void vhost_dev_assign_memory(struct vhost_dev *dev,
 234                                    uint64_t start_addr,
 235                                    uint64_t size,
 236                                    uint64_t uaddr)
 237{
 238    int from, to;
 239    struct vhost_memory_region *merged = NULL;
 240    for (from = 0, to = 0; from < dev->mem->nregions; ++from, ++to) {
 241        struct vhost_memory_region *reg = dev->mem->regions + to;
 242        uint64_t prlast, urlast;
 243        uint64_t pmlast, umlast;
 244        uint64_t s, e, u;
 245
 246        /* clone old region */
 247        if (to != from) {
 248            memcpy(reg, dev->mem->regions + from, sizeof *reg);
 249        }
 250        prlast = range_get_last(reg->guest_phys_addr, reg->memory_size);
 251        pmlast = range_get_last(start_addr, size);
 252        urlast = range_get_last(reg->userspace_addr, reg->memory_size);
 253        umlast = range_get_last(uaddr, size);
 254
 255        /* check for overlapping regions: should never happen. */
 256        assert(prlast < start_addr || pmlast < reg->guest_phys_addr);
 257        /* Not an adjacent or overlapping region - do not merge. */
 258        if ((prlast + 1 != start_addr || urlast + 1 != uaddr) &&
 259            (pmlast + 1 != reg->guest_phys_addr ||
 260             umlast + 1 != reg->userspace_addr)) {
 261            continue;
 262        }
 263
 264        if (dev->vhost_ops->vhost_backend_can_merge &&
 265            !dev->vhost_ops->vhost_backend_can_merge(dev, uaddr, size,
 266                                                     reg->userspace_addr,
 267                                                     reg->memory_size)) {
 268            continue;
 269        }
 270
 271        if (merged) {
 272            --to;
 273            assert(to >= 0);
 274        } else {
 275            merged = reg;
 276        }
 277        u = MIN(uaddr, reg->userspace_addr);
 278        s = MIN(start_addr, reg->guest_phys_addr);
 279        e = MAX(pmlast, prlast);
 280        uaddr = merged->userspace_addr = u;
 281        start_addr = merged->guest_phys_addr = s;
 282        size = merged->memory_size = e - s + 1;
 283        assert(merged->memory_size);
 284    }
 285
 286    if (!merged) {
 287        struct vhost_memory_region *reg = dev->mem->regions + to;
 288        memset(reg, 0, sizeof *reg);
 289        reg->memory_size = size;
 290        assert(reg->memory_size);
 291        reg->guest_phys_addr = start_addr;
 292        reg->userspace_addr = uaddr;
 293        ++to;
 294    }
 295    assert(to <= dev->mem->nregions + 1);
 296    dev->mem->nregions = to;
 297}
 298
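/* Compute how many VHOST_LOG_CHUNK-sized log entries are needed to cover the
 * highest guest physical address used by any memory region or used ring. */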
 299static uint64_t vhost_get_log_size(struct vhost_dev *dev)
 300{
 301    uint64_t log_size = 0;
 302    int i;
 303    for (i = 0; i < dev->mem->nregions; ++i) {
 304        struct vhost_memory_region *reg = dev->mem->regions + i;
 305        uint64_t last = range_get_last(reg->guest_phys_addr,
 306                                       reg->memory_size);
 307        log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
 308    }
 309    for (i = 0; i < dev->nvqs; ++i) {
 310        struct vhost_virtqueue *vq = dev->vqs + i;
 311        uint64_t last = vq->used_phys + vq->used_size - 1;
 312        log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
 313    }
 314    return log_size;
 315}
 316
 317static struct vhost_log *vhost_log_alloc(uint64_t size, bool share)
 318{
 319    struct vhost_log *log;
 320    uint64_t logsize = size * sizeof(*(log->log));
 321    int fd = -1;
 322
 323    log = g_new0(struct vhost_log, 1);
 324    if (share) {
 325        log->log = qemu_memfd_alloc("vhost-log", logsize,
 326                                    F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
 327                                    &fd);
 328        memset(log->log, 0, logsize);
 329    } else {
 330        log->log = g_malloc0(logsize);
 331    }
 332
 333    log->size = size;
 334    log->refcnt = 1;
 335    log->fd = fd;
 336
 337    return log;
 338}
 339
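/* Return the cached global log (memfd-backed when shared, plain allocation
 * otherwise), allocating a new one if none exists or the requested size
 * differs; otherwise just take another reference. */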
 340static struct vhost_log *vhost_log_get(uint64_t size, bool share)
 341{
 342    struct vhost_log *log = share ? vhost_log_shm : vhost_log;
 343
 344    if (!log || log->size != size) {
 345        log = vhost_log_alloc(size, share);
 346        if (share) {
 347            vhost_log_shm = log;
 348        } else {
 349            vhost_log = log;
 350        }
 351    } else {
 352        ++log->refcnt;
 353    }
 354
 355    return log;
 356}
 357
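/* Drop the device's reference to its log; on the last reference, optionally
 * sync the range the old log covered and release the backing memory. */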
 358static void vhost_log_put(struct vhost_dev *dev, bool sync)
 359{
 360    struct vhost_log *log = dev->log;
 361
 362    if (!log) {
 363        return;
 364    }
 365
 366    --log->refcnt;
 367    if (log->refcnt == 0) {
 368        /* Sync only the range covered by the old log */
 369        if (dev->log_size && sync) {
 370            vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1);
 371        }
 372
 373        if (vhost_log == log) {
 374            g_free(log->log);
 375            vhost_log = NULL;
 376        } else if (vhost_log_shm == log) {
 377            qemu_memfd_free(log->log, log->size * sizeof(*(log->log)),
 378                            log->fd);
 379            vhost_log_shm = NULL;
 380        }
 381
 382        g_free(log);
 383    }
 384}
 385
 386static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
 387{
 388    return dev->vhost_ops->vhost_requires_shm_log &&
 389           dev->vhost_ops->vhost_requires_shm_log(dev);
 390}
 391
 392static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
 393{
 394    struct vhost_log *log = vhost_log_get(size, vhost_dev_log_is_shared(dev));
 395    uint64_t log_base = (uintptr_t)log->log;
 396    int r;
 397
  398    /* Inform the backend of the log switch; this must be done before
  399       releasing the current log, to ensure no logging is lost. */
 400    r = dev->vhost_ops->vhost_set_log_base(dev, log_base, log);
 401    assert(r >= 0);
 402    vhost_log_put(dev, true);
 403    dev->log = log;
 404    dev->log_size = size;
 405}
 406
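/* After a memory layout change, verify that every virtqueue ring overlapping
 * the changed range is still mapped at the same host address and length. */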
 407static int vhost_verify_ring_mappings(struct vhost_dev *dev,
 408                                      uint64_t start_addr,
 409                                      uint64_t size)
 410{
 411    int i;
 412    int r = 0;
 413
 414    for (i = 0; !r && i < dev->nvqs; ++i) {
 415        struct vhost_virtqueue *vq = dev->vqs + i;
 416        hwaddr l;
 417        void *p;
 418
 419        if (!ranges_overlap(start_addr, size, vq->ring_phys, vq->ring_size)) {
 420            continue;
 421        }
 422        l = vq->ring_size;
 423        p = cpu_physical_memory_map(vq->ring_phys, &l, 1);
 424        if (!p || l != vq->ring_size) {
 425            fprintf(stderr, "Unable to map ring buffer for ring %d\n", i);
 426            r = -ENOMEM;
 427        }
 428        if (p != vq->ring) {
 429            fprintf(stderr, "Ring buffer relocated for ring %d\n", i);
 430            r = -EBUSY;
 431        }
 432        cpu_physical_memory_unmap(p, l, 0, 0);
 433    }
 434    return r;
 435}
 436
 437static struct vhost_memory_region *vhost_dev_find_reg(struct vhost_dev *dev,
 438                                                      uint64_t start_addr,
 439                                                      uint64_t size)
 440{
 441    int i, n = dev->mem->nregions;
 442    for (i = 0; i < n; ++i) {
 443        struct vhost_memory_region *reg = dev->mem->regions + i;
 444        if (ranges_overlap(reg->guest_phys_addr, reg->memory_size,
 445                           start_addr, size)) {
 446            return reg;
 447        }
 448    }
 449    return NULL;
 450}
 451
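/* Return true if the vhost memory table must be updated for this range:
 * no existing region covers it, the covering region would need to grow,
 * or its userspace address no longer matches. */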
 452static bool vhost_dev_cmp_memory(struct vhost_dev *dev,
 453                                 uint64_t start_addr,
 454                                 uint64_t size,
 455                                 uint64_t uaddr)
 456{
 457    struct vhost_memory_region *reg = vhost_dev_find_reg(dev, start_addr, size);
 458    uint64_t reglast;
 459    uint64_t memlast;
 460
 461    if (!reg) {
 462        return true;
 463    }
 464
 465    reglast = range_get_last(reg->guest_phys_addr, reg->memory_size);
 466    memlast = range_get_last(start_addr, size);
 467
 468    /* Need to extend region? */
 469    if (start_addr < reg->guest_phys_addr || memlast > reglast) {
 470        return true;
 471    }
 472    /* userspace_addr changed? */
 473    return uaddr != reg->userspace_addr + start_addr - reg->guest_phys_addr;
 474}
 475
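/* Add or remove a memory section in dev->mem, merging adjacent regions on
 * add, and record the changed guest-physical range so vhost_commit() can
 * push the new table to the backend. */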
 476static void vhost_set_memory(MemoryListener *listener,
 477                             MemoryRegionSection *section,
 478                             bool add)
 479{
 480    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 481                                         memory_listener);
 482    hwaddr start_addr = section->offset_within_address_space;
 483    ram_addr_t size = int128_get64(section->size);
 484    bool log_dirty =
 485        memory_region_get_dirty_log_mask(section->mr) & ~(1 << DIRTY_MEMORY_MIGRATION);
 486    int s = offsetof(struct vhost_memory, regions) +
 487        (dev->mem->nregions + 1) * sizeof dev->mem->regions[0];
 488    void *ram;
 489
 490    dev->mem = g_realloc(dev->mem, s);
 491
 492    if (log_dirty) {
 493        add = false;
 494    }
 495
 496    assert(size);
 497
 498    /* Optimize no-change case. At least cirrus_vga does this a lot at this time. */
 499    ram = memory_region_get_ram_ptr(section->mr) + section->offset_within_region;
 500    if (add) {
 501        if (!vhost_dev_cmp_memory(dev, start_addr, size, (uintptr_t)ram)) {
 502            /* Region exists with same address. Nothing to do. */
 503            return;
 504        }
 505    } else {
 506        if (!vhost_dev_find_reg(dev, start_addr, size)) {
 507            /* Removing region that we don't access. Nothing to do. */
 508            return;
 509        }
 510    }
 511
 512    vhost_dev_unassign_memory(dev, start_addr, size);
 513    if (add) {
 514        /* Add given mapping, merging adjacent regions if any */
 515        vhost_dev_assign_memory(dev, start_addr, size, (uintptr_t)ram);
 516    } else {
 517        /* Remove old mapping for this memory, if any. */
 518        vhost_dev_unassign_memory(dev, start_addr, size);
 519    }
 520    dev->mem_changed_start_addr = MIN(dev->mem_changed_start_addr, start_addr);
 521    dev->mem_changed_end_addr = MAX(dev->mem_changed_end_addr, start_addr + size - 1);
 522    dev->memory_changed = true;
 523    used_memslots = dev->mem->nregions;
 524}
 525
 526static bool vhost_section(MemoryRegionSection *section)
 527{
 528    return memory_region_is_ram(section->mr);
 529}
 530
 531static void vhost_begin(MemoryListener *listener)
 532{
 533    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 534                                         memory_listener);
 535    dev->mem_changed_end_addr = 0;
 536    dev->mem_changed_start_addr = -1;
 537}
 538
 539static void vhost_commit(MemoryListener *listener)
 540{
 541    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 542                                         memory_listener);
 543    hwaddr start_addr = 0;
 544    ram_addr_t size = 0;
 545    uint64_t log_size;
 546    int r;
 547
 548    if (!dev->memory_changed) {
 549        return;
 550    }
 551    if (!dev->started) {
 552        return;
 553    }
 554    if (dev->mem_changed_start_addr > dev->mem_changed_end_addr) {
 555        return;
 556    }
 557
 558    if (dev->started) {
 559        start_addr = dev->mem_changed_start_addr;
 560        size = dev->mem_changed_end_addr - dev->mem_changed_start_addr + 1;
 561
 562        r = vhost_verify_ring_mappings(dev, start_addr, size);
 563        assert(r >= 0);
 564    }
 565
 566    if (!dev->log_enabled) {
 567        r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
 568        assert(r >= 0);
 569        dev->memory_changed = false;
 570        return;
 571    }
 572    log_size = vhost_get_log_size(dev);
  573    /* We allocate an extra 4K bytes to the log,
  574     * to reduce the number of reallocations. */
 575#define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log)
 576    /* To log more, must increase log size before table update. */
 577    if (dev->log_size < log_size) {
 578        vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER);
 579    }
 580    r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
 581    assert(r >= 0);
 582    /* To log less, can only decrease log size after table update. */
 583    if (dev->log_size > log_size + VHOST_LOG_BUFFER) {
 584        vhost_dev_log_resize(dev, log_size);
 585    }
 586    dev->memory_changed = false;
 587}
 588
 589static void vhost_region_add(MemoryListener *listener,
 590                             MemoryRegionSection *section)
 591{
 592    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 593                                         memory_listener);
 594
 595    if (!vhost_section(section)) {
 596        return;
 597    }
 598
 599    ++dev->n_mem_sections;
 600    dev->mem_sections = g_renew(MemoryRegionSection, dev->mem_sections,
 601                                dev->n_mem_sections);
 602    dev->mem_sections[dev->n_mem_sections - 1] = *section;
 603    memory_region_ref(section->mr);
 604    vhost_set_memory(listener, section, true);
 605}
 606
 607static void vhost_region_del(MemoryListener *listener,
 608                             MemoryRegionSection *section)
 609{
 610    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 611                                         memory_listener);
 612    int i;
 613
 614    if (!vhost_section(section)) {
 615        return;
 616    }
 617
 618    vhost_set_memory(listener, section, false);
 619    memory_region_unref(section->mr);
 620    for (i = 0; i < dev->n_mem_sections; ++i) {
 621        if (dev->mem_sections[i].offset_within_address_space
 622            == section->offset_within_address_space) {
 623            --dev->n_mem_sections;
 624            memmove(&dev->mem_sections[i], &dev->mem_sections[i+1],
 625                    (dev->n_mem_sections - i) * sizeof(*dev->mem_sections));
 626            break;
 627        }
 628    }
 629}
 630
 631static void vhost_region_nop(MemoryListener *listener,
 632                             MemoryRegionSection *section)
 633{
 634}
 635
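/* Program the descriptor, avail and used ring addresses of @vq into the
 * backend, optionally asking it to log writes to the used ring. */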
 636static int vhost_virtqueue_set_addr(struct vhost_dev *dev,
 637                                    struct vhost_virtqueue *vq,
 638                                    unsigned idx, bool enable_log)
 639{
 640    struct vhost_vring_addr addr = {
 641        .index = idx,
 642        .desc_user_addr = (uint64_t)(unsigned long)vq->desc,
 643        .avail_user_addr = (uint64_t)(unsigned long)vq->avail,
 644        .used_user_addr = (uint64_t)(unsigned long)vq->used,
 645        .log_guest_addr = vq->used_phys,
 646        .flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0,
 647    };
 648    int r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);
 649    if (r < 0) {
 650        return -errno;
 651    }
 652    return 0;
 653}
 654
 655static int vhost_dev_set_features(struct vhost_dev *dev, bool enable_log)
 656{
 657    uint64_t features = dev->acked_features;
 658    int r;
 659    if (enable_log) {
 660        features |= 0x1ULL << VHOST_F_LOG_ALL;
 661    }
 662    r = dev->vhost_ops->vhost_set_features(dev, features);
 663    return r < 0 ? -errno : 0;
 664}
 665
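/* Switch dirty logging on or off for the whole device: update the feature
 * bits, then reprogram every vring address; roll back on failure. */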
 666static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
 667{
 668    int r, t, i, idx;
 669    r = vhost_dev_set_features(dev, enable_log);
 670    if (r < 0) {
 671        goto err_features;
 672    }
 673    for (i = 0; i < dev->nvqs; ++i) {
 674        idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
 675        r = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
 676                                     enable_log);
 677        if (r < 0) {
 678            goto err_vq;
 679        }
 680    }
 681    return 0;
 682err_vq:
 683    for (; i >= 0; --i) {
 684        idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
 685        t = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
 686                                     dev->log_enabled);
 687        assert(t >= 0);
 688    }
 689    t = vhost_dev_set_features(dev, dev->log_enabled);
 690    assert(t >= 0);
 691err_features:
 692    return r;
 693}
 694
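/* Enable or disable dirty logging for migration; if the device is running,
 * allocate or release the log and reprogram the backend accordingly. */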
 695static int vhost_migration_log(MemoryListener *listener, int enable)
 696{
 697    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 698                                         memory_listener);
 699    int r;
 700    if (!!enable == dev->log_enabled) {
 701        return 0;
 702    }
 703    if (!dev->started) {
 704        dev->log_enabled = enable;
 705        return 0;
 706    }
 707    if (!enable) {
 708        r = vhost_dev_set_log(dev, false);
 709        if (r < 0) {
 710            return r;
 711        }
 712        vhost_log_put(dev, false);
 713        dev->log = NULL;
 714        dev->log_size = 0;
 715    } else {
 716        vhost_dev_log_resize(dev, vhost_get_log_size(dev));
 717        r = vhost_dev_set_log(dev, true);
 718        if (r < 0) {
 719            return r;
 720        }
 721    }
 722    dev->log_enabled = enable;
 723    return 0;
 724}
 725
 726static void vhost_log_global_start(MemoryListener *listener)
 727{
 728    int r;
 729
 730    r = vhost_migration_log(listener, true);
 731    if (r < 0) {
 732        abort();
 733    }
 734}
 735
 736static void vhost_log_global_stop(MemoryListener *listener)
 737{
 738    int r;
 739
 740    r = vhost_migration_log(listener, false);
 741    if (r < 0) {
 742        abort();
 743    }
 744}
 745
 746static void vhost_log_start(MemoryListener *listener,
 747                            MemoryRegionSection *section,
 748                            int old, int new)
 749{
 750    /* FIXME: implement */
 751}
 752
 753static void vhost_log_stop(MemoryListener *listener,
 754                           MemoryRegionSection *section,
 755                           int old, int new)
 756{
 757    /* FIXME: implement */
 758}
 759
 760/* The vhost driver natively knows how to handle the vrings of non
 761 * cross-endian legacy devices and modern devices. Only legacy devices
 762 * exposed to a bi-endian guest may require the vhost driver to use a
 763 * specific endianness.
 764 */
 765static inline bool vhost_needs_vring_endian(VirtIODevice *vdev)
 766{
 767    if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
 768        return false;
 769    }
 770#ifdef TARGET_IS_BIENDIAN
 771#ifdef HOST_WORDS_BIGENDIAN
 772    return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_LITTLE;
 773#else
 774    return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_BIG;
 775#endif
 776#else
 777    return false;
 778#endif
 779}
 780
 781static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev,
 782                                                   bool is_big_endian,
 783                                                   int vhost_vq_index)
 784{
 785    struct vhost_vring_state s = {
 786        .index = vhost_vq_index,
 787        .num = is_big_endian
 788    };
 789
 790    if (!dev->vhost_ops->vhost_set_vring_endian(dev, &s)) {
 791        return 0;
 792    }
 793
 794    if (errno == ENOTTY) {
 795        error_report("vhost does not support cross-endian");
 796        return -ENOSYS;
 797    }
 798
 799    return -errno;
 800}
 801
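/* Bring up one virtqueue: program its size and base index, map its rings
 * into QEMU's address space, hand the ring addresses to the backend and
 * wire up the kick eventfd. */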
 802static int vhost_virtqueue_start(struct vhost_dev *dev,
 803                                struct VirtIODevice *vdev,
 804                                struct vhost_virtqueue *vq,
 805                                unsigned idx)
 806{
 807    hwaddr s, l, a;
 808    int r;
 809    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
 810    struct vhost_vring_file file = {
 811        .index = vhost_vq_index
 812    };
 813    struct vhost_vring_state state = {
 814        .index = vhost_vq_index
 815    };
 816    struct VirtQueue *vvq = virtio_get_queue(vdev, idx);
 817
 818
 819    vq->num = state.num = virtio_queue_get_num(vdev, idx);
 820    r = dev->vhost_ops->vhost_set_vring_num(dev, &state);
 821    if (r) {
 822        return -errno;
 823    }
 824
 825    state.num = virtio_queue_get_last_avail_idx(vdev, idx);
 826    r = dev->vhost_ops->vhost_set_vring_base(dev, &state);
 827    if (r) {
 828        return -errno;
 829    }
 830
 831    if (vhost_needs_vring_endian(vdev)) {
 832        r = vhost_virtqueue_set_vring_endian_legacy(dev,
 833                                                    virtio_is_big_endian(vdev),
 834                                                    vhost_vq_index);
 835        if (r) {
 836            return -errno;
 837        }
 838    }
 839
 840    s = l = virtio_queue_get_desc_size(vdev, idx);
 841    a = virtio_queue_get_desc_addr(vdev, idx);
 842    vq->desc = cpu_physical_memory_map(a, &l, 0);
 843    if (!vq->desc || l != s) {
 844        r = -ENOMEM;
 845        goto fail_alloc_desc;
 846    }
 847    s = l = virtio_queue_get_avail_size(vdev, idx);
 848    a = virtio_queue_get_avail_addr(vdev, idx);
 849    vq->avail = cpu_physical_memory_map(a, &l, 0);
 850    if (!vq->avail || l != s) {
 851        r = -ENOMEM;
 852        goto fail_alloc_avail;
 853    }
 854    vq->used_size = s = l = virtio_queue_get_used_size(vdev, idx);
 855    vq->used_phys = a = virtio_queue_get_used_addr(vdev, idx);
 856    vq->used = cpu_physical_memory_map(a, &l, 1);
 857    if (!vq->used || l != s) {
 858        r = -ENOMEM;
 859        goto fail_alloc_used;
 860    }
 861
 862    vq->ring_size = s = l = virtio_queue_get_ring_size(vdev, idx);
 863    vq->ring_phys = a = virtio_queue_get_ring_addr(vdev, idx);
 864    vq->ring = cpu_physical_memory_map(a, &l, 1);
 865    if (!vq->ring || l != s) {
 866        r = -ENOMEM;
 867        goto fail_alloc_ring;
 868    }
 869
 870    r = vhost_virtqueue_set_addr(dev, vq, vhost_vq_index, dev->log_enabled);
 871    if (r < 0) {
 872        r = -errno;
 873        goto fail_alloc;
 874    }
 875
 876    file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq));
 877    r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
 878    if (r) {
 879        r = -errno;
 880        goto fail_kick;
 881    }
 882
 883    /* Clear and discard previous events if any. */
 884    event_notifier_test_and_clear(&vq->masked_notifier);
 885
 886    /* Init vring in unmasked state, unless guest_notifier_mask
 887     * will do it later.
 888     */
 889    if (!vdev->use_guest_notifier_mask) {
 890        /* TODO: check and handle errors. */
 891        vhost_virtqueue_mask(dev, vdev, idx, false);
 892    }
 893
 894    return 0;
 895
 896fail_kick:
 897fail_alloc:
 898    cpu_physical_memory_unmap(vq->ring, virtio_queue_get_ring_size(vdev, idx),
 899                              0, 0);
 900fail_alloc_ring:
 901    cpu_physical_memory_unmap(vq->used, virtio_queue_get_used_size(vdev, idx),
 902                              0, 0);
 903fail_alloc_used:
 904    cpu_physical_memory_unmap(vq->avail, virtio_queue_get_avail_size(vdev, idx),
 905                              0, 0);
 906fail_alloc_avail:
 907    cpu_physical_memory_unmap(vq->desc, virtio_queue_get_desc_size(vdev, idx),
 908                              0, 0);
 909fail_alloc_desc:
 910    return r;
 911}
 912
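/* Tear down one virtqueue: read the vring base (last avail index) back from
 * the backend, restore native vring endianness for legacy cross-endian
 * setups and unmap the rings. */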
 913static void vhost_virtqueue_stop(struct vhost_dev *dev,
 914                                    struct VirtIODevice *vdev,
 915                                    struct vhost_virtqueue *vq,
 916                                    unsigned idx)
 917{
 918    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
 919    struct vhost_vring_state state = {
 920        .index = vhost_vq_index,
 921    };
 922    int r;
 923
 924    r = dev->vhost_ops->vhost_get_vring_base(dev, &state);
 925    if (r < 0) {
 926        fprintf(stderr, "vhost VQ %d ring restore failed: %d\n", idx, r);
 927        fflush(stderr);
 928    }
 929    virtio_queue_set_last_avail_idx(vdev, idx, state.num);
 930    virtio_queue_invalidate_signalled_used(vdev, idx);
 931
  932    /* In the cross-endian case, we need to reset the vring endianness back
  933     * to native, which is what legacy devices expect by default.
  934     */
 935    if (vhost_needs_vring_endian(vdev)) {
 936        r = vhost_virtqueue_set_vring_endian_legacy(dev,
 937                                                    !virtio_is_big_endian(vdev),
 938                                                    vhost_vq_index);
 939        if (r < 0) {
 940            error_report("failed to reset vring endianness");
 941        }
 942    }
 943
 944    assert (r >= 0);
 945    cpu_physical_memory_unmap(vq->ring, virtio_queue_get_ring_size(vdev, idx),
 946                              0, virtio_queue_get_ring_size(vdev, idx));
 947    cpu_physical_memory_unmap(vq->used, virtio_queue_get_used_size(vdev, idx),
 948                              1, virtio_queue_get_used_size(vdev, idx));
 949    cpu_physical_memory_unmap(vq->avail, virtio_queue_get_avail_size(vdev, idx),
 950                              0, virtio_queue_get_avail_size(vdev, idx));
 951    cpu_physical_memory_unmap(vq->desc, virtio_queue_get_desc_size(vdev, idx),
 952                              0, virtio_queue_get_desc_size(vdev, idx));
 953}
 954
 955static void vhost_eventfd_add(MemoryListener *listener,
 956                              MemoryRegionSection *section,
 957                              bool match_data, uint64_t data, EventNotifier *e)
 958{
 959}
 960
 961static void vhost_eventfd_del(MemoryListener *listener,
 962                              MemoryRegionSection *section,
 963                              bool match_data, uint64_t data, EventNotifier *e)
 964{
 965}
 966
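/* Initialize the masked notifier for a virtqueue and register it with the
 * backend as the initial call eventfd. */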
 967static int vhost_virtqueue_init(struct vhost_dev *dev,
 968                                struct vhost_virtqueue *vq, int n)
 969{
 970    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
 971    struct vhost_vring_file file = {
 972        .index = vhost_vq_index,
 973    };
 974    int r = event_notifier_init(&vq->masked_notifier, 0);
 975    if (r < 0) {
 976        return r;
 977    }
 978
 979    file.fd = event_notifier_get_fd(&vq->masked_notifier);
 980    r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
 981    if (r) {
 982        r = -errno;
 983        goto fail_call;
 984    }
 985    return 0;
 986fail_call:
 987    event_notifier_cleanup(&vq->masked_notifier);
 988    return r;
 989}
 990
 991static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq)
 992{
 993    event_notifier_cleanup(&vq->masked_notifier);
 994}
 995
 996int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
 997                   VhostBackendType backend_type)
 998{
 999    uint64_t features;
1000    int i, r;
1001
1002    hdev->migration_blocker = NULL;
1003
1004    if (vhost_set_backend_type(hdev, backend_type) < 0) {
1005        close((uintptr_t)opaque);
1006        return -1;
1007    }
1008
1009    if (hdev->vhost_ops->vhost_backend_init(hdev, opaque) < 0) {
1010        close((uintptr_t)opaque);
1011        return -errno;
1012    }
1013
1014    if (used_memslots > hdev->vhost_ops->vhost_backend_memslots_limit(hdev)) {
1015        fprintf(stderr, "vhost backend memory slots limit is less"
1016                " than current number of present memory slots\n");
1017        close((uintptr_t)opaque);
1018        return -1;
1019    }
1020    QLIST_INSERT_HEAD(&vhost_devices, hdev, entry);
1021
1022    r = hdev->vhost_ops->vhost_set_owner(hdev);
1023    if (r < 0) {
1024        goto fail;
1025    }
1026
1027    r = hdev->vhost_ops->vhost_get_features(hdev, &features);
1028    if (r < 0) {
1029        goto fail;
1030    }
1031
1032    for (i = 0; i < hdev->nvqs; ++i) {
1033        r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i);
1034        if (r < 0) {
1035            goto fail_vq;
1036        }
1037    }
1038    hdev->features = features;
1039
1040    hdev->memory_listener = (MemoryListener) {
1041        .begin = vhost_begin,
1042        .commit = vhost_commit,
1043        .region_add = vhost_region_add,
1044        .region_del = vhost_region_del,
1045        .region_nop = vhost_region_nop,
1046        .log_start = vhost_log_start,
1047        .log_stop = vhost_log_stop,
1048        .log_sync = vhost_log_sync,
1049        .log_global_start = vhost_log_global_start,
1050        .log_global_stop = vhost_log_global_stop,
1051        .eventfd_add = vhost_eventfd_add,
1052        .eventfd_del = vhost_eventfd_del,
1053        .priority = 10
1054    };
1055
1056    if (hdev->migration_blocker == NULL) {
1057        if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) {
1058            error_setg(&hdev->migration_blocker,
1059                       "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature.");
1060        } else if (!qemu_memfd_check()) {
1061            error_setg(&hdev->migration_blocker,
1062                       "Migration disabled: failed to allocate shared memory");
1063        }
1064    }
1065
1066    if (hdev->migration_blocker != NULL) {
1067        migrate_add_blocker(hdev->migration_blocker);
1068    }
1069
1070    hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions));
1071    hdev->n_mem_sections = 0;
1072    hdev->mem_sections = NULL;
1073    hdev->log = NULL;
1074    hdev->log_size = 0;
1075    hdev->log_enabled = false;
1076    hdev->started = false;
1077    hdev->memory_changed = false;
1078    memory_listener_register(&hdev->memory_listener, &address_space_memory);
1079    return 0;
1080fail_vq:
1081    while (--i >= 0) {
1082        vhost_virtqueue_cleanup(hdev->vqs + i);
1083    }
1084fail:
1085    r = -errno;
1086    hdev->vhost_ops->vhost_backend_cleanup(hdev);
1087    QLIST_REMOVE(hdev, entry);
1088    return r;
1089}
1090
1091void vhost_dev_cleanup(struct vhost_dev *hdev)
1092{
1093    int i;
1094    for (i = 0; i < hdev->nvqs; ++i) {
1095        vhost_virtqueue_cleanup(hdev->vqs + i);
1096    }
1097    memory_listener_unregister(&hdev->memory_listener);
1098    if (hdev->migration_blocker) {
1099        migrate_del_blocker(hdev->migration_blocker);
1100        error_free(hdev->migration_blocker);
1101    }
1102    g_free(hdev->mem);
1103    g_free(hdev->mem_sections);
1104    hdev->vhost_ops->vhost_backend_cleanup(hdev);
1105    QLIST_REMOVE(hdev, entry);
1106}
1107
1108/* Stop processing guest IO notifications in qemu.
1109 * Start processing them in vhost in kernel.
1110 */
1111int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
1112{
1113    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
1114    VirtioBusState *vbus = VIRTIO_BUS(qbus);
1115    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
1116    int i, r, e;
1117    if (!k->set_host_notifier) {
1118        fprintf(stderr, "binding does not support host notifiers\n");
1119        r = -ENOSYS;
1120        goto fail;
1121    }
1122
1123    for (i = 0; i < hdev->nvqs; ++i) {
1124        r = k->set_host_notifier(qbus->parent, hdev->vq_index + i, true);
1125        if (r < 0) {
1126            fprintf(stderr, "vhost VQ %d notifier binding failed: %d\n", i, -r);
1127            goto fail_vq;
1128        }
1129    }
1130
1131    return 0;
1132fail_vq:
1133    while (--i >= 0) {
1134        e = k->set_host_notifier(qbus->parent, hdev->vq_index + i, false);
1135        if (e < 0) {
 1136            fprintf(stderr, "vhost VQ %d notifier cleanup error: %d\n", i, -e);
1137            fflush(stderr);
1138        }
1139        assert (e >= 0);
1140    }
1141fail:
1142    return r;
1143}
1144
1145/* Stop processing guest IO notifications in vhost.
1146 * Start processing them in qemu.
1147 * This might actually run the qemu handlers right away,
1148 * so virtio in qemu must be completely setup when this is called.
1149 */
1150void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
1151{
1152    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
1153    VirtioBusState *vbus = VIRTIO_BUS(qbus);
1154    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
1155    int i, r;
1156
1157    for (i = 0; i < hdev->nvqs; ++i) {
1158        r = k->set_host_notifier(qbus->parent, hdev->vq_index + i, false);
1159        if (r < 0) {
1160            fprintf(stderr, "vhost VQ %d notifier cleanup failed: %d\n", i, -r);
1161            fflush(stderr);
1162        }
1163        assert (r >= 0);
1164    }
1165}
1166
1167/* Test and clear event pending status.
1168 * Should be called after unmask to avoid losing events.
1169 */
1170bool vhost_virtqueue_pending(struct vhost_dev *hdev, int n)
1171{
1172    struct vhost_virtqueue *vq = hdev->vqs + n - hdev->vq_index;
1173    assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs);
1174    return event_notifier_test_and_clear(&vq->masked_notifier);
1175}
1176
1177/* Mask/unmask events from this vq. */
1178void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n,
1179                         bool mask)
1180{
1181    struct VirtQueue *vvq = virtio_get_queue(vdev, n);
1182    int r, index = n - hdev->vq_index;
1183    struct vhost_vring_file file;
1184
1185    if (mask) {
1186        assert(vdev->use_guest_notifier_mask);
1187        file.fd = event_notifier_get_fd(&hdev->vqs[index].masked_notifier);
1188    } else {
1189        file.fd = event_notifier_get_fd(virtio_queue_get_guest_notifier(vvq));
1190    }
1191
1192    file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n);
1193    r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file);
1194    assert(r >= 0);
1195}
1196
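/* Clear any bits in @features, among those listed in @feature_bits, that the
 * backend did not advertise as supported. */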
1197uint64_t vhost_get_features(struct vhost_dev *hdev, const int *feature_bits,
1198                            uint64_t features)
1199{
1200    const int *bit = feature_bits;
1201    while (*bit != VHOST_INVALID_FEATURE_BIT) {
1202        uint64_t bit_mask = (1ULL << *bit);
1203        if (!(hdev->features & bit_mask)) {
1204            features &= ~bit_mask;
1205        }
1206        bit++;
1207    }
1208    return features;
1209}
1210
1211void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits,
1212                        uint64_t features)
1213{
1214    const int *bit = feature_bits;
1215    while (*bit != VHOST_INVALID_FEATURE_BIT) {
1216        uint64_t bit_mask = (1ULL << *bit);
1217        if (features & bit_mask) {
1218            hdev->acked_features |= bit_mask;
1219        }
1220        bit++;
1221    }
1222}
1223
1224/* Host notifiers must be enabled at this point. */
1225int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
1226{
1227    int i, r;
1228
1229    hdev->started = true;
1230
1231    r = vhost_dev_set_features(hdev, hdev->log_enabled);
1232    if (r < 0) {
1233        goto fail_features;
1234    }
1235    r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem);
1236    if (r < 0) {
1237        r = -errno;
1238        goto fail_mem;
1239    }
1240    for (i = 0; i < hdev->nvqs; ++i) {
1241        r = vhost_virtqueue_start(hdev,
1242                                  vdev,
1243                                  hdev->vqs + i,
1244                                  hdev->vq_index + i);
1245        if (r < 0) {
1246            goto fail_vq;
1247        }
1248    }
1249
1250    if (hdev->log_enabled) {
1251        uint64_t log_base;
1252
1253        hdev->log_size = vhost_get_log_size(hdev);
1254        hdev->log = vhost_log_get(hdev->log_size,
1255                                  vhost_dev_log_is_shared(hdev));
1256        log_base = (uintptr_t)hdev->log->log;
1257        r = hdev->vhost_ops->vhost_set_log_base(hdev,
1258                                                hdev->log_size ? log_base : 0,
1259                                                hdev->log);
1260        if (r < 0) {
1261            r = -errno;
1262            goto fail_log;
1263        }
1264    }
1265
1266    return 0;
1267fail_log:
1268    vhost_log_put(hdev, false);
1269fail_vq:
1270    while (--i >= 0) {
1271        vhost_virtqueue_stop(hdev,
1272                             vdev,
1273                             hdev->vqs + i,
1274                             hdev->vq_index + i);
1275    }
1276    i = hdev->nvqs;
1277fail_mem:
1278fail_features:
1279
1280    hdev->started = false;
1281    return r;
1282}
1283
1284/* Host notifiers must be enabled at this point. */
1285void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
1286{
1287    int i;
1288
1289    for (i = 0; i < hdev->nvqs; ++i) {
1290        vhost_virtqueue_stop(hdev,
1291                             vdev,
1292                             hdev->vqs + i,
1293                             hdev->vq_index + i);
1294    }
1295
1296    vhost_log_put(hdev, true);
1297    hdev->started = false;
1298    hdev->log = NULL;
1299    hdev->log_size = 0;
1300}
1301
1302