qemu/hw/misc/ivshmem.c
<<
>>
Prefs
   1/*
   2 * Inter-VM Shared Memory PCI device.
   3 *
   4 * Author:
   5 *      Cam Macdonell <cam@cs.ualberta.ca>
   6 *
   7 * Based On: cirrus_vga.c
   8 *          Copyright (c) 2004 Fabrice Bellard
   9 *          Copyright (c) 2004 Makoto Suzuki (suzu)
  10 *
  11 *      and rtl8139.c
  12 *          Copyright (c) 2006 Igor Kovalenko
  13 *
  14 * This code is licensed under the GNU GPL v2.
  15 *
  16 * Contributions after 2012-01-13 are licensed under the terms of the
  17 * GNU GPL, version 2 or (at your option) any later version.
  18 */
  19#include "qemu/osdep.h"
  20#include "qemu/units.h"
  21#include "qapi/error.h"
  22#include "qemu/cutils.h"
  23#include "hw/hw.h"
  24#include "hw/pci/pci.h"
  25#include "hw/pci/msi.h"
  26#include "hw/pci/msix.h"
  27#include "sysemu/kvm.h"
  28#include "migration/blocker.h"
  29#include "qemu/error-report.h"
  30#include "qemu/event_notifier.h"
  31#include "qom/object_interfaces.h"
  32#include "chardev/char-fe.h"
  33#include "sysemu/hostmem.h"
  34#include "sysemu/qtest.h"
  35#include "qapi/visitor.h"
  36
  37#include "hw/misc/ivshmem.h"
  38
/* PCI vendor and device ID advertised by the ivshmem device */
#define PCI_VENDOR_ID_IVSHMEM   PCI_VENDOR_ID_REDHAT_QUMRANET
#define PCI_DEVICE_ID_IVSHMEM   0x1110

/* largest peer ID accepted in a server message */
#define IVSHMEM_MAX_PEERS UINT16_MAX
/* bit numbers in IVShmemState.features, tested with ivshmem_has_feature() */
#define IVSHMEM_IOEVENTFD   0
#define IVSHMEM_MSI     1

/* size in bytes of the register region exposed as BAR 0 */
#define IVSHMEM_REG_BAR_SIZE 0x100

/*
 * Debug tracing: set IVSHMEM_DEBUG to 1 to have IVSHMEM_DPRINTF() print
 * to stdout.  The call is always compiled, so format strings and
 * arguments stay type-checked even when tracing is off.
 */
#define IVSHMEM_DEBUG 0
#define IVSHMEM_DPRINTF(fmt, ...)                       \
    do {                                                \
        if (IVSHMEM_DEBUG) {                            \
            printf("IVSHMEM: " fmt, ## __VA_ARGS__);    \
        }                                               \
    } while (0)
  55
/*
 * QOM type names of the device variants, each paired with a checked cast
 * macro.  All variants share the same instance struct, IVShmemState.
 */
#define TYPE_IVSHMEM_COMMON "ivshmem-common"
#define IVSHMEM_COMMON(obj) \
    OBJECT_CHECK(IVShmemState, (obj), TYPE_IVSHMEM_COMMON)

#define TYPE_IVSHMEM_PLAIN "ivshmem-plain"
#define IVSHMEM_PLAIN(obj) \
    OBJECT_CHECK(IVShmemState, (obj), TYPE_IVSHMEM_PLAIN)

#define TYPE_IVSHMEM_DOORBELL "ivshmem-doorbell"
#define IVSHMEM_DOORBELL(obj) \
    OBJECT_CHECK(IVShmemState, (obj), TYPE_IVSHMEM_DOORBELL)

#define TYPE_IVSHMEM "ivshmem"
#define IVSHMEM(obj) \
    OBJECT_CHECK(IVShmemState, (obj), TYPE_IVSHMEM)
  71
/* Per-peer interrupt state, indexed by peer ID in IVShmemState.peers[] */
typedef struct Peer {
    int nb_eventfds;            /* #eventfds received for this peer so far */
    EventNotifier *eventfds;    /* array with room for one entry per vector */
} Peer;
  76
/* Per-vector interrupt delivery state of this device */
typedef struct MSIVector {
    PCIDevice *pdev;    /* owning device; NULL while the vector is not set up */
    int virq;           /* KVM MSI route, valid on the irqfd path only */
    bool unmasked;      /* true between vector_unmask and vector_mask */
} MSIVector;
  82
/* Instance state shared by all ivshmem device variants */
typedef struct IVShmemState {
    /*< private >*/
    PCIDevice parent_obj;
    /*< public >*/

    uint32_t features;          /* bit mask of IVSHMEM_IOEVENTFD, IVSHMEM_MSI */

    /* exactly one of these two may be set */
    HostMemoryBackend *hostmem; /* without interrupts */
    CharBackend server_chr; /* with interrupts */

    /* registers */
    uint32_t intrmask;
    uint32_t intrstatus;
    int vm_id;                  /* our own peer ID, as assigned by the server */

    /* BARs */
    MemoryRegion ivshmem_mmio;  /* BAR 0 (registers) */
    MemoryRegion *ivshmem_bar2; /* BAR 2 (shared memory) */
    MemoryRegion server_bar2;   /* used with server_chr */

    /* interrupt support */
    Peer *peers;
    int nb_peers;               /* space in @peers[] */
    uint32_t vectors;           /* #vectors; size of @msi_vectors[] */
    MSIVector *msi_vectors;
    uint64_t msg_buf;           /* buffer for receiving server messages */
    int msg_buffered_bytes;     /* #bytes in @msg_buf */

    /* migration stuff */
    OnOffAuto master;
    Error *migration_blocker;   /* set while a non-master blocks migration */
} IVShmemState;
 116
/*
 * Byte offsets of the device registers within BAR 0.
 * Each register is accessed as a 32-bit word.
 */
enum ivshmem_registers {
    INTRMASK = 0,       /* interrupt mask, read/write */
    INTRSTATUS = 4,     /* interrupt status, reading clears it */
    IVPOSITION = 8,     /* our own peer ID, read-only */
    DOORBELL = 12,      /* write (peer ID << 16) | vector to notify a peer */
};
 124
 125static inline uint32_t ivshmem_has_feature(IVShmemState *ivs,
 126                                                    unsigned int feature) {
 127    return (ivs->features & (1 << feature));
 128}
 129
 130static inline bool ivshmem_is_master(IVShmemState *s)
 131{
 132    assert(s->master != ON_OFF_AUTO_AUTO);
 133    return s->master == ON_OFF_AUTO_ON;
 134}
 135
 136static void ivshmem_update_irq(IVShmemState *s)
 137{
 138    PCIDevice *d = PCI_DEVICE(s);
 139    uint32_t isr = s->intrstatus & s->intrmask;
 140
 141    /*
 142     * Do nothing unless the device actually uses INTx.  Here's how
 143     * the device variants signal interrupts, what they put in PCI
 144     * config space:
 145     * Device variant    Interrupt  Interrupt Pin  MSI-X cap.
 146     * ivshmem-plain         none            0         no
 147     * ivshmem-doorbell     MSI-X            1        yes(1)
 148     * ivshmem,msi=off       INTx            1         no
 149     * ivshmem,msi=on       MSI-X            1(2)     yes(1)
 150     * (1) if guest enabled MSI-X
 151     * (2) the device lies
 152     * Leads to the condition for doing nothing:
 153     */
 154    if (ivshmem_has_feature(s, IVSHMEM_MSI)
 155        || !d->config[PCI_INTERRUPT_PIN]) {
 156        return;
 157    }
 158
 159    /* don't print ISR resets */
 160    if (isr) {
 161        IVSHMEM_DPRINTF("Set IRQ to %d (%04x %04x)\n",
 162                        isr ? 1 : 0, s->intrstatus, s->intrmask);
 163    }
 164
 165    pci_set_irq(d, isr != 0);
 166}
 167
 168static void ivshmem_IntrMask_write(IVShmemState *s, uint32_t val)
 169{
 170    IVSHMEM_DPRINTF("IntrMask write(w) val = 0x%04x\n", val);
 171
 172    s->intrmask = val;
 173    ivshmem_update_irq(s);
 174}
 175
 176static uint32_t ivshmem_IntrMask_read(IVShmemState *s)
 177{
 178    uint32_t ret = s->intrmask;
 179
 180    IVSHMEM_DPRINTF("intrmask read(w) val = 0x%04x\n", ret);
 181    return ret;
 182}
 183
 184static void ivshmem_IntrStatus_write(IVShmemState *s, uint32_t val)
 185{
 186    IVSHMEM_DPRINTF("IntrStatus write(w) val = 0x%04x\n", val);
 187
 188    s->intrstatus = val;
 189    ivshmem_update_irq(s);
 190}
 191
 192static uint32_t ivshmem_IntrStatus_read(IVShmemState *s)
 193{
 194    uint32_t ret = s->intrstatus;
 195
 196    /* reading ISR clears all interrupts */
 197    s->intrstatus = 0;
 198    ivshmem_update_irq(s);
 199    return ret;
 200}
 201
 202static void ivshmem_io_write(void *opaque, hwaddr addr,
 203                             uint64_t val, unsigned size)
 204{
 205    IVShmemState *s = opaque;
 206
 207    uint16_t dest = val >> 16;
 208    uint16_t vector = val & 0xff;
 209
 210    addr &= 0xfc;
 211
 212    IVSHMEM_DPRINTF("writing to addr " TARGET_FMT_plx "\n", addr);
 213    switch (addr)
 214    {
 215        case INTRMASK:
 216            ivshmem_IntrMask_write(s, val);
 217            break;
 218
 219        case INTRSTATUS:
 220            ivshmem_IntrStatus_write(s, val);
 221            break;
 222
 223        case DOORBELL:
 224            /* check that dest VM ID is reasonable */
 225            if (dest >= s->nb_peers) {
 226                IVSHMEM_DPRINTF("Invalid destination VM ID (%d)\n", dest);
 227                break;
 228            }
 229
 230            /* check doorbell range */
 231            if (vector < s->peers[dest].nb_eventfds) {
 232                IVSHMEM_DPRINTF("Notifying VM %d on vector %d\n", dest, vector);
 233                event_notifier_set(&s->peers[dest].eventfds[vector]);
 234            } else {
 235                IVSHMEM_DPRINTF("Invalid destination vector %d on VM %d\n",
 236                                vector, dest);
 237            }
 238            break;
 239        default:
 240            IVSHMEM_DPRINTF("Unhandled write " TARGET_FMT_plx "\n", addr);
 241    }
 242}
 243
 244static uint64_t ivshmem_io_read(void *opaque, hwaddr addr,
 245                                unsigned size)
 246{
 247
 248    IVShmemState *s = opaque;
 249    uint32_t ret;
 250
 251    switch (addr)
 252    {
 253        case INTRMASK:
 254            ret = ivshmem_IntrMask_read(s);
 255            break;
 256
 257        case INTRSTATUS:
 258            ret = ivshmem_IntrStatus_read(s);
 259            break;
 260
 261        case IVPOSITION:
 262            ret = s->vm_id;
 263            break;
 264
 265        default:
 266            IVSHMEM_DPRINTF("why are we reading " TARGET_FMT_plx "\n", addr);
 267            ret = 0;
 268    }
 269
 270    return ret;
 271}
 272
/*
 * Access callbacks for the register BAR.  The handlers are implemented
 * for 4-byte accesses only (.impl min and max access size are both 4).
 */
static const MemoryRegionOps ivshmem_mmio_ops = {
    .read = ivshmem_io_read,
    .write = ivshmem_io_write,
    .endianness = DEVICE_NATIVE_ENDIAN,
    .impl = {
        .min_access_size = 4,
        .max_access_size = 4,
    },
};
 282
 283static void ivshmem_vector_notify(void *opaque)
 284{
 285    MSIVector *entry = opaque;
 286    PCIDevice *pdev = entry->pdev;
 287    IVShmemState *s = IVSHMEM_COMMON(pdev);
 288    int vector = entry - s->msi_vectors;
 289    EventNotifier *n = &s->peers[s->vm_id].eventfds[vector];
 290
 291    if (!event_notifier_test_and_clear(n)) {
 292        return;
 293    }
 294
 295    IVSHMEM_DPRINTF("interrupt on vector %p %d\n", pdev, vector);
 296    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
 297        if (msix_enabled(pdev)) {
 298            msix_notify(pdev, vector);
 299        }
 300    } else {
 301        ivshmem_IntrStatus_write(s, 1);
 302    }
 303}
 304
 305static int ivshmem_vector_unmask(PCIDevice *dev, unsigned vector,
 306                                 MSIMessage msg)
 307{
 308    IVShmemState *s = IVSHMEM_COMMON(dev);
 309    EventNotifier *n = &s->peers[s->vm_id].eventfds[vector];
 310    MSIVector *v = &s->msi_vectors[vector];
 311    int ret;
 312
 313    IVSHMEM_DPRINTF("vector unmask %p %d\n", dev, vector);
 314    if (!v->pdev) {
 315        error_report("ivshmem: vector %d route does not exist", vector);
 316        return -EINVAL;
 317    }
 318    assert(!v->unmasked);
 319
 320    ret = kvm_irqchip_update_msi_route(kvm_state, v->virq, msg, dev);
 321    if (ret < 0) {
 322        return ret;
 323    }
 324    kvm_irqchip_commit_routes(kvm_state);
 325
 326    ret = kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, n, NULL, v->virq);
 327    if (ret < 0) {
 328        return ret;
 329    }
 330    v->unmasked = true;
 331
 332    return 0;
 333}
 334
 335static void ivshmem_vector_mask(PCIDevice *dev, unsigned vector)
 336{
 337    IVShmemState *s = IVSHMEM_COMMON(dev);
 338    EventNotifier *n = &s->peers[s->vm_id].eventfds[vector];
 339    MSIVector *v = &s->msi_vectors[vector];
 340    int ret;
 341
 342    IVSHMEM_DPRINTF("vector mask %p %d\n", dev, vector);
 343    if (!v->pdev) {
 344        error_report("ivshmem: vector %d route does not exist", vector);
 345        return;
 346    }
 347    assert(v->unmasked);
 348
 349    ret = kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, n, v->virq);
 350    if (ret < 0) {
 351        error_report("remove_irqfd_notifier_gsi failed");
 352        return;
 353    }
 354    v->unmasked = false;
 355}
 356
 357static void ivshmem_vector_poll(PCIDevice *dev,
 358                                unsigned int vector_start,
 359                                unsigned int vector_end)
 360{
 361    IVShmemState *s = IVSHMEM_COMMON(dev);
 362    unsigned int vector;
 363
 364    IVSHMEM_DPRINTF("vector poll %p %d-%d\n", dev, vector_start, vector_end);
 365
 366    vector_end = MIN(vector_end, s->vectors);
 367
 368    for (vector = vector_start; vector < vector_end; vector++) {
 369        EventNotifier *notifier = &s->peers[s->vm_id].eventfds[vector];
 370
 371        if (!msix_is_masked(dev, vector)) {
 372            continue;
 373        }
 374
 375        if (event_notifier_test_and_clear(notifier)) {
 376            msix_set_pending(dev, vector);
 377        }
 378    }
 379}
 380
 381static void watch_vector_notifier(IVShmemState *s, EventNotifier *n,
 382                                 int vector)
 383{
 384    int eventfd = event_notifier_get_fd(n);
 385
 386    assert(!s->msi_vectors[vector].pdev);
 387    s->msi_vectors[vector].pdev = PCI_DEVICE(s);
 388
 389    qemu_set_fd_handler(eventfd, ivshmem_vector_notify,
 390                        NULL, &s->msi_vectors[vector]);
 391}
 392
 393static void ivshmem_add_eventfd(IVShmemState *s, int posn, int i)
 394{
 395    memory_region_add_eventfd(&s->ivshmem_mmio,
 396                              DOORBELL,
 397                              4,
 398                              true,
 399                              (posn << 16) | i,
 400                              &s->peers[posn].eventfds[i]);
 401}
 402
 403static void ivshmem_del_eventfd(IVShmemState *s, int posn, int i)
 404{
 405    memory_region_del_eventfd(&s->ivshmem_mmio,
 406                              DOORBELL,
 407                              4,
 408                              true,
 409                              (posn << 16) | i,
 410                              &s->peers[posn].eventfds[i]);
 411}
 412
 413static void close_peer_eventfds(IVShmemState *s, int posn)
 414{
 415    int i, n;
 416
 417    assert(posn >= 0 && posn < s->nb_peers);
 418    n = s->peers[posn].nb_eventfds;
 419
 420    if (ivshmem_has_feature(s, IVSHMEM_IOEVENTFD)) {
 421        memory_region_transaction_begin();
 422        for (i = 0; i < n; i++) {
 423            ivshmem_del_eventfd(s, posn, i);
 424        }
 425        memory_region_transaction_commit();
 426    }
 427
 428    for (i = 0; i < n; i++) {
 429        event_notifier_cleanup(&s->peers[posn].eventfds[i]);
 430    }
 431
 432    g_free(s->peers[posn].eventfds);
 433    s->peers[posn].nb_eventfds = 0;
 434}
 435
 436static void resize_peers(IVShmemState *s, int nb_peers)
 437{
 438    int old_nb_peers = s->nb_peers;
 439    int i;
 440
 441    assert(nb_peers > old_nb_peers);
 442    IVSHMEM_DPRINTF("bumping storage to %d peers\n", nb_peers);
 443
 444    s->peers = g_realloc(s->peers, nb_peers * sizeof(Peer));
 445    s->nb_peers = nb_peers;
 446
 447    for (i = old_nb_peers; i < nb_peers; i++) {
 448        s->peers[i].eventfds = g_new0(EventNotifier, s->vectors);
 449        s->peers[i].nb_eventfds = 0;
 450    }
 451}
 452
 453static void ivshmem_add_kvm_msi_virq(IVShmemState *s, int vector,
 454                                     Error **errp)
 455{
 456    PCIDevice *pdev = PCI_DEVICE(s);
 457    int ret;
 458
 459    IVSHMEM_DPRINTF("ivshmem_add_kvm_msi_virq vector:%d\n", vector);
 460    assert(!s->msi_vectors[vector].pdev);
 461
 462    ret = kvm_irqchip_add_msi_route(kvm_state, vector, pdev);
 463    if (ret < 0) {
 464        error_setg(errp, "kvm_irqchip_add_msi_route failed");
 465        return;
 466    }
 467
 468    s->msi_vectors[vector].virq = ret;
 469    s->msi_vectors[vector].pdev = pdev;
 470}
 471
 472static void setup_interrupt(IVShmemState *s, int vector, Error **errp)
 473{
 474    EventNotifier *n = &s->peers[s->vm_id].eventfds[vector];
 475    bool with_irqfd = kvm_msi_via_irqfd_enabled() &&
 476        ivshmem_has_feature(s, IVSHMEM_MSI);
 477    PCIDevice *pdev = PCI_DEVICE(s);
 478    Error *err = NULL;
 479
 480    IVSHMEM_DPRINTF("setting up interrupt for vector: %d\n", vector);
 481
 482    if (!with_irqfd) {
 483        IVSHMEM_DPRINTF("with eventfd\n");
 484        watch_vector_notifier(s, n, vector);
 485    } else if (msix_enabled(pdev)) {
 486        IVSHMEM_DPRINTF("with irqfd\n");
 487        ivshmem_add_kvm_msi_virq(s, vector, &err);
 488        if (err) {
 489            error_propagate(errp, err);
 490            return;
 491        }
 492
 493        if (!msix_is_masked(pdev, vector)) {
 494            kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, n, NULL,
 495                                               s->msi_vectors[vector].virq);
 496            /* TODO handle error */
 497        }
 498    } else {
 499        /* it will be delayed until msix is enabled, in write_config */
 500        IVSHMEM_DPRINTF("with irqfd, delayed until msix enabled\n");
 501    }
 502}
 503
 504static void process_msg_shmem(IVShmemState *s, int fd, Error **errp)
 505{
 506    Error *local_err = NULL;
 507    struct stat buf;
 508    size_t size;
 509
 510    if (s->ivshmem_bar2) {
 511        error_setg(errp, "server sent unexpected shared memory message");
 512        close(fd);
 513        return;
 514    }
 515
 516    if (fstat(fd, &buf) < 0) {
 517        error_setg_errno(errp, errno,
 518            "can't determine size of shared memory sent by server");
 519        close(fd);
 520        return;
 521    }
 522
 523    size = buf.st_size;
 524
 525    /* mmap the region and map into the BAR2 */
 526    memory_region_init_ram_from_fd(&s->server_bar2, OBJECT(s),
 527                                   "ivshmem.bar2", size, true, fd, &local_err);
 528    if (local_err) {
 529        error_propagate(errp, local_err);
 530        return;
 531    }
 532
 533    s->ivshmem_bar2 = &s->server_bar2;
 534}
 535
 536static void process_msg_disconnect(IVShmemState *s, uint16_t posn,
 537                                   Error **errp)
 538{
 539    IVSHMEM_DPRINTF("posn %d has gone away\n", posn);
 540    if (posn >= s->nb_peers || posn == s->vm_id) {
 541        error_setg(errp, "invalid peer %d", posn);
 542        return;
 543    }
 544    close_peer_eventfds(s, posn);
 545}
 546
 547static void process_msg_connect(IVShmemState *s, uint16_t posn, int fd,
 548                                Error **errp)
 549{
 550    Peer *peer = &s->peers[posn];
 551    int vector;
 552
 553    /*
 554     * The N-th connect message for this peer comes with the file
 555     * descriptor for vector N-1.  Count messages to find the vector.
 556     */
 557    if (peer->nb_eventfds >= s->vectors) {
 558        error_setg(errp, "Too many eventfd received, device has %d vectors",
 559                   s->vectors);
 560        close(fd);
 561        return;
 562    }
 563    vector = peer->nb_eventfds++;
 564
 565    IVSHMEM_DPRINTF("eventfds[%d][%d] = %d\n", posn, vector, fd);
 566    event_notifier_init_fd(&peer->eventfds[vector], fd);
 567    fcntl_setfl(fd, O_NONBLOCK); /* msix/irqfd poll non block */
 568
 569    if (posn == s->vm_id) {
 570        setup_interrupt(s, vector, errp);
 571        /* TODO do we need to handle the error? */
 572    }
 573
 574    if (ivshmem_has_feature(s, IVSHMEM_IOEVENTFD)) {
 575        ivshmem_add_eventfd(s, posn, vector);
 576    }
 577}
 578
 579static void process_msg(IVShmemState *s, int64_t msg, int fd, Error **errp)
 580{
 581    IVSHMEM_DPRINTF("posn is %" PRId64 ", fd is %d\n", msg, fd);
 582
 583    if (msg < -1 || msg > IVSHMEM_MAX_PEERS) {
 584        error_setg(errp, "server sent invalid message %" PRId64, msg);
 585        close(fd);
 586        return;
 587    }
 588
 589    if (msg == -1) {
 590        process_msg_shmem(s, fd, errp);
 591        return;
 592    }
 593
 594    if (msg >= s->nb_peers) {
 595        resize_peers(s, msg + 1);
 596    }
 597
 598    if (fd >= 0) {
 599        process_msg_connect(s, msg, fd, errp);
 600    } else {
 601        process_msg_disconnect(s, msg, errp);
 602    }
 603}
 604
 605static int ivshmem_can_receive(void *opaque)
 606{
 607    IVShmemState *s = opaque;
 608
 609    assert(s->msg_buffered_bytes < sizeof(s->msg_buf));
 610    return sizeof(s->msg_buf) - s->msg_buffered_bytes;
 611}
 612
 613static void ivshmem_read(void *opaque, const uint8_t *buf, int size)
 614{
 615    IVShmemState *s = opaque;
 616    Error *err = NULL;
 617    int fd;
 618    int64_t msg;
 619
 620    assert(size >= 0 && s->msg_buffered_bytes + size <= sizeof(s->msg_buf));
 621    memcpy((unsigned char *)&s->msg_buf + s->msg_buffered_bytes, buf, size);
 622    s->msg_buffered_bytes += size;
 623    if (s->msg_buffered_bytes < sizeof(s->msg_buf)) {
 624        return;
 625    }
 626    msg = le64_to_cpu(s->msg_buf);
 627    s->msg_buffered_bytes = 0;
 628
 629    fd = qemu_chr_fe_get_msgfd(&s->server_chr);
 630
 631    process_msg(s, msg, fd, &err);
 632    if (err) {
 633        error_report_err(err);
 634    }
 635}
 636
/*
 * Synchronously receive one message from the server.
 *
 * @s: the device
 * @pfd: on success, set to what qemu_chr_fe_get_msgfd() returns for the
 *       message; not written on failure
 * @errp: set when reading from the server fails
 *
 * Returns the 64-bit message converted from little endian, or INT64_MIN
 * on failure.
 */
static int64_t ivshmem_recv_msg(IVShmemState *s, int *pfd, Error **errp)
{
    int64_t msg;
    int n, ret;

    /* @n counts the bytes of @msg received so far */
    n = 0;
    do {
        ret = qemu_chr_fe_read_all(&s->server_chr, (uint8_t *)&msg + n,
                                   sizeof(msg) - n);
        if (ret < 0) {
            if (ret == -EINTR) {
                /* interrupted: retry with @n unchanged */
                continue;
            }
            error_setg_errno(errp, -ret, "read from server failed");
            return INT64_MIN;
        }
        /*
         * NOTE(review): a return value of 0 adds nothing to @n, so the
         * loop just tries again -- confirm this can't spin when the
         * server closes the connection.
         */
        n += ret;
    } while (n < sizeof(msg));

    *pfd = qemu_chr_fe_get_msgfd(&s->server_chr);
    return le64_to_cpu(msg);
}
 659
/*
 * Synchronously receive and process the server's initial messages.
 *
 * Expects, in this order: the protocol version message, the message
 * with our own ID (stored in @s->vm_id), then any number of messages
 * handled by process_msg(), up to and including the shared memory
 * message.
 *
 * On success, the shared memory is mapped (@s->ivshmem_bar2 is set).
 * On failure, @errp is set.
 */
static void ivshmem_recv_setup(IVShmemState *s, Error **errp)
{
    Error *err = NULL;
    int64_t msg;
    int fd;

    /* the version message carries no file descriptor */
    msg = ivshmem_recv_msg(s, &fd, &err);
    if (err) {
        error_propagate(errp, err);
        return;
    }
    if (msg != IVSHMEM_PROTOCOL_VERSION) {
        error_setg(errp, "server sent version %" PRId64 ", expecting %d",
                   msg, IVSHMEM_PROTOCOL_VERSION);
        return;
    }
    if (fd != -1) {
        error_setg(errp, "server sent invalid version message");
        return;
    }

    /*
     * ivshmem-server sends the remaining initial messages in a fixed
     * order, but the device has always accepted them in any order.
     * Stay as compatible as practical, just in case people use
     * servers that behave differently.
     */

    /*
     * ivshmem_device_spec.txt has always required the ID message
     * right here, and ivshmem-server has always complied.  However,
     * older versions of the device accepted it out of order, but
     * broke when an interrupt setup message arrived before it.
     */
    msg = ivshmem_recv_msg(s, &fd, &err);
    if (err) {
        error_propagate(errp, err);
        return;
    }
    /* the ID message carries no file descriptor, and a valid peer ID */
    if (fd != -1 || msg < 0 || msg > IVSHMEM_MAX_PEERS) {
        error_setg(errp, "server sent invalid ID message");
        return;
    }
    s->vm_id = msg;

    /*
     * Receive more messages until we got shared memory.
     */
    do {
        msg = ivshmem_recv_msg(s, &fd, &err);
        if (err) {
            error_propagate(errp, err);
            return;
        }
        process_msg(s, msg, fd, &err);
        if (err) {
            error_propagate(errp, err);
            return;
        }
    } while (msg != -1);

    /*
     * This function must either map the shared memory or fail.  The
     * loop above ensures that: it terminates normally only after it
     * successfully processed the server's shared memory message.
     * Assert that we actually mapped the shared memory:
     */
    assert(s->ivshmem_bar2);
}
 729
 730/* Select the MSI-X vectors used by device.
 731 * ivshmem maps events to vectors statically, so
 732 * we just enable all vectors on init and after reset. */
 733static void ivshmem_msix_vector_use(IVShmemState *s)
 734{
 735    PCIDevice *d = PCI_DEVICE(s);
 736    int i;
 737
 738    for (i = 0; i < s->vectors; i++) {
 739        msix_vector_use(d, i);
 740    }
 741}
 742
 743static void ivshmem_disable_irqfd(IVShmemState *s);
 744
 745static void ivshmem_reset(DeviceState *d)
 746{
 747    IVShmemState *s = IVSHMEM_COMMON(d);
 748
 749    ivshmem_disable_irqfd(s);
 750
 751    s->intrstatus = 0;
 752    s->intrmask = 0;
 753    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
 754        ivshmem_msix_vector_use(s);
 755    }
 756}
 757
 758static int ivshmem_setup_interrupts(IVShmemState *s, Error **errp)
 759{
 760    /* allocate QEMU callback data for receiving interrupts */
 761    s->msi_vectors = g_malloc0(s->vectors * sizeof(MSIVector));
 762
 763    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
 764        if (msix_init_exclusive_bar(PCI_DEVICE(s), s->vectors, 1, errp)) {
 765            return -1;
 766        }
 767
 768        IVSHMEM_DPRINTF("msix initialized (%d vectors)\n", s->vectors);
 769        ivshmem_msix_vector_use(s);
 770    }
 771
 772    return 0;
 773}
 774
 775static void ivshmem_remove_kvm_msi_virq(IVShmemState *s, int vector)
 776{
 777    IVSHMEM_DPRINTF("ivshmem_remove_kvm_msi_virq vector:%d\n", vector);
 778
 779    if (s->msi_vectors[vector].pdev == NULL) {
 780        return;
 781    }
 782
 783    /* it was cleaned when masked in the frontend. */
 784    kvm_irqchip_release_virq(kvm_state, s->msi_vectors[vector].virq);
 785
 786    s->msi_vectors[vector].pdev = NULL;
 787}
 788
/*
 * Switch interrupt delivery to irqfd; called when the guest enables
 * MSI-X.
 *
 * Allocates a KVM MSI route for each of our own eventfds received so
 * far, then installs the MSI-X vector notifiers.  On failure, the error
 * is reported and the routes added here are released again.
 */
static void ivshmem_enable_irqfd(IVShmemState *s)
{
    PCIDevice *pdev = PCI_DEVICE(s);
    int i;

    for (i = 0; i < s->peers[s->vm_id].nb_eventfds; i++) {
        Error *err = NULL;

        ivshmem_add_kvm_msi_virq(s, i, &err);
        if (err) {
            error_report_err(err);
            goto undo;
        }
    }

    if (msix_set_vector_notifiers(pdev,
                                  ivshmem_vector_unmask,
                                  ivshmem_vector_mask,
                                  ivshmem_vector_poll)) {
        error_report("ivshmem: msix_set_vector_notifiers failed");
        goto undo;
    }
    return;

undo:
    /*
     * @i is the number of routes added: the index of the vector that
     * failed, or the full count when the loop completed.
     */
    while (--i >= 0) {
        ivshmem_remove_kvm_msi_virq(s, i);
    }
}
 818
 819static void ivshmem_disable_irqfd(IVShmemState *s)
 820{
 821    PCIDevice *pdev = PCI_DEVICE(s);
 822    int i;
 823
 824    if (!pdev->msix_vector_use_notifier) {
 825        return;
 826    }
 827
 828    msix_unset_vector_notifiers(pdev);
 829
 830    for (i = 0; i < s->peers[s->vm_id].nb_eventfds; i++) {
 831        /*
 832         * MSI-X is already disabled here so msix_unset_vector_notifiers()
 833         * didn't call our release notifier.  Do it now to keep our masks and
 834         * unmasks balanced.
 835         */
 836        if (s->msi_vectors[i].unmasked) {
 837            ivshmem_vector_mask(pdev, i);
 838        }
 839        ivshmem_remove_kvm_msi_virq(s, i);
 840    }
 841
 842}
 843
 844static void ivshmem_write_config(PCIDevice *pdev, uint32_t address,
 845                                 uint32_t val, int len)
 846{
 847    IVShmemState *s = IVSHMEM_COMMON(pdev);
 848    int is_enabled, was_enabled = msix_enabled(pdev);
 849
 850    pci_default_write_config(pdev, address, val, len);
 851    is_enabled = msix_enabled(pdev);
 852
 853    if (kvm_msi_via_irqfd_enabled()) {
 854        if (!was_enabled && is_enabled) {
 855            ivshmem_enable_irqfd(s);
 856        } else if (was_enabled && !is_enabled) {
 857            ivshmem_disable_irqfd(s);
 858        }
 859    }
 860}
 861
/*
 * Realize code shared by all device variants.
 *
 * Registers the register BAR, obtains the shared memory either from the
 * host memory backend or from the server behind the chardev, resolves
 * the master role, blocks migration for non-masters, and registers the
 * shared memory as BAR 2.
 *
 * On failure, sets @errp and returns early.
 */
static void ivshmem_common_realize(PCIDevice *dev, Error **errp)
{
    IVShmemState *s = IVSHMEM_COMMON(dev);
    Error *err = NULL;
    uint8_t *pci_conf;
    Error *local_err = NULL;

    /* IRQFD requires MSI */
    if (ivshmem_has_feature(s, IVSHMEM_IOEVENTFD) &&
        !ivshmem_has_feature(s, IVSHMEM_MSI)) {
        error_setg(errp, "ioeventfd/irqfd requires MSI");
        return;
    }

    pci_conf = dev->config;
    pci_conf[PCI_COMMAND] = PCI_COMMAND_IO | PCI_COMMAND_MEMORY;

    memory_region_init_io(&s->ivshmem_mmio, OBJECT(s), &ivshmem_mmio_ops, s,
                          "ivshmem-mmio", IVSHMEM_REG_BAR_SIZE);

    /* region for registers */
    pci_register_bar(dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY,
                     &s->ivshmem_mmio);

    if (s->hostmem != NULL) {
        IVSHMEM_DPRINTF("using hostmem\n");

        /* shared memory comes from the backend; no server, no interrupts */
        s->ivshmem_bar2 = host_memory_backend_get_memory(s->hostmem);
        host_memory_backend_set_mapped(s->hostmem, true);
    } else {
        Chardev *chr = qemu_chr_fe_get_driver(&s->server_chr);
        assert(chr);

        IVSHMEM_DPRINTF("using shared memory server (socket = %s)\n",
                        chr->filename);

        /* we allocate enough space for 16 peers and grow as needed */
        resize_peers(s, 16);

        /*
         * Receive setup messages from server synchronously.
         * Older versions did it asynchronously, but that creates a
         * number of entertaining race conditions.
         */
        ivshmem_recv_setup(s, &err);
        if (err) {
            error_propagate(errp, err);
            return;
        }

        /* an explicitly requested master must have received ID 0 */
        if (s->master == ON_OFF_AUTO_ON && s->vm_id != 0) {
            error_setg(errp,
                       "master must connect to the server before any peers");
            return;
        }

        /* from here on, server messages are received asynchronously */
        qemu_chr_fe_set_handlers(&s->server_chr, ivshmem_can_receive,
                                 ivshmem_read, NULL, NULL, s, NULL, true);

        if (ivshmem_setup_interrupts(s, errp) < 0) {
            error_prepend(errp, "Failed to initialize interrupts: ");
            return;
        }
    }

    /* master=auto: the device with ID 0 is the master */
    if (s->master == ON_OFF_AUTO_AUTO) {
        s->master = s->vm_id == 0 ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
    }

    if (!ivshmem_is_master(s)) {
        error_setg(&s->migration_blocker,
                   "Migration is disabled when using feature 'peer mode' in device 'ivshmem'");
        migrate_add_blocker(s->migration_blocker, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            error_free(s->migration_blocker);
            return;
        }
    }

    vmstate_register_ram(s->ivshmem_bar2, DEVICE(s));
    /* BAR 2: the shared memory, as 64-bit prefetchable memory */
    pci_register_bar(PCI_DEVICE(s), 2,
                     PCI_BASE_ADDRESS_SPACE_MEMORY |
                     PCI_BASE_ADDRESS_MEM_PREFETCH |
                     PCI_BASE_ADDRESS_MEM_TYPE_64,
                     s->ivshmem_bar2);
}
 949
 950static void ivshmem_exit(PCIDevice *dev)
 951{
 952    IVShmemState *s = IVSHMEM_COMMON(dev);
 953    int i;
 954
 955    if (s->migration_blocker) {
 956        migrate_del_blocker(s->migration_blocker);
 957        error_free(s->migration_blocker);
 958    }
 959
 960    if (memory_region_is_mapped(s->ivshmem_bar2)) {
 961        if (!s->hostmem) {
 962            void *addr = memory_region_get_ram_ptr(s->ivshmem_bar2);
 963            int fd;
 964
 965            if (munmap(addr, memory_region_size(s->ivshmem_bar2) == -1)) {
 966                error_report("Failed to munmap shared memory %s",
 967                             strerror(errno));
 968            }
 969
 970            fd = memory_region_get_fd(s->ivshmem_bar2);
 971            close(fd);
 972        }
 973
 974        vmstate_unregister_ram(s->ivshmem_bar2, DEVICE(dev));
 975    }
 976
 977    if (s->hostmem) {
 978        host_memory_backend_set_mapped(s->hostmem, false);
 979    }
 980
 981    if (s->peers) {
 982        for (i = 0; i < s->nb_peers; i++) {
 983            close_peer_eventfds(s, i);
 984        }
 985        g_free(s->peers);
 986    }
 987
 988    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
 989        msix_uninit_exclusive_bar(dev);
 990    }
 991
 992    g_free(s->msi_vectors);
 993}
 994
 995static int ivshmem_pre_load(void *opaque)
 996{
 997    IVShmemState *s = opaque;
 998
 999    if (!ivshmem_is_master(s)) {
1000        error_report("'peer' devices are not migratable");
1001        return -EINVAL;
1002    }
1003
1004    return 0;
1005}
1006
1007static int ivshmem_post_load(void *opaque, int version_id)
1008{
1009    IVShmemState *s = opaque;
1010
1011    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
1012        ivshmem_msix_vector_use(s);
1013    }
1014    return 0;
1015}
1016
1017static void ivshmem_common_class_init(ObjectClass *klass, void *data)
1018{
1019    DeviceClass *dc = DEVICE_CLASS(klass);
1020    PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
1021
1022    k->realize = ivshmem_common_realize;
1023    k->exit = ivshmem_exit;
1024    k->config_write = ivshmem_write_config;
1025    k->vendor_id = PCI_VENDOR_ID_IVSHMEM;
1026    k->device_id = PCI_DEVICE_ID_IVSHMEM;
1027    k->class_id = PCI_CLASS_MEMORY_RAM;
1028    k->revision = 1;
1029    dc->reset = ivshmem_reset;
1030    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
1031    dc->desc = "Inter-VM shared memory";
1032}
1033
1034static const TypeInfo ivshmem_common_info = {
1035    .name          = TYPE_IVSHMEM_COMMON,
1036    .parent        = TYPE_PCI_DEVICE,
1037    .instance_size = sizeof(IVShmemState),
1038    .abstract      = true,
1039    .class_init    = ivshmem_common_class_init,
1040    .interfaces = (InterfaceInfo[]) {
1041        { INTERFACE_CONVENTIONAL_PCI_DEVICE },
1042        { },
1043    },
1044};
1045
1046static const VMStateDescription ivshmem_plain_vmsd = {
1047    .name = TYPE_IVSHMEM_PLAIN,
1048    .version_id = 0,
1049    .minimum_version_id = 0,
1050    .pre_load = ivshmem_pre_load,
1051    .post_load = ivshmem_post_load,
1052    .fields = (VMStateField[]) {
1053        VMSTATE_PCI_DEVICE(parent_obj, IVShmemState),
1054        VMSTATE_UINT32(intrstatus, IVShmemState),
1055        VMSTATE_UINT32(intrmask, IVShmemState),
1056        VMSTATE_END_OF_LIST()
1057    },
1058};
1059
1060static Property ivshmem_plain_properties[] = {
1061    DEFINE_PROP_ON_OFF_AUTO("master", IVShmemState, master, ON_OFF_AUTO_OFF),
1062    DEFINE_PROP_LINK("memdev", IVShmemState, hostmem, TYPE_MEMORY_BACKEND,
1063                     HostMemoryBackend *),
1064    DEFINE_PROP_END_OF_LIST(),
1065};
1066
1067static void ivshmem_plain_realize(PCIDevice *dev, Error **errp)
1068{
1069    IVShmemState *s = IVSHMEM_COMMON(dev);
1070
1071    if (!s->hostmem) {
1072        error_setg(errp, "You must specify a 'memdev'");
1073        return;
1074    } else if (host_memory_backend_is_mapped(s->hostmem)) {
1075        char *path = object_get_canonical_path_component(OBJECT(s->hostmem));
1076        error_setg(errp, "can't use already busy memdev: %s", path);
1077        g_free(path);
1078        return;
1079    }
1080
1081    ivshmem_common_realize(dev, errp);
1082}
1083
1084static void ivshmem_plain_class_init(ObjectClass *klass, void *data)
1085{
1086    DeviceClass *dc = DEVICE_CLASS(klass);
1087    PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
1088
1089    k->realize = ivshmem_plain_realize;
1090    dc->props = ivshmem_plain_properties;
1091    dc->vmsd = &ivshmem_plain_vmsd;
1092}
1093
1094static const TypeInfo ivshmem_plain_info = {
1095    .name          = TYPE_IVSHMEM_PLAIN,
1096    .parent        = TYPE_IVSHMEM_COMMON,
1097    .instance_size = sizeof(IVShmemState),
1098    .class_init    = ivshmem_plain_class_init,
1099};
1100
1101static const VMStateDescription ivshmem_doorbell_vmsd = {
1102    .name = TYPE_IVSHMEM_DOORBELL,
1103    .version_id = 0,
1104    .minimum_version_id = 0,
1105    .pre_load = ivshmem_pre_load,
1106    .post_load = ivshmem_post_load,
1107    .fields = (VMStateField[]) {
1108        VMSTATE_PCI_DEVICE(parent_obj, IVShmemState),
1109        VMSTATE_MSIX(parent_obj, IVShmemState),
1110        VMSTATE_UINT32(intrstatus, IVShmemState),
1111        VMSTATE_UINT32(intrmask, IVShmemState),
1112        VMSTATE_END_OF_LIST()
1113    },
1114};
1115
1116static Property ivshmem_doorbell_properties[] = {
1117    DEFINE_PROP_CHR("chardev", IVShmemState, server_chr),
1118    DEFINE_PROP_UINT32("vectors", IVShmemState, vectors, 1),
1119    DEFINE_PROP_BIT("ioeventfd", IVShmemState, features, IVSHMEM_IOEVENTFD,
1120                    true),
1121    DEFINE_PROP_ON_OFF_AUTO("master", IVShmemState, master, ON_OFF_AUTO_OFF),
1122    DEFINE_PROP_END_OF_LIST(),
1123};
1124
1125static void ivshmem_doorbell_init(Object *obj)
1126{
1127    IVShmemState *s = IVSHMEM_DOORBELL(obj);
1128
1129    s->features |= (1 << IVSHMEM_MSI);
1130}
1131
1132static void ivshmem_doorbell_realize(PCIDevice *dev, Error **errp)
1133{
1134    IVShmemState *s = IVSHMEM_COMMON(dev);
1135
1136    if (!qemu_chr_fe_backend_connected(&s->server_chr)) {
1137        error_setg(errp, "You must specify a 'chardev'");
1138        return;
1139    }
1140
1141    ivshmem_common_realize(dev, errp);
1142}
1143
1144static void ivshmem_doorbell_class_init(ObjectClass *klass, void *data)
1145{
1146    DeviceClass *dc = DEVICE_CLASS(klass);
1147    PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
1148
1149    k->realize = ivshmem_doorbell_realize;
1150    dc->props = ivshmem_doorbell_properties;
1151    dc->vmsd = &ivshmem_doorbell_vmsd;
1152}
1153
1154static const TypeInfo ivshmem_doorbell_info = {
1155    .name          = TYPE_IVSHMEM_DOORBELL,
1156    .parent        = TYPE_IVSHMEM_COMMON,
1157    .instance_size = sizeof(IVShmemState),
1158    .instance_init = ivshmem_doorbell_init,
1159    .class_init    = ivshmem_doorbell_class_init,
1160};
1161
1162static void ivshmem_register_types(void)
1163{
1164    type_register_static(&ivshmem_common_info);
1165    type_register_static(&ivshmem_plain_info);
1166    type_register_static(&ivshmem_doorbell_info);
1167}
1168
1169type_init(ivshmem_register_types)
1170