qemu/hw/misc/ivshmem.c
<<
>>
Prefs
   1/*
   2 * Inter-VM Shared Memory PCI device.
   3 *
   4 * Author:
   5 *      Cam Macdonell <cam@cs.ualberta.ca>
   6 *
   7 * Based On: cirrus_vga.c
   8 *          Copyright (c) 2004 Fabrice Bellard
   9 *          Copyright (c) 2004 Makoto Suzuki (suzu)
  10 *
  11 *      and rtl8139.c
  12 *          Copyright (c) 2006 Igor Kovalenko
  13 *
  14 * This code is licensed under the GNU GPL v2.
  15 *
  16 * Contributions after 2012-01-13 are licensed under the terms of the
  17 * GNU GPL, version 2 or (at your option) any later version.
  18 */
  19
  20#include "qemu/osdep.h"
  21#include "qemu/units.h"
  22#include "qapi/error.h"
  23#include "qemu/cutils.h"
  24#include "hw/pci/pci.h"
  25#include "hw/qdev-properties.h"
  26#include "hw/pci/msi.h"
  27#include "hw/pci/msix.h"
  28#include "sysemu/kvm.h"
  29#include "migration/blocker.h"
  30#include "migration/vmstate.h"
  31#include "qemu/error-report.h"
  32#include "qemu/event_notifier.h"
  33#include "qemu/module.h"
  34#include "qom/object_interfaces.h"
  35#include "chardev/char-fe.h"
  36#include "sysemu/hostmem.h"
  37#include "sysemu/qtest.h"
  38#include "qapi/visitor.h"
  39
  40#include "hw/misc/ivshmem.h"
  41
  42#define PCI_VENDOR_ID_IVSHMEM   PCI_VENDOR_ID_REDHAT_QUMRANET /* set in ivshmem_common_class_init() */
  43#define PCI_DEVICE_ID_IVSHMEM   0x1110 /* set in ivshmem_common_class_init() */
  44
  45#define IVSHMEM_MAX_PEERS UINT16_MAX /* largest peer ID accepted from the server */
  46#define IVSHMEM_IOEVENTFD   0 /* feature bit number, see ivshmem_has_feature() */
  47#define IVSHMEM_MSI     1 /* feature bit number, see ivshmem_has_feature() */
  48
  49#define IVSHMEM_REG_BAR_SIZE 0x100 /* size of the BAR 0 register region */
  50
  51#define IVSHMEM_DEBUG 0 /* set to 1 to make IVSHMEM_DPRINTF() print to stdout */
  52#define IVSHMEM_DPRINTF(fmt, ...)                       \
  53    do {                                                \
  54        if (IVSHMEM_DEBUG) {                            \
  55            printf("IVSHMEM: " fmt, ## __VA_ARGS__);    \
  56        }                                               \
  57    } while (0) /* arguments are still compiled when debugging is off */
  58
  59#define TYPE_IVSHMEM_COMMON "ivshmem-common" /* abstract base type, see ivshmem_common_info */
  60#define IVSHMEM_COMMON(obj) \
  61    OBJECT_CHECK(IVShmemState, (obj), TYPE_IVSHMEM_COMMON)
  62
  63#define TYPE_IVSHMEM_PLAIN "ivshmem-plain" /* variant that requires a 'memdev' */
  64#define IVSHMEM_PLAIN(obj) \
  65    OBJECT_CHECK(IVShmemState, (obj), TYPE_IVSHMEM_PLAIN)
  66
  67#define TYPE_IVSHMEM_DOORBELL "ivshmem-doorbell" /* see variant table in ivshmem_update_irq() */
  68#define IVSHMEM_DOORBELL(obj) \
  69    OBJECT_CHECK(IVShmemState, (obj), TYPE_IVSHMEM_DOORBELL)
  70
  71#define TYPE_IVSHMEM "ivshmem" /* see variant table in ivshmem_update_irq() */
  72#define IVSHMEM(obj) \
  73    OBJECT_CHECK(IVShmemState, (obj), TYPE_IVSHMEM)
  74
  75typedef struct Peer { /* doorbell notifiers of one peer, indexed by vector */
  76    int nb_eventfds; /* number of initialized entries in @eventfds */
  77    EventNotifier *eventfds; /* array allocated with room for IVShmemState.vectors entries */
  78} Peer;
  79
  80typedef struct MSIVector { /* per-vector interrupt delivery state of this device */
  81    PCIDevice *pdev; /* owning device; NULL while the vector has no handler/route */
  82    int virq; /* value returned by kvm_irqchip_add_msi_route() */
  83    bool unmasked; /* true between ivshmem_vector_unmask() and ivshmem_vector_mask() */
  84} MSIVector;
  85
  86typedef struct IVShmemState {
  87    /*< private >*/
  88    PCIDevice parent_obj;
  89    /*< public >*/
  90
  91    uint32_t features; /* bit mask indexed by IVSHMEM_IOEVENTFD / IVSHMEM_MSI */
  92
  93    /* exactly one of these two may be set */
  94    HostMemoryBackend *hostmem; /* without interrupts */
  95    CharBackend server_chr; /* with interrupts */
  96
  97    /* registers */
  98    uint32_t intrmask;
  99    uint32_t intrstatus; /* reset to 0 when the guest reads INTRSTATUS */
 100    int vm_id; /* our peer ID, taken from the server's ID message */
 101
 102    /* BARs */
 103    MemoryRegion ivshmem_mmio;  /* BAR 0 (registers) */
 104    MemoryRegion *ivshmem_bar2; /* BAR 2 (shared memory) */
 105    MemoryRegion server_bar2;   /* used with server_chr */
 106
 107    /* interrupt support */
 108    Peer *peers;
 109    int nb_peers;               /* space in @peers[] */
 110    uint32_t vectors;           /* entries per peer and in @msi_vectors */
 111    MSIVector *msi_vectors;     /* allocated in ivshmem_setup_interrupts() */
 112    uint64_t msg_buf;           /* buffer for receiving server messages */
 113    int msg_buffered_bytes;     /* #bytes in @msg_buf */
 114
 115    /* migration stuff */
 116    OnOffAuto master;           /* AUTO is resolved during realize */
 117    Error *migration_blocker;   /* registered when the device is not master */
 118} IVShmemState;
 119
 120/* registers for the Inter-VM shared memory device */
 121enum ivshmem_registers { /* byte offsets into BAR 0 */
 122    INTRMASK = 0, /* read/write, see ivshmem_IntrMask_read/write() */
 123    INTRSTATUS = 4, /* read clears it, see ivshmem_IntrStatus_read() */
 124    IVPOSITION = 8, /* reads return vm_id; writes are not handled */
 125    DOORBELL = 12, /* write (peer << 16) | vector to notify; reads return 0 */
 126};
 127
 128static inline uint32_t ivshmem_has_feature(IVShmemState *ivs,
 129                                                    unsigned int feature) {
 130    return (ivs->features & (1 << feature));
 131}
 132
 133static inline bool ivshmem_is_master(IVShmemState *s)
 134{
 135    assert(s->master != ON_OFF_AUTO_AUTO);
 136    return s->master == ON_OFF_AUTO_ON;
 137}
 138
 139static void ivshmem_update_irq(IVShmemState *s)
 140{
 141    PCIDevice *d = PCI_DEVICE(s);
 142    uint32_t isr = s->intrstatus & s->intrmask;
 143
 144    /*
 145     * Do nothing unless the device actually uses INTx.  Here's how
 146     * the device variants signal interrupts, what they put in PCI
 147     * config space:
 148     * Device variant    Interrupt  Interrupt Pin  MSI-X cap.
 149     * ivshmem-plain         none            0         no
 150     * ivshmem-doorbell     MSI-X            1        yes(1)
 151     * ivshmem,msi=off       INTx            1         no
 152     * ivshmem,msi=on       MSI-X            1(2)     yes(1)
 153     * (1) if guest enabled MSI-X
 154     * (2) the device lies
 155     * Leads to the condition for doing nothing:
 156     */
 157    if (ivshmem_has_feature(s, IVSHMEM_MSI)
 158        || !d->config[PCI_INTERRUPT_PIN]) {
 159        return;
 160    }
 161
 162    /* don't print ISR resets */
 163    if (isr) {
 164        IVSHMEM_DPRINTF("Set IRQ to %d (%04x %04x)\n",
 165                        isr ? 1 : 0, s->intrstatus, s->intrmask);
 166    }
 167
 168    pci_set_irq(d, isr != 0);
 169}
 170
 171static void ivshmem_IntrMask_write(IVShmemState *s, uint32_t val)
 172{
 173    IVSHMEM_DPRINTF("IntrMask write(w) val = 0x%04x\n", val);
 174
 175    s->intrmask = val;
 176    ivshmem_update_irq(s);
 177}
 178
 179static uint32_t ivshmem_IntrMask_read(IVShmemState *s)
 180{
 181    uint32_t ret = s->intrmask;
 182
 183    IVSHMEM_DPRINTF("intrmask read(w) val = 0x%04x\n", ret);
 184    return ret;
 185}
 186
 187static void ivshmem_IntrStatus_write(IVShmemState *s, uint32_t val)
 188{
 189    IVSHMEM_DPRINTF("IntrStatus write(w) val = 0x%04x\n", val);
 190
 191    s->intrstatus = val;
 192    ivshmem_update_irq(s);
 193}
 194
 195static uint32_t ivshmem_IntrStatus_read(IVShmemState *s)
 196{
 197    uint32_t ret = s->intrstatus;
 198
 199    /* reading ISR clears all interrupts */
 200    s->intrstatus = 0;
 201    ivshmem_update_irq(s);
 202    return ret;
 203}
 204
 205static void ivshmem_io_write(void *opaque, hwaddr addr,
 206                             uint64_t val, unsigned size)
 207{
 208    IVShmemState *s = opaque;
 209
 210    uint16_t dest = val >> 16;
 211    uint16_t vector = val & 0xff;
 212
 213    addr &= 0xfc;
 214
 215    IVSHMEM_DPRINTF("writing to addr " TARGET_FMT_plx "\n", addr);
 216    switch (addr)
 217    {
 218        case INTRMASK:
 219            ivshmem_IntrMask_write(s, val);
 220            break;
 221
 222        case INTRSTATUS:
 223            ivshmem_IntrStatus_write(s, val);
 224            break;
 225
 226        case DOORBELL:
 227            /* check that dest VM ID is reasonable */
 228            if (dest >= s->nb_peers) {
 229                IVSHMEM_DPRINTF("Invalid destination VM ID (%d)\n", dest);
 230                break;
 231            }
 232
 233            /* check doorbell range */
 234            if (vector < s->peers[dest].nb_eventfds) {
 235                IVSHMEM_DPRINTF("Notifying VM %d on vector %d\n", dest, vector);
 236                event_notifier_set(&s->peers[dest].eventfds[vector]);
 237            } else {
 238                IVSHMEM_DPRINTF("Invalid destination vector %d on VM %d\n",
 239                                vector, dest);
 240            }
 241            break;
 242        default:
 243            IVSHMEM_DPRINTF("Unhandled write " TARGET_FMT_plx "\n", addr);
 244    }
 245}
 246
 247static uint64_t ivshmem_io_read(void *opaque, hwaddr addr,
 248                                unsigned size)
 249{
 250
 251    IVShmemState *s = opaque;
 252    uint32_t ret;
 253
 254    switch (addr)
 255    {
 256        case INTRMASK:
 257            ret = ivshmem_IntrMask_read(s);
 258            break;
 259
 260        case INTRSTATUS:
 261            ret = ivshmem_IntrStatus_read(s);
 262            break;
 263
 264        case IVPOSITION:
 265            ret = s->vm_id;
 266            break;
 267
 268        default:
 269            IVSHMEM_DPRINTF("why are we reading " TARGET_FMT_plx "\n", addr);
 270            ret = 0;
 271    }
 272
 273    return ret;
 274}
 275
 276static const MemoryRegionOps ivshmem_mmio_ops = { /* callbacks of the BAR 0 register region */
 277    .read = ivshmem_io_read,
 278    .write = ivshmem_io_write,
 279    .endianness = DEVICE_NATIVE_ENDIAN,
 280    .impl = { /* the callbacks are written for 4-byte accesses */
 281        .min_access_size = 4,
 282        .max_access_size = 4,
 283    },
 284};
 285
 286static void ivshmem_vector_notify(void *opaque)
 287{
 288    MSIVector *entry = opaque;
 289    PCIDevice *pdev = entry->pdev;
 290    IVShmemState *s = IVSHMEM_COMMON(pdev);
 291    int vector = entry - s->msi_vectors;
 292    EventNotifier *n = &s->peers[s->vm_id].eventfds[vector];
 293
 294    if (!event_notifier_test_and_clear(n)) {
 295        return;
 296    }
 297
 298    IVSHMEM_DPRINTF("interrupt on vector %p %d\n", pdev, vector);
 299    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
 300        if (msix_enabled(pdev)) {
 301            msix_notify(pdev, vector);
 302        }
 303    } else {
 304        ivshmem_IntrStatus_write(s, 1);
 305    }
 306}
 307
 308static int ivshmem_vector_unmask(PCIDevice *dev, unsigned vector,
 309                                 MSIMessage msg)
 310{
 311    IVShmemState *s = IVSHMEM_COMMON(dev);
 312    EventNotifier *n = &s->peers[s->vm_id].eventfds[vector];
 313    MSIVector *v = &s->msi_vectors[vector];
 314    int ret;
 315
 316    IVSHMEM_DPRINTF("vector unmask %p %d\n", dev, vector);
 317    if (!v->pdev) {
 318        error_report("ivshmem: vector %d route does not exist", vector);
 319        return -EINVAL;
 320    }
 321    assert(!v->unmasked);
 322
 323    ret = kvm_irqchip_update_msi_route(kvm_state, v->virq, msg, dev);
 324    if (ret < 0) {
 325        return ret;
 326    }
 327    kvm_irqchip_commit_routes(kvm_state);
 328
 329    ret = kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, n, NULL, v->virq);
 330    if (ret < 0) {
 331        return ret;
 332    }
 333    v->unmasked = true;
 334
 335    return 0;
 336}
 337
 338static void ivshmem_vector_mask(PCIDevice *dev, unsigned vector)
 339{
 340    IVShmemState *s = IVSHMEM_COMMON(dev);
 341    EventNotifier *n = &s->peers[s->vm_id].eventfds[vector];
 342    MSIVector *v = &s->msi_vectors[vector];
 343    int ret;
 344
 345    IVSHMEM_DPRINTF("vector mask %p %d\n", dev, vector);
 346    if (!v->pdev) {
 347        error_report("ivshmem: vector %d route does not exist", vector);
 348        return;
 349    }
 350    assert(v->unmasked);
 351
 352    ret = kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, n, v->virq);
 353    if (ret < 0) {
 354        error_report("remove_irqfd_notifier_gsi failed");
 355        return;
 356    }
 357    v->unmasked = false;
 358}
 359
 360static void ivshmem_vector_poll(PCIDevice *dev,
 361                                unsigned int vector_start,
 362                                unsigned int vector_end)
 363{
 364    IVShmemState *s = IVSHMEM_COMMON(dev);
 365    unsigned int vector;
 366
 367    IVSHMEM_DPRINTF("vector poll %p %d-%d\n", dev, vector_start, vector_end);
 368
 369    vector_end = MIN(vector_end, s->vectors);
 370
 371    for (vector = vector_start; vector < vector_end; vector++) {
 372        EventNotifier *notifier = &s->peers[s->vm_id].eventfds[vector];
 373
 374        if (!msix_is_masked(dev, vector)) {
 375            continue;
 376        }
 377
 378        if (event_notifier_test_and_clear(notifier)) {
 379            msix_set_pending(dev, vector);
 380        }
 381    }
 382}
 383
 384static void watch_vector_notifier(IVShmemState *s, EventNotifier *n,
 385                                 int vector)
 386{
 387    int eventfd = event_notifier_get_fd(n);
 388
 389    assert(!s->msi_vectors[vector].pdev);
 390    s->msi_vectors[vector].pdev = PCI_DEVICE(s);
 391
 392    qemu_set_fd_handler(eventfd, ivshmem_vector_notify,
 393                        NULL, &s->msi_vectors[vector]);
 394}
 395
 396static void ivshmem_add_eventfd(IVShmemState *s, int posn, int i)
 397{
 398    memory_region_add_eventfd(&s->ivshmem_mmio,
 399                              DOORBELL,
 400                              4,
 401                              true,
 402                              (posn << 16) | i,
 403                              &s->peers[posn].eventfds[i]);
 404}
 405
 406static void ivshmem_del_eventfd(IVShmemState *s, int posn, int i)
 407{
 408    memory_region_del_eventfd(&s->ivshmem_mmio,
 409                              DOORBELL,
 410                              4,
 411                              true,
 412                              (posn << 16) | i,
 413                              &s->peers[posn].eventfds[i]);
 414}
 415
 416static void close_peer_eventfds(IVShmemState *s, int posn)
 417{
 418    int i, n;
 419
 420    assert(posn >= 0 && posn < s->nb_peers);
 421    n = s->peers[posn].nb_eventfds;
 422
 423    if (ivshmem_has_feature(s, IVSHMEM_IOEVENTFD)) {
 424        memory_region_transaction_begin();
 425        for (i = 0; i < n; i++) {
 426            ivshmem_del_eventfd(s, posn, i);
 427        }
 428        memory_region_transaction_commit();
 429    }
 430
 431    for (i = 0; i < n; i++) {
 432        event_notifier_cleanup(&s->peers[posn].eventfds[i]);
 433    }
 434
 435    g_free(s->peers[posn].eventfds);
 436    s->peers[posn].nb_eventfds = 0;
 437}
 438
 439static void resize_peers(IVShmemState *s, int nb_peers)
 440{
 441    int old_nb_peers = s->nb_peers;
 442    int i;
 443
 444    assert(nb_peers > old_nb_peers);
 445    IVSHMEM_DPRINTF("bumping storage to %d peers\n", nb_peers);
 446
 447    s->peers = g_realloc(s->peers, nb_peers * sizeof(Peer));
 448    s->nb_peers = nb_peers;
 449
 450    for (i = old_nb_peers; i < nb_peers; i++) {
 451        s->peers[i].eventfds = g_new0(EventNotifier, s->vectors);
 452        s->peers[i].nb_eventfds = 0;
 453    }
 454}
 455
 456static void ivshmem_add_kvm_msi_virq(IVShmemState *s, int vector,
 457                                     Error **errp)
 458{
 459    PCIDevice *pdev = PCI_DEVICE(s);
 460    int ret;
 461
 462    IVSHMEM_DPRINTF("ivshmem_add_kvm_msi_virq vector:%d\n", vector);
 463    assert(!s->msi_vectors[vector].pdev);
 464
 465    ret = kvm_irqchip_add_msi_route(kvm_state, vector, pdev);
 466    if (ret < 0) {
 467        error_setg(errp, "kvm_irqchip_add_msi_route failed");
 468        return;
 469    }
 470
 471    s->msi_vectors[vector].virq = ret;
 472    s->msi_vectors[vector].pdev = pdev;
 473}
 474
 475static void setup_interrupt(IVShmemState *s, int vector, Error **errp)
 476{
 477    EventNotifier *n = &s->peers[s->vm_id].eventfds[vector];
 478    bool with_irqfd = kvm_msi_via_irqfd_enabled() &&
 479        ivshmem_has_feature(s, IVSHMEM_MSI);
 480    PCIDevice *pdev = PCI_DEVICE(s);
 481    Error *err = NULL;
 482
 483    IVSHMEM_DPRINTF("setting up interrupt for vector: %d\n", vector);
 484
 485    if (!with_irqfd) {
 486        IVSHMEM_DPRINTF("with eventfd\n");
 487        watch_vector_notifier(s, n, vector);
 488    } else if (msix_enabled(pdev)) {
 489        IVSHMEM_DPRINTF("with irqfd\n");
 490        ivshmem_add_kvm_msi_virq(s, vector, &err);
 491        if (err) {
 492            error_propagate(errp, err);
 493            return;
 494        }
 495
 496        if (!msix_is_masked(pdev, vector)) {
 497            kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, n, NULL,
 498                                               s->msi_vectors[vector].virq);
 499            /* TODO handle error */
 500        }
 501    } else {
 502        /* it will be delayed until msix is enabled, in write_config */
 503        IVSHMEM_DPRINTF("with irqfd, delayed until msix enabled\n");
 504    }
 505}
 506
 507static void process_msg_shmem(IVShmemState *s, int fd, Error **errp)
 508{
 509    Error *local_err = NULL;
 510    struct stat buf;
 511    size_t size;
 512
 513    if (s->ivshmem_bar2) {
 514        error_setg(errp, "server sent unexpected shared memory message");
 515        close(fd);
 516        return;
 517    }
 518
 519    if (fstat(fd, &buf) < 0) {
 520        error_setg_errno(errp, errno,
 521            "can't determine size of shared memory sent by server");
 522        close(fd);
 523        return;
 524    }
 525
 526    size = buf.st_size;
 527
 528    /* mmap the region and map into the BAR2 */
 529    memory_region_init_ram_from_fd(&s->server_bar2, OBJECT(s),
 530                                   "ivshmem.bar2", size, true, fd, &local_err);
 531    if (local_err) {
 532        error_propagate(errp, local_err);
 533        return;
 534    }
 535
 536    s->ivshmem_bar2 = &s->server_bar2;
 537}
 538
 539static void process_msg_disconnect(IVShmemState *s, uint16_t posn,
 540                                   Error **errp)
 541{
 542    IVSHMEM_DPRINTF("posn %d has gone away\n", posn);
 543    if (posn >= s->nb_peers || posn == s->vm_id) {
 544        error_setg(errp, "invalid peer %d", posn);
 545        return;
 546    }
 547    close_peer_eventfds(s, posn);
 548}
 549
 550static void process_msg_connect(IVShmemState *s, uint16_t posn, int fd,
 551                                Error **errp)
 552{
 553    Peer *peer = &s->peers[posn];
 554    int vector;
 555
 556    /*
 557     * The N-th connect message for this peer comes with the file
 558     * descriptor for vector N-1.  Count messages to find the vector.
 559     */
 560    if (peer->nb_eventfds >= s->vectors) {
 561        error_setg(errp, "Too many eventfd received, device has %d vectors",
 562                   s->vectors);
 563        close(fd);
 564        return;
 565    }
 566    vector = peer->nb_eventfds++;
 567
 568    IVSHMEM_DPRINTF("eventfds[%d][%d] = %d\n", posn, vector, fd);
 569    event_notifier_init_fd(&peer->eventfds[vector], fd);
 570    fcntl_setfl(fd, O_NONBLOCK); /* msix/irqfd poll non block */
 571
 572    if (posn == s->vm_id) {
 573        setup_interrupt(s, vector, errp);
 574        /* TODO do we need to handle the error? */
 575    }
 576
 577    if (ivshmem_has_feature(s, IVSHMEM_IOEVENTFD)) {
 578        ivshmem_add_eventfd(s, posn, vector);
 579    }
 580}
 581
 582static void process_msg(IVShmemState *s, int64_t msg, int fd, Error **errp)
 583{
 584    IVSHMEM_DPRINTF("posn is %" PRId64 ", fd is %d\n", msg, fd);
 585
 586    if (msg < -1 || msg > IVSHMEM_MAX_PEERS) {
 587        error_setg(errp, "server sent invalid message %" PRId64, msg);
 588        close(fd);
 589        return;
 590    }
 591
 592    if (msg == -1) {
 593        process_msg_shmem(s, fd, errp);
 594        return;
 595    }
 596
 597    if (msg >= s->nb_peers) {
 598        resize_peers(s, msg + 1);
 599    }
 600
 601    if (fd >= 0) {
 602        process_msg_connect(s, msg, fd, errp);
 603    } else {
 604        process_msg_disconnect(s, msg, errp);
 605    }
 606}
 607
 608static int ivshmem_can_receive(void *opaque)
 609{
 610    IVShmemState *s = opaque;
 611
 612    assert(s->msg_buffered_bytes < sizeof(s->msg_buf));
 613    return sizeof(s->msg_buf) - s->msg_buffered_bytes;
 614}
 615
 616static void ivshmem_read(void *opaque, const uint8_t *buf, int size)
 617{
 618    IVShmemState *s = opaque;
 619    Error *err = NULL;
 620    int fd;
 621    int64_t msg;
 622
 623    assert(size >= 0 && s->msg_buffered_bytes + size <= sizeof(s->msg_buf));
 624    memcpy((unsigned char *)&s->msg_buf + s->msg_buffered_bytes, buf, size);
 625    s->msg_buffered_bytes += size;
 626    if (s->msg_buffered_bytes < sizeof(s->msg_buf)) {
 627        return;
 628    }
 629    msg = le64_to_cpu(s->msg_buf);
 630    s->msg_buffered_bytes = 0;
 631
 632    fd = qemu_chr_fe_get_msgfd(&s->server_chr);
 633
 634    process_msg(s, msg, fd, &err);
 635    if (err) {
 636        error_report_err(err);
 637    }
 638}
 639
 640static int64_t ivshmem_recv_msg(IVShmemState *s, int *pfd, Error **errp)
 641{
 642    int64_t msg;
 643    int n, ret;
 644
 645    n = 0;
 646    do {
 647        ret = qemu_chr_fe_read_all(&s->server_chr, (uint8_t *)&msg + n,
 648                                   sizeof(msg) - n);
 649        if (ret < 0) {
 650            if (ret == -EINTR) {
 651                continue;
 652            }
 653            error_setg_errno(errp, -ret, "read from server failed");
 654            return INT64_MIN;
 655        }
 656        n += ret;
 657    } while (n < sizeof(msg));
 658
 659    *pfd = qemu_chr_fe_get_msgfd(&s->server_chr);
 660    return le64_to_cpu(msg);
 661}
 662
 663static void ivshmem_recv_setup(IVShmemState *s, Error **errp)
 664{
 665    Error *err = NULL;
 666    int64_t msg;
 667    int fd;
 668
 669    msg = ivshmem_recv_msg(s, &fd, &err);
 670    if (err) {
 671        error_propagate(errp, err);
 672        return;
 673    }
 674    if (msg != IVSHMEM_PROTOCOL_VERSION) {
 675        error_setg(errp, "server sent version %" PRId64 ", expecting %d",
 676                   msg, IVSHMEM_PROTOCOL_VERSION);
 677        return;
 678    }
 679    if (fd != -1) {
 680        error_setg(errp, "server sent invalid version message");
 681        return;
 682    }
 683
 684    /*
 685     * ivshmem-server sends the remaining initial messages in a fixed
 686     * order, but the device has always accepted them in any order.
 687     * Stay as compatible as practical, just in case people use
 688     * servers that behave differently.
 689     */
 690
 691    /*
 692     * ivshmem_device_spec.txt has always required the ID message
 693     * right here, and ivshmem-server has always complied.  However,
 694     * older versions of the device accepted it out of order, but
 695     * broke when an interrupt setup message arrived before it.
 696     */
 697    msg = ivshmem_recv_msg(s, &fd, &err);
 698    if (err) {
 699        error_propagate(errp, err);
 700        return;
 701    }
 702    if (fd != -1 || msg < 0 || msg > IVSHMEM_MAX_PEERS) {
 703        error_setg(errp, "server sent invalid ID message");
 704        return;
 705    }
 706    s->vm_id = msg;
 707
 708    /*
 709     * Receive more messages until we got shared memory.
 710     */
 711    do {
 712        msg = ivshmem_recv_msg(s, &fd, &err);
 713        if (err) {
 714            error_propagate(errp, err);
 715            return;
 716        }
 717        process_msg(s, msg, fd, &err);
 718        if (err) {
 719            error_propagate(errp, err);
 720            return;
 721        }
 722    } while (msg != -1);
 723
 724    /*
 725     * This function must either map the shared memory or fail.  The
 726     * loop above ensures that: it terminates normally only after it
 727     * successfully processed the server's shared memory message.
 728     * Assert that actually mapped the shared memory:
 729     */
 730    assert(s->ivshmem_bar2);
 731}
 732
 733/* Select the MSI-X vectors used by device.
 734 * ivshmem maps events to vectors statically, so
 735 * we just enable all vectors on init and after reset. */
 736static void ivshmem_msix_vector_use(IVShmemState *s)
 737{
 738    PCIDevice *d = PCI_DEVICE(s);
 739    int i;
 740
 741    for (i = 0; i < s->vectors; i++) {
 742        msix_vector_use(d, i);
 743    }
 744}
 745
 746static void ivshmem_disable_irqfd(IVShmemState *s);
 747
 748static void ivshmem_reset(DeviceState *d)
 749{
 750    IVShmemState *s = IVSHMEM_COMMON(d);
 751
 752    ivshmem_disable_irqfd(s);
 753
 754    s->intrstatus = 0;
 755    s->intrmask = 0;
 756    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
 757        ivshmem_msix_vector_use(s);
 758    }
 759}
 760
 761static int ivshmem_setup_interrupts(IVShmemState *s, Error **errp)
 762{
 763    /* allocate QEMU callback data for receiving interrupts */
 764    s->msi_vectors = g_malloc0(s->vectors * sizeof(MSIVector));
 765
 766    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
 767        if (msix_init_exclusive_bar(PCI_DEVICE(s), s->vectors, 1, errp)) {
 768            return -1;
 769        }
 770
 771        IVSHMEM_DPRINTF("msix initialized (%d vectors)\n", s->vectors);
 772        ivshmem_msix_vector_use(s);
 773    }
 774
 775    return 0;
 776}
 777
 778static void ivshmem_remove_kvm_msi_virq(IVShmemState *s, int vector)
 779{
 780    IVSHMEM_DPRINTF("ivshmem_remove_kvm_msi_virq vector:%d\n", vector);
 781
 782    if (s->msi_vectors[vector].pdev == NULL) {
 783        return;
 784    }
 785
 786    /* it was cleaned when masked in the frontend. */
 787    kvm_irqchip_release_virq(kvm_state, s->msi_vectors[vector].virq);
 788
 789    s->msi_vectors[vector].pdev = NULL;
 790}
 791
 792static void ivshmem_enable_irqfd(IVShmemState *s)
 793{
 794    PCIDevice *pdev = PCI_DEVICE(s);
 795    int i;
 796
 797    for (i = 0; i < s->peers[s->vm_id].nb_eventfds; i++) {
 798        Error *err = NULL;
 799
 800        ivshmem_add_kvm_msi_virq(s, i, &err);
 801        if (err) {
 802            error_report_err(err);
 803            goto undo;
 804        }
 805    }
 806
 807    if (msix_set_vector_notifiers(pdev,
 808                                  ivshmem_vector_unmask,
 809                                  ivshmem_vector_mask,
 810                                  ivshmem_vector_poll)) {
 811        error_report("ivshmem: msix_set_vector_notifiers failed");
 812        goto undo;
 813    }
 814    return;
 815
 816undo:
 817    while (--i >= 0) {
 818        ivshmem_remove_kvm_msi_virq(s, i);
 819    }
 820}
 821
 822static void ivshmem_disable_irqfd(IVShmemState *s)
 823{
 824    PCIDevice *pdev = PCI_DEVICE(s);
 825    int i;
 826
 827    if (!pdev->msix_vector_use_notifier) {
 828        return;
 829    }
 830
 831    msix_unset_vector_notifiers(pdev);
 832
 833    for (i = 0; i < s->peers[s->vm_id].nb_eventfds; i++) {
 834        /*
 835         * MSI-X is already disabled here so msix_unset_vector_notifiers()
 836         * didn't call our release notifier.  Do it now to keep our masks and
 837         * unmasks balanced.
 838         */
 839        if (s->msi_vectors[i].unmasked) {
 840            ivshmem_vector_mask(pdev, i);
 841        }
 842        ivshmem_remove_kvm_msi_virq(s, i);
 843    }
 844
 845}
 846
 847static void ivshmem_write_config(PCIDevice *pdev, uint32_t address,
 848                                 uint32_t val, int len)
 849{
 850    IVShmemState *s = IVSHMEM_COMMON(pdev);
 851    int is_enabled, was_enabled = msix_enabled(pdev);
 852
 853    pci_default_write_config(pdev, address, val, len);
 854    is_enabled = msix_enabled(pdev);
 855
 856    if (kvm_msi_via_irqfd_enabled()) {
 857        if (!was_enabled && is_enabled) {
 858            ivshmem_enable_irqfd(s);
 859        } else if (was_enabled && !is_enabled) {
 860            ivshmem_disable_irqfd(s);
 861        }
 862    }
 863}
 864
 865static void ivshmem_common_realize(PCIDevice *dev, Error **errp)
 866{
 867    IVShmemState *s = IVSHMEM_COMMON(dev);
 868    Error *err = NULL;
 869    uint8_t *pci_conf;
 870    Error *local_err = NULL;
 871
 872    /* IRQFD requires MSI */
 873    if (ivshmem_has_feature(s, IVSHMEM_IOEVENTFD) &&
 874        !ivshmem_has_feature(s, IVSHMEM_MSI)) {
 875        error_setg(errp, "ioeventfd/irqfd requires MSI");
 876        return;
 877    }
 878
 879    pci_conf = dev->config;
 880    pci_conf[PCI_COMMAND] = PCI_COMMAND_IO | PCI_COMMAND_MEMORY;
 881
 882    memory_region_init_io(&s->ivshmem_mmio, OBJECT(s), &ivshmem_mmio_ops, s,
 883                          "ivshmem-mmio", IVSHMEM_REG_BAR_SIZE);
 884
 885    /* region for registers*/
 886    pci_register_bar(dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY,
 887                     &s->ivshmem_mmio);
 888
 889    if (s->hostmem != NULL) {
 890        IVSHMEM_DPRINTF("using hostmem\n");
 891
 892        s->ivshmem_bar2 = host_memory_backend_get_memory(s->hostmem);
 893        host_memory_backend_set_mapped(s->hostmem, true);
 894    } else {
 895        Chardev *chr = qemu_chr_fe_get_driver(&s->server_chr);
 896        assert(chr);
 897
 898        IVSHMEM_DPRINTF("using shared memory server (socket = %s)\n",
 899                        chr->filename);
 900
 901        /* we allocate enough space for 16 peers and grow as needed */
 902        resize_peers(s, 16);
 903
 904        /*
 905         * Receive setup messages from server synchronously.
 906         * Older versions did it asynchronously, but that creates a
 907         * number of entertaining race conditions.
 908         */
 909        ivshmem_recv_setup(s, &err);
 910        if (err) {
 911            error_propagate(errp, err);
 912            return;
 913        }
 914
 915        if (s->master == ON_OFF_AUTO_ON && s->vm_id != 0) {
 916            error_setg(errp,
 917                       "master must connect to the server before any peers");
 918            return;
 919        }
 920
 921        qemu_chr_fe_set_handlers(&s->server_chr, ivshmem_can_receive,
 922                                 ivshmem_read, NULL, NULL, s, NULL, true);
 923
 924        if (ivshmem_setup_interrupts(s, errp) < 0) {
 925            error_prepend(errp, "Failed to initialize interrupts: ");
 926            return;
 927        }
 928    }
 929
 930    if (s->master == ON_OFF_AUTO_AUTO) {
 931        s->master = s->vm_id == 0 ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
 932    }
 933
 934    if (!ivshmem_is_master(s)) {
 935        error_setg(&s->migration_blocker,
 936                   "Migration is disabled when using feature 'peer mode' in device 'ivshmem'");
 937        migrate_add_blocker(s->migration_blocker, &local_err);
 938        if (local_err) {
 939            error_propagate(errp, local_err);
 940            error_free(s->migration_blocker);
 941            return;
 942        }
 943    }
 944
 945    vmstate_register_ram(s->ivshmem_bar2, DEVICE(s));
 946    pci_register_bar(PCI_DEVICE(s), 2,
 947                     PCI_BASE_ADDRESS_SPACE_MEMORY |
 948                     PCI_BASE_ADDRESS_MEM_PREFETCH |
 949                     PCI_BASE_ADDRESS_MEM_TYPE_64,
 950                     s->ivshmem_bar2);
 951}
 952
 953static void ivshmem_exit(PCIDevice *dev)
 954{
 955    IVShmemState *s = IVSHMEM_COMMON(dev);
 956    int i;
 957
 958    if (s->migration_blocker) {
 959        migrate_del_blocker(s->migration_blocker);
 960        error_free(s->migration_blocker);
 961    }
 962
 963    if (memory_region_is_mapped(s->ivshmem_bar2)) {
 964        if (!s->hostmem) {
 965            void *addr = memory_region_get_ram_ptr(s->ivshmem_bar2);
 966            int fd;
 967
 968            if (munmap(addr, memory_region_size(s->ivshmem_bar2) == -1)) {
 969                error_report("Failed to munmap shared memory %s",
 970                             strerror(errno));
 971            }
 972
 973            fd = memory_region_get_fd(s->ivshmem_bar2);
 974            close(fd);
 975        }
 976
 977        vmstate_unregister_ram(s->ivshmem_bar2, DEVICE(dev));
 978    }
 979
 980    if (s->hostmem) {
 981        host_memory_backend_set_mapped(s->hostmem, false);
 982    }
 983
 984    if (s->peers) {
 985        for (i = 0; i < s->nb_peers; i++) {
 986            close_peer_eventfds(s, i);
 987        }
 988        g_free(s->peers);
 989    }
 990
 991    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
 992        msix_uninit_exclusive_bar(dev);
 993    }
 994
 995    g_free(s->msi_vectors);
 996}
 997
 998static int ivshmem_pre_load(void *opaque)
 999{
1000    IVShmemState *s = opaque;
1001
1002    if (!ivshmem_is_master(s)) {
1003        error_report("'peer' devices are not migratable");
1004        return -EINVAL;
1005    }
1006
1007    return 0;
1008}
1009
1010static int ivshmem_post_load(void *opaque, int version_id)
1011{
1012    IVShmemState *s = opaque;
1013
1014    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
1015        ivshmem_msix_vector_use(s);
1016    }
1017    return 0;
1018}
1019
1020static void ivshmem_common_class_init(ObjectClass *klass, void *data)
1021{
1022    DeviceClass *dc = DEVICE_CLASS(klass);
1023    PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
1024
1025    k->realize = ivshmem_common_realize;
1026    k->exit = ivshmem_exit;
1027    k->config_write = ivshmem_write_config;
1028    k->vendor_id = PCI_VENDOR_ID_IVSHMEM;
1029    k->device_id = PCI_DEVICE_ID_IVSHMEM;
1030    k->class_id = PCI_CLASS_MEMORY_RAM;
1031    k->revision = 1;
1032    dc->reset = ivshmem_reset;
1033    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
1034    dc->desc = "Inter-VM shared memory";
1035}
1036
1037static const TypeInfo ivshmem_common_info = {
1038    .name          = TYPE_IVSHMEM_COMMON,
1039    .parent        = TYPE_PCI_DEVICE,
1040    .instance_size = sizeof(IVShmemState),
1041    .abstract      = true,
1042    .class_init    = ivshmem_common_class_init,
1043    .interfaces = (InterfaceInfo[]) {
1044        { INTERFACE_CONVENTIONAL_PCI_DEVICE },
1045        { },
1046    },
1047};
1048
1049static const VMStateDescription ivshmem_plain_vmsd = {
1050    .name = TYPE_IVSHMEM_PLAIN,
1051    .version_id = 0,
1052    .minimum_version_id = 0,
1053    .pre_load = ivshmem_pre_load,
1054    .post_load = ivshmem_post_load,
1055    .fields = (VMStateField[]) {
1056        VMSTATE_PCI_DEVICE(parent_obj, IVShmemState),
1057        VMSTATE_UINT32(intrstatus, IVShmemState),
1058        VMSTATE_UINT32(intrmask, IVShmemState),
1059        VMSTATE_END_OF_LIST()
1060    },
1061};
1062
1063static Property ivshmem_plain_properties[] = {
1064    DEFINE_PROP_ON_OFF_AUTO("master", IVShmemState, master, ON_OFF_AUTO_OFF),
1065    DEFINE_PROP_LINK("memdev", IVShmemState, hostmem, TYPE_MEMORY_BACKEND,
1066                     HostMemoryBackend *),
1067    DEFINE_PROP_END_OF_LIST(),
1068};
1069
1070static void ivshmem_plain_realize(PCIDevice *dev, Error **errp)
1071{
1072    IVShmemState *s = IVSHMEM_COMMON(dev);
1073
1074    if (!s->hostmem) {
1075        error_setg(errp, "You must specify a 'memdev'");
1076        return;
1077    } else if (host_memory_backend_is_mapped(s->hostmem)) {
1078        char *path = object_get_canonical_path_component(OBJECT(s->hostmem));
1079        error_setg(errp, "can't use already busy memdev: %s", path);
1080        g_free(path);
1081        return;
1082    }
1083
1084    ivshmem_common_realize(dev, errp);
1085}
1086
1087static void ivshmem_plain_class_init(ObjectClass *klass, void *data)
1088{
1089    DeviceClass *dc = DEVICE_CLASS(klass);
1090    PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
1091
1092    k->realize = ivshmem_plain_realize;
1093    dc->props = ivshmem_plain_properties;
1094    dc->vmsd = &ivshmem_plain_vmsd;
1095}
1096
1097static const TypeInfo ivshmem_plain_info = {
1098    .name          = TYPE_IVSHMEM_PLAIN,
1099    .parent        = TYPE_IVSHMEM_COMMON,
1100    .instance_size = sizeof(IVShmemState),
1101    .class_init    = ivshmem_plain_class_init,
1102};
1103
1104static const VMStateDescription ivshmem_doorbell_vmsd = {
1105    .name = TYPE_IVSHMEM_DOORBELL,
1106    .version_id = 0,
1107    .minimum_version_id = 0,
1108    .pre_load = ivshmem_pre_load,
1109    .post_load = ivshmem_post_load,
1110    .fields = (VMStateField[]) {
1111        VMSTATE_PCI_DEVICE(parent_obj, IVShmemState),
1112        VMSTATE_MSIX(parent_obj, IVShmemState),
1113        VMSTATE_UINT32(intrstatus, IVShmemState),
1114        VMSTATE_UINT32(intrmask, IVShmemState),
1115        VMSTATE_END_OF_LIST()
1116    },
1117};
1118
1119static Property ivshmem_doorbell_properties[] = {
1120    DEFINE_PROP_CHR("chardev", IVShmemState, server_chr),
1121    DEFINE_PROP_UINT32("vectors", IVShmemState, vectors, 1),
1122    DEFINE_PROP_BIT("ioeventfd", IVShmemState, features, IVSHMEM_IOEVENTFD,
1123                    true),
1124    DEFINE_PROP_ON_OFF_AUTO("master", IVShmemState, master, ON_OFF_AUTO_OFF),
1125    DEFINE_PROP_END_OF_LIST(),
1126};
1127
1128static void ivshmem_doorbell_init(Object *obj)
1129{
1130    IVShmemState *s = IVSHMEM_DOORBELL(obj);
1131
1132    s->features |= (1 << IVSHMEM_MSI);
1133}
1134
1135static void ivshmem_doorbell_realize(PCIDevice *dev, Error **errp)
1136{
1137    IVShmemState *s = IVSHMEM_COMMON(dev);
1138
1139    if (!qemu_chr_fe_backend_connected(&s->server_chr)) {
1140        error_setg(errp, "You must specify a 'chardev'");
1141        return;
1142    }
1143
1144    ivshmem_common_realize(dev, errp);
1145}
1146
1147static void ivshmem_doorbell_class_init(ObjectClass *klass, void *data)
1148{
1149    DeviceClass *dc = DEVICE_CLASS(klass);
1150    PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
1151
1152    k->realize = ivshmem_doorbell_realize;
1153    dc->props = ivshmem_doorbell_properties;
1154    dc->vmsd = &ivshmem_doorbell_vmsd;
1155}
1156
1157static const TypeInfo ivshmem_doorbell_info = {
1158    .name          = TYPE_IVSHMEM_DOORBELL,
1159    .parent        = TYPE_IVSHMEM_COMMON,
1160    .instance_size = sizeof(IVShmemState),
1161    .instance_init = ivshmem_doorbell_init,
1162    .class_init    = ivshmem_doorbell_class_init,
1163};
1164
1165static void ivshmem_register_types(void)
1166{
1167    type_register_static(&ivshmem_common_info);
1168    type_register_static(&ivshmem_plain_info);
1169    type_register_static(&ivshmem_doorbell_info);
1170}
1171
1172type_init(ivshmem_register_types)
1173