linux/virt/kvm/eventfd.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * kvm eventfd support - use eventfd objects to signal various KVM events
 *
 * Copyright 2009 Novell.  All Rights Reserved.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Author:
 *      Gregory Haskins <ghaskins@novell.com>
 */

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/kvm_irqfd.h>
#include <linux/workqueue.h>
#include <linux/syscalls.h>
#include <linux/wait.h>
#include <linux/poll.h>
#include <linux/file.h>
#include <linux/list.h>
#include <linux/eventfd.h>
#include <linux/kernel.h>
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/seqlock.h>
#include <linux/irqbypass.h>
#include <trace/events/kvm.h>

#include <kvm/iodev.h>

#ifdef CONFIG_HAVE_KVM_IRQFD

static struct workqueue_struct *irqfd_cleanup_wq;

bool __attribute__((weak))
kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args)
{
        return true;
}

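/*
 * Two trigger models are implemented below: a plain irqfd raises and
 * immediately lowers its GSI, emulating an edge-triggered interrupt, while
 * a resampled irqfd only asserts the line and leaves it asserted until the
 * guest acks the interrupt and irqfd_resampler_ack() de-asserts it and
 * signals the resample eventfd (level-triggered semantics).
 */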
static void
irqfd_inject(struct work_struct *work)
{
        struct kvm_kernel_irqfd *irqfd =
                container_of(work, struct kvm_kernel_irqfd, inject);
        struct kvm *kvm = irqfd->kvm;

        if (!irqfd->resampler) {
                kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1,
                                false);
                kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0,
                                false);
        } else
                kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
                            irqfd->gsi, 1, false);
}

/*
 * Since resampler irqfds share an IRQ source ID, we de-assert once
 * then notify all of the resampler irqfds using this GSI.  We can't
 * do multiple de-asserts or we risk racing with incoming re-asserts.
 */
static void
irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
{
        struct kvm_kernel_irqfd_resampler *resampler;
        struct kvm *kvm;
        struct kvm_kernel_irqfd *irqfd;
        int idx;

        resampler = container_of(kian,
                        struct kvm_kernel_irqfd_resampler, notifier);
        kvm = resampler->kvm;

        kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
                    resampler->notifier.gsi, 0, false);

        idx = srcu_read_lock(&kvm->irq_srcu);

        list_for_each_entry_rcu(irqfd, &resampler->list, resampler_link)
                eventfd_signal(irqfd->resamplefd, 1);

        srcu_read_unlock(&kvm->irq_srcu, idx);
}

static void
irqfd_resampler_shutdown(struct kvm_kernel_irqfd *irqfd)
{
        struct kvm_kernel_irqfd_resampler *resampler = irqfd->resampler;
        struct kvm *kvm = resampler->kvm;

        mutex_lock(&kvm->irqfds.resampler_lock);

        list_del_rcu(&irqfd->resampler_link);
        synchronize_srcu(&kvm->irq_srcu);

        if (list_empty(&resampler->list)) {
                list_del(&resampler->link);
                kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier);
                kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
                            resampler->notifier.gsi, 0, false);
                kfree(resampler);
        }

        mutex_unlock(&kvm->irqfds.resampler_lock);
}

/*
 * Race-free decouple logic (ordering is critical): unhook from the
 * eventfd's wait-queue first, then flush any in-flight injection work,
 * and only then drop the resampler and eventfd references.
 */
static void
irqfd_shutdown(struct work_struct *work)
{
        struct kvm_kernel_irqfd *irqfd =
                container_of(work, struct kvm_kernel_irqfd, shutdown);
        struct kvm *kvm = irqfd->kvm;
        u64 cnt;
        /* Make sure irqfd has been initialized in assign path. */
        synchronize_srcu(&kvm->irq_srcu);

        /*
         * Synchronize with the wait-queue and unhook ourselves to prevent
         * further events.
         */
        eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt);

        /*
         * We know no new events will be scheduled at this point, so block
         * until all previously outstanding events have completed
         */
        flush_work(&irqfd->inject);

        if (irqfd->resampler) {
                irqfd_resampler_shutdown(irqfd);
                eventfd_ctx_put(irqfd->resamplefd);
        }

        /*
         * It is now safe to release the object's resources
         */
#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
        irq_bypass_unregister_consumer(&irqfd->consumer);
#endif
        eventfd_ctx_put(irqfd->eventfd);
        kfree(irqfd);
}


/* assumes kvm->irqfds.lock is held */
static bool
irqfd_is_active(struct kvm_kernel_irqfd *irqfd)
{
        return list_empty(&irqfd->list) ? false : true;
}

/*
 * Mark the irqfd as inactive and schedule it for removal
 *
 * assumes kvm->irqfds.lock is held
 */
static void
irqfd_deactivate(struct kvm_kernel_irqfd *irqfd)
{
        BUG_ON(!irqfd_is_active(irqfd));

        list_del_init(&irqfd->list);

        queue_work(irqfd_cleanup_wq, &irqfd->shutdown);
}

int __attribute__((weak)) kvm_arch_set_irq_inatomic(
                                struct kvm_kernel_irq_routing_entry *irq,
                                struct kvm *kvm, int irq_source_id,
                                int level,
                                bool line_status)
{
        return -EWOULDBLOCK;
}

/*
 * Called with wqh->lock held and interrupts disabled
 */
static int
irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
        struct kvm_kernel_irqfd *irqfd =
                container_of(wait, struct kvm_kernel_irqfd, wait);
        __poll_t flags = key_to_poll(key);
        struct kvm_kernel_irq_routing_entry irq;
        struct kvm *kvm = irqfd->kvm;
        unsigned seq;
        int idx;

        if (flags & EPOLLIN) {
                idx = srcu_read_lock(&kvm->irq_srcu);
                do {
                        seq = read_seqcount_begin(&irqfd->irq_entry_sc);
                        irq = irqfd->irq_entry;
                } while (read_seqcount_retry(&irqfd->irq_entry_sc, seq));
                /* An event has been signaled, inject an interrupt */
                if (kvm_arch_set_irq_inatomic(&irq, kvm,
                                              KVM_USERSPACE_IRQ_SOURCE_ID, 1,
                                              false) == -EWOULDBLOCK)
                        schedule_work(&irqfd->inject);
                srcu_read_unlock(&kvm->irq_srcu, idx);
        }

        if (flags & EPOLLHUP) {
                /* The eventfd is closing, detach from KVM */
                unsigned long iflags;

                spin_lock_irqsave(&kvm->irqfds.lock, iflags);

                /*
                 * We must check if someone deactivated the irqfd before
                 * we could acquire the irqfds.lock since the item is
                 * deactivated from the KVM side before it is unhooked from
                 * the wait-queue.  If it is already deactivated, we can
                 * simply return knowing the other side will cleanup for us.
                 * We cannot race against the irqfd going away since the
                 * other side is required to acquire wqh->lock, which we hold
                 */
                if (irqfd_is_active(irqfd))
                        irqfd_deactivate(irqfd);

                spin_unlock_irqrestore(&kvm->irqfds.lock, iflags);
        }

        return 0;
}

static void
irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
                        poll_table *pt)
{
        struct kvm_kernel_irqfd *irqfd =
                container_of(pt, struct kvm_kernel_irqfd, pt);
        add_wait_queue(wqh, &irqfd->wait);
}

/* Must be called under irqfds.lock */
static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd)
{
        struct kvm_kernel_irq_routing_entry *e;
        struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
        int n_entries;

        n_entries = kvm_irq_map_gsi(kvm, entries, irqfd->gsi);

        write_seqcount_begin(&irqfd->irq_entry_sc);

        e = entries;
        if (n_entries == 1)
                irqfd->irq_entry = *e;
        else
                irqfd->irq_entry.type = 0;

        write_seqcount_end(&irqfd->irq_entry_sc);
}
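
/*
 * Note on irq_entry_sc: irqfd_update() publishes the cached routing entry
 * inside a seqcount writer section so that irqfd_wakeup(), which runs as a
 * wait-queue callback in atomic context, can retry-read a consistent copy
 * without taking irqfds.lock.
 */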

#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
void __attribute__((weak)) kvm_arch_irq_bypass_stop(
                                struct irq_bypass_consumer *cons)
{
}

void __attribute__((weak)) kvm_arch_irq_bypass_start(
                                struct irq_bypass_consumer *cons)
{
}

int  __attribute__((weak)) kvm_arch_update_irqfd_routing(
                                struct kvm *kvm, unsigned int host_irq,
                                uint32_t guest_irq, bool set)
{
        return 0;
}
#endif

static int
kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
{
        struct kvm_kernel_irqfd *irqfd, *tmp;
        struct fd f;
        struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
        int ret;
        __poll_t events;
        int idx;

        if (!kvm_arch_intc_initialized(kvm))
                return -EAGAIN;

        if (!kvm_arch_irqfd_allowed(kvm, args))
                return -EINVAL;

        irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL_ACCOUNT);
        if (!irqfd)
                return -ENOMEM;

        irqfd->kvm = kvm;
        irqfd->gsi = args->gsi;
        INIT_LIST_HEAD(&irqfd->list);
        INIT_WORK(&irqfd->inject, irqfd_inject);
        INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
        seqcount_init(&irqfd->irq_entry_sc);

        f = fdget(args->fd);
        if (!f.file) {
                ret = -EBADF;
                goto out;
        }

        eventfd = eventfd_ctx_fileget(f.file);
        if (IS_ERR(eventfd)) {
                ret = PTR_ERR(eventfd);
                goto fail;
        }

        irqfd->eventfd = eventfd;

        if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) {
                struct kvm_kernel_irqfd_resampler *resampler;

                resamplefd = eventfd_ctx_fdget(args->resamplefd);
                if (IS_ERR(resamplefd)) {
                        ret = PTR_ERR(resamplefd);
                        goto fail;
                }

                irqfd->resamplefd = resamplefd;
                INIT_LIST_HEAD(&irqfd->resampler_link);

                mutex_lock(&kvm->irqfds.resampler_lock);

                list_for_each_entry(resampler,
                                    &kvm->irqfds.resampler_list, link) {
                        if (resampler->notifier.gsi == irqfd->gsi) {
                                irqfd->resampler = resampler;
                                break;
                        }
                }

                if (!irqfd->resampler) {
                        resampler = kzalloc(sizeof(*resampler),
                                            GFP_KERNEL_ACCOUNT);
                        if (!resampler) {
                                ret = -ENOMEM;
                                mutex_unlock(&kvm->irqfds.resampler_lock);
                                goto fail;
                        }

                        resampler->kvm = kvm;
                        INIT_LIST_HEAD(&resampler->list);
                        resampler->notifier.gsi = irqfd->gsi;
                        resampler->notifier.irq_acked = irqfd_resampler_ack;
                        INIT_LIST_HEAD(&resampler->link);

                        list_add(&resampler->link, &kvm->irqfds.resampler_list);
                        kvm_register_irq_ack_notifier(kvm,
                                                      &resampler->notifier);
                        irqfd->resampler = resampler;
                }

                list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list);
                synchronize_srcu(&kvm->irq_srcu);

                mutex_unlock(&kvm->irqfds.resampler_lock);
        }

        /*
         * Install our own custom wake-up handling so we are notified via
         * a callback whenever someone signals the underlying eventfd
         */
        init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
        init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc);

        spin_lock_irq(&kvm->irqfds.lock);

        ret = 0;
        list_for_each_entry(tmp, &kvm->irqfds.items, list) {
                if (irqfd->eventfd != tmp->eventfd)
                        continue;
                /* This fd is used for another irq already. */
                ret = -EBUSY;
                spin_unlock_irq(&kvm->irqfds.lock);
                goto fail;
        }

        idx = srcu_read_lock(&kvm->irq_srcu);
        irqfd_update(kvm, irqfd);

        list_add_tail(&irqfd->list, &kvm->irqfds.items);

        spin_unlock_irq(&kvm->irqfds.lock);

        /*
         * Check if there was an event already pending on the eventfd
         * before we registered, and trigger it as if we didn't miss it.
         */
        events = vfs_poll(f.file, &irqfd->pt);

        if (events & EPOLLIN)
                schedule_work(&irqfd->inject);

#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
        if (kvm_arch_has_irq_bypass()) {
                irqfd->consumer.token = (void *)irqfd->eventfd;
                irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer;
                irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer;
                irqfd->consumer.stop = kvm_arch_irq_bypass_stop;
                irqfd->consumer.start = kvm_arch_irq_bypass_start;
                ret = irq_bypass_register_consumer(&irqfd->consumer);
                if (ret)
                        pr_info("irq bypass consumer (token %p) registration fails: %d\n",
                                irqfd->consumer.token, ret);
        }
#endif

        srcu_read_unlock(&kvm->irq_srcu, idx);

        /*
         * do not drop the file until the irqfd is fully initialized, otherwise
         * we might race against the EPOLLHUP
         */
        fdput(f);
        return 0;

fail:
        if (irqfd->resampler)
                irqfd_resampler_shutdown(irqfd);

        if (resamplefd && !IS_ERR(resamplefd))
                eventfd_ctx_put(resamplefd);

        if (eventfd && !IS_ERR(eventfd))
                eventfd_ctx_put(eventfd);

        fdput(f);

out:
        kfree(irqfd);
        return ret;
}

bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin)
{
        struct kvm_irq_ack_notifier *kian;
        int gsi, idx;

        idx = srcu_read_lock(&kvm->irq_srcu);
        gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
        if (gsi != -1)
                hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
                                         link)
                        if (kian->gsi == gsi) {
                                srcu_read_unlock(&kvm->irq_srcu, idx);
                                return true;
                        }

        srcu_read_unlock(&kvm->irq_srcu, idx);

        return false;
}
EXPORT_SYMBOL_GPL(kvm_irq_has_notifier);

void kvm_notify_acked_gsi(struct kvm *kvm, int gsi)
{
        struct kvm_irq_ack_notifier *kian;

        hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
                                 link)
                if (kian->gsi == gsi)
                        kian->irq_acked(kian);
}

void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
{
        int gsi, idx;

        trace_kvm_ack_irq(irqchip, pin);

        idx = srcu_read_lock(&kvm->irq_srcu);
        gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
        if (gsi != -1)
                kvm_notify_acked_gsi(kvm, gsi);
        srcu_read_unlock(&kvm->irq_srcu, idx);
}

void kvm_register_irq_ack_notifier(struct kvm *kvm,
                                   struct kvm_irq_ack_notifier *kian)
{
        mutex_lock(&kvm->irq_lock);
        hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list);
        mutex_unlock(&kvm->irq_lock);
        kvm_arch_post_irq_ack_notifier_list_update(kvm);
}

void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
                                    struct kvm_irq_ack_notifier *kian)
{
        mutex_lock(&kvm->irq_lock);
        hlist_del_init_rcu(&kian->link);
        mutex_unlock(&kvm->irq_lock);
        synchronize_srcu(&kvm->irq_srcu);
        kvm_arch_post_irq_ack_notifier_list_update(kvm);
}
#endif

void
kvm_eventfd_init(struct kvm *kvm)
{
#ifdef CONFIG_HAVE_KVM_IRQFD
        spin_lock_init(&kvm->irqfds.lock);
        INIT_LIST_HEAD(&kvm->irqfds.items);
        INIT_LIST_HEAD(&kvm->irqfds.resampler_list);
        mutex_init(&kvm->irqfds.resampler_lock);
#endif
        INIT_LIST_HEAD(&kvm->ioeventfds);
}

#ifdef CONFIG_HAVE_KVM_IRQFD
/*
 * Shut down any irqfds that match fd+gsi.
 */
static int
kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
{
        struct kvm_kernel_irqfd *irqfd, *tmp;
        struct eventfd_ctx *eventfd;

        eventfd = eventfd_ctx_fdget(args->fd);
        if (IS_ERR(eventfd))
                return PTR_ERR(eventfd);

        spin_lock_irq(&kvm->irqfds.lock);

        list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) {
                if (irqfd->eventfd == eventfd && irqfd->gsi == args->gsi) {
                        /*
                         * This clearing of irq_entry.type is needed for when
                         * another thread calls kvm_irq_routing_update before
                         * we flush workqueue below (we synchronize with
                         * kvm_irq_routing_update using irqfds.lock).
                         */
                        write_seqcount_begin(&irqfd->irq_entry_sc);
                        irqfd->irq_entry.type = 0;
                        write_seqcount_end(&irqfd->irq_entry_sc);
                        irqfd_deactivate(irqfd);
                }
        }

        spin_unlock_irq(&kvm->irqfds.lock);
        eventfd_ctx_put(eventfd);

        /*
         * Block until we know all outstanding shutdown jobs have completed
         * so that we guarantee there will not be any more interrupts on this
         * gsi once this deassign function returns.
         */
        flush_workqueue(irqfd_cleanup_wq);

        return 0;
}

int
kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
{
        if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE))
                return -EINVAL;

        if (args->flags & KVM_IRQFD_FLAG_DEASSIGN)
                return kvm_irqfd_deassign(kvm, args);

        return kvm_irqfd_assign(kvm, args);
}
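
/*
 * Example userspace usage (sketch, not part of this file): the KVM_IRQFD
 * ioctl on a VM fd is what reaches kvm_irqfd() above.  Roughly, with an
 * eventfd efd and GSI 5:
 *
 *     struct kvm_irqfd irqfd = { .fd = efd, .gsi = 5 };
 *     ioctl(vm_fd, KVM_IRQFD, &irqfd);                 /- assign -/
 *     irqfd.flags = KVM_IRQFD_FLAG_DEASSIGN;
 *     ioctl(vm_fd, KVM_IRQFD, &irqfd);                 /- deassign -/
 */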

/*
 * This function is called as the kvm VM fd is being released. Shut down all
 * irqfds that still remain open.
 */
void
kvm_irqfd_release(struct kvm *kvm)
{
        struct kvm_kernel_irqfd *irqfd, *tmp;

        spin_lock_irq(&kvm->irqfds.lock);

        list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list)
                irqfd_deactivate(irqfd);

        spin_unlock_irq(&kvm->irqfds.lock);

        /*
         * Block until we know all outstanding shutdown jobs have completed
         * since we do not take a kvm* reference.
         */
        flush_workqueue(irqfd_cleanup_wq);

}

/*
 * Take note of a change in irq routing.
 * Caller must invoke synchronize_srcu(&kvm->irq_srcu) afterwards.
 */
void kvm_irq_routing_update(struct kvm *kvm)
{
        struct kvm_kernel_irqfd *irqfd;

        spin_lock_irq(&kvm->irqfds.lock);

        list_for_each_entry(irqfd, &kvm->irqfds.items, list) {
                irqfd_update(kvm, irqfd);

#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
                if (irqfd->producer) {
                        int ret = kvm_arch_update_irqfd_routing(
                                        irqfd->kvm, irqfd->producer->irq,
                                        irqfd->gsi, 1);
                        WARN_ON(ret);
                }
#endif
        }

        spin_unlock_irq(&kvm->irqfds.lock);
}

/*
 * create a host-wide workqueue for issuing deferred shutdown requests
 * aggregated from all vm* instances. We need our own isolated
 * queue to ease flushing work items when a VM exits.
 */
int kvm_irqfd_init(void)
{
        irqfd_cleanup_wq = alloc_workqueue("kvm-irqfd-cleanup", 0, 0);
        if (!irqfd_cleanup_wq)
                return -ENOMEM;

        return 0;
}

void kvm_irqfd_exit(void)
{
        destroy_workqueue(irqfd_cleanup_wq);
}
#endif

/*
 * --------------------------------------------------------------------
 * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal.
 *
 * userspace can register a PIO/MMIO address with an eventfd for receiving
 * notification when the memory has been touched.
 * --------------------------------------------------------------------
 */

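/*
 * A registered ioeventfd: addr/length/bus_idx identify the MMIO or PIO
 * range being watched; when wildcard is false, only writes whose value
 * equals datamatch signal the eventfd.
 */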
struct _ioeventfd {
        struct list_head     list;
        u64                  addr;
        int                  length;
        struct eventfd_ctx  *eventfd;
        u64                  datamatch;
        struct kvm_io_device dev;
        u8                   bus_idx;
        bool                 wildcard;
};

static inline struct _ioeventfd *
to_ioeventfd(struct kvm_io_device *dev)
{
        return container_of(dev, struct _ioeventfd, dev);
}

static void
ioeventfd_release(struct _ioeventfd *p)
{
        eventfd_ctx_put(p->eventfd);
        list_del(&p->list);
        kfree(p);
}

static bool
ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
{
        u64 _val;

        if (addr != p->addr)
                /* address must be precise for a hit */
                return false;

        if (!p->length)
                /* length = 0 means only look at the address, so always a hit */
                return true;

        if (len != p->length)
                /* address-range must be precise for a hit */
                return false;

        if (p->wildcard)
                /* all else equal, wildcard is always a hit */
                return true;

        /* otherwise, we have to actually compare the data */

        BUG_ON(!IS_ALIGNED((unsigned long)val, len));

        switch (len) {
        case 1:
                _val = *(u8 *)val;
                break;
        case 2:
                _val = *(u16 *)val;
                break;
        case 4:
                _val = *(u32 *)val;
                break;
        case 8:
                _val = *(u64 *)val;
                break;
        default:
                return false;
        }

        return _val == p->datamatch ? true : false;
}

/* MMIO/PIO writes trigger an event if the addr/val match */
static int
ioeventfd_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
                int len, const void *val)
{
        struct _ioeventfd *p = to_ioeventfd(this);

        if (!ioeventfd_in_range(p, addr, len, val))
                return -EOPNOTSUPP;

        eventfd_signal(p->eventfd, 1);
        return 0;
}
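
/*
 * Returning -EOPNOTSUPP above tells the kvm_io_bus_write() caller that this
 * device did not claim the access, so the remaining devices on the bus are
 * still tried (and a complete miss is ultimately handled by the usual MMIO
 * exit to userspace).
 */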

/*
 * This function is called as KVM is completely shutting down.  We do not
 * need to worry about locking; just nuke anything we have as quickly as
 * possible.
 */
static void
ioeventfd_destructor(struct kvm_io_device *this)
{
        struct _ioeventfd *p = to_ioeventfd(this);

        ioeventfd_release(p);
}

static const struct kvm_io_device_ops ioeventfd_ops = {
        .write      = ioeventfd_write,
        .destructor = ioeventfd_destructor,
};

/* assumes kvm->slots_lock held */
static bool
ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
{
        struct _ioeventfd *_p;

        list_for_each_entry(_p, &kvm->ioeventfds, list)
                if (_p->bus_idx == p->bus_idx &&
                    _p->addr == p->addr &&
                    (!_p->length || !p->length ||
                     (_p->length == p->length &&
                      (_p->wildcard || p->wildcard ||
                       _p->datamatch == p->datamatch))))
                        return true;

        return false;
}

static enum kvm_bus ioeventfd_bus_from_flags(__u32 flags)
{
        if (flags & KVM_IOEVENTFD_FLAG_PIO)
                return KVM_PIO_BUS;
        if (flags & KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY)
                return KVM_VIRTIO_CCW_NOTIFY_BUS;
        return KVM_MMIO_BUS;
}

static int kvm_assign_ioeventfd_idx(struct kvm *kvm,
                                enum kvm_bus bus_idx,
                                struct kvm_ioeventfd *args)
{

        struct eventfd_ctx *eventfd;
        struct _ioeventfd *p;
        int ret;

        eventfd = eventfd_ctx_fdget(args->fd);
        if (IS_ERR(eventfd))
                return PTR_ERR(eventfd);

        p = kzalloc(sizeof(*p), GFP_KERNEL_ACCOUNT);
        if (!p) {
                ret = -ENOMEM;
                goto fail;
        }

        INIT_LIST_HEAD(&p->list);
        p->addr    = args->addr;
        p->bus_idx = bus_idx;
        p->length  = args->len;
        p->eventfd = eventfd;

        /* The datamatch feature is optional, otherwise this is a wildcard */
        if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH)
                p->datamatch = args->datamatch;
        else
                p->wildcard = true;

        mutex_lock(&kvm->slots_lock);

        /* Verify that there isn't a match already */
        if (ioeventfd_check_collision(kvm, p)) {
                ret = -EEXIST;
                goto unlock_fail;
        }

        kvm_iodevice_init(&p->dev, &ioeventfd_ops);

        ret = kvm_io_bus_register_dev(kvm, bus_idx, p->addr, p->length,
                                      &p->dev);
        if (ret < 0)
                goto unlock_fail;

        kvm_get_bus(kvm, bus_idx)->ioeventfd_count++;
        list_add_tail(&p->list, &kvm->ioeventfds);

        mutex_unlock(&kvm->slots_lock);

        return 0;

unlock_fail:
        mutex_unlock(&kvm->slots_lock);

fail:
        kfree(p);
        eventfd_ctx_put(eventfd);

        return ret;
}

static int
kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx,
                           struct kvm_ioeventfd *args)
{
        struct _ioeventfd        *p, *tmp;
        struct eventfd_ctx       *eventfd;
        struct kvm_io_bus        *bus;
        int                       ret = -ENOENT;

        eventfd = eventfd_ctx_fdget(args->fd);
        if (IS_ERR(eventfd))
                return PTR_ERR(eventfd);

        mutex_lock(&kvm->slots_lock);

        list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) {
                bool wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH);

                if (p->bus_idx != bus_idx ||
                    p->eventfd != eventfd  ||
                    p->addr != args->addr  ||
                    p->length != args->len ||
                    p->wildcard != wildcard)
                        continue;

                if (!p->wildcard && p->datamatch != args->datamatch)
                        continue;

                kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);
                bus = kvm_get_bus(kvm, bus_idx);
                if (bus)
                        bus->ioeventfd_count--;
                ioeventfd_release(p);
                ret = 0;
                break;
        }

        mutex_unlock(&kvm->slots_lock);

        eventfd_ctx_put(eventfd);

        return ret;
}

static int kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
        enum kvm_bus bus_idx = ioeventfd_bus_from_flags(args->flags);
        int ret = kvm_deassign_ioeventfd_idx(kvm, bus_idx, args);

        if (!args->len && bus_idx == KVM_MMIO_BUS)
                kvm_deassign_ioeventfd_idx(kvm, KVM_FAST_MMIO_BUS, args);

        return ret;
}

static int
kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
        enum kvm_bus              bus_idx;
        int ret;

        bus_idx = ioeventfd_bus_from_flags(args->flags);
        /* must be natural-word sized, or 0 to ignore length */
        switch (args->len) {
        case 0:
        case 1:
        case 2:
        case 4:
        case 8:
                break;
        default:
                return -EINVAL;
        }

        /* check for range overflow */
        if (args->addr + args->len < args->addr)
                return -EINVAL;

        /* check for extra flags that we don't understand */
        if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK)
                return -EINVAL;

        /* ioeventfd with no length can't be combined with DATAMATCH */
        if (!args->len && (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH))
                return -EINVAL;

        ret = kvm_assign_ioeventfd_idx(kvm, bus_idx, args);
        if (ret)
                goto fail;

        /* When length is ignored, MMIO is also put on a separate bus, for
         * faster lookups.
         */
        if (!args->len && bus_idx == KVM_MMIO_BUS) {
                ret = kvm_assign_ioeventfd_idx(kvm, KVM_FAST_MMIO_BUS, args);
                if (ret < 0)
                        goto fast_fail;
        }

        return 0;

fast_fail:
        kvm_deassign_ioeventfd_idx(kvm, bus_idx, args);
fail:
        return ret;
}

int
kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
        if (args->flags & KVM_IOEVENTFD_FLAG_DEASSIGN)
                return kvm_deassign_ioeventfd(kvm, args);

        return kvm_assign_ioeventfd(kvm, args);
}
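
/*
 * Example userspace usage (sketch, not part of this file): the KVM_IOEVENTFD
 * ioctl on a VM fd is what reaches kvm_ioeventfd() above.  Roughly, to get an
 * eventfd signal on a 4-byte write of 1 to an MMIO doorbell at addr:
 *
 *     struct kvm_ioeventfd io = {
 *             .addr      = addr,
 *             .len       = 4,
 *             .fd        = efd,
 *             .datamatch = 1,
 *             .flags     = KVM_IOEVENTFD_FLAG_DATAMATCH,
 *     };
 *     ioctl(vm_fd, KVM_IOEVENTFD, &io);
 */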