linux/arch/x86/kvm/vmx/posted_intr.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-only
   2#include <linux/kvm_host.h>
   3
   4#include <asm/irq_remapping.h>
   5#include <asm/cpu.h>
   6
   7#include "lapic.h"
   8#include "posted_intr.h"
   9#include "trace.h"
  10#include "vmx.h"
  11
  12/*
  13 * We maintain a per-CPU linked-list of vCPU, so in wakeup_handler() we
  14 * can find which vCPU should be waken up.
  15 */
  16static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
  17static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
  18
  19static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
  20{
  21        return &(to_vmx(vcpu)->pi_desc);
  22}
  23
  24void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
  25{
  26        struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
  27        struct pi_desc old, new;
  28        unsigned int dest;
  29
  30        /*
  31         * In case of hot-plug or hot-unplug, we may have to undo
  32         * vmx_vcpu_pi_put even if there is no assigned device.  And we
  33         * always keep PI.NDST up to date for simplicity: it makes the
  34         * code easier, and CPU migration is not a fast path.
  35         */
  36        if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
  37                return;
  38
  39        /*
  40         * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
  41         * PI.NDST: pi_post_block is the one expected to change PID.NDST and the
  42         * wakeup handler expects the vCPU to be on the blocked_vcpu_list that
  43         * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up
  44         * correctly.
  45         */
  46        if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) {
  47                pi_clear_sn(pi_desc);
  48                goto after_clear_sn;
  49        }
  50
  51        /* The full case.  */
  52        do {
  53                old.control = new.control = pi_desc->control;
  54
  55                dest = cpu_physical_id(cpu);
  56
  57                if (x2apic_mode)
  58                        new.ndst = dest;
  59                else
  60                        new.ndst = (dest << 8) & 0xFF00;
  61
  62                new.sn = 0;
  63        } while (cmpxchg64(&pi_desc->control, old.control,
  64                           new.control) != old.control);
  65
  66after_clear_sn:
  67
  68        /*
  69         * Clear SN before reading the bitmap.  The VT-d firmware
  70         * writes the bitmap and reads SN atomically (5.2.3 in the
  71         * spec), so it doesn't really have a memory barrier that
  72         * pairs with this, but we cannot do that and we need one.
  73         */
  74        smp_mb__after_atomic();
  75
  76        if (!pi_is_pir_empty(pi_desc))
  77                pi_set_on(pi_desc);
  78}
  79
  80void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
  81{
  82        struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
  83
  84        if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
  85                !irq_remapping_cap(IRQ_POSTING_CAP)  ||
  86                !kvm_vcpu_apicv_active(vcpu))
  87                return;
  88
  89        /* Set SN when the vCPU is preempted */
  90        if (vcpu->preempted)
  91                pi_set_sn(pi_desc);
  92}
  93
  94static void __pi_post_block(struct kvm_vcpu *vcpu)
  95{
  96        struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
  97        struct pi_desc old, new;
  98        unsigned int dest;
  99
 100        do {
 101                old.control = new.control = pi_desc->control;
 102                WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
 103                     "Wakeup handler not enabled while the VCPU is blocked\n");
 104
 105                dest = cpu_physical_id(vcpu->cpu);
 106
 107                if (x2apic_mode)
 108                        new.ndst = dest;
 109                else
 110                        new.ndst = (dest << 8) & 0xFF00;
 111
 112                /* set 'NV' to 'notification vector' */
 113                new.nv = POSTED_INTR_VECTOR;
 114        } while (cmpxchg64(&pi_desc->control, old.control,
 115                           new.control) != old.control);
 116
 117        if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
 118                spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
 119                list_del(&vcpu->blocked_vcpu_list);
 120                spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
 121                vcpu->pre_pcpu = -1;
 122        }
 123}
 124
 125/*
 126 * This routine does the following things for vCPU which is going
 127 * to be blocked if VT-d PI is enabled.
 128 * - Store the vCPU to the wakeup list, so when interrupts happen
 129 *   we can find the right vCPU to wake up.
 130 * - Change the Posted-interrupt descriptor as below:
 131 *      'NDST' <-- vcpu->pre_pcpu
 132 *      'NV' <-- POSTED_INTR_WAKEUP_VECTOR
 133 * - If 'ON' is set during this process, which means at least one
 134 *   interrupt is posted for this vCPU, we cannot block it, in
 135 *   this case, return 1, otherwise, return 0.
 136 *
 137 */
 138int pi_pre_block(struct kvm_vcpu *vcpu)
 139{
 140        unsigned int dest;
 141        struct pi_desc old, new;
 142        struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
 143
 144        if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
 145                !irq_remapping_cap(IRQ_POSTING_CAP)  ||
 146                !kvm_vcpu_apicv_active(vcpu))
 147                return 0;
 148
 149        WARN_ON(irqs_disabled());
 150        local_irq_disable();
 151        if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
 152                vcpu->pre_pcpu = vcpu->cpu;
 153                spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
 154                list_add_tail(&vcpu->blocked_vcpu_list,
 155                              &per_cpu(blocked_vcpu_on_cpu,
 156                                       vcpu->pre_pcpu));
 157                spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
 158        }
 159
 160        do {
 161                old.control = new.control = pi_desc->control;
 162
 163                WARN((pi_desc->sn == 1),
 164                     "Warning: SN field of posted-interrupts "
 165                     "is set before blocking\n");
 166
 167                /*
 168                 * Since vCPU can be preempted during this process,
 169                 * vcpu->cpu could be different with pre_pcpu, we
 170                 * need to set pre_pcpu as the destination of wakeup
 171                 * notification event, then we can find the right vCPU
 172                 * to wakeup in wakeup handler if interrupts happen
 173                 * when the vCPU is in blocked state.
 174                 */
 175                dest = cpu_physical_id(vcpu->pre_pcpu);
 176
 177                if (x2apic_mode)
 178                        new.ndst = dest;
 179                else
 180                        new.ndst = (dest << 8) & 0xFF00;
 181
 182                /* set 'NV' to 'wakeup vector' */
 183                new.nv = POSTED_INTR_WAKEUP_VECTOR;
 184        } while (cmpxchg64(&pi_desc->control, old.control,
 185                           new.control) != old.control);
 186
 187        /* We should not block the vCPU if an interrupt is posted for it.  */
 188        if (pi_test_on(pi_desc) == 1)
 189                __pi_post_block(vcpu);
 190
 191        local_irq_enable();
 192        return (vcpu->pre_pcpu == -1);
 193}
 194
 195void pi_post_block(struct kvm_vcpu *vcpu)
 196{
 197        if (vcpu->pre_pcpu == -1)
 198                return;
 199
 200        WARN_ON(irqs_disabled());
 201        local_irq_disable();
 202        __pi_post_block(vcpu);
 203        local_irq_enable();
 204}
 205
 206/*
 207 * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
 208 */
 209void pi_wakeup_handler(void)
 210{
 211        struct kvm_vcpu *vcpu;
 212        int cpu = smp_processor_id();
 213
 214        spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
 215        list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
 216                        blocked_vcpu_list) {
 217                struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
 218
 219                if (pi_test_on(pi_desc) == 1)
 220                        kvm_vcpu_kick(vcpu);
 221        }
 222        spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
 223}
 224
 225void __init pi_init_cpu(int cpu)
 226{
 227        INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
 228        spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
 229}
 230
 231bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
 232{
 233        struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
 234
 235        return pi_test_on(pi_desc) ||
 236                (pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc));
 237}
 238
 239
 240/*
 241 * Bail out of the block loop if the VM has an assigned
 242 * device, but the blocking vCPU didn't reconfigure the
 243 * PI.NV to the wakeup vector, i.e. the assigned device
 244 * came along after the initial check in pi_pre_block().
 245 */
 246void vmx_pi_start_assignment(struct kvm *kvm)
 247{
 248        if (!irq_remapping_cap(IRQ_POSTING_CAP))
 249                return;
 250
 251        kvm_make_all_cpus_request(kvm, KVM_REQ_UNBLOCK);
 252}
 253
 254/*
 255 * pi_update_irte - set IRTE for Posted-Interrupts
 256 *
 257 * @kvm: kvm
 258 * @host_irq: host irq of the interrupt
 259 * @guest_irq: gsi of the interrupt
 260 * @set: set or unset PI
 261 * returns 0 on success, < 0 on failure
 262 */
 263int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq,
 264                   bool set)
 265{
 266        struct kvm_kernel_irq_routing_entry *e;
 267        struct kvm_irq_routing_table *irq_rt;
 268        struct kvm_lapic_irq irq;
 269        struct kvm_vcpu *vcpu;
 270        struct vcpu_data vcpu_info;
 271        int idx, ret = 0;
 272
 273        if (!kvm_arch_has_assigned_device(kvm) ||
 274            !irq_remapping_cap(IRQ_POSTING_CAP) ||
 275            !kvm_vcpu_apicv_active(kvm->vcpus[0]))
 276                return 0;
 277
 278        idx = srcu_read_lock(&kvm->irq_srcu);
 279        irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
 280        if (guest_irq >= irq_rt->nr_rt_entries ||
 281            hlist_empty(&irq_rt->map[guest_irq])) {
 282                pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
 283                             guest_irq, irq_rt->nr_rt_entries);
 284                goto out;
 285        }
 286
 287        hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
 288                if (e->type != KVM_IRQ_ROUTING_MSI)
 289                        continue;
 290                /*
 291                 * VT-d PI cannot support posting multicast/broadcast
 292                 * interrupts to a vCPU, we still use interrupt remapping
 293                 * for these kind of interrupts.
 294                 *
 295                 * For lowest-priority interrupts, we only support
 296                 * those with single CPU as the destination, e.g. user
 297                 * configures the interrupts via /proc/irq or uses
 298                 * irqbalance to make the interrupts single-CPU.
 299                 *
 300                 * We will support full lowest-priority interrupt later.
 301                 *
 302                 * In addition, we can only inject generic interrupts using
 303                 * the PI mechanism, refuse to route others through it.
 304                 */
 305
 306                kvm_set_msi_irq(kvm, e, &irq);
 307                if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
 308                    !kvm_irq_is_postable(&irq)) {
 309                        /*
 310                         * Make sure the IRTE is in remapped mode if
 311                         * we don't handle it in posted mode.
 312                         */
 313                        ret = irq_set_vcpu_affinity(host_irq, NULL);
 314                        if (ret < 0) {
 315                                printk(KERN_INFO
 316                                   "failed to back to remapped mode, irq: %u\n",
 317                                   host_irq);
 318                                goto out;
 319                        }
 320
 321                        continue;
 322                }
 323
 324                vcpu_info.pi_desc_addr = __pa(&to_vmx(vcpu)->pi_desc);
 325                vcpu_info.vector = irq.vector;
 326
 327                trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
 328                                vcpu_info.vector, vcpu_info.pi_desc_addr, set);
 329
 330                if (set)
 331                        ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
 332                else
 333                        ret = irq_set_vcpu_affinity(host_irq, NULL);
 334
 335                if (ret < 0) {
 336                        printk(KERN_INFO "%s: failed to update PI IRTE\n",
 337                                        __func__);
 338                        goto out;
 339                }
 340        }
 341
 342        ret = 0;
 343out:
 344        srcu_read_unlock(&kvm->irq_srcu, idx);
 345        return ret;
 346}
 347