linux/arch/powerpc/kvm/book3s_xive.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Copyright 2017 Benjamin Herrenschmidt, IBM Corporation.
   4 */
   5
   6#define pr_fmt(fmt) "xive-kvm: " fmt
   7
   8#include <linux/kernel.h>
   9#include <linux/kvm_host.h>
  10#include <linux/err.h>
  11#include <linux/gfp.h>
  12#include <linux/spinlock.h>
  13#include <linux/delay.h>
  14#include <linux/percpu.h>
  15#include <linux/cpumask.h>
  16#include <linux/uaccess.h>
  17#include <linux/irqdomain.h>
  18#include <asm/kvm_book3s.h>
  19#include <asm/kvm_ppc.h>
  20#include <asm/hvcall.h>
  21#include <asm/xics.h>
  22#include <asm/xive.h>
  23#include <asm/xive-regs.h>
  24#include <asm/debug.h>
  25#include <asm/time.h>
  26#include <asm/opal.h>
  27
  28#include <linux/debugfs.h>
  29#include <linux/seq_file.h>
  30
  31#include "book3s_xive.h"
  32
  33#define __x_eoi_page(xd)        ((void __iomem *)((xd)->eoi_mmio))
  34#define __x_trig_page(xd)       ((void __iomem *)((xd)->trig_mmio))
  35
  36/* Dummy interrupt used when taking interrupts out of a queue in H_CPPR */
  37#define XICS_DUMMY      1
  38
  39static void xive_vm_ack_pending(struct kvmppc_xive_vcpu *xc)
  40{
  41        u8 cppr;
  42        u16 ack;
  43
  44        /*
  45         * Ensure any previous store to CPPR is ordered vs.
  46         * the subsequent loads from PIPR or ACK.
  47         */
  48        eieio();
  49
  50        /* Perform the acknowledge OS to register cycle. */
  51        ack = be16_to_cpu(__raw_readw(xive_tima + TM_SPC_ACK_OS_REG));
  52
  53        /* Synchronize subsequent queue accesses */
  54        mb();
  55
  56        /* XXX Check grouping level */
  57
  58        /* Anything ? */
  59        if (!((ack >> 8) & TM_QW1_NSR_EO))
  60                return;
  61
  62        /* Grab CPPR of the most favored pending interrupt */
  63        cppr = ack & 0xff;
  64        if (cppr < 8)
  65                xc->pending |= 1 << cppr;
  66
  67        /* Check consistency */
  68        if (cppr >= xc->hw_cppr)
  69                pr_warn("KVM-XIVE: CPU %d odd ack CPPR, got %d at %d\n",
  70                        smp_processor_id(), cppr, xc->hw_cppr);
  71
  72        /*
  73         * Update our image of the HW CPPR. We don't yet modify
  74         * xc->cppr, this will be done as we scan for interrupts
  75         * in the queues.
  76         */
  77        xc->hw_cppr = cppr;
  78}
  79
  80static u8 xive_vm_esb_load(struct xive_irq_data *xd, u32 offset)
  81{
  82        u64 val;
  83
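            /*
             * With StoreEOI, use the load-after-store ordering offset so
             * this PQ=10 load is ordered after a preceding store EOI.
             */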
  84        if (offset == XIVE_ESB_SET_PQ_10 && xd->flags & XIVE_IRQ_FLAG_STORE_EOI)
  85                offset |= XIVE_ESB_LD_ST_MO;
  86
  87        val = __raw_readq(__x_eoi_page(xd) + offset);
  88#ifdef __LITTLE_ENDIAN__
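            /* The raw (non-byte-swapped) load leaves the ESB state in the top byte */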
  89        val >>= 64-8;
  90#endif
  91        return (u8)val;
  92}
  93
  94
  95static void xive_vm_source_eoi(u32 hw_irq, struct xive_irq_data *xd)
  96{
   97        /* If the XIVE supports the new "store EOI" facility, use it */
  98        if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI)
  99                __raw_writeq(0, __x_eoi_page(xd) + XIVE_ESB_STORE_EOI);
 100        else if (xd->flags & XIVE_IRQ_FLAG_LSI) {
 101                /*
 102                 * For LSIs the HW EOI cycle is used rather than PQ bits,
 103                 * as they are automatically re-triggered in HW when still
 104                 * pending.
 105                 */
 106                __raw_readq(__x_eoi_page(xd) + XIVE_ESB_LOAD_EOI);
 107        } else {
 108                uint64_t eoi_val;
 109
 110                /*
 111                 * Otherwise for EOI, we use the special MMIO that does
 112                 * a clear of both P and Q and returns the old Q,
 113                 * except for LSIs where we use the "EOI cycle" special
 114                 * load.
 115                 *
 116                 * This allows us to then do a re-trigger if Q was set
 117                 * rather than synthesizing an interrupt in software
 118                 */
 119                eoi_val = xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_00);
 120
 121                /* Re-trigger if needed */
 122                if ((eoi_val & 1) && __x_trig_page(xd))
 123                        __raw_writeq(0, __x_trig_page(xd));
 124        }
 125}
 126
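    /*
     * Scan types for xive_vm_scan_interrupts():
     *  scan_fetch: fetch the interrupt and update the queue pointers (H_XIRR)
     *  scan_poll:  only peek, don't update any state (H_IPOLL)
     *  scan_eoi:   re-evaluate the pending bits, no CPPR adjustment (H_EOI)
     */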
 127enum {
 128        scan_fetch,
 129        scan_poll,
 130        scan_eoi,
 131};
 132
 133static u32 xive_vm_scan_interrupts(struct kvmppc_xive_vcpu *xc,
 134                                       u8 pending, int scan_type)
 135{
 136        u32 hirq = 0;
 137        u8 prio = 0xff;
 138
 139        /* Find highest pending priority */
 140        while ((xc->mfrr != 0xff || pending != 0) && hirq == 0) {
 141                struct xive_q *q;
 142                u32 idx, toggle;
 143                __be32 *qpage;
 144
 145                /*
 146                 * If pending is 0 this will return 0xff which is what
 147                 * we want
 148                 */
 149                prio = ffs(pending) - 1;
 150
 151                /* Don't scan past the guest cppr */
 152                if (prio >= xc->cppr || prio > 7) {
 153                        if (xc->mfrr < xc->cppr) {
 154                                prio = xc->mfrr;
 155                                hirq = XICS_IPI;
 156                        }
 157                        break;
 158                }
 159
 160                /* Grab queue and pointers */
 161                q = &xc->queues[prio];
 162                idx = q->idx;
 163                toggle = q->toggle;
 164
 165                /*
 166                 * Snapshot the queue page. The test further down for EOI
 167                 * must use the same "copy" that was used by __xive_read_eq
 168                 * since qpage can be set concurrently and we don't want
 169                 * to miss an EOI.
 170                 */
 171                qpage = READ_ONCE(q->qpage);
 172
 173skip_ipi:
 174                /*
 175                 * Try to fetch from the queue. Will return 0 for a
 176                 * non-queueing priority (ie, qpage = 0).
 177                 */
 178                hirq = __xive_read_eq(qpage, q->msk, &idx, &toggle);
 179
 180                /*
 181                 * If this was a signal for an MFRR change done by
 182                 * H_IPI we skip it. Additionally, if we were fetching,
 183                 * we EOI it now, thus re-enabling reception of a new
 184                 * such signal.
 185                 *
 186                 * We also need to do that if prio is 0 and we had no
 187                 * page for the queue. In this case, we have non-queued
 188                 * page for the queue. In this case, we have a non-queued
 189                 *
 190                 * This is safe because if we have another pending MFRR
 191                 * change that wasn't observed above, the Q bit will have
 192                 * been set and another occurrence of the IPI will trigger.
 193                 */
 194                if (hirq == XICS_IPI || (prio == 0 && !qpage)) {
 195                        if (scan_type == scan_fetch) {
 196                                xive_vm_source_eoi(xc->vp_ipi,
 197                                                       &xc->vp_ipi_data);
 198                                q->idx = idx;
 199                                q->toggle = toggle;
 200                        }
 201                        /* Loop back on same queue with updated idx/toggle */
 202                        WARN_ON(hirq && hirq != XICS_IPI);
 203                        if (hirq)
 204                                goto skip_ipi;
 205                }
 206
 207                /* If it's the dummy interrupt, continue searching */
 208                if (hirq == XICS_DUMMY)
 209                        goto skip_ipi;
 210
 211                /* Clear the pending bit if the queue is now empty */
 212                if (!hirq) {
 213                        pending &= ~(1 << prio);
 214
 215                        /*
 216                         * Check if the queue count needs adjusting due to
 217                         * interrupts being moved away.
 218                         */
 219                        if (atomic_read(&q->pending_count)) {
 220                                int p = atomic_xchg(&q->pending_count, 0);
 221
 222                                if (p) {
 223                                        WARN_ON(p > atomic_read(&q->count));
 224                                        atomic_sub(p, &q->count);
 225                                }
 226                        }
 227                }
 228
 229                /*
 230                 * If the most favored prio we found pending is less
 231                 * favored than (or equal to) a pending IPI, we return
 232                 * the IPI instead.
 233                 */
 234                if (prio >= xc->mfrr && xc->mfrr < xc->cppr) {
 235                        prio = xc->mfrr;
 236                        hirq = XICS_IPI;
 237                        break;
 238                }
 239
 240                /* If fetching, update queue pointers */
 241                if (scan_type == scan_fetch) {
 242                        q->idx = idx;
 243                        q->toggle = toggle;
 244                }
 245        }
 246
 247        /* If we are just taking a "peek", do nothing else */
 248        if (scan_type == scan_poll)
 249                return hirq;
 250
 251        /* Update the pending bits */
 252        xc->pending = pending;
 253
 254        /*
 255         * If this is an EOI that's it, no CPPR adjustment done here,
 256         * all we needed was to clean up the stale pending bits and check
 257         * if there's anything left.
 258         */
 259        if (scan_type == scan_eoi)
 260                return hirq;
 261
 262        /*
 263         * If we found an interrupt, adjust what the guest CPPR should
 264         * be as if we had just fetched that interrupt from HW.
 265         *
 266         * Note: This can only make xc->cppr smaller as the previous
 267         * loop will only exit with hirq != 0 if prio is lower than
 268         * the current xc->cppr. Thus we don't need to re-check xc->mfrr
 269         * for pending IPIs.
 270         */
 271        if (hirq)
 272                xc->cppr = prio;
 273        /*
 274         * If it was an IPI the HW CPPR might have been lowered too much
 275         * as the HW interrupt we use for IPIs is routed to priority 0.
 276         *
 277         * We re-sync it here.
 278         */
 279        if (xc->cppr != xc->hw_cppr) {
 280                xc->hw_cppr = xc->cppr;
 281                __raw_writeb(xc->cppr, xive_tima + TM_QW1_OS + TM_CPPR);
 282        }
 283
 284        return hirq;
 285}
 286
 287static unsigned long xive_vm_h_xirr(struct kvm_vcpu *vcpu)
 288{
 289        struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
 290        u8 old_cppr;
 291        u32 hirq;
 292
 293        pr_devel("H_XIRR\n");
 294
 295        xc->stat_vm_h_xirr++;
 296
 297        /* First collect pending bits from HW */
 298        xive_vm_ack_pending(xc);
 299
 300        pr_devel(" new pending=0x%02x hw_cppr=%d cppr=%d\n",
 301                 xc->pending, xc->hw_cppr, xc->cppr);
 302
 303        /* Grab previous CPPR and reverse map it */
 304        old_cppr = xive_prio_to_guest(xc->cppr);
 305
 306        /* Scan for actual interrupts */
 307        hirq = xive_vm_scan_interrupts(xc, xc->pending, scan_fetch);
 308
 309        pr_devel(" got hirq=0x%x hw_cppr=%d cppr=%d\n",
 310                 hirq, xc->hw_cppr, xc->cppr);
 311
 312        /* That should never hit */
 313        if (hirq & 0xff000000)
 314                pr_warn("XIVE: Weird guest interrupt number 0x%08x\n", hirq);
 315
 316        /*
 317         * XXX We could check if the interrupt is masked here and
 318         * filter it. If we chose to do so, we would need to do:
 319         *
 320         *    if (masked) {
 321         *        lock();
 322         *        if (masked) {
 323         *            old_Q = true;
 324         *            hirq = 0;
 325         *        }
 326         *        unlock();
 327         *    }
 328         */
 329
 330        /* Return interrupt and old CPPR in GPR4 */
 331        vcpu->arch.regs.gpr[4] = hirq | (old_cppr << 24);
 332
 333        return H_SUCCESS;
 334}
 335
 336static unsigned long xive_vm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server)
 337{
 338        struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
 339        u8 pending = xc->pending;
 340        u32 hirq;
 341
 342        pr_devel("H_IPOLL(server=%ld)\n", server);
 343
 344        xc->stat_vm_h_ipoll++;
 345
 346        /* Grab the target VCPU if not the current one */
 347        if (xc->server_num != server) {
 348                vcpu = kvmppc_xive_find_server(vcpu->kvm, server);
 349                if (!vcpu)
 350                        return H_PARAMETER;
 351                xc = vcpu->arch.xive_vcpu;
 352
 353                /* Scan all priorities */
 354                pending = 0xff;
 355        } else {
 356                /* Grab pending interrupt if any */
 357                __be64 qw1 = __raw_readq(xive_tima + TM_QW1_OS);
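                    /* PIPR is the last byte (TM_PIPR) of TM_QW1_OS */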
 358                u8 pipr = be64_to_cpu(qw1) & 0xff;
 359
 360                if (pipr < 8)
 361                        pending |= 1 << pipr;
 362        }
 363
 364        hirq = xive_vm_scan_interrupts(xc, pending, scan_poll);
 365
 366        /* Return interrupt and old CPPR in GPR4 */
 367        vcpu->arch.regs.gpr[4] = hirq | (xc->cppr << 24);
 368
 369        return H_SUCCESS;
 370}
 371
 372static void xive_vm_push_pending_to_hw(struct kvmppc_xive_vcpu *xc)
 373{
 374        u8 pending, prio;
 375
 376        pending = xc->pending;
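            /*
             * Fold a pending IPI (mfrr) into the bitmap; an mfrr above 7
             * is treated as the least favored priority.
             */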
 377        if (xc->mfrr != 0xff) {
 378                if (xc->mfrr < 8)
 379                        pending |= 1 << xc->mfrr;
 380                else
 381                        pending |= 0x80;
 382        }
 383        if (!pending)
 384                return;
 385        prio = ffs(pending) - 1;
 386
 387        __raw_writeb(prio, xive_tima + TM_SPC_SET_OS_PENDING);
 388}
 389
 390static void xive_vm_scan_for_rerouted_irqs(struct kvmppc_xive *xive,
 391                                               struct kvmppc_xive_vcpu *xc)
 392{
 393        unsigned int prio;
 394
 395        /* For each priority that is now masked */
 396        for (prio = xc->cppr; prio < KVMPPC_XIVE_Q_COUNT; prio++) {
 397                struct xive_q *q = &xc->queues[prio];
 398                struct kvmppc_xive_irq_state *state;
 399                struct kvmppc_xive_src_block *sb;
 400                u32 idx, toggle, entry, irq, hw_num;
 401                struct xive_irq_data *xd;
 402                __be32 *qpage;
 403                u16 src;
 404
 405                idx = q->idx;
 406                toggle = q->toggle;
 407                qpage = READ_ONCE(q->qpage);
 408                if (!qpage)
 409                        continue;
 410
 411                /* For each interrupt in the queue */
 412                for (;;) {
 413                        entry = be32_to_cpup(qpage + idx);
 414
 415                        /* No more ? */
 416                        if ((entry >> 31) == toggle)
 417                                break;
 418                        irq = entry & 0x7fffffff;
 419
 420                        /* Skip dummies and IPIs */
 421                        if (irq == XICS_DUMMY || irq == XICS_IPI)
 422                                goto next;
 423                        sb = kvmppc_xive_find_source(xive, irq, &src);
 424                        if (!sb)
 425                                goto next;
 426                        state = &sb->irq_state[src];
 427
 428                        /* Has it been rerouted ? */
 429                        if (xc->server_num == state->act_server)
 430                                goto next;
 431
 432                        /*
 433                         * Alright, it *has* been re-routed, kill it from
 434                         * the queue.
 435                         */
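                            /* Keep the generation bit (bit 31), replace the IRQ number with the dummy */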
 436                        qpage[idx] = cpu_to_be32((entry & 0x80000000) | XICS_DUMMY);
 437
 438                        /* Find the HW interrupt */
 439                        kvmppc_xive_select_irq(state, &hw_num, &xd);
 440
 441                        /* If it's not an LSI, set PQ to 11 so the EOI will force a resend */
 442                        if (!(xd->flags & XIVE_IRQ_FLAG_LSI))
 443                                xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_11);
 444
 445                        /* EOI the source */
 446                        xive_vm_source_eoi(hw_num, xd);
 447
 448next:
 449                        idx = (idx + 1) & q->msk;
 450                        if (idx == 0)
 451                                toggle ^= 1;
 452                }
 453        }
 454}
 455
 456static int xive_vm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
 457{
 458        struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
 459        struct kvmppc_xive *xive = vcpu->kvm->arch.xive;
 460        u8 old_cppr;
 461
 462        pr_devel("H_CPPR(cppr=%ld)\n", cppr);
 463
 464        xc->stat_vm_h_cppr++;
 465
 466        /* Map CPPR */
 467        cppr = xive_prio_from_guest(cppr);
 468
 469        /* Remember old and update SW state */
 470        old_cppr = xc->cppr;
 471        xc->cppr = cppr;
 472
 473        /*
 474         * Order the above update of xc->cppr with the subsequent
 475         * read of xc->mfrr inside push_pending_to_hw()
 476         */
 477        smp_mb();
 478
 479        if (cppr > old_cppr) {
 480                /*
 481                 * We are masking less, so we need to look for pending things
 482                 * to deliver and set VP pending bits accordingly to trigger
 483                 * a new interrupt, otherwise we might miss MFRR changes for
 484                 * which we have optimized out sending an IPI signal.
 485                 */
 486                xive_vm_push_pending_to_hw(xc);
 487        } else {
 488                /*
 489                 * We are masking more, so we need to check the queue for any
 490                 * interrupt that has been routed to another CPU, take
 491                 * it out (replace it with the dummy) and retrigger it.
 492                 *
 493                 * This is necessary since those interrupts may otherwise
 494                 * never be processed, at least not until this CPU restores
 495                 * its CPPR.
 496                 *
 497                 * This is in theory racy vs. HW adding new interrupts to
 498                 * the queue. In practice this works because the interesting
 499                 * cases are when the guest has done a set_xive() to move the
 500                 * interrupt away, which flushes the xive, followed by the
 501                 * target CPU doing a H_CPPR. So any new interrupt coming into
 502                 * the queue must still be routed to us and isn't a source
 503                 * of concern.
 504                 */
 505                xive_vm_scan_for_rerouted_irqs(xive, xc);
 506        }
 507
 508        /* Apply new CPPR */
 509        xc->hw_cppr = cppr;
 510        __raw_writeb(cppr, xive_tima + TM_QW1_OS + TM_CPPR);
 511
 512        return H_SUCCESS;
 513}
 514
 515static int xive_vm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
 516{
 517        struct kvmppc_xive *xive = vcpu->kvm->arch.xive;
 518        struct kvmppc_xive_src_block *sb;
 519        struct kvmppc_xive_irq_state *state;
 520        struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
 521        struct xive_irq_data *xd;
 522        u8 new_cppr = xirr >> 24;
 523        u32 irq = xirr & 0x00ffffff, hw_num;
 524        u16 src;
 525        int rc = 0;
 526
 527        pr_devel("H_EOI(xirr=%08lx)\n", xirr);
 528
 529        xc->stat_vm_h_eoi++;
 530
 531        xc->cppr = xive_prio_from_guest(new_cppr);
 532
 533        /*
 534         * IPIs are synthesized from MFRR and thus don't need
 535         * any special EOI handling. The underlying interrupt
 536         * used to signal MFRR changes is EOId when fetched from
 537         * the queue.
 538         */
 539        if (irq == XICS_IPI || irq == 0) {
 540                /*
 541                 * This barrier orders the setting of xc->cppr vs.
 542                 * subsequent test of xc->mfrr done inside
 543                 * scan_interrupts and push_pending_to_hw
 544                 */
 545                smp_mb();
 546                goto bail;
 547        }
 548
 549        /* Find interrupt source */
 550        sb = kvmppc_xive_find_source(xive, irq, &src);
 551        if (!sb) {
 552                pr_devel(" source not found !\n");
 553                rc = H_PARAMETER;
 554                /* Same as above */
 555                smp_mb();
 556                goto bail;
 557        }
 558        state = &sb->irq_state[src];
 559        kvmppc_xive_select_irq(state, &hw_num, &xd);
 560
 561        state->in_eoi = true;
 562
 563        /*
 564         * This barrier orders both the setting of in_eoi above vs.
 565         * the subsequent test of guest_priority, and the setting
 566         * of xc->cppr vs. the subsequent test of xc->mfrr done inside
 567         * scan_interrupts and push_pending_to_hw
 568         */
 569        smp_mb();
 570
 571again:
 572        if (state->guest_priority == MASKED) {
 573                arch_spin_lock(&sb->lock);
 574                if (state->guest_priority != MASKED) {
 575                        arch_spin_unlock(&sb->lock);
 576                        goto again;
 577                }
 578                pr_devel(" EOI on saved P...\n");
 579
 580                /* Clear old_p, that will cause unmask to perform an EOI */
 581                state->old_p = false;
 582
 583                arch_spin_unlock(&sb->lock);
 584        } else {
 585                pr_devel(" EOI on source...\n");
 586
 587                /* Perform EOI on the source */
 588                xive_vm_source_eoi(hw_num, xd);
 589
 590                /* If it's an emulated LSI, check level and resend */
 591                if (state->lsi && state->asserted)
 592                        __raw_writeq(0, __x_trig_page(xd));
 593
 594        }
 595
 596        /*
 597         * This barrier orders the above guest_priority check
 598         * and spin_lock/unlock with clearing in_eoi below.
 599         *
 600         * It also has to be a full mb() as it must ensure
 601         * the MMIOs done in source_eoi() are completed before
 602         * state->in_eoi is visible.
 603         */
 604        mb();
 605        state->in_eoi = false;
 606bail:
 607
 608        /* Re-evaluate pending IRQs and update HW */
 609        xive_vm_scan_interrupts(xc, xc->pending, scan_eoi);
 610        xive_vm_push_pending_to_hw(xc);
 611        pr_devel(" after scan pending=%02x\n", xc->pending);
 612
 613        /* Apply new CPPR */
 614        xc->hw_cppr = xc->cppr;
 615        __raw_writeb(xc->cppr, xive_tima + TM_QW1_OS + TM_CPPR);
 616
 617        return rc;
 618}
 619
 620static int xive_vm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
 621                               unsigned long mfrr)
 622{
 623        struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
 624
 625        pr_devel("H_IPI(server=%08lx,mfrr=%ld)\n", server, mfrr);
 626
 627        xc->stat_vm_h_ipi++;
 628
 629        /* Find target */
 630        vcpu = kvmppc_xive_find_server(vcpu->kvm, server);
 631        if (!vcpu)
 632                return H_PARAMETER;
 633        xc = vcpu->arch.xive_vcpu;
 634
 635        /* Locklessly write over MFRR */
 636        xc->mfrr = mfrr;
 637
 638        /*
 639         * The load of xc->cppr below and the subsequent MMIO store
 640         * to the IPI must happen after the above mfrr update is
 641         * globally visible so that:
 642         *
 643         * - We synchronize with another CPU doing an H_EOI or a H_CPPR
 644         *   updating xc->cppr then reading xc->mfrr.
 645         *
 646         * - The target of the IPI sees the xc->mfrr update
 647         */
 648        mb();
 649
 650        /* Shoot the IPI if more favored than the target CPPR */
 651        if (mfrr < xc->cppr)
 652                __raw_writeq(0, __x_trig_page(&xc->vp_ipi_data));
 653
 654        return H_SUCCESS;
 655}
 656
 657/*
 658 * We leave a gap of a couple of interrupts in the queue to
 659 * account for the IPI and an additional safety guard.
 660 */
 661#define XIVE_Q_GAP      2
 662
 663static bool kvmppc_xive_vcpu_has_save_restore(struct kvm_vcpu *vcpu)
 664{
 665        struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
 666
 667        /* Check enablement at VP level */
 668        return xc->vp_cam & TM_QW1W2_HO;
 669}
 670
 671bool kvmppc_xive_check_save_restore(struct kvm_vcpu *vcpu)
 672{
 673        struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
 674        struct kvmppc_xive *xive = xc->xive;
 675
 676        if (xive->flags & KVMPPC_XIVE_FLAG_SAVE_RESTORE)
 677                return kvmppc_xive_vcpu_has_save_restore(vcpu);
 678
 679        return true;
 680}
 681
 682/*
 683 * Push a vcpu's context to the XIVE on guest entry.
 684 * This assumes we are in virtual mode (MMU on)
 685 */
 686void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu)
 687{
 688        void __iomem *tima = local_paca->kvm_hstate.xive_tima_virt;
 689        u64 pq;
 690
 691        /*
 692         * Nothing to do if the platform doesn't have a XIVE
 693         * or this vCPU doesn't have its own XIVE context
 694         * (e.g. because it's not using an in-kernel interrupt controller).
 695         */
 696        if (!tima || !vcpu->arch.xive_cam_word)
 697                return;
 698
 699        eieio();
 700        if (!kvmppc_xive_vcpu_has_save_restore(vcpu))
 701                __raw_writeq(vcpu->arch.xive_saved_state.w01, tima + TM_QW1_OS);
 702        __raw_writel(vcpu->arch.xive_cam_word, tima + TM_QW1_OS + TM_WORD2);
 703        vcpu->arch.xive_pushed = 1;
 704        eieio();
 705
 706        /*
 707         * We clear the irq_pending flag. There is a small chance of a
 708         * race vs. the escalation interrupt happening on another
 709         * processor setting it again, but the only consequence is to
 710         * cause a spurious wakeup on the next H_CEDE, which is not an
 711         * issue.
 712         */
 713        vcpu->arch.irq_pending = 0;
 714
 715        /*
 716         * In single escalation mode, if the escalation interrupt is
 717         * on, we mask it.
 718         */
 719        if (vcpu->arch.xive_esc_on) {
 720                pq = __raw_readq((void __iomem *)(vcpu->arch.xive_esc_vaddr +
 721                                                  XIVE_ESB_SET_PQ_01));
 722                mb();
 723
 724                /*
 725                 * We have a possible subtle race here: The escalation
 726                 * interrupt might have fired and be on its way to the
 727                 * host queue while we mask it, and if we unmask it
 728                 * early enough (re-cede right away), there is a
 729                 * theoretical possibility that it fires again, thus
 730                 * landing in the target queue more than once which is
 731                 * a big no-no.
 732                 *
 733                 * Fortunately, solving this is rather easy. If the
 734                 * above load setting PQ to 01 returns a previous
 735                 * value where P is set, then we know the escalation
 736                 * interrupt is somewhere on its way to the host. In
 737                 * that case we simply don't clear the xive_esc_on
 738                 * flag below. It will be eventually cleared by the
 739                 * handler for the escalation interrupt.
 740                 *
 741                 * Then, when doing a cede, we check that flag again
 742                 * before re-enabling the escalation interrupt, and if
 743                 * set, we abort the cede.
 744                 */
 745                if (!(pq & XIVE_ESB_VAL_P))
 746                        /* Now P is 0, we can clear the flag */
 747                        vcpu->arch.xive_esc_on = 0;
 748        }
 749}
 750EXPORT_SYMBOL_GPL(kvmppc_xive_push_vcpu);
 751
 752/*
 753 * Pull a vcpu's context from the XIVE on guest exit.
 754 * This assumes we are in virtual mode (MMU on)
 755 */
 756void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu)
 757{
 758        void __iomem *tima = local_paca->kvm_hstate.xive_tima_virt;
 759
 760        if (!vcpu->arch.xive_pushed)
 761                return;
 762
 763        /*
 764         * Should not have been pushed if there is no tima
 765         */
 766        if (WARN_ON(!tima))
 767                return;
 768
 769        eieio();
 770        /* First load to pull the context, we ignore the value */
 771        __raw_readl(tima + TM_SPC_PULL_OS_CTX);
 772        /* Second load to recover the context state (Words 0 and 1) */
 773        if (!kvmppc_xive_vcpu_has_save_restore(vcpu))
 774                vcpu->arch.xive_saved_state.w01 = __raw_readq(tima + TM_QW1_OS);
 775
 776        /* Fixup some of the state for the next load */
 777        vcpu->arch.xive_saved_state.lsmfb = 0;
 778        vcpu->arch.xive_saved_state.ack = 0xff;
 779        vcpu->arch.xive_pushed = 0;
 780        eieio();
 781}
 782EXPORT_SYMBOL_GPL(kvmppc_xive_pull_vcpu);
 783
 784bool kvmppc_xive_rearm_escalation(struct kvm_vcpu *vcpu)
 785{
 786        void __iomem *esc_vaddr = (void __iomem *)vcpu->arch.xive_esc_vaddr;
 787        bool ret = true;
 788
 789        if (!esc_vaddr)
 790                return ret;
 791
 792        /* we are using XIVE with single escalation */
 793
 794        if (vcpu->arch.xive_esc_on) {
 795                /*
 796                 * If we still have a pending escalation, abort the cede,
 797                 * and we must set PQ to 10 rather than 00 so that we don't
 798                 * potentially end up with two entries for the escalation
 799                 * interrupt in the XIVE interrupt queue.  In that case
 800                 * we also don't want to set xive_esc_on to 1 here in
 801                 * case we race with xive_esc_irq().
 802                 */
 803                ret = false;
 804                /*
 805                 * The escalation interrupts are special as we don't EOI them.
 806                 * There is no need to use the load-after-store ordering offset
 807                 * to set PQ to 10 as we won't use StoreEOI.
 808                 */
 809                __raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_10);
 810        } else {
 811                vcpu->arch.xive_esc_on = true;
 812                mb();
 813                __raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_00);
 814        }
 815        mb();
 816
 817        return ret;
 818}
 819EXPORT_SYMBOL_GPL(kvmppc_xive_rearm_escalation);
 820
 821/*
 822 * This is a simple trigger for a generic XIVE IRQ. This must
 823 * only be called for interrupts that support a trigger page
 824 */
 825static bool xive_irq_trigger(struct xive_irq_data *xd)
 826{
 827        /* This should be only for MSIs */
 828        if (WARN_ON(xd->flags & XIVE_IRQ_FLAG_LSI))
 829                return false;
 830
 831        /* Those interrupts should always have a trigger page */
 832        if (WARN_ON(!xd->trig_mmio))
 833                return false;
 834
 835        out_be64(xd->trig_mmio, 0);
 836
 837        return true;
 838}
 839
 840static irqreturn_t xive_esc_irq(int irq, void *data)
 841{
 842        struct kvm_vcpu *vcpu = data;
 843
 844        vcpu->arch.irq_pending = 1;
 845        smp_mb();
 846        if (vcpu->arch.ceded || vcpu->arch.nested)
 847                kvmppc_fast_vcpu_kick(vcpu);
 848
 849        /* Since we have the no-EOI flag, the interrupt is effectively
 850         * disabled now. Clearing xive_esc_on means we won't bother
 851         * doing so on the next entry.
 852         *
 853         * This also allows the entry code to know that if a PQ combination
 854         * of 10 is observed while xive_esc_on is true, it means the queue
 855         * contains an unprocessed escalation interrupt. We don't make use of
 856         * that knowledge today but might (see comment in book3s_hv_rmhandlers.S)
 857         */
 858        vcpu->arch.xive_esc_on = false;
 859
 860        /* This orders xive_esc_on = false vs. subsequent stale_p = true */
 861        smp_wmb();      /* goes with smp_mb() in cleanup_single_escalation */
 862
 863        return IRQ_HANDLED;
 864}
 865
 866int kvmppc_xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio,
 867                                  bool single_escalation)
 868{
 869        struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
 870        struct xive_q *q = &xc->queues[prio];
 871        char *name = NULL;
 872        int rc;
 873
 874        /* Already there ? */
 875        if (xc->esc_virq[prio])
 876                return 0;
 877
 878        /* Hook up the escalation interrupt */
 879        xc->esc_virq[prio] = irq_create_mapping(NULL, q->esc_irq);
 880        if (!xc->esc_virq[prio]) {
 881                pr_err("Failed to map escalation interrupt for queue %d of VCPU %d\n",
 882                       prio, xc->server_num);
 883                return -EIO;
 884        }
 885
 886        if (single_escalation)
 887                name = kasprintf(GFP_KERNEL, "kvm-%d-%d",
 888                                 vcpu->kvm->arch.lpid, xc->server_num);
 889        else
 890                name = kasprintf(GFP_KERNEL, "kvm-%d-%d-%d",
 891                                 vcpu->kvm->arch.lpid, xc->server_num, prio);
 892        if (!name) {
 893                pr_err("Failed to allocate escalation irq name for queue %d of VCPU %d\n",
 894                       prio, xc->server_num);
 895                rc = -ENOMEM;
 896                goto error;
 897        }
 898
 899        pr_devel("Escalation %s irq %d (prio %d)\n", name, xc->esc_virq[prio], prio);
 900
 901        rc = request_irq(xc->esc_virq[prio], xive_esc_irq,
 902                         IRQF_NO_THREAD, name, vcpu);
 903        if (rc) {
 904                pr_err("Failed to request escalation interrupt for queue %d of VCPU %d\n",
 905                       prio, xc->server_num);
 906                goto error;
 907        }
 908        xc->esc_virq_names[prio] = name;
 909
 910        /* In single escalation mode, we grab the ESB MMIO of the
 911         * interrupt and mask it. Also populate the VCPU v/raddr
 912         * of the ESB page for use by asm entry/exit code. Finally
 913         * set the XIVE_IRQ_FLAG_NO_EOI flag which will prevent the
 914         * core code from performing an EOI on the escalation
 915         * interrupt, thus leaving it effectively masked after
 916         * it fires once.
 917         */
 918        if (single_escalation) {
 919                struct irq_data *d = irq_get_irq_data(xc->esc_virq[prio]);
 920                struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
 921
 922                xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_01);
 923                vcpu->arch.xive_esc_raddr = xd->eoi_page;
 924                vcpu->arch.xive_esc_vaddr = (__force u64)xd->eoi_mmio;
 925                xd->flags |= XIVE_IRQ_FLAG_NO_EOI;
 926        }
 927
 928        return 0;
 929error:
 930        irq_dispose_mapping(xc->esc_virq[prio]);
 931        xc->esc_virq[prio] = 0;
 932        kfree(name);
 933        return rc;
 934}
 935
 936static int xive_provision_queue(struct kvm_vcpu *vcpu, u8 prio)
 937{
 938        struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
 939        struct kvmppc_xive *xive = xc->xive;
 940        struct xive_q *q =  &xc->queues[prio];
 941        void *qpage;
 942        int rc;
 943
 944        if (WARN_ON(q->qpage))
 945                return 0;
 946
 947        /* Allocate the queue and retrieve info on the current node for now */
 948        qpage = (__be32 *)__get_free_pages(GFP_KERNEL, xive->q_page_order);
 949        if (!qpage) {
 950                pr_err("Failed to allocate queue %d for VCPU %d\n",
 951                       prio, xc->server_num);
 952                return -ENOMEM;
 953        }
 954        memset(qpage, 0, 1 << xive->q_order);
 955
 956        /*
 957         * Reconfigure the queue. This will set q->qpage only once the
 958         * queue is fully configured. This is a requirement for prio 0
 959         * as we will stop doing EOIs for every IPI as soon as we observe
 960         * qpage being non-NULL, and instead will only EOI when we receive
 961         * corresponding queue 0 entries
 962         */
 963        rc = xive_native_configure_queue(xc->vp_id, q, prio, qpage,
 964                                         xive->q_order, true);
 965        if (rc)
 966                pr_err("Failed to configure queue %d for VCPU %d\n",
 967                       prio, xc->server_num);
 968        return rc;
 969}
 970
 971/* Called with xive->lock held */
 972static int xive_check_provisioning(struct kvm *kvm, u8 prio)
 973{
 974        struct kvmppc_xive *xive = kvm->arch.xive;
 975        struct kvm_vcpu *vcpu;
 976        unsigned long i;
 977        int rc;
 978
 979        lockdep_assert_held(&xive->lock);
 980
 981        /* Already provisioned ? */
 982        if (xive->qmap & (1 << prio))
 983                return 0;
 984
 985        pr_devel("Provisioning prio... %d\n", prio);
 986
 987        /* Provision each VCPU and enable escalations if needed */
 988        kvm_for_each_vcpu(i, vcpu, kvm) {
 989                if (!vcpu->arch.xive_vcpu)
 990                        continue;
 991                rc = xive_provision_queue(vcpu, prio);
 992                if (rc == 0 && !kvmppc_xive_has_single_escalation(xive))
 993                        kvmppc_xive_attach_escalation(vcpu, prio,
 994                                                      kvmppc_xive_has_single_escalation(xive));
 995                if (rc)
 996                        return rc;
 997        }
 998
 999        /* Order previous stores and mark it as provisioned */
1000        mb();
1001        xive->qmap |= (1 << prio);
1002        return 0;
1003}
1004
1005static void xive_inc_q_pending(struct kvm *kvm, u32 server, u8 prio)
1006{
1007        struct kvm_vcpu *vcpu;
1008        struct kvmppc_xive_vcpu *xc;
1009        struct xive_q *q;
1010
1011        /* Locate target server */
1012        vcpu = kvmppc_xive_find_server(kvm, server);
1013        if (!vcpu) {
1014                pr_warn("%s: Can't find server %d\n", __func__, server);
1015                return;
1016        }
1017        xc = vcpu->arch.xive_vcpu;
1018        if (WARN_ON(!xc))
1019                return;
1020
1021        q = &xc->queues[prio];
1022        atomic_inc(&q->pending_count);
1023}
1024
1025static int xive_try_pick_queue(struct kvm_vcpu *vcpu, u8 prio)
1026{
1027        struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
1028        struct xive_q *q;
1029        u32 max;
1030
1031        if (WARN_ON(!xc))
1032                return -ENXIO;
1033        if (!xc->valid)
1034                return -ENXIO;
1035
1036        q = &xc->queues[prio];
1037        if (WARN_ON(!q->qpage))
1038                return -ENXIO;
1039
1040        /* Calculate max number of interrupts in that queue. */
1041        max = (q->msk + 1) - XIVE_Q_GAP;
1042        return atomic_add_unless(&q->count, 1, max) ? 0 : -EBUSY;
1043}
1044
1045int kvmppc_xive_select_target(struct kvm *kvm, u32 *server, u8 prio)
1046{
1047        struct kvm_vcpu *vcpu;
1048        unsigned long i;
1049        int rc;
1050
1051        /* Locate target server */
1052        vcpu = kvmppc_xive_find_server(kvm, *server);
1053        if (!vcpu) {
1054                pr_devel("Can't find server %d\n", *server);
1055                return -EINVAL;
1056        }
1057
1058        pr_devel("Finding irq target on 0x%x/%d...\n", *server, prio);
1059
1060        /* Try pick it */
1061        rc = xive_try_pick_queue(vcpu, prio);
1062        if (rc == 0)
1063                return rc;
1064
1065        pr_devel(" .. failed, looking up candidate...\n");
1066
1067        /* Failed, pick another VCPU */
1068        kvm_for_each_vcpu(i, vcpu, kvm) {
1069                if (!vcpu->arch.xive_vcpu)
1070                        continue;
1071                rc = xive_try_pick_queue(vcpu, prio);
1072                if (rc == 0) {
1073                        *server = vcpu->arch.xive_vcpu->server_num;
1074                        pr_devel("  found on 0x%x/%d\n", *server, prio);
1075                        return rc;
1076                }
1077        }
1078        pr_devel("  no available target !\n");
1079
1080        /* No available target ! */
1081        return -EBUSY;
1082}
1083
1084static u8 xive_lock_and_mask(struct kvmppc_xive *xive,
1085                             struct kvmppc_xive_src_block *sb,
1086                             struct kvmppc_xive_irq_state *state)
1087{
1088        struct xive_irq_data *xd;
1089        u32 hw_num;
1090        u8 old_prio;
1091        u64 val;
1092
1093        /*
1094         * Take the lock, set masked, try again if racing
1095         * with H_EOI
1096         */
1097        for (;;) {
1098                arch_spin_lock(&sb->lock);
1099                old_prio = state->guest_priority;
1100                state->guest_priority = MASKED;
1101                mb();
1102                if (!state->in_eoi)
1103                        break;
1104                state->guest_priority = old_prio;
1105                arch_spin_unlock(&sb->lock);
1106        }
1107
1108        /* No change ? Bail */
1109        if (old_prio == MASKED)
1110                return old_prio;
1111
1112        /* Get the right irq */
1113        kvmppc_xive_select_irq(state, &hw_num, &xd);
1114
1115        /* Set PQ to 10, which returns the old P and Q; remember them */
1116        val = xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_10);
1117        state->old_p = !!(val & 2);
1118        state->old_q = !!(val & 1);
1119
1120        /*
1121         * Synchronize hardware to ensure the queues are updated when
1122         * masking
1123         */
1124        xive_native_sync_source(hw_num);
1125
1126        return old_prio;
1127}
1128
1129static void xive_lock_for_unmask(struct kvmppc_xive_src_block *sb,
1130                                 struct kvmppc_xive_irq_state *state)
1131{
1132        /*
1133         * Take the lock, try again if racing with H_EOI
1134         */
1135        for (;;) {
1136                arch_spin_lock(&sb->lock);
1137                if (!state->in_eoi)
1138                        break;
1139                arch_spin_unlock(&sb->lock);
1140        }
1141}
1142
1143static void xive_finish_unmask(struct kvmppc_xive *xive,
1144                               struct kvmppc_xive_src_block *sb,
1145                               struct kvmppc_xive_irq_state *state,
1146                               u8 prio)
1147{
1148        struct xive_irq_data *xd;
1149        u32 hw_num;
1150
1151        /* If we aren't changing a thing, move on */
1152        if (state->guest_priority != MASKED)
1153                goto bail;
1154
1155        /* Get the right irq */
1156        kvmppc_xive_select_irq(state, &hw_num, &xd);
1157
1158        /* Old Q set, set PQ to 11 */
1159        if (state->old_q)
1160                xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_11);
1161
1162        /*
1163         * If not old P, then perform an "effective" EOI
1164         * on the source. This will handle the cases where
1165         * FW EOI is needed.
1166         */
1167        if (!state->old_p)
1168                xive_vm_source_eoi(hw_num, xd);
1169
1170        /* Synchronize ordering and mark unmasked */
1171        mb();
1172bail:
1173        state->guest_priority = prio;
1174}
1175
1176/*
1177 * Target an interrupt to a given server/prio; this will fall back
1178 * to another server if necessary and perform the HW targetting
1179 * updates as needed
1180 *
1181 * NOTE: Must be called with the state lock held
1182 */
1183static int xive_target_interrupt(struct kvm *kvm,
1184                                 struct kvmppc_xive_irq_state *state,
1185                                 u32 server, u8 prio)
1186{
1187        struct kvmppc_xive *xive = kvm->arch.xive;
1188        u32 hw_num;
1189        int rc;
1190
1191        /*
1192         * This will return a tentative server and actual
1193         * priority. The count for that new target will have
1194         * already been incremented.
1195         */
1196        rc = kvmppc_xive_select_target(kvm, &server, prio);
1197
1198        /*
1199         * We failed to find a target ? Not much we can do,
1200         * at least until we support the GIQ.
1201         */
1202        if (rc)
1203                return rc;
1204
1205        /*
1206         * Increment the old queue pending count if there
1207         * was one so that the old queue count gets adjusted later
1208         * when observed to be empty.
1209         */
1210        if (state->act_priority != MASKED)
1211                xive_inc_q_pending(kvm,
1212                                   state->act_server,
1213                                   state->act_priority);
1214        /*
1215         * Update state and HW
1216         */
1217        state->act_priority = prio;
1218        state->act_server = server;
1219
1220        /* Get the right irq */
1221        kvmppc_xive_select_irq(state, &hw_num, NULL);
1222
1223        return xive_native_configure_irq(hw_num,
1224                                         kvmppc_xive_vp(xive, server),
1225                                         prio, state->number);
1226}
1227
1228/*
1229 * Targetting rules: In order to avoid losing track of
1230 * pending interrupts across mask and unmask, which would
1231 * allow queue overflows, we implement the following rules:
1232 *
1233 *  - Unless it was never enabled (or we run out of capacity)
1234 *    an interrupt is always targetted at a valid server/queue
1235 *    pair even when "masked" by the guest. This pair tends to
1236 *    be the last one used but it can be changed under some
1237 *    circumstances. That allows us to separate targetting
1238 *    from masking: we only handle accounting during (re)targetting.
1239 *    This also allows us to let an interrupt drain into its target
1240 *    queue after masking, avoiding complex schemes to remove
1241 *    interrupts from remote processor queues.
1242 *
1243 *  - When masking, we set PQ to 10 and save the previous value
1244 *    of P and Q.
1245 *
1246 *  - When unmasking, if saved Q was set, we set PQ to 11
1247 *    otherwise we leave PQ to the HW state which will be either
1248 *    10 if nothing happened or 11 if the interrupt fired while
1249 *    masked. Effectively we are OR'ing the previous Q into the
1250 *    HW Q.
1251 *
1252 *    Then if saved P is clear, we do an effective EOI (Q->P->Trigger)
1253 *    which will unmask the interrupt and shoot a new one if Q was
1254 *    set.
1255 *
1256 *    Otherwise (saved P is set) we leave PQ unchanged (so 10 or 11,
1257 *    effectively meaning an H_EOI from the guest is still expected
1258 *    for that interrupt).
1259 *
1260 *  - If H_EOI occurs while masked, we clear the saved P.
1261 *
1262 *  - When changing target, we account on the new target and
1263 *    increment a separate "pending" counter on the old one.
1264 *    This pending counter will be used to decrement the old
1265 *    target's count when its queue has been observed empty.
1266 */
1267
1268int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server,
1269                         u32 priority)
1270{
1271        struct kvmppc_xive *xive = kvm->arch.xive;
1272        struct kvmppc_xive_src_block *sb;
1273        struct kvmppc_xive_irq_state *state;
1274        u8 new_act_prio;
1275        int rc = 0;
1276        u16 idx;
1277
1278        if (!xive)
1279                return -ENODEV;
1280
1281        pr_devel("set_xive ! irq 0x%x server 0x%x prio %d\n",
1282                 irq, server, priority);
1283
1284        /* First, check provisioning of queues */
1285        if (priority != MASKED) {
1286                mutex_lock(&xive->lock);
1287                rc = xive_check_provisioning(xive->kvm,
1288                              xive_prio_from_guest(priority));
1289                mutex_unlock(&xive->lock);
1290        }
1291        if (rc) {
1292                pr_devel("  provisioning failure %d !\n", rc);
1293                return rc;
1294        }
1295
1296        sb = kvmppc_xive_find_source(xive, irq, &idx);
1297        if (!sb)
1298                return -EINVAL;
1299        state = &sb->irq_state[idx];
1300
1301        /*
1302         * We first handle masking/unmasking since the locking
1303         * might need to be retried due to EOIs, we'll handle
1304         * targetting changes later. These functions will return
1305         * with the SB lock held.
1306         *
1307         * xive_lock_and_mask() will also set state->guest_priority
1308         * but won't otherwise change other fields of the state.
1309         *
1310         * xive_lock_for_unmask will not actually unmask, this will
1311         * be done later by xive_finish_unmask() once the targetting
1312         * has been done, so we don't try to unmask an interrupt
1313         * that hasn't yet been targetted.
1314         */
1315        if (priority == MASKED)
1316                xive_lock_and_mask(xive, sb, state);
1317        else
1318                xive_lock_for_unmask(sb, state);
1319
1320
1321        /*
1322         * Then we handle targetting.
1323         *
1324         * First calculate a new "actual priority"
1325         */
1326        new_act_prio = state->act_priority;
1327        if (priority != MASKED)
1328                new_act_prio = xive_prio_from_guest(priority);
1329
1330        pr_devel(" new_act_prio=%x act_server=%x act_prio=%x\n",
1331                 new_act_prio, state->act_server, state->act_priority);
1332
1333        /*
1334         * Then check if we actually need to change anything,
1335         * Then check if we actually need to change anything.
1336         * The condition for re-targetting the interrupt is that
1337         * we have a valid new priority (new_act_prio is not 0xff)
1338         * and either the server or the priority changed.
1339         *
1340         * Note: If act_priority was ff and the new priority is
1341         *       also ff, we don't do anything and leave the interrupt
1342         *       untargetted. An attempt to do an int_on on an
1343         *       untargetted interrupt will fail. If that is a problem
1344         *       we could initialize interrupts with a valid default
1345         */
1346
1347        if (new_act_prio != MASKED &&
1348            (state->act_server != server ||
1349             state->act_priority != new_act_prio))
1350                rc = xive_target_interrupt(kvm, state, server, new_act_prio);
1351
1352        /*
1353         * Perform the final unmasking of the interrupt source
1354         * if necessary
1355         */
1356        if (priority != MASKED)
1357                xive_finish_unmask(xive, sb, state, priority);
1358
1359        /*
1360         * Finally, update saved_priority to match. Only int_on/off
1361         * set this field to a different value.
1362         */
1363        state->saved_priority = priority;
1364
1365        arch_spin_unlock(&sb->lock);
1366        return rc;
1367}
1368
1369int kvmppc_xive_get_xive(struct kvm *kvm, u32 irq, u32 *server,
1370                         u32 *priority)
1371{
1372        struct kvmppc_xive *xive = kvm->arch.xive;
1373        struct kvmppc_xive_src_block *sb;
1374        struct kvmppc_xive_irq_state *state;
1375        u16 idx;
1376
1377        if (!xive)
1378                return -ENODEV;
1379
1380        sb = kvmppc_xive_find_source(xive, irq, &idx);
1381        if (!sb)
1382                return -EINVAL;
1383        state = &sb->irq_state[idx];
1384        arch_spin_lock(&sb->lock);
1385        *server = state->act_server;
1386        *priority = state->guest_priority;
1387        arch_spin_unlock(&sb->lock);
1388
1389        return 0;
1390}
1391
1392int kvmppc_xive_int_on(struct kvm *kvm, u32 irq)
1393{
1394        struct kvmppc_xive *xive = kvm->arch.xive;
1395        struct kvmppc_xive_src_block *sb;
1396        struct kvmppc_xive_irq_state *state;
1397        u16 idx;
1398
1399        if (!xive)
1400                return -ENODEV;
1401
1402        sb = kvmppc_xive_find_source(xive, irq, &idx);
1403        if (!sb)
1404                return -EINVAL;
1405        state = &sb->irq_state[idx];
1406
1407        pr_devel("int_on(irq=0x%x)\n", irq);
1408
1409        /*
1410         * Check if the interrupt was not targetted
1411         */
1412        if (state->act_priority == MASKED) {
1413                pr_devel("int_on on untargetted interrupt\n");
1414                return -EINVAL;
1415        }
1416
1417        /* If saved_priority is 0xff, do nothing */
1418        if (state->saved_priority == MASKED)
1419                return 0;
1420
1421        /*
1422         * Lock and unmask it.
1423         */
1424        xive_lock_for_unmask(sb, state);
1425        xive_finish_unmask(xive, sb, state, state->saved_priority);
1426        arch_spin_unlock(&sb->lock);
1427
1428        return 0;
1429}
1430
1431int kvmppc_xive_int_off(struct kvm *kvm, u32 irq)
1432{
1433        struct kvmppc_xive *xive = kvm->arch.xive;
1434        struct kvmppc_xive_src_block *sb;
1435        struct kvmppc_xive_irq_state *state;
1436        u16 idx;
1437
1438        if (!xive)
1439                return -ENODEV;
1440
1441        sb = kvmppc_xive_find_source(xive, irq, &idx);
1442        if (!sb)
1443                return -EINVAL;
1444        state = &sb->irq_state[idx];
1445
1446        pr_devel("int_off(irq=0x%x)\n", irq);
1447
1448        /*
1449         * Lock and mask
1450         */
1451        state->saved_priority = xive_lock_and_mask(xive, sb, state);
1452        arch_spin_unlock(&sb->lock);
1453
1454        return 0;
1455}
1456
1457static bool xive_restore_pending_irq(struct kvmppc_xive *xive, u32 irq)
1458{
1459        struct kvmppc_xive_src_block *sb;
1460        struct kvmppc_xive_irq_state *state;
1461        u16 idx;
1462
1463        sb = kvmppc_xive_find_source(xive, irq, &idx);
1464        if (!sb)
1465                return false;
1466        state = &sb->irq_state[idx];
1467        if (!state->valid)
1468                return false;
1469
1470        /*
1471         * Trigger the IPI. This assumes we never restore a pass-through
1472         * interrupt which should be safe enough
1473         */
1474        xive_irq_trigger(&state->ipi_data);
1475
1476        return true;
1477}
1478
1479u64 kvmppc_xive_get_icp(struct kvm_vcpu *vcpu)
1480{
1481        struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
1482
1483        if (!xc)
1484                return 0;
1485
1486        /* Return the per-cpu state for state saving/migration */
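            /*
             * Only CPPR and MFRR are captured here; XISR is 0 and
             * pending_pri is hardwired to 0xff, as pending interrupts
             * are kept in the XIVE queues rather than in the ICP image.
             */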
1487        return (u64)xc->cppr << KVM_REG_PPC_ICP_CPPR_SHIFT |
1488               (u64)xc->mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT |
1489               (u64)0xff << KVM_REG_PPC_ICP_PPRI_SHIFT;
1490}
1491
1492int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval)
1493{
1494        struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
1495        struct kvmppc_xive *xive = vcpu->kvm->arch.xive;
1496        u8 cppr, mfrr;
1497        u32 xisr;
1498
1499        if (!xc || !xive)
1500                return -ENOENT;
1501
1502        /* Grab individual state fields. We don't use pending_pri */
1503        cppr = icpval >> KVM_REG_PPC_ICP_CPPR_SHIFT;
1504        xisr = (icpval >> KVM_REG_PPC_ICP_XISR_SHIFT) &
1505                KVM_REG_PPC_ICP_XISR_MASK;
1506        mfrr = icpval >> KVM_REG_PPC_ICP_MFRR_SHIFT;
1507
1508        pr_devel("set_icp vcpu %d cppr=0x%x mfrr=0x%x xisr=0x%x\n",
1509                 xc->server_num, cppr, mfrr, xisr);
1510
1511        /*
1512         * We can't update the state of a "pushed" VCPU, but that
1513         * shouldn't happen because the vcpu->mutex makes running a
1514         * vcpu mutually exclusive with doing one_reg get/set on it.
1515         */
1516        if (WARN_ON(vcpu->arch.xive_pushed))
1517                return -EIO;
1518
1519        /* Update VCPU HW saved state */
1520        vcpu->arch.xive_saved_state.cppr = cppr;
1521        xc->hw_cppr = xc->cppr = cppr;
1522
1523        /*
1524         * Update MFRR state. If it's not 0xff, we mark the VCPU as
1525         * having a pending MFRR change, which will re-evaluate the
1526         * target. The VCPU will thus potentially get a spurious
1527         * interrupt but that's not a big deal.
1528         */
1529        xc->mfrr = mfrr;
1530        if (mfrr < cppr)
1531                xive_irq_trigger(&xc->vp_ipi_data);
1532
1533        /*
1534         * Now saved XIRR is "interesting". It means there's something in
1535         * the legacy "1 element" queue... for an IPI we simply ignore it,
1536         * as the MFRR restore will handle that. For anything else we need
1537         * to force a resend of the source.
1538         * However, the source may not have been set up yet. If that's the
1539         * case, we keep that info and increment a counter in the xive to
1540         * tell subsequent xive_set_source() to go look.
1541         */
1542        if (xisr > XICS_IPI && !xive_restore_pending_irq(xive, xisr)) {
1543                xc->delayed_irq = xisr;
1544                xive->delayed_irqs++;
1545                pr_devel("  xisr restore delayed\n");
1546        }
1547
1548        return 0;
1549}
1550
1551int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq,
1552                           unsigned long host_irq)
1553{
1554        struct kvmppc_xive *xive = kvm->arch.xive;
1555        struct kvmppc_xive_src_block *sb;
1556        struct kvmppc_xive_irq_state *state;
1557        struct irq_data *host_data =
1558                irq_domain_get_irq_data(irq_get_default_host(), host_irq);
1559        unsigned int hw_irq = (unsigned int)irqd_to_hwirq(host_data);
1560        u16 idx;
1561        u8 prio;
1562        int rc;
1563
1564        if (!xive)
1565                return -ENODEV;
1566
1567        pr_debug("%s: GIRQ 0x%lx host IRQ %ld XIVE HW IRQ 0x%x\n",
1568                 __func__, guest_irq, host_irq, hw_irq);
1569
1570        sb = kvmppc_xive_find_source(xive, guest_irq, &idx);
1571        if (!sb)
1572                return -EINVAL;
1573        state = &sb->irq_state[idx];
1574
1575        /*
1576         * Mark the passed-through interrupt as going to a VCPU;
1577         * this will prevent further EOIs and similar operations
1578         * from the XIVE code. It will also mask the interrupt
1579         * to either PQ=10 or 11 state, the latter if the interrupt
1580         * is pending. This will allow us to unmask or retrigger it
1581         * after routing it to the guest with a simple EOI.
1582         *
1583         * The "state" argument is a "token": all it needs to be is
1584         * non-NULL to switch to passed-through, or NULL for the
1585         * other way around. We may not yet have an actual VCPU
1586         * target here and we don't really care.
1587         */
1588        rc = irq_set_vcpu_affinity(host_irq, state);
1589        if (rc) {
1590                pr_err("Failed to set VCPU affinity for host IRQ %ld\n", host_irq);
1591                return rc;
1592        }
1593
1594        /*
1595         * Mask and read state of IPI. We need to know if its P bit
1596         * is set, as that means it's potentially already using a
1597         * queue entry in the target.
1598         */
1599        prio = xive_lock_and_mask(xive, sb, state);
1600        pr_devel(" old IPI prio %02x P:%d Q:%d\n", prio,
1601                 state->old_p, state->old_q);
1602
1603        /* Turn the IPI hard off */
1604        xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
1605
1606        /*
1607         * Reset ESB guest mapping. Needed when ESB pages are exposed
1608         * to the guest in XIVE native mode
1609         */
1610        if (xive->ops && xive->ops->reset_mapped)
1611                xive->ops->reset_mapped(kvm, guest_irq);
1612
1613        /* Grab info about irq */
1614        state->pt_number = hw_irq;
1615        state->pt_data = irq_data_get_irq_handler_data(host_data);
1616
1617        /*
1618         * Configure the IRQ to match the existing configuration of
1619         * the IPI if it was already targeted. Otherwise this will
1620         * mask the interrupt in a lossy way (act_priority is 0xff),
1621         * which is fine for a never-started interrupt.
1622         */
1623        xive_native_configure_irq(hw_irq,
1624                                  kvmppc_xive_vp(xive, state->act_server),
1625                                  state->act_priority, state->number);
1626
1627        /*
1628         * We do an EOI to enable the interrupt (and retrigger if needed)
1629         * if the guest has the interrupt unmasked and the P bit was *not*
1630         * set in the IPI. If it was set, we know a slot may still be in
1631         * use in the target queue, thus we have to wait for a
1632         * guest-originated EOI.
1633         */
1634        if (prio != MASKED && !state->old_p)
1635                xive_vm_source_eoi(hw_irq, state->pt_data);
1636
1637        /* Clear old_p/old_q as they are no longer relevant */
1638        state->old_p = state->old_q = false;
1639
1640        /* Restore guest prio (unlocks EOI) */
1641        mb();
1642        state->guest_priority = prio;
1643        arch_spin_unlock(&sb->lock);
1644
1645        return 0;
1646}
1647EXPORT_SYMBOL_GPL(kvmppc_xive_set_mapped);
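    /*
     * For reference, a rough summary of the ESB "PQ" source states that the
     * masking/unmasking logic above and below relies on (the XIVE spec is
     * the authoritative description of the state machine):
     *
     *      PQ=00  enabled: the next trigger sets P and is forwarded to a queue
     *      PQ=10  an occurrence is in a queue awaiting EOI; further triggers
     *             only set Q (they are coalesced)
     *      PQ=11  as above, plus a coalesced trigger to replay on EOI
     *      PQ=01  "off": triggers are discarded (used to hard-mask a source)
     */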
1648
1649int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq,
1650                           unsigned long host_irq)
1651{
1652        struct kvmppc_xive *xive = kvm->arch.xive;
1653        struct kvmppc_xive_src_block *sb;
1654        struct kvmppc_xive_irq_state *state;
1655        u16 idx;
1656        u8 prio;
1657        int rc;
1658
1659        if (!xive)
1660                return -ENODEV;
1661
1662        pr_debug("%s: GIRQ 0x%lx host IRQ %ld\n", __func__, guest_irq, host_irq);
1663
1664        sb = kvmppc_xive_find_source(xive, guest_irq, &idx);
1665        if (!sb)
1666                return -EINVAL;
1667        state = &sb->irq_state[idx];
1668
1669        /*
1670         * Mask and read state of IRQ. We need to know if its P bit
1671         * is set, as that means it's potentially already using a
1672         * queue entry in the target.
1673         */
1674        prio = xive_lock_and_mask(xive, sb, state);
1675        pr_devel(" old IRQ prio %02x P:%d Q:%d\n", prio,
1676                 state->old_p, state->old_q);
1677
1678        /*
1679         * If old_p is set, the interrupt is pending; we switch it to
1680         * PQ=11. This will force a resend in the host so the interrupt
1681         * isn't lost to whatever host driver may pick it up.
1682         */
1683        if (state->old_p)
1684                xive_vm_esb_load(state->pt_data, XIVE_ESB_SET_PQ_11);
1685
1686        /* Release the passed-through interrupt to the host */
1687        rc = irq_set_vcpu_affinity(host_irq, NULL);
1688        if (rc) {
1689                pr_err("Failed to clr VCPU affinity for host IRQ %ld\n", host_irq);
1690                return rc;
1691        }
1692
1693        /* Forget about the IRQ */
1694        state->pt_number = 0;
1695        state->pt_data = NULL;
1696
1697        /*
1698         * Reset ESB guest mapping. Needed when ESB pages are exposed
1699         * to the guest in XIVE native mode
1700         */
1701        if (xive->ops && xive->ops->reset_mapped) {
1702                xive->ops->reset_mapped(kvm, guest_irq);
1703        }
1704
1705        /* Reconfigure the IPI */
1706        xive_native_configure_irq(state->ipi_number,
1707                                  kvmppc_xive_vp(xive, state->act_server),
1708                                  state->act_priority, state->number);
1709
1710        /*
1711         * If old_p is set (we have a queue entry potentially
1712         * occupied) or the interrupt is masked, we set the IPI
1713         * to PQ=10 state. Otherwise we just re-enable it (PQ=00).
1714         */
1715        if (prio == MASKED || state->old_p)
1716                xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_10);
1717        else
1718                xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_00);
1719
1720        /* Restore guest prio (unlocks EOI) */
1721        mb();
1722        state->guest_priority = prio;
1723        arch_spin_unlock(&sb->lock);
1724
1725        return 0;
1726}
1727EXPORT_SYMBOL_GPL(kvmppc_xive_clr_mapped);
1728
1729void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu)
1730{
1731        struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
1732        struct kvm *kvm = vcpu->kvm;
1733        struct kvmppc_xive *xive = kvm->arch.xive;
1734        int i, j;
1735
1736        for (i = 0; i <= xive->max_sbid; i++) {
1737                struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
1738
1739                if (!sb)
1740                        continue;
1741                for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++) {
1742                        struct kvmppc_xive_irq_state *state = &sb->irq_state[j];
1743
1744                        if (!state->valid)
1745                                continue;
1746                        if (state->act_priority == MASKED)
1747                                continue;
1748                        if (state->act_server != xc->server_num)
1749                                continue;
1750
1751                        /* Clean it up */
1752                        arch_spin_lock(&sb->lock);
1753                        state->act_priority = MASKED;
1754                        xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
1755                        xive_native_configure_irq(state->ipi_number, 0, MASKED, 0);
1756                        if (state->pt_number) {
1757                                xive_vm_esb_load(state->pt_data, XIVE_ESB_SET_PQ_01);
1758                                xive_native_configure_irq(state->pt_number, 0, MASKED, 0);
1759                        }
1760                        arch_spin_unlock(&sb->lock);
1761                }
1762        }
1763
1764        /* Disable vcpu's escalation interrupt */
1765        if (vcpu->arch.xive_esc_on) {
1766                __raw_readq((void __iomem *)(vcpu->arch.xive_esc_vaddr +
1767                                             XIVE_ESB_SET_PQ_01));
1768                vcpu->arch.xive_esc_on = false;
1769        }
1770
1771        /*
1772         * Clear pointers to escalation interrupt ESB.
1773         * This is safe because the vcpu->mutex is held, preventing
1774         * any other CPU from concurrently executing a KVM_RUN ioctl.
1775         */
1776        vcpu->arch.xive_esc_vaddr = 0;
1777        vcpu->arch.xive_esc_raddr = 0;
1778}
1779
1780/*
1781 * In single escalation mode, the escalation interrupt is marked so
1782 * that EOI doesn't re-enable it, but just sets the stale_p flag to
1783 * indicate that the P bit has already been dealt with.  However, the
1784 * assembly code that enters the guest sets PQ to 00 without clearing
1785 * stale_p (because it has no easy way to address it).  Hence we have
1786 * to adjust stale_p before shutting down the interrupt.
1787 */
1788void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu,
1789                                    struct kvmppc_xive_vcpu *xc, int irq)
1790{
1791        struct irq_data *d = irq_get_irq_data(irq);
1792        struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
1793
1794        /*
1795         * This slightly odd sequence gives the right result
1796         * (i.e. stale_p set if xive_esc_on is false) even if
1797         * we race with xive_esc_irq() and xive_irq_eoi().
1798         */
1799        xd->stale_p = false;
1800        smp_mb();               /* paired with smp_wmb in xive_esc_irq */
1801        if (!vcpu->arch.xive_esc_on)
1802                xd->stale_p = true;
1803}
1804
1805void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
1806{
1807        struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
1808        struct kvmppc_xive *xive = vcpu->kvm->arch.xive;
1809        int i;
1810
1811        if (!kvmppc_xics_enabled(vcpu))
1812                return;
1813
1814        if (!xc)
1815                return;
1816
1817        pr_devel("cleanup_vcpu(cpu=%d)\n", xc->server_num);
1818
1819        /* Ensure no interrupt is still routed to that VP */
1820        xc->valid = false;
1821        kvmppc_xive_disable_vcpu_interrupts(vcpu);
1822
1823        /* Mask the VP IPI */
1824        xive_vm_esb_load(&xc->vp_ipi_data, XIVE_ESB_SET_PQ_01);
1825
1826        /* Free escalations */
1827        for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
1828                if (xc->esc_virq[i]) {
1829                        if (kvmppc_xive_has_single_escalation(xc->xive))
1830                                xive_cleanup_single_escalation(vcpu, xc,
1831                                                        xc->esc_virq[i]);
1832                        free_irq(xc->esc_virq[i], vcpu);
1833                        irq_dispose_mapping(xc->esc_virq[i]);
1834                        kfree(xc->esc_virq_names[i]);
1835                }
1836        }
1837
1838        /* Disable the VP */
1839        xive_native_disable_vp(xc->vp_id);
1840
1841        /* Clear the cam word so guest entry won't try to push context */
1842        vcpu->arch.xive_cam_word = 0;
1843
1844        /* Free the queues */
1845        for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
1846                struct xive_q *q = &xc->queues[i];
1847
1848                xive_native_disable_queue(xc->vp_id, q, i);
1849                if (q->qpage) {
1850                        free_pages((unsigned long)q->qpage,
1851                                   xive->q_page_order);
1852                        q->qpage = NULL;
1853                }
1854        }
1855
1856        /* Free the IPI */
1857        if (xc->vp_ipi) {
1858                xive_cleanup_irq_data(&xc->vp_ipi_data);
1859                xive_native_free_irq(xc->vp_ipi);
1860        }
1861        /* Free the VP */
1862        kfree(xc);
1863
1864        /* Cleanup the vcpu */
1865        vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
1866        vcpu->arch.xive_vcpu = NULL;
1867}
1868
1869static bool kvmppc_xive_vcpu_id_valid(struct kvmppc_xive *xive, u32 cpu)
1870{
1871        /* We have a block of xive->nr_servers VPs. We just need to check
1872         * that packed vCPU ids are below that.
1873         */
1874        return kvmppc_pack_vcpu_id(xive->kvm, cpu) < xive->nr_servers;
1875}
1876
1877int kvmppc_xive_compute_vp_id(struct kvmppc_xive *xive, u32 cpu, u32 *vp)
1878{
1879        u32 vp_id;
1880
1881        if (!kvmppc_xive_vcpu_id_valid(xive, cpu)) {
1882                pr_devel("Out of bounds !\n");
1883                return -EINVAL;
1884        }
1885
1886        if (xive->vp_base == XIVE_INVALID_VP) {
1887                xive->vp_base = xive_native_alloc_vp_block(xive->nr_servers);
1888                pr_devel("VP_Base=%x nr_servers=%d\n", xive->vp_base, xive->nr_servers);
1889
1890                if (xive->vp_base == XIVE_INVALID_VP)
1891                        return -ENOSPC;
1892        }
1893
1894        vp_id = kvmppc_xive_vp(xive, cpu);
1895        if (kvmppc_xive_vp_in_use(xive->kvm, vp_id)) {
1896                pr_devel("Duplicate !\n");
1897                return -EEXIST;
1898        }
1899
1900        *vp = vp_id;
1901
1902        return 0;
1903}
1904
1905int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
1906                             struct kvm_vcpu *vcpu, u32 cpu)
1907{
1908        struct kvmppc_xive *xive = dev->private;
1909        struct kvmppc_xive_vcpu *xc;
1910        int i, r = -EBUSY;
1911        u32 vp_id;
1912
1913        pr_devel("connect_vcpu(cpu=%d)\n", cpu);
1914
1915        if (dev->ops != &kvm_xive_ops) {
1916                pr_devel("Wrong ops !\n");
1917                return -EPERM;
1918        }
1919        if (xive->kvm != vcpu->kvm)
1920                return -EPERM;
1921        if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
1922                return -EBUSY;
1923
1924        /* We need to synchronize with queue provisioning */
1925        mutex_lock(&xive->lock);
1926
1927        r = kvmppc_xive_compute_vp_id(xive, cpu, &vp_id);
1928        if (r)
1929                goto bail;
1930
1931        xc = kzalloc(sizeof(*xc), GFP_KERNEL);
1932        if (!xc) {
1933                r = -ENOMEM;
1934                goto bail;
1935        }
1936
1937        vcpu->arch.xive_vcpu = xc;
1938        xc->xive = xive;
1939        xc->vcpu = vcpu;
1940        xc->server_num = cpu;
1941        xc->vp_id = vp_id;
1942        xc->mfrr = 0xff;
1943        xc->valid = true;
1944
1945        r = xive_native_get_vp_info(xc->vp_id, &xc->vp_cam, &xc->vp_chip_id);
1946        if (r)
1947                goto bail;
1948
1949        if (!kvmppc_xive_check_save_restore(vcpu)) {
1950                pr_err("inconsistent save-restore setup for VCPU %d\n", cpu);
1951                r = -EIO;
1952                goto bail;
1953        }
1954
1955        /* Configure VCPU fields for use by assembly push/pull */
1956        vcpu->arch.xive_saved_state.w01 = cpu_to_be64(0xff000000);
1957        vcpu->arch.xive_cam_word = cpu_to_be32(xc->vp_cam | TM_QW1W2_VO);
1958
1959        /* Allocate IPI */
1960        xc->vp_ipi = xive_native_alloc_irq();
1961        if (!xc->vp_ipi) {
1962                pr_err("Failed to allocate xive irq for VCPU IPI\n");
1963                r = -EIO;
1964                goto bail;
1965        }
1966        pr_devel(" IPI=0x%x\n", xc->vp_ipi);
1967
1968        r = xive_native_populate_irq_data(xc->vp_ipi, &xc->vp_ipi_data);
1969        if (r)
1970                goto bail;
1971
1972        /*
1973         * Enable the VP first as the single escalation mode will
1974         * affect escalation interrupts numbering
1975         */
1976        r = xive_native_enable_vp(xc->vp_id, kvmppc_xive_has_single_escalation(xive));
1977        if (r) {
1978                pr_err("Failed to enable VP in OPAL, err %d\n", r);
1979                goto bail;
1980        }
1981
1982        /*
1983         * Initialize queues. Initially we set them all for no queueing
1984         * and we enable escalation for queue 0 only, which we'll use for
1985         * our mfrr change notifications. If the VCPU is hot-plugged, we
1986         * do however handle provisioning based on the existing "map"
1987         * of enabled queues.
1988         */
1989        for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
1990                struct xive_q *q = &xc->queues[i];
1991
1992                /* Single escalation, no queue 7 */
1993                if (i == 7 && kvmppc_xive_has_single_escalation(xive))
1994                        break;
1995
1996                /* Is queue already enabled ? Provision it */
1997                if (xive->qmap & (1 << i)) {
1998                        r = xive_provision_queue(vcpu, i);
1999                        if (r == 0 && !kvmppc_xive_has_single_escalation(xive))
2000                                kvmppc_xive_attach_escalation(
2001                                        vcpu, i, kvmppc_xive_has_single_escalation(xive));
2002                        if (r)
2003                                goto bail;
2004                } else {
2005                        r = xive_native_configure_queue(xc->vp_id,
2006                                                        q, i, NULL, 0, true);
2007                        if (r) {
2008                                pr_err("Failed to configure queue %d for VCPU %d\n",
2009                                       i, cpu);
2010                                goto bail;
2011                        }
2012                }
2013        }
2014
2015        /* If not done above, attach priority 0 escalation */
2016        r = kvmppc_xive_attach_escalation(vcpu, 0, kvmppc_xive_has_single_escalation(xive));
2017        if (r)
2018                goto bail;
2019
2020        /* Route the IPI */
2021        r = xive_native_configure_irq(xc->vp_ipi, xc->vp_id, 0, XICS_IPI);
2022        if (!r)
2023                xive_vm_esb_load(&xc->vp_ipi_data, XIVE_ESB_SET_PQ_00);
2024
2025bail:
2026        mutex_unlock(&xive->lock);
2027        if (r) {
2028                kvmppc_xive_cleanup_vcpu(vcpu);
2029                return r;
2030        }
2031
2032        vcpu->arch.irq_type = KVMPPC_IRQ_XICS;
2033        return 0;
2034}
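    /*
     * Illustrative sketch (userspace side, assuming the device was created
     * with type KVM_DEV_TYPE_XICS): a VMM connects a vCPU to this device
     * through the KVM_CAP_IRQ_XICS capability, passing the device fd and
     * the server number that ends up in xc->server_num above:
     *
     *      struct kvm_enable_cap cap = {
     *              .cap  = KVM_CAP_IRQ_XICS,
     *              .args = { xics_device_fd, server_num },
     *      };
     *      ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap);
     */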
2035
2036/*
2037 * Scanning of queues before/after migration save
2038 */
2039static void xive_pre_save_set_queued(struct kvmppc_xive *xive, u32 irq)
2040{
2041        struct kvmppc_xive_src_block *sb;
2042        struct kvmppc_xive_irq_state *state;
2043        u16 idx;
2044
2045        sb = kvmppc_xive_find_source(xive, irq, &idx);
2046        if (!sb)
2047                return;
2048
2049        state = &sb->irq_state[idx];
2050
2051        /* Some sanity checking */
2052        if (!state->valid) {
2053                pr_err("invalid irq 0x%x in cpu queue!\n", irq);
2054                return;
2055        }
2056
2057        /*
2058         * If the interrupt is in a queue, it should have P set.
2059         * We warn so that it gets reported. A backtrace isn't useful,
2060         * so no need to use a WARN_ON.
2061         */
2062        if (!state->saved_p)
2063                pr_err("Interrupt 0x%x is marked in a queue but P not set !\n", irq);
2064
2065        /* Set flag */
2066        state->in_queue = true;
2067}
2068
2069static void xive_pre_save_mask_irq(struct kvmppc_xive *xive,
2070                                   struct kvmppc_xive_src_block *sb,
2071                                   u32 irq)
2072{
2073        struct kvmppc_xive_irq_state *state = &sb->irq_state[irq];
2074
2075        if (!state->valid)
2076                return;
2077
2078        /* Mask and save state, this will also sync HW queues */
2079        state->saved_scan_prio = xive_lock_and_mask(xive, sb, state);
2080
2081        /* Transfer P and Q */
2082        state->saved_p = state->old_p;
2083        state->saved_q = state->old_q;
2084
2085        /* Unlock */
2086        arch_spin_unlock(&sb->lock);
2087}
2088
2089static void xive_pre_save_unmask_irq(struct kvmppc_xive *xive,
2090                                     struct kvmppc_xive_src_block *sb,
2091                                     u32 irq)
2092{
2093        struct kvmppc_xive_irq_state *state = &sb->irq_state[irq];
2094
2095        if (!state->valid)
2096                return;
2097
2098        /*
2099         * Lock / exclude EOI (not technically necessary if the
2100         * guest isn't running concurrently). If this becomes a
2101         * performance issue we can probably remove the lock.
2102         */
2103        xive_lock_for_unmask(sb, state);
2104
2105        /* Restore mask/prio if it wasn't masked */
2106        if (state->saved_scan_prio != MASKED)
2107                xive_finish_unmask(xive, sb, state, state->saved_scan_prio);
2108
2109        /* Unlock */
2110        arch_spin_unlock(&sb->lock);
2111}
2112
2113static void xive_pre_save_queue(struct kvmppc_xive *xive, struct xive_q *q)
2114{
2115        u32 idx = q->idx;
2116        u32 toggle = q->toggle;
2117        u32 irq;
2118
2119        do {
2120                irq = __xive_read_eq(q->qpage, q->msk, &idx, &toggle);
2121                if (irq > XICS_IPI)
2122                        xive_pre_save_set_queued(xive, irq);
2123        } while (irq);
2124}
2125
2126static void xive_pre_save_scan(struct kvmppc_xive *xive)
2127{
2128        struct kvm_vcpu *vcpu = NULL;
2129        unsigned long i;
2130        int j;
2131
2132        /*
2133         * See comment in xive_get_source() about how this
2134         * works. Collect a stable state for all interrupts.
2135         */
2136        for (i = 0; i <= xive->max_sbid; i++) {
2137                struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
2138                if (!sb)
2139                        continue;
2140                for (j = 0;  j < KVMPPC_XICS_IRQ_PER_ICS; j++)
2141                        xive_pre_save_mask_irq(xive, sb, j);
2142        }
2143
2144        /* Then scan the queues and update the "in_queue" flag */
2145        kvm_for_each_vcpu(i, vcpu, xive->kvm) {
2146                struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
2147                if (!xc)
2148                        continue;
2149                for (j = 0; j < KVMPPC_XIVE_Q_COUNT; j++) {
2150                        if (xc->queues[j].qpage)
2151                                xive_pre_save_queue(xive, &xc->queues[j]);
2152                }
2153        }
2154
2155        /* Finally restore interrupt states */
2156        for (i = 0; i <= xive->max_sbid; i++) {
2157                struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
2158                if (!sb)
2159                        continue;
2160                for (j = 0;  j < KVMPPC_XICS_IRQ_PER_ICS; j++)
2161                        xive_pre_save_unmask_irq(xive, sb, j);
2162        }
2163}
2164
2165static void xive_post_save_scan(struct kvmppc_xive *xive)
2166{
2167        u32 i, j;
2168
2169        /* Clear all the in_queue flags */
2170        for (i = 0; i <= xive->max_sbid; i++) {
2171                struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
2172                if (!sb)
2173                        continue;
2174                for (j = 0;  j < KVMPPC_XICS_IRQ_PER_ICS; j++)
2175                        sb->irq_state[j].in_queue = false;
2176        }
2177
2178        /* Next get_source() will do a new scan */
2179        xive->saved_src_count = 0;
2180}
2181
2182/*
2183 * This returns the source configuration and state to user space.
2184 */
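    /*
     * A sketch of the encoding, mirroring the code below: the 64-bit word
     * follows the XICS layout from the KVM UAPI header, with the server
     * number in the low bits (KVM_XICS_DESTINATION_MASK), the guest
     * priority at KVM_XICS_PRIORITY_SHIFT, and the KVM_XICS_MASKED,
     * KVM_XICS_LEVEL_SENSITIVE, KVM_XICS_PENDING, KVM_XICS_PRESENTED and
     * KVM_XICS_QUEUED flag bits on top. xive_set_source() parses the same
     * layout on restore.
     */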
2185static int xive_get_source(struct kvmppc_xive *xive, long irq, u64 addr)
2186{
2187        struct kvmppc_xive_src_block *sb;
2188        struct kvmppc_xive_irq_state *state;
2189        u64 __user *ubufp = (u64 __user *) addr;
2190        u64 val, prio;
2191        u16 idx;
2192
2193        sb = kvmppc_xive_find_source(xive, irq, &idx);
2194        if (!sb)
2195                return -ENOENT;
2196
2197        state = &sb->irq_state[idx];
2198
2199        if (!state->valid)
2200                return -ENOENT;
2201
2202        pr_devel("get_source(%ld)...\n", irq);
2203
2204        /*
2205         * To properly save the state into something that looks like a
2206         * XICS migration stream we cannot treat interrupts individually.
2207         *
2208         * We need, instead, to mask them all (& save their previous PQ state)
2209         * to get a stable state in the HW, then sync them to ensure that
2210         * any interrupt that had already fired hits its queue, and finally
2211         * scan all the queues to collect which interrupts are still present
2212         * in the queues, so we can set the "pending" flag on them and
2213         * they can be resent on restore.
2214         *
2215         * So we do it all when the "first" interrupt gets saved, all the
2216         * state is collected at that point, the rest of xive_get_source()
2217         * will merely collect and convert that state to the expected
2218         * userspace bit mask.
2219         */
2220        if (xive->saved_src_count == 0)
2221                xive_pre_save_scan(xive);
2222        xive->saved_src_count++;
2223
2224        /* Convert saved state into something compatible with xics */
2225        val = state->act_server;
2226        prio = state->saved_scan_prio;
2227
2228        if (prio == MASKED) {
2229                val |= KVM_XICS_MASKED;
2230                prio = state->saved_priority;
2231        }
2232        val |= prio << KVM_XICS_PRIORITY_SHIFT;
2233        if (state->lsi) {
2234                val |= KVM_XICS_LEVEL_SENSITIVE;
2235                if (state->saved_p)
2236                        val |= KVM_XICS_PENDING;
2237        } else {
2238                if (state->saved_p)
2239                        val |= KVM_XICS_PRESENTED;
2240
2241                if (state->saved_q)
2242                        val |= KVM_XICS_QUEUED;
2243
2244                /*
2245                 * We mark it pending (which will attempt a re-delivery)
2246                 * if we are in a queue *or* we were masked and had
2247                 * Q set, which is equivalent to the XICS "masked pending"
2248                 * state.
2249                 */
2250                if (state->in_queue || (prio == MASKED && state->saved_q))
2251                        val |= KVM_XICS_PENDING;
2252        }
2253
2254        /*
2255         * If that was the last interrupt saved, reset the
2256         * in_queue flags
2257         */
2258        if (xive->saved_src_count == xive->src_count)
2259                xive_post_save_scan(xive);
2260
2261        /* Copy the result to userspace */
2262        if (put_user(val, ubufp))
2263                return -EFAULT;
2264
2265        return 0;
2266}
2267
2268struct kvmppc_xive_src_block *kvmppc_xive_create_src_block(
2269        struct kvmppc_xive *xive, int irq)
2270{
2271        struct kvmppc_xive_src_block *sb;
2272        int i, bid;
2273
2274        bid = irq >> KVMPPC_XICS_ICS_SHIFT;
2275
2276        mutex_lock(&xive->lock);
2277
2278        /* block already exists - somebody else got here first */
2279        if (xive->src_blocks[bid])
2280                goto out;
2281
2282        /* Create the ICS */
2283        sb = kzalloc(sizeof(*sb), GFP_KERNEL);
2284        if (!sb)
2285                goto out;
2286
2287        sb->id = bid;
2288
2289        for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
2290                sb->irq_state[i].number = (bid << KVMPPC_XICS_ICS_SHIFT) | i;
2291                sb->irq_state[i].eisn = 0;
2292                sb->irq_state[i].guest_priority = MASKED;
2293                sb->irq_state[i].saved_priority = MASKED;
2294                sb->irq_state[i].act_priority = MASKED;
2295        }
2296        smp_wmb();
2297        xive->src_blocks[bid] = sb;
2298
2299        if (bid > xive->max_sbid)
2300                xive->max_sbid = bid;
2301
2302out:
2303        mutex_unlock(&xive->lock);
2304        return xive->src_blocks[bid];
2305}
2306
2307static bool xive_check_delayed_irq(struct kvmppc_xive *xive, u32 irq)
2308{
2309        struct kvm *kvm = xive->kvm;
2310        struct kvm_vcpu *vcpu = NULL;
2311        unsigned long i;
2312
2313        kvm_for_each_vcpu(i, vcpu, kvm) {
2314                struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
2315
2316                if (!xc)
2317                        continue;
2318
2319                if (xc->delayed_irq == irq) {
2320                        xc->delayed_irq = 0;
2321                        xive->delayed_irqs--;
2322                        return true;
2323                }
2324        }
2325        return false;
2326}
2327
2328static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr)
2329{
2330        struct kvmppc_xive_src_block *sb;
2331        struct kvmppc_xive_irq_state *state;
2332        u64 __user *ubufp = (u64 __user *) addr;
2333        u16 idx;
2334        u64 val;
2335        u8 act_prio, guest_prio;
2336        u32 server;
2337        int rc = 0;
2338
2339        if (irq < KVMPPC_XICS_FIRST_IRQ || irq >= KVMPPC_XICS_NR_IRQS)
2340                return -ENOENT;
2341
2342        pr_devel("set_source(irq=0x%lx)\n", irq);
2343
2344        /* Find the source */
2345        sb = kvmppc_xive_find_source(xive, irq, &idx);
2346        if (!sb) {
2347                pr_devel("No source, creating source block...\n");
2348                sb = kvmppc_xive_create_src_block(xive, irq);
2349                if (!sb) {
2350                        pr_devel("Failed to create block...\n");
2351                        return -ENOMEM;
2352                }
2353        }
2354        state = &sb->irq_state[idx];
2355
2356        /* Read user passed data */
2357        if (get_user(val, ubufp)) {
2358                pr_devel("fault getting user info !\n");
2359                return -EFAULT;
2360        }
2361
2362        server = val & KVM_XICS_DESTINATION_MASK;
2363        guest_prio = val >> KVM_XICS_PRIORITY_SHIFT;
2364
2365        pr_devel("  val=0x%016llx (server=0x%x, guest_prio=%d)\n",
2366                 val, server, guest_prio);
2367
2368        /*
2369         * If the source doesn't already have an IPI, allocate
2370         * one and get the corresponding data
2371         */
2372        if (!state->ipi_number) {
2373                state->ipi_number = xive_native_alloc_irq();
2374                if (state->ipi_number == 0) {
2375                        pr_devel("Failed to allocate IPI !\n");
2376                        return -ENOMEM;
2377                }
2378                xive_native_populate_irq_data(state->ipi_number, &state->ipi_data);
2379                pr_devel(" src_ipi=0x%x\n", state->ipi_number);
2380        }
2381
2382        /*
2383         * We use lock_and_mask() to set us in the right masked
2384         * state. We will override that state from the saved state
2385         * further down, but this will handle the cases of interrupts
2386         * that need FW masking. We set the initial guest_priority to
2387         * 0 before calling it to ensure it actually performs the masking.
2388         */
2389        state->guest_priority = 0;
2390        xive_lock_and_mask(xive, sb, state);
2391
2392        /*
2393         * Now, we select a target if we have one. If we don't, we
2394         * leave the interrupt untargeted. It means that an interrupt
2395         * can become "untargeted" across migration if it was masked
2396         * by set_xive(), but there is little we can do about it.
2397         */
2398
2399        /* First convert prio and mark interrupt as untargeted */
2400        act_prio = xive_prio_from_guest(guest_prio);
2401        state->act_priority = MASKED;
2402
2403        /*
2404         * We need to drop the lock due to the mutex below. Hopefully
2405         * nothing is touching that interrupt yet since it hasn't been
2406         * advertised to a running guest yet.
2407         */
2408        arch_spin_unlock(&sb->lock);
2409
2410        /* If we have a priority, target the interrupt */
2411        if (act_prio != MASKED) {
2412                /* First, check provisioning of queues */
2413                mutex_lock(&xive->lock);
2414                rc = xive_check_provisioning(xive->kvm, act_prio);
2415                mutex_unlock(&xive->lock);
2416
2417                /* Target interrupt */
2418                if (rc == 0)
2419                        rc = xive_target_interrupt(xive->kvm, state,
2420                                                   server, act_prio);
2421                /*
2422                 * If provisioning or targeting failed, leave it
2423                 * alone and masked. It will remain disabled until
2424                 * the guest re-targets it.
2425                 */
2426        }
2427
2428        /*
2429         * Find out if this was a delayed irq stashed in an ICP,
2430         * in which case, treat it as pending
2431         */
2432        if (xive->delayed_irqs && xive_check_delayed_irq(xive, irq)) {
2433                val |= KVM_XICS_PENDING;
2434                pr_devel("  Found delayed ! forcing PENDING !\n");
2435        }
2436
2437        /* Cleanup the SW state */
2438        state->old_p = false;
2439        state->old_q = false;
2440        state->lsi = false;
2441        state->asserted = false;
2442
2443        /* Restore LSI state */
2444        if (val & KVM_XICS_LEVEL_SENSITIVE) {
2445                state->lsi = true;
2446                if (val & KVM_XICS_PENDING)
2447                        state->asserted = true;
2448                pr_devel("  LSI ! Asserted=%d\n", state->asserted);
2449        }
2450
2451        /*
2452         * Restore P and Q. If the interrupt was pending, we
2453         * force Q and !P, which will trigger a resend.
2454         *
2455         * That means that a guest that had both an interrupt
2456         * pending (queued) and Q set will restore with only
2457         * one instance of that interrupt instead of 2, but that
2458         * is perfectly fine as coalescing interrupts that haven't
2459         * been presented yet is always allowed.
2460         */
2461        if (val & KVM_XICS_PRESENTED && !(val & KVM_XICS_PENDING))
2462                state->old_p = true;
2463        if (val & KVM_XICS_QUEUED || val & KVM_XICS_PENDING)
2464                state->old_q = true;
2465
2466        pr_devel("  P=%d, Q=%d\n", state->old_p, state->old_q);
2467
2468        /*
2469         * If the interrupt was unmasked, update guest priority and
2470         * perform the appropriate state transition and do a
2471         * re-trigger if necessary.
2472         */
2473        if (val & KVM_XICS_MASKED) {
2474                pr_devel("  masked, saving prio\n");
2475                state->guest_priority = MASKED;
2476                state->saved_priority = guest_prio;
2477        } else {
2478                pr_devel("  unmasked, restoring to prio %d\n", guest_prio);
2479                xive_finish_unmask(xive, sb, state, guest_prio);
2480                state->saved_priority = guest_prio;
2481        }
2482
2483        /* Increment the number of valid sources and mark this one valid */
2484        if (!state->valid)
2485                xive->src_count++;
2486        state->valid = true;
2487
2488        return 0;
2489}
2490
2491int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
2492                        bool line_status)
2493{
2494        struct kvmppc_xive *xive = kvm->arch.xive;
2495        struct kvmppc_xive_src_block *sb;
2496        struct kvmppc_xive_irq_state *state;
2497        u16 idx;
2498
2499        if (!xive)
2500                return -ENODEV;
2501
2502        sb = kvmppc_xive_find_source(xive, irq, &idx);
2503        if (!sb)
2504                return -EINVAL;
2505
2506        /* Perform locklessly .... (we need to do some RCUisms here...) */
2507        state = &sb->irq_state[idx];
2508        if (!state->valid)
2509                return -EINVAL;
2510
2511        /* We don't allow a trigger on a passed-through interrupt */
2512        if (state->pt_number)
2513                return -EINVAL;
2514
2515        if ((level == 1 && state->lsi) || level == KVM_INTERRUPT_SET_LEVEL)
2516                state->asserted = true;
2517        else if (level == 0 || level == KVM_INTERRUPT_UNSET) {
2518                state->asserted = false;
2519                return 0;
2520        }
2521
2522        /* Trigger the IPI */
2523        xive_irq_trigger(&state->ipi_data);
2524
2525        return 0;
2526}
2527
2528int kvmppc_xive_set_nr_servers(struct kvmppc_xive *xive, u64 addr)
2529{
2530        u32 __user *ubufp = (u32 __user *) addr;
2531        u32 nr_servers;
2532        int rc = 0;
2533
2534        if (get_user(nr_servers, ubufp))
2535                return -EFAULT;
2536
2537        pr_devel("%s nr_servers=%u\n", __func__, nr_servers);
2538
2539        if (!nr_servers || nr_servers > KVM_MAX_VCPU_IDS)
2540                return -EINVAL;
2541
2542        mutex_lock(&xive->lock);
2543        if (xive->vp_base != XIVE_INVALID_VP)
2544                /* The VP block is allocated once and freed when the device
2545                 * is released. Better not allow its size to be changed since it's
2546                 * used by connect_vcpu to validate that vCPU ids are valid (e.g.,
2547                 * setting it back to a higher value could allow connect_vcpu
2548                 * to come up with a VP id that goes beyond the VP block, which
2549                 * is likely to cause a crash in OPAL).
2550                 */
2551                rc = -EBUSY;
2552        else if (nr_servers > KVM_MAX_VCPUS)
2553                /* We don't need more servers. Higher vCPU ids get packed
2554                 * down below KVM_MAX_VCPUS by kvmppc_pack_vcpu_id().
2555                 */
2556                xive->nr_servers = KVM_MAX_VCPUS;
2557        else
2558                xive->nr_servers = nr_servers;
2559
2560        mutex_unlock(&xive->lock);
2561
2562        return rc;
2563}
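    /*
     * Illustrative sketch (userspace side): the server count is set through
     * the generic device-attribute interface, before the VP block gets
     * allocated by the first vCPU connection:
     *
     *      __u32 nr_servers = ...;  /* desired number of servers */
     *      struct kvm_device_attr attr = {
     *              .group = KVM_DEV_XICS_GRP_CTRL,
     *              .attr  = KVM_DEV_XICS_NR_SERVERS,
     *              .addr  = (__u64)(unsigned long)&nr_servers,
     *      };
     *      ioctl(xics_device_fd, KVM_SET_DEVICE_ATTR, &attr);
     */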
2564
2565static int xive_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
2566{
2567        struct kvmppc_xive *xive = dev->private;
2568
2569        /* We honor the existing XICS ioctl */
2570        switch (attr->group) {
2571        case KVM_DEV_XICS_GRP_SOURCES:
2572                return xive_set_source(xive, attr->attr, attr->addr);
2573        case KVM_DEV_XICS_GRP_CTRL:
2574                switch (attr->attr) {
2575                case KVM_DEV_XICS_NR_SERVERS:
2576                        return kvmppc_xive_set_nr_servers(xive, attr->addr);
2577                }
2578        }
2579        return -ENXIO;
2580}
2581
2582static int xive_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
2583{
2584        struct kvmppc_xive *xive = dev->private;
2585
2586        /* We honor the existing XICS ioctl */
2587        switch (attr->group) {
2588        case KVM_DEV_XICS_GRP_SOURCES:
2589                return xive_get_source(xive, attr->attr, attr->addr);
2590        }
2591        return -ENXIO;
2592}
2593
2594static int xive_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
2595{
2596        /* We honor the same limits as XICS, at least for now */
2597        switch (attr->group) {
2598        case KVM_DEV_XICS_GRP_SOURCES:
2599                if (attr->attr >= KVMPPC_XICS_FIRST_IRQ &&
2600                    attr->attr < KVMPPC_XICS_NR_IRQS)
2601                        return 0;
2602                break;
2603        case KVM_DEV_XICS_GRP_CTRL:
2604                switch (attr->attr) {
2605                case KVM_DEV_XICS_NR_SERVERS:
2606                        return 0;
2607                }
2608        }
2609        return -ENXIO;
2610}
2611
2612static void kvmppc_xive_cleanup_irq(u32 hw_num, struct xive_irq_data *xd)
2613{
2614        xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_01);
2615        xive_native_configure_irq(hw_num, 0, MASKED, 0);
2616}
2617
2618void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb)
2619{
2620        int i;
2621
2622        for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
2623                struct kvmppc_xive_irq_state *state = &sb->irq_state[i];
2624
2625                if (!state->valid)
2626                        continue;
2627
2628                kvmppc_xive_cleanup_irq(state->ipi_number, &state->ipi_data);
2629                xive_cleanup_irq_data(&state->ipi_data);
2630                xive_native_free_irq(state->ipi_number);
2631
2632                /* Pass-through, cleanup too but keep IRQ hw data */
2633                if (state->pt_number)
2634                        kvmppc_xive_cleanup_irq(state->pt_number, state->pt_data);
2635
2636                state->valid = false;
2637        }
2638}
2639
2640/*
2641 * Called when device fd is closed.  kvm->lock is held.
2642 */
2643static void kvmppc_xive_release(struct kvm_device *dev)
2644{
2645        struct kvmppc_xive *xive = dev->private;
2646        struct kvm *kvm = xive->kvm;
2647        struct kvm_vcpu *vcpu;
2648        unsigned long i;
2649
2650        pr_devel("Releasing xive device\n");
2651
2652        /*
2653         * Since this is the device release function, we know that
2654         * userspace does not have any open fd referring to the
2655         * device.  Therefore none of the device attribute set/get
2656         * functions can be executing concurrently, and similarly,
2657         * the connect_vcpu and set/clr_mapped functions cannot be
2658         * executing either.
2659         */
2660
2661        debugfs_remove(xive->dentry);
2662
2663        /*
2664         * We should clean up the vCPU interrupt presenters first.
2665         */
2666        kvm_for_each_vcpu(i, vcpu, kvm) {
2667                /*
2668                 * Take vcpu->mutex to ensure that no one_reg get/set ioctl
2669                 * (i.e. kvmppc_xive_[gs]et_icp) can be done concurrently.
2670                 * Holding the vcpu->mutex also means that the vcpu cannot
2671                 * be executing the KVM_RUN ioctl, and therefore it cannot
2672                 * be executing the XIVE push or pull code or accessing
2673                 * the XIVE MMIO regions.
2674                 */
2675                mutex_lock(&vcpu->mutex);
2676                kvmppc_xive_cleanup_vcpu(vcpu);
2677                mutex_unlock(&vcpu->mutex);
2678        }
2679
2680        /*
2681         * Now that we have cleared vcpu->arch.xive_vcpu, vcpu->arch.irq_type
2682         * and vcpu->arch.xive_esc_[vr]addr on each vcpu, we are safe
2683         * against xive code getting called during vcpu execution or
2684         * set/get one_reg operations.
2685         */
2686        kvm->arch.xive = NULL;
2687
2688        /* Mask and free interrupts */
2689        for (i = 0; i <= xive->max_sbid; i++) {
2690                if (xive->src_blocks[i])
2691                        kvmppc_xive_free_sources(xive->src_blocks[i]);
2692                kfree(xive->src_blocks[i]);
2693                xive->src_blocks[i] = NULL;
2694        }
2695
2696        if (xive->vp_base != XIVE_INVALID_VP)
2697                xive_native_free_vp_block(xive->vp_base);
2698
2699        /*
2700         * A reference to the kvmppc_xive pointer is now kept under
2701         * the xive_devices struct of the machine for reuse. For now
2702         * it is freed when the VM is destroyed, until we fix all the
2703         * execution paths.
2704         */
2705
2706        kfree(dev);
2707}
2708
2709/*
2710 * When the guest chooses the interrupt mode (XICS legacy or XIVE
2711 * native), the VM will switch KVM devices. The previous device will
2712 * be "released" before the new one is created.
2713 *
2714 * Until we are sure all execution paths are well protected, provide a
2715 * fail-safe (transitional) method for device destruction, in which
2716 * the XIVE device pointer is recycled and not directly freed.
2717 */
2718struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type)
2719{
2720        struct kvmppc_xive **kvm_xive_device = type == KVM_DEV_TYPE_XIVE ?
2721                &kvm->arch.xive_devices.native :
2722                &kvm->arch.xive_devices.xics_on_xive;
2723        struct kvmppc_xive *xive = *kvm_xive_device;
2724
2725        if (!xive) {
2726                xive = kzalloc(sizeof(*xive), GFP_KERNEL);
2727                *kvm_xive_device = xive;
2728        } else {
2729                memset(xive, 0, sizeof(*xive));
2730        }
2731
2732        return xive;
2733}
2734
2735/*
2736 * Create a XICS device with XIVE backend.  kvm->lock is held.
2737 */
2738static int kvmppc_xive_create(struct kvm_device *dev, u32 type)
2739{
2740        struct kvmppc_xive *xive;
2741        struct kvm *kvm = dev->kvm;
2742
2743        pr_devel("Creating xive for partition\n");
2744
2745        /* Already there ? */
2746        if (kvm->arch.xive)
2747                return -EEXIST;
2748
2749        xive = kvmppc_xive_get_device(kvm, type);
2750        if (!xive)
2751                return -ENOMEM;
2752
2753        dev->private = xive;
2754        xive->dev = dev;
2755        xive->kvm = kvm;
2756        mutex_init(&xive->lock);
2757
2758        /* We use the default queue size set by the host */
2759        xive->q_order = xive_native_default_eq_shift();
2760        if (xive->q_order < PAGE_SHIFT)
2761                xive->q_page_order = 0;
2762        else
2763                xive->q_page_order = xive->q_order - PAGE_SHIFT;
2764
2765        /* VP allocation is delayed to the first call to connect_vcpu */
2766        xive->vp_base = XIVE_INVALID_VP;
2767        /* KVM_MAX_VCPUS limits the number of VMs to roughly 64 per socket
2768         * on a POWER9 system.
2769         */
2770        xive->nr_servers = KVM_MAX_VCPUS;
2771
2772        if (xive_native_has_single_escalation())
2773                xive->flags |= KVMPPC_XIVE_FLAG_SINGLE_ESCALATION;
2774
2775        if (xive_native_has_save_restore())
2776                xive->flags |= KVMPPC_XIVE_FLAG_SAVE_RESTORE;
2777
2778        kvm->arch.xive = xive;
2779        return 0;
2780}
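    /*
     * Illustrative sketch (userspace side, assuming kvm_xive_ops is
     * registered for the XICS device type on hosts using XIVE): a VMM
     * instantiates this device with KVM_CREATE_DEVICE and then uses the
     * returned fd for the attribute and vCPU-connection calls sketched
     * earlier in this file:
     *
     *      struct kvm_create_device cd = { .type = KVM_DEV_TYPE_XICS };
     *      ioctl(vm_fd, KVM_CREATE_DEVICE, &cd);
     *      /* cd.fd now refers to the in-kernel XICS-on-XIVE device */
     */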
2781
2782int kvmppc_xive_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
2783{
2784        struct kvmppc_vcore *vc = vcpu->arch.vcore;
2785
2786        /* The VM should have configured XICS mode before doing XICS hcalls. */
2787        if (!kvmppc_xics_enabled(vcpu))
2788                return H_TOO_HARD;
2789
2790        switch (req) {
2791        case H_XIRR:
2792                return xive_vm_h_xirr(vcpu);
2793        case H_CPPR:
2794                return xive_vm_h_cppr(vcpu, kvmppc_get_gpr(vcpu, 4));
2795        case H_EOI:
2796                return xive_vm_h_eoi(vcpu, kvmppc_get_gpr(vcpu, 4));
2797        case H_IPI:
2798                return xive_vm_h_ipi(vcpu, kvmppc_get_gpr(vcpu, 4),
2799                                          kvmppc_get_gpr(vcpu, 5));
2800        case H_IPOLL:
2801                return xive_vm_h_ipoll(vcpu, kvmppc_get_gpr(vcpu, 4));
2802        case H_XIRR_X:
2803                xive_vm_h_xirr(vcpu);
2804                kvmppc_set_gpr(vcpu, 5, get_tb() + vc->tb_offset);
2805                return H_SUCCESS;
2806        }
2807
2808        return H_UNSUPPORTED;
2809}
2810EXPORT_SYMBOL_GPL(kvmppc_xive_xics_hcall);
2811
2812int kvmppc_xive_debug_show_queues(struct seq_file *m, struct kvm_vcpu *vcpu)
2813{
2814        struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
2815        unsigned int i;
2816
2817        for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
2818                struct xive_q *q = &xc->queues[i];
2819                u32 i0, i1, idx;
2820
2821                if (!q->qpage && !xc->esc_virq[i])
2822                        continue;
2823
2824                if (q->qpage) {
2825                        seq_printf(m, "    q[%d]: ", i);
2826                        idx = q->idx;
2827                        i0 = be32_to_cpup(q->qpage + idx);
2828                        idx = (idx + 1) & q->msk;
2829                        i1 = be32_to_cpup(q->qpage + idx);
2830                        seq_printf(m, "T=%d %08x %08x...\n", q->toggle,
2831                                   i0, i1);
2832                }
2833                if (xc->esc_virq[i]) {
2834                        struct irq_data *d = irq_get_irq_data(xc->esc_virq[i]);
2835                        struct xive_irq_data *xd =
2836                                irq_data_get_irq_handler_data(d);
2837                        u64 pq = xive_vm_esb_load(xd, XIVE_ESB_GET);
2838
2839                        seq_printf(m, "    ESC %d %c%c EOI @%llx",
2840                                   xc->esc_virq[i],
2841                                   (pq & XIVE_ESB_VAL_P) ? 'P' : '-',
2842                                   (pq & XIVE_ESB_VAL_Q) ? 'Q' : '-',
2843                                   xd->eoi_page);
2844                        seq_puts(m, "\n");
2845                }
2846        }
2847        return 0;
2848}
2849
2850void kvmppc_xive_debug_show_sources(struct seq_file *m,
2851                                    struct kvmppc_xive_src_block *sb)
2852{
2853        int i;
2854
2855        seq_puts(m, "    LISN      HW/CHIP   TYPE    PQ      EISN    CPU/PRIO\n");
2856        for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
2857                struct kvmppc_xive_irq_state *state = &sb->irq_state[i];
2858                struct xive_irq_data *xd;
2859                u64 pq;
2860                u32 hw_num;
2861
2862                if (!state->valid)
2863                        continue;
2864
2865                kvmppc_xive_select_irq(state, &hw_num, &xd);
2866
2867                pq = xive_vm_esb_load(xd, XIVE_ESB_GET);
2868
2869                seq_printf(m, "%08x  %08x/%02x", state->number, hw_num,
2870                           xd->src_chip);
2871                if (state->lsi)
2872                        seq_printf(m, " %cLSI", state->asserted ? '^' : ' ');
2873                else
2874                        seq_puts(m, "  MSI");
2875
2876                seq_printf(m, " %s  %c%c  %08x   % 4d/%d",
2877                           state->ipi_number == hw_num ? "IPI" : " PT",
2878                           pq & XIVE_ESB_VAL_P ? 'P' : '-',
2879                           pq & XIVE_ESB_VAL_Q ? 'Q' : '-',
2880                           state->eisn, state->act_server,
2881                           state->act_priority);
2882
2883                seq_puts(m, "\n");
2884        }
2885}
2886
2887static int xive_debug_show(struct seq_file *m, void *private)
2888{
2889        struct kvmppc_xive *xive = m->private;
2890        struct kvm *kvm = xive->kvm;
2891        struct kvm_vcpu *vcpu;
2892        u64 t_rm_h_xirr = 0;
2893        u64 t_rm_h_ipoll = 0;
2894        u64 t_rm_h_cppr = 0;
2895        u64 t_rm_h_eoi = 0;
2896        u64 t_rm_h_ipi = 0;
2897        u64 t_vm_h_xirr = 0;
2898        u64 t_vm_h_ipoll = 0;
2899        u64 t_vm_h_cppr = 0;
2900        u64 t_vm_h_eoi = 0;
2901        u64 t_vm_h_ipi = 0;
2902        unsigned long i;
2903
2904        if (!kvm)
2905                return 0;
2906
2907        seq_puts(m, "=========\nVCPU state\n=========\n");
2908
2909        kvm_for_each_vcpu(i, vcpu, kvm) {
2910                struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
2911
2912                if (!xc)
2913                        continue;
2914
2915                seq_printf(m, "VCPU %d: VP:%#x/%02x\n"
2916                         "    CPPR:%#x HWCPPR:%#x MFRR:%#x PEND:%#x h_xirr: R=%lld V=%lld\n",
2917                         xc->server_num, xc->vp_id, xc->vp_chip_id,
2918                         xc->cppr, xc->hw_cppr,
2919                         xc->mfrr, xc->pending,
2920                         xc->stat_rm_h_xirr, xc->stat_vm_h_xirr);
2921
2922                kvmppc_xive_debug_show_queues(m, vcpu);
2923
2924                t_rm_h_xirr += xc->stat_rm_h_xirr;
2925                t_rm_h_ipoll += xc->stat_rm_h_ipoll;
2926                t_rm_h_cppr += xc->stat_rm_h_cppr;
2927                t_rm_h_eoi += xc->stat_rm_h_eoi;
2928                t_rm_h_ipi += xc->stat_rm_h_ipi;
2929                t_vm_h_xirr += xc->stat_vm_h_xirr;
2930                t_vm_h_ipoll += xc->stat_vm_h_ipoll;
2931                t_vm_h_cppr += xc->stat_vm_h_cppr;
2932                t_vm_h_eoi += xc->stat_vm_h_eoi;
2933                t_vm_h_ipi += xc->stat_vm_h_ipi;
2934        }
2935
2936        seq_puts(m, "Hcalls totals\n");
2937        seq_printf(m, " H_XIRR  R=%10lld V=%10lld\n", t_rm_h_xirr, t_vm_h_xirr);
2938        seq_printf(m, " H_IPOLL R=%10lld V=%10lld\n", t_rm_h_ipoll, t_vm_h_ipoll);
2939        seq_printf(m, " H_CPPR  R=%10lld V=%10lld\n", t_rm_h_cppr, t_vm_h_cppr);
2940        seq_printf(m, " H_EOI   R=%10lld V=%10lld\n", t_rm_h_eoi, t_vm_h_eoi);
2941        seq_printf(m, " H_IPI   R=%10lld V=%10lld\n", t_rm_h_ipi, t_vm_h_ipi);
2942
2943        seq_puts(m, "=========\nSources\n=========\n");
2944
2945        for (i = 0; i <= xive->max_sbid; i++) {
2946                struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
2947
2948                if (sb) {
2949                        arch_spin_lock(&sb->lock);
2950                        kvmppc_xive_debug_show_sources(m, sb);
2951                        arch_spin_unlock(&sb->lock);
2952                }
2953        }
2954
2955        return 0;
2956}
2957
2958DEFINE_SHOW_ATTRIBUTE(xive_debug);
2959
2960static void xive_debugfs_init(struct kvmppc_xive *xive)
2961{
2962        xive->dentry = debugfs_create_file("xive", S_IRUGO, xive->kvm->debugfs_dentry,
2963                                           xive, &xive_debug_fops);
2964
2965        pr_debug("%s: created\n", __func__);
2966}
2967
2968static void kvmppc_xive_init(struct kvm_device *dev)
2969{
2970        struct kvmppc_xive *xive = dev->private;
2971
2972        /* Register some debug interfaces */
2973        xive_debugfs_init(xive);
2974}
2975
2976struct kvm_device_ops kvm_xive_ops = {
2977        .name = "kvm-xive",
2978        .create = kvmppc_xive_create,
2979        .init = kvmppc_xive_init,
2980        .release = kvmppc_xive_release,
2981        .set_attr = xive_set_attr,
2982        .get_attr = xive_get_attr,
2983        .has_attr = xive_has_attr,
2984};
2985