linux/arch/s390/kvm/vsie.c
   1/*
   2 * kvm nested virtualization support for s390x
   3 *
   4 * Copyright IBM Corp. 2016
   5 *
   6 * This program is free software; you can redistribute it and/or modify
   7 * it under the terms of the GNU General Public License (version 2 only)
   8 * as published by the Free Software Foundation.
   9 *
  10 *    Author(s): David Hildenbrand <dahi@linux.vnet.ibm.com>
  11 */
  12#include <linux/vmalloc.h>
  13#include <linux/kvm_host.h>
  14#include <linux/bug.h>
  15#include <linux/list.h>
  16#include <linux/bitmap.h>
  17#include <linux/sched/signal.h>
  18
  19#include <asm/gmap.h>
  20#include <asm/mmu_context.h>
  21#include <asm/sclp.h>
  22#include <asm/nmi.h>
  23#include <asm/dis.h>
  24#include "kvm-s390.h"
  25#include "gaccess.h"
  26
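     /*
      * One page of per-shadow state: the layout must stay exactly one page
      * (4096 bytes), which kvm_s390_handle_vsie() enforces via BUILD_BUG_ON().
      */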
  27struct vsie_page {
  28        struct kvm_s390_sie_block scb_s;        /* 0x0000 */
   29        /* the pinned original scb */
  30        struct kvm_s390_sie_block *scb_o;       /* 0x0200 */
  31        /* the shadow gmap in use by the vsie_page */
  32        struct gmap *gmap;                      /* 0x0208 */
  33        /* address of the last reported fault to guest2 */
  34        unsigned long fault_addr;               /* 0x0210 */
  35        __u8 reserved[0x0700 - 0x0218];         /* 0x0218 */
  36        struct kvm_s390_crypto_cb crycb;        /* 0x0700 */
  37        __u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE]; /* 0x0800 */
  38} __packed;
  39
  40/* trigger a validity icpt for the given scb */
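     /* the return value of 1 follows the "> 0: give control back to guest 2" convention */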
  41static int set_validity_icpt(struct kvm_s390_sie_block *scb,
  42                             __u16 reason_code)
  43{
  44        scb->ipa = 0x1000;
  45        scb->ipb = ((__u32) reason_code) << 16;
  46        scb->icptcode = ICPT_VALIDITY;
  47        return 1;
  48}
  49
  50/* mark the prefix as unmapped, this will block the VSIE */
  51static void prefix_unmapped(struct vsie_page *vsie_page)
  52{
  53        atomic_or(PROG_REQUEST, &vsie_page->scb_s.prog20);
  54}
  55
  56/* mark the prefix as unmapped and wait until the VSIE has been left */
  57static void prefix_unmapped_sync(struct vsie_page *vsie_page)
  58{
  59        prefix_unmapped(vsie_page);
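             /* if the shadow scb is currently in SIE, a STOP intervention kicks it out */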
  60        if (vsie_page->scb_s.prog0c & PROG_IN_SIE)
  61                atomic_or(CPUSTAT_STOP_INT, &vsie_page->scb_s.cpuflags);
  62        while (vsie_page->scb_s.prog0c & PROG_IN_SIE)
  63                cpu_relax();
  64}
  65
  66/* mark the prefix as mapped, this will allow the VSIE to run */
  67static void prefix_mapped(struct vsie_page *vsie_page)
  68{
  69        atomic_andnot(PROG_REQUEST, &vsie_page->scb_s.prog20);
  70}
  71
  72/* test if the prefix is mapped into the gmap shadow */
  73static int prefix_is_mapped(struct vsie_page *vsie_page)
  74{
  75        return !(atomic_read(&vsie_page->scb_s.prog20) & PROG_REQUEST);
  76}
  77
  78/* copy the updated intervention request bits into the shadow scb */
  79static void update_intervention_requests(struct vsie_page *vsie_page)
  80{
  81        const int bits = CPUSTAT_STOP_INT | CPUSTAT_IO_INT | CPUSTAT_EXT_INT;
  82        int cpuflags;
  83
  84        cpuflags = atomic_read(&vsie_page->scb_o->cpuflags);
  85        atomic_andnot(bits, &vsie_page->scb_s.cpuflags);
  86        atomic_or(cpuflags & bits, &vsie_page->scb_s.cpuflags);
  87}
  88
  89/* shadow (filter and validate) the cpuflags  */
  90static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
  91{
  92        struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
  93        struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
  94        int newflags, cpuflags = atomic_read(&scb_o->cpuflags);
  95
  96        /* we don't allow ESA/390 guests */
  97        if (!(cpuflags & CPUSTAT_ZARCH))
  98                return set_validity_icpt(scb_s, 0x0001U);
  99
 100        if (cpuflags & (CPUSTAT_RRF | CPUSTAT_MCDS))
 101                return set_validity_icpt(scb_s, 0x0001U);
 102        else if (cpuflags & (CPUSTAT_SLSV | CPUSTAT_SLSR))
 103                return set_validity_icpt(scb_s, 0x0007U);
 104
 105        /* intervention requests will be set later */
 106        newflags = CPUSTAT_ZARCH;
 107        if (cpuflags & CPUSTAT_GED && test_kvm_facility(vcpu->kvm, 8))
 108                newflags |= CPUSTAT_GED;
 109        if (cpuflags & CPUSTAT_GED2 && test_kvm_facility(vcpu->kvm, 78)) {
 110                if (cpuflags & CPUSTAT_GED)
 111                        return set_validity_icpt(scb_s, 0x0001U);
 112                newflags |= CPUSTAT_GED2;
 113        }
 114        if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_GPERE))
 115                newflags |= cpuflags & CPUSTAT_P;
 116        if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_GSLS))
 117                newflags |= cpuflags & CPUSTAT_SM;
 118        if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IBS))
 119                newflags |= cpuflags & CPUSTAT_IBS;
 120
 121        atomic_set(&scb_s->cpuflags, newflags);
 122        return 0;
 123}
 124
 125/*
  126 * Create a shadow copy of the crycb block and set up key wrapping, if
  127 * requested for guest 3 and enabled for guest 2.
  128 *
  129 * We only accept format-1 (no AP in g2), but convert it into format-2.
  130 * There is nothing to do for format-0.
 131 *
 132 * Returns: - 0 if shadowed or nothing to do
 133 *          - > 0 if control has to be given to guest 2
 134 */
 135static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 136{
 137        struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
 138        struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
 139        u32 crycb_addr = scb_o->crycbd & 0x7ffffff8U;
 140        unsigned long *b1, *b2;
 141        u8 ecb3_flags;
 142
 143        scb_s->crycbd = 0;
 144        if (!(scb_o->crycbd & vcpu->arch.sie_block->crycbd & CRYCB_FORMAT1))
 145                return 0;
 146        /* format-1 is supported with message-security-assist extension 3 */
 147        if (!test_kvm_facility(vcpu->kvm, 76))
 148                return 0;
 149        /* we may only allow it if enabled for guest 2 */
 150        ecb3_flags = scb_o->ecb3 & vcpu->arch.sie_block->ecb3 &
 151                     (ECB3_AES | ECB3_DEA);
 152        if (!ecb3_flags)
 153                return 0;
 154
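             /* the format-1 crycb (128 bytes) must not cross a page boundary */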
 155        if ((crycb_addr & PAGE_MASK) != ((crycb_addr + 128) & PAGE_MASK))
 156                return set_validity_icpt(scb_s, 0x003CU);
 157        else if (!crycb_addr)
 158                return set_validity_icpt(scb_s, 0x0039U);
 159
 160        /* copy only the wrapping keys */
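             /* 56 bytes at offset 72: the 24-byte DEA and the 32-byte AES wrapping key masks */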
  161        if (read_guest_real(vcpu, crycb_addr + 72,
                                 vsie_page->crycb.dea_wrapping_key_mask, 56))
 162                return set_validity_icpt(scb_s, 0x0035U);
 163
 164        scb_s->ecb3 |= ecb3_flags;
 165        scb_s->crycbd = ((__u32)(__u64) &vsie_page->crycb) | CRYCB_FORMAT1 |
 166                        CRYCB_FORMAT2;
 167
 168        /* xor both blocks in one run */
 169        b1 = (unsigned long *) vsie_page->crycb.dea_wrapping_key_mask;
 170        b2 = (unsigned long *)
 171                            vcpu->kvm->arch.crypto.crycb->dea_wrapping_key_mask;
 172        /* as 56%8 == 0, bitmap_xor won't overwrite any data */
 173        bitmap_xor(b1, b1, b2, BITS_PER_BYTE * 56);
 174        return 0;
 175}
 176
 177/* shadow (round up/down) the ibc to avoid validity icpt */
 178static void prepare_ibc(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 179{
 180        struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
 181        struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
 182        __u64 min_ibc = (sclp.ibc >> 16) & 0x0fffU;
 183
 184        scb_s->ibc = 0;
 185        /* ibc installed in g2 and requested for g3 */
 186        if (vcpu->kvm->arch.model.ibc && (scb_o->ibc & 0x0fffU)) {
 187                scb_s->ibc = scb_o->ibc & 0x0fffU;
  188                /* take care of the minimum ibc level of the machine */
 189                if (scb_s->ibc < min_ibc)
 190                        scb_s->ibc = min_ibc;
 191                /* take care of the maximum ibc level set for the guest */
 192                if (scb_s->ibc > vcpu->kvm->arch.model.ibc)
 193                        scb_s->ibc = vcpu->kvm->arch.model.ibc;
 194        }
 195}
 196
 197/* unshadow the scb, copying parameters back to the real scb */
 198static void unshadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 199{
 200        struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
 201        struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
 202
 203        /* interception */
 204        scb_o->icptcode = scb_s->icptcode;
 205        scb_o->icptstatus = scb_s->icptstatus;
 206        scb_o->ipa = scb_s->ipa;
 207        scb_o->ipb = scb_s->ipb;
 208        scb_o->gbea = scb_s->gbea;
 209
 210        /* timer */
 211        scb_o->cputm = scb_s->cputm;
 212        scb_o->ckc = scb_s->ckc;
 213        scb_o->todpr = scb_s->todpr;
 214
 215        /* guest state */
 216        scb_o->gpsw = scb_s->gpsw;
 217        scb_o->gg14 = scb_s->gg14;
 218        scb_o->gg15 = scb_s->gg15;
 219        memcpy(scb_o->gcr, scb_s->gcr, 128);
 220        scb_o->pp = scb_s->pp;
 221
 222        /* interrupt intercept */
 223        switch (scb_s->icptcode) {
 224        case ICPT_PROGI:
 225        case ICPT_INSTPROGI:
 226        case ICPT_EXTINT:
 227                memcpy((void *)((u64)scb_o + 0xc0),
 228                       (void *)((u64)scb_s + 0xc0), 0xf0 - 0xc0);
 229                break;
 230        case ICPT_PARTEXEC:
 231                /* MVPG only */
 232                memcpy((void *)((u64)scb_o + 0xc0),
 233                       (void *)((u64)scb_s + 0xc0), 0xd0 - 0xc0);
 234                break;
 235        }
 236
 237        if (scb_s->ihcpu != 0xffffU)
 238                scb_o->ihcpu = scb_s->ihcpu;
 239}
 240
 241/*
 242 * Setup the shadow scb by copying and checking the relevant parts of the g2
 243 * provided scb.
 244 *
 245 * Returns: - 0 if the scb has been shadowed
 246 *          - > 0 if control has to be given to guest 2
 247 */
 248static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 249{
 250        struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
 251        struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
 252        bool had_tx = scb_s->ecb & 0x10U;
 253        unsigned long new_mso = 0;
 254        int rc;
 255
 256        /* make sure we don't have any leftovers when reusing the scb */
 257        scb_s->icptcode = 0;
 258        scb_s->eca = 0;
 259        scb_s->ecb = 0;
 260        scb_s->ecb2 = 0;
 261        scb_s->ecb3 = 0;
 262        scb_s->ecd = 0;
 263        scb_s->fac = 0;
 264
 265        rc = prepare_cpuflags(vcpu, vsie_page);
 266        if (rc)
 267                goto out;
 268
 269        /* timer */
 270        scb_s->cputm = scb_o->cputm;
 271        scb_s->ckc = scb_o->ckc;
 272        scb_s->todpr = scb_o->todpr;
 273        scb_s->epoch = scb_o->epoch;
 274
 275        /* guest state */
 276        scb_s->gpsw = scb_o->gpsw;
 277        scb_s->gg14 = scb_o->gg14;
 278        scb_s->gg15 = scb_o->gg15;
 279        memcpy(scb_s->gcr, scb_o->gcr, 128);
 280        scb_s->pp = scb_o->pp;
 281
 282        /* interception / execution handling */
 283        scb_s->gbea = scb_o->gbea;
 284        scb_s->lctl = scb_o->lctl;
 285        scb_s->svcc = scb_o->svcc;
 286        scb_s->ictl = scb_o->ictl;
 287        /*
 288         * SKEY handling functions can't deal with false setting of PTE invalid
 289         * bits. Therefore we cannot provide interpretation and would later
  290         * have to provide our own emulation handlers.
 291         */
 292        scb_s->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE;
 293        scb_s->icpua = scb_o->icpua;
 294
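             /* with CPUSTAT_SM set, the shadow mso stays 0; otherwise keep the 1 MB aligned origin */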
 295        if (!(atomic_read(&scb_s->cpuflags) & CPUSTAT_SM))
 296                new_mso = scb_o->mso & 0xfffffffffff00000UL;
 297        /* if the hva of the prefix changes, we have to remap the prefix */
 298        if (scb_s->mso != new_mso || scb_s->prefix != scb_o->prefix)
 299                prefix_unmapped(vsie_page);
  300        /* SIE will do mso/msl validity and exception checks for us */
 301        scb_s->msl = scb_o->msl & 0xfffffffffff00000UL;
 302        scb_s->mso = new_mso;
 303        scb_s->prefix = scb_o->prefix;
 304
  305        /* We definitely have to flush the tlb if this scb never ran */
 306        if (scb_s->ihcpu != 0xffffU)
 307                scb_s->ihcpu = scb_o->ihcpu;
 308
 309        /* MVPG and Protection Exception Interpretation are always available */
 310        scb_s->eca |= scb_o->eca & 0x01002000U;
 311        /* Host-protection-interruption introduced with ESOP */
 312        if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_ESOP))
 313                scb_s->ecb |= scb_o->ecb & 0x02U;
 314        /* transactional execution */
 315        if (test_kvm_facility(vcpu->kvm, 73)) {
  316                /* remap the prefix if tx is toggled on */
 317                if ((scb_o->ecb & 0x10U) && !had_tx)
 318                        prefix_unmapped(vsie_page);
 319                scb_s->ecb |= scb_o->ecb & 0x10U;
 320        }
 321        /* SIMD */
 322        if (test_kvm_facility(vcpu->kvm, 129)) {
 323                scb_s->eca |= scb_o->eca & 0x00020000U;
 324                scb_s->ecd |= scb_o->ecd & 0x20000000U;
 325        }
 326        /* Run-time-Instrumentation */
 327        if (test_kvm_facility(vcpu->kvm, 64))
 328                scb_s->ecb3 |= scb_o->ecb3 & 0x01U;
 329        /* Instruction Execution Prevention */
 330        if (test_kvm_facility(vcpu->kvm, 130))
 331                scb_s->ecb2 |= scb_o->ecb2 & 0x20U;
 332        if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIIF))
 333                scb_s->eca |= scb_o->eca & 0x00000001U;
 334        if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IB))
 335                scb_s->eca |= scb_o->eca & 0x40000000U;
 336        if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_CEI))
 337                scb_s->eca |= scb_o->eca & 0x80000000U;
 338
 339        prepare_ibc(vcpu, vsie_page);
 340        rc = shadow_crycb(vcpu, vsie_page);
 341out:
 342        if (rc)
 343                unshadow_scb(vcpu, vsie_page);
 344        return rc;
 345}
 346
 347void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start,
 348                                 unsigned long end)
 349{
 350        struct kvm *kvm = gmap->private;
 351        struct vsie_page *cur;
 352        unsigned long prefix;
 353        struct page *page;
 354        int i;
 355
 356        if (!gmap_is_shadow(gmap))
 357                return;
 358        if (start >= 1UL << 31)
 359                /* We are only interested in prefix pages */
 360                return;
 361
 362        /*
 363         * Only new shadow blocks are added to the list during runtime,
 364         * therefore we can safely reference them all the time.
 365         */
 366        for (i = 0; i < kvm->arch.vsie.page_count; i++) {
 367                page = READ_ONCE(kvm->arch.vsie.pages[i]);
 368                if (!page)
 369                        continue;
 370                cur = page_to_virt(page);
 371                if (READ_ONCE(cur->gmap) != gmap)
 372                        continue;
 373                prefix = cur->scb_s.prefix << GUEST_PREFIX_SHIFT;
 374                /* with mso/msl, the prefix lies at an offset */
 375                prefix += cur->scb_s.mso;
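                     /* the prefix area covers two pages */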
 376                if (prefix <= end && start <= prefix + 2 * PAGE_SIZE - 1)
 377                        prefix_unmapped_sync(cur);
 378        }
 379}
 380
 381/*
 382 * Map the first prefix page and if tx is enabled also the second prefix page.
 383 *
 384 * The prefix will be protected, a gmap notifier will inform about unmaps.
 385 * The shadow scb must not be executed until the prefix is remapped, this is
 386 * guaranteed by properly handling PROG_REQUEST.
 387 *
  388 * Returns: - 0 if successfully mapped or already mapped
 389 *          - > 0 if control has to be given to guest 2
 390 *          - -EAGAIN if the caller can retry immediately
 391 *          - -ENOMEM if out of memory
 392 */
 393static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 394{
 395        struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
 396        u64 prefix = scb_s->prefix << GUEST_PREFIX_SHIFT;
 397        int rc;
 398
 399        if (prefix_is_mapped(vsie_page))
 400                return 0;
 401
 402        /* mark it as mapped so we can catch any concurrent unmappers */
 403        prefix_mapped(vsie_page);
 404
 405        /* with mso/msl, the prefix lies at offset *mso* */
 406        prefix += scb_s->mso;
 407
 408        rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix);
 409        if (!rc && (scb_s->ecb & 0x10U))
 410                rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
 411                                           prefix + PAGE_SIZE);
 412        /*
 413         * We don't have to mprotect, we will be called for all unshadows.
 414         * SIE will detect if protection applies and trigger a validity.
 415         */
 416        if (rc)
 417                prefix_unmapped(vsie_page);
 418        if (rc > 0 || rc == -EFAULT)
 419                rc = set_validity_icpt(scb_s, 0x0037U);
 420        return rc;
 421}
 422
 423/*
 424 * Pin the guest page given by gpa and set hpa to the pinned host address.
 425 * Will always be pinned writable.
 426 *
 427 * Returns: - 0 on success
 428 *          - -EINVAL if the gpa is not valid guest storage
 429 *          - -ENOMEM if out of memory
 430 */
 431static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa)
 432{
 433        struct page *page;
 434        hva_t hva;
 435        int rc;
 436
 437        hva = gfn_to_hva(kvm, gpa_to_gfn(gpa));
 438        if (kvm_is_error_hva(hva))
 439                return -EINVAL;
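             /* pin exactly one page; as documented above, always writable */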
 440        rc = get_user_pages_fast(hva, 1, 1, &page);
 441        if (rc < 0)
 442                return rc;
 443        else if (rc != 1)
 444                return -ENOMEM;
 445        *hpa = (hpa_t) page_to_virt(page) + (gpa & ~PAGE_MASK);
 446        return 0;
 447}
 448
 449/* Unpins a page previously pinned via pin_guest_page, marking it as dirty. */
 450static void unpin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t hpa)
 451{
 452        struct page *page;
 453
 454        page = virt_to_page(hpa);
 455        set_page_dirty_lock(page);
 456        put_page(page);
 457        /* mark the page always as dirty for migration */
 458        mark_page_dirty(kvm, gpa_to_gfn(gpa));
 459}
 460
 461/* unpin all blocks previously pinned by pin_blocks(), marking them dirty */
 462static void unpin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 463{
 464        struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
 465        struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
 466        hpa_t hpa;
 467        gpa_t gpa;
 468
 469        hpa = (u64) scb_s->scaoh << 32 | scb_s->scaol;
 470        if (hpa) {
 471                gpa = scb_o->scaol & ~0xfUL;
 472                if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_64BSCAO))
 473                        gpa |= (u64) scb_o->scaoh << 32;
 474                unpin_guest_page(vcpu->kvm, gpa, hpa);
 475                scb_s->scaol = 0;
 476                scb_s->scaoh = 0;
 477        }
 478
 479        hpa = scb_s->itdba;
 480        if (hpa) {
 481                gpa = scb_o->itdba & ~0xffUL;
 482                unpin_guest_page(vcpu->kvm, gpa, hpa);
 483                scb_s->itdba = 0;
 484        }
 485
 486        hpa = scb_s->gvrd;
 487        if (hpa) {
 488                gpa = scb_o->gvrd & ~0x1ffUL;
 489                unpin_guest_page(vcpu->kvm, gpa, hpa);
 490                scb_s->gvrd = 0;
 491        }
 492
 493        hpa = scb_s->riccbd;
 494        if (hpa) {
 495                gpa = scb_o->riccbd & ~0x3fUL;
 496                unpin_guest_page(vcpu->kvm, gpa, hpa);
 497                scb_s->riccbd = 0;
 498        }
 499}
 500
 501/*
 502 * Instead of shadowing some blocks, we can simply forward them because the
 503 * addresses in the scb are 64 bit long.
 504 *
 505 * This works as long as the data lies in one page. If blocks ever exceed one
 506 * page, we have to fall back to shadowing.
 507 *
 508 * As we reuse the sca, the vcpu pointers contained in it are invalid. We must
 509 * therefore not enable any facilities that access these pointers (e.g. SIGPIF).
 510 *
 511 * Returns: - 0 if all blocks were pinned.
 512 *          - > 0 if control has to be given to guest 2
 513 *          - -ENOMEM if out of memory
 514 */
 515static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 516{
 517        struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
 518        struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
 519        hpa_t hpa;
 520        gpa_t gpa;
 521        int rc = 0;
 522
 523        gpa = scb_o->scaol & ~0xfUL;
 524        if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_64BSCAO))
 525                gpa |= (u64) scb_o->scaoh << 32;
 526        if (gpa) {
 527                if (!(gpa & ~0x1fffUL))
 528                        rc = set_validity_icpt(scb_s, 0x0038U);
 529                else if ((gpa & ~0x1fffUL) == kvm_s390_get_prefix(vcpu))
 530                        rc = set_validity_icpt(scb_s, 0x0011U);
 531                else if ((gpa & PAGE_MASK) !=
 532                         ((gpa + sizeof(struct bsca_block) - 1) & PAGE_MASK))
 533                        rc = set_validity_icpt(scb_s, 0x003bU);
 534                if (!rc) {
 535                        rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
 536                        if (rc == -EINVAL)
 537                                rc = set_validity_icpt(scb_s, 0x0034U);
 538                }
 539                if (rc)
 540                        goto unpin;
 541                scb_s->scaoh = (u32)((u64)hpa >> 32);
 542                scb_s->scaol = (u32)(u64)hpa;
 543        }
 544
 545        gpa = scb_o->itdba & ~0xffUL;
 546        if (gpa && (scb_s->ecb & 0x10U)) {
 547                if (!(gpa & ~0x1fffU)) {
 548                        rc = set_validity_icpt(scb_s, 0x0080U);
 549                        goto unpin;
 550                }
 551                /* 256 bytes cannot cross page boundaries */
 552                rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
 553                if (rc == -EINVAL)
 554                        rc = set_validity_icpt(scb_s, 0x0080U);
 555                if (rc)
 556                        goto unpin;
 557                scb_s->itdba = hpa;
 558        }
 559
 560        gpa = scb_o->gvrd & ~0x1ffUL;
 561        if (gpa && (scb_s->eca & 0x00020000U) &&
 562            !(scb_s->ecd & 0x20000000U)) {
 563                if (!(gpa & ~0x1fffUL)) {
 564                        rc = set_validity_icpt(scb_s, 0x1310U);
 565                        goto unpin;
 566                }
 567                /*
  568                 * The 512 bytes of vector registers cannot cross page boundaries;
  569                 * if this block gets bigger, we have to shadow it.
 570                 */
 571                rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
 572                if (rc == -EINVAL)
 573                        rc = set_validity_icpt(scb_s, 0x1310U);
 574                if (rc)
 575                        goto unpin;
 576                scb_s->gvrd = hpa;
 577        }
 578
 579        gpa = scb_o->riccbd & ~0x3fUL;
 580        if (gpa && (scb_s->ecb3 & 0x01U)) {
 581                if (!(gpa & ~0x1fffUL)) {
 582                        rc = set_validity_icpt(scb_s, 0x0043U);
 583                        goto unpin;
 584                }
 585                /* 64 bytes cannot cross page boundaries */
 586                rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
 587                if (rc == -EINVAL)
 588                        rc = set_validity_icpt(scb_s, 0x0043U);
 589                /* Validity 0x0044 will be checked by SIE */
 590                if (rc)
 591                        goto unpin;
 592                scb_s->riccbd = hpa;
 593        }
 594        return 0;
 595unpin:
 596        unpin_blocks(vcpu, vsie_page);
 597        return rc;
 598}
 599
 600/* unpin the scb provided by guest 2, marking it as dirty */
 601static void unpin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
 602                      gpa_t gpa)
 603{
 604        hpa_t hpa = (hpa_t) vsie_page->scb_o;
 605
 606        if (hpa)
 607                unpin_guest_page(vcpu->kvm, gpa, hpa);
 608        vsie_page->scb_o = NULL;
 609}
 610
 611/*
 612 * Pin the scb at gpa provided by guest 2 at vsie_page->scb_o.
 613 *
 614 * Returns: - 0 if the scb was pinned.
 615 *          - > 0 if control has to be given to guest 2
 616 *          - -ENOMEM if out of memory
 617 */
 618static int pin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
 619                   gpa_t gpa)
 620{
 621        hpa_t hpa;
 622        int rc;
 623
 624        rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
 625        if (rc == -EINVAL) {
 626                rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
 627                if (!rc)
 628                        rc = 1;
 629        }
 630        if (!rc)
 631                vsie_page->scb_o = (struct kvm_s390_sie_block *) hpa;
 632        return rc;
 633}
 634
 635/*
 636 * Inject a fault into guest 2.
 637 *
 638 * Returns: - > 0 if control has to be given to guest 2
 639 *            < 0 if an error occurred during injection.
 640 */
 641static int inject_fault(struct kvm_vcpu *vcpu, __u16 code, __u64 vaddr,
 642                        bool write_flag)
 643{
 644        struct kvm_s390_pgm_info pgm = {
 645                .code = code,
 646                .trans_exc_code =
 647                        /* 0-51: virtual address */
 648                        (vaddr & 0xfffffffffffff000UL) |
 649                        /* 52-53: store / fetch */
 650                        (((unsigned int) !write_flag) + 1) << 10,
  651                        /* 62-63: asce id (always primary == 0) */
 652                .exc_access_id = 0, /* always primary */
 653                .op_access_id = 0, /* not MVPG */
 654        };
 655        int rc;
 656
 657        if (code == PGM_PROTECTION)
 658                pgm.trans_exc_code |= 0x4UL;
 659
 660        rc = kvm_s390_inject_prog_irq(vcpu, &pgm);
 661        return rc ? rc : 1;
 662}
 663
 664/*
 665 * Handle a fault during vsie execution on a gmap shadow.
 666 *
 667 * Returns: - 0 if the fault was resolved
 668 *          - > 0 if control has to be given to guest 2
 669 *          - < 0 if an error occurred
 670 */
 671static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 672{
 673        int rc;
 674
 675        if (current->thread.gmap_int_code == PGM_PROTECTION)
 676                /* we can directly forward all protection exceptions */
 677                return inject_fault(vcpu, PGM_PROTECTION,
 678                                    current->thread.gmap_addr, 1);
 679
 680        rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
 681                                   current->thread.gmap_addr);
 682        if (rc > 0) {
 683                rc = inject_fault(vcpu, rc,
 684                                  current->thread.gmap_addr,
 685                                  current->thread.gmap_write_flag);
 686                if (rc >= 0)
 687                        vsie_page->fault_addr = current->thread.gmap_addr;
 688        }
 689        return rc;
 690}
 691
 692/*
 693 * Retry the previous fault that required guest 2 intervention. This avoids
 694 * one superfluous SIE re-entry and direct exit.
 695 *
 696 * Will ignore any errors. The next SIE fault will do proper fault handling.
 697 */
 698static void handle_last_fault(struct kvm_vcpu *vcpu,
 699                              struct vsie_page *vsie_page)
 700{
 701        if (vsie_page->fault_addr)
 702                kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
 703                                      vsie_page->fault_addr);
 704        vsie_page->fault_addr = 0;
 705}
 706
 707static inline void clear_vsie_icpt(struct vsie_page *vsie_page)
 708{
 709        vsie_page->scb_s.icptcode = 0;
 710}
 711
 712/* rewind the psw and clear the vsie icpt, so we can retry execution */
 713static void retry_vsie_icpt(struct vsie_page *vsie_page)
 714{
 715        struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
 716        int ilen = insn_length(scb_s->ipa >> 8);
 717
 718        /* take care of EXECUTE instructions */
 719        if (scb_s->icptstatus & 1) {
 720                ilen = (scb_s->icptstatus >> 4) & 0x6;
 721                if (!ilen)
 722                        ilen = 4;
 723        }
 724        scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, ilen);
 725        clear_vsie_icpt(vsie_page);
 726}
 727
 728/*
 729 * Try to shadow + enable the guest 2 provided facility list.
 730 * Retry instruction execution if enabled for and provided by guest 2.
 731 *
 732 * Returns: - 0 if handled (retry or guest 2 icpt)
 733 *          - > 0 if control has to be given to guest 2
 734 */
 735static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 736{
 737        struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
 738        __u32 fac = vsie_page->scb_o->fac & 0x7ffffff8U;
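             /* fac is a doubleword aligned 31-bit real address of the facility list provided by g2 */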
 739
 740        if (fac && test_kvm_facility(vcpu->kvm, 7)) {
 741                retry_vsie_icpt(vsie_page);
 742                if (read_guest_real(vcpu, fac, &vsie_page->fac,
 743                                    sizeof(vsie_page->fac)))
 744                        return set_validity_icpt(scb_s, 0x1090U);
 745                scb_s->fac = (__u32)(__u64) &vsie_page->fac;
 746        }
 747        return 0;
 748}
 749
 750/*
 751 * Run the vsie on a shadow scb and a shadow gmap, without any further
 752 * sanity checks, handling SIE faults.
 753 *
 754 * Returns: - 0 everything went fine
 755 *          - > 0 if control has to be given to guest 2
 756 *          - < 0 if an error occurred
 757 */
 758static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 759{
 760        struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
 761        struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
 762        int rc;
 763
 764        handle_last_fault(vcpu, vsie_page);
 765
 766        if (need_resched())
 767                schedule();
 768        if (test_cpu_flag(CIF_MCCK_PENDING))
 769                s390_handle_mcck();
 770
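             /* don't hold the memslot SRCU read lock across the potentially long stay in SIE */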
 771        srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
 772        local_irq_disable();
 773        guest_enter_irqoff();
 774        local_irq_enable();
 775
 776        rc = sie64a(scb_s, vcpu->run->s.regs.gprs);
 777
 778        local_irq_disable();
 779        guest_exit_irqoff();
 780        local_irq_enable();
 781        vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
 782
 783        if (rc > 0)
 784                rc = 0; /* we could still have an icpt */
 785        else if (rc == -EFAULT)
 786                return handle_fault(vcpu, vsie_page);
 787
 788        switch (scb_s->icptcode) {
 789        case ICPT_INST:
 790                if (scb_s->ipa == 0xb2b0)
 791                        rc = handle_stfle(vcpu, vsie_page);
 792                break;
 793        case ICPT_STOP:
 794                /* stop not requested by g2 - must have been a kick */
 795                if (!(atomic_read(&scb_o->cpuflags) & CPUSTAT_STOP_INT))
 796                        clear_vsie_icpt(vsie_page);
 797                break;
 798        case ICPT_VALIDITY:
 799                if ((scb_s->ipa & 0xf000) != 0xf000)
 800                        scb_s->ipa += 0x1000;
 801                break;
 802        }
 803        return rc;
 804}
 805
 806static void release_gmap_shadow(struct vsie_page *vsie_page)
 807{
 808        if (vsie_page->gmap)
 809                gmap_put(vsie_page->gmap);
 810        WRITE_ONCE(vsie_page->gmap, NULL);
 811        prefix_unmapped(vsie_page);
 812}
 813
 814static int acquire_gmap_shadow(struct kvm_vcpu *vcpu,
 815                               struct vsie_page *vsie_page)
 816{
 817        unsigned long asce;
 818        union ctlreg0 cr0;
 819        struct gmap *gmap;
 820        int edat;
 821
 822        asce = vcpu->arch.sie_block->gcr[1];
 823        cr0.val = vcpu->arch.sie_block->gcr[0];
 824        edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8);
 825        edat += edat && test_kvm_facility(vcpu->kvm, 78);
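             /* edat is the resulting DAT enhancement level: 0, 1 (facility 8) or 2 (facilities 8 and 78) */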
 826
 827        /*
 828         * ASCE or EDAT could have changed since last icpt, or the gmap
 829         * we're holding has been unshadowed. If the gmap is still valid,
 830         * we can safely reuse it.
 831         */
 832        if (vsie_page->gmap && gmap_shadow_valid(vsie_page->gmap, asce, edat))
 833                return 0;
 834
 835        /* release the old shadow - if any, and mark the prefix as unmapped */
 836        release_gmap_shadow(vsie_page);
 837        gmap = gmap_shadow(vcpu->arch.gmap, asce, edat);
 838        if (IS_ERR(gmap))
 839                return PTR_ERR(gmap);
 840        gmap->private = vcpu->kvm;
 841        WRITE_ONCE(vsie_page->gmap, gmap);
 842        return 0;
 843}
 844
 845/*
 846 * Register the shadow scb at the VCPU, e.g. for kicking out of vsie.
 847 */
 848static void register_shadow_scb(struct kvm_vcpu *vcpu,
 849                                struct vsie_page *vsie_page)
 850{
 851        struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
 852
 853        WRITE_ONCE(vcpu->arch.vsie_block, &vsie_page->scb_s);
 854        /*
  855         * External calls have to lead to a kick of the vcpu and
  856         * therefore of the vsie -> simulate the wait state.
 857         */
 858        atomic_or(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags);
 859        /*
 860         * We have to adjust the g3 epoch by the g2 epoch. The epoch will
 861         * automatically be adjusted on tod clock changes via kvm_sync_clock.
 862         */
 863        preempt_disable();
 864        scb_s->epoch += vcpu->kvm->arch.epoch;
 865        preempt_enable();
 866}
 867
 868/*
 869 * Unregister a shadow scb from a VCPU.
 870 */
 871static void unregister_shadow_scb(struct kvm_vcpu *vcpu)
 872{
 873        atomic_andnot(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags);
 874        WRITE_ONCE(vcpu->arch.vsie_block, NULL);
 875}
 876
 877/*
 878 * Run the vsie on a shadowed scb, managing the gmap shadow, handling
 879 * prefix pages and faults.
 880 *
 881 * Returns: - 0 if no errors occurred
 882 *          - > 0 if control has to be given to guest 2
 883 *          - -ENOMEM if out of memory
 884 */
 885static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 886{
 887        struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
 888        int rc = 0;
 889
 890        while (1) {
 891                rc = acquire_gmap_shadow(vcpu, vsie_page);
 892                if (!rc)
 893                        rc = map_prefix(vcpu, vsie_page);
 894                if (!rc) {
 895                        gmap_enable(vsie_page->gmap);
 896                        update_intervention_requests(vsie_page);
 897                        rc = do_vsie_run(vcpu, vsie_page);
 898                        gmap_enable(vcpu->arch.gmap);
 899                }
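                     /* a kick (kvm_s390_vsie_kick) may have set PROG_BLOCK_SIE; clear it to allow re-entry */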
 900                atomic_andnot(PROG_BLOCK_SIE, &scb_s->prog20);
 901
 902                if (rc == -EAGAIN)
 903                        rc = 0;
 904                if (rc || scb_s->icptcode || signal_pending(current) ||
 905                    kvm_s390_vcpu_has_irq(vcpu, 0))
 906                        break;
 907        }
 908
 909        if (rc == -EFAULT) {
 910                /*
  911                 * Addressing exceptions are always presented as intercepts.
 912                 * As addressing exceptions are suppressing and our guest 3 PSW
 913                 * points at the responsible instruction, we have to
 914                 * forward the PSW and set the ilc. If we can't read guest 3
  915                 * forward the PSW and set the ilc. If we can't read the guest 3
 916                 * ilen = 4 for now, so we can avoid reading in guest 3 virtual
 917                 * memory. (we could also fake the shadow so the hardware
 918                 * handles it).
 919                 */
 920                scb_s->icptcode = ICPT_PROGI;
 921                scb_s->iprcc = PGM_ADDRESSING;
 922                scb_s->pgmilc = 4;
 923                scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, 4);
 924        }
 925        return rc;
 926}
 927
 928/*
 929 * Get or create a vsie page for a scb address.
 930 *
 931 * Returns: - address of a vsie page (cached or new one)
 932 *          - NULL if the same scb address is already used by another VCPU
 933 *          - ERR_PTR(-ENOMEM) if out of memory
 934 */
 935static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr)
 936{
 937        struct vsie_page *vsie_page;
 938        struct page *page;
 939        int nr_vcpus;
 940
 941        rcu_read_lock();
 942        page = radix_tree_lookup(&kvm->arch.vsie.addr_to_page, addr >> 9);
 943        rcu_read_unlock();
 944        if (page) {
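                     /* a refcount of 1 means only the cache holds the page, so our inc to 2 claims it */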
 945                if (page_ref_inc_return(page) == 2)
 946                        return page_to_virt(page);
 947                page_ref_dec(page);
 948        }
 949
 950        /*
 951         * We want at least #online_vcpus shadows, so every VCPU can execute
 952         * the VSIE in parallel.
 953         */
 954        nr_vcpus = atomic_read(&kvm->online_vcpus);
 955
 956        mutex_lock(&kvm->arch.vsie.mutex);
 957        if (kvm->arch.vsie.page_count < nr_vcpus) {
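                     /* GFP_DMA keeps the page below 2 GB; crycbd and fac store 31/32 bit addresses */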
 958                page = alloc_page(GFP_KERNEL | __GFP_ZERO | GFP_DMA);
 959                if (!page) {
 960                        mutex_unlock(&kvm->arch.vsie.mutex);
 961                        return ERR_PTR(-ENOMEM);
 962                }
 963                page_ref_inc(page);
 964                kvm->arch.vsie.pages[kvm->arch.vsie.page_count] = page;
 965                kvm->arch.vsie.page_count++;
 966        } else {
 967                /* reuse an existing entry that belongs to nobody */
 968                while (true) {
 969                        page = kvm->arch.vsie.pages[kvm->arch.vsie.next];
 970                        if (page_ref_inc_return(page) == 2)
 971                                break;
 972                        page_ref_dec(page);
 973                        kvm->arch.vsie.next++;
 974                        kvm->arch.vsie.next %= nr_vcpus;
 975                }
 976                radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9);
 977        }
 978        page->index = addr;
 979        /* double use of the same address */
 980        if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, page)) {
 981                page_ref_dec(page);
 982                mutex_unlock(&kvm->arch.vsie.mutex);
 983                return NULL;
 984        }
 985        mutex_unlock(&kvm->arch.vsie.mutex);
 986
 987        vsie_page = page_to_virt(page);
 988        memset(&vsie_page->scb_s, 0, sizeof(struct kvm_s390_sie_block));
 989        release_gmap_shadow(vsie_page);
 990        vsie_page->fault_addr = 0;
 991        vsie_page->scb_s.ihcpu = 0xffffU;
 992        return vsie_page;
 993}
 994
 995/* put a vsie page acquired via get_vsie_page */
 996static void put_vsie_page(struct kvm *kvm, struct vsie_page *vsie_page)
 997{
 998        struct page *page = pfn_to_page(__pa(vsie_page) >> PAGE_SHIFT);
 999
1000        page_ref_dec(page);
1001}
1002
1003int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu)
1004{
1005        struct vsie_page *vsie_page;
1006        unsigned long scb_addr;
1007        int rc;
1008
1009        vcpu->stat.instruction_sie++;
1010        if (!test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIEF2))
1011                return -EOPNOTSUPP;
1012        if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
1013                return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
1014
1015        BUILD_BUG_ON(sizeof(struct vsie_page) != 4096);
1016        scb_addr = kvm_s390_get_base_disp_s(vcpu, NULL);
1017
1018        /* 512 byte alignment */
1019        if (unlikely(scb_addr & 0x1ffUL))
1020                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
1021
1022        if (signal_pending(current) || kvm_s390_vcpu_has_irq(vcpu, 0))
1023                return 0;
1024
1025        vsie_page = get_vsie_page(vcpu->kvm, scb_addr);
1026        if (IS_ERR(vsie_page))
1027                return PTR_ERR(vsie_page);
1028        else if (!vsie_page)
1029                /* double use of sie control block - simply do nothing */
1030                return 0;
1031
1032        rc = pin_scb(vcpu, vsie_page, scb_addr);
1033        if (rc)
1034                goto out_put;
1035        rc = shadow_scb(vcpu, vsie_page);
1036        if (rc)
1037                goto out_unpin_scb;
1038        rc = pin_blocks(vcpu, vsie_page);
1039        if (rc)
1040                goto out_unshadow;
1041        register_shadow_scb(vcpu, vsie_page);
1042        rc = vsie_run(vcpu, vsie_page);
1043        unregister_shadow_scb(vcpu);
1044        unpin_blocks(vcpu, vsie_page);
1045out_unshadow:
1046        unshadow_scb(vcpu, vsie_page);
1047out_unpin_scb:
1048        unpin_scb(vcpu, vsie_page, scb_addr);
1049out_put:
1050        put_vsie_page(vcpu->kvm, vsie_page);
1051
1052        return rc < 0 ? rc : 0;
1053}
1054
1055/* Init the vsie data structures. To be called when a vm is initialized. */
1056void kvm_s390_vsie_init(struct kvm *kvm)
1057{
1058        mutex_init(&kvm->arch.vsie.mutex);
1059        INIT_RADIX_TREE(&kvm->arch.vsie.addr_to_page, GFP_KERNEL);
1060}
1061
1062/* Destroy the vsie data structures. To be called when a vm is destroyed. */
1063void kvm_s390_vsie_destroy(struct kvm *kvm)
1064{
1065        struct vsie_page *vsie_page;
1066        struct page *page;
1067        int i;
1068
1069        mutex_lock(&kvm->arch.vsie.mutex);
1070        for (i = 0; i < kvm->arch.vsie.page_count; i++) {
1071                page = kvm->arch.vsie.pages[i];
1072                kvm->arch.vsie.pages[i] = NULL;
1073                vsie_page = page_to_virt(page);
1074                release_gmap_shadow(vsie_page);
1075                /* free the radix tree entry */
1076                radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9);
1077                __free_page(page);
1078        }
1079        kvm->arch.vsie.page_count = 0;
1080        mutex_unlock(&kvm->arch.vsie.mutex);
1081}
1082
1083void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu)
1084{
1085        struct kvm_s390_sie_block *scb = READ_ONCE(vcpu->arch.vsie_block);
1086
1087        /*
1088         * Even if the VCPU lets go of the shadow sie block reference, it is
1089         * still valid in the cache. So we can safely kick it.
1090         */
1091        if (scb) {
1092                atomic_or(PROG_BLOCK_SIE, &scb->prog20);
1093                if (scb->prog0c & PROG_IN_SIE)
1094                        atomic_or(CPUSTAT_STOP_INT, &scb->cpuflags);
1095        }
1096}
1097