linux/arch/s390/mm/fault.c
// SPDX-License-Identifier: GPL-2.0
/*
 *  S390 version
 *    Copyright IBM Corp. 1999
 *    Author(s): Hartmut Penner (hp@de.ibm.com)
 *               Ulrich Weigand (uweigand@de.ibm.com)
 *
 *  Derived from "arch/i386/mm/fault.c"
 *    Copyright (C) 1995  Linus Torvalds
 */

#include <linux/kernel_stat.h>
#include <linux/perf_event.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/sched/debug.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/compat.h>
#include <linux/smp.h>
#include <linux/kdebug.h>
#include <linux/init.h>
#include <linux/console.h>
#include <linux/extable.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/hugetlb.h>
#include <asm/asm-offsets.h>
#include <asm/diag.h>
#include <asm/gmap.h>
#include <asm/irq.h>
#include <asm/mmu_context.h>
#include <asm/facility.h>
#include <asm/uv.h>
#include "../kernel/entry.h"

#define __FAIL_ADDR_MASK -4096L
#define __SUBCODE_MASK 0x0600
#define __PF_RES_FIELD 0x8000000000000000ULL

#define VM_FAULT_BADCONTEXT     ((__force vm_fault_t) 0x010000)
#define VM_FAULT_BADMAP         ((__force vm_fault_t) 0x020000)
#define VM_FAULT_BADACCESS      ((__force vm_fault_t) 0x040000)
#define VM_FAULT_SIGNAL         ((__force vm_fault_t) 0x080000)
#define VM_FAULT_PFAULT         ((__force vm_fault_t) 0x100000)

enum fault_type {
        KERNEL_FAULT,
        USER_FAULT,
        GMAP_FAULT,
};

static unsigned long store_indication __read_mostly;

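/*
 * With facility 75 (fetch/store indication) installed, the translation-
 * exception identification (TEID) indicates whether the faulting access
 * was a store; store_indication holds the bit mask used to test for it.
 */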
static int __init fault_init(void)
{
        if (test_facility(75))
                store_indication = 0xc00;
        return 0;
}
early_initcall(fault_init);

/*
 * Find out which address space caused the exception.
 */
static enum fault_type get_fault_type(struct pt_regs *regs)
{
        unsigned long trans_exc_code;

        trans_exc_code = regs->int_parm_long & 3;
        if (likely(trans_exc_code == 0)) {
                /* primary space exception */
                if (user_mode(regs))
                        return USER_FAULT;
                if (!IS_ENABLED(CONFIG_PGSTE))
                        return KERNEL_FAULT;
                if (test_pt_regs_flag(regs, PIF_GUEST_FAULT))
                        return GMAP_FAULT;
                return KERNEL_FAULT;
        }
        if (trans_exc_code == 2)
                return USER_FAULT;
        if (trans_exc_code == 1) {
                /* access register mode, not used in the kernel */
                return USER_FAULT;
        }
        /* home space exception -> access via kernel ASCE */
        return KERNEL_FAULT;
}

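/* Check whether the (page table) address @p can be read without faulting. */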
static int bad_address(void *p)
{
        unsigned long dummy;

        return get_kernel_nofault(dummy, (unsigned long *)p);
}

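/*
 * Walk the page tables addressed by @asce down to the page table entry
 * for @address and print the entry found at each level.
 */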
static void dump_pagetable(unsigned long asce, unsigned long address)
{
        unsigned long *table = __va(asce & _ASCE_ORIGIN);

        pr_alert("AS:%016lx ", asce);
        switch (asce & _ASCE_TYPE_MASK) {
        case _ASCE_TYPE_REGION1:
                table += (address & _REGION1_INDEX) >> _REGION1_SHIFT;
                if (bad_address(table))
                        goto bad;
                pr_cont("R1:%016lx ", *table);
                if (*table & _REGION_ENTRY_INVALID)
                        goto out;
                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
                fallthrough;
        case _ASCE_TYPE_REGION2:
                table += (address & _REGION2_INDEX) >> _REGION2_SHIFT;
                if (bad_address(table))
                        goto bad;
                pr_cont("R2:%016lx ", *table);
                if (*table & _REGION_ENTRY_INVALID)
                        goto out;
                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
                fallthrough;
        case _ASCE_TYPE_REGION3:
                table += (address & _REGION3_INDEX) >> _REGION3_SHIFT;
                if (bad_address(table))
                        goto bad;
                pr_cont("R3:%016lx ", *table);
                if (*table & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE))
                        goto out;
                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
                fallthrough;
        case _ASCE_TYPE_SEGMENT:
                table += (address & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
                if (bad_address(table))
                        goto bad;
                pr_cont("S:%016lx ", *table);
                if (*table & (_SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_LARGE))
                        goto out;
                table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN);
        }
        table += (address & _PAGE_INDEX) >> _PAGE_SHIFT;
        if (bad_address(table))
                goto bad;
        pr_cont("P:%016lx ", *table);
out:
        pr_cont("\n");
        return;
bad:
        pr_cont("BAD\n");
}

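/*
 * Print the failing address, the translation-exception identification,
 * the address space the fault happened in, and the page table walk for
 * the ASCE that was in use.
 */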
static void dump_fault_info(struct pt_regs *regs)
{
        unsigned long asce;

        pr_alert("Failing address: %016lx TEID: %016lx\n",
                 regs->int_parm_long & __FAIL_ADDR_MASK, regs->int_parm_long);
        pr_alert("Fault in ");
        switch (regs->int_parm_long & 3) {
        case 3:
                pr_cont("home space ");
                break;
        case 2:
                pr_cont("secondary space ");
                break;
        case 1:
                pr_cont("access register ");
                break;
        case 0:
                pr_cont("primary space ");
                break;
        }
        pr_cont("mode while using ");
        switch (get_fault_type(regs)) {
        case USER_FAULT:
                asce = S390_lowcore.user_asce;
                pr_cont("user ");
                break;
        case GMAP_FAULT:
                asce = ((struct gmap *) S390_lowcore.gmap)->asce;
                pr_cont("gmap ");
                break;
        case KERNEL_FAULT:
                asce = S390_lowcore.kernel_asce;
                pr_cont("kernel ");
                break;
        default:
                unreachable();
        }
        pr_cont("ASCE.\n");
        dump_pagetable(asce, regs->int_parm_long & __FAIL_ADDR_MASK);
}

int show_unhandled_signals = 1;

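/*
 * Print a rate-limited report about an unhandled fault in a user process:
 * interruption code, faulting code location, registers and, for memory
 * faults, the failing address details.
 */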
void report_user_fault(struct pt_regs *regs, long signr, int is_mm_fault)
{
        if ((task_pid_nr(current) > 1) && !show_unhandled_signals)
                return;
        if (!unhandled_signal(current, signr))
                return;
        if (!printk_ratelimit())
                return;
        printk(KERN_ALERT "User process fault: interruption code %04x ilc:%d ",
               regs->int_code & 0xffff, regs->int_code >> 17);
        print_vma_addr(KERN_CONT "in ", regs->psw.addr);
        printk(KERN_CONT "\n");
        if (is_mm_fault)
                dump_fault_info(regs);
        show_regs(regs);
}

/*
 * Send SIGSEGV to task.  This is an external routine
 * to keep the stack usage of do_page_fault small.
 */
static noinline void do_sigsegv(struct pt_regs *regs, int si_code)
{
        report_user_fault(regs, SIGSEGV, 1);
        force_sig_fault(SIGSEGV, si_code,
                        (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK));
}

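/*
 * Search the DMA exception table first, then fall back to the generic
 * kernel exception tables.
 */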
const struct exception_table_entry *s390_search_extables(unsigned long addr)
{
        const struct exception_table_entry *fixup;

        fixup = search_extable(__start_dma_ex_table,
                               __stop_dma_ex_table - __start_dma_ex_table,
                               addr);
        if (!fixup)
                fixup = search_exception_tables(addr);
        return fixup;
}

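/*
 * A fault happened in kernel context that cannot be handled by the memory
 * management code: try an exception table fixup, otherwise oops.
 */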
static noinline void do_no_context(struct pt_regs *regs)
{
        const struct exception_table_entry *fixup;

        /* Are we prepared to handle this kernel fault?  */
        fixup = s390_search_extables(regs->psw.addr);
        if (fixup && ex_handle(fixup, regs))
                return;

        /*
         * Oops. The kernel tried to access some bad page. We'll have to
         * terminate things with extreme prejudice.
         */
        if (get_fault_type(regs) == KERNEL_FAULT)
                printk(KERN_ALERT "Unable to handle kernel pointer dereference"
                       " in virtual kernel address space\n");
        else
                printk(KERN_ALERT "Unable to handle kernel paging request"
                       " in virtual user address space\n");
        dump_fault_info(regs);
        die(regs, "Oops");
        do_exit(SIGKILL);
}

static noinline void do_low_address(struct pt_regs *regs)
{
        /* Low-address protection hit in kernel mode means
           NULL pointer write access in kernel mode.  */
        if (regs->psw.mask & PSW_MASK_PSTATE) {
                /* Low-address protection hit in user mode 'cannot happen'. */
                die (regs, "Low-address protection");
                do_exit(SIGKILL);
        }

        do_no_context(regs);
}

static noinline void do_sigbus(struct pt_regs *regs)
{
        /*
         * Send a sigbus, regardless of whether we were in kernel
         * or user mode.
         */
        force_sig_fault(SIGBUS, BUS_ADRERR,
                        (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK));
}

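/*
 * Check whether an execute-protection fault hit a signal return trampoline:
 * if the faulting instruction is "svc 119" (sigreturn, 0x0a77) or "svc 173"
 * (rt_sigreturn, 0x0aad), flag it as a system call so the signal return is
 * executed instead of delivering SIGSEGV.
 */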
static noinline int signal_return(struct pt_regs *regs)
{
        u16 instruction;
        int rc;

        rc = __get_user(instruction, (u16 __user *) regs->psw.addr);
        if (rc)
                return rc;
        if (instruction == 0x0a77) {
                set_pt_regs_flag(regs, PIF_SYSCALL);
                regs->int_code = 0x00040077;
                return 0;
        } else if (instruction == 0x0aad) {
                set_pt_regs_flag(regs, PIF_SYSCALL);
                regs->int_code = 0x000400ad;
                return 0;
        }
        return -EACCES;
}

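/*
 * Translate the result of do_exception() into a signal for user space or,
 * for faults in kernel context, an exception table fixup or oops.
 */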
static noinline void do_fault_error(struct pt_regs *regs, int access,
                                        vm_fault_t fault)
{
        int si_code;

        switch (fault) {
        case VM_FAULT_BADACCESS:
                if (access == VM_EXEC && signal_return(regs) == 0)
                        break;
                fallthrough;
        case VM_FAULT_BADMAP:
                /* Bad memory access. Check if it is kernel or user space. */
                if (user_mode(regs)) {
                        /* User mode accesses just cause a SIGSEGV */
                        si_code = (fault == VM_FAULT_BADMAP) ?
                                SEGV_MAPERR : SEGV_ACCERR;
                        do_sigsegv(regs, si_code);
                        break;
                }
                fallthrough;
        case VM_FAULT_BADCONTEXT:
        case VM_FAULT_PFAULT:
                do_no_context(regs);
                break;
        case VM_FAULT_SIGNAL:
                if (!user_mode(regs))
                        do_no_context(regs);
                break;
        default: /* fault & VM_FAULT_ERROR */
                if (fault & VM_FAULT_OOM) {
                        if (!user_mode(regs))
                                do_no_context(regs);
                        else
                                pagefault_out_of_memory();
                } else if (fault & VM_FAULT_SIGSEGV) {
                        /* Kernel mode? Handle exceptions or die */
                        if (!user_mode(regs))
                                do_no_context(regs);
                        else
                                do_sigsegv(regs, SEGV_MAPERR);
                } else if (fault & VM_FAULT_SIGBUS) {
                        /* Kernel mode? Handle exceptions or die */
                        if (!user_mode(regs))
                                do_no_context(regs);
                        else
                                do_sigbus(regs);
                } else
                        BUG();
                break;
        }
}

/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 *
 * interruption code (int_code):
 *   04       Protection           ->  Write-Protection  (suppression)
 *   10       Segment translation  ->  Not present       (nullification)
 *   11       Page translation     ->  Not present       (nullification)
 *   3b       Region third trans.  ->  Not present       (nullification)
 */
static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
{
        struct gmap *gmap;
        struct task_struct *tsk;
        struct mm_struct *mm;
        struct vm_area_struct *vma;
        enum fault_type type;
        unsigned long trans_exc_code;
        unsigned long address;
        unsigned int flags;
        vm_fault_t fault;

        tsk = current;
        /*
         * The instruction that caused the program check has
         * been nullified. Don't signal single step via SIGTRAP.
         */
        clear_thread_flag(TIF_PER_TRAP);

        if (kprobe_page_fault(regs, 14))
                return 0;

        mm = tsk->mm;
        trans_exc_code = regs->int_parm_long;

        /*
         * Verify that the fault happened in user space, that
         * we are not in an interrupt and that there is a
         * user context.
         */
        fault = VM_FAULT_BADCONTEXT;
        type = get_fault_type(regs);
        switch (type) {
        case KERNEL_FAULT:
                goto out;
        case USER_FAULT:
        case GMAP_FAULT:
                if (faulthandler_disabled() || !mm)
                        goto out;
                break;
        }

        address = trans_exc_code & __FAIL_ADDR_MASK;
        perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
        flags = FAULT_FLAG_DEFAULT;
        if (user_mode(regs))
                flags |= FAULT_FLAG_USER;
        if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400)
                flags |= FAULT_FLAG_WRITE;
        mmap_read_lock(mm);

        gmap = NULL;
        if (IS_ENABLED(CONFIG_PGSTE) && type == GMAP_FAULT) {
                gmap = (struct gmap *) S390_lowcore.gmap;
                current->thread.gmap_addr = address;
                current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE);
                current->thread.gmap_int_code = regs->int_code & 0xffff;
                address = __gmap_translate(gmap, address);
                if (address == -EFAULT) {
                        fault = VM_FAULT_BADMAP;
                        goto out_up;
                }
                if (gmap->pfault_enabled)
                        flags |= FAULT_FLAG_RETRY_NOWAIT;
        }

retry:
        fault = VM_FAULT_BADMAP;
        vma = find_vma(mm, address);
        if (!vma)
                goto out_up;

        if (unlikely(vma->vm_start > address)) {
                if (!(vma->vm_flags & VM_GROWSDOWN))
                        goto out_up;
                if (expand_stack(vma, address))
                        goto out_up;
        }

        /*
         * Ok, we have a good vm_area for this memory access, so
         * we can handle it..
         */
        fault = VM_FAULT_BADACCESS;
        if (unlikely(!(vma->vm_flags & access)))
                goto out_up;

        if (is_vm_hugetlb_page(vma))
                address &= HPAGE_MASK;
        /*
         * If for any reason at all we couldn't handle the fault,
         * make sure we exit gracefully rather than endlessly redo
         * the fault.
         */
        fault = handle_mm_fault(vma, address, flags, regs);
        if (fault_signal_pending(fault, regs)) {
                fault = VM_FAULT_SIGNAL;
                if (flags & FAULT_FLAG_RETRY_NOWAIT)
                        goto out_up;
                goto out;
        }
        if (unlikely(fault & VM_FAULT_ERROR))
                goto out_up;

        if (flags & FAULT_FLAG_ALLOW_RETRY) {
                if (fault & VM_FAULT_RETRY) {
                        if (IS_ENABLED(CONFIG_PGSTE) && gmap &&
                            (flags & FAULT_FLAG_RETRY_NOWAIT)) {
                                /* FAULT_FLAG_RETRY_NOWAIT has been set,
                                 * mmap_lock has not been released */
                                current->thread.gmap_pfault = 1;
                                fault = VM_FAULT_PFAULT;
                                goto out_up;
                        }
                        flags &= ~FAULT_FLAG_RETRY_NOWAIT;
                        flags |= FAULT_FLAG_TRIED;
                        mmap_read_lock(mm);
                        goto retry;
                }
        }
        if (IS_ENABLED(CONFIG_PGSTE) && gmap) {
                address =  __gmap_link(gmap, current->thread.gmap_addr,
                                       address);
                if (address == -EFAULT) {
                        fault = VM_FAULT_BADMAP;
                        goto out_up;
                }
                if (address == -ENOMEM) {
                        fault = VM_FAULT_OOM;
                        goto out_up;
                }
        }
        fault = 0;
out_up:
        mmap_read_unlock(mm);
out:
        return fault;
}

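/*
 * Handle protection exceptions (interruption code 0x04): low-address
 * protection, execute protection (NX) and write protection faults.
 */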
void do_protection_exception(struct pt_regs *regs)
{
        unsigned long trans_exc_code;
        int access;
        vm_fault_t fault;

        trans_exc_code = regs->int_parm_long;
        /*
         * Protection exceptions are suppressing, decrement psw address.
         * The exception to this rule is aborted transactions, for which
         * the PSW already points to the correct location.
         */
        if (!(regs->int_code & 0x200))
                regs->psw.addr = __rewind_psw(regs->psw, regs->int_code >> 16);
        /*
         * Check for low-address protection.  This needs to be treated
         * as a special case because the translation exception code
         * field is not guaranteed to contain valid data in this case.
         */
        if (unlikely(!(trans_exc_code & 4))) {
                do_low_address(regs);
                return;
        }
        if (unlikely(MACHINE_HAS_NX && (trans_exc_code & 0x80))) {
                regs->int_parm_long = (trans_exc_code & ~PAGE_MASK) |
                                        (regs->psw.addr & PAGE_MASK);
                access = VM_EXEC;
                fault = VM_FAULT_BADACCESS;
        } else {
                access = VM_WRITE;
                fault = do_exception(regs, access);
        }
        if (unlikely(fault))
                do_fault_error(regs, access, fault);
}
NOKPROBE_SYMBOL(do_protection_exception);

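/*
 * Handle segment, page and region-third translation exceptions
 * (interruption codes 0x10, 0x11 and 0x3b), i.e. page not present.
 */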
void do_dat_exception(struct pt_regs *regs)
{
        int access;
        vm_fault_t fault;

        access = VM_ACCESS_FLAGS;
        fault = do_exception(regs, access);
        if (unlikely(fault))
                do_fault_error(regs, access, fault);
}
NOKPROBE_SYMBOL(do_dat_exception);

#ifdef CONFIG_PFAULT
/*
 * 'pfault' pseudo page fault routines.
 */
static int pfault_disable;

static int __init nopfault(char *str)
{
        pfault_disable = 1;
        return 1;
}

__setup("nopfault", nopfault);

struct pfault_refbk {
        u16 refdiagc;
        u16 reffcode;
        u16 refdwlen;
        u16 refversn;
        u64 refgaddr;
        u64 refselmk;
        u64 refcmpmk;
        u64 reserved;
} __attribute__ ((packed, aligned(8)));

static struct pfault_refbk pfault_init_refbk = {
        .refdiagc = 0x258,
        .reffcode = 0,
        .refdwlen = 5,
        .refversn = 2,
        .refgaddr = __LC_LPP,
        .refselmk = 1ULL << 48,
        .refcmpmk = 1ULL << 48,
        .reserved = __PF_RES_FIELD
};

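/*
 * Enable pfault pseudo page fault handling via diagnose 0x258, function
 * code 0. Returns 0 on success, non-zero if pfault is not available.
 */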
int pfault_init(void)
{
        int rc;

        if (pfault_disable)
                return -1;
        diag_stat_inc(DIAG_STAT_X258);
        asm volatile(
                "       diag    %1,%0,0x258\n"
                "0:     j       2f\n"
                "1:     la      %0,8\n"
                "2:\n"
                EX_TABLE(0b,1b)
                : "=d" (rc)
                : "a" (&pfault_init_refbk), "m" (pfault_init_refbk) : "cc");
        return rc;
}

static struct pfault_refbk pfault_fini_refbk = {
        .refdiagc = 0x258,
        .reffcode = 1,
        .refdwlen = 5,
        .refversn = 2,
};

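/*
 * Disable pfault pseudo page fault handling via diagnose 0x258, function
 * code 1.
 */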
void pfault_fini(void)
{
        if (pfault_disable)
                return;
        diag_stat_inc(DIAG_STAT_X258);
        asm volatile(
                "       diag    %0,0,0x258\n"
                "0:     nopr    %%r7\n"
                EX_TABLE(0b,0b)
                : : "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk) : "cc");
}

static DEFINE_SPINLOCK(pfault_lock);
static LIST_HEAD(pfault_list);

#define PF_COMPLETE     0x0080

/*
 * The mechanism of our pfault code: if Linux is running as a guest, a user
 * space process is running, and that process accesses a page that the host
 * has paged out, we get a pfault interrupt.
 *
 * This allows us, within the guest, to schedule a different process. Without
 * this mechanism the host would have to suspend the whole virtual cpu until
 * the page has been paged in.
 *
 * So when we get such an interrupt we set the state of the current task
 * to uninterruptible and also set the need_resched flag. Both happen within
 * interrupt context(!). If we later on want to return to user space we
 * recognize the need_resched flag and then call schedule().  It's not very
 * obvious how this works...
 *
 * Of course we have a lot of additional fun with the completion interrupt
 * (-> the host signals that a page of a process has been paged in and the
 * process can continue to run). This interrupt can arrive on any cpu and,
 * since we have virtual cpus, can actually appear before the interrupt that
 * signals that a page is missing.
 */
static void pfault_interrupt(struct ext_code ext_code,
                             unsigned int param32, unsigned long param64)
{
        struct task_struct *tsk;
        __u16 subcode;
        pid_t pid;

        /*
         * Get the external interruption subcode & pfault initial/completion
         * signal bit. VM stores this in the 'cpu address' field associated
         * with the external interrupt.
         */
        subcode = ext_code.subcode;
        if ((subcode & 0xff00) != __SUBCODE_MASK)
                return;
        inc_irq_stat(IRQEXT_PFL);
        /* Get the token (= pid of the affected task). */
        pid = param64 & LPP_PID_MASK;
        rcu_read_lock();
        tsk = find_task_by_pid_ns(pid, &init_pid_ns);
        if (tsk)
                get_task_struct(tsk);
        rcu_read_unlock();
        if (!tsk)
                return;
        spin_lock(&pfault_lock);
        if (subcode & PF_COMPLETE) {
                /* signal bit is set -> a page has been swapped in by VM */
                if (tsk->thread.pfault_wait == 1) {
                        /* Initial interrupt was faster than the completion
                         * interrupt. pfault_wait is valid. Set pfault_wait
                         * back to zero and wake up the process. This can
                         * safely be done because the task is still sleeping
                         * and can't produce new pfaults. */
                        tsk->thread.pfault_wait = 0;
                        list_del(&tsk->thread.list);
                        wake_up_process(tsk);
                        put_task_struct(tsk);
                } else {
                        /* Completion interrupt was faster than initial
                         * interrupt. Set pfault_wait to -1 so the initial
                         * interrupt doesn't put the task to sleep.
                         * If the task is not running, ignore the completion
                         * interrupt since it must be a leftover of a PFAULT
                         * CANCEL operation which didn't remove all pending
                         * completion interrupts. */
                        if (tsk->state == TASK_RUNNING)
                                tsk->thread.pfault_wait = -1;
                }
        } else {
                /* signal bit not set -> a real page is missing. */
                if (WARN_ON_ONCE(tsk != current))
                        goto out;
                if (tsk->thread.pfault_wait == 1) {
                        /* Already on the list with a reference: put to sleep */
                        goto block;
                } else if (tsk->thread.pfault_wait == -1) {
                        /* Completion interrupt was faster than the initial
                         * interrupt (pfault_wait == -1). Set pfault_wait
                         * back to zero and exit. */
                        tsk->thread.pfault_wait = 0;
                } else {
                        /* Initial interrupt arrived before completion
                         * interrupt. Let the task sleep.
                         * An extra task reference is needed since a different
                         * cpu may set the task state to TASK_RUNNING again
                         * before the scheduler is reached. */
                        get_task_struct(tsk);
                        tsk->thread.pfault_wait = 1;
                        list_add(&tsk->thread.list, &pfault_list);
block:
                        /* Since this must be a userspace fault, there
                         * is no kernel task state to trample. Rely on the
                         * return to userspace schedule() to block. */
                        __set_current_state(TASK_UNINTERRUPTIBLE);
                        set_tsk_need_resched(tsk);
                        set_preempt_need_resched();
                }
        }
out:
        spin_unlock(&pfault_lock);
        put_task_struct(tsk);
}

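/*
 * CPU hotplug callback: when a cpu goes away, wake up all tasks that are
 * still waiting for a pfault completion interrupt.
 */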
static int pfault_cpu_dead(unsigned int cpu)
{
        struct thread_struct *thread, *next;
        struct task_struct *tsk;

        spin_lock_irq(&pfault_lock);
        list_for_each_entry_safe(thread, next, &pfault_list, list) {
                thread->pfault_wait = 0;
                list_del(&thread->list);
                tsk = container_of(thread, struct task_struct, thread);
                wake_up_process(tsk);
                put_task_struct(tsk);
        }
        spin_unlock_irq(&pfault_lock);
        return 0;
}

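/*
 * Register the external interrupt handler for pfault and enable the
 * service signal interrupt subclass; disable pfault if either step fails.
 */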
static int __init pfault_irq_init(void)
{
        int rc;

        rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt);
        if (rc)
                goto out_extint;
        rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP;
        if (rc)
                goto out_pfault;
        irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL);
        cpuhp_setup_state_nocalls(CPUHP_S390_PFAULT_DEAD, "s390/pfault:dead",
                                  NULL, pfault_cpu_dead);
        return 0;

out_pfault:
        unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt);
out_extint:
        pfault_disable = 1;
        return rc;
}
early_initcall(pfault_irq_init);

#endif /* CONFIG_PFAULT */

#if IS_ENABLED(CONFIG_PGSTE)

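/*
 * Secure storage access exception: a page that is currently secure (owned
 * by a protected virtualization guest) was accessed. Make the page
 * accessible again and let the access be retried.
 */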
void do_secure_storage_access(struct pt_regs *regs)
{
        unsigned long addr = regs->int_parm_long & __FAIL_ADDR_MASK;
        struct vm_area_struct *vma;
        struct mm_struct *mm;
        struct page *page;
        int rc;

        switch (get_fault_type(regs)) {
        case USER_FAULT:
                mm = current->mm;
                mmap_read_lock(mm);
                vma = find_vma(mm, addr);
                if (!vma) {
                        mmap_read_unlock(mm);
                        do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP);
                        break;
                }
                page = follow_page(vma, addr, FOLL_WRITE | FOLL_GET);
                if (IS_ERR_OR_NULL(page)) {
                        mmap_read_unlock(mm);
                        break;
                }
                if (arch_make_page_accessible(page))
                        send_sig(SIGSEGV, current, 0);
                put_page(page);
                mmap_read_unlock(mm);
                break;
        case KERNEL_FAULT:
                page = phys_to_page(addr);
                if (unlikely(!try_get_page(page)))
                        break;
                rc = arch_make_page_accessible(page);
                put_page(page);
                if (rc)
                        BUG();
                break;
        case GMAP_FAULT:
        default:
                do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP);
                WARN_ON_ONCE(1);
        }
}
NOKPROBE_SYMBOL(do_secure_storage_access);

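/*
 * Non-secure storage access exception in guest context: convert the page
 * at the guest address to secure, or send SIGSEGV if that is impossible.
 */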
void do_non_secure_storage_access(struct pt_regs *regs)
{
        unsigned long gaddr = regs->int_parm_long & __FAIL_ADDR_MASK;
        struct gmap *gmap = (struct gmap *)S390_lowcore.gmap;

        if (get_fault_type(regs) != GMAP_FAULT) {
                do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP);
                WARN_ON_ONCE(1);
                return;
        }

        if (gmap_convert_to_secure(gmap, gaddr) == -EINVAL)
                send_sig(SIGSEGV, current, 0);
}
NOKPROBE_SYMBOL(do_non_secure_storage_access);

void do_secure_storage_violation(struct pt_regs *regs)
{
        /*
         * Either KVM messed up the secure guest mapping or the same
         * page is mapped into multiple secure guests.
         *
         * This exception is only triggered when a guest 2 is running
         * and can therefore never occur in kernel context.
         */
        printk_ratelimited(KERN_WARNING
                           "Secure storage violation in task: %s, pid %d\n",
                           current->comm, current->pid);
        send_sig(SIGSEGV, current, 0);
}

#endif /* CONFIG_PGSTE */