linux/arch/s390/mm/fault.c
// SPDX-License-Identifier: GPL-2.0
/*
 *  S390 version
 *    Copyright IBM Corp. 1999
 *    Author(s): Hartmut Penner (hp@de.ibm.com)
 *               Ulrich Weigand (uweigand@de.ibm.com)
 *
 *  Derived from "arch/i386/mm/fault.c"
 *    Copyright (C) 1995  Linus Torvalds
 */

#include <linux/kernel_stat.h>
#include <linux/perf_event.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/sched/debug.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/compat.h>
#include <linux/smp.h>
#include <linux/kdebug.h>
#include <linux/init.h>
#include <linux/console.h>
#include <linux/extable.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/hugetlb.h>
#include <asm/asm-offsets.h>
#include <asm/diag.h>
#include <asm/gmap.h>
#include <asm/irq.h>
#include <asm/mmu_context.h>
#include <asm/facility.h>
#include <asm/uv.h>
#include "../kernel/entry.h"

#define __FAIL_ADDR_MASK -4096L
#define __SUBCODE_MASK 0x0600
#define __PF_RES_FIELD 0x8000000000000000ULL

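/*
 * s390 private vm_fault_t return codes, placed above the bits used by the
 * generic VM_FAULT_* values.  do_exception() hands them back and
 * do_fault_error() translates them into a signal or an oops.
 */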
#define VM_FAULT_BADCONTEXT     ((__force vm_fault_t) 0x010000)
#define VM_FAULT_BADMAP         ((__force vm_fault_t) 0x020000)
#define VM_FAULT_BADACCESS      ((__force vm_fault_t) 0x040000)
#define VM_FAULT_SIGNAL         ((__force vm_fault_t) 0x080000)
#define VM_FAULT_PFAULT         ((__force vm_fault_t) 0x100000)

enum fault_type {
        KERNEL_FAULT,
        USER_FAULT,
        GMAP_FAULT,
};

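/*
 * test_facility(75) checks for the access-exception fetch/store-indication
 * facility.  When it is installed, the TEID says whether the faulting access
 * was a store; store_indication is the mask do_exception() uses to detect
 * that case and set FAULT_FLAG_WRITE.
 */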
static unsigned long store_indication __read_mostly;

static int __init fault_init(void)
{
        if (test_facility(75))
                store_indication = 0xc00;
        return 0;
}
early_initcall(fault_init);

/*
 * Find out which address space caused the exception.
 */
static enum fault_type get_fault_type(struct pt_regs *regs)
{
        unsigned long trans_exc_code;

        trans_exc_code = regs->int_parm_long & 3;
        if (likely(trans_exc_code == 0)) {
                /* primary space exception */
                if (user_mode(regs))
                        return USER_FAULT;
                if (!IS_ENABLED(CONFIG_PGSTE))
                        return KERNEL_FAULT;
                if (test_pt_regs_flag(regs, PIF_GUEST_FAULT))
                        return GMAP_FAULT;
                return KERNEL_FAULT;
        }
        if (trans_exc_code == 2)
                return USER_FAULT;
        if (trans_exc_code == 1) {
                /* access register mode, not used in the kernel */
                return USER_FAULT;
        }
        /* home space exception -> access via kernel ASCE */
        return KERNEL_FAULT;
}

static int bad_address(void *p)
{
        unsigned long dummy;

        return get_kernel_nofault(dummy, (unsigned long *)p);
}

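/*
 * Walk the page table for @address under the given ASCE and print one entry
 * per translation level, stopping at the first invalid or large entry.
 */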
static void dump_pagetable(unsigned long asce, unsigned long address)
{
        unsigned long *table = __va(asce & _ASCE_ORIGIN);

        pr_alert("AS:%016lx ", asce);
        switch (asce & _ASCE_TYPE_MASK) {
        case _ASCE_TYPE_REGION1:
                table += (address & _REGION1_INDEX) >> _REGION1_SHIFT;
                if (bad_address(table))
                        goto bad;
                pr_cont("R1:%016lx ", *table);
                if (*table & _REGION_ENTRY_INVALID)
                        goto out;
                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
                fallthrough;
        case _ASCE_TYPE_REGION2:
                table += (address & _REGION2_INDEX) >> _REGION2_SHIFT;
                if (bad_address(table))
                        goto bad;
                pr_cont("R2:%016lx ", *table);
                if (*table & _REGION_ENTRY_INVALID)
                        goto out;
                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
                fallthrough;
        case _ASCE_TYPE_REGION3:
                table += (address & _REGION3_INDEX) >> _REGION3_SHIFT;
                if (bad_address(table))
                        goto bad;
                pr_cont("R3:%016lx ", *table);
                if (*table & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE))
                        goto out;
                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
                fallthrough;
        case _ASCE_TYPE_SEGMENT:
                table += (address & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
                if (bad_address(table))
                        goto bad;
                pr_cont("S:%016lx ", *table);
                if (*table & (_SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_LARGE))
                        goto out;
                table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN);
        }
        table += (address & _PAGE_INDEX) >> _PAGE_SHIFT;
        if (bad_address(table))
                goto bad;
        pr_cont("P:%016lx ", *table);
out:
        pr_cont("\n");
        return;
bad:
        pr_cont("BAD\n");
}

static void dump_fault_info(struct pt_regs *regs)
{
        unsigned long asce;

        pr_alert("Failing address: %016lx TEID: %016lx\n",
                 regs->int_parm_long & __FAIL_ADDR_MASK, regs->int_parm_long);
        pr_alert("Fault in ");
        switch (regs->int_parm_long & 3) {
        case 3:
                pr_cont("home space ");
                break;
        case 2:
                pr_cont("secondary space ");
                break;
        case 1:
                pr_cont("access register ");
                break;
        case 0:
                pr_cont("primary space ");
                break;
        }
        pr_cont("mode while using ");
        switch (get_fault_type(regs)) {
        case USER_FAULT:
                asce = S390_lowcore.user_asce;
                pr_cont("user ");
                break;
        case GMAP_FAULT:
                asce = ((struct gmap *) S390_lowcore.gmap)->asce;
                pr_cont("gmap ");
                break;
        case KERNEL_FAULT:
                asce = S390_lowcore.kernel_asce;
                pr_cont("kernel ");
                break;
        default:
                unreachable();
        }
        pr_cont("ASCE.\n");
        dump_pagetable(asce, regs->int_parm_long & __FAIL_ADDR_MASK);
}

int show_unhandled_signals = 1;

void report_user_fault(struct pt_regs *regs, long signr, int is_mm_fault)
{
        if ((task_pid_nr(current) > 1) && !show_unhandled_signals)
                return;
        if (!unhandled_signal(current, signr))
                return;
        if (!printk_ratelimit())
                return;
        printk(KERN_ALERT "User process fault: interruption code %04x ilc:%d ",
               regs->int_code & 0xffff, regs->int_code >> 17);
        print_vma_addr(KERN_CONT "in ", regs->psw.addr);
        printk(KERN_CONT "\n");
        if (is_mm_fault)
                dump_fault_info(regs);
        show_regs(regs);
}

/*
 * Send SIGSEGV to task.  This is an external routine
 * to keep the stack usage of do_page_fault small.
 */
static noinline void do_sigsegv(struct pt_regs *regs, int si_code)
{
        report_user_fault(regs, SIGSEGV, 1);
        force_sig_fault(SIGSEGV, si_code,
                        (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK));
}

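/*
 * Look up an exception table fixup for @addr, searching the dma exception
 * table before the regular kernel and module exception tables.
 */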
const struct exception_table_entry *s390_search_extables(unsigned long addr)
{
        const struct exception_table_entry *fixup;

        fixup = search_extable(__start_dma_ex_table,
                               __stop_dma_ex_table - __start_dma_ex_table,
                               addr);
        if (!fixup)
                fixup = search_exception_tables(addr);
        return fixup;
}

static noinline void do_no_context(struct pt_regs *regs)
{
        const struct exception_table_entry *fixup;

        /* Are we prepared to handle this kernel fault?  */
        fixup = s390_search_extables(regs->psw.addr);
        if (fixup && ex_handle(fixup, regs))
                return;

        /*
         * Oops. The kernel tried to access some bad page. We'll have to
         * terminate things with extreme prejudice.
         */
        if (get_fault_type(regs) == KERNEL_FAULT)
                printk(KERN_ALERT "Unable to handle kernel pointer dereference"
                       " in virtual kernel address space\n");
        else
                printk(KERN_ALERT "Unable to handle kernel paging request"
                       " in virtual user address space\n");
        dump_fault_info(regs);
        die(regs, "Oops");
        do_exit(SIGKILL);
}

static noinline void do_low_address(struct pt_regs *regs)
{
        /*
         * Low-address protection hit in kernel mode means
         * NULL pointer write access in kernel mode.
         */
        if (regs->psw.mask & PSW_MASK_PSTATE) {
                /* Low-address protection hit in user mode 'cannot happen'. */
                die (regs, "Low-address protection");
                do_exit(SIGKILL);
        }

        do_no_context(regs);
}

static noinline void do_sigbus(struct pt_regs *regs)
{
        /*
         * Send a sigbus, regardless of whether we were in kernel
         * or user mode.
         */
        force_sig_fault(SIGBUS, BUS_ADRERR,
                        (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK));
}

static noinline void do_fault_error(struct pt_regs *regs, int access,
                                        vm_fault_t fault)
{
        int si_code;

        switch (fault) {
        case VM_FAULT_BADACCESS:
        case VM_FAULT_BADMAP:
                /* Bad memory access. Check if it is kernel or user space. */
                if (user_mode(regs)) {
                        /* User mode accesses just cause a SIGSEGV */
                        si_code = (fault == VM_FAULT_BADMAP) ?
                                SEGV_MAPERR : SEGV_ACCERR;
                        do_sigsegv(regs, si_code);
                        break;
                }
                fallthrough;
        case VM_FAULT_BADCONTEXT:
        case VM_FAULT_PFAULT:
                do_no_context(regs);
                break;
        case VM_FAULT_SIGNAL:
                if (!user_mode(regs))
                        do_no_context(regs);
                break;
        default: /* fault & VM_FAULT_ERROR */
                if (fault & VM_FAULT_OOM) {
                        if (!user_mode(regs))
                                do_no_context(regs);
                        else
                                pagefault_out_of_memory();
                } else if (fault & VM_FAULT_SIGSEGV) {
                        /* Kernel mode? Handle exceptions or die */
                        if (!user_mode(regs))
                                do_no_context(regs);
                        else
                                do_sigsegv(regs, SEGV_MAPERR);
                } else if (fault & VM_FAULT_SIGBUS) {
                        /* Kernel mode? Handle exceptions or die */
                        if (!user_mode(regs))
                                do_no_context(regs);
                        else
                                do_sigbus(regs);
                } else
                        BUG();
                break;
        }
}

/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 *
 * interruption code (int_code):
 *   04       Protection           ->  Write-Protection  (suppression)
 *   10       Segment translation  ->  Not present       (nullification)
 *   11       Page translation     ->  Not present       (nullification)
 *   3b       Region third trans.  ->  Not present       (nullification)
 */
static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
{
        struct gmap *gmap;
        struct task_struct *tsk;
        struct mm_struct *mm;
        struct vm_area_struct *vma;
        enum fault_type type;
        unsigned long trans_exc_code;
        unsigned long address;
        unsigned int flags;
        vm_fault_t fault;

        tsk = current;
        /*
         * The instruction that caused the program check has
         * been nullified. Don't signal single step via SIGTRAP.
         */
        clear_thread_flag(TIF_PER_TRAP);

        if (kprobe_page_fault(regs, 14))
                return 0;

        mm = tsk->mm;
        trans_exc_code = regs->int_parm_long;

        /*
         * Verify that the fault happened in user space, that
         * we are not in an interrupt and that there is a
         * user context.
         */
        fault = VM_FAULT_BADCONTEXT;
        type = get_fault_type(regs);
        switch (type) {
        case KERNEL_FAULT:
                goto out;
        case USER_FAULT:
        case GMAP_FAULT:
                if (faulthandler_disabled() || !mm)
                        goto out;
                break;
        }

        address = trans_exc_code & __FAIL_ADDR_MASK;
        perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
        flags = FAULT_FLAG_DEFAULT;
        if (user_mode(regs))
                flags |= FAULT_FLAG_USER;
        if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400)
                flags |= FAULT_FLAG_WRITE;
        mmap_read_lock(mm);

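        /*
         * For a guest (gmap) fault the TEID contains a guest address;
         * translate it to the corresponding address in the host process
         * before looking up the vma.
         */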
        gmap = NULL;
        if (IS_ENABLED(CONFIG_PGSTE) && type == GMAP_FAULT) {
                gmap = (struct gmap *) S390_lowcore.gmap;
                current->thread.gmap_addr = address;
                current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE);
                current->thread.gmap_int_code = regs->int_code & 0xffff;
                address = __gmap_translate(gmap, address);
                if (address == -EFAULT) {
                        fault = VM_FAULT_BADMAP;
                        goto out_up;
                }
                if (gmap->pfault_enabled)
                        flags |= FAULT_FLAG_RETRY_NOWAIT;
        }

retry:
        fault = VM_FAULT_BADMAP;
        vma = find_vma(mm, address);
        if (!vma)
                goto out_up;

        if (unlikely(vma->vm_start > address)) {
                if (!(vma->vm_flags & VM_GROWSDOWN))
                        goto out_up;
                if (expand_stack(vma, address))
                        goto out_up;
        }

        /*
         * Ok, we have a good vm_area for this memory access, so
         * we can handle it..
         */
        fault = VM_FAULT_BADACCESS;
        if (unlikely(!(vma->vm_flags & access)))
                goto out_up;

        if (is_vm_hugetlb_page(vma))
                address &= HPAGE_MASK;
        /*
         * If for any reason at all we couldn't handle the fault,
         * make sure we exit gracefully rather than endlessly redo
         * the fault.
         */
        fault = handle_mm_fault(vma, address, flags, regs);
        if (fault_signal_pending(fault, regs)) {
                fault = VM_FAULT_SIGNAL;
                if (flags & FAULT_FLAG_RETRY_NOWAIT)
                        goto out_up;
                goto out;
        }
        if (unlikely(fault & VM_FAULT_ERROR))
                goto out_up;

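        /*
         * handle_mm_fault() may have dropped mmap_lock and asked for a retry.
         * With pfault enabled for the guest, report VM_FAULT_PFAULT instead so
         * KVM can use the pseudo page fault handshake; otherwise take the lock
         * again and retry once with FAULT_FLAG_TRIED.
         */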
        if (flags & FAULT_FLAG_ALLOW_RETRY) {
                if (fault & VM_FAULT_RETRY) {
                        if (IS_ENABLED(CONFIG_PGSTE) && gmap &&
                            (flags & FAULT_FLAG_RETRY_NOWAIT)) {
                                /* FAULT_FLAG_RETRY_NOWAIT has been set,
                                 * mmap_lock has not been released */
                                current->thread.gmap_pfault = 1;
                                fault = VM_FAULT_PFAULT;
                                goto out_up;
                        }
                        flags &= ~FAULT_FLAG_RETRY_NOWAIT;
                        flags |= FAULT_FLAG_TRIED;
                        mmap_read_lock(mm);
                        goto retry;
                }
        }
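        /*
         * The fault was resolved in the host process; establish the mapping
         * in the guest address space (gmap) as well.
         */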
        if (IS_ENABLED(CONFIG_PGSTE) && gmap) {
                address =  __gmap_link(gmap, current->thread.gmap_addr,
                                       address);
                if (address == -EFAULT) {
                        fault = VM_FAULT_BADMAP;
                        goto out_up;
                }
                if (address == -ENOMEM) {
                        fault = VM_FAULT_OOM;
                        goto out_up;
                }
        }
        fault = 0;
out_up:
        mmap_read_unlock(mm);
out:
        return fault;
}

void do_protection_exception(struct pt_regs *regs)
{
        unsigned long trans_exc_code;
        int access;
        vm_fault_t fault;

        trans_exc_code = regs->int_parm_long;
        /*
         * Protection exceptions are suppressing, decrement psw address.
         * The exception to this rule are aborted transactions, for these
         * the PSW already points to the correct location.
         */
        if (!(regs->int_code & 0x200))
                regs->psw.addr = __rewind_psw(regs->psw, regs->int_code >> 16);
        /*
         * Check for low-address protection.  This needs to be treated
         * as a special case because the translation exception code
         * field is not guaranteed to contain valid data in this case.
         */
        if (unlikely(!(trans_exc_code & 4))) {
                do_low_address(regs);
                return;
        }
        if (unlikely(MACHINE_HAS_NX && (trans_exc_code & 0x80))) {
                regs->int_parm_long = (trans_exc_code & ~PAGE_MASK) |
                                        (regs->psw.addr & PAGE_MASK);
                access = VM_EXEC;
                fault = VM_FAULT_BADACCESS;
        } else {
                access = VM_WRITE;
                fault = do_exception(regs, access);
        }
        if (unlikely(fault))
                do_fault_error(regs, access, fault);
}
NOKPROBE_SYMBOL(do_protection_exception);

void do_dat_exception(struct pt_regs *regs)
{
        int access;
        vm_fault_t fault;

        access = VM_ACCESS_FLAGS;
        fault = do_exception(regs, access);
        if (unlikely(fault))
                do_fault_error(regs, access, fault);
}
NOKPROBE_SYMBOL(do_dat_exception);

#ifdef CONFIG_PFAULT
/*
 * 'pfault' pseudo page faults routines.
 */
static int pfault_disable;

static int __init nopfault(char *str)
{
        pfault_disable = 1;
        return 1;
}

__setup("nopfault", nopfault);

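/*
 * Request block for the DIAG 0x258 pseudo page fault functions; reffcode 0
 * is used by pfault_init() to establish the handshake token, reffcode 1 by
 * pfault_fini() to cancel it.
 */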
struct pfault_refbk {
        u16 refdiagc;
        u16 reffcode;
        u16 refdwlen;
        u16 refversn;
        u64 refgaddr;
        u64 refselmk;
        u64 refcmpmk;
        u64 reserved;
} __attribute__ ((packed, aligned(8)));

static struct pfault_refbk pfault_init_refbk = {
        .refdiagc = 0x258,
        .reffcode = 0,
        .refdwlen = 5,
        .refversn = 2,
        .refgaddr = __LC_LPP,
        .refselmk = 1ULL << 48,
        .refcmpmk = 1ULL << 48,
        .reserved = __PF_RES_FIELD
};

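/*
 * Enable the pseudo page fault handshake with the hypervisor.  Returns
 * non-zero if pfault is disabled or DIAG 0x258 is not supported.
 */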
int pfault_init(void)
{
        int rc;

        if (pfault_disable)
                return -1;
        diag_stat_inc(DIAG_STAT_X258);
        asm volatile(
                "       diag    %1,%0,0x258\n"
                "0:     j       2f\n"
                "1:     la      %0,8\n"
                "2:\n"
                EX_TABLE(0b,1b)
                : "=d" (rc)
                : "a" (&pfault_init_refbk), "m" (pfault_init_refbk) : "cc");
        return rc;
}

static struct pfault_refbk pfault_fini_refbk = {
        .refdiagc = 0x258,
        .reffcode = 1,
        .refdwlen = 5,
        .refversn = 2,
};

void pfault_fini(void)
{
        if (pfault_disable)
                return;
        diag_stat_inc(DIAG_STAT_X258);
        asm volatile(
                "       diag    %0,0,0x258\n"
                "0:     nopr    %%r7\n"
                EX_TABLE(0b,0b)
                : : "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk) : "cc");
}

static DEFINE_SPINLOCK(pfault_lock);
static LIST_HEAD(pfault_list);

#define PF_COMPLETE     0x0080

/*
 * The mechanism of our pfault code: if Linux is running as guest, runs a user
 * space process and the user space process accesses a page that the host has
 * paged out we get a pfault interrupt.
 *
 * This allows us, within the guest, to schedule a different process. Without
 * this mechanism the host would have to suspend the whole virtual cpu until
 * the page has been paged in.
 *
 * So when we get such an interrupt then we set the state of the current task
 * to uninterruptible and also set the need_resched flag. Both happens within
 * interrupt context(!). If we later on want to return to user space we
 * recognize the need_resched flag and then call schedule().  It's not very
 * obvious how this works...
 *
 * Of course we have a lot of additional fun with the completion interrupt (->
 * host signals that a page of a process has been paged in and the process can
 * continue to run). This interrupt can arrive on any cpu and, since we have
 * virtual cpus, actually appear before the interrupt that signals that a page
 * is missing.
 */
static void pfault_interrupt(struct ext_code ext_code,
                             unsigned int param32, unsigned long param64)
{
        struct task_struct *tsk;
        __u16 subcode;
        pid_t pid;

        /*
         * Get the external interruption subcode & pfault initial/completion
         * signal bit. VM stores this in the 'cpu address' field associated
         * with the external interrupt.
         */
        subcode = ext_code.subcode;
        if ((subcode & 0xff00) != __SUBCODE_MASK)
                return;
        inc_irq_stat(IRQEXT_PFL);
        /* Get the token (= pid of the affected task). */
        pid = param64 & LPP_PID_MASK;
        rcu_read_lock();
        tsk = find_task_by_pid_ns(pid, &init_pid_ns);
        if (tsk)
                get_task_struct(tsk);
        rcu_read_unlock();
        if (!tsk)
                return;
        spin_lock(&pfault_lock);
        if (subcode & PF_COMPLETE) {
                /* signal bit is set -> a page has been swapped in by VM */
                if (tsk->thread.pfault_wait == 1) {
                        /* Initial interrupt was faster than the completion
                         * interrupt. pfault_wait is valid. Set pfault_wait
                         * back to zero and wake up the process. This can
                         * safely be done because the task is still sleeping
                         * and can't produce new pfaults. */
                        tsk->thread.pfault_wait = 0;
                        list_del(&tsk->thread.list);
                        wake_up_process(tsk);
                        put_task_struct(tsk);
                } else {
                        /* Completion interrupt was faster than initial
                         * interrupt. Set pfault_wait to -1 so the initial
                         * interrupt doesn't put the task to sleep.
                         * If the task is not running, ignore the completion
                         * interrupt since it must be a leftover of a PFAULT
                         * CANCEL operation which didn't remove all pending
                         * completion interrupts. */
                        if (task_is_running(tsk))
                                tsk->thread.pfault_wait = -1;
                }
        } else {
                /* signal bit not set -> a real page is missing. */
                if (WARN_ON_ONCE(tsk != current))
                        goto out;
                if (tsk->thread.pfault_wait == 1) {
                        /* Already on the list with a reference: put to sleep */
                        goto block;
                } else if (tsk->thread.pfault_wait == -1) {
                        /* Completion interrupt was faster than the initial
                         * interrupt (pfault_wait == -1). Set pfault_wait
                         * back to zero and exit. */
                        tsk->thread.pfault_wait = 0;
                } else {
                        /* Initial interrupt arrived before completion
                         * interrupt. Let the task sleep.
                         * An extra task reference is needed since a different
                         * cpu may set the task state to TASK_RUNNING again
                         * before the scheduler is reached. */
                        get_task_struct(tsk);
                        tsk->thread.pfault_wait = 1;
                        list_add(&tsk->thread.list, &pfault_list);
block:
                        /* Since this must be a userspace fault, there
                         * is no kernel task state to trample. Rely on the
                         * return to userspace schedule() to block. */
                        __set_current_state(TASK_UNINTERRUPTIBLE);
                        set_tsk_need_resched(tsk);
                        set_preempt_need_resched();
                }
        }
out:
        spin_unlock(&pfault_lock);
        put_task_struct(tsk);
}

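/*
 * CPU hotplug callback: a CPU went away, so clear the pfault_wait state of
 * every task still waiting on the pfault list and wake it up.
 */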
static int pfault_cpu_dead(unsigned int cpu)
{
        struct thread_struct *thread, *next;
        struct task_struct *tsk;

        spin_lock_irq(&pfault_lock);
        list_for_each_entry_safe(thread, next, &pfault_list, list) {
                thread->pfault_wait = 0;
                list_del(&thread->list);
                tsk = container_of(thread, struct task_struct, thread);
                wake_up_process(tsk);
                put_task_struct(tsk);
        }
        spin_unlock_irq(&pfault_lock);
        return 0;
}

static int __init pfault_irq_init(void)
{
        int rc;

        rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt);
        if (rc)
                goto out_extint;
        rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP;
        if (rc)
                goto out_pfault;
        irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL);
        cpuhp_setup_state_nocalls(CPUHP_S390_PFAULT_DEAD, "s390/pfault:dead",
                                  NULL, pfault_cpu_dead);
        return 0;

out_pfault:
        unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt);
out_extint:
        pfault_disable = 1;
        return rc;
}
early_initcall(pfault_irq_init);

#endif /* CONFIG_PFAULT */

#if IS_ENABLED(CONFIG_PGSTE)

void do_secure_storage_access(struct pt_regs *regs)
{
        unsigned long addr = regs->int_parm_long & __FAIL_ADDR_MASK;
        struct vm_area_struct *vma;
        struct mm_struct *mm;
        struct page *page;
        int rc;

        /*
         * bit 61 tells us if the address is valid, if it's not we
         * have a major problem and should stop the kernel or send a
         * SIGSEGV to the process. Unfortunately bit 61 is not
         * reliable without the misc UV feature so we need to check
         * for that as well.
         */
        if (test_bit_inv(BIT_UV_FEAT_MISC, &uv_info.uv_feature_indications) &&
            !test_bit_inv(61, &regs->int_parm_long)) {
                /*
                 * When this happens, userspace did something that it
                 * was not supposed to do, e.g. branching into secure
                 * memory. Trigger a segmentation fault.
                 */
                if (user_mode(regs)) {
                        send_sig(SIGSEGV, current, 0);
                        return;
                }

                /*
                 * The kernel should never run into this case and we
                 * have no way out of this situation.
                 */
                panic("Unexpected PGM 0x3d with TEID bit 61=0");
        }

        switch (get_fault_type(regs)) {
        case USER_FAULT:
                mm = current->mm;
                mmap_read_lock(mm);
                vma = find_vma(mm, addr);
                if (!vma) {
                        mmap_read_unlock(mm);
                        do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP);
                        break;
                }
                page = follow_page(vma, addr, FOLL_WRITE | FOLL_GET);
                if (IS_ERR_OR_NULL(page)) {
                        mmap_read_unlock(mm);
                        break;
                }
                if (arch_make_page_accessible(page))
                        send_sig(SIGSEGV, current, 0);
                put_page(page);
                mmap_read_unlock(mm);
                break;
        case KERNEL_FAULT:
                page = phys_to_page(addr);
                if (unlikely(!try_get_page(page)))
                        break;
                rc = arch_make_page_accessible(page);
                put_page(page);
                if (rc)
                        BUG();
                break;
        case GMAP_FAULT:
        default:
                do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP);
                WARN_ON_ONCE(1);
        }
}
NOKPROBE_SYMBOL(do_secure_storage_access);

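/*
 * A protected (secure) guest accessed a page that is currently non-secure;
 * convert it to secure, or send SIGSEGV if the guest address cannot be
 * resolved.
 */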
void do_non_secure_storage_access(struct pt_regs *regs)
{
        unsigned long gaddr = regs->int_parm_long & __FAIL_ADDR_MASK;
        struct gmap *gmap = (struct gmap *)S390_lowcore.gmap;

        if (get_fault_type(regs) != GMAP_FAULT) {
                do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP);
                WARN_ON_ONCE(1);
                return;
        }

        if (gmap_convert_to_secure(gmap, gaddr) == -EINVAL)
                send_sig(SIGSEGV, current, 0);
}
NOKPROBE_SYMBOL(do_non_secure_storage_access);

void do_secure_storage_violation(struct pt_regs *regs)
{
        /*
         * Either KVM messed up the secure guest mapping or the same
         * page is mapped into multiple secure guests.
         *
         * This exception is only triggered when a guest 2 is running
         * and can therefore never occur in kernel context.
         */
        printk_ratelimited(KERN_WARNING
                           "Secure storage violation in task: %s, pid %d\n",
                           current->comm, current->pid);
        send_sig(SIGSEGV, current, 0);
}

#endif /* CONFIG_PGSTE */