linux/arch/s390/mm/fault.c
// SPDX-License-Identifier: GPL-2.0
/*
 *  S390 version
 *    Copyright IBM Corp. 1999
 *    Author(s): Hartmut Penner (hp@de.ibm.com)
 *               Ulrich Weigand (uweigand@de.ibm.com)
 *
 *  Derived from "arch/i386/mm/fault.c"
 *    Copyright (C) 1995  Linus Torvalds
 */

#include <linux/kernel_stat.h>
#include <linux/perf_event.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/sched/debug.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/compat.h>
#include <linux/smp.h>
#include <linux/kdebug.h>
#include <linux/init.h>
#include <linux/console.h>
#include <linux/extable.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/hugetlb.h>
#include <asm/asm-offsets.h>
#include <asm/diag.h>
#include <asm/pgtable.h>
#include <asm/gmap.h>
#include <asm/irq.h>
#include <asm/mmu_context.h>
#include <asm/facility.h>
#include "../kernel/entry.h"

#define __FAIL_ADDR_MASK -4096L
#define __SUBCODE_MASK 0x0600
#define __PF_RES_FIELD 0x8000000000000000ULL

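/*
 * Architecture-private fault results: do_exception() returns one of these
 * in addition to the generic VM_FAULT_* codes; do_fault_error() translates
 * the result into a signal or an oops.
 */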
#define VM_FAULT_BADCONTEXT     0x010000
#define VM_FAULT_BADMAP         0x020000
#define VM_FAULT_BADACCESS      0x040000
#define VM_FAULT_SIGNAL         0x080000
#define VM_FAULT_PFAULT         0x100000

enum fault_type {
        KERNEL_FAULT,
        USER_FAULT,
        VDSO_FAULT,
        GMAP_FAULT,
};

static unsigned long store_indication __read_mostly;

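/*
 * store_indication selects the fetch/store indication bits (0xc00) of the
 * translation-exception identification. They are only used when facility
 * bit 75 is installed (presumably the access-exception fetch/store
 * indication facility); do_exception() treats the value 0x400 in these
 * bits as a store and sets FAULT_FLAG_WRITE accordingly.
 */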
static int __init fault_init(void)
{
        if (test_facility(75))
                store_indication = 0xc00;
        return 0;
}
early_initcall(fault_init);

static inline int notify_page_fault(struct pt_regs *regs)
{
        int ret = 0;

        /* kprobe_running() needs smp_processor_id() */
        if (kprobes_built_in() && !user_mode(regs)) {
                preempt_disable();
                if (kprobe_running() && kprobe_fault_handler(regs, 14))
                        ret = 1;
                preempt_enable();
        }
        return ret;
}


/*
 * Unlock any spinlocks which will prevent us from getting the
 * message out.
 */
void bust_spinlocks(int yes)
{
        if (yes) {
                oops_in_progress = 1;
        } else {
                int loglevel_save = console_loglevel;
                console_unblank();
                oops_in_progress = 0;
                /*
                 * OK, the message is on the console.  Now we call printk()
                 * without oops_in_progress set so that printk will give klogd
                 * a poke.  Hold onto your hats...
                 */
                console_loglevel = 15;
                printk(" ");
                console_loglevel = loglevel_save;
        }
}

/*
 * Find out which address space caused the exception.
 */
static inline enum fault_type get_fault_type(struct pt_regs *regs)
{
        unsigned long trans_exc_code;

        trans_exc_code = regs->int_parm_long & 3;
        if (likely(trans_exc_code == 0)) {
                /* primary space exception */
                if (IS_ENABLED(CONFIG_PGSTE) &&
                    test_pt_regs_flag(regs, PIF_GUEST_FAULT))
                        return GMAP_FAULT;
                if (current->thread.mm_segment == USER_DS)
                        return USER_FAULT;
                return KERNEL_FAULT;
        }
        if (trans_exc_code == 2) {
                /* secondary space exception */
                if (current->thread.mm_segment & 1) {
                        if (current->thread.mm_segment == USER_DS_SACF)
                                return USER_FAULT;
                        return KERNEL_FAULT;
                }
                return VDSO_FAULT;
        }
        if (trans_exc_code == 1) {
                /* access register mode, not used in the kernel */
                return USER_FAULT;
        }
        /* home space exception -> access via kernel ASCE */
        return KERNEL_FAULT;
}

static int bad_address(void *p)
{
        unsigned long dummy;

        return probe_kernel_address((unsigned long *)p, dummy);
}

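/*
 * Walk the page table that the given ASCE points to and print the entry
 * for the failing address at each level. The walk stops at the first
 * invalid or large entry, or prints "BAD" if a table entry cannot be
 * read at all.
 */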
static void dump_pagetable(unsigned long asce, unsigned long address)
{
        unsigned long *table = __va(asce & _ASCE_ORIGIN);

        pr_alert("AS:%016lx ", asce);
        switch (asce & _ASCE_TYPE_MASK) {
        case _ASCE_TYPE_REGION1:
                table += (address & _REGION1_INDEX) >> _REGION1_SHIFT;
                if (bad_address(table))
                        goto bad;
                pr_cont("R1:%016lx ", *table);
                if (*table & _REGION_ENTRY_INVALID)
                        goto out;
                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
                /* fallthrough */
        case _ASCE_TYPE_REGION2:
                table += (address & _REGION2_INDEX) >> _REGION2_SHIFT;
                if (bad_address(table))
                        goto bad;
                pr_cont("R2:%016lx ", *table);
                if (*table & _REGION_ENTRY_INVALID)
                        goto out;
                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
                /* fallthrough */
        case _ASCE_TYPE_REGION3:
                table += (address & _REGION3_INDEX) >> _REGION3_SHIFT;
                if (bad_address(table))
                        goto bad;
                pr_cont("R3:%016lx ", *table);
                if (*table & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE))
                        goto out;
                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
                /* fallthrough */
        case _ASCE_TYPE_SEGMENT:
                table += (address & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
                if (bad_address(table))
                        goto bad;
                pr_cont("S:%016lx ", *table);
                if (*table & (_SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_LARGE))
                        goto out;
                table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN);
        }
        table += (address & _PAGE_INDEX) >> _PAGE_SHIFT;
        if (bad_address(table))
                goto bad;
        pr_cont("P:%016lx ", *table);
out:
        pr_cont("\n");
        return;
bad:
        pr_cont("BAD\n");
}

static void dump_fault_info(struct pt_regs *regs)
{
        unsigned long asce;

        pr_alert("Failing address: %016lx TEID: %016lx\n",
                 regs->int_parm_long & __FAIL_ADDR_MASK, regs->int_parm_long);
        pr_alert("Fault in ");
        switch (regs->int_parm_long & 3) {
        case 3:
                pr_cont("home space ");
                break;
        case 2:
                pr_cont("secondary space ");
                break;
        case 1:
                pr_cont("access register ");
                break;
        case 0:
                pr_cont("primary space ");
                break;
        }
        pr_cont("mode while using ");
        switch (get_fault_type(regs)) {
        case USER_FAULT:
                asce = S390_lowcore.user_asce;
                pr_cont("user ");
                break;
        case VDSO_FAULT:
                asce = S390_lowcore.vdso_asce;
                pr_cont("vdso ");
                break;
        case GMAP_FAULT:
                asce = ((struct gmap *) S390_lowcore.gmap)->asce;
                pr_cont("gmap ");
                break;
        case KERNEL_FAULT:
                asce = S390_lowcore.kernel_asce;
                pr_cont("kernel ");
                break;
        }
        pr_cont("ASCE.\n");
        dump_pagetable(asce, regs->int_parm_long & __FAIL_ADDR_MASK);
}

int show_unhandled_signals = 1;

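/*
 * Print a rate-limited report about a fault that results in a signal the
 * task does not catch. Reporting is gated by show_unhandled_signals,
 * except for the init task, whose faults are always reported.
 */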
void report_user_fault(struct pt_regs *regs, long signr, int is_mm_fault)
{
        if ((task_pid_nr(current) > 1) && !show_unhandled_signals)
                return;
        if (!unhandled_signal(current, signr))
                return;
        if (!printk_ratelimit())
                return;
        printk(KERN_ALERT "User process fault: interruption code %04x ilc:%d ",
               regs->int_code & 0xffff, regs->int_code >> 17);
        print_vma_addr(KERN_CONT "in ", regs->psw.addr);
        printk(KERN_CONT "\n");
        if (is_mm_fault)
                dump_fault_info(regs);
        show_regs(regs);
}

/*
 * Send SIGSEGV to task.  This is an external routine
 * to keep the stack usage of do_page_fault small.
 */
static noinline void do_sigsegv(struct pt_regs *regs, int si_code)
{
        report_user_fault(regs, SIGSEGV, 1);
        force_sig_fault(SIGSEGV, si_code,
                        (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK),
                        current);
}

const struct exception_table_entry *s390_search_extables(unsigned long addr)
{
        const struct exception_table_entry *fixup;

        fixup = search_extable(__start_dma_ex_table,
                               __stop_dma_ex_table - __start_dma_ex_table,
                               addr);
        if (!fixup)
                fixup = search_exception_tables(addr);
        return fixup;
}

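/*
 * A fault in kernel context that cannot be resolved: give the exception
 * tables a chance to fix up the faulting instruction (e.g. a failing
 * user space access), otherwise dump the fault information and die.
 */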
static noinline void do_no_context(struct pt_regs *regs)
{
        const struct exception_table_entry *fixup;

        /* Are we prepared to handle this kernel fault?  */
        fixup = s390_search_extables(regs->psw.addr);
        if (fixup) {
                regs->psw.addr = extable_fixup(fixup);
                return;
        }

        /*
         * Oops. The kernel tried to access some bad page. We'll have to
         * terminate things with extreme prejudice.
         */
        if (get_fault_type(regs) == KERNEL_FAULT)
                printk(KERN_ALERT "Unable to handle kernel pointer dereference"
                       " in virtual kernel address space\n");
        else
                printk(KERN_ALERT "Unable to handle kernel paging request"
                       " in virtual user address space\n");
        dump_fault_info(regs);
        die(regs, "Oops");
        do_exit(SIGKILL);
}

static noinline void do_low_address(struct pt_regs *regs)
{
        /* Low-address protection hit in kernel mode means
           NULL pointer write access in kernel mode.  */
        if (regs->psw.mask & PSW_MASK_PSTATE) {
                /* Low-address protection hit in user mode 'cannot happen'. */
                die (regs, "Low-address protection");
                do_exit(SIGKILL);
        }

        do_no_context(regs);
}

static noinline void do_sigbus(struct pt_regs *regs)
{
        /*
         * Send a sigbus, regardless of whether we were in kernel
         * or user mode.
         */
        force_sig_fault(SIGBUS, BUS_ADRERR,
                        (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK),
                        current);
}

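/*
 * Check whether the faulting "instruction" is one of the two signal
 * return system calls (svc 119 == 0x0a77 for sigreturn, svc 173 == 0x0aad
 * for rt_sigreturn). If it is, restart it as a system call instead of
 * delivering SIGSEGV. This path is only taken for execute faults
 * (access == VM_EXEC), typically when the signal trampoline lives on a
 * non-executable page.
 */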
static noinline int signal_return(struct pt_regs *regs)
{
        u16 instruction;
        int rc;

        rc = __get_user(instruction, (u16 __user *) regs->psw.addr);
        if (rc)
                return rc;
        if (instruction == 0x0a77) {
                set_pt_regs_flag(regs, PIF_SYSCALL);
                regs->int_code = 0x00040077;
                return 0;
        } else if (instruction == 0x0aad) {
                set_pt_regs_flag(regs, PIF_SYSCALL);
                regs->int_code = 0x000400ad;
                return 0;
        }
        return -EACCES;
}

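/*
 * Translate the result of do_exception() into a signal for user space or
 * an exception table fixup / oops for the kernel. The missing breaks are
 * intentional: VM_FAULT_BADACCESS falls through to the VM_FAULT_BADMAP
 * handling, which in turn falls through to do_no_context() for faults in
 * kernel context.
 */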
static noinline void do_fault_error(struct pt_regs *regs, int access,
                                        vm_fault_t fault)
{
        int si_code;

        switch (fault) {
        case VM_FAULT_BADACCESS:
                if (access == VM_EXEC && signal_return(regs) == 0)
                        break;
        case VM_FAULT_BADMAP:
                /* Bad memory access. Check if it is kernel or user space. */
                if (user_mode(regs)) {
                        /* User mode accesses just cause a SIGSEGV */
                        si_code = (fault == VM_FAULT_BADMAP) ?
                                SEGV_MAPERR : SEGV_ACCERR;
                        do_sigsegv(regs, si_code);
                        break;
                }
        case VM_FAULT_BADCONTEXT:
        case VM_FAULT_PFAULT:
                do_no_context(regs);
                break;
        case VM_FAULT_SIGNAL:
                if (!user_mode(regs))
                        do_no_context(regs);
                break;
        default: /* fault & VM_FAULT_ERROR */
                if (fault & VM_FAULT_OOM) {
                        if (!user_mode(regs))
                                do_no_context(regs);
                        else
                                pagefault_out_of_memory();
                } else if (fault & VM_FAULT_SIGSEGV) {
                        /* Kernel mode? Handle exceptions or die */
                        if (!user_mode(regs))
                                do_no_context(regs);
                        else
                                do_sigsegv(regs, SEGV_MAPERR);
                } else if (fault & VM_FAULT_SIGBUS) {
                        /* Kernel mode? Handle exceptions or die */
                        if (!user_mode(regs))
                                do_no_context(regs);
                        else
                                do_sigbus(regs);
                } else
                        BUG();
                break;
        }
}

/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 *
 * interruption code (int_code):
 *   04       Protection           ->  Write-Protection  (suppression)
 *   10       Segment translation  ->  Not present       (nullification)
 *   11       Page translation     ->  Not present       (nullification)
 *   3b       Region third trans.  ->  Not present       (nullification)
 */
static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
{
        struct gmap *gmap;
        struct task_struct *tsk;
        struct mm_struct *mm;
        struct vm_area_struct *vma;
        enum fault_type type;
        unsigned long trans_exc_code;
        unsigned long address;
        unsigned int flags;
        vm_fault_t fault;

        tsk = current;
        /*
         * The instruction that caused the program check has
         * been nullified. Don't signal single step via SIGTRAP.
         */
        clear_pt_regs_flag(regs, PIF_PER_TRAP);

        if (notify_page_fault(regs))
                return 0;

        mm = tsk->mm;
        trans_exc_code = regs->int_parm_long;

        /*
         * Verify that the fault happened in user space, that
         * we are not in an interrupt and that there is a
         * user context.
         */
        fault = VM_FAULT_BADCONTEXT;
        type = get_fault_type(regs);
        switch (type) {
        case KERNEL_FAULT:
                goto out;
        case VDSO_FAULT:
                fault = VM_FAULT_BADMAP;
                goto out;
        case USER_FAULT:
        case GMAP_FAULT:
                if (faulthandler_disabled() || !mm)
                        goto out;
                break;
        }

        address = trans_exc_code & __FAIL_ADDR_MASK;
        perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
        flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
        if (user_mode(regs))
                flags |= FAULT_FLAG_USER;
        if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400)
                flags |= FAULT_FLAG_WRITE;
        down_read(&mm->mmap_sem);

        gmap = NULL;
        if (IS_ENABLED(CONFIG_PGSTE) && type == GMAP_FAULT) {
                gmap = (struct gmap *) S390_lowcore.gmap;
                current->thread.gmap_addr = address;
                current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE);
                current->thread.gmap_int_code = regs->int_code & 0xffff;
                address = __gmap_translate(gmap, address);
                if (address == -EFAULT) {
                        fault = VM_FAULT_BADMAP;
                        goto out_up;
                }
                if (gmap->pfault_enabled)
                        flags |= FAULT_FLAG_RETRY_NOWAIT;
        }

retry:
        fault = VM_FAULT_BADMAP;
        vma = find_vma(mm, address);
        if (!vma)
                goto out_up;

        if (unlikely(vma->vm_start > address)) {
                if (!(vma->vm_flags & VM_GROWSDOWN))
                        goto out_up;
                if (expand_stack(vma, address))
                        goto out_up;
        }

        /*
         * Ok, we have a good vm_area for this memory access, so
         * we can handle it..
         */
        fault = VM_FAULT_BADACCESS;
        if (unlikely(!(vma->vm_flags & access)))
                goto out_up;

        if (is_vm_hugetlb_page(vma))
                address &= HPAGE_MASK;
        /*
         * If for any reason at all we couldn't handle the fault,
         * make sure we exit gracefully rather than endlessly redo
         * the fault.
         */
        fault = handle_mm_fault(vma, address, flags);
        /* No reason to continue if interrupted by SIGKILL. */
        if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) {
                fault = VM_FAULT_SIGNAL;
                if (flags & FAULT_FLAG_RETRY_NOWAIT)
                        goto out_up;
                goto out;
        }
        if (unlikely(fault & VM_FAULT_ERROR))
                goto out_up;

        /*
         * Major/minor page fault accounting is only done on the
         * initial attempt. If we go through a retry, it is extremely
         * likely that the page will be found in page cache at that point.
         */
        if (flags & FAULT_FLAG_ALLOW_RETRY) {
                if (fault & VM_FAULT_MAJOR) {
                        tsk->maj_flt++;
                        perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
                                      regs, address);
                } else {
                        tsk->min_flt++;
                        perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
                                      regs, address);
                }
                if (fault & VM_FAULT_RETRY) {
                        if (IS_ENABLED(CONFIG_PGSTE) && gmap &&
                            (flags & FAULT_FLAG_RETRY_NOWAIT)) {
                                /* FAULT_FLAG_RETRY_NOWAIT has been set,
                                 * mmap_sem has not been released */
                                current->thread.gmap_pfault = 1;
                                fault = VM_FAULT_PFAULT;
                                goto out_up;
                        }
                        /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
                         * of starvation. */
                        flags &= ~(FAULT_FLAG_ALLOW_RETRY |
                                   FAULT_FLAG_RETRY_NOWAIT);
                        flags |= FAULT_FLAG_TRIED;
                        down_read(&mm->mmap_sem);
                        goto retry;
                }
        }
        if (IS_ENABLED(CONFIG_PGSTE) && gmap) {
                address =  __gmap_link(gmap, current->thread.gmap_addr,
                                       address);
                if (address == -EFAULT) {
                        fault = VM_FAULT_BADMAP;
                        goto out_up;
                }
                if (address == -ENOMEM) {
                        fault = VM_FAULT_OOM;
                        goto out_up;
                }
        }
        fault = 0;
out_up:
        up_read(&mm->mmap_sem);
out:
        return fault;
}

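/*
 * Handler for protection exceptions. Three cases get special treatment:
 * aborted transactions (the PSW already points to the correct location),
 * low-address protection (the translation-exception identification does
 * not contain a usable address), and instruction-execution protection
 * (reported against the PSW address and treated as a VM_EXEC access).
 * Everything else is handled as a write fault by do_exception().
 */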
void do_protection_exception(struct pt_regs *regs)
{
        unsigned long trans_exc_code;
        int access;
        vm_fault_t fault;

        trans_exc_code = regs->int_parm_long;
        /*
         * Protection exceptions are suppressing; decrement the psw address.
         * The exception to this rule is aborted transactions: for these
         * the PSW already points to the correct location.
         */
        if (!(regs->int_code & 0x200))
                regs->psw.addr = __rewind_psw(regs->psw, regs->int_code >> 16);
        /*
         * Check for low-address protection.  This needs to be treated
         * as a special case because the translation exception code
         * field is not guaranteed to contain valid data in this case.
         */
        if (unlikely(!(trans_exc_code & 4))) {
                do_low_address(regs);
                return;
        }
        if (unlikely(MACHINE_HAS_NX && (trans_exc_code & 0x80))) {
                regs->int_parm_long = (trans_exc_code & ~PAGE_MASK) |
                                        (regs->psw.addr & PAGE_MASK);
                access = VM_EXEC;
                fault = VM_FAULT_BADACCESS;
        } else {
                access = VM_WRITE;
                fault = do_exception(regs, access);
        }
        if (unlikely(fault))
                do_fault_error(regs, access, fault);
}
NOKPROBE_SYMBOL(do_protection_exception);

void do_dat_exception(struct pt_regs *regs)
{
        int access;
        vm_fault_t fault;

        access = VM_READ | VM_EXEC | VM_WRITE;
        fault = do_exception(regs, access);
        if (unlikely(fault))
                do_fault_error(regs, access, fault);
}
NOKPROBE_SYMBOL(do_dat_exception);

#ifdef CONFIG_PFAULT
/*
 * 'pfault' pseudo page fault routines.
 */
static int pfault_disable;

static int __init nopfault(char *str)
{
        pfault_disable = 1;
        return 1;
}

__setup("nopfault", nopfault);

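/*
 * Parameter block for DIAG 0x258, the z/VM pseudo-page-fault handshake.
 * Function code 0 (pfault_init) enables pseudo-page-fault handling,
 * function code 1 (pfault_fini) disables it again. refgaddr is set to
 * __LC_LPP, so the token presented with each pfault interrupt should
 * carry the LPP value, from which pfault_interrupt() extracts the pid of
 * the affected task (see LPP_PID_MASK).
 */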
struct pfault_refbk {
        u16 refdiagc;
        u16 reffcode;
        u16 refdwlen;
        u16 refversn;
        u64 refgaddr;
        u64 refselmk;
        u64 refcmpmk;
        u64 reserved;
} __attribute__ ((packed, aligned(8)));

int pfault_init(void)
{
        struct pfault_refbk refbk = {
                .refdiagc = 0x258,
                .reffcode = 0,
                .refdwlen = 5,
                .refversn = 2,
                .refgaddr = __LC_LPP,
                .refselmk = 1ULL << 48,
                .refcmpmk = 1ULL << 48,
                .reserved = __PF_RES_FIELD };
        int rc;

        if (pfault_disable)
                return -1;
        diag_stat_inc(DIAG_STAT_X258);
        asm volatile(
                "       diag    %1,%0,0x258\n"
                "0:     j       2f\n"
                "1:     la      %0,8\n"
                "2:\n"
                EX_TABLE(0b,1b)
                : "=d" (rc) : "a" (&refbk), "m" (refbk) : "cc");
        return rc;
}

void pfault_fini(void)
{
        struct pfault_refbk refbk = {
                .refdiagc = 0x258,
                .reffcode = 1,
                .refdwlen = 5,
                .refversn = 2,
        };

        if (pfault_disable)
                return;
        diag_stat_inc(DIAG_STAT_X258);
        asm volatile(
                "       diag    %0,0,0x258\n"
                "0:     nopr    %%r7\n"
                EX_TABLE(0b,0b)
                : : "a" (&refbk), "m" (refbk) : "cc");
}

static DEFINE_SPINLOCK(pfault_lock);
static LIST_HEAD(pfault_list);

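/*
 * Bit in the external interrupt subcode that distinguishes a completion
 * interrupt ("the host has paged the page back in") from an initial
 * interrupt ("a page is missing").
 */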
#define PF_COMPLETE     0x0080

/*
 * The mechanism of our pfault code: if Linux is running as a guest, runs a
 * user space process and the user space process accesses a page that the
 * host has paged out, we get a pfault interrupt.
 *
 * This allows us, within the guest, to schedule a different process. Without
 * this mechanism the host would have to suspend the whole virtual cpu until
 * the page has been paged in.
 *
 * So when we get such an interrupt we set the state of the current task
 * to uninterruptible and also set the need_resched flag. Both happen within
 * interrupt context(!). If we later on want to return to user space we
 * recognize the need_resched flag and then call schedule().  It's not very
 * obvious how this works...
 *
 * Of course we have a lot of additional fun with the completion interrupt (->
 * host signals that a page of a process has been paged in and the process can
 * continue to run). This interrupt can arrive on any cpu and, since we have
 * virtual cpus, actually appear before the interrupt that signals that a page
 * is missing.
 */
static void pfault_interrupt(struct ext_code ext_code,
                             unsigned int param32, unsigned long param64)
{
        struct task_struct *tsk;
        __u16 subcode;
        pid_t pid;

        /*
         * Get the external interruption subcode & pfault initial/completion
         * signal bit. VM stores this in the 'cpu address' field associated
         * with the external interrupt.
         */
        subcode = ext_code.subcode;
        if ((subcode & 0xff00) != __SUBCODE_MASK)
                return;
        inc_irq_stat(IRQEXT_PFL);
        /* Get the token (= pid of the affected task). */
        pid = param64 & LPP_PID_MASK;
        rcu_read_lock();
        tsk = find_task_by_pid_ns(pid, &init_pid_ns);
        if (tsk)
                get_task_struct(tsk);
        rcu_read_unlock();
        if (!tsk)
                return;
        spin_lock(&pfault_lock);
        if (subcode & PF_COMPLETE) {
                /* signal bit is set -> a page has been swapped in by VM */
                if (tsk->thread.pfault_wait == 1) {
                        /* Initial interrupt was faster than the completion
                         * interrupt. pfault_wait is valid. Set pfault_wait
                         * back to zero and wake up the process. This can
                         * safely be done because the task is still sleeping
                         * and can't produce new pfaults. */
                        tsk->thread.pfault_wait = 0;
                        list_del(&tsk->thread.list);
                        wake_up_process(tsk);
                        put_task_struct(tsk);
                } else {
                        /* Completion interrupt was faster than initial
                         * interrupt. Set pfault_wait to -1 so the initial
                         * interrupt doesn't put the task to sleep.
                         * If the task is not running, ignore the completion
                         * interrupt since it must be a leftover of a PFAULT
                         * CANCEL operation which didn't remove all pending
                         * completion interrupts. */
                        if (tsk->state == TASK_RUNNING)
                                tsk->thread.pfault_wait = -1;
                }
        } else {
                /* signal bit not set -> a real page is missing. */
                if (WARN_ON_ONCE(tsk != current))
                        goto out;
                if (tsk->thread.pfault_wait == 1) {
                        /* Already on the list with a reference: put to sleep */
                        goto block;
                } else if (tsk->thread.pfault_wait == -1) {
                        /* Completion interrupt was faster than the initial
                         * interrupt (pfault_wait == -1). Set pfault_wait
                         * back to zero and exit. */
                        tsk->thread.pfault_wait = 0;
                } else {
                        /* Initial interrupt arrived before completion
                         * interrupt. Let the task sleep.
                         * An extra task reference is needed since a different
                         * cpu may set the task state to TASK_RUNNING again
                         * before the scheduler is reached. */
                        get_task_struct(tsk);
                        tsk->thread.pfault_wait = 1;
                        list_add(&tsk->thread.list, &pfault_list);
block:
                        /* Since this must be a userspace fault, there
                         * is no kernel task state to trample. Rely on the
                         * return to userspace schedule() to block. */
                        __set_current_state(TASK_UNINTERRUPTIBLE);
                        set_tsk_need_resched(tsk);
                        set_preempt_need_resched();
                }
        }
out:
        spin_unlock(&pfault_lock);
        put_task_struct(tsk);
}

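/*
 * CPU hotplug "dead" callback: wake up every task that is still waiting
 * for a pfault completion interrupt, so that none of them is left
 * sleeping forever once the cpu has gone away.
 */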
static int pfault_cpu_dead(unsigned int cpu)
{
        struct thread_struct *thread, *next;
        struct task_struct *tsk;

        spin_lock_irq(&pfault_lock);
        list_for_each_entry_safe(thread, next, &pfault_list, list) {
                thread->pfault_wait = 0;
                list_del(&thread->list);
                tsk = container_of(thread, struct task_struct, thread);
                wake_up_process(tsk);
                put_task_struct(tsk);
        }
        spin_unlock_irq(&pfault_lock);
        return 0;
}

static int __init pfault_irq_init(void)
{
        int rc;

        rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt);
        if (rc)
                goto out_extint;
        rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP;
        if (rc)
                goto out_pfault;
        irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL);
        cpuhp_setup_state_nocalls(CPUHP_S390_PFAULT_DEAD, "s390/pfault:dead",
                                  NULL, pfault_cpu_dead);
        return 0;

out_pfault:
        unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt);
out_extint:
        pfault_disable = 1;
        return rc;
}
early_initcall(pfault_irq_init);

#endif /* CONFIG_PFAULT */